Code
library(vegan)
library(ggplot2)
library(here)
library(tidyverse)
library(microViz)
library(phyloseq)
library(ggtext)
theme_set(theme_minimal())
The following analysis is performed on data generated by research conducted by the GERMS lab in conjunction with the Iowa Department of Natural Resources.
library(vegan)
library(ggplot2)
library(here)
library(tidyverse)
library(microViz)
library(phyloseq)
library(ggtext)
theme_set(theme_minimal())
Reading in data:
<- read.csv(here("data", "priming_amoA_deltaCt.csv"), header = T) %>%
data.priming rename(sample_id = X)
<- read.csv(here("data", "priming_amoA_rawCt.csv"), header = T) %>%
data.raw rename(sample_id = X)
<- data.priming %>%
data.priming.long pivot_longer(cols = amoA.001:amoA.078, names_to = "amoA", values_to = "deltaCT")
<- data.raw %>%
data.raw.long pivot_longer(cols = amoA.001:amoA.078, names_to = "amoA", values_to = "CT")
$sample_id <- fct_reorder(data.priming.long$sample_id, parse_number(data.priming.long$sample_id))
data.priming.long
<- data.priming[, -1]
df rownames(df) <- data.priming[, 1]
<- df %>%
metadata select(fert_level:field_rep) %>%
mutate(across(everything(), as.factor))
<- df %>%
amoa_counts select(starts_with("amoA"))
data.priming
contains the data for our experiment. There are rows for samples, columns for the delta CTs of the different amoAs, and some metadata.
1:5, 1:5] data.priming[
sample_id amoA.001 amoA.002 amoA.003 amoA.004
1 2b 10.119249 27.41268 8.764504 8.992937
2 35b 9.837943 27.51089 9.300077 10.445448
3 52f 26.345485 26.34548 26.345485 26.345485
4 34f 26.914432 26.91443 26.914432 26.914432
5 16f 8.337293 25.94591 8.371314 25.945907
data.raw
contains the same columns but lists the raw CT values instead of the 16s-normalized ones.
1:5, 1:5] data.raw[
sample_id amoA.001 amoA.002 amoA.003 amoA.004
1 2b 22.70657 40 21.35182 21.58026
2 35b 22.32706 40 21.78919 22.93456
3 52f 40.00000 40 40.00000 40.00000
4 34f 40.00000 40 40.00000 40.00000
5 16f 22.39139 40 22.42541 40.00000
THe long
versions of these dataframes contains the same info but in long format to play nicely with ggplot
.
We’ll start by removing those amoAs from our data that are not present in over 30 samples across both treatments.
We’ll first start by counting the non-detects for each amoA.
<- data.raw.long %>%
non_detect_counts group_by(fert_level, amoA) %>%
count(CT == 40) %>%
rename(non_detect = `CT == 40`) %>%
filter(non_detect == TRUE)
Finding the amoAs that are not detected in > 30 across both samples
<- non_detect_counts %>%
removes pivot_wider(names_from = fert_level, values_from = n, names_prefix = "fert.") %>%
filter(fert.0 > 30 & fert.336 > 30) %>%
pivot_longer(cols = fert.0:fert.336, names_to = "fert_level", values_to = "n")
We’ll now reduce data.priming
by removing those amoAs that are largely non-detects. We’ll also update the long
version while we’re at it
<- data.priming %>%
data.priming.reduced select(-one_of(removes$amoA))
<- data.priming.reduced %>%
data.priming.reduced.long select(-sample_id, field_rep) %>%
pivot_longer(cols = amoA.001:amoA.074)
Here’s a barchart of what we’re removing:
%>%
removes mutate(amoA = str_sub(amoA, -3)) %>%
mutate(favored = case_when(
%in% c("006", "038", "064", "069", "071") ~ "Nothing",
amoA %in% c("021", "028", "030", "048", "073", "075", "076", "077", "078") ~ "Non-fertilized",
amoA %in% c("040", "050", "053") ~ "Fourth quadrant",
amoA TRUE ~ "First quadrant"
%>%
)) mutate(fert_level = str_sub(fert_level, start = 6)) %>%
ggplot(aes(amoA, n, fill = favored )) +
geom_col() +
facet_wrap(~ fert_level) +
theme(
plot.title = element_text(hjust = 0.5),
legend.text = element_markdown(size = 12),
legend.title = element_markdown(size = 12, hjust = 0),
strip.background = element_rect(size = 1, color = "black", fill = "NA"),
panel.grid = element_line(color = "gray95"),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.minor.x = element_blank(),
panel.border = element_rect(color = "black", size = 1, fill = NA)
+
) scale_fill_viridis_d(begin = 0, end = 0.5) +
scale_y_continuous(limits = c(0, 50), expand = expansion(add = c(0, 0))) +
scale_x_discrete(limits = rev) +
coord_flip() +
labs(
y = "Number of samples with > 30 non-detects",
title = "Fertilizer level",
fill = "Favored by..."
)
Note that most of the non-detects that we’re removing are from the non-fertilized group.
Next, we’ll convert the CT values to presence/absence for use later on.
<- data.raw %>%
amoA_presence_absence select(sample_id, starts_with("amoA")) %>%
select(-one_of(removes$amoA)) %>%
mutate(across(starts_with("amoA"), ~ ifelse(.x == 40, 0, 1)))
Calculating the NMDS (positioning the sites):
= metaMDS(data.priming.reduced %>% select(amoA.001:amoA.074), distance = "bray", k = 3)
mds.priming
<- as.data.frame(scores(mds.priming, display = "sites")) %>%
site.scores mutate(sample_id = data.priming.reduced$sample_id,
Crop = data.priming.reduced$crop,
Fert_Level = as.factor(data.priming.reduced$fert_level),
Day = as.factor(data.priming.reduced$doe),
Substrate_Addition = as.factor(data.priming.reduced$addition))
This is enough to plot the NMDS:
<- site.scores %>%
nmds.plot ggplot(aes(NMDS1, NMDS2, fill = Fert_Level)) +
geom_hline(yintercept = 0.0,
colour = "grey",
lty = 2) +
geom_vline(xintercept = 0.0,
colour = "grey",
lty = 2) +
geom_point(aes(shape = Crop), size = 4) +
theme(
plot.title = element_text(hjust = 0.5),
legend.text = element_markdown(size = 12),
legend.title = element_markdown(size = 12, hjust = 0),
axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
panel.grid = element_line(color = "gray95"),
panel.border = element_rect(color = "black", size = 1, fill = NA)
+
) scale_fill_discrete(name = "Fertilizer Level<br>
<span style = 'font-size:8pt;'>
(kg N ha<sup>-1</sup> y<sup>-1</sup>)
</span>") +
scale_shape_manual(labels = c("Corn", "*Miscanthus<br>Gigantus*"),
values = c(21, 24)) +
guides(
fill = guide_legend(override.aes = list(shape = 22, size = 5)),
shape = guide_legend(title = "Crop", override.aes = list(shape = c(16, 17)))
)
We can also add ellipses to it if you’re interested in that:
+
nmds.plot stat_ellipse(aes(color = Fert_Level), size = 1, linetype = "dashed", show.legend = FALSE)
And again with shading:
+
nmds.plot stat_ellipse(aes(color = Fert_Level), size = 1, linetype = "dashed", show.legend = FALSE) +
stat_ellipse(aes(fill = Fert_Level), size = 1, linetype = "dashed", show.legend = FALSE, geom = "polygon", alpha = 0.1)
<- data.priming.reduced %>%
X select(-c(amoA.001:amoA.074))
<- data.priming.reduced %>%
Y select(c(amoA.001:amoA.074))
adonis(Y ~ X$fert_level + X$addition + X$crop + X$timepoint)
Call:
adonis(formula = Y ~ X$fert_level + X$addition + X$crop + X$timepoint)
Permutation: free
Number of permutations: 999
Terms added sequentially (first to last)
Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
X$fert_level 1 1.4426 1.44255 56.092 0.37093 0.001 ***
X$addition 2 0.0327 0.01633 0.635 0.00840 0.714
X$crop 1 0.0882 0.08823 3.431 0.02269 0.028 *
X$timepoint 1 0.0367 0.03671 1.427 0.00944 0.207
Residuals 89 2.2889 0.02572 0.58855
Total 94 3.8890 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
This tells us that fertilization level is very significant and explains ~37% of the variation in our samples. Crop is also a significant factor on community composition, though it only explains 2.3% of the variation.
All the code below does is perform an ANOVA of the gene’s abundance against all the terms and all of their interactions.
<- lapply(colnames(data.priming.reduced %>% select(amoA.001:amoA.074)), function(x) as.formula(paste0(x, " ~ fert_level * crop * timepoint * addition")))
formulae
<- lapply(formulae, function(x) broom::tidy(aov(x, data = data.priming.reduced)))
res names(res) <- format(formulae)
names(res) <- str_sub(names(res), end = 8)
<- lapply(seq_along(res), function(i) res[[i]] %>% mutate(gene = names(res)[[i]])) %>%
anova_results bind_rows() %>%
filter(term != "Residuals") %>%
mutate(sig = case_when(
< 0.05 & p.value > 0.01 ~ "*",
p.value < 0.01 & p.value > 0.001 ~ "**",
p.value < 0.001 ~ "***",
p.value TRUE ~ "NS"
))
Visualization:
%>%
anova_results mutate(gene = str_sub(gene, -3)) %>%
ggplot(aes(gene, term, fill = sig)) +
geom_tile(color = "black") +
coord_equal() +
labs(y = "",
x = "amoA",
title = "Summary of ANOVA results",
fill = "Significance ") +
theme(
plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90, hjust = 0, vjust = 0.5)
+
) scale_fill_viridis_d(option = "magma", direction = -1)
Overall, we see that, again, fertilization level has a significant impact on abundance levels of the individual genes, and it’s not even really that close.. There are other factors that might be worth investigating on a gene-by-gene basis, too, but that’s for later.
Let’s start by visualizing the presence/absence table:
%>%
amoA_presence_absence pivot_longer(cols = amoA.001:amoA.074, names_to = "amoA", values_to = "presence") %>%
mutate(amoA = str_sub(amoA, -3),
presence = as.factor(presence)) %>%
left_join(metadata %>% rownames_to_column(var = "sample_id")) %>%
ggplot(aes(sample_id, amoA, fill = presence)) +
geom_tile(color = "black") +
labs(
x = "",
y = "",
fill = "Species is...",
title = "Presence/absence of species in crop priming samples",
subtitle = "(separated by fertilization level)"
+
) scale_fill_viridis_d(labels = c("Absent", "Present"),
begin = 0, end = 1,
option = "magma") +
theme(
axis.text.x = element_blank(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
strip.text = element_text(size = 10, face = "bold")
+
) scale_y_discrete(limits = rev) +
facet_wrap(~ fert_level, scales = "free")
Joining, by = "sample_id"
<- readxl::read_xlsx(here("data", "amoa_mfp_qpcr_org_accessions.xlsx"), sheet = 5) %>%
amoA_organism_info select(-c(contains(c("forward", "reverse", "notes"))))
Counts of best BLAST hits:
%>%
amoA_organism_info count(best_blast_hits, sort = TRUE)
# A tibble: 25 x 2
best_blast_hits n
<chr> <int>
1 Nitrosolobus multiformis AmoA (amoA) gene 10
2 Nitrosospira sp. En13 AmoA 9
3 Nitrosospira multiformis ATCC 25196 7
4 Nitrosospira sp. Wyke8 AmoA 7
5 Nitrosospira lacus strain APG3 6
6 Nitrosospira sp. Np39-19 6
7 Nitrosospira sp. Wyke2 4
8 Nitrosospira sp. NpAV 3
9 Nitrosomonas sp. JL21 2
10 Nitrosospira briensis 2
# … with 15 more rows
<- amoA_organism_info %>%
amoa_tax_table select(array_name, best_blast_hits) %>%
column_to_rownames(var = "array_name") %>%
tax_table()
rownames(amoa_tax_table) <- amoA_organism_info$array_name
<- phyloseq(
ps otu_table(amoA_presence_absence %>% column_to_rownames(var = "sample_id"), taxa_are_rows = FALSE),
sample_data(metadata),
amoa_tax_table )
How does observed richness and evenness change with treatment level? This is a modified diversity function that does a bunch of nice stuff that phyloseq::estimate_richness
doesn’t do.
<- function(physeq, split=TRUE, measures=NULL){
estimate_richness_mod
if( !split ){
<- taxa_sums(physeq)
OTU else if( split ){
} <- as(otu_table(physeq), "matrix")
OTU if( taxa_are_rows(physeq) ){ OTU <- t(OTU) }
}
= c("Observed", "Chao1", "ACE", "Shannon", "Pielou", "Simpson", "InvSimpson", "SimpsonE", "Fisher")
renamevec names(renamevec) <- c("S.obs", "S.chao1", "S.ACE", "shannon", "pielou", "simpson", "invsimpson", "simpsone", "fisher")
if( is.null(measures) ){
= as.character(renamevec)
measures
}
if( any(measures %in% names(renamevec)) ){
%in% names(renamevec)] <- renamevec[names(renamevec) %in% measures]
measures[measures
}
if( !any(measures %in% renamevec) ){
stop("None of the `measures` you provided are supported. Try default `NULL` instead.")
}
= vector("list")
outlist
= c("Chao1", "Observed", "ACE")
estimRmeas if( any(estimRmeas %in% measures) ){
<- c(outlist, list(t(data.frame(estimateR(OTU)))))
outlist
}if( "Shannon" %in% measures ){
<- c(outlist, list(shannon = diversity(OTU, index="shannon")))
outlist
}if( "Pielou" %in% measures){
#print("Starting Pielou")
<- c(outlist, list(pielou = diversity(OTU, index = "shannon")/log(estimateR(OTU)["S.obs",])))
outlist
}if( "Simpson" %in% measures ){
<- c(outlist, list(simpson = diversity(OTU, index="simpson")))
outlist
}if( "InvSimpson" %in% measures ){
<- c(outlist, list(invsimpson = diversity(OTU, index="invsimpson")))
outlist
}if( "SimpsonE" %in% measures ){
<- c(outlist, list(simpsone = diversity(OTU, index="invsimpson")/estimateR(OTU)["S.obs",]))
outlist
}if( "Fisher" %in% measures ){
= tryCatch(fisher.alpha(OTU, se=TRUE),
fisher warning=function(w){
warning("phyloseq::estimate_richness: Warning in fisher.alpha(). See `?fisher.fit` or ?`fisher.alpha`. Treat fisher results with caution")
suppressWarnings(fisher.alpha(OTU, se=TRUE)[, c("alpha", "se")])
}
)if(!is.null(dim(fisher))){
colnames(fisher)[1:2] <- c("Fisher", "se.fisher")
<- c(outlist, list(fisher))
outlist else {
} <- c(outlist, Fisher=list(fisher))
outlist
}
}= do.call("cbind", outlist)
out
= intersect(colnames(out), names(renamevec))
namechange colnames(out)[colnames(out) %in% namechange] <- renamevec[namechange]
= sapply(paste0("(se\\.){0,}", measures), grep, colnames(out), ignore.case=TRUE)
colkeep = out[, sort(unique(unlist(colkeep))), drop=FALSE]
out
<- as.data.frame(out)
out return(out)
}
<- c("Observed", "Shannon")
metrics <- estimate_richness_mod(ps, measures = metrics) %>%
richness rownames_to_column(var = "sample_id") %>%
mutate(sample_id = str_sub(sample_id, start = 2))
<- left_join(sample_data(ps) %>% data.frame() %>% rownames_to_column(var = "sample_id"), richness) %>%
richness pivot_longer(cols = Observed:Shannon, names_to = "Metric", values_to = "Value")
Joining, by = "sample_id"
Significance test of fertilization level on richness.
<- kruskal.test(Value ~ fert_level, data = richness %>% filter(Metric == "Observed"))) (sig_rich_fert
Kruskal-Wallis rank sum test
data: Value by fert_level
Kruskal-Wallis chi-squared = 54.275, df = 1, p-value = 1.743e-13
The p-value < 0.001
gives us strong statistical evidence that richness is significantly different between fertilization treatment groups.
Significance test of fertilization level on richness
<- kruskal.test(Value ~ fert_level, data = richness %>% filter(Metric == "Shannon"))) (sig_even_fert
Kruskal-Wallis rank sum test
data: Value by fert_level
Kruskal-Wallis chi-squared = 54.331, df = 1, p-value = 1.694e-13
The p-value < 0.001
gives us strong statistical evidence that Shannon diversity is significantly different between fertilization treatment groups.
Standard deviations, mean
<- richness %>%
summaries group_by(Metric, fert_level) %>%
summarize(mean_val = mean(Value),
sd_val = sd(Value) / 4,
.groups = "drop")
<- data.frame(
this_annotation Metric = c("Observed", "Shannon"),
lab = c("***", "***"),
x = 1.5,
y = c(50 + 5, 4 + 0.5),
lineheights = c(50 + 2, 4 + 0.25)
)
%>%
summaries ggplot(aes(fert_level, mean_val, fill = fert_level)) +
geom_col(color = "black", size = 1) +
facet_wrap(~ Metric, scales = "free_y") +
theme(
legend.position = "none",
strip.background = element_blank(),
axis.title.y = element_blank(),
strip.placement = "outside",
plot.title = element_text(hjust = 0.5),
strip.text.y = element_text(face = "bold", size = 10),
strip.text = element_text(face = "bold", size = 10),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_line(color = "gray90", linetype = "dashed"),
axis.ticks = element_blank(),
panel.border = element_rect(color = "black", size = 1, fill = "NA")
+
) scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
geom_errorbar(aes(ymin = mean_val - sd_val, ymax = mean_val + sd_val, width = 0.5)) +
geom_text(
data = this_annotation,
aes(x = x, y = y, label = lab),
inherit.aes = FALSE,
size = 5
+
) geom_segment(data = this_annotation,
aes(x = 1,
xend = 2,
y = lineheights,
yend = lineheights),
inherit.aes = FALSE) +
labs(
x = "Fertilization Level\n(*** = p < 0.001 by Kruskal Wallis)",
title = "Alpha diversity metrics by fertilization level"
)
We’ll start beta diversity analysis off by doing an ADONIS/PERMANOVA to determine if treatment centroids/treatment variations are different between groups.
<- vegdist(otu_table(ps)) dis
Warning in vegdist(otu_table(ps)): you have empty rows: their dissimilarities
may be meaningless in method "bray"
Warning in vegdist(otu_table(ps)): missing values in results
<- sample_data(ps)$fert_level
groups <- betadisper(dis, groups) mod
missing observations due to 'd' removed
anova(mod)
Analysis of Variance Table
Response: Distances
Df Sum Sq Mean Sq F value Pr(>F)
Groups 1 0.57864 0.57864 26.104 1.745e-06 ***
Residuals 92 2.03937 0.02217
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since p <<<< 0.0001
, there is strong evidence that the overall community compositions are significantly different (treatment centroid, distance to centroid, community variation) between the two groups. W can visualize this with a 1 SD ellipse:
plot(mod, ellipse = TRUE, hull = FALSE)
We see that there is clear separation between the two treatment centroids. Let’s do some more analysis on the distance-to-centroids that we’re seeing:
<- data.frame(
betadistances time_frame = mod$group,
distance = mod$distances
)
%>%
betadistances ggplot(aes(time_frame, distance)) +
geom_boxplot(size = 1, outlier.shape = NA) +
geom_jitter(aes(fill = time_frame), size = 5, shape = 21, width = 0.2) +
theme(
legend.position = "none",
panel.grid.minor.x = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_blank(),
# panel.grid.major.y = element_line(color = "gray", linetype = "dashed"),
plot.title = element_text(size = 17),
plot.subtitle = element_text(size = 9),
axis.ticks.length = unit(0.25, "cm"),
# axis.ticks.x = element_line(color = "gray", size = 0.25),
axis.ticks.x = element_blank(),
axis.text.x = element_text(face = "bold", angle = 0, size = 12),
panel.border = element_rect(color = "black", size = 1, fill = NA),
axis.title.x = element_blank(),
axis.title.y = element_text(size = 14, face = "bold"),
# legend.position = "none"
+
) labs(
color = "",
y = "Distance to centroid"
+
) ::geom_signif(
ggsignifmap_signif_level = TRUE,
comparisons = list(c("0", "336")),
test = "t.test",
step_increase = 0.1,
color = "black",
size = 1,
textsize = 5,
tip_length = 0
)
The significance bar is coming from the PERMANOVA test we did above. We see that there is actually less beta diversity (as meaasured by distance-to-centroid) in the fertilized group than in the non- fertilized group. We’ll see another visualization backing this up in the next section:
Let’s visualize the composition of the communities, separated by fertilization. We’ll start with raw counts - how many times was that best BLAST hit seen in that sample?
comp_barplot(ps, "ta1",
facet_by = "fert_level",
sample_order = "default",
tax_transform_for_plot = "identity") +
coord_flip() +
labs(
title = "Sample composition by fertilization level",
subtitle = "(raw counts)"
+
) theme(
axis.text.x = element_blank(),
axis.text.y = element_text(margin = margin(r = -7)),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5, size = 10),
strip.text = element_text(size = 10, face = "bold")
+
) guides(
fill = guide_legend(title = "Best BLAST hit", reverse = TRUE)
)
We see that overall the fertilized group appears to have more richness in it.
How about sample composition? IE, relative abundances?
comp_barplot(ps, "ta1",
facet_by = "fert_level",
sample_order = "default") +
coord_flip() +
labs(
title = "Sample composition by fertilization level",
subtitle = "(relative abundance)"
+
) theme(
axis.text.x = element_blank(),
axis.text.y = element_text(margin = margin(r = -7)),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5, size = 10),
strip.text = element_text(size = 10, face = "bold")
+
) guides(
fill = guide_legend(title = "Best BLAST hit", reverse = TRUE)
)
Two big things pop out:
The next chunk is just doing some data transformation stuff to count the number of times each organism was seen in each sample in preparation for the statistical analysis.
<- ps %>%
pa_count otu_table() %>%
%>%
data.frame rownames_to_column(var = "sample_id") %>%
pivot_longer(amoA.001:amoA.074)
<- tax_table(ps) %>%
org_table %>%
data.frame rownames_to_column(var = "name") %>%
rename(bbh = ta1) %>%
mutate(cleaned_names = janitor::make_clean_names(bbh))
<- left_join(pa_count, org_table, by = "name") %>%
bbh_sample_counts group_by(sample_id, bbh) %>%
summarize(value = sum(value)) %>%
pivot_wider(names_from = "bbh", values_from = value)
`summarise()` has grouped output by 'sample_id'. You can override using the `.groups` argument.
<- left_join(bbh_sample_counts,
bbh_level_counts sample_data(ps) %>%
%>%
data.frame rownames_to_column(var = 'sample_id') %>%
right_join(bbh_sample_counts)
%>%
) ungroup()
Joining, by = "sample_id"
Joining, by = c("sample_id", "Nitrosolobus multiformis AmoA (amoA) gene", "Nitrosospira briensis", "Nitrosospira lacus strain APG3", "Nitrosospira multiformis AmoA (amoA) gene", "Nitrosospira multiformis ATCC 25196", "Nitrosospira sp. 9SS1", "Nitrosospira sp. amoA and amoB", "Nitrosospira sp. En13 AmoA", "Nitrosospira sp. GS832", "Nitrosospira sp. NIJS16", "Nitrosospira sp. Np39-19", "Nitrosospira sp. NpAV", "Nitrosospira sp. PJA1", "Nitrosospira sp. strain Nsp.1", "Nitrosospira sp. strain Nsp.5", "Nitrosospira sp. strain Nsp.5 amoA", "Nitrosospira sp. TCH711", "Nitrosospira sp. Wyke2", "Nitrosospira sp. Wyke8 AmoA", "Nitrosovibrio sp. RY3C", "Nitrospira sp. amoA and amoB genes")
%>%
bbh_level_counts pivot_longer(cols = 2:22)
# A tibble: 1,995 x 9
sample_id fert_level crop timepoint doe addition field_rep name value
<chr> <fct> <fct> <fct> <fct> <fct> <fct> <chr> <dbl>
1 13b 336 corn b 5 control 1 Nitrosol… 6
2 13b 336 corn b 5 control 1 Nitrosos… 1
3 13b 336 corn b 5 control 1 Nitrosos… 2
4 13b 336 corn b 5 control 1 Nitrosos… 1
5 13b 336 corn b 5 control 1 Nitrosos… 5
6 13b 336 corn b 5 control 1 Nitrosos… 1
7 13b 336 corn b 5 control 1 Nitrosos… 1
8 13b 336 corn b 5 control 1 Nitrosos… 3
9 13b 336 corn b 5 control 1 Nitrosos… 0
10 13b 336 corn b 5 control 1 Nitrosos… 1
# … with 1,985 more rows
Here, we’re preparing formulas to feed to a lapply
function to perform a Kruskal-Wallis test on all of the organisms.
<- lapply(colnames(bbh_sample_counts %>% select(-sample_id) %>% janitor::clean_names()) , function(x) as.formula(paste0(x, " ~ fert_level * crop * timepoint * addition"))) formulae
Adding missing grouping variables: `sample_id`
1]] <- NULL
formulae[[
<- lapply(formulae, function(x) broom::tidy(aov(x, data = bbh_level_counts %>% janitor::clean_names())))
res names(res) <- format(formulae)
names(res) <- lapply(names(res), function(x) str_split(x, "~")[[1]][1]) %>% unlist()
<- lapply(seq_along(res), function(i) res[[i]] %>% mutate(gene = names(res)[[i]])) %>%
anova_results.counts bind_rows() %>%
filter(term != "Residuals") %>%
mutate(gene = str_trim(gene))
Visualizing the results again:
%>%
anova_results.counts left_join(org_table, by = c("gene" ="cleaned_names")) %>%
mutate(sig = case_when(
< 0.05 & p.value > 0.01 ~ "*",
p.value < 0.01 & p.value > 0.001 ~ "**",
p.value < 0.001 ~ "***",
p.value TRUE ~ "NS"
%>%
)) ggplot(aes(term, bbh, fill = sig)) +
geom_tile(color = "black") +
labs(y = "",
x = "",
title = "Summary of ANOVA results",
fill = "Significance ") +
theme(
plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
axis.text.y = element_text()
+
) scale_fill_viridis_d(option = "magma", direction = -1) +
scale_y_discrete(limits = rev) +
coord_equal()
We see the same pattern at the organism level as when we did this at the gene level: fertilization level is by far the most significant factor affecting Presence/Absence of organisms.