Clusters
The goal of this analysis is to dig deeper into the ordinations produced in Preliminary Analysis. In particular, we are interested in what appears to be two different clusters within the fertilized group. Our goals are:
- Characterize the communities in the two groups/Determine important taxa
Preparing the data
We repeat the data prep from Preliminary Analysis:
Getting the data ready
# TODO: Refactor into import_data.R
<- read.csv(here("data", "priming_amoA_deltaCt.csv"), header = T) %>%
data.priming rename(sample_id = X)
<- read.csv(here("data", "priming_amoA_rawCt.csv"), header = T) %>%
data.raw rename(sample_id = X)
<- data.priming %>%
data.priming.long pivot_longer(cols = starts_with("amoA"), names_to = "amoA", values_to = "deltaCT")
<- data.raw %>%
data.raw.long pivot_longer(cols = starts_with("amoA"), names_to = "amoA", values_to = "CT")
$sample_id <- fct_reorder(data.priming.long$sample_id, parse_number(data.priming.long$sample_id))
data.priming.long
<- data.priming[, -1]
df rownames(df) <- data.priming[, 1]
<- df %>%
metadata select(fert_level:field_rep) %>%
mutate(across(everything(), as.factor))
<- df %>%
amoa_counts select(starts_with("amoA"))
<- data.raw.long %>%
non_detect_counts group_by(fert_level, amoA) %>%
count(CT == 40) %>%
rename(non_detect = `CT == 40`) %>%
filter(non_detect == TRUE)
<- readxl::read_xlsx(here("data", "amoa_mfp_qpcr_org_accessions.xlsx"), sheet = 5) %>%
amoA_organism_info select(-c(contains(c("forward", "reverse", "notes"))))
<- non_detect_counts %>%
removes pivot_wider(names_from = fert_level, values_from = n, names_prefix = "fert.") %>%
filter(fert.0 > 30 & fert.336 > 30) %>%
pivot_longer(cols = fert.0:fert.336, names_to = "fert_level", values_to = "n")
<- data.priming %>%
data.priming.reduced select(-one_of(removes$amoA))
<- data.priming.reduced %>%
data.priming.reduced.long select(-sample_id, field_rep) %>%
pivot_longer(cols = contains("amoa"))
<- data.raw %>%
amoA_presence_absence select(sample_id, starts_with("amoA")) %>%
mutate(across(starts_with("amoA"), ~ ifelse(.x == 40, 0, 1)))
<- amoA_organism_info %>%
amoa_tax_table select(array_name, best_blast_hits) %>%
column_to_rownames(var = "array_name") %>%
tax_table()
<- function(physeq, split=TRUE, measures=NULL){
estimate_richness_mod
if( !split ){
<- taxa_sums(physeq)
OTU else if( split ){
} <- as(otu_table(physeq), "matrix")
OTU if( taxa_are_rows(physeq) ){ OTU <- t(OTU) }
}
= c("Observed", "Chao1", "ACE", "Shannon", "Pielou", "Simpson", "InvSimpson", "SimpsonE", "Fisher")
renamevec names(renamevec) <- c("S.obs", "S.chao1", "S.ACE", "shannon", "pielou", "simpson", "invsimpson", "simpsone", "fisher")
if( is.null(measures) ){
= as.character(renamevec)
measures
}
if( any(measures %in% names(renamevec)) ){
%in% names(renamevec)] <- renamevec[names(renamevec) %in% measures]
measures[measures
}
if( !any(measures %in% renamevec) ){
stop("None of the `measures` you provided are supported. Try default `NULL` instead.")
}
= vector("list")
outlist
= c("Chao1", "Observed", "ACE")
estimRmeas if( any(estimRmeas %in% measures) ){
<- c(outlist, list(t(data.frame(estimateR(OTU)))))
outlist
}if( "Shannon" %in% measures ){
<- c(outlist, list(shannon = diversity(OTU, index="shannon")))
outlist
}if( "Pielou" %in% measures){
#print("Starting Pielou")
<- c(outlist, list(pielou = diversity(OTU, index = "shannon")/log(estimateR(OTU)["S.obs",])))
outlist
}if( "Simpson" %in% measures ){
<- c(outlist, list(simpson = diversity(OTU, index="simpson")))
outlist
}if( "InvSimpson" %in% measures ){
<- c(outlist, list(invsimpson = diversity(OTU, index="invsimpson")))
outlist
}if( "SimpsonE" %in% measures ){
<- c(outlist, list(simpsone = diversity(OTU, index="invsimpson")/estimateR(OTU)["S.obs",]))
outlist
}if( "Fisher" %in% measures ){
= tryCatch(fisher.alpha(OTU, se=TRUE),
fisher warning=function(w){
warning("phyloseq::estimate_richness: Warning in fisher.alpha(). See `?fisher.fit` or ?`fisher.alpha`. Treat fisher results with caution")
suppressWarnings(fisher.alpha(OTU, se=TRUE)[, c("alpha", "se")])
}
)if(!is.null(dim(fisher))){
colnames(fisher)[1:2] <- c("Fisher", "se.fisher")
<- c(outlist, list(fisher))
outlist else {
} <- c(outlist, Fisher=list(fisher))
outlist
}
}= do.call("cbind", outlist)
out
= intersect(colnames(out), names(renamevec))
namechange colnames(out)[colnames(out) %in% namechange] <- renamevec[namechange]
= sapply(paste0("(se\\.){0,}", measures), grep, colnames(out), ignore.case=TRUE)
colkeep = out[, sort(unique(unlist(colkeep))), drop=FALSE]
out
<- as.data.frame(out)
out return(out)
}
rownames(amoa_tax_table) <- amoA_organism_info$array_name
<- phyloseq(
ps otu_table(amoA_presence_absence %>% column_to_rownames(var = "sample_id"), taxa_are_rows = FALSE),
sample_data(metadata),
amoa_tax_table
)
= metaMDS(data.priming.reduced %>% select(contains("amoa")), distance = "bray", k = 3)
mds.priming
<- as.data.frame(scores(mds.priming, display = "sites")) %>%
site.scores mutate(sample_id = data.priming.reduced$sample_id,
Crop = data.priming.reduced$crop,
Fert_Level = as.factor(data.priming.reduced$fert_level),
Day = as.factor(data.priming.reduced$doe),
Substrate_Addition = as.factor(data.priming.reduced$addition))
<- vegdist(data.priming %>% select(starts_with('amoA')))
dune_dist
<- anosim(dune_dist, data.priming$fert_level)
amoa_anosim
<- envfit(mds.priming, data.priming.reduced %>% select(contains("amoa")), permutations = 999)
mds.spp.fit
<- as.data.frame(scores(mds.spp.fit, display = "vectors"))
spp.scrs <- cbind(spp.scrs, Species = rownames(spp.scrs))
spp.scrs <- cbind(spp.scrs, pval = mds.spp.fit$vectors$pvals)
spp.scrs
<- as.data.frame(scores(mds.spp.fit, display = "vectors")) %>%
spp.scores mutate(Species = rownames(.),
pval = mds.spp.fit$vectors$pvals)
Here is the NMDS in question:
Plotting the NMDS
<- site.scores %>%
nmds.plot ggplot(aes(NMDS1, NMDS2, fill = Fert_Level)) +
geom_hline(yintercept = 0.0,
colour = "grey",
lty = 2) +
geom_vline(xintercept = 0.0,
colour = "grey",
lty = 2) +
geom_point(size = 4, shape = 21) +
theme(
plot.title = element_text(hjust = 0.5),
legend.text = element_markdown(size = 12),
legend.title = element_markdown(size = 12, hjust = 0),
axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
panel.grid = element_line(color = "gray95"),
panel.border = element_rect(color = "black", size = 1, fill = NA)
+
) scale_fill_discrete(name = "Fertilizer Level<br>
<span style = 'font-size:8pt;'>
(kg N ha<sup>-1</sup> y<sup>-1</sup>)
</span>") +
guides(
fill = guide_legend(override.aes = list(shape = 21, size = 5))
)
nmds.plot
Within the fertilized group, there appears to be three interesting clusters:
- One on the left, which we’ll call Cluster 1
- One in the bottom center, which we’ll call Cluster 2
- And the vague one in the bottom right, which we’ll call Cluster 3
We’ll group everything else into Cluster 4 (the “leftovers”). Here’s what that looks like.
Code
<- site.scores %>%
site.scores mutate(Cluster = as.factor(case_when(
< -0.15 ~ 1,
NMDS1 < -0.1 & NMDS1 < 0.05 ~ 2,
NMDS2 < -0.1 & NMDS1 > 0.05 ~ 3,
NMDS2 TRUE ~ 4
)))
%>%
site.scores ggplot(aes(NMDS1, NMDS2, fill = Cluster)) +
geom_hline(yintercept = 0.0,
colour = "grey",
lty = 2) +
geom_vline(xintercept = 0.0,
colour = "grey",
lty = 2) +
geom_point(size = 4, shape = 21) +
theme(
plot.title = element_text(hjust = 0.5),
legend.text = element_markdown(size = 12),
legend.title = element_markdown(size = 12, hjust = 0),
axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
panel.grid = element_line(color = "gray95"),
panel.border = element_rect(color = "black", size = 1, fill = NA)
)
Let’s update the phyloseq object with this cluster info:
sample_data(ps) <- metadata %>% rownames_to_column(var = "sample_id") %>%
left_join(site.scores) %>%
select(-crop) %>%
column_to_rownames("sample_id")
Alpha diversity
We’ll start by looking at differences in alpha diversity between the clusters.
Calculate richness, do pairwise Kruskal-Wallis, and plot
<- c("Observed", "Shannon")
metrics <- estimate_richness_mod(ps, measures = metrics) %>%
richness rownames_to_column(var = "sample_id") %>%
mutate(sample_id = str_sub(sample_id, start = 2))
<- left_join(sample_data(ps) %>% data.frame() %>% rownames_to_column(var = "sample_id"), richness) %>%
richness pivot_longer(cols = Observed:Shannon, names_to = "Metric", values_to = "Value")
<- richness %>%
samples_richness_clusters pivot_wider(names_from = "Metric", values_from = "Value") %>%
select(sample_id, Observed, Shannon, Cluster)
<- crossing(1:4, 1:4, .name_repair = "minimal") %>%
comparisons rename(Cluster_1 = 1, Cluster_2 = 2) %>%
filter(Cluster_1 != Cluster_2) %>%
mutate(group = if_else(Cluster_1 < Cluster_2, paste0(Cluster_1, Cluster_2), paste0(Cluster_2, Cluster_1))) %>%
group_by(group) %>%
slice_head(n = 1) %>%
ungroup() %>%
select(-group)
# Doesn't want to be a function for some reason...
# pairwise_kruskal <- function(metric) {
# apply(comparisons, 1, function(x) broom::tidy(kruskal.test({{ metric }} ~ Cluster, data = samples_richness_clusters %>% filter(Cluster %in% c(x['Cluster_1'], x['Cluster_2']))))) %>%
# bind_rows() %>%
# bind_cols(comparisons) %>%
# mutate(metric = {{ metric }})
# }
#
# pairwise_kruskal("Observed")
<- apply(comparisons, 1, function(x) broom::tidy(kruskal.test(Observed ~ Cluster, data = samples_richness_clusters %>% filter(Cluster %in% c(x['Cluster_1'], x['Cluster_2']))) )) %>%
observed bind_rows() %>%
bind_cols(comparisons) %>%
mutate(metric = "Observed")
<- apply(comparisons, 1, function(x) broom::tidy(kruskal.test(Shannon ~ Cluster, data = samples_richness_clusters %>% filter(Cluster %in% c(x['Cluster_1'], x['Cluster_2']))) )) %>%
shannon bind_rows() %>%
bind_cols(comparisons) %>%
mutate(metric = "Shannon")
<- bind_rows(observed, shannon) %>%
p.values mutate(sig = case_when(
< 0.05 & p.value > 0.01 ~ "*",
p.value < 0.01 & p.value > 0.001 ~ "**",
p.value < 0.001 ~ "***",
p.value TRUE ~ "NS"
))
<- richness %>%
summaries group_by(Metric, Cluster) %>%
summarize(mean_val = mean(Value),
sd_val = sd(Value),
n = n(),
.groups = "drop") %>%
mutate(se = abs((sd_val / sqrt(n)) * qt(0.025, n - 1) )) %>%
mutate(ymax = mean_val + se,
ymin = mean_val - se) %>%
bind_cols(
"labels" = rep(c("a", "b", "c", "d"), 2)
%>%
) mutate(offset = ifelse(Metric == "Observed", 2.2, 0.20))
%>%
summaries ggplot(aes(Cluster, mean_val, fill = Cluster)) +
geom_col(color = "black", size = 1) +
facet_wrap(~ Metric, scales = "free_y") +
theme(
legend.position = "none",
strip.background = element_blank(),
axis.title.y = element_blank(),
strip.placement = "outside",
plot.title = element_text(hjust = 0.5),
strip.text.y = element_text(face = "bold", size = 10),
strip.text = element_text(face = "bold", size = 10),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_line(color = "gray90", linetype = "dashed"),
axis.ticks = element_blank(),
panel.border = element_rect(color = "black", size = 1, fill = "NA")
+
) scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
geom_errorbar(aes(ymin = ymin, ymax = ymax, width = 0.5)) +
geom_text(
aes(y = ymax + offset, label = labels),
size = 5
+
) labs(
x = "Gene cluster",
title = "Alpha diversity metrics by Cluster"
)
So according to our pairwise Kruskal-Wallis, each of the clusters has significantly different observed diversity and shannon diversity. In particular, Cluster 4 (the “leftover” cluster) has significantly lower observed/shannon diversity than Cluster 3 (the “vague” cluster), which has lower diversity than the other two clusters. While I’m inclined to believe that that alpha diversities of 4 is less than the alpha diversities of 3, which is in turn lower than the other two, the 95% CIs overlap for Clusters 1 and 2 and I wouldn’t conclude anything there.
Which genes are different between clusters?
If we were just comparing the abundances between two clusters or groups, I could do a differential barchart like the ones at the bottom of this section. Unfortunately, since we have four groups, that doesn’t quite make sense. For now, we’ll just visualize with a heatmap.
<- ps %>%
all_data_long sample_data() %>%
data.frame() %>%
rownames_to_column(var = "sample_id") %>%
left_join(
%>%
ps otu_table() %>%
data.frame() %>%
rownames_to_column(var = "sample_id")
%>%
) pivot_longer(cols = starts_with("amoA"), names_to = "amoA", values_to = "presence")
%>%
all_data_long group_by(amoA, Cluster) %>%
summarize(mean_presence = mean(presence)) %>%
mutate(amoA = str_sub(amoA, -2)) %>%
ggplot(aes(Cluster, amoA, fill = mean_presence)) +
geom_tile(color = "black", aes()) +
coord_fixed(ratio = 0.4) +
scale_fill_viridis_c(option = "cividis") +
labs(
fill = "Mean Presence"
)
This reinforces our pairwise tests for differences in alpha diversity above - cells in the Cluster 4 column are typically darker than the other columns, indicating that there are fewer genes present there.