knitr::opts_chunk$set(warning=FALSE, message=FALSE, fig.width=15, fig.height=15, dpi=300)
#suppressPackageStartupMessages(library(phyloseq))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(GGally))
suppressPackageStartupMessages(library(ggbiplot))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(magrittr))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(vegan))
suppressPackageStartupMessages(library(ape))
suppressPackageStartupMessages(library(pheatmap))
suppressPackageStartupMessages(library(lme4))
theme_set(theme_bw())
Fetal colonization and establishment of the gut microbiome begins in the placenta even before birth (Gritz et al) and often high interindividual variability is observed in gut microbiomes.
In this analyses, we investigate:
Our anaylses are split across the following:
Using ribotagger on whole metagenome data, we looked across mice gut metagenomes grouped by their diets (always Invivos, Invivos changed to CM from day0, always CM), each diet group consists of with 10 biological replicates.
anno = read.table("../out/init.0103.ribotaggerBatch/biom/sample.anno", h=T, sep="\t")
tab = read.table("../out/init.0103.ribotaggerBatch/biom/sample.tab", h=T, sep="\t")
df = merge(anno, tab, by="tag")
#write.csv(df, file="../out/init.0104.ribotaggerOutput.csv", row.names=F)
#' Convert matrix into long form
longForm = function(tab){
tabLong = tab %>% gather(sample, count, -tag)
sampleDF = strsplit(as.character(tabLong$sample), "\\.") %>%
lapply(function(row){
data.frame(
mouseID = row[2],
food = row[3],
Day = row[4]
)
}) %>% do.call(rbind,.)
cbind(tabLong, sampleDF)
}
#its not possible to install phyloseq on water.
#got it to run on my mac
evenReads = read.csv("../out/init.0104.evenSubsampleReads.csv")
colnames(evenReads) = c("tag", colnames(evenReads)[-1])
evenTabLong = longForm(evenReads)
#processed by script/misc.countReads.pl
# total count
readCount = read.table(file="../out/misc.countReads.out.txt", sep="\t") %>%
setNames(c("sample", "count"))
A total of 1.852542510^{8} reads, 9.299763210^{10} bases sequenced.
We plot the total whole-metagenome reads for each sample
evenTabLong$ID = evenTabLong$sample %>% substr(1, 10)
readCount = evenTabLong %>% select(ID, mouseID, food, Day) %>% unique %>% merge(readCount, by.x="ID", by.y="sample", all=T)
levels(readCount$food) = c("InVivos","Change", "CM")
rcPlot = readCount %>% ggplot(aes(reorder(ID, Day),count)) +
geom_point() +
facet_wrap(~food, scale="free") +
xlab("Sample ID") +
ylab("Read Counts") +
theme(axis.text.x=element_text(angle=90))
#scale_y_log10()
#ggsave(rcPlot, file="../out/init.0104.readCounts.pdf")
#need to merge, get the land information
rcPlot
V4reads = tab %>% select(-tag) %>% apply(1, sum) %>% sum
Of the total reads 67294 were from the V4 region identiified by Ribotagger’s unique tagging.
In Rausch et al, it was discussed that the diversity of bacteria within the the B57BL/6J strains., based on genomic abundances, are dominated by Firmicutes and Bacteroidetes (both of which are Phylum level classifications)
We plot the same plot (Figure 1) Rausch et al at the Phylum level
plot
#connect("metamaps.scelse.nus.edu.sg", 7474, "neo4j", "g0ldenEarlGr3y")
#taxIDInfo = taxname(colnames(testing[[1]]), name=T)
#taxIDInfo = merge(data.frame(name=colnames(testing[[1]]), type="ribotagged"), taxIDInfo, all.x=T)
#taxIDInfo$taxID[6] = 31979
#taxIDInfo_family = taxIDInfo %>% apply(1, function(row){
#if(!is.na(row[3])){
# as.integer(row[3])
# df = as.integer(row[3]) %>% path2kingdom %>% make.data.frame %>% filter(rank == 'phylum')
# if(nrow(df) > 0){
# df$original.name = unname(row[1])
# df$original.taxID = unname(row[3])
# df
# }else{
# data.frame(name=NA, taxid=NA, rank='phylum', original.name=row[1], original.taxID=row[3])
# }
#}else{
# data.frame(name=NA, taxid=NA, rank='phylum', original.name=row[1], original.taxID=row[3])
#}
#}) %>% do.call(rbind,.)
#rownames(taxIDInfo_family) = NULL
#taxIDInfo_family$name[23] = 'Bacteroidetes'
#taxIDInfo_family$name[c(12,24,25)] = 'unclassified'
#' tab the matrix
#' title the title
#' the rank at which to display
barplotPhyla = function(tab, title, taxonLevel)
{
famTag = anno %>% select_("tag", taxonLevel)
tab %<>% merge(famTag, by="tag", all.x=T)
# tab %<>% as.data.table
tab %<>% group_by_("mouseID", "food", "Day", taxonLevel) %>% summarise(count = sum(count)) %>% ungroup
tab %<>% filter(food != 3)
tab$dayInt = as.integer(tab$Day)
colnames(tab) = c("mouseID", "food", "Day", "taxa", "count", "dayInt")
levels(tab$taxa) = c("Unclassified", levels(tab$taxa)[-1])
most2least = tab %>% group_by(taxa) %>% summarise(totCount = sum(count)) %>% arrange(desc(totCount)) %$% taxa %>% as.character
tab$taxa = factor(tab$taxa, levels = most2least)
getPalette = colorRampPalette(brewer.pal(9, "Paired"))
Nfamilies = length(unique(tab$taxa))
tab %<>% arrange(Day, food)
tab %<>% mutate(xaxis = paste0(mouseID, Day))
coloursss = rev(getPalette(Nfamilies))
if(taxonLevel == 'p'){
coloursss[1] = '#6a3d9a'
coloursss[2] = '#ffff99'
}
levels(tab$food) = c("invivos", "change diet", "cm")
list(
thePlot = ggplot(tab, aes(x=xaxis, y=count, fill=taxa)) +
geom_bar(stat="identity", position="stack")+
facet_wrap(~food + Day, drop=T, scale="free", ncol=4)+
scale_fill_manual("Taxonomic Assignment: Family", values = coloursss)+
guides(fill= guide_legend(nrow=5)) +
theme(legend.position="bottom")+
ggtitle(sprintf("Phyla level %s", title)),
df = tab)
}
#p0 = barplotPhyla(tabLong, percent= TRUE, "rawCount")
plotNdf = barplotPhyla(evenTabLong, title="rarefied", taxonLevel="p")
plotNdf$thePlot
phyPlot = plotNdf
plotNdf = barplotPhyla(evenTabLong, title="rarefied", taxonLevel="f")
plotNdf$thePlot
famPlot = plotNdf
Immunological studies begin 50 days after the mice have been transfer into the NUS rearing facilities while fed on new diet (CM),
The immediate question is if this change impacts the gut microbiome and in turn the immune system.
To see how samples cluster across diets, we carried out ordination using Principal Coordinate Analysis based on bray-curtis dissimilarity.
We see strong clustering between mice fed the same diet from birth even after the switch.
Change in diet over the course of 50 days was not enough to bring forth a 3rd cluster.
rownames(evenReads) = evenReads$tag
evenReads %<>% select(-tag)
pcoaDF = evenReads %>% t %>% vegdist() %>% pcoa
smallDF = rownames(pcoaDF$vectors) %>%
strsplit("\\.") %>%
lapply(function(row){
data.frame(
mouseID = row[2],
food = row[3],
Day = row[4]
)
}) %>% do.call(rbind,.)
pcoaDF2 = cbind(smallDF, pcoaDF$vectors[,1:2]) %>% setNames(c("mouse", "food", "day","PC1", "PC2"))
pcoaDF2 %<>% mutate(intday = as.integer(day))
levels(pcoaDF2$food) = c("invivos", "change", "cm")
pcoaDF2 %>% ggplot(aes(PC1, PC2, alpha=intday, shape=food)) +
geom_point()+
scale_alpha_continuous("Day", labels=c(0, 7, 21, 42))
We plotted a PCoA plot fo the samples from , the 1st PC which accounted for 26.9529554%
pcoaDF2 %>% ggplot(aes(PC1, PC2, alpha=intday)) +
geom_point()+
geom_path(aes(group=mouse), alpha=0.2) +
scale_alpha_continuous("Day", labels=c(0, 7, 21, 42)) +
facet_wrap(~food, ncol=1)
pcoaDF2 %>% ggplot(aes(PC1, PC2, alpha=intday)) +
geom_point()+
geom_path(aes(group=mouse), alpha=0.2) +
scale_alpha_continuous("Day", labels=c(0, 7, 21, 42)) +
facet_wrap(~food+mouse, ncol=5)
We cluster mice on the 1st day, there doesnt seem to be any clear grouping
#colnames(evenReads)
type12_Day0 = !grepl('\\.3\\.Day', colnames(evenReads)) & grepl('Day0', colnames(evenReads))
pcoaDF_Day0 = t(evenReads) %>% .[type12_Day0, ] %>% vegdist() %>% pcoa
smallDF_Day0 = rownames(pcoaDF_Day0$vectors) %>%
strsplit("\\.") %>%
lapply(function(row){
data.frame(
mouseID = row[2],
food = row[3],
Day = row[4]
)
}) %>% do.call(rbind,.)
pcoaDF2_Day0 = cbind(smallDF_Day0, pcoaDF_Day0$vectors[,1:2]) %>%
setNames(c("mouse", "food", "day","PC1", "PC2"))
levels(pcoaDF2_Day0$food) = c("invivos", "change")
ggplot(pcoaDF2_Day0, aes(PC1, PC2, color=food, label=mouse)) +
geom_point() +
geom_text(check_overlap= TRUE) +
ggtitle("Day0 for InVivos and Change Diet only")
newDFjust0 = famPlot$df %>% filter(dayInt==1)
newDFjust0$grouping = 0
newDFjust0$grouping[which(newDFjust0$mouseID == 'M03')] = 1
newDFjust0$grouping[which(newDFjust0$mouseID == 'M08')] = 1
newDFjust0$grouping[which(newDFjust0$mouseID == 'M10')] = 1
newDFjust0$grouping[which(newDFjust0$mouseID == 'M13')] = 1
newDFjust0$grouping[which(newDFjust0$mouseID == 'M18')] = 1
#m03, m08, m10, m13, m18
newDFjust0 %>% ggplot(aes(xaxis, count, fill=taxa)) +
geom_bar(stat="identity", position="stack")+
facet_wrap(~grouping, drop=T, scale="free")+
guides(fill= guide_legend(nrow=5)) +
theme(legend.position="bottom", axis.text.x=element_text(angle=90))
#ggtitle(sprintf("Phyla level %s", title))
The groups are largely different for
Blank the rest out in the plot below
To observe, if there was any subtle differences between the Invivos and Switch diet, we carried out PCoA on just InVivos and InVivos2CM.
We plot for all samples on all days, at first glance its hard to tell.
evenReads12 = evenReads[,smallDF$food != 3]
pcoaDF12 = evenReads12 %>%
t %>%
vegdist %>%
pcoa
pcoaDF12_2 = cbind(smallDF[smallDF$food != 3,], pcoaDF12$vectors[,1:2]) %>%
setNames(c("mouse", "food", "day","PC1", "PC2"))
pcoaDF12_2 %<>% mutate(intday = as.integer(day))
levels(pcoaDF12_2$food) = c("invivos", "change", "cm")
pcoaDF12_2 %>% ggplot(aes(PC1, PC2, alpha=intday, shape=food, color=food)) +
geom_point()+
scale_alpha_continuous("Day", labels=c(0, 7, 21, 42)) +
xlab(sprintf("PC1 percentage variance %s", pcoaDF12$values$Rel_corr_eig[1]*100)) +
ylab(sprintf("PC2 percentage variance %s", pcoaDF12$values$Rel_corr_eig[2]*100))
PC2 separates the two diets and the 2 groups from day 14 onlywards. The samples look evenly mixed on day0 as expected.
pcoaDF12_2 %>% ggplot(aes(PC1, PC2, shape=food, color=food)) +
geom_point()+
scale_alpha_continuous("Day", labels=c(0, 7, 21, 42)) +
xlab(sprintf("PC1 percentage variance %s", pcoaDF12$values$Rel_corr_eig[1]*100)) +
ylab(sprintf("PC2 percentage variance %s", pcoaDF12$values$Rel_corr_eig[2]*100)) +
facet_wrap(~day)
#need to include the day not just the mouse's identity
pcoaDF12_2 %<>% mutate(m = paste0(mouse, day))
pcoaDF12_2 %>% ggplot(aes(PC1, PC2, alpha=intday, shape=food)) +
geom_text(aes(label=mouse))+
scale_alpha_continuous("Day", labels=c(0, 7, 21, 42)) +
xlab(sprintf("PC1 percentage variance %s", pcoaDF12$values$Rel_corr_eig[1]*100)) +
ylab(sprintf("PC2 percentage variance %s", pcoaDF12$values$Rel_corr_eig[2]*100)) +
theme(legend.position="bottom") +
guides(color = guide_legend(nrow=4))
We carry out Principle Component Analysis on the count matrix to identify the families responsible for driving the separation which we saw earlier
Due to the way we subsampled the data, families with zero counts across all samples are not included.
#' takes family account
#'
#' params nt matrix rownames have the
#' params anno ribotag information
buildMatrix = function(nt, anno)
{
#append rank name
nt$tag = rownames(nt)
tagNames = rownames(nt) %>% lapply(function(ntTag) {
familyTag = dplyr::filter(anno, tag == ntTag) %$% f
data.frame(tag=ntTag, family = familyTag)
}) %>% do.call(rbind,.)
tagNames$family %<>% as.character
tagNames$family[tagNames$family== ""] = "unclassified"
tt = merge(nt, tagNames)
#Collapse into family
t2 = tt %>% select(-tag) %>%
gather(sample, value, -family) %>%
group_by(family, sample) %>%
summarise(value = sum(value)) %>% ungroup
sampleDF = as.character(t2$sample) %>% strsplit("\\.") %>%
lapply(function(row){
data.frame(
mouseID = row[2],
food = row[3],
Day = row[4]
)
}) %>% do.call(rbind,.)
t2 = cbind(t2, sampleDF)
t2 = t2 %>%
filter(food != 3) %>% #remove Mchemy mouse
spread(family, value)
t3 = t2 %>% select(-mouseID, -food, -Day) %>%
as.matrix %>%
t
newSampleDF = t2 %>% select(mouseID, food, Day)
t4 = t3[-1, ]
colnames(t4) = t3[1,] %>% unlist %>% as.character
t5 = t4 %>% apply(1, function(row) as.numeric(row) )
rownames(t5) = colnames(t4)
list(t5, newSampleDF)
}
testing = buildMatrix(evenReads, anno)
columnSummary = testing[[1]] %>%
apply(2, function(x){
sum(x)
}) %>% data.frame
columnSummary$taxa = make.names(rownames(columnSummary))
colnames(columnSummary) = c("count", "taxa")
remaining = filter(columnSummary, count > 0 ) %$% taxa
removedZero = testing[[1]][,make.names(colnames(testing[[1]])) %in% remaining]
#some of the columns are zero, after subsampling
# this will affect the analysis
speciesPCA = prcomp(removedZero, center=T, scale=T)
g <- ggbiplot(speciesPCA, obs.scale = 1, var.scale = 1,
groups = testing[[2]]$food, ellipse = TRUE,
circle = TRUE)
#ggsave(g, file="../out/init.0104.biplot.pdf", w=20, h=20)
g
Much of the variance is explained by PC1 but it doesnt seem to be separating the two groups.
The differences in the diets seem to be correlated with PC2, with specific taxonomic families associated.
Hence, identified families which are positively and negatively correlated PC2.
pc12 = speciesPCA$rotation[,1:2] %>% as.data.frame #%>% setNames(rownames(speciesPCA$rotation))
pc12$families = rownames(pc12)
invivos = pc12 %>% filter(PC1 < 0 , PC2 > 0) %>% arrange(desc(PC2))
invivos %>% pander::pandoc.table("rmarkdown")
##
## ------------------------------------
## PC1 PC2 families
## ------- ------ ---------------------
## -0.1954 0.4141 Peptostreptococcaceae
##
## -0.1516 0.3025 Liliopsida
##
## -0.2056 0.2701 Erysipelotrichaceae
##
## -0.3548 0.2183 Lactobacillaceae
## ------------------------------------
##
## Table: rmarkdown
change = pc12 %>% filter(PC1 < 0 , PC2 < 0) %>% arrange(PC2)
change %>% pander::pandoc.table("rmarkdown")
##
## ---------------------------------
## PC1 PC2 families
## ------- ------- -----------------
## -0.2029 -0.4316 S24-7
##
## -0.2537 -0.1618 Clostridiaceae 1
##
## -0.2014 -0.118 Coriobacteriaceae
## ---------------------------------
##
## Table: rmarkdown
metaTab = tab %>% select(-tag)
rownames(metaTab) = tab$tag
nmdsOutput = metaTab %>% t %>% metaMDS
#pdf("../out/init.0104.nmds.pdf", w=25, h=10)
plot(nmdsOutput$points[,1], nmdsOutput$points[,2], pch = 20, xlab = "NMDS1", ylab = "NMDS2", cex = 5)
#dev.off()
Too little time points to transform this into a stationary time series, with equal mean and variance.
plotNdf = phyPlot
adf = plotNdf$df %>% select(taxa, xaxis, count) %>% as.data.frame %>% spread(taxa, count)
info = plotNdf$df %>% select(food, Day, xaxis) %>% unique
adf = merge(info, adf, by="xaxis")
adf %<>% as.data.frame
colnames(adf) = make.names(colnames(adf))
phyLimit = min(which(adf[,-c(1:3)] %>% apply(2, sum) == 0)) -1 + 3
adf = adf[,1:phyLimit]
adf %<>% select(-xaxis, -Day)
ggpairs(data=adf %>% select(-Deferribacteres), color='food', title="Phylum")
#Peter wants a simple correlation plot, so we give him simple plots, actually also noticed this, the
m2 = adf %>% filter(food == 'invivos') %>% .[,-1] %>% cor(method="spearman")
m2[is.na(m2)] = 0
write.table(m2, file="../out/init.0104.corrInvivos.csv", row.names=F)
pheatmap(m2[hclust(dist(m2))$order, hclust(dist(m2))$order],
display_numbers=TRUE, main="Invivos, Family Rank ")
m3 = adf %>% filter(food != 'invivos') %>% .[,-1] %>% cor(method="spearman")
m3[is.na(m3)] = 0
write.table(m3, file="../out/init.0104.corrInvivos.csv", row.names=F)
#pdf("../out/init.0104.corrMatrix.pdf", h=10, w=10)
#hclust(dist(m3))$order
pheatmap(m3[hclust(dist(m2))$order, hclust(dist(m2))$order], display_numbers=TRUE, main="ChangeDiet, Family Rank", cluster_rows=F, cluster_cols=F)
#dev.off()
#ggsave(plotNdf$thePlot, file="../out/famLevel.pdf", w=10, h=10)
plotNdf = famPlot
adf = plotNdf$df %>% select(taxa, xaxis, count) %>% as.data.frame %>% spread(taxa, count)
info = plotNdf$df %>% select(food, Day, xaxis) %>% unique
adf = merge(info, adf, by="xaxis")
adf %<>% as.data.frame
colnames(adf) = make.names(colnames(adf))
phyLimit = min(which(adf[,-c(1:3)] %>% apply(2, sum) == 0)) -1 + 3
adf = adf[,1:phyLimit]
adf %<>% select(-xaxis, -Day)
ggpairsplot = ggpairs(data=adf[,1:16], color='food')
#pdf("../out/init.0104.correlationStructure_family_corrPlot.pdf", w=20, h=20)
ggpairsplot
#dev.off()
We note that the correlation is not fair for species with low count and also the distribution may not be normally distributed.
Thus we calculate the distribution of correlation values based on spearman correlation (which is rank based)
numbersSample = famPlot$df %>% group_by(taxa) %>% summarize(cc = sum(count)) %>% arrange(desc(cc))
ppp0 = qplot(x=1:25, y=numbersSample$cc, stat="identity", geom="bar")
#ggsave(ppp0, file="../out/init.0104.byNumbers.pdf")
significant = numbersSample %>% head(n=10) %$% taxa %>% make.names
m2 = adf %>% filter(food == 'invivos') %>% .[,-1] %>% cor(method="spearman")
m2[is.na(m2)] = 0
write.table(m2, file="../out/init.0104.corrInvivos.csv", row.names=F)
#pheatmap(m2, display_numbers=TRUE, main="Invivos, Family Rank ")
pheatmap(m2[hclust(dist(m2))$order, hclust(dist(m2))$order], display_numbers=TRUE, main="Invivos, Family Rank ")
We observe two clusters of positive correlation:
Cluster | Family | Phylum |
---|---|---|
1 | Deferribacteracea | Deferribacteres |
1 | Anaeroplasmatacea | Tenericutes |
1 | vadinBB60 | Fimicutes |
1 | Ruminococcaceae | Firmicutes |
1 | Lachnospiraceae | Firmicutes |
1 | Defluvitaleaceae | Frimicutes |
2 | Clostridiaceae 1 | Firmicutes |
2 | S24.7 | Bacteroidetes |
2 | Coriobacteriaceae | Actinobacteria |
2 | Lactobacillaceae | Firmicutes |
Members within clusters 1 and 2 are positively correlated with each other but there exists a negative correlation between groups.
m3 = adf %>% filter(food != 'invivos') %>% .[,-1] %>% cor(method="spearman")
m3[is.na(m3)] = 0
#write.table(m3, file="../out/init.0104.corrInvivos.csv", row.names=F)
#pdf("../out/init.0104.corrMatrix.pdf", h=10, w=10)
#pheatmap(m3, display_numbers=TRUE, main="ChangeDiet, Family Rank")
pheatmap(m3[hclust(dist(m2))$order, hclust(dist(m2))$order], display_numbers=TRUE, main="ChangeDiet, Family Rank", cluster_rows=F, cluster_cols=F)
#dev.off()
Cluster 1 became less positively correlated whereas cluster2 grew in size, the negative correlation between groups remain.
m2[upper.tri(m2)] = NA
testingM2 = m2 %<>% data.frame
testingM2$family= rownames(testingM2)
testingM2 %<>% gather(family, corr) #%>% filter(corr != NA)
colnames(testingM2) = c("family1", "family2", "corr")
testingM2 = testingM2[!is.na(testingM2$corr),] %>% filter(corr != 1)
#write.csv(testingM2, file="../out/init.0104.correlation.csv", row.names=F)
#highlight correlations which are between families of "decent size"
testingM2$bigNumber = 0
testingM2$bigNumber[testingM2$family1 %in% significant & testingM2$family2 %in% significant] = 1
testingM2$x = "diet"
testingM2$xj <- jitter(as.numeric(factor(testingM2$x)), factor = 10)
testingM2 %<>% mutate(lab = paste(family1, family2, sep="__"))
corrboxplot = ggplot(testingM2,aes(x=x,y=corr)) +
geom_boxplot(outlier.shape = NA) +
geom_point(aes(x=xj,colour=as.factor(bigNumber))) +#+
geom_text(data = filter(testingM2, bigNumber == 1, abs(corr) > 0.4),
aes(x=xj,label=lab, colour=as.factor(bigNumber)), size=2)+
scale_color_discrete("Sample numbers", labels=c("not top10", "top10"))
#ggsave(corrboxplot, file="../out/init.0104.corrboxplot-invivos.pdf", w=10, h=10)
corrboxplot
If we just select for those with correlation values above 0.4:
filter(testingM2, bigNumber == 1, abs(corr) > 0.4) %>%
select(-lab, -xj) %>%
arrange(corr)
m3[upper.tri(m3)] = NA
testingM3 = m3 %<>% data.frame
testingM3$family= rownames(testingM3)
testingM3 %<>% gather(family, corr) #%>% filter(corr != NA)
colnames(testingM3) = c("family1", "family2", "corr")
testingM3 = testingM3[!is.na(testingM3$corr),] %>% filter(corr != 1)
#write.csv(testingM2, file="../out/init.0104.correlation.csv", row.names=F)
#highlight correlations which are between families of "decent size"
testingM3$bigNumber = 0
testingM3$bigNumber[testingM3$family1 %in% significant & testingM3$family2 %in% significant] = 1
testingM3$x = "diet"
testingM3$xj <- jitter(as.numeric(factor(testingM3$x)), factor = 10)
testingM3 %<>% mutate(lab = paste(family1, family2, sep="__"))
corrboxplot = ggplot(testingM3,aes(x=x,y=corr)) +
geom_boxplot(outlier.shape = NA) +
geom_point(aes(x=xj,colour=as.factor(bigNumber))) +#+
geom_text(data = filter(testingM3, bigNumber == 1, abs(corr) > 0.4),
aes(x=xj,label=lab, colour=as.factor(bigNumber)), size=2)+
scale_color_discrete("Sample numbers", labels=c("not top10", "top10"))
ggsave(corrboxplot, file="../out/init.0104.corrboxplot-change.pdf", w=10, h=10)
corrboxplot
filter(testingM3, bigNumber == 1, abs(corr) > 0.4) %>%
select(-lab, -xj) %>%
arrange(corr)
We build mix-effect regression models to model the effect of time and diet on normalised counts of a given bacterial family and account for the random effect of mouse subjects as well as possible interactions subjects have with the diet.
inputDF = famPlot$df %>%
select(-xaxis, food, -Day) %>%
spread(taxa, count)
colnames(inputDF) = make.names(colnames(inputDF))
#model = lmer(Erysipelotrichaceae ~ food + dayInt + (1 + food | mouseID), data = inputDF, REML=T)
#p1 = ggplot(inputDF, aes(mouseID, predictedValues, fill=food)) +
# geom_bar(stat="identity") +
# geom_point(aes(y=Erysipelotrichaceae)) +
# facet_wrap(~dayInt)
#ggsave(p1, file="../out/init.bar.pdf")
#pdf("../out/init.slopes.pdf", w=10, h=10)
regressionInfo = c("Lactobacillaceae", "S24.7", "Unclassified", "Lachnospiraceae", "Erysipelotrichaceae", "Ruminococcaceae", "Clostridiaceae.1", "Anaeroplasmataceae", "vadinBB60", "Coriobacteriaceae") %>% #head(n=1) %>%
lapply(function(fam) {
model = paste0(fam," ~ dayInt + food + (1 | mouseID)") %>%
as.formula %>%
lmer(data = inputDF, REML=T)
inputDF$predictedValues = predict(model)
list(df = inputDF, m=model, family = fam)
})
save(regressionInfo, file="../out/init.0104.regression.rda")
lapply(regressionInfo, function(x) {
x$df %>% ggplot(aes(x=dayInt, y=predictedValues, color=mouseID)) +
geom_line(aes(linetype=food)) +
geom_point(aes_string(y=x$family))+
facet_wrap(~food)+
ggtitle(x$family)
})
#dev.off()
#ggsave(p1, file="../out/init.0104.justIntercept.pdf", w=10,h=10)
# Things to do:
# 1. hunt for day slopes which increased, decreased the most over time
# 2. hunt for diet slopes which increased
# 3. must not be too low a count
Tag level analyses (Incomplete)
Comparing abundance profiles from samples within the same diet groups we observe clusters of positive and negative correlations:
Why Family: It was a good balance between rank resolution and minimizing unclassified taxonomy.
barplot = function(tab, percent=FALSE, title){
justCounts = tab %>% group_by(tag) %>% summarise(summedCount = sum(count)) %>% arrange(desc(summedCount))
N90 = sum(justCounts$summedCount) * 0.9
N90tags = as.character(justCounts[which(cumsum(justCounts$summedCount) <= N90),]$tag)
N90df = filter(tab, tag %in% N90tags)
famTag = anno %>% select(tag, f)
N90df %<>% merge(famTag, by="tag", all.x=T)
N90df = N90df %>% group_by(mouseID, food, Day, f) %>% summarise(count = sum(count))
N90df$dayInt = as.integer(N90df$Day)
levels(N90df$f) = c("Unclassified", levels(N90df$f)[-1])
most2least = N90df %>% group_by(f) %>% summarise(totCount = sum(count)) %>% arrange(desc(totCount)) %$% f %>% as.character
N90df$f = factor(N90df$f, levels = most2least)
getPalette = colorRampPalette(brewer.pal(9, "Set1"))
Nfamilies = length(unique(N90df$f))
if(percent){
N90df %<>% group_by(mouseID, Day, food) %>%
mutate(normCount = count/sum(count))
ggplot(N90df, aes(x=mouseID, y=normCount, fill=f)) +
geom_bar(stat="identity", position="stack")+
facet_wrap(~food+Day, scale="free")+
scale_fill_manual("Taxonomic Assignment: Family", values = getPalette(Nfamilies))+
scale_y_continuous(labels = scales::percent)+
theme(axis.text.x = element_text(angle=45)) +
guides(fill = guide_legend(nrow=4)) +
theme(legend.position="bottom")+
ggtitle(sprintf("Family level N90 %s", title))
}else{
ggplot(N90df, aes(x=mouseID, y=count, fill=f)) +
geom_bar(stat="identity", position="stack")+
facet_wrap(~food+Day, scale="free")+
scale_fill_manual("Taxonomic Assignment: Family", values = getPalette(Nfamilies))+
guides(fill= guide_legend(nrow=4)) +
theme(legend.position="bottom") +
ggtitle(sprintf("Family level N90 %s", title))
}
}
p0 = barplot(tabLong, FALSE, "rawCount")
p0
#ggsave(p0, file="../out/init.0104.plotTime_rawCount.pdf", w=20, h=10)
#p0 = barplot(tabLong, TRUE, "rawCount")
#p0
#ggsave(p0, file="../out/init.0104.plotTime_rawCount_precent.pdf", w=20, h=10)
p1 = barplot(normTabLong, FALSE, "z-score normalised")
p1
#ggsave(p1, file="../out/init.0104.plotTime_adjustedZscore.pdf", w=20, h=10)
#p1 = barplot(normTabLong, TRUE, "normalised")
#p1
#ggsave(p1, file="../out/init.0104.plotTime_adjustedZscore_percent.pdf", w=20, h=10)
We plan to do correlation analyses wrt to time and how the different mice correlated as time changes
There exists several methods to normalise the sequencing read count data, such as Z-score normalization, subsampling aka rarefication, regression based.
In this report, we will be testing out Rarification-Subsampling with phyloseq’s rarefy_even_depth
function
\[z_{standard} = \frac{x - \mu}{\sigma}\] \[z_{modified} = z - min(z)\]
#Standardization
standardizedTab = tab %>% select(-tag) %>% sqrt %>% scale(T,T)
standardizedTab = standardizedTab - min(standardizedTab)
#Normalization:
# Assuming that the downstream analyses do not require a normal distribution, there's no need to do the transformation
# And even if we do, the distribution is not going to be normal
#txtdensity(sqrt(standardizedTab[,1]/sum(standardizedTab[,1])))
normTab = standardizedTab
#Rever back to data.frame and add tags
normTab %<>% as.data.frame
normTab$tag = tab$tag
normTabLong = longForm(normTab)
Subsampling normalization using phyloseq’s rarefy_even_depth
(Used in various multi-sample metagenomics papers)
Proportion normalised RNA seq by taking a proportion fo the total number of reads in a given sequencing lane
Regression based normalisation eg. (using neg bionomial distributions implemented in DESeq2)