knitr::opts_chunk$set(warning=FALSE, message=FALSE, fig.width=15, fig.height=15, dpi=300)
#suppressPackageStartupMessages(library(phyloseq))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(GGally))
suppressPackageStartupMessages(library(ggbiplot))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(magrittr))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(vegan))
suppressPackageStartupMessages(library(ape))
suppressPackageStartupMessages(library(pheatmap))
suppressPackageStartupMessages(library(lme4))
theme_set(theme_bw())

1 Introduction

Fetal colonization and establishment of the gut microbiome begins in the placenta even before birth (Gritz et al) and often high interindividual variability is observed in gut microbiomes.

In this analyses, we investigate:

  1. Inter-individual differences between baby mice fed on same and changing diets (Always Invivos feed, Invivos2CM feed, CM feed)
  2. Effects of changing from one diet to another within the span of 50 days (sampling done on Days 0, 7, 21, 49)

1.1 Background

Our anaylses are split across the following:

  1. Ribosomal DNA variable region tags adjacent to V4 constant region using Ribotagger on whole metagenome data (This report)
  2. Whole metagenome analyses

Using ribotagger on whole metagenome data, we looked across mice gut metagenomes grouped by their diets (always Invivos, Invivos changed to CM from day0, always CM), each diet group consists of with 10 biological replicates.

anno = read.table("../out/init.0103.ribotaggerBatch/biom/sample.anno", h=T, sep="\t")
tab  = read.table("../out/init.0103.ribotaggerBatch/biom/sample.tab", h=T, sep="\t")
df = merge(anno, tab, by="tag")
#write.csv(df, file="../out/init.0104.ribotaggerOutput.csv", row.names=F)
#' Convert matrix into long form
longForm = function(tab){
    tabLong = tab %>% gather(sample, count, -tag)
    sampleDF = strsplit(as.character(tabLong$sample), "\\.") %>%
    lapply(function(row){
        data.frame(
            mouseID = row[2],
            food = row[3],
            Day = row[4]
        )
    }) %>% do.call(rbind,.)
    cbind(tabLong, sampleDF)
}
#its not possible to install phyloseq on water.
#got it to run on my mac
evenReads = read.csv("../out/init.0104.evenSubsampleReads.csv")
colnames(evenReads) = c("tag", colnames(evenReads)[-1])
evenTabLong = longForm(evenReads)

2 Exploratory Analyses

2.1 Read Count

#processed by script/misc.countReads.pl
#   total count
readCount = read.table(file="../out/misc.countReads.out.txt", sep="\t") %>% 
    setNames(c("sample", "count"))

2.1.1 Total reads

A total of 1.852542510^{8} reads, 9.299763210^{10} bases sequenced.

We plot the total whole-metagenome reads for each sample

evenTabLong$ID = evenTabLong$sample %>% substr(1, 10)
readCount = evenTabLong %>% select(ID, mouseID, food, Day) %>% unique %>% merge(readCount, by.x="ID", by.y="sample", all=T)
levels(readCount$food) = c("InVivos","Change", "CM")

rcPlot = readCount %>% ggplot(aes(reorder(ID, Day),count)) + 
    geom_point() + 
    facet_wrap(~food, scale="free") + 
    xlab("Sample ID") +
    ylab("Read Counts") +
    theme(axis.text.x=element_text(angle=90))
    #scale_y_log10()
#ggsave(rcPlot, file="../out/init.0104.readCounts.pdf")
#need to merge, get the land information
rcPlot

2.1.2 V4 reads

V4reads = tab %>% select(-tag) %>% apply(1, sum) %>% sum

Of the total reads 67294 were from the V4 region identiified by Ribotagger’s unique tagging.

In Rausch et al, it was discussed that the diversity of bacteria within the the B57BL/6J strains., based on genomic abundances, are dominated by Firmicutes and Bacteroidetes (both of which are Phylum level classifications)

We plot the same plot (Figure 1) Rausch et al at the Phylum level

plot

plot

2.2 Phylum level Abundance

#connect("metamaps.scelse.nus.edu.sg", 7474, "neo4j", "g0ldenEarlGr3y")
#taxIDInfo = taxname(colnames(testing[[1]]), name=T)
#taxIDInfo = merge(data.frame(name=colnames(testing[[1]]), type="ribotagged"), taxIDInfo, all.x=T)
#taxIDInfo$taxID[6] = 31979

#taxIDInfo_family = taxIDInfo %>% apply(1, function(row){
#if(!is.na(row[3])){
#    as.integer(row[3])
#    df = as.integer(row[3]) %>% path2kingdom %>% make.data.frame %>% filter(rank == 'phylum')
#    if(nrow(df) > 0){
#        df$original.name   = unname(row[1])
#        df$original.taxID  = unname(row[3])
#        df
#    }else{
#        data.frame(name=NA, taxid=NA, rank='phylum', original.name=row[1], original.taxID=row[3])
#    }
#}else{
#        data.frame(name=NA, taxid=NA, rank='phylum', original.name=row[1], original.taxID=row[3])
#}
#}) %>% do.call(rbind,.)
#rownames(taxIDInfo_family) = NULL
#taxIDInfo_family$name[23] = 'Bacteroidetes'
#taxIDInfo_family$name[c(12,24,25)] = 'unclassified'


#' tab the matrix
#' title the title
#' the rank at which to display

barplotPhyla = function(tab, title, taxonLevel)
{
    famTag = anno %>% select_("tag", taxonLevel)
    tab %<>% merge(famTag, by="tag", all.x=T)
#    tab %<>% as.data.table
    tab %<>% group_by_("mouseID", "food", "Day", taxonLevel) %>% summarise(count = sum(count)) %>% ungroup
    tab %<>% filter(food != 3)
    tab$dayInt = as.integer(tab$Day)
colnames(tab) = c("mouseID", "food", "Day", "taxa", "count", "dayInt")
    levels(tab$taxa) = c("Unclassified", levels(tab$taxa)[-1])
    most2least = tab %>% group_by(taxa) %>% summarise(totCount = sum(count)) %>% arrange(desc(totCount)) %$% taxa %>% as.character
    tab$taxa = factor(tab$taxa, levels = most2least)
    getPalette = colorRampPalette(brewer.pal(9, "Paired"))
    Nfamilies = length(unique(tab$taxa))
    tab %<>% arrange(Day, food)
    tab %<>% mutate(xaxis = paste0(mouseID, Day))
    coloursss = rev(getPalette(Nfamilies))

if(taxonLevel == 'p'){
    coloursss[1] = '#6a3d9a'
    coloursss[2] = '#ffff99'
}
    levels(tab$food) = c("invivos", "change diet", "cm")
    list(
        thePlot = ggplot(tab, aes(x=xaxis, y=count, fill=taxa)) +
        geom_bar(stat="identity", position="stack")+
        facet_wrap(~food + Day, drop=T, scale="free", ncol=4)+ 
        scale_fill_manual("Taxonomic Assignment: Family", values = coloursss)+
        guides(fill= guide_legend(nrow=5)) +
        theme(legend.position="bottom")+
        ggtitle(sprintf("Phyla level %s", title)),
    df = tab)
}

#p0 = barplotPhyla(tabLong, percent= TRUE, "rawCount")
plotNdf = barplotPhyla(evenTabLong, title="rarefied", taxonLevel="p")
plotNdf$thePlot

phyPlot = plotNdf

2.3 Family level Abundance

plotNdf = barplotPhyla(evenTabLong, title="rarefied", taxonLevel="f")
plotNdf$thePlot

famPlot = plotNdf

Immunological studies begin 50 days after the mice have been transfer into the NUS rearing facilities while fed on new diet (CM),

The immediate question is if this change impacts the gut microbiome and in turn the immune system.

To see how samples cluster across diets, we carried out ordination using Principal Coordinate Analysis based on bray-curtis dissimilarity.

We see strong clustering between mice fed the same diet from birth even after the switch.

Change in diet over the course of 50 days was not enough to bring forth a 3rd cluster.

rownames(evenReads) = evenReads$tag
evenReads %<>% select(-tag)
pcoaDF = evenReads %>% t %>% vegdist() %>% pcoa
smallDF = rownames(pcoaDF$vectors) %>%
strsplit("\\.") %>%
    lapply(function(row){
        data.frame(
            mouseID = row[2],
            food = row[3],
            Day = row[4]
        )
    }) %>% do.call(rbind,.)

pcoaDF2 = cbind(smallDF, pcoaDF$vectors[,1:2]) %>% setNames(c("mouse", "food", "day","PC1", "PC2"))
pcoaDF2 %<>% mutate(intday = as.integer(day))
levels(pcoaDF2$food) = c("invivos", "change", "cm")

pcoaDF2 %>% ggplot(aes(PC1, PC2, alpha=intday, shape=food)) + 
        geom_point()+
        scale_alpha_continuous("Day", labels=c(0, 7, 21, 42))

We plotted a PCoA plot fo the samples from , the 1st PC which accounted for 26.9529554%

pcoaDF2 %>% ggplot(aes(PC1, PC2, alpha=intday)) + 
        geom_point()+
        geom_path(aes(group=mouse), alpha=0.2) + 
        scale_alpha_continuous("Day", labels=c(0, 7, 21, 42)) +
        facet_wrap(~food, ncol=1)
pcoaDF2 %>% ggplot(aes(PC1, PC2, alpha=intday)) + 
        geom_point()+
        geom_path(aes(group=mouse), alpha=0.2) + 
        scale_alpha_continuous("Day", labels=c(0, 7, 21, 42)) +
        facet_wrap(~food+mouse, ncol=5)

2.4 Day zero

We cluster mice on the 1st day, there doesnt seem to be any clear grouping

#colnames(evenReads)
type12_Day0 = !grepl('\\.3\\.Day', colnames(evenReads)) & grepl('Day0', colnames(evenReads))
pcoaDF_Day0 = t(evenReads) %>% .[type12_Day0, ] %>% vegdist() %>% pcoa

smallDF_Day0 = rownames(pcoaDF_Day0$vectors) %>%
strsplit("\\.") %>%
    lapply(function(row){
        data.frame(
            mouseID = row[2],
            food = row[3],
            Day = row[4]
        )
    }) %>% do.call(rbind,.)

pcoaDF2_Day0 = cbind(smallDF_Day0, pcoaDF_Day0$vectors[,1:2]) %>%
setNames(c("mouse", "food", "day","PC1", "PC2"))
levels(pcoaDF2_Day0$food) = c("invivos", "change")
ggplot(pcoaDF2_Day0, aes(PC1, PC2, color=food, label=mouse)) + 
    geom_point()                        +
    geom_text(check_overlap= TRUE)      +
ggtitle("Day0 for InVivos and Change Diet only")

newDFjust0 = famPlot$df %>% filter(dayInt==1)
newDFjust0$grouping = 0
newDFjust0$grouping[which(newDFjust0$mouseID == 'M03')] = 1
newDFjust0$grouping[which(newDFjust0$mouseID == 'M08')] = 1
newDFjust0$grouping[which(newDFjust0$mouseID == 'M10')] = 1
newDFjust0$grouping[which(newDFjust0$mouseID == 'M13')] = 1
newDFjust0$grouping[which(newDFjust0$mouseID == 'M18')] = 1

#m03, m08, m10, m13, m18
newDFjust0 %>% ggplot(aes(xaxis, count, fill=taxa)) +
        geom_bar(stat="identity", position="stack")+
        facet_wrap(~grouping, drop=T, scale="free")+ 
        guides(fill= guide_legend(nrow=5)) +
        theme(legend.position="bottom", axis.text.x=element_text(angle=90))

        #ggtitle(sprintf("Phyla level %s", title))

The groups are largely different for

  1. Lactobacillaceae
  2. S24-7
  3. Lachnospiraceae

Blank the rest out in the plot below

To observe, if there was any subtle differences between the Invivos and Switch diet, we carried out PCoA on just InVivos and InVivos2CM.

2.5 Just Invivos and Invivos2CM

We plot for all samples on all days, at first glance its hard to tell.

evenReads12 = evenReads[,smallDF$food != 3]
pcoaDF12 = evenReads12    %>%
            t       %>%
            vegdist %>%
            pcoa

pcoaDF12_2 = cbind(smallDF[smallDF$food != 3,], pcoaDF12$vectors[,1:2]) %>%
                setNames(c("mouse", "food", "day","PC1", "PC2"))
pcoaDF12_2 %<>% mutate(intday = as.integer(day))
levels(pcoaDF12_2$food) = c("invivos", "change", "cm")

pcoaDF12_2 %>% ggplot(aes(PC1, PC2, alpha=intday, shape=food, color=food)) +
        geom_point()+
        scale_alpha_continuous("Day", labels=c(0, 7, 21, 42)) +
        xlab(sprintf("PC1 percentage variance %s", pcoaDF12$values$Rel_corr_eig[1]*100)) +
        ylab(sprintf("PC2 percentage variance %s", pcoaDF12$values$Rel_corr_eig[2]*100))

2.5.1 Split according to the day of sampling.

PC2 separates the two diets and the 2 groups from day 14 onlywards. The samples look evenly mixed on day0 as expected.

pcoaDF12_2 %>% ggplot(aes(PC1, PC2, shape=food, color=food)) +
        geom_point()+
        scale_alpha_continuous("Day", labels=c(0, 7, 21, 42)) +
        xlab(sprintf("PC1 percentage variance %s", pcoaDF12$values$Rel_corr_eig[1]*100)) +
        ylab(sprintf("PC2 percentage variance %s", pcoaDF12$values$Rel_corr_eig[2]*100)) + 
        facet_wrap(~day)

#need to include the day not just the mouse's identity
pcoaDF12_2 %<>% mutate(m = paste0(mouse, day))
pcoaDF12_2 %>% ggplot(aes(PC1, PC2, alpha=intday, shape=food)) +
        geom_text(aes(label=mouse))+
        scale_alpha_continuous("Day", labels=c(0, 7, 21, 42)) +
        xlab(sprintf("PC1 percentage variance %s", pcoaDF12$values$Rel_corr_eig[1]*100)) +
        ylab(sprintf("PC2 percentage variance %s", pcoaDF12$values$Rel_corr_eig[2]*100)) +
        theme(legend.position="bottom") +
        guides(color = guide_legend(nrow=4))

2.5.2 What’s driving the two top PCs of the PCoA plot?

We carry out Principle Component Analysis on the count matrix to identify the families responsible for driving the separation which we saw earlier

Due to the way we subsampled the data, families with zero counts across all samples are not included.

#' takes family account
#'
#' params nt matrix rownames have the
#' params anno ribotag information

buildMatrix = function(nt, anno)
{
    #append rank name
    nt$tag = rownames(nt)
    tagNames = rownames(nt) %>% lapply(function(ntTag) {
        familyTag = dplyr::filter(anno, tag == ntTag) %$% f
        data.frame(tag=ntTag, family = familyTag)
    }) %>% do.call(rbind,.)
    tagNames$family %<>% as.character
    tagNames$family[tagNames$family== ""] = "unclassified"
    tt = merge(nt, tagNames)

    #Collapse into family
    t2 = tt %>% select(-tag) %>%
        gather(sample, value, -family) %>%
        group_by(family, sample) %>%
        summarise(value = sum(value)) %>% ungroup

    sampleDF = as.character(t2$sample) %>% strsplit("\\.") %>%
    lapply(function(row){
        data.frame(
            mouseID = row[2],
            food = row[3],
            Day = row[4]
        )
    }) %>% do.call(rbind,.)

    t2 = cbind(t2, sampleDF)

    t2 = t2 %>%
    filter(food != 3) %>% #remove Mchemy mouse
    spread(family, value)

    t3 = t2 %>% select(-mouseID, -food, -Day) %>%
    as.matrix %>%
    t

    newSampleDF = t2 %>% select(mouseID, food, Day)
    t4 = t3[-1, ]

    colnames(t4) = t3[1,] %>% unlist %>% as.character
    t5 = t4 %>% apply(1, function(row) as.numeric(row) )
    rownames(t5) = colnames(t4)
    list(t5, newSampleDF)
}

testing = buildMatrix(evenReads, anno)

columnSummary = testing[[1]] %>%
apply(2, function(x){
    sum(x)
}) %>% data.frame

columnSummary$taxa = make.names(rownames(columnSummary))
colnames(columnSummary) = c("count", "taxa")
remaining = filter(columnSummary, count > 0 ) %$% taxa
removedZero = testing[[1]][,make.names(colnames(testing[[1]])) %in% remaining]

#some of the columns are zero, after subsampling
# this will affect the analysis

speciesPCA = prcomp(removedZero, center=T, scale=T)
g <- ggbiplot(speciesPCA, obs.scale = 1, var.scale = 1, 
        groups = testing[[2]]$food, ellipse = TRUE, 
        circle = TRUE)
#ggsave(g, file="../out/init.0104.biplot.pdf", w=20, h=20)
g

Much of the variance is explained by PC1 but it doesnt seem to be separating the two groups.

The differences in the diets seem to be correlated with PC2, with specific taxonomic families associated.

Hence, identified families which are positively and negatively correlated PC2.

pc12 = speciesPCA$rotation[,1:2] %>% as.data.frame #%>% setNames(rownames(speciesPCA$rotation))
pc12$families = rownames(pc12)

invivos = pc12 %>% filter(PC1 < 0 , PC2 > 0) %>% arrange(desc(PC2))
invivos %>% pander::pandoc.table("rmarkdown")
## 
## ------------------------------------
##   PC1    PC2         families       
## ------- ------ ---------------------
## -0.1954 0.4141 Peptostreptococcaceae
## 
## -0.1516 0.3025      Liliopsida      
## 
## -0.2056 0.2701  Erysipelotrichaceae 
## 
## -0.3548 0.2183   Lactobacillaceae   
## ------------------------------------
## 
## Table: rmarkdown
change  = pc12 %>% filter(PC1 < 0 , PC2 < 0) %>% arrange(PC2)
change %>% pander::pandoc.table("rmarkdown")
## 
## ---------------------------------
##   PC1     PC2       families     
## ------- ------- -----------------
## -0.2029 -0.4316       S24-7      
## 
## -0.2537 -0.1618 Clostridiaceae 1 
## 
## -0.2014 -0.118  Coriobacteriaceae
## ---------------------------------
## 
## Table: rmarkdown
metaTab = tab %>% select(-tag)
rownames(metaTab) = tab$tag
nmdsOutput = metaTab %>% t %>% metaMDS
#pdf("../out/init.0104.nmds.pdf", w=25, h=10)
plot(nmdsOutput$points[,1], nmdsOutput$points[,2], pch = 20, xlab = "NMDS1", ylab = "NMDS2", cex = 5)
#dev.off()

2.6 Correlation structure of the dataset

2.6.1 AIM: To find (anti) correlations between taxon as we travel across time.

Too little time points to transform this into a stationary time series, with equal mean and variance.

2.6.1.1 A. At the Phylum Rank

plotNdf = phyPlot
adf = plotNdf$df %>% select(taxa, xaxis, count) %>% as.data.frame %>% spread(taxa, count)
info = plotNdf$df %>% select(food, Day, xaxis) %>% unique
adf = merge(info, adf, by="xaxis")
adf %<>% as.data.frame
colnames(adf) = make.names(colnames(adf))
phyLimit = min(which(adf[,-c(1:3)] %>% apply(2, sum) == 0)) -1 + 3
adf = adf[,1:phyLimit]
adf %<>% select(-xaxis, -Day)

ggpairs(data=adf %>% select(-Deferribacteres), color='food', title="Phylum")

#Peter wants a simple correlation plot, so we give him simple plots, actually also noticed this, the
m2 = adf %>% filter(food == 'invivos') %>% .[,-1] %>% cor(method="spearman")
m2[is.na(m2)] = 0
write.table(m2, file="../out/init.0104.corrInvivos.csv", row.names=F)
pheatmap(m2[hclust(dist(m2))$order, hclust(dist(m2))$order],
         display_numbers=TRUE, main="Invivos, Family Rank ")

m3 = adf %>% filter(food != 'invivos') %>% .[,-1] %>% cor(method="spearman")
m3[is.na(m3)] = 0
write.table(m3, file="../out/init.0104.corrInvivos.csv", row.names=F)
#pdf("../out/init.0104.corrMatrix.pdf", h=10, w=10)

#hclust(dist(m3))$order
pheatmap(m3[hclust(dist(m2))$order, hclust(dist(m2))$order], display_numbers=TRUE, main="ChangeDiet, Family Rank", cluster_rows=F, cluster_cols=F)

#dev.off()

2.6.1.2 B. At the Family Rank

#ggsave(plotNdf$thePlot, file="../out/famLevel.pdf", w=10, h=10)
plotNdf = famPlot
adf = plotNdf$df %>% select(taxa, xaxis, count) %>% as.data.frame %>% spread(taxa, count)
info = plotNdf$df %>% select(food, Day, xaxis) %>% unique
adf = merge(info, adf, by="xaxis")
adf %<>% as.data.frame
colnames(adf) = make.names(colnames(adf))
phyLimit = min(which(adf[,-c(1:3)] %>% apply(2, sum) == 0)) -1 + 3
adf = adf[,1:phyLimit]
adf %<>% select(-xaxis, -Day)

ggpairsplot = ggpairs(data=adf[,1:16], color='food')
#pdf("../out/init.0104.correlationStructure_family_corrPlot.pdf", w=20, h=20)
ggpairsplot

#dev.off()

2.7 Correlation and Regression

We note that the correlation is not fair for species with low count and also the distribution may not be normally distributed.

Thus we calculate the distribution of correlation values based on spearman correlation (which is rank based)

numbersSample = famPlot$df %>% group_by(taxa) %>% summarize(cc = sum(count)) %>% arrange(desc(cc))
ppp0 = qplot(x=1:25, y=numbersSample$cc, stat="identity", geom="bar")
#ggsave(ppp0, file="../out/init.0104.byNumbers.pdf")
significant = numbersSample %>% head(n=10) %$% taxa %>% make.names

2.7.1 Invivos

m2 = adf %>% filter(food == 'invivos') %>% .[,-1] %>% cor(method="spearman")
m2[is.na(m2)] = 0
write.table(m2, file="../out/init.0104.corrInvivos.csv", row.names=F)
#pheatmap(m2, display_numbers=TRUE, main="Invivos, Family Rank ")
pheatmap(m2[hclust(dist(m2))$order, hclust(dist(m2))$order], display_numbers=TRUE, main="Invivos, Family Rank ")

We observe two clusters of positive correlation:

Cluster Family Phylum
1 Deferribacteracea Deferribacteres
1 Anaeroplasmatacea Tenericutes
1 vadinBB60 Fimicutes
1 Ruminococcaceae Firmicutes
1 Lachnospiraceae Firmicutes
1 Defluvitaleaceae Frimicutes
2 Clostridiaceae 1 Firmicutes
2 S24.7 Bacteroidetes
2 Coriobacteriaceae Actinobacteria
2 Lactobacillaceae Firmicutes

Members within clusters 1 and 2 are positively correlated with each other but there exists a negative correlation between groups.

2.8 Change diet - InVivos2CM

m3 = adf %>% filter(food != 'invivos') %>% .[,-1] %>% cor(method="spearman")
m3[is.na(m3)] = 0
#write.table(m3, file="../out/init.0104.corrInvivos.csv", row.names=F)
#pdf("../out/init.0104.corrMatrix.pdf", h=10, w=10)
#pheatmap(m3, display_numbers=TRUE, main="ChangeDiet, Family Rank")
pheatmap(m3[hclust(dist(m2))$order, hclust(dist(m2))$order], display_numbers=TRUE, main="ChangeDiet, Family Rank", cluster_rows=F, cluster_cols=F)

#dev.off()

Cluster 1 became less positively correlated whereas cluster2 grew in size, the negative correlation between groups remain.

m2[upper.tri(m2)] = NA
testingM2 = m2 %<>% data.frame
testingM2$family= rownames(testingM2)
testingM2 %<>% gather(family, corr) #%>% filter(corr != NA)
colnames(testingM2) = c("family1", "family2", "corr")
testingM2 = testingM2[!is.na(testingM2$corr),] %>% filter(corr != 1)
#write.csv(testingM2, file="../out/init.0104.correlation.csv", row.names=F)

#highlight correlations which are between families of "decent size"
testingM2$bigNumber = 0
testingM2$bigNumber[testingM2$family1 %in% significant & testingM2$family2 %in% significant] = 1
testingM2$x = "diet"
testingM2$xj <- jitter(as.numeric(factor(testingM2$x)), factor = 10)
testingM2 %<>% mutate(lab = paste(family1, family2, sep="__"))
corrboxplot = ggplot(testingM2,aes(x=x,y=corr)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point(aes(x=xj,colour=as.factor(bigNumber))) +#+
  geom_text(data = filter(testingM2, bigNumber == 1, abs(corr) > 0.4), 
        aes(x=xj,label=lab, colour=as.factor(bigNumber)), size=2)+ 
  scale_color_discrete("Sample numbers", labels=c("not top10", "top10"))
#ggsave(corrboxplot, file="../out/init.0104.corrboxplot-invivos.pdf", w=10, h=10)
corrboxplot

If we just select for those with correlation values above 0.4:

filter(testingM2, bigNumber == 1, abs(corr) > 0.4) %>% 
    select(-lab, -xj) %>% 
    arrange(corr)
m3[upper.tri(m3)] = NA
testingM3 = m3 %<>% data.frame
testingM3$family= rownames(testingM3)
testingM3 %<>% gather(family, corr) #%>% filter(corr != NA)
colnames(testingM3) = c("family1", "family2", "corr")
testingM3 = testingM3[!is.na(testingM3$corr),] %>% filter(corr != 1)
#write.csv(testingM2, file="../out/init.0104.correlation.csv", row.names=F)

#highlight correlations which are between families of "decent size"
testingM3$bigNumber = 0
testingM3$bigNumber[testingM3$family1 %in% significant & testingM3$family2 %in% significant] = 1
testingM3$x = "diet"
testingM3$xj <- jitter(as.numeric(factor(testingM3$x)), factor = 10)
testingM3 %<>% mutate(lab = paste(family1, family2, sep="__"))
corrboxplot = ggplot(testingM3,aes(x=x,y=corr)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point(aes(x=xj,colour=as.factor(bigNumber))) +#+
  geom_text(data = filter(testingM3, bigNumber == 1, abs(corr) > 0.4), 
        aes(x=xj,label=lab, colour=as.factor(bigNumber)), size=2)+ 
  scale_color_discrete("Sample numbers", labels=c("not top10", "top10"))
ggsave(corrboxplot, file="../out/init.0104.corrboxplot-change.pdf", w=10, h=10)
corrboxplot
filter(testingM3, bigNumber == 1, abs(corr) > 0.4) %>% 
    select(-lab, -xj) %>% 
    arrange(corr)

We build mix-effect regression models to model the effect of time and diet on normalised counts of a given bacterial family and account for the random effect of mouse subjects as well as possible interactions subjects have with the diet.

inputDF = famPlot$df %>% 
            select(-xaxis, food, -Day) %>% 
            spread(taxa, count)
colnames(inputDF) = make.names(colnames(inputDF))

#model = lmer(Erysipelotrichaceae ~ food + dayInt + (1 + food | mouseID), data = inputDF, REML=T)
#p1 = ggplot(inputDF, aes(mouseID, predictedValues, fill=food)) + 
#    geom_bar(stat="identity") +
#    geom_point(aes(y=Erysipelotrichaceae)) + 
#    facet_wrap(~dayInt)
#ggsave(p1, file="../out/init.bar.pdf")


#pdf("../out/init.slopes.pdf", w=10, h=10)
regressionInfo = c("Lactobacillaceae", "S24.7",                "Unclassified",         "Lachnospiraceae",      "Erysipelotrichaceae", "Ruminococcaceae",      "Clostridiaceae.1",     "Anaeroplasmataceae",   "vadinBB60", "Coriobacteriaceae") %>% #head(n=1) %>%
lapply(function(fam) {
    model = paste0(fam," ~  dayInt + food + (1 | mouseID)") %>%
            as.formula                                      %>%
            lmer(data = inputDF, REML=T)
    inputDF$predictedValues = predict(model)
    list(df = inputDF, m=model, family = fam)
})

save(regressionInfo, file="../out/init.0104.regression.rda")

lapply(regressionInfo, function(x) {
x$df %>% ggplot(aes(x=dayInt, y=predictedValues, color=mouseID)) +
    geom_line(aes(linetype=food)) +
    geom_point(aes_string(y=x$family))+
    facet_wrap(~food)+
    ggtitle(x$family)
})
#dev.off()
#ggsave(p1, file="../out/init.0104.justIntercept.pdf", w=10,h=10)
# Things to do:
# 1. hunt for day slopes which increased, decreased the most over time
# 2. hunt for diet slopes which increased 
# 3. must not be too low a count

Tag level analyses (Incomplete)

Comparing abundance profiles from samples within the same diet groups we observe clusters of positive and negative correlations:

Why Family: It was a good balance between rank resolution and minimizing unclassified taxonomy.

barplot = function(tab, percent=FALSE, title){
    justCounts  = tab %>% group_by(tag) %>% summarise(summedCount = sum(count)) %>% arrange(desc(summedCount))
    N90         = sum(justCounts$summedCount) * 0.9
    N90tags     = as.character(justCounts[which(cumsum(justCounts$summedCount) <= N90),]$tag)
    N90df = filter(tab, tag %in% N90tags)
    famTag = anno %>% select(tag, f)
    N90df %<>% merge(famTag, by="tag", all.x=T)
    N90df = N90df %>% group_by(mouseID, food, Day, f) %>% summarise(count = sum(count))
    N90df$dayInt = as.integer(N90df$Day)

    levels(N90df$f) = c("Unclassified", levels(N90df$f)[-1])
    most2least = N90df %>% group_by(f) %>% summarise(totCount = sum(count)) %>% arrange(desc(totCount)) %$% f %>% as.character
    N90df$f = factor(N90df$f, levels = most2least)

    getPalette = colorRampPalette(brewer.pal(9, "Set1"))
    Nfamilies = length(unique(N90df$f))

    if(percent){
        N90df %<>% group_by(mouseID, Day, food) %>%
            mutate(normCount = count/sum(count))

    ggplot(N90df, aes(x=mouseID, y=normCount, fill=f)) +
        geom_bar(stat="identity", position="stack")+
        facet_wrap(~food+Day, scale="free")+ 
        scale_fill_manual("Taxonomic Assignment: Family", values = getPalette(Nfamilies))+
        scale_y_continuous(labels = scales::percent)+
        theme(axis.text.x = element_text(angle=45)) +
        guides(fill = guide_legend(nrow=4)) +
        theme(legend.position="bottom")+
        ggtitle(sprintf("Family level N90 %s", title))
    }else{
    ggplot(N90df, aes(x=mouseID, y=count, fill=f)) +
        geom_bar(stat="identity", position="stack")+
        facet_wrap(~food+Day, scale="free")+
        scale_fill_manual("Taxonomic Assignment: Family", values = getPalette(Nfamilies))+
        guides(fill= guide_legend(nrow=4)) +
        theme(legend.position="bottom") +
        ggtitle(sprintf("Family level N90 %s", title))
    }
}

p0 = barplot(tabLong, FALSE, "rawCount")
p0
#ggsave(p0, file="../out/init.0104.plotTime_rawCount.pdf", w=20, h=10)
#p0 = barplot(tabLong, TRUE, "rawCount")
#p0
#ggsave(p0, file="../out/init.0104.plotTime_rawCount_precent.pdf", w=20, h=10)
p1 = barplot(normTabLong, FALSE, "z-score normalised")
p1
#ggsave(p1, file="../out/init.0104.plotTime_adjustedZscore.pdf", w=20, h=10)
#p1 = barplot(normTabLong, TRUE, "normalised")
#p1
#ggsave(p1, file="../out/init.0104.plotTime_adjustedZscore_percent.pdf", w=20, h=10)

We plan to do correlation analyses wrt to time and how the different mice correlated as time changes

3 Methods

3.1 Standardization and Normalization

There exists several methods to normalise the sequencing read count data, such as Z-score normalization, subsampling aka rarefication, regression based.

In this report, we will be testing out Rarification-Subsampling with phyloseq’s rarefy_even_depth function

  • Z-score normalisation on sqrt data (Assuming equal variance)

\[z_{standard} = \frac{x - \mu}{\sigma}\] \[z_{modified} = z - min(z)\]

#Standardization
    standardizedTab = tab %>% select(-tag) %>% sqrt %>% scale(T,T)
    standardizedTab = standardizedTab - min(standardizedTab)
#Normalization:
# Assuming that the downstream analyses do not require a normal distribution, there's no need to do the transformation
# And even if we do, the distribution is not going to be normal
#txtdensity(sqrt(standardizedTab[,1]/sum(standardizedTab[,1])))
    normTab = standardizedTab
#Rever back to data.frame and add tags
    normTab %<>% as.data.frame
    normTab$tag = tab$tag
normTabLong = longForm(normTab)
  • Subsampling normalization using phyloseq’s rarefy_even_depth (Used in various multi-sample metagenomics papers)

    • the documentation actually discourages against the use of this method although its popularity and suggests using negative bionomial based regression techniques.
  • Proportion normalised RNA seq by taking a proportion fo the total number of reads in a given sequencing lane

  • Regression based normalisation eg. (using neg bionomial distributions implemented in DESeq2)

4 Remaining analyses

  • Table for the mapping of families to the phyla
  • daniel said prevotella and ??? a something (genus rank are extrememly variable)

5 References

  1. Gritz, E. C., & Bhandari, V. (2015). The Human Neonatal Gut Microbiome: A Brief Review. Frontiers in Pediatrics, 3(March), 1–12. http://doi.org/10.3389/fped.2015.00017
  2. Rausch, P., Basic, M., Batra, A., Bischoff, S. C., Blaut, M., Clavel, T., … Baines, J. F. (2016). Analysis of factors contributing to variation in the C57BL/6J fecal microbiota across German animal facilities. International Journal of Medical Microbiology. http://doi.org/10.1016/j.ijmm.2016.03.004