Skip to content

Latest commit

 

History

History
657 lines (569 loc) · 29.6 KB

IBD_Twins_Microbiome_Utrecht_Groningen.md

File metadata and controls

657 lines (569 loc) · 29.6 KB

Healthy cotwins share gut microbiome signatures with their inflammatory bowel disease twins and unrelated patients

Author of code: Marjolein A.Y. Klaassen, Ranko Gacesa, Eelco C. Brand

Year: 2020

Aim: Study gut microbiomes of IBD-discordant and -concordant twin pairs, which offers the unique opportunity to assess individuals at risk of developing IBD, namely healthy cotwins from IBD-discordant twin pairs.


1a. Filtering steps (taxa)

# ================================================================================
# ================================================================================
# function for filtering microbiome (metaphlan) results
# - takes dataframe (any)
# - replaces NAs with 0.0
# - filters OUT taxa present in less then presPerc samples
# - filters OUT taxa with mean releative abundance < minMRelAb
# - filters OUT taxa with median relative abundance < minMedRelAb
# - filters OUT taxonomic levels not in keepLevels ()
#   -> keepLevels should be vector of following: T = strain, S = species, G = Genera, F = families
#                                                O = Orders, C = classes, P = phyla, K = Kingdoms
#   -> example: keepLevels=c('S','G') keeps only species and genera
# - filters OUT domains not in keepDomains: 
#   -> input is vector of following: Eukaryota, Bacteria, Viruses, Archaea
#   -> example: keepDomains=c('B') keeps only bacteria, removes viruses, archaea and eukarya
# - if rescaleTaxa = True, rescales relative abundancies after filtering
# returns modified dataframe
# NOTES:
# - assumes metaphlan encoding (k__<kingdom>.o__<order> ... ); it will mess stuff
# up if non-metagenome rows are encoded like this!
# DEFAULTS: 
# - removes non-bacteria
# - keeps all except kingdoms
# ================================================================================

inDF = relative_abundance_data_frame

filterMetaGenomeDF <- function(inDF,presPerc = 0.1,minMRelAb = 0.01,minMedRelAb=0.0, rescaleTaxa=F,verbose=T,
                               keepDomains=c('Bacteria','Archaea'),
                               keepLevels=c('T','S','G','F','O','C','P')) {
  
  tCols = grep('k__',colnames(inDF)) # colums with microbiome
  tColsNMG = grep('k__',colnames(inDF),invert = T) # colums with microbiome
  # replaces NAs in microbiome with 0s
  for (c in tCols) {
    inDF[,c][is.na(inDF[,c])] <- 0.0
  }
  # filter for presence
  # -----------------
  nrRemoved = 0
  toRemove = c()
  for (c in tCols) {
    nrnZ = as.numeric(sum(inDF[,c]!=0.0))
    if ( (nrnZ/as.numeric(nrow(inDF))) < presPerc) {
      if (verbose) {
        print (paste('col',c,': ',colnames(inDF)[c],'; nr non Zero:',nrnZ,'=',nrnZ/as.numeric(nrow(inDF)),'>> Column removed!'))
      }
      nrRemoved = nrRemoved + 1
      toRemove <- c(toRemove,c)
    }
  }
  if (length(toRemove) > 0) {
    inDF <- inDF[,-toRemove]
  }
  tCols = grep('^[dgtspcfko]__',colnames(inDF)) # colums with microbiome
  if (verbose) {print (paste(' > presence filter: Removed',nrRemoved,'taxa!, ',length(tCols),'taxa left!')); }
  
  # filter for abundance (mean)
  # ---------------------------
  nrRemoved = 0
  toRemove = c()
  for (c in tCols) {
    mn = mean(inDF[,c])
    if ( mn < minMRelAb) {
      #if (verbose) {
      #  print (paste('col',c,': ',colnames(inDF)[c],'; mean rel abundance:',mn,' >> Column removed!'))
      #}
      nrRemoved = nrRemoved + 1
      toRemove <- c(toRemove,c)
    }
  }
  if (length(toRemove) > 0) {
    inDF <- inDF[,-toRemove]
  }
  tCols = grep('k__',colnames(inDF)) # colums with microbiome
  if (verbose) {print (paste(' > mean abundance filter: Removed',nrRemoved,'taxa!, ',length(tCols),'taxa left!')); }
  
  # filter for abundance (median)
  # -----------------------------
  nrRemoved = 0
  toRemove = c()
  for (c in tCols) {
    mn = median(inDF[,c])
    if ( mn < minMedRelAb) {
      #if (verbose) {
      #  print (paste('col',c,': ',colnames(inDF)[c],'; median rel abundance:',mn,' >> Column removed!'))
      #}
      nrRemoved = nrRemoved + 1
      toRemove <- c(toRemove,c)
    }
  }
  if (length(toRemove) > 0) {
    inDF <- inDF[,-toRemove]
  }
  if (verbose) {print (paste(' > median abundance filter: Removed',nrRemoved,'taxa!, ',length(tCols),'taxa left!')); }  
  
  # keep domains
  # -----------------------------
  if (keepDomains!='All' & keepDomains !="") {
    toKeep = grep('k__',colnames(inDF),invert = T)
    for (d in keepDomains) {
      #d print(d)
      toKeep = c(toKeep,grep(paste('k__',d,sep=''),colnames(inDF)))
    }
    inDF <- inDF[,toKeep]
  }
  
  # remove taxonomic levels
  # -----------------------------
  inDFnonTaxa <- as.data.frame(inDF[,grep('k__',colnames(inDF),invert=T)])
  colnames(inDFnonTaxa) <- colnames(inDF)[grep('k__',colnames(inDF),invert=T)]
  inDF2 <- inDF[,grep('k__',colnames(inDF),invert=F)]
  # pick strains (T)
  taxaTCols <- grep('t__',colnames(inDF2))
  taxaT <- inDF2[,taxaTCols]
  if (length(taxaT) > 0) {inDF2 <- inDF2[,-taxaTCols]}
  # pick species (S)
  taxaSCols <- grep('s__',colnames(inDF2))
  taxaS <- inDF2[,taxaSCols]
  if (length(taxaS) > 0) {inDF2 <- inDF2[,-taxaSCols]}
  # pick genera (G)
  taxaGCols <- grep('g__',colnames(inDF2))
  taxaG <- inDF2[,taxaGCols]
  if (length(taxaG) > 0) {inDF2 <- inDF2[,-taxaGCols]}
  # pick families (F)
  taxaFCols <- grep('f__',colnames(inDF2))
  taxaF <- inDF2[,taxaFCols]
  if (length(taxaF) > 0) {inDF2 <- inDF2[,-taxaFCols]}
  # pick orders (O)
  taxaOCols <- grep('o__',colnames(inDF2))
  taxaO <- inDF2[,taxaOCols]
  if (length(taxaO) > 0) {inDF2 <- inDF2[,-taxaOCols]}
  # pick classes (C)
  taxaCCols <- grep('c__',colnames(inDF2))
  taxaC <- inDF2[,taxaCCols]
  if (length(taxaC) > 0) {inDF2 <- inDF2[,-taxaCCols]}
  # pick phyla (P)
  taxaPCols <- grep('p__',colnames(inDF2))
  taxaPColsKeep <- grep('p__',colnames(inDF2),invert = T)
  taxaPColsKeepN <- colnames(inDF2)[grep('p__',colnames(inDF2),invert = T)]
  taxaP <- inDF2[,taxaPCols]
  if (length(taxaP) > 0) {inDF2 <- as.data.frame(inDF2[,taxaPColsKeep])}
  colnames(inDF2) <- taxaPColsKeepN
  taxaK <- inDF2
  # pick 
  oDF <- inDFnonTaxa
  if (verbose) {print ('Keeping following taxonomic levels:'); print(keepLevels)}
  if ('K' %in% keepLevels) {
    if (verbose){print(paste(' -> kept',ncol(taxaK),'Kingdoms'))}
    if (rescaleTaxa) {taxaK <- taxaK/rowSums(taxaK)}
    oDF <- cbind(oDF,taxaK)}
  if ('P' %in% keepLevels) {
    if (verbose){print(paste(' -> kept',ncol(taxaP),'Phyla'))} 
    if (rescaleTaxa) {taxaP <- taxaP/rowSums(taxaP)}
    oDF <- cbind(oDF,taxaP)}
  if ('C' %in% keepLevels) {
    if (verbose) {print(paste(' -> kept',ncol(taxaC),'Classes'))} 
    if (rescaleTaxa) {taxaC <- taxaC/rowSums(taxaC)}
    oDF <- cbind(oDF,taxaC)}
  if ('O' %in% keepLevels) {
    if (verbose){print(paste(' -> kept',ncol(taxaO),'Orders'))}
    if (rescaleTaxa) {taxaO <- taxaO/rowSums(taxaO)}
    oDF <- cbind(oDF,taxaO)}
  if ('F' %in% keepLevels) {
    if (verbose){print(paste(' -> kept',ncol(taxaF),'Families'))}
    if (rescaleTaxa) {taxaF <- taxaF/rowSums(taxaF)}
    oDF <- cbind(oDF,taxaF)}
  if ('G' %in% keepLevels) {if (verbose){ print(paste(' -> kept',ncol(taxaG),'Genera'))}
    if (rescaleTaxa) {taxaG <- taxaG/rowSums(taxaG)}
    oDF <- cbind(oDF,taxaG)}
  if ('S' %in% keepLevels) {if (verbose){print(paste(' -> kept',ncol(taxaS),'Species'))}
    if (rescaleTaxa) {taxaS <- taxaS/rowSums(taxaS)}
    oDF <- cbind(oDF,taxaS)}
  if ('T' %in% keepLevels) {if (verbose){print(paste(' -> kept',ncol(taxaT),'Strains'))}
    if (rescaleTaxa) {taxaT <- taxaT/rowSums(taxaT)}
    oDF <- cbind(oDF,taxaT)}
  
  if (verbose) {print ('data processing done, returning Dataframe')}
  oDF
}

# Using the aforementioned function on our data 
Taxa_filtered_dataframe <- filterMetaGenomeDF(taxa_dataframe,presPerc = -1,minMRelAb = -1,minMedRelAb = -1,rescaleTaxa = T,verbose = T,keepDomains = c("Bacteria"),keepLevels = c("S"))



# ================================================================================

1b. Filtering steps (pathways)

inDF = relative_abundance_data_frame

# In MetaCyc, the names of most pathways start with "PWY_.....". However, part of the pathways do not start with this prefix, and could therefore be missed by our function. Therefore, we added "PWY_" to all pathways (resulting in some pathways having "PWY_PWY_" as prefix). 
colnames(inDF) = paste0('PWY_', colnames(inDF))

filterHumannDF <- function(inDF,presPerc = 0.05,minMRelAb = 0.001,minMedRelAb=0.0,minSum=90.0, rescale=T,verbose=T,type='MetaCyc') {
  
  nonPWYpwys <- c("ARG+POLYAMINE-SYN: superpathway of arginine and polyamine biosynthesis",
                  "CHLOROPHYLL-SYN: chlorophyllide a biosynthesis I (aerobic, light-dependent)",
                  "GLYCOLYSIS-E-D: superpathway of glycolysis and Entner-Doudoroff",
                  "GLYCOLYSIS-TCA-GLYOX-BYPASS: superpathway of glycolysis, pyruvate dehydrogenase, TCA, and glyoxylate bypass",
                  "GLYCOLYSIS: glycolysis I (from glucose 6-phosphate)",
                  "GLYOXYLATE-BYPASS: glyoxylate cycle",
                  "HEME-BIOSYNTHESIS-II: heme biosynthesis I (aerobic)",
                  "MANNOSYL-CHITO-DOLICHOL-BIOSYNTHESIS: protein N-glycosylation (eukaryotic, high mannose)",
                  "NAD-BIOSYNTHESIS-II: NAD salvage pathway II",                  
                  "REDCITCYC: TCA cycle VIII (helicobacter)",
                  "TCA-GLYOX-BYPASS: superpathway of glyoxylate bypass and TCA",
                  "TCA: TCA cycle I (prokaryotic)")
  
  colnames(inDF)[colnames(inDF) %in% nonPWYpwys] <- paste0('PWY_',colnames(inDF)[colnames(inDF) %in% nonPWYpwys])
  
  if (type=='MetaCyc') {
    nonPWYdf <- as.data.frame(inDF[,-grep('PWY',colnames(inDF))])
    cnsNonPWYdf <- colnames(inDF[colnames(inDF)[-grep('PWY',colnames(inDF))] ])
  } else if (type=='EC') {
    nonPWYdf <- as.data.frame(inDF[,-grep('^EC_',colnames(inDF))])
    cnsNonPWYdf <- colnames(inDF[colnames(inDF)[-grep('^EC_',colnames(inDF))] ])
  } else if (type=='RXN') {
    nonPWYdf <- as.data.frame(inDF[,-grep('RXN',colnames(inDF))])
    cnsNonPWYdf <- colnames(inDF[colnames(inDF)[-grep('RXN',colnames(inDF))] ])
  } else if (type=='PFAM') {
    nonPWYdf <- as.data.frame(inDF[,-grep('^PF[01]',colnames(inDF))])
    cnsNonPWYdf <- colnames(inDF[colnames(inDF)[-grep('^PF[01]',colnames(inDF))] ])
  } else if (type=='GO') {
    nonPWYdf <- as.data.frame(inDF[,-grep('^GO',colnames(inDF))])
    cnsNonPWYdf <- colnames(inDF[colnames(inDF)[-grep('^GO',colnames(inDF))] ])
  } else if (type=='KEGG') {
    nonPWYdf <- as.data.frame(inDF[,-grep('^K[012]',colnames(inDF))])
    cnsNonPWYdf <- colnames(inDF[colnames(inDF)[-grep('^K[012]',colnames(inDF))] ])
  }
  colnames(nonPWYdf) <- cnsNonPWYdf
  if (type=='MetaCyc') {
    yesPWYdf <- as.data.frame(inDF[,grep('PWY',colnames(inDF))])
    cnsYesPWYdf <- colnames(inDF[colnames(inDF)[grep('PWY',colnames(inDF))] ])
  } else if (type=='EC') {
    yesPWYdf <- as.data.frame(inDF[,grep('^EC_',colnames(inDF))])
    cnsYesPWYdf <- colnames(inDF[colnames(inDF)[grep('^EC_',colnames(inDF))] ])
  } else if (type=='RXN') {
    yesPWYdf <- as.data.frame(inDF[,grep('RXN',colnames(inDF))])
    cnsYesPWYdf <- colnames(inDF[colnames(inDF)[grep('RXN',colnames(inDF))] ])
  } else if (type=='PFAM') {
    yesPWYdf <- as.data.frame(inDF[,grep('^PF[01]',colnames(inDF))])
    cnsYesPWYdf <- colnames(inDF[colnames(inDF)[grep('^PF[01]',colnames(inDF))] ])
  } else if (type=='GO') {
    yesPWYdf <- as.data.frame(inDF[,grep('^GO',colnames(inDF))])
    cnsYesPWYdf <- colnames(inDF[colnames(inDF)[grep('^GO',colnames(inDF))] ])
  } else if (type=='KEGG') {
    yesPWYdf <- as.data.frame(inDF[,grep('^K[012]',colnames(inDF))])
    cnsYesPWYdf <- colnames(inDF[colnames(inDF)[grep('^K[012]',colnames(inDF))] ])
  }
  
  # replaces NAs with 0s
  for (c in colnames(yesPWYdf)) {
    yesPWYdf[,c][is.na(yesPWYdf[,c])] <- 0.0
  }
  # rescale to rel ab (if rescale = T)
  if (rescale==T) {
    if (verbose) {print ('  >> rescaling')}
    rsums <- rowSums(yesPWYdf)
    rsums[rsums==0] <- 1.0
    yesPWYdf <- yesPWYdf/rsums
  }
  
  # filter for presence
  # -----------------
  nrRemoved = 0
  toRemove = c()
  for (c in colnames(yesPWYdf)) {
    nrnZ = as.numeric(sum(yesPWYdf[,c]!=0.0))
    if (nrnZ/as.numeric(nrow(yesPWYdf)) < presPerc) {
      nrRemoved = nrRemoved + 1
      toRemove <- c(toRemove,c)
    }
  }
  if (length(toRemove) > 0) {
    yesPWYdf <- yesPWYdf[,!(colnames(yesPWYdf) %in% toRemove)]
  }
  if (verbose) {print (paste(' > presence filter: Removed',nrRemoved,'pathways!, ',length(colnames(yesPWYdf)),'pathways left!')); }
  
  # filter for abundance (mean)
  # ---------------------------
  nrRemoved = 0
  toRemove = c()
  for (c in colnames(yesPWYdf)) {
    mn = mean(yesPWYdf[,c])
    if ( mn < minMRelAb) {
      nrRemoved = nrRemoved + 1
      toRemove <- c(toRemove,c)
    }
  }
  if (length(toRemove) > 0) {
    yesPWYdf <- yesPWYdf[,!(colnames(yesPWYdf) %in% toRemove)]
  }
  if (verbose) {print (paste(' > mean abundance filter: Removed',nrRemoved,'pathways!, ',length(colnames(yesPWYdf)),'pathways left!')); }
  
  # filter for abundance (median)
  # -----------------------------
  nrRemoved = 0
  toRemove = c()
  for (c in colnames(yesPWYdf)) {
    mn = median(yesPWYdf[,c])
    if ( mn < minMedRelAb) {
      nrRemoved = nrRemoved + 1
      toRemove <- c(toRemove,c)
    }
  }
  if (length(toRemove) > 0) {
    yesPWYdf <- yesPWYdf[,!(colnames(yesPWYdf) %in% toRemove)]
  }
  if (verbose) {print (paste(' > median abundance filter: Removed',nrRemoved,'pathways!, ',length(colnames(yesPWYdf)),'pathways left!')); }
  
  # do final rescale
  if (rescale==T) {
    if (verbose) {print ('  >> rescaling')}
    rsums <- rowSums(yesPWYdf)
    rsums[rsums==0] <- 1.0
    yesPWYdf <- yesPWYdf/rsums
  }
  inDF <- cbind.data.frame(nonPWYdf,yesPWYdf)
  if (verbose) {print ('> DONE')}
  inDF
}

# Using the aforementioned function on our data
pwy_filtered_dataframe = filterHumannDF(pwy_dataframe,presPerc = -1,minMRelAb = -1,minMedRelAb = -1,rescale = T,minSum = 1,verbose = T)



2. PCoA analyses


library("vegan")
library("ggplot2")

# Calculating Bray-Curtis distances between samples 
Beta = vegdist(taxa_file, pwy_file, method="bray") 
PCoA=cmdscale(Beta, k = 5) # R uses cmdscale() to calculate classical multi-dimensional scaling, a synonym for principal coordinates analysis.

# PCoA_taxa/PCoA_pwy --> dataframes including Bray-Curtis distances and phenotypes based on respectively the taxonomic and functional composition.
# Calculate centroids as the mean of the first principal components (v1 and v2) 
centroids <- aggregate(cbind(PCoA_taxa/PCoA_pwy$V1,PCoA_taxa/PCoA_pwy$V2)~PCoA_taxa/PCoA_pwy$groups,PCoA_taxa/PCoA_pwy,mean)

# Plotting PCoAs based on Bray_Curtis distances
ggplot(PCoA_taxa/PCoA_pwy, aes(x=V1,y=V2, geom= "blank", color=PCoA_taxa/PCoA_pwy$ColorsDots)) + geom_point()  + ggtitle("") + theme_classic() + geom_point (data=centroids, size=8, alpha=0.9, col=centroids$colors) + labs(x="PCoA1", y="PCoA2") +  scale_color_identity ("", breaks=c("green3", "black", "blue2", "orangered"), labels=c("healthy controls (n=495)", "healthy twins (n=38)", "IBD twins (n=61)", "IBD controls (n=99)"), guide="legend") 

## p-values between Bray-Curtis distances per group 

Microbiome_data_frame = dataframe[,c(columns_including_microbiome_features)]
Phenotype_data_frame = dataframe[,c(columns_including_clinical_features)]
#
my_results <- matrix(ncol = 3, nrow=ncol(Phenotype_data_frame))       
#For each column in Phenotype_data_frame (factor)  
for (i in 1:ncol(Phenotype_dat)) {
  #Create a table for complete cases
  final_factor_table <- Phenotype_data_frame[complete.cases(Phenotype_data_frame[,i]),]
  filter_tax_table <- Microbiome_data_frame[rownames(final_factor_table),]
  ad <- adonis(formula = filter_tax_table ~ final_factor_table[,i] , data = final_factor_table, permutations = 10000, method = "bray")
  aov_table <- ad$aov.tab
  my_results[i,1]=aov_table[1,1]
  my_results[i,2]=aov_table[1,5]
  my_results[i,3]=aov_table[1,6]
}
#
rownames(my_results) = colnames(Phenotype_dat)
colnames(my_results) = c("Df", "R2", "Pr(>F)")
#
Adonis_my_results <- as.data.frame(my_results)

# P-value correction
library(stats)
p_correction_fdr <- as.data.frame(p.adjust(Adonis_my_results$`Pr(>F)`, method = "fdr"))
rownames(p_correction_bonferroni) <- rownames(Adonis_my_results)
#Merge with adonis_results table
final_adonis_results <- merge(Adonis_my_results, p_correction_fdr, by="row.names")
rownames(final_adonis_results) <- final_adonis_results[,1]
final_adonis_results <- final_adonis_results[,-1]
#
colnames(final_adonis_results)[4] <- "FDR_p_value"
#

3. Alpha-diversity and gene richness

library("vegan")

# Calculating alpha-diversity (Shannon Index) of each sample 
dataframe_taxa["ShannonIndex"] = diversity(dataframe_taxa, index = "shannon", MARGIN = 1)

# Calculating the gene richess (ie number of genes) in each sample 
dataframe_pwy["GeneRichness_persample"] = rowSums(dataframe_pwy > 0)

4.Similarity in gut microbiome composition

library("vegan")
## 1. Firstly, we created a dataframe containing the Bray-Curtis distances between individuals from the same twin pair. 

Beta = vegdist(db_twins, method="bray")
betaMat <- as.matrix(Beta)
PCoA=cmdscale(Beta, k = 5) # R uses cmdscale() to calculate classical multi-dimensional scaling, a synonym for principal coordinates analysis.
PCoA_df = as.data.frame(PCoA)

resDF <- NULL
for (tID in unique(c(PCoA_df$Twin_pair_number)) ) {
  # iterate over twins
  print(tID)
  babyDF <- PCoA_df[PCoA_df$Twin_pair_number==tID, ]
  # test if twin is actually two samples
  if (nrow(babyDF) != 2) {
    print(paste0('WARNING:',tID,' has ',nrow(babyDF),' samples!'))
  } else {
    # calculate distance between twins
    twinDist <- betaMat[ babyDF$Record.Id[1],babyDF$Record.Id[2]  ]
    # make one result
    # collect disease type
    # NOTE: DOES NOT ACCOUNT FOR HEALTHY-HEALTHY PAIRS!!!!
    diseaseTypeL <- unique(babyDF$Disease_status)
    if ("Ulcerative Colitis" %in% diseaseTypeL) {
      disT <- "UC"
    } else {
      disT <- "CD"
    }
    oneResult <- data.frame(ID1=babyDF$Record.Id[1],
                            ID2=babyDF$Record.Id[2],
                            twinConcordancy=babyDF$Concordancy,
                            twinZigosity=babyDF$zyg_def,
                            diseaseType=disT,
                            bcPairwise=twinDist)
    # collect one result
    resDF <- rbind.data.frame(resDF,oneResult)
  }
}

# 2. Secondly, we created random pairs of unrelated twins, ensuring that both twin individuals did not originate from the same twin pair. 
Beta = vegdist(db_Twin_Beta, method="bray")
betaMat <- as.matrix(Beta)
library(reshape2)
betaMat2 <- betaMat
betaMat2[lower.tri(betaMat2,diag = T)] <- 0
flatDM <- subset(melt(betaMat2),value !=0)

randomPairs <- sample(x=row.names(flatDM),size = 1000,replace = F)
# In this dataframe, there were 2 columns containing information on to which twin pair, a twin individual belonged --> "Twin_1_pair", and "Twin_2_Pair". 
# To delete real twins from this inter-twin dataframe, we removed the lines in the dataframe in which the twin individuals belonged to the same twin pair. 
randomPairs_not_related =randomPairs[ !randomPairs$Twin_1_TwinPair==randomPairs$Twin_2_TwinPair,  ] # delete twin pairs 

# 3. Lastly, we created random pairs within the healthy controls, IBD controls and unrelated twin pairs:
Beta = vegdist(db_Twin_Beta, method="bray")
betaMat <- as.matrix(Beta)
library(reshape2)
betaMat2 <- betaMat
betaMat2[lower.tri(betaMat2,diag = T)] <- 0
flatDM <- subset(melt(betaMat2),value !=0)

randomPairs <- sample(x=row.names(flatDM),size = 1000,replace = F)

# 4. After merging these dataframes (leading to one big dissimilarity matrix including phenotypes), we plotted it. 
ggplot(merged_dataframe ,aes(x=groups,y=value,col=coloring_column)) + geom_boxplot(size=1.5)  + geom_jitter(alpha=0.1) + scale_x_discrete(limits=c("dag3_NoIBD.NoIBD", "twin_NoIBD.NoIBD", "twin_NoIBD.UC", "twin_NoIBD.CD", "twin_UC.UC", "twin_CD.CD", "twin_CD.UC", "twin_concordant.Monozygous", "twin_disconcordant.Monozygous", "twin_concordant.Dizygous", "twin_disconcordant.Dizygous", "dag3_unrelated.IBD.controls)) + theme(axis.text.x = element_text(angle=90)) +  scale_color_identity ("", breaks=merged_dataframe$groups, labels=merged_dataframe$coloring_column, guide="legend")


5. MaAsLin2 analyses

library(Maaslin2)

### 1. Comparing gut microbiomes between healthy twins and their IBD cotwins 
#a. Taxa 
Maaslin2(paste0(dataFolder,'/maaslin_input_twinsH_twinsIBD_taxa.tsv'), paste0(dataFolder,'/maaslin_input_twinsH_twinsIBD_metadata.tsv'),"res_rev1_v22_taxa_IBDtwins_vs_TwinsHC_prev10_rndeff",
         fixed_effects = c("IBDorHC","IBD_type_chart_review","zyg_def","DiseaseLocation_UCCD","Sex", "Age_years_calculated", "Antibiotics_past_3_months", "PPI_current", "BMI_survey","READ.NR"), 
         normalization = "NONE",transform = "AST", min_prevalence = 0.10, min_abundance = 0.0000001, plot_heatmap = T, plot_scatter = T,
         random_effects=c("Twin_pair_number"))
         
#b. Pathways 
Maaslin2(paste0(dataFolder,'/maaslin_input_twinsH_twinsIBD_pwys.tsv'), paste0(dataFolder,'/maaslin_input_twinsH_twinsIBD_metadata.tsv'),"res_rev1_v22_pwys_IBDtwins_vs_TwinsHC_prev10_rndeff",
         fixed_effects = c("IBDorHC","IBD_type_chart_review","zyg_def","DiseaseLocation_UCCD", "Sex", "Age_years_calculated", "Antibiotics_past_3_months", "PPI_current", "BMI_survey","READ.NR"), 
         normalization = "NONE",transform = "AST", min_prevalence = 0.10, min_abundance = 0.0000001, random_effects=c("Twin_pair_number"), plot_heatmap = T, plot_scatter = T)
         

### 2. Comparing gut microbiomes between healthy controls vs. unrelated IBD patients, the IBD-twins and the healthy cotwins. 
#a. Taxa
Maaslin2(paste0(dataFolder,'/maaslin_input_allCohorts_taxa.tsv'), paste0(dataFolder,'/maaslin_input_allCohorts_metadata.tsv'), "res_rev1_v22_allCohorts_taxa_prev10_norndeff",
         fixed_effects = c("Cohort","IBD_type_chart_review", "Sex", "Age_years_calculated", "Antibiotics_past_3_months", "PPI_current", "BMI_survey","READ.NR"), 
         normalization = "NONE",transform = "AST", min_prevalence = 0.10, min_abundance = 0.0000001, plot_heatmap = T, plot_scatter = T)
         
#b. Pathways    
Maaslin2(paste0(dataFolder,'/maaslin_input_allCohorts_pwys.tsv'), paste0(dataFolder,'/maaslin_input_allCohorts_metadata.tsv'), "res_rev1_v22_allCohorts_pwys_prev10_norndeff",
         fixed_effects = c("Cohort","IBD_type_chart_review", "Sex", "Age_years_calculated", "Antibiotics_past_3_months", "PPI_current", "BMI_survey","READ.NR"), 
         normalization = "NONE",transform = "AST", min_prevalence = 0.10, min_abundance = 0.0000001, plot_heatmap = T, plot_scatter = T)

### 3. Comparing gut microbiomes between IBD twins and unrelated IBD controls 
#a.  Taxa 
Maaslin2(paste0(dataFolder,'/maaslin_input_dagIBD_twinsIBD_taxa.tsv'), paste0(dataFolder,'/maaslin_input_dagIBD_twinsIBD_metadata.tsv'),"res_rev1_v22_taxa_DAG3IBD_vs_TwinsIBD_prev10_norndeff",
         fixed_effects = c("Cohort", "IBD_type_chart_review", "Sex", "Age_years_calculated", "Antibiotics_past_3_months", "PPI_current", "BMI_survey","READ.NR"), 
         normalization = "NONE",transform = "AST", min_prevalence = 0.10, min_abundance = 0.0000001, plot_heatmap = T, plot_scatter = T)

# b. Pathways
Maaslin2(paste0(dataFolder,'/maaslin_input_dagIBD_twinsIBD_pwys.tsv'), paste0(dataFolder,'/maaslin_input_dagIBD_twinsIBD_metadata.tsv'),"res_rev1_v22_pwys_DAG3IBD_vs_TwinsIBD_prev10_norndeff",
         fixed_effects = c("Cohort", "IBD_type_chart_review", "Sex", "Age_years_calculated", "Antibiotics_past_3_months", "PPI_current", "BMI_survey","READ.NR"), 
         normalization = "NONE",transform = "AST", min_prevalence = 0.10, min_abundance = 0.0000001, plot_heatmap = T, plot_scatter = T)

6. Visualization of MaAsLin2 results

####Visualize the MaAsLin2 results in Venn-diagrams and balloon plots###
#Example code shown here is for the species analyses.
#Graphs for pathways can be constructed in the same way.
#The data file used is the all_results.tsv files from the MaAsLin2 run
#The dataframe df.spec.all contains the outcomes of te MaAsLin2 run
 
#Open libraries needed
library("venn")
library("ggplot2")
library("ggpubr")
 
##Select the associations of interest from the MaAsLin2 results
#Select only Cohort comparison
df.spec.all.sel <- subset(df.spec.all, metadata == "Cohort")
head(df.spec.all.sel)
 
##Make sure that both the species and the direction of the beta are considered for the Venn-diagrams
#Add a variable describing whether the beta is "up" or "down" compared to HC
df.spec.all.sel$direction <- with(df.spec.all.sel,ifelse(coef > 0,
                                                                   "up",
                                                                   ifelse(coef <0,
                                                                          "down",
                                                                          NA)))
#Make a variable that combines the featurename and direction
df.spec.all.sel$feature_direction <- paste(df.spec.all.sel$feature,df.spec.all.sel$direction,sep="_")
 
#Select only the twin-IBD vs. HC comparison
df.twinIBDvHC.spec.all.sel <- subset(df.spec.all.sel, value == "IBD_IBDTWINS_IBD")
#Select only the Healthy cotwins vs. HC comparison
df.twinHealthyvHC.spec.all.sel <- subset(df.spec.all.sel, value == "HC_IBDTWINS_HC")
#Select only the unrelated IBD patients vs. HC comparison
df.IBDCvHC.spec.all.sel <- subset(df.spec.all.sel, value == "IBD_DAG3_IBD")
 
####Select only the features with an FDR < 0.10 than the threshold
fdr.cutoff <- 0.10
###IBD-twin vs HC
df.twinIBDvHC.spec.all.sel.sign <- subset(df.twinIBDvHC.spec.all.sel, qval < fdr.cutoff)
head(df.twinIBDvHC.spec.all.sel.sign)
###Healthy twin vs HC
df.twinHealthyvHC.spec.all.sel.sign <- subset(df.twinHealthyvHC.spec.all.sel, qval < fdr.cutoff)
head(df.twinHealthyvHC.spec.all.sel.sign)
###IBD control vs HC
df.IBDCvHC.spec.all.sel.sign <- subset(df.IBDCvHC.spec.all.sel, qval < fdr.cutoff)
head(df.IBDCvHC.spec.all.sel.sign)
 
####Venn diagrams####
#Make a venn diagram for the overlapping relatively abundant species with FDR<0.10
#Make the venn object
venn.spec <- list(df.twinIBDvHC.spec.all.sel.sign$feature,
                  df.twinHealthyvHC.spec.all.sel.sign$feature,
                  df.IBDCvHC.spec.all.sel.sign$feature)
#Plot the venn diagram
venn(venn.spec.0.10,ilabels = T,
     snames=c("IBD-twins","Healthy\ncotwins","Unrelated IBD\npatients"),
     zcolor = c("firebrick3","gold","dodgerblue3"),opacity=0.5, ilcs = 1.5,sncs=1.1,box=FALSE,ggplot=F)
text(x=500,y=1025,paste("Differentially relative abundant species (FDR < ",format(fdr.cutoff,nsmall=2),")
                        compared to healthy controls",sep=""),cex=1.3,font=2)
 
####Combining the data for the significant associations
#Add an extra variable to be able to specify which comparison is used
df.twinIBDvHC.spec.all.sel.sign$group <- "IBD-twins"
df.twinHealthyvHC.spec.all.sel.sign$group <- "Healthy cotwins"
df.IBDCvHC.spec.all.sel.sign$group <- "Unrelated IBD patients"
 
#Combine all dataframes into one
df.spec.sign <- rbind(df.twinIBDvHC.spec.all.sel.sign,
                      df.twinHealthyvHC.spec.all.sel.sign,
                      df.IBDCvHC.spec.all.sel.sign)
 
#Order the "group" variable in the way you want it to appear in the plot
df.spec.sign$group <- factor(df.spec.sign$group,levels=c(
  "Unrelated IBD patients","IBD-twins","Healthy cotwins"))
 
#Make an extra short feature variable to make the y-axis readable
#The feature variable contains the names of the species
df.spec.sign$feature.extrashort <- paste(substring(df.spec.sign$feature, regexpr(".s__", df.spec.sign$feature) +4))
 
#Order the species in the way you want them to appear in the plot
#It is now ordered based on the mean coefficients over the 3 datasets 
table.extrashort<-aggregate( coef ~ feature.extrashort, df.spec.sign, mean )
table.extrashort<-table.extrashort[order(table.extrashort$coef),]
order.feature.extrashort<-as.character(table.extrashort$feature.extrashort)
df.spec.sign$feature.extrashort <- factor(df.spec.sign$feature.extrashort, levels=c(order.feature.extrashort))
 
####Select only the overlapping relatively abundant species####
#Check the count of each species
spec.counts <- count(df.spec.sign$feature)
#Only keep those with a count >1 (those are shared between at least 2 comparisons)
spec.shared <- subset(spec.counts,freq > 1)
#Select these species in the dataset
df.spec.sign.shared <- df.spec.sign[df.spec.sign$feature%in%spec.shared$x,]
 
####Balloonplot####
#Plot the actual balloonplot
####all differentially abundant species that are shared between at least 2 comparisons
p.balloon.spec.shared <- ggballoonplot(data=df.spec.sign.shared, x = "group", 
                                       y = "feature.extrashort", 
                                       size = "qval",
                                       fill = "coef",
                                       ylab = "Species",
                                       xlab = "Compared to healthy controls",
                                       ggtheme = theme_bw())+
  scale_fill_gradient2(low = "darkblue",
                       mid = "white",
                       high = "darkred",
                       midpoint = 0,
                       space = "Lab",
                       na.value = "grey50",
                       guide = "colourbar",
                       aesthetics = "fill"
  )+
  scale_size(range = c(5,1)) + guides(size = guide_legend(reverse=TRUE)) +
  labs(fill = "Beta coefficient",
       size = "False discovery rate")+
  ggtitle(label=paste("Shared differentially relative abundant species (FDR < ",format(fdr.cutoff,nsmall=2),")
                      compared to healthy controls",sep=""))+
  theme(plot.title=element_text(size=8,face="bold",hjust = 0.75),
        axis.text.y=element_text(size=6),
        legend.title=element_text(size=6,face="bold"),
        legend.text=element_text(size=4))
 
#The final balloonplot
p.balloon.spec.shared