fix conflict

Merge branch 'devel' of github.com:bcbio/bcbioR into devel # Conflicts: # inst/templates/chipseq/diffbind/diffbind.Rmd
bcbio · Oct 11, 2024 · 983a73d · 983a73d
2 parents c1aa8fc + 1e718b3
commit 983a73d
Show file tree

Hide file tree

Showing 5 changed files with 184 additions and 44 deletions.
diff --git a/inst/templates/chipseq/QC/QC.Rmd b/inst/templates/chipseq/QC/QC.Rmd
@@ -29,6 +29,7 @@ params:
 # This set up the working directory to this file so all files can be found
 library(rstudioapi)
 setwd(fs::path_dir(getSourceEditorContext()$path))
+
 # NOTE: This code will check version, this is our recommendation, it may work
 #.      other versions
 stopifnot(R.version$major>= 4) # requires R4
@@ -73,6 +74,8 @@ library(bcbioR)
 library(janitor)
 library(ChIPpeakAnno)
 library(UpSetR)
+
+colors=cb_friendly_cols(1:15)
 ggplot2::theme_set(theme_light(base_size = 14))
 opts_chunk[["set"]](
     cache = FALSE,
@@ -360,8 +363,7 @@ ggplot(peaks, aes(x = peak_enrichment, fill = .data[[params$factor_of_interest]]
 
 We examine the amount of overlap between peaks in replicates of the same experimental condition. 
 
-``` {r peak overlap, results = 'asis', fig.width = 8}
-
+``` {r peak overlap, results = 'asis', fig.width = 8, fig.height = 6}
 for (current_sample_group in unique(peaks$sample_group)){
   cat("## ", current_sample_group, "\n")
 
@@ -381,19 +383,27 @@ for (current_sample_group in unique(peaks$sample_group)){
   # connectedpeaks examples (https://support.bioconductor.org/p/133486/#133603), if 5     peaks in group1 overlap with 2 peaks in group 2, setting connectedPeaks to "merge"      will add 1 to the overlapping counts
   overlaps <- findOverlapsOfPeaks(peaks_sample_group_granges, connectedPeaks = 'merge')
   
-  set_counts <- overlaps$venn_cnt[, colnames(overlaps$venn_cnt)] %>% 
-    as.data.frame() %>% 
-    mutate(group_number = row_number()) %>%
-    pivot_longer(!Counts & !group_number, names_to = 'sample', values_to = 'member') %>%
-    filter(member > 0) %>%
-    group_by(Counts, group_number) %>% 
-    summarize(group = paste(sample, collapse = '&'))
+  n_samples <- length(names(overlaps$overlappingPeaks))
+
+  if (n_samples > 3){
+    set_counts <- overlaps$venn_cnt[, colnames(overlaps$venn_cnt)] %>% 
+      as.data.frame() %>% 
+      mutate(group_number = row_number()) %>%
+      pivot_longer(!Counts & !group_number, names_to = 'sample', values_to = 'member') %>%
+      filter(member > 0) %>%
+      group_by(Counts, group_number) %>% 
+      summarize(group = paste(sample, collapse = '&'))
+    
+    set_counts_upset <- set_counts$Counts
+    names(set_counts_upset) <- set_counts$group
   
-  set_counts_upset <- set_counts$Counts
-  names(set_counts_upset) <- set_counts$group
-
-  p <- upset(fromExpression(set_counts_upset), order.by = "freq", text.scale = 1.5)
-  print(p)
+    p <- upset(fromExpression(set_counts_upset), order.by = "freq", text.scale = 1.5)
+    print(p)
+  } else{
+    venn_sample_names <- gsub(paste0(current_sample_group, '_'), '', names(overlaps$all.peaks))
+    invisible(capture.output(makeVennDiagram(overlaps, connectedPeaks = "merge", fill = colors[1:n_samples], 
+                    NameOfPeaks = venn_sample_names)))
+  }
   
   cat('\n\n')
 

diff --git a/inst/templates/chipseq/QC/params_qc-example.R b/inst/templates/chipseq/QC/params_qc-example.R
@@ -2,11 +2,10 @@
 
 
 # Example data
-coldata_fn='~/Downloads/chipseq_peakanalysis_PRDM16.csv'
+coldata_fn='/workspace/data/chipseq_peakanalysis_h3k27ac_narrow/chipseq_peakanalysis_H3K27Ac.csv'
 # This folder is in the output directory inside multiqc folder
-multiqc_data_dir='~/O2/s3_results/chipseq_peakanalysis_prdm16/multiqc/narrowPeak/multiqc_data/'
+multiqc_data_dir='/workspace/data/chipseq_peakanalysis_h3k27ac_narrow/multiqc/narrowPeak/multiqc_data/'
 # This folder is in the output director
-peaks_dir = '~/O2/s3_results/chipseq_peakanalysis_prdm16/bowtie2/mergedLibrary/macs2/narrowPeak/'
+peaks_dir = '/workspace/data/chipseq_peakanalysis_h3k27ac_narrow/bowtie2/mergedLibrary/macs2/narrowPeak/'
 # This folder is in the output directory 
-# counts_fn = '~/O2/s3_results/chipseq_peakanalysis_h3k27ac/bowtie2/mergedLibrary/macs2/broadPeak/consensus/H3K27ac/deseq2/H3K27ac.consensus_peaks.rds'
-counts_fn = '~/O2/s3_results/chipseq_peakanalysis_prdm16/bowtie2/mergedLibrary/macs2/narrowPeak/consensus/PRDM16/deseq2/PRDM16.consensus_peaks.rds'
+counts_fn = '/workspace/data/chipseq_peakanalysis_h3k27ac_narrow/bowtie2/mergedLibrary/macs2/narrowPeak/consensus/H3K27ac/deseq2/H3K27ac.consensus_peaks.rds'
diff --git a/inst/templates/chipseq/diffbind/diffbind.Rmd b/inst/templates/chipseq/diffbind/diffbind.Rmd
@@ -29,16 +29,17 @@ params:
   denominator: WT
   species: mouse
   counts_fn: diffbind_counts.csv
-  results_sig_anno_fn: diffbind_results_sig_anno.csv
+  results_sig_anno_fn: diffbind_results_anno.csv
 ---
 Template developed with materials from https://hbctraining.github.io/main/.
 
 ```{r, cache = FALSE, message = FALSE, warning=FALSE}
 # This set up the working directory to this file so all files can be found
 library(rstudioapi)
 setwd(fs::path_dir(getSourceEditorContext()$path))
+
 # NOTE: This code will check version, this is our recommendation, it may work
-#.      other versions
+#.      with other versions
 stopifnot(R.version$major>= 4) # requires R4
 stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3
 stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0)
@@ -102,15 +103,19 @@ library(qs)
 library(EnhancedVolcano)
 library(ggprism)
 library(ChIPseeker)
+library(msigdbr)
+library(fgsea)
 
 if (params$species == 'mouse'){
   library(TxDb.Mmusculus.UCSC.mm10.knownGene)
   txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
   anno_db <- 'org.Mm.eg.db'
+  library(org.Mm.eg.db)
 } else if (params$species == human){
   library(TxDb.Hsapiens.UCSC.hg38.knownGene)
   txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
   anno_db <- 'org.Hs.eg.db'
+  library(org.Hs.eg.db)
 }
 
 
@@ -237,18 +242,49 @@ including estimation of size factors and dispersions, fitting and testing the
 model, evaluating the supplied contrast, and shrinking the LFCs. A p-value and FDR 
 is assigned to each candidate binding site indicating confidence that they are differentially bound.
 
+We use [ChIPpeakAnno](https://bioconductor.org/packages/release/bioc/html/ChIPpeakAnno.html) 
+to identify any gene features within 1000 bp of a differentially bound site. 
+
+
 ```{r DB analysis}
 diffbind_norm <- dba.contrast(diffbind_norm, contrast = c('Condition', params$numerator, params$denominator))
 results_obj <- dba.analyze(diffbind_norm, bGreylist = F)
 
 results_report <- dba.report(results_obj, th = 1) 
 results_report_sig <- dba.report(results_obj)
 
-
 results <- results_report %>% as.data.frame()
-results_sig <- results_report_sig %>% as.data.frame()
+
+```
+
+```{r annotate DB peaks}
+
+anno_data <- toGRanges(txdb, feature = 'gene')
+results_anno_batch <- annotatePeakInBatch(results_report, 
+                                              AnnotationData = anno_data,
+                                              output = 'overlapping',
+                                              maxgap = 1000)
+
+results_anno_batch_df <- results_anno_batch %>% as.data.frame()
+
+if(params$species == 'mouse'){
+  entrez_to_symbol <- AnnotationDbi::select(org.Mm.eg.db, results_anno_batch_df$feature, 
+                                            "ENTREZID", columns = 'SYMBOL') %>%
+    filter(!is.na(ENTREZID)) %>% distinct()
+} else if (params$species == 'human'){
+  entrez_to_symbol <- AnnotationDbi::select(org.Hs.eg.db, results_anno_batch_df$feature, 
+                                            "ENTREZID", columns = 'SYMBOL') %>%
+    filter(!is.na(ENTREZID)) %>% distinct()
+}
+
+results_anno_batch_df <- results_anno_batch_df %>% 
+  left_join(entrez_to_symbol %>% dplyr::select(feature = ENTREZID, gene_name = SYMBOL))
+
+write_csv(results_anno_batch_df, params$results_sig_anno_fn)
+
 ```
 
+
 ## MA plot
 
 This plot can help to:
@@ -257,11 +293,10 @@ This plot can help to:
 - Visualize data dispersion: The distribution of points along the A-axis gives a sense of the spread of binding levels and any patterns or anomalies in the dataset.
 
 ```{r MA plot}
-results_for_ma <- results%>%
+results_for_ma <- results_anno_batch_df%>%
   mutate(peak = paste(seqnames, start, end, sep = '_')) %>%
   mutate(t = 0) %>% 
   dplyr::select(peak, AveExpr = Conc, logFC = Fold, P.Value = p.value, adj.P.Val = FDR, t)
-rownames(results_for_ma) <- results_for_ma$peak
 degMA(as.DEGSet(results_for_ma, contrast = paste(params$numerator, params$denominator, sep = ' vs. ')))
 
 ```
@@ -270,7 +305,9 @@ degMA(as.DEGSet(results_for_ma, contrast = paste(params$numerator, params$denomi
 
 ```{r DB table}
 
-results_sig %>% sanitize_datatable()
+results_sig_anno_batch_df <- results_anno_batch_df %>% filter(FDR < 0.05)
+results_sig_anno_batch_df %>% dplyr::select(names(results), feature, gene_name) %>%
+  sanitize_datatable()
 
 ```
 
@@ -280,15 +317,19 @@ results_sig %>% sanitize_datatable()
 This volcano plot shows the binding sites that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in purple are sites that have padj < 0.05 and a log2-fold change magnitude > 0.5. Points in blue have a padj > 0.05 and a log2-fold change magnitude > 0.5. Grey points are non-significant. The dashed lines correspond to the cutoff values of log2-fold change and padj that we have chosen.
 
 ```{r volcano, fig.height = 8}
-results_mod <- results %>% 
+results_mod <- results_sig_anno_batch_df %>% 
   mutate(Fold = replace(Fold, Fold < -5, -5)) %>% 
   mutate(Fold = replace(Fold, Fold > 5, 5)) %>%
-  mutate(peak = paste(seqnames, start, end, sep = '_'))
-show <- as.data.frame(results_mod[1:6, c("Fold", "FDR", "peak")])
+  mutate(peak = paste(seqnames, start, end, sep = '_')) 
+# show <- as.data.frame(results_mod[1:6, c("Fold", "FDR", "gene_name")])
+
+show <- results_mod %>% filter(!is.na(gene_name)) %>% slice_min(n = 6, order_by = FDR)
+
+results_mod <- results_mod %>% mutate(gene_name = ifelse(peak %in% show$peak  , gene_name, NA))
 EnhancedVolcano(results_mod,
-                lab= results_mod$peak, 
+                lab= results_mod$gene_name,
                 pCutoff = 0.05, 
-                selectLab = c(show$peak),
+                selectLab = c(show$gene_name),
                 FCcutoff = 0.5,
                 x = 'Fold',
                 y = 'FDR', 
@@ -321,34 +362,80 @@ We use the [ChIPseeker](https://www.bioconductor.org/packages/release/bioc/html/
 package to determine the genomic context of the differentially bound peaks and 
 visualize these annotations. We consider the promoter region to be within 2000 bp in either direction of the TSS. 
 
-We also use [ChIPpeakAnno](https://bioconductor.org/packages/release/bioc/html/ChIPpeakAnno.html) 
-to identify any gene features within 1000 bp of a differentially bound site. 
-
 ```{r annotate, echo = F}
 
 results_sig_anno <- annotatePeak(results_report_sig, 
                                  tssRegion = c(-2000, 2000), 
                                  TxDb = txdb, 
-                                 annoDb = params$anno_db, 
+                                 annoDb = anno_db, 
                                  verbose = F)
 results_sig_anno_df <- results_sig_anno %>% as.data.frame()
 
 plotAnnoPie(results_sig_anno)
 
 plotDistToTSS(results_sig_anno)
 
-anno_data <- toGRanges(txdb, feature = 'gene')
-results_sig_anno_batch <- annotatePeakInBatch(results_report_sig, 
-                                              AnnotationData = anno_data,
-                                              output = 'overlapping',
-                                              maxgap = 1000)
+```
 
-results_sig_anno_batch_df <- results_sig_anno_batch %>% as.data.frame()
+# Functional Enrichment
 
-write_csv(results_sig_anno_batch_df, params$results_sig_anno_fn)
+Over-Representation Analysis (ORA) is a statistical method used to determine whether a predefined set of genes (e.g., genes belonging to a specific biological pathway or function) is over-represented (or enriched) among a list of differentially bound genes (DEGs) from ChIP-seq. Adventages of ORA:
+
+- Simplicity: Easy to perform and interpret.
+- Biological Insight: Helps to identify pathways and processes that are significantly affected in the condition studied.
+- Prior Knowledge Integration: Utilizes existing biological knowledge through predefined gene sets.
+
+```{r get databases}
+if(params$species == 'human'){
+  all_in_life=get_databases()
+} else if (params$species == 'mouse'){
+  all_in_life = get_databases('Mus musculus')
+}
 ```
 
-# Functional Enrichment
+```{r ora}
+
+universe_mapping = results_anno_batch_df %>% 
+    filter(!is.na(FDR), !is.na(feature)) %>% 
+  dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct()
+
+ora_input = results_anno_batch_df %>% 
+  filter(!is.na(FDR), FDR < 0.01, abs(Fold) > 0.3, !is.na(feature)) %>% 
+  dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct()
+all = run_fora(ora_input, universe_mapping, all_in_life)
+
+ora_input = results_anno_batch_df %>% 
+  filter(!is.na(FDR), FDR < 0.01, Fold > 0.3, !is.na(feature)) %>% 
+  dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct()
+up = run_fora(ora_input, universe_mapping, all_in_life)
+
+ora_input = results_anno_batch_df %>% 
+  filter(!is.na(FDR), FDR < 0.01, Fold < -0.3, !is.na(feature)) %>% 
+  dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct()
+down = run_fora(ora_input, universe_mapping, all_in_life)
+  
+```
+
+
+## Significant pathways using all DB genes
+
+```{r all pathways}
+all %>% sanitize_datatable()
+```
+
+
+## Significant pathways using increased DB genes
+
+```{r up pathways}
+up %>% sanitize_datatable()
+```
+
+
+## Significant pathways using decreased DB genes
+
+```{r down pathways, results='asis'}
+down %>% sanitize_datatable()
+```
 
 # R session
 

diff --git a/inst/templates/chipseq/libs/load_data.R b/inst/templates/chipseq/libs/load_data.R
@@ -114,3 +114,47 @@ make_diffbind_samplesheet <- function(coldata, bam_dir, peaks_dir, column = NULL
 
   return(samplesheet)
 }
+
+get_databases=function(sps="human"){
+  all_in_life=list(
+    msigdbr(species = sps, category = "H") %>% mutate(gs_subcat="Hallmark"),
+    #  msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME"),
+    msigdbr(species = sps, category = "C2", subcategory = "CP:KEGG"),
+    #  msigdbr(species = "human", category = "C2", subcategory = "CP:PID"),
+    msigdbr(species = sps, category = "C5", subcategory = "GO:BP"),
+    msigdbr(species = sps, category = "C5", subcategory = "GO:MF")
+    #  msigdbr(species = "human", category = "C5", subcategory = "HPO"),
+    #  msigdbr(species = "human", category = "C3", subcategory = "TFT:GTRD"),
+    #  msigdbr(species = "human", category = "C6") %>% mutate(gs_subcat="Oncogenic")
+  )
+  all_in_life
+}
+
+run_fora=function(input, uni,all_in_life){
+  # browser()
+  total_deg=length(unique(input))/length(unique(uni$ENTREZID))
+  pathways_ora_all = lapply(all_in_life, function(p){
+    pathway = split(x = p$entrez_gene, f = p$gs_name)
+    db_name = paste(p$gs_cat[1], p$gs_subcat[1],sep=":")
+    respath <- fora(pathways = pathway,
+                    genes = unique(input$ENTREZID),
+                    universe = unique(uni$ENTREZID),
+                    minSize  = 15,
+                    maxSize  = 500)
+    # coll_respath = collapsePathwaysORA(respath[order(pval)][padj < 0.1],
+    #                                    pathway, unique(input$ENTREZID), unique(uni$ENTREZID))
+    as_tibble(respath)  %>%
+      mutate(database=db_name, NES=(overlap/size)/(total_deg))
+  }) %>% bind_rows() %>%
+    mutate(analysis="ORA")
+  ora_tb = pathways_ora_all %>% unnest(overlapGenes) %>%
+    group_by(pathway) %>%
+    left_join(uni, by =c("overlapGenes"="ENTREZID")) %>%
+    dplyr::select(pathway, padj, NES, SYMBOL, analysis,
+                  database) %>%
+    group_by(pathway,padj,NES,database,analysis) %>%
+    summarise(genes=paste(SYMBOL,collapse = ","))
+  ora_tb
+
+}
+
diff --git a/inst/templates/rnaseq/DE/DEG.Rmd b/inst/templates/rnaseq/DE/DEG.Rmd
@@ -604,7 +604,7 @@ fa_list=lapply(de_list,function(contrast){
   input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL'))
   up=run_fora(input_entrezid, universe_mapping,all_in_life)
 
-  ora_input = res %>% filter(!is.na(padj), padj<0.01, lfc<0.3) %>% pull(gene_id)
+  ora_input = res %>% filter(!is.na(padj), padj<0.01, lfc<-0.3) %>% pull(gene_id)
   #change to the right species
   input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL'))
   down=run_fora(input_entrezid, universe_mapping,all_in_life)