Skip to content

Commit

Permalink
fix conflict
Browse files Browse the repository at this point in the history
Merge branch 'devel' of github.com:bcbio/bcbioR into devel

# Conflicts:
#	inst/templates/chipseq/diffbind/diffbind.Rmd
  • Loading branch information
lpantano committed Oct 11, 2024
2 parents c1aa8fc + 1e718b3 commit 983a73d
Show file tree
Hide file tree
Showing 5 changed files with 184 additions and 44 deletions.
38 changes: 24 additions & 14 deletions inst/templates/chipseq/QC/QC.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ params:
# This set up the working directory to this file so all files can be found
library(rstudioapi)
setwd(fs::path_dir(getSourceEditorContext()$path))
# NOTE: This code will check version, this is our recommendation, it may work
#. other versions
stopifnot(R.version$major>= 4) # requires R4
Expand Down Expand Up @@ -73,6 +74,8 @@ library(bcbioR)
library(janitor)
library(ChIPpeakAnno)
library(UpSetR)
colors=cb_friendly_cols(1:15)
ggplot2::theme_set(theme_light(base_size = 14))
opts_chunk[["set"]](
cache = FALSE,
Expand Down Expand Up @@ -360,8 +363,7 @@ ggplot(peaks, aes(x = peak_enrichment, fill = .data[[params$factor_of_interest]]

We examine the amount of overlap between peaks in replicates of the same experimental condition.

``` {r peak overlap, results = 'asis', fig.width = 8}
``` {r peak overlap, results = 'asis', fig.width = 8, fig.height = 6}
for (current_sample_group in unique(peaks$sample_group)){
cat("## ", current_sample_group, "\n")
Expand All @@ -381,19 +383,27 @@ for (current_sample_group in unique(peaks$sample_group)){
# connectedpeaks examples (https://support.bioconductor.org/p/133486/#133603), if 5 peaks in group1 overlap with 2 peaks in group 2, setting connectedPeaks to "merge" will add 1 to the overlapping counts
overlaps <- findOverlapsOfPeaks(peaks_sample_group_granges, connectedPeaks = 'merge')
set_counts <- overlaps$venn_cnt[, colnames(overlaps$venn_cnt)] %>%
as.data.frame() %>%
mutate(group_number = row_number()) %>%
pivot_longer(!Counts & !group_number, names_to = 'sample', values_to = 'member') %>%
filter(member > 0) %>%
group_by(Counts, group_number) %>%
summarize(group = paste(sample, collapse = '&'))
n_samples <- length(names(overlaps$overlappingPeaks))
if (n_samples > 3){
set_counts <- overlaps$venn_cnt[, colnames(overlaps$venn_cnt)] %>%
as.data.frame() %>%
mutate(group_number = row_number()) %>%
pivot_longer(!Counts & !group_number, names_to = 'sample', values_to = 'member') %>%
filter(member > 0) %>%
group_by(Counts, group_number) %>%
summarize(group = paste(sample, collapse = '&'))
set_counts_upset <- set_counts$Counts
names(set_counts_upset) <- set_counts$group
set_counts_upset <- set_counts$Counts
names(set_counts_upset) <- set_counts$group
p <- upset(fromExpression(set_counts_upset), order.by = "freq", text.scale = 1.5)
print(p)
p <- upset(fromExpression(set_counts_upset), order.by = "freq", text.scale = 1.5)
print(p)
} else{
venn_sample_names <- gsub(paste0(current_sample_group, '_'), '', names(overlaps$all.peaks))
invisible(capture.output(makeVennDiagram(overlaps, connectedPeaks = "merge", fill = colors[1:n_samples],
NameOfPeaks = venn_sample_names)))
}
cat('\n\n')
Expand Down
9 changes: 4 additions & 5 deletions inst/templates/chipseq/QC/params_qc-example.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,10 @@


# Example data
coldata_fn='~/Downloads/chipseq_peakanalysis_PRDM16.csv'
coldata_fn='/workspace/data/chipseq_peakanalysis_h3k27ac_narrow/chipseq_peakanalysis_H3K27Ac.csv'
# This folder is in the output directory inside multiqc folder
multiqc_data_dir='~/O2/s3_results/chipseq_peakanalysis_prdm16/multiqc/narrowPeak/multiqc_data/'
multiqc_data_dir='/workspace/data/chipseq_peakanalysis_h3k27ac_narrow/multiqc/narrowPeak/multiqc_data/'
# This folder is in the output director
peaks_dir = '~/O2/s3_results/chipseq_peakanalysis_prdm16/bowtie2/mergedLibrary/macs2/narrowPeak/'
peaks_dir = '/workspace/data/chipseq_peakanalysis_h3k27ac_narrow/bowtie2/mergedLibrary/macs2/narrowPeak/'
# This folder is in the output directory
# counts_fn = '~/O2/s3_results/chipseq_peakanalysis_h3k27ac/bowtie2/mergedLibrary/macs2/broadPeak/consensus/H3K27ac/deseq2/H3K27ac.consensus_peaks.rds'
counts_fn = '~/O2/s3_results/chipseq_peakanalysis_prdm16/bowtie2/mergedLibrary/macs2/narrowPeak/consensus/PRDM16/deseq2/PRDM16.consensus_peaks.rds'
counts_fn = '/workspace/data/chipseq_peakanalysis_h3k27ac_narrow/bowtie2/mergedLibrary/macs2/narrowPeak/consensus/H3K27ac/deseq2/H3K27ac.consensus_peaks.rds'
135 changes: 111 additions & 24 deletions inst/templates/chipseq/diffbind/diffbind.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,17 @@ params:
denominator: WT
species: mouse
counts_fn: diffbind_counts.csv
results_sig_anno_fn: diffbind_results_sig_anno.csv
results_sig_anno_fn: diffbind_results_anno.csv
---
Template developed with materials from https://hbctraining.github.io/main/.

```{r, cache = FALSE, message = FALSE, warning=FALSE}
# This set up the working directory to this file so all files can be found
library(rstudioapi)
setwd(fs::path_dir(getSourceEditorContext()$path))
# NOTE: This code will check version, this is our recommendation, it may work
#. other versions
#. with other versions
stopifnot(R.version$major>= 4) # requires R4
stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3
stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0)
Expand Down Expand Up @@ -102,15 +103,19 @@ library(qs)
library(EnhancedVolcano)
library(ggprism)
library(ChIPseeker)
library(msigdbr)
library(fgsea)
if (params$species == 'mouse'){
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
anno_db <- 'org.Mm.eg.db'
library(org.Mm.eg.db)
} else if (params$species == human){
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
anno_db <- 'org.Hs.eg.db'
library(org.Hs.eg.db)
}
Expand Down Expand Up @@ -237,18 +242,49 @@ including estimation of size factors and dispersions, fitting and testing the
model, evaluating the supplied contrast, and shrinking the LFCs. A p-value and FDR
is assigned to each candidate binding site indicating confidence that they are differentially bound.

We use [ChIPpeakAnno](https://bioconductor.org/packages/release/bioc/html/ChIPpeakAnno.html)
to identify any gene features within 1000 bp of a differentially bound site.


```{r DB analysis}
diffbind_norm <- dba.contrast(diffbind_norm, contrast = c('Condition', params$numerator, params$denominator))
results_obj <- dba.analyze(diffbind_norm, bGreylist = F)
results_report <- dba.report(results_obj, th = 1)
results_report_sig <- dba.report(results_obj)
results <- results_report %>% as.data.frame()
results_sig <- results_report_sig %>% as.data.frame()
```

```{r annotate DB peaks}
anno_data <- toGRanges(txdb, feature = 'gene')
results_anno_batch <- annotatePeakInBatch(results_report,
AnnotationData = anno_data,
output = 'overlapping',
maxgap = 1000)
results_anno_batch_df <- results_anno_batch %>% as.data.frame()
if(params$species == 'mouse'){
entrez_to_symbol <- AnnotationDbi::select(org.Mm.eg.db, results_anno_batch_df$feature,
"ENTREZID", columns = 'SYMBOL') %>%
filter(!is.na(ENTREZID)) %>% distinct()
} else if (params$species == 'human'){
entrez_to_symbol <- AnnotationDbi::select(org.Hs.eg.db, results_anno_batch_df$feature,
"ENTREZID", columns = 'SYMBOL') %>%
filter(!is.na(ENTREZID)) %>% distinct()
}
results_anno_batch_df <- results_anno_batch_df %>%
left_join(entrez_to_symbol %>% dplyr::select(feature = ENTREZID, gene_name = SYMBOL))
write_csv(results_anno_batch_df, params$results_sig_anno_fn)
```


## MA plot

This plot can help to:
Expand All @@ -257,11 +293,10 @@ This plot can help to:
- Visualize data dispersion: The distribution of points along the A-axis gives a sense of the spread of binding levels and any patterns or anomalies in the dataset.

```{r MA plot}
results_for_ma <- results%>%
results_for_ma <- results_anno_batch_df%>%
mutate(peak = paste(seqnames, start, end, sep = '_')) %>%
mutate(t = 0) %>%
dplyr::select(peak, AveExpr = Conc, logFC = Fold, P.Value = p.value, adj.P.Val = FDR, t)
rownames(results_for_ma) <- results_for_ma$peak
degMA(as.DEGSet(results_for_ma, contrast = paste(params$numerator, params$denominator, sep = ' vs. ')))
```
Expand All @@ -270,7 +305,9 @@ degMA(as.DEGSet(results_for_ma, contrast = paste(params$numerator, params$denomi

```{r DB table}
results_sig %>% sanitize_datatable()
results_sig_anno_batch_df <- results_anno_batch_df %>% filter(FDR < 0.05)
results_sig_anno_batch_df %>% dplyr::select(names(results), feature, gene_name) %>%
sanitize_datatable()
```

Expand All @@ -280,15 +317,19 @@ results_sig %>% sanitize_datatable()
This volcano plot shows the binding sites that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in purple are sites that have padj < 0.05 and a log2-fold change magnitude > 0.5. Points in blue have a padj > 0.05 and a log2-fold change magnitude > 0.5. Grey points are non-significant. The dashed lines correspond to the cutoff values of log2-fold change and padj that we have chosen.

```{r volcano, fig.height = 8}
results_mod <- results %>%
results_mod <- results_sig_anno_batch_df %>%
mutate(Fold = replace(Fold, Fold < -5, -5)) %>%
mutate(Fold = replace(Fold, Fold > 5, 5)) %>%
mutate(peak = paste(seqnames, start, end, sep = '_'))
show <- as.data.frame(results_mod[1:6, c("Fold", "FDR", "peak")])
mutate(peak = paste(seqnames, start, end, sep = '_'))
# show <- as.data.frame(results_mod[1:6, c("Fold", "FDR", "gene_name")])
show <- results_mod %>% filter(!is.na(gene_name)) %>% slice_min(n = 6, order_by = FDR)
results_mod <- results_mod %>% mutate(gene_name = ifelse(peak %in% show$peak , gene_name, NA))
EnhancedVolcano(results_mod,
lab= results_mod$peak,
lab= results_mod$gene_name,
pCutoff = 0.05,
selectLab = c(show$peak),
selectLab = c(show$gene_name),
FCcutoff = 0.5,
x = 'Fold',
y = 'FDR',
Expand Down Expand Up @@ -321,34 +362,80 @@ We use the [ChIPseeker](https://www.bioconductor.org/packages/release/bioc/html/
package to determine the genomic context of the differentially bound peaks and
visualize these annotations. We consider the promoter region to be within 2000 bp in either direction of the TSS.

We also use [ChIPpeakAnno](https://bioconductor.org/packages/release/bioc/html/ChIPpeakAnno.html)
to identify any gene features within 1000 bp of a differentially bound site.

```{r annotate, echo = F}
results_sig_anno <- annotatePeak(results_report_sig,
tssRegion = c(-2000, 2000),
TxDb = txdb,
annoDb = params$anno_db,
annoDb = anno_db,
verbose = F)
results_sig_anno_df <- results_sig_anno %>% as.data.frame()
plotAnnoPie(results_sig_anno)
plotDistToTSS(results_sig_anno)
anno_data <- toGRanges(txdb, feature = 'gene')
results_sig_anno_batch <- annotatePeakInBatch(results_report_sig,
AnnotationData = anno_data,
output = 'overlapping',
maxgap = 1000)
```

results_sig_anno_batch_df <- results_sig_anno_batch %>% as.data.frame()
# Functional Enrichment

write_csv(results_sig_anno_batch_df, params$results_sig_anno_fn)
Over-Representation Analysis (ORA) is a statistical method used to determine whether a predefined set of genes (e.g., genes belonging to a specific biological pathway or function) is over-represented (or enriched) among a list of differentially bound genes (DEGs) from ChIP-seq. Adventages of ORA:

- Simplicity: Easy to perform and interpret.
- Biological Insight: Helps to identify pathways and processes that are significantly affected in the condition studied.
- Prior Knowledge Integration: Utilizes existing biological knowledge through predefined gene sets.

```{r get databases}
if(params$species == 'human'){
all_in_life=get_databases()
} else if (params$species == 'mouse'){
all_in_life = get_databases('Mus musculus')
}
```

# Functional Enrichment
```{r ora}
universe_mapping = results_anno_batch_df %>%
filter(!is.na(FDR), !is.na(feature)) %>%
dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct()
ora_input = results_anno_batch_df %>%
filter(!is.na(FDR), FDR < 0.01, abs(Fold) > 0.3, !is.na(feature)) %>%
dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct()
all = run_fora(ora_input, universe_mapping, all_in_life)
ora_input = results_anno_batch_df %>%
filter(!is.na(FDR), FDR < 0.01, Fold > 0.3, !is.na(feature)) %>%
dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct()
up = run_fora(ora_input, universe_mapping, all_in_life)
ora_input = results_anno_batch_df %>%
filter(!is.na(FDR), FDR < 0.01, Fold < -0.3, !is.na(feature)) %>%
dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct()
down = run_fora(ora_input, universe_mapping, all_in_life)
```


## Significant pathways using all DB genes

```{r all pathways}
all %>% sanitize_datatable()
```


## Significant pathways using increased DB genes

```{r up pathways}
up %>% sanitize_datatable()
```


## Significant pathways using decreased DB genes

```{r down pathways, results='asis'}
down %>% sanitize_datatable()
```

# R session

Expand Down
44 changes: 44 additions & 0 deletions inst/templates/chipseq/libs/load_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,47 @@ make_diffbind_samplesheet <- function(coldata, bam_dir, peaks_dir, column = NULL

return(samplesheet)
}

get_databases=function(sps="human"){
all_in_life=list(
msigdbr(species = sps, category = "H") %>% mutate(gs_subcat="Hallmark"),
# msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME"),
msigdbr(species = sps, category = "C2", subcategory = "CP:KEGG"),
# msigdbr(species = "human", category = "C2", subcategory = "CP:PID"),
msigdbr(species = sps, category = "C5", subcategory = "GO:BP"),
msigdbr(species = sps, category = "C5", subcategory = "GO:MF")
# msigdbr(species = "human", category = "C5", subcategory = "HPO"),
# msigdbr(species = "human", category = "C3", subcategory = "TFT:GTRD"),
# msigdbr(species = "human", category = "C6") %>% mutate(gs_subcat="Oncogenic")
)
all_in_life
}

run_fora=function(input, uni,all_in_life){
# browser()
total_deg=length(unique(input))/length(unique(uni$ENTREZID))
pathways_ora_all = lapply(all_in_life, function(p){
pathway = split(x = p$entrez_gene, f = p$gs_name)
db_name = paste(p$gs_cat[1], p$gs_subcat[1],sep=":")
respath <- fora(pathways = pathway,
genes = unique(input$ENTREZID),
universe = unique(uni$ENTREZID),
minSize = 15,
maxSize = 500)
# coll_respath = collapsePathwaysORA(respath[order(pval)][padj < 0.1],
# pathway, unique(input$ENTREZID), unique(uni$ENTREZID))
as_tibble(respath) %>%
mutate(database=db_name, NES=(overlap/size)/(total_deg))
}) %>% bind_rows() %>%
mutate(analysis="ORA")
ora_tb = pathways_ora_all %>% unnest(overlapGenes) %>%
group_by(pathway) %>%
left_join(uni, by =c("overlapGenes"="ENTREZID")) %>%
dplyr::select(pathway, padj, NES, SYMBOL, analysis,
database) %>%
group_by(pathway,padj,NES,database,analysis) %>%
summarise(genes=paste(SYMBOL,collapse = ","))
ora_tb

}

2 changes: 1 addition & 1 deletion inst/templates/rnaseq/DE/DEG.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -604,7 +604,7 @@ fa_list=lapply(de_list,function(contrast){
input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL'))
up=run_fora(input_entrezid, universe_mapping,all_in_life)
ora_input = res %>% filter(!is.na(padj), padj<0.01, lfc<0.3) %>% pull(gene_id)
ora_input = res %>% filter(!is.na(padj), padj<0.01, lfc<-0.3) %>% pull(gene_id)
#change to the right species
input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL'))
down=run_fora(input_entrezid, universe_mapping,all_in_life)
Expand Down

0 comments on commit 983a73d

Please sign in to comment.