Add statistic sorting gsea (#270)

* adding another sorting metric * heavy improving of helpfile * merge develop to update GSEA doc (#279)
ICB-DCM · Jul 23, 2024 · 8e5c24c · 8e5c24c
1 parent bbdcb09
commit 8e5c24c
Show file tree

Hide file tree

Showing 4 changed files with 83 additions and 82 deletions.
diff --git a/docs/interface-details/enrichment-analysis.md b/docs/interface-details/enrichment-analysis.md
@@ -24,18 +24,21 @@ In the side panel, you have the following options:
 - in case of a gene set enrichment analysis, you will have the following options
 
   - **Select the metric to sort the genes after**: Gene set enrichment analysis sorts 
-    **all** your gene by some measure and then supplies this ranked list of genes to 
+    **all** your genes by some measure and then supplies this ranked list of genes to 
     an algorithm, that goes through this list and subsequently checks whether the gene 
-    in question belongt to a specified geneset. From this a score is calculated. The 
-    choice here is between logFoldChanges or absolute logFoldChanges.
+    in question belongs to a specified geneset. From this a score is calculated. The 
+    choice here is between logFoldChanges, absolute logFoldChanges or t-statistic value.
+    Note that for the latter no effect size is taken into account. Still, a positive t-statistic value indicates that the gene is upregulated in the treatment group, while a negative value indicates that the gene is downregulated in the treatment group. More information can be found [here](https://www.gsea-msigdb.org/gsea/doc/GSEAUserGuideFrame.html?Run_GSEA_Page).
+
 
   - With **Choose type for LFC-based ordering**, **Choose reference of log2 
     FoldChange**, and **Choose treatment group of log2 FoldChange** you can specify 
     which two groups to compare against each other.
 - in case of an overrepresentation analysis you hand over a list of genes, which then 
   will be compared to the geneset of interest, based on a supplied `universe of genes`,
   i.e. a set of all "possible" gene names. Thus your choices are
-  - **Select a Universe for enrichment (default is[clusterProfilers](https://bioconductor.org/packages/release/bioc/html/clusterProfiler.html)
+
+  - **Select a Universe for enrichment (default is [clusterProfilers](https://bioconductor.org/packages/release/bioc/html/clusterProfiler.html)
     default)**: Aside from the default options you can also choose the list of **your 
     own** genes from data, before or after preprocessing.
   - **Choose a gene set to hand over to enrich**: either you provide your own set, in 

diff --git a/program/shinyApp/R/enrichment_analysis/server.R b/program/shinyApp/R/enrichment_analysis/server.R
@@ -449,12 +449,15 @@ enrichment_analysis_Server <- function(id, data, params, updates){
             selectInput(
               inputId = ns("ValueToAttach"),
               label = "Select the metric to sort the genes after",
-              choices = c("LFC_abs", "LFC"),
+              choices = list(
+                "log fold change (LFC)"="LFC",
+                "absolute LFC"="LFC_abs", 
+                "t-statistic value"="statistic_value"),
               selected = input$ValueToAttach
             )
           })
           req(input$ValueToAttach)
-          if(input$ValueToAttach == "LFC" | input$ValueToAttach == "LFC_abs"){
+          if(input$ValueToAttach == "LFC" | input$ValueToAttach == "LFC_abs" | input$ValueToAttach == "statistic_value"){
             output$sample_annotation_types_cmp_GSEA_ui <- renderUI({
               req(data_input_shiny())
               if(is.null(ea_reactives$data)){
@@ -598,7 +601,8 @@ enrichment_analysis_Server <- function(id, data, params, updates){
       })
       ## Do enrichment ----
       geneSetChoice <- reactive({
-        if(isTruthy(input$GeneSet2Enrich)){
+
+        if(isTruthy(input$GeneSet2Enrich) & input$ORA_or_GSE == "OverRepresentation_Analysis" ){
           if(input$GeneSet2Enrich == "DE_Genes"){
             # TODO add option to send DE genes
             geneSetChoice_tmp <- DE_genelist()
@@ -627,7 +631,7 @@ enrichment_analysis_Server <- function(id, data, params, updates){
             geneSetChoice_tmp <- res_tmp[[session$token]]$Heatmap$gene_list
           }
         }else{
-          if(input$ValueToAttach == "LFC" | input$ValueToAttach == "LFC_abs"){
+          if(input$ValueToAttach == "LFC" | input$ValueToAttach == "LFC_abs" | input$ValueToAttach == "statistic_value"){
             #takes all genes after preprocessing
             #get LFC
             ctrl_samples_idx <- which(colData(ea_reactives$data)[,input$sample_annotation_types_cmp_GSEA] %in% input$Groups2Compare_ref_GSEA)
@@ -662,6 +666,8 @@ enrichment_analysis_Server <- function(id, data, params, updates){
             Data2Plot_tmp <- Data2Plot
             if(input$ValueToAttach == "LFC"){
               geneSetChoice_tmp <- Data2Plot_tmp$LFC
+            }else if(input$ValueToAttach == "statistic_value"){
+              geneSetChoice_tmp <- Data2Plot_tmp$statistic
             }
             else if(input$ValueToAttach == "LFC_abs"){
               geneSetChoice_tmp <- abs(Data2Plot_tmp$LFC)

diff --git a/program/shinyApp/R/fun_LFC.R b/program/shinyApp/R/fun_LFC.R
@@ -6,43 +6,45 @@ getLFCs <- function(
   completeOutput = FALSE
 ){
   df <- as.data.frame(data)
-  # Todo by @Lea: discuss and finalize how to handle this. constant row are not removed but small noise is added should in here a check if all 0 rows?
+  # constant rows result in NA p-values
   ttest_raw <- function(df, grp1, grp2) {
     x <- df[grp1]
     y <- df[grp2]
     x <- as.numeric(x)
     y <- as.numeric(y)
-    results <- t.test(x, y)
-    return(results$p.value)
-  }
-  #remove constant rows
-  removedAsConst_1 <- which(apply(df[,ctrl_samples_idx],1,sd) < 1e-6)
-  df[removedAsConst_1,ctrl_samples_idx] <- df[removedAsConst_1,ctrl_samples_idx] + t(apply(df[removedAsConst_1,ctrl_samples_idx],1,function(x){
-    rnorm(
-      n = length(x),
-      mean = 0,
-      sd=0.0000001
-    )
-  }))
-  removedAsConst_2 <- which(apply(df[,comparison_samples_idx],1,sd) < 1e-6)
-  df[removedAsConst_2,comparison_samples_idx] <- df[removedAsConst_2,comparison_samples_idx] + t(apply(df[removedAsConst_2,comparison_samples_idx],1,function(x){
-    rnorm(
-      n = length(x),
-      mean = 0,
-      sd=0.0000001
+    results <- NULL
+    tryCatch({
+      results <- t.test(y, x)
+      results <- list(p.value = results$p.value, statistic = unname(results$statistic))
+    },
+
+    error = function(e) {
+        results <- list(p.value = NA, statistic = NA)
+        }
     )
-  }))
+
+    if(is.null(results)){
+      results <- list(p.value = NA, statistic = NA)
+    }
 
-  rawpvalue <- apply(df, 1, ttest_raw, grp1 = ctrl_samples_idx, grp2 = comparison_samples_idx)
+    return(unlist(results))
+  }
 
-  p_adj <- p.adjust(rawpvalue, method = "fdr")
+  rawpvalue_stat <- apply(df, 1, ttest_raw, grp1 = ctrl_samples_idx, grp2 = comparison_samples_idx)
+
+  p_adj <- p.adjust(rawpvalue_stat["p.value",], method = "fdr")
   Ctrl_mean <- apply(df[,ctrl_samples_idx],1,mean)
   Cmp_mean <- apply(df[,comparison_samples_idx],1,mean)
+  # check whether any of the group means are 0
+  # set those to NA: with a mean of 0 the fold change cannot be computed, even though such genes may be significant and potentially really interesting
+  Ctrl_mean[which(Ctrl_mean == 0)] <- NA
+  Cmp_mean[which(Cmp_mean == 0)] <- NA
+
   FC <- Cmp_mean/Ctrl_mean
   LFC <- log2(FC)
 
   # Data 2 Plot
-  results <- cbind(LFC, rawpvalue,p_adj)
+  results <- t(rbind(rawpvalue_stat,LFC,p_adj))
   results <- as.data.frame(results)
   results$probename <- rownames(results)
   if(completeOutput){

diff --git a/program/shinyApp/helpfiles/EA_Options.md b/program/shinyApp/helpfiles/EA_Options.md
@@ -6,89 +6,79 @@ For more details read here on
 [Gene Set Enrichment Analysis](https://www.pnas.org/doi/abs/10.1073/pnas.0506580102) 
 and/or [Over-Representation Analysis](https://doi.org/10.1093/bioinformatics/bth456).
 
-You can choose between the two analyses by selecting either of the two below `Choose 
-type of Analysis.`
+**Choose an organism**
+- You need to select the organism from which your data is derived. This is used to translate and harmonize the entity labels to Entrez gene IDs.
+- Choices include human and mouse; the exact version used is shown in brackets.
+
+Now, you can choose between the two analyses by selecting either of the two below `Choose 
+type of Analysis.` Depending on your choice different options need to be set.
+
 
 <h3>Gene Set Enrichment Options</h3> 
 <details>
-<summary>Click here</summary>
+<summary><span style="color:blue">Click here</span></summary>
 <br>
 
-**1. Specify Organism:**
-   - Choose the organism for which the enrichment analysis will be performed.
-   - Dropdown menu labeled "Specify your current organism."
-   - Choices include "hsa" (Human) or "mmu" (Mouse).
-
-**2. Choose Metric for Gene Sorting (GeneSetEnrichment):**
-   - Select the metric to sort genes after performing Gene Set Enrichment.
-   - Dropdown menu labeled "Select the metric to sort the genes after."
-   - Choices include "LFC_abs" (absolute Log Fold Change) or "LFC" (Log Fold Change).
-   - Single selection only.
+**1. Choose Metric for Gene Sorting:**
+   - The GSEA takes a ranked list as input. The ranking can be done on different metrics. Which metric to use depends on the context.
+   - The possible options are "Log Fold change", "absolute LogFold change" and "t-statistic value"
+    - Log Fold change: The log2 fold change of the gene expression between the two groups specified further below. Note that no arbitrary cutoff based on significance level is applied.
+    - Absolute LogFold change: The absolute value of the log2 fold change of the gene expression between the two groups specified further below. Note that no arbitrary cutoff based on significance level is applied.
+    - t-statistic value: The t-statistic value of the gene expression between the two groups specified further below. Note, that here no effect size is taken into account. Still, a positive t-statistic value indicates that the gene is upregulated in the treatment group, while a negative value indicates that the gene is downregulated in the treatment group.
 
-**3. Choose Comparison to sort by LFC:**
-   - Select annotation type and groups to calculate LFC and sort accordingly.
-   - Dropdown menus labeled "Choose type for LFC-based ordering," "Choose reference of log2 FoldChange," and "Choose treatment group of log2 FoldChange."
-   - Choices are based on sample annotations from the dataset.
+**2. Choose type for LFC-based ordering:**
+   - Here, you select the annotation type (options are the names of the columns within your supplied sample annotation). 
+   - This selection will be used to determine the options for **treatment group** and the **reference group** for the log2 fold change calculation.
+   - Note, you need at least 2 samples per group to calculate the log2 fold change.
 
-**4. Choose Gene Sets for Enrichment:**
+**3. Choose Gene Sets for Enrichment:**
    - Select the gene sets for which enrichment analysis will be performed.
-   - Dropdown menu labeled "Choose sets to do enrichment for."
    - Choices include various gene set collections like KEGG, GO, Hallmarks, etc.
    - See the help icon next to the dropdown menu for more details on the sets.
    - Multiple selections allowed.
 
-**5. Test Correction Method:**
-   - Choose the test correction method for the enrichment analysis. For more details 
-     on the correction methods, click on the help icon at `Significance Analysis`.
-   - Dropdown menu labeled "Test correction."
+**4. Test Correction Method:**
+   - Choose the test correction method for the enrichment analysis. 
    - Choices include "None," "Bonferroni," "Benjamini-Hochberg," "Benjamini Yekutieli," "Holm," "Hommel," "Hochberg," and "FDR."
-   - Single selection only.
-
+
 </details>
 
 <h3>Over-Representation Analysis Options</h3>
 <details>
-<summary>Click here</summary>
+<summary><span style="color:blue">Click here</span></summary>
 <br>
 
-**1. Specify Organism:**
-   - Choose the organism for which the enrichment analysis will be performed.
-   - Dropdown menu labeled "Specify your current organism."
-   - Choices include "hsa" (Human) or "mmu" (Mouse).
-
-**2. Choose Gene Set for Over-Representation Analysis:**
+**1. Choose Gene Sets for Enrichment:**
    - Select the gene sets for which enrichment analysis will be performed.
-   - Dropdown menu labeled "Choose sets to do enrichment for."
    - Choices include various gene set collections like KEGG, GO, Hallmarks, etc.
    - See the help icon next to the dropdown menu for more details on the sets.
    - Multiple selections allowed.
 
-**3. Provide Geneset for Over-Representation Analysis:**
- - Provide a custom gene set for over-representation analysis.
- - Dropdown menu labeled "Choose a gene set to hand over to enrich."
- - Options
+**2. Choose a gene set to hand over to enrich:**
+ - ORA takes a gene set of interest into account, which needs to be specified here.
+ - Options:
    - ProvidedGeneSet: Upload a custom gene set file for over-representation analysis 
-     in the file upload below.
-   - HeatmapGenes: Use the genes from the heatmap for over-representation analysis.
+     in the file upload below. This should be a .csv file with a single column of gene ids.
+   - HeatmapGenes: Use the genes from the heatmap for over-representation analysis. Note that
+   for this option it is necessary to press the respective button within the Heatmap tab!
 
-**4. Upload Gene Set for Over-Representation Analysis:**
+**3. Upload Gene Set for Over-Representation Analysis:**
    - Upload a custom gene set file for over-representation analysis.
    - File upload input labeled "Select a file (.csv, 1 column, ENSEMBL, e.g., ENSMUSG....)"
    - Visible when "ProvidedGeneSet" is selected.
 
-**5. Select Universe for Over-Representation Analysis:**
-   - Choose the universe for over-representation analysis.
-   - Dropdown menu labeled "Select a Universe for enrichment."
-   - Choices include "default," "allPresentGenes_after_pre_process," or "allPresentGenes_before_pre_process."
+**4. Select a Universe for Enrichment:**
+   - Choose the universe for over-representation analysis. As ORA checks whether your set of interest is enriched compared to what you would expect, you need to specify the 'normal' situation, called the universe.
+   - Options:
+     - default: The default universe is all genes in the organism selected (clusterProfiler's default) - Not recommended if you performed, e.g., an omics experiment and hence actually profiled the universe of genes.
+     - after_pre_process: The universe is all genes that are present in the data after pre-processing.
+     - before_pre_process: The universe is all genes that are present in the data before pre-processing.
    - Single selection only.
 
-**6. Test Correction Method:**
-   - Choose the test correction method for the enrichment analysis. For more details 
-     on the correction methods, click on the help icon at `Significance Analysis`.
-   - Dropdown menu labeled "Test correction."
-   - Choices include _None_, _Bonferroni_, _Benjamini-Hochberg_, _Benjamini Yekutieli_,
-     _Holm_, _Hommel_, _Hochberg_ and _FDR_.
-   - Single selection only.
+**5. Test Correction Method:**
+   - Choose the test correction method for the enrichment analysis. 
+   - Choices include "None," "Bonferroni," "Benjamini-Hochberg," "Benjamini Yekutieli," "Holm," "Hommel," "Hochberg," and "FDR."
+
 
 </details>