Merge pull request #13 from yfyang86/v0.0.2

V0.0.2
yfyang86 · Dec 6, 2022 · ce96dcb · ce96dcb
2 parents 2a63562 + 0d97da3
commit ce96dcb
Show file tree

Hide file tree

Showing 43 changed files with 9,583 additions and 30 deletions.
diff --git a/.gitignore b/.gitignore
@@ -14,4 +14,11 @@ Thumbs.db
 ./simulation
 simulation/*
 
-.vscode
+.vscode
+
+
+.so
+.o
+
+proc2
+.proc2
diff --git a/README.md b/README.md
@@ -16,7 +16,7 @@ $$LRT \sim (1-\hat{EP}) \chi_0^2 + \hat{EP} \chi_1^2,$$ where $\hat{EP}$ is esti
 
 Specially, PMAT-C with $\hat{EP} =0.5$ leads to PMAT. Thus, our tool provides PMAT-C in default.
 
-**NOTE:** Currently, all the parameters are hard coded in the `./proc/src/mathematics.cpp` file (the leading 4 entries in the `samParam` array):
+**NOTE**: Currently, all the parameters are hard coded in the `./proc/src/mathematics.cpp` file (the leading 4 entries in the `samParam` array):
 ```cpp
 double foldednomalpvalue(double *a, int m, double *b, int n, bool logit=true, double logittuning=0.00001, bool nonZero = true){
     bool useBartlette = true;
@@ -174,42 +174,44 @@ chr1    837884  838368  0.106765        14      0.12877 0.84016 0.81546
 chr1    848507  848646  0.076560        5       0.5     0.67124 0.65959
 chr1    136631  136718  0.140452        6       0.015472        0.85088 0.83067
 ```
-The following table explains the columns in output file.
-
-| Column| Comment | Example |
-|:--------|:--------|:--------|
-| 1 | chromosome | Example: chr1 |
-| 2| start  | Example: 661864 |
-| 3 | end | Example: 661928 |
-| 4 | avarge absolute 5mC difference between pairs| Example: 0.066676 |
-| 5| #CpGs in a region | Example: 5 |
-| 6 | p-value via FN-C test or FN test | Example: 0.0021516 |
-| 7 | average 5mC in one group| Example: 0.75823 |
-| 8 | average 5mC in the other group | Example: 0.79141 |
 
+The following table explains the columns in output file (with column Names).
 
-Run the following command to add column names and FDR value via BH method. (Perl script is available upon request.)
+| Column| Name | Comment | Example |
+|:--------|:--------|:--------|:--------|
+| 1 | chr | chromosome | Example: chr1 |
+| 2 | start | start  | Example: 661864 |
+| 3 | stop | end | Example: 661928 |
+| 4 | abs.methyl.diff | avarge absolute 5mC difference between pairs| Example: 0.066676 |
+| 5 | CpGs | #CpGs in a region | Example: 5 |
+| 6 | pFN | p-value via FN-C test or FN test | Example: 0.0021516 |
+| 7 | 5mC.A | average 5mC in one group| Example: 0.75823 |
+| 8 | 5mC.B | average 5mC in the other group | Example: 0.79141 |
+| 9 | FDR | ajusted p-value (FDR) by B-H method | Example: 0.55623 |
 
-```bash
-echo -e "chr\tstart\tstop\tabs.methyl.diff\tCpGs\tpFN\t5mC.A\t5mC.B" > test.mr.DMR.fdr
-num_lines.pl test.mr.DMR > o
-pvalues_BP_correction.pl o 0 8 | sort -n -k1 | mycut.pl -v -f1 | format_tab.pl >> test.mr.DMR.fdr
-```
-
-Alternatively, readers can add column names and FDR values via BH method using the following R script.
+Users could extend/verify the last column (FDR) by the follorwing R script. We should point out `method = "BH"` and `method = "fdr"` calculate the same statistic in R `p.adjust()`.
 ```R
-data <- read.table("test.mr.DMR", header = F)
-colnames(data) <- c("chr", "start", "stop", "abs.methyl.diff", "CpGs", "pFN", "5mC.A", "5mC.B")
-data$FDR <- p.adjust(data$pFN, method = "BH")
-write.table(data, "test.mr.DMR.fdr", row.names= F, sep = "\t", quote = F)
+data <- read.table("test.mr.DMR", header = T)
+FDR2 <- p.adjust(data$pFN, method = "BH")
+#### Verification ####
+# plot(data$FDR, FDR2, xlab = 'PMAT', ylab = 'R-p.adjust',
+#      xlim = c(0,1), ylim = c(0,1))
+var(data$FDR / FDR2)
+#> [1] 1.70628e-10
 ```
 
+The result illustrates that `PMAT` and `R::p.adjust` derive the same `fdr`.
 
+**NOTE**: Other p-value adjustments are possible in R, but they are not within the scope of PMAT. We provide a simple R script `caladj.R` for users with this intrest.
+```bash
+Rscript caladj.R DMR_OUTPUT_of_PMAT [METHOD]
+```
+Here `[METHOD]` is optional, it could be one of  `holm`, `hochberg`, `hommel`, `bonferroni`, `BH`, `BY`, `fdr`, and `none`. By default, we use `fdr` (`BH`). Please check the manual of `p.adjust` in R for more details. For example `Rscript caladj.R test.mr.DMR fdr` will generate a `test.mr.DMR.fdr` file, with `FDR` value as the last column.
 
 ## License
 
 C/C++ version (in the `./proc` folder):
-- `mm2.h`, `mm2.cpp`： Apache-2.0
+- `mm2.h`, `mm2.cpp`, `utils.h`, `utils.cpp`： Apache-2.0
 - Others： GPL v2.0
 
 ## Change logs:
@@ -221,7 +223,7 @@ C/C++ version (in the `./proc` folder):
   - [x] Windows
 - [ ] 2022-11: Example/Simulation
 	- [x] Simple demo
-	- [ ] Full example (in plan)
+	- [x] Full example (in plan)
 - [ ] Config file support for the Bartlett correction : In the comming Version
   - [x] Documentation and Examples.
   - [ ] `Config.txt` support without re-compile. 

diff --git a/proc/Makefile b/proc/Makefile
@@ -22,6 +22,7 @@
 	src/vstack.o\
 	src/segmentstack.o\
 	src/mtc.o\
+	src/utils.o\
 	src/metseg.o
 
 

diff --git a/proc/caladj.R b/proc/caladj.R
@@ -0,0 +1,17 @@
+#!/usr/local/bin/Rscript
+args = commandArgs(trailingOnly=TRUE)
+data <- read.table(args[1], header = T)
+## users could change the p-value adjustment method
+if (length(args) == 1){
+    method = "fdr"
+}else{
+    if (args[2] %in% c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none")){
+        method = args[2]
+    }else{
+        method = "fdr"
+    }
+}
+data$FDR <- p.adjust(data$pFN, method = method)
+names(data)[ncol(data)] = toupper(method)
+write.table(data, file = paste("test.mr.DMR.", method, sep = '') , 
+            row.names= F, sep = "\t", quote = F)
diff --git a/proc/src/metseg.cpp b/proc/src/metseg.cpp
@@ -33,6 +33,7 @@
 #include <ctype.h>
 #include <float.h>
 #include "mm2.h" //ad by xp
+#include "utils.h"
 
 #define MAXN 13
 #define MAXM 13
@@ -2632,9 +2633,23 @@ int main(int argc, char** argv) {
   if(nfo.mode == 1 || nfo.mode == 2) {
     fprintf(stderr, "Number of Tests: %d\n", nfo.outputList->numberTests);
     multiple_testing_correction(nfo.outputList, nfo.mode, nfo.mtc);
+
+    std::vector<double> fnpvalue;
+    fnpvalue.clear();
+       for(int i=0;i<nfo.outputList->i;i++){
+      if(nfo.outputList->segment_out_st[i].meandiff >= nfo.minMethDist || nfo.outputList->segment_out_st[i].meandiff <= -1* nfo.minMethDist) {
+         fnpvalue.push_back(nfo.outputList->segment_out_st[i].mwu);
+         }
+      }
+    std::vector<double> fnpvalue_re = padj_bh(fnpvalue);
+
+    fprintf(stdout, "%s",
+    "chr\tstart\tstop\tabs.methyl.diff\tCpGs\tpFN\t5mC.A\t5mC.B\tFDR\n");
+    int jj = 0;
+
     for(int i=0;i<nfo.outputList->i;i++){
       if(nfo.outputList->segment_out_st[i].meandiff >= nfo.minMethDist || nfo.outputList->segment_out_st[i].meandiff <= -1* nfo.minMethDist) {
-        fprintf(stdout, "%s\t%d\t%d\t%f\t%d\t%.5g\t%.5g\t%.5g\n",
+        fprintf(stdout, "%s\t%d\t%d\t%f\t%d\t%.5g\t%.5g\t%.5g\t%.5g\n",
                 nfo.outputList->segment_out_st[i].chr,
                 nfo.outputList->segment_out_st[i].start,
                 nfo.outputList->segment_out_st[i].stop,
@@ -2644,7 +2659,9 @@ int main(int argc, char** argv) {
                 nfo.outputList->segment_out_st[i].mwu,
                 //nfo.outputList->segment_out_st[i].p,
                 nfo.outputList->segment_out_st[i].methA,
-                nfo.outputList->segment_out_st[i].methB);
+                nfo.outputList->segment_out_st[i].methB,
+                fnpvalue_re[jj]);
+        ++jj;
         }
     }
   }

diff --git a/proc/src/utils.cpp b/proc/src/utils.cpp
@@ -0,0 +1,154 @@
+#include <fstream>
+#include <string>
+#include <iomanip>
+#include <sstream>
+#include <iostream>
+#include <map>
+#include <random>
+#include "utils.h"
+
+bool inx_tie_incr(inx_tie x, inx_tie y){
+    return (x.value < y.value);
+}
+
+bool inx_tie_decr(inx_tie x, inx_tie y){
+    return (y.value < x.value);
+}
+
+static std::map<std::string, int> solver_sam_load_config_setting(){
+/*
+a: 0.67672
+b: -0.03785
+c: 0.
+d: 0.
+*/
+    std::map<std::string, int> s_config;
+    s_config["a"] = 0;
+    s_config["b"] = 1;
+    s_config["c"] = 2;
+    s_config["d"] = 3;
+    s_config["useBartlett"]=4;
+    s_config["useFoldedNormal"]=5;
+    return s_config;
+}
+
+
+void solver_sam_load_config(double *  config){
+  std::string line;
+  std::ifstream config_file("config.txt");
+  std::string option;
+  double option_val;
+  static std::map<std::string, int> s_congfig = solver_sam_load_config_setting();
+  std::map<std::string, int>::iterator it;
+
+  for (size_t j=0; j<4; j++) config[j] = 0.;
+
+  if (!config_file.is_open()) {
+      std::cerr << "Could not open the file - '"
+           << "config.txt" << "'" << std::endl;
+      exit(EXIT_FAILURE);
+  }else {
+      while ( getline (config_file,option, ':') && getline (config_file,line) ){
+                std::istringstream(line) >> option_val;
+                it = s_congfig.find(option);
+                if (it!=s_congfig.end()) config[it->second] = option_val;
+    }
+  config_file.close();
+  }
+}
+
+/** BH method Randomize break the tie
+ * @brief
+ * By applying the above formulation, the FDR controlling MCP introduced in 
+ * Benjamini and Hochberg (1995) can be rephrased into a FDR $p$-value 
+ * adjustment. Benjamini and Hochberg's MCP is as follows: compare each 
+ * ordered $p$-value $p_{(i)}$ with $\alpha \cdot i / m$. 
+ * Let $k=\max \left\{i: p_{(i)} \leqslant \alpha \cdot i / m\right\}$, 
+ * if such $k$ exists reject $\mathrm{H}_{(1)}^0, \ldots, \mathrm{H}_{(k)}^0$.
+ * A $q$ sized $\mathrm{MCP}$ rejects an hypotheses $\mathrm{H}_{(i)}^0$ 
+ * if for some $k, k=i, \ldots, m, p_{(k)} \cdot m / k \leqslant \alpha$, 
+ * thus we define the BH FDR $p$-value adjustment as
+ * $$
+ * Q_{\text {adj }}^{\mathrm{BH}}\left(p_{(i)}\right)
+ *     =
+ *         \min _{i \leqslant k}\left\{p_{(k)} \cdot m / k\right\}
+ * $$
+ * @example{
+ *  std::vector<double> ex;
+ *  for (auto v : padj_bh(ex)){
+ *    std::cout<<v<<"\t";
+ *   }
+ * }
+*/
+std::vector<double> padj_bh(std::vector<double> & x){
+  /*
+        i <- lp:1L
+        o <- order(p, decreasing = TRUE)
+        ro <- order(o)
+        pmin(1, cummin(n/i * p[o]))[ro]
+  */
+    order_tie tmp_tie;
+    int cnt = 1;
+    std::map<int, order_tie> xp;
+    std::vector<inx_tie> yinx;
+    std::vector<inx_tie> roinx;
+    inx_tie xxxx;
+    int m = ((int) x.size());
+    std::vector<int> xorder;
+    std::vector<int> xorderinc;
+    std::vector<double> y = x;
+
+
+    for (auto xui:x){
+        xxxx.inx = cnt - 1;
+        xxxx.value = xui;
+        yinx.push_back(xxxx);
+        cnt++;
+    }    
+
+    std::sort(yinx.begin(), yinx.end(), inx_tie_decr);//decreasing
+
+    cnt = 1;
+    for (auto xui:yinx){
+        xxxx.inx = cnt - 1;
+        xxxx.value = (double) xui.inx;
+        roinx.push_back(xxxx);
+        cnt++;
+    }    
+
+    std::sort(roinx.begin(), roinx.end(), inx_tie_incr);//increasing
+
+    for (int i = 0; i< m; ++i){// n log(n)
+        xorder.push_back(i);
+        xorderinc.push_back(i);
+    }
+
+    for (int i = 0; i< m; ++i){
+        xorder[i]  = yinx[i].inx; // n
+        xorderinc[i] = roinx[i].inx;
+    }
+
+
+    for (int i = 0; i< m; ++i){
+        x[i] = yinx[i].value * m /(m-i+.0);
+    }
+
+
+    for (int i = 0; i< m; ++i){
+      if (i+1 < m && x[i+1]>x[i]){
+        x[i+1] = x[i];
+      }
+    }
+
+
+
+    for (int i = 0; i< m; ++i){
+        y[i] = x[xorderinc[i]];
+        y[i] = (
+          (y[i] > 1. || (!std::isfinite(y[i]))?  1. : y[i])
+        );
+    }
+
+    return y;
+}
+
diff --git a/proc/src/utils.h b/proc/src/utils.h
@@ -0,0 +1,35 @@
+#ifndef MAIN_UTILS_H
+#define MAIN_UTILS_H
+
+#include <fstream>
+#include <string>
+#include <iomanip>
+#include <sstream>
+#include <iostream>
+#include <map>
+#include <random>
+#include <map>
+#include <algorithm>
+#include <vector>
+
+static std::map<std::string, int> solver_sam_load_config_setting();
+void solver_sam_load_config(double * conifg);
+
+struct  order_tie
+{
+   double value;
+   std::vector<int> loc;
+   int cnt;
+};
+
+struct inx_tie{
+    double value;
+    int inx;
+};
+
+bool inx_tie_incr(inx_tie x, inx_tie y);
+bool inx_tie_decr(inx_tie x, inx_tie y);
+
+std::vector<double> padj_bh(std::vector<double> & x);
+
+#endif
-Original file line number
+Diff line change
@@ Expand Up / @@ -14,4 +14,11 @@ Thumbs.db @@
     ./simulation
     simulation/*
-    .vscode
+    .vscode
+    .so
+    .o
+    proc2
+    .proc2