diff --git a/Makefile b/Makefile index e7c54b6..2e3a1e6 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ -all: satc satc_dump satc_merge sig_anch download_kmc nomad +all: satc satc_dump satc_merge sig_anch download_kmc splash supervised_test -NOMAD_LIBS_DIR = libs +SPLASH_LIBS_DIR = libs LIBS_DIR = . #/usr/local/lib INCLUDE_DIR= libs MIMALLOC_INLUCDE_DIR = libs/mimalloc/include @@ -14,14 +14,16 @@ COMMON_DIR=src/common OUT_BIN_DIR=bin CC = g++ -CFLAGS = -fPIC -Wall -O3 -m64 -std=c++17 -mavx -pthread -I $(INCLUDE_DIR) -I $(MIMALLOC_INLUCDE_DIR) -fpermissive -CLINK = -lm -std=c++17 -lpthread -static-libstdc++ -lgfortran +CFLAGS = -fPIC -Wall -O3 -m64 -std=c++17 -pthread -I $(INCLUDE_DIR) -I $(MIMALLOC_INLUCDE_DIR) -fpermissive +CLINK = -lm -std=c++17 -lpthread -static-libstdc++ -release: CLINK = -lm -std=c++17 -static -lgfortran -lquadmath -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -release: CFLAGS = -fPIC -Wall -O3 -DNDEBUG -m64 -std=c++17 -mavx -pthread -I $(INCLUDE_DIR) -I $(MIMALLOC_INLUCDE_DIR) -fpermissive +MIMALLOC_OBJ=libs/mimalloc/mimalloc.o + +release: CLINK = -lm -std=c++17 -static -Wl,--whole-archive -lpthread -Wl,--no-whole-archive +release: CFLAGS = -fPIC -Wall -O3 -DNDEBUG -m64 -std=c++17 -pthread -I $(INCLUDE_DIR) -I $(MIMALLOC_INLUCDE_DIR) -fpermissive release: all -debug: CFLAGS = -fPIC -Wall -O0 -g -m64 -std=c++17 -mavx -pthread -I $(INCLUDE_DIR) -I $(MIMALLOC_INLUCDE_DIR) -fpermissive +debug: CFLAGS = -fPIC -Wall -O0 -g -m64 -std=c++17 -pthread -I $(INCLUDE_DIR) -I $(MIMALLOC_INLUCDE_DIR) -fpermissive debug: all ifdef MSVC # Avoid the MingW/Cygwin sections @@ -47,18 +49,22 @@ prefix = /usr/local # optional install location exec_prefix = $(prefix) +$(MIMALLOC_OBJ): + $(CC) -DMI_MALLOC_OVERRIDE -O3 -DNDEBUG -fPIC -Wall -Wextra -Wno-unknown-pragmas -fvisibility=hidden -ftls-model=initial-exec -fno-builtin-malloc -c -I libs/mimalloc/include libs/mimalloc/src/static.c -o $(MIMALLOC_OBJ) + %.o: %.cpp $(CC) $(CFLAGS) -c $< -o $@ satc: $(OUT_BIN_DIR)/satc $(OUT_BIN_DIR)/satc: $(SATC_MAIN_DIR)/satc.o \ - $(SATC_MAIN_DIR)/kmc_api/kmc_file.o \ - $(SATC_MAIN_DIR)/kmc_api/mmer.o \ - $(SATC_MAIN_DIR)/kmc_api/kmer_api.o + $(COMMON_DIR)/kmc_api/kmc_file.o \ + $(COMMON_DIR)/kmc_api/mmer.o \ + $(COMMON_DIR)/kmc_api/kmer_api.o \ + $(COMMON_DIR)/illumina_adapters_static.o -mkdir -p $(OUT_BIN_DIR) $(CC) -o $@ $^ \ - $(NOMAD_LIBS_DIR)/$(LIB_ZSTD) \ + $(SPLASH_LIBS_DIR)/$(LIB_ZSTD) \ $(CLINK) satc_merge: $(OUT_BIN_DIR)/satc_merge @@ -69,7 +75,7 @@ $(OUT_BIN_DIR)/satc_merge: $(SATC_MERGE_MAIN_DIR)/satc_merge.o \ $(SATC_MERGE_MAIN_DIR)/extra_stats.o -mkdir -p $(OUT_BIN_DIR) $(CC) -o $@ $^ \ - $(NOMAD_LIBS_DIR)/$(LIB_ZSTD) \ + $(SPLASH_LIBS_DIR)/$(LIB_ZSTD) \ $(CLINK) satc_dump: $(OUT_BIN_DIR)/satc_dump @@ -77,7 +83,7 @@ satc_dump: $(OUT_BIN_DIR)/satc_dump $(OUT_BIN_DIR)/satc_dump: $(SATC_DUMP_MAIN_DIR)/satc_dump.o -mkdir -p $(OUT_BIN_DIR) $(CC) -o $@ $^ \ - $(NOMAD_LIBS_DIR)/$(LIB_ZSTD) \ + $(SPLASH_LIBS_DIR)/$(LIB_ZSTD) \ $(CLINK) sig_anch: $(OUT_BIN_DIR)/sig_anch @@ -92,24 +98,26 @@ download_kmc: -mkdir -p $(OUT_BIN_DIR) ./download_kmc.sh $(OUT_BIN_DIR) -nomad: - cp src/nomad.py bin/nomad +splash: + cp src/splash.py bin/splash +supervised_test: + cp src/supervised_test/supervised_test.R bin install: all install bin/* /usr/local/bin uninstall: - -rm /usr/local/bin/satc + -rm /usr/local/bin/satc -rm /usr/local/bin/satc_dump -rm /usr/local/bin/satc_merge - -rm /usr/local/bin/sig_anch - -rm /usr/local/bin/nomad + -rm /usr/local/bin/sig_anch + -rm /usr/local/bin/splash -rm /usr/local/bin/kmc -rm /usr/local/bin/kmc_tools -clean: +clean: -rm $(SATC_MAIN_DIR)/*.o - -rm $(SATC_MAIN_DIR)/kmc_api/*.o + -rm $(COMMON_DIR)/kmc_api/*.o -rm $(SATC_MERGE_MAIN_DIR)/*.o -rm $(SATC_DUMP_MAIN_DIR)/*.o -rm $(COMMON_DIR)/*.o diff --git a/NOMAD_extendor_classification.R b/NOMAD_extendor_classification.R index 475949b..1eb0c2e 100644 --- a/NOMAD_extendor_classification.R +++ b/NOMAD_extendor_classification.R @@ -41,7 +41,7 @@ riffle <- function(a, b) { # this function interleaves the elements of two vec ################################################## ############ input arguments ##################### args <- commandArgs(trailingOnly = TRUE) -directory = args[1] # the output directory used for the NOMAD run +directory = args[1] # the output directory used for the SPLASH run which_anchors_file = args[2] # flag to decide which anchor file (after correction or all anchors) to use, could be "after_correction" or "all" effect_size_cutoff = args[3] # the effect size cutoff for significant anchors (default 0.2) num_samples_cutoff = args[4] # the minimum number of sampels for an anchor to be called (default 20) diff --git a/README.md b/README.md index ceec790..32ae75f 100644 --- a/README.md +++ b/README.md @@ -1,44 +1,45 @@ -# NOMAD 2.0 +# SPLASH 2 ## Introduction -NOMAD is an unsupervised and reference-free unifying framework to discover regulated sequence variation through statistical analysis of k-mer composition in both DNA and RNA sequence. -NOMAD leverages our observation that detecting sample-regulated sequence variation, such as alternative splicing, RNA editing, gene fusions, V(D)J, transposable element mobilization, allele-specific splicing, genetic variation in a population, and many other regulated events can be unified–in theory and in practice. -This is achieved with a simple model, NOMAD, that analyzes k-mer composition of raw sequencing reads (Chaung et al. 2022). -NOMAD finds constant sequences (anchors) that are followed by a set of sequences (targets) with sample-specific target variation and provides valid p-values. -NOMAD is reference-free, sidestepping the computational challenges associated with alignment and making it significantly faster and more efficient than alignment, and enabling discovery and statistical precision not currently available, even from pseudo-alignment. +SPLASH is an unsupervised and reference-free unifying framework to discover regulated sequence variation through statistical analysis of k-mer composition in both DNA and RNA sequence. +SPLASH leverages our observation that detecting sample-regulated sequence variation, such as alternative splicing, RNA editing, gene fusions, V(D)J, transposable element mobilization, allele-specific splicing, genetic variation in a population, and many other regulated events can be unified–in theory and in practice. +This is achieved with a simple model, SPLASH, that analyzes k-mer composition of raw sequencing reads (Chaung et al. 2022). +SPLASH finds constant sequences (anchors) that are followed by a set of sequences (targets) with sample-specific target variation and provides valid p-values. +SPLASH is reference-free, sidestepping the computational challenges associated with alignment and making it significantly faster and more efficient than alignment, and enabling discovery and statistical precision not currently available, even from pseudo-alignment. -The first version of [NOMAD](https://github.com/salzman-lab/nomad/) pipeline proved its usefullness. +The first version of [SPLASH](https://github.com/salzman-lab/splash/) pipeline proved its usefullness. It was implemented mainly in Python with the use of NextFlow. Here we provide a new and improved implementation based in C++ and Python (Kokot et al. 2023). This new version is much more efficient and allows for the analysis of datasets >1TB size in hours on a workstation or even a laptop. ## How does it work -A key concept of NOMAD is the analysis of composition of pairs of substrings *anchor*–*target* across many samples. +A key concept of SPLASH is the analysis of composition of pairs of substrings *anchor*–*target* across many samples. The substrings can be adjacent in reads or can be separated by a *gap*. -The image below presents the NOMAD pipeline on a high-level. +The image below presents the SPLASH pipeline on a high-level. +![image](https://github.com/refresh-bio/SPLASH/assets/9378882/8210fee0-c877-4374-9938-e3c01ea69e76) -![image](https://user-images.githubusercontent.com/9378882/225988504-70266e4d-37e0-4c85-8c95-e47ad208cda9.png) + ## Installation ### Precompiled binaries -The easiest way to get nomad is to use [precompiled release](https://github.com/refresh-bio/NOMAD/releases). -To get the version 2.0.0 and run the example is is sufficient to do: +The easiest way to get SPLASH is to use [precompiled release](https://github.com/refresh-bio/SPLASH/releases). +To get the version 2.1.4 and run the example is is sufficient to do: ``` -curl -L https://github.com/refresh-bio/NOMAD/releases/download/v2.0.0/nomad-2.0.0.linux.x64.tar.gz | tar xz +curl -L https://github.com/refresh-bio/SPLASH/releases/download/v2.1.4/splash-2.1.4.linux.x64.tar.gz | tar xz cd example ./run-example.sh ``` ### Compile from sources -NOMAD is implemented as a number of applications written in the C++ programming language and a Python wrapper to run the whole pipeline. +SPLASH is implemented as a number of applications written in the C++ programming language and a Python wrapper to run the whole pipeline. Currently the software may be used only under Linux. A compiler supporting C++17 is needed to compile the code. -Use following snippet to install NOMAD. +Use following snippet to install SPLASH. ``` -git clone https://github.com/refresh-bio/nomad -cd nomad +git clone https://github.com/refresh-bio/splash +cd splash make -j sudo make install ``` @@ -47,7 +48,7 @@ To verify the installation on small example one may perform: ``` cd example ./download.py #download examplary data -nomad input.txt #run the pipeline with default parameters +splash input.txt #run the pipeline with default parameters ``` The result consists of two TSV files, namely, 1. `result.after_correction.all_anchors.tsv` @@ -57,7 +58,7 @@ The second file contains only anchors whose corrected p-value is below 0.05. ## Inputs There is a lot of parameters allowing to customize the pipeline. They can be grouped into several categories. -The parameters will be displayed when running nomad without parameters (or with `--help`). +The parameters will be displayed when running splash without parameters (or with `--help`). ### Input reads parameters: * `--input_file` — @@ -68,10 +69,14 @@ The parameters will be displayed when running nomad without parameters (or with ### Anchor filtering parameters: * `--poly_ACGT_len` — filter out all anchors containing poly(ACGT) of length at least (0 means no filtering) (default: 8) +* `--artifacts` — path to artifacts, each anchor containing artifact will be filtered out (default: ) +* `--dont_filter_illumina_adapters` — if used anchors containing Illumina adapters will not be filtered out (default: False) * `--anchor_unique_targets_threshold` — filter out all anchors for which the number of unique targets is <= anchor_unique_targets_threshold (default: 1) * `--anchor_count_threshold` — filter out all anchors for which the total count <= anchor_count_threshold (default: 50) * `--anchor_samples_threshold` — filter out all anchors for which the number of unique samples is <= anchor_samples_threshold (default: 1) * `--anchor_sample_counts_threshold` — filter out anchor from sample if its count in this sample is <= anchor_sample_counts_threshold (default: 5) +* `--n_most_freq_targets_for_stats` — use at most n_most_freq_targets_for_stats for each contignency table, 0 means use all (default: 0) +* `--min_hamming_threshold` — keep only anchors with a pair of targets that differ by >= min_hamming_threshold (default: 0) ### Statistical significance parameters: * `--pvals_correction_col_name` — for which column correction should be applied (default: pval_opt) @@ -79,18 +84,21 @@ The parameters will be displayed when running nomad without parameters (or with ### Reporting parameters: * `--outname_prefix` — prefix of output file names (default: result) -* `--n_most_freq_targets_for_stats` — use at most n_most_freq_targets_for_stats for each contignency table, 0 means use all (default: 0) * `--dump_Cjs` — output Cjs (default: False) * `--max_pval_opt_for_Cjs` — dump only Cjs for anchors that have pval_opt <= max_pval_opt_for_Cjs (default: 0.1) * `--n_most_freq_targets` — number of most frequent tragets printed per each anchor in stats mode (default: 2) * `--with_effect_size_cts` — if set effect_size_cts will be computed (default: False) +* `--with_pval_asymp_opt` — if set pval_asymp_opt will be computed (default: False) * `--dump_sample_anchor_target_count_txt` — if set contignency tables will be generated in text format (default: False) * `--dump_sample_anchor_target_count_binary` — if set contignency tables will be generated in binary (SATC) format, to convert to text format later `satc_dump` program may be used, it may take optionally mapping from id to sample_name (--sample_names param) (default: False) -* `--dont_clean_up` — if set then intermediate files will not be removed (default: False) +* `--dump_sample_anchor_target_count_txt` — if set contignency tables will be generated in text format (default: False) +* `dump_sample_anchor_target_count_binary` — if set contignency tables will be generated in binary (satc) format, to convert to text format later satc_dump program may be used, it may take optionally mapping from id to sample_name (--sample_names param) (default: False) +* `--dont_clean_up` — if set then intermediate files will not be removed (default: False) ### Directory parameters: -* `--sample_name_to_id` — file name with mapping sample name <-> sammpe id (default: sample_name_to_id.mapping.txt) * `--bin_path` — path to a directory where satc, satc_dump, satc_merge, sig_anch, kmc, kmc_tools binaries are (if any not found there nomad will check if installed and use installed) (default: ./) -* `--tmp_dir` — path to a directory where temporary files will be stored (default: let nomad decide) +* `--sample_name_to_id` — file name with mapping sample name <-> sammpe id (default: sample_name_to_id.mapping.txt) +* `--bin_path` — path to a directory where satc, satc_dump, satc_merge, sig_anch, kmc, kmc_tools binaries are (if any not found there splash will check if installed and use installed) (default: ./) +* `--tmp_dir` — path to a directory where temporary files will be stored (default: let splash decide) * `--logs_dir` — director where run logs of each thread will be stored (default: logs) ### High Performance Computing parameters: @@ -105,7 +113,9 @@ The parameters will be displayed when running nomad without parameters (or with * `--opt_num_inits` — the number of altMaximize random initializations (default: 10) * `--opt_num_iters` — the maximum number of iterations in altMaximize (default: 50) * `--num_rand_cf` — the number of random c and f used for pval_base (default: 50) +* `--num_splits` — the number of contingency table splits (default: 1) * `--opt_train_fraction` — in calc_stats mode use this fraction to create training data from contingency table (default: 0.25) +* `--without_alt_max` — if set int alt max and related stats will not be computed (default: False) ### 10X SC-RNAseq parameters: - `run_10X` @@ -127,7 +137,7 @@ There are following columns in the resulting tsv files | anchor | anchor | | | pval_opt | p-value from alternating maximization | number of iterations and other parameters can be configured with switches. Optimization formulation and statistical exposition in (Baharav et al. 2023). | | effect_size_bin | measure of anchor's effect size | bounded between [0,1], indicates how well the data is divided between 2 groups of columns. 0 indicates no different between groups, 1 indicates two groups of columns with disjoint row supports (Baharav et al. 2023). | -| pval_asymp_base | asymptotically valid p-value based on alternating maximization | compute optimizing c and f used for pval_opt, and evaluate approximate p-value given by asymptotic normality (Baharav et al. 2023). | +| pval_asymp_opt | asymptotically valid p-value based on alternating maximization | compute optimizing c and f used for pval_opt, and evaluate approximate p-value given by asymptotic normality (Baharav et al. 2023). | | pval_base | p-value based on random partitionings | Compute base p-value (using num_rand_cf random c,f), Bonferroni correction to yield valid p-value. | | M | anchor count | number of anchor occurences in the input (= the sum of elements in contingency table) | | anch_uniqTargs | number of unique targets for anchor | (=number of rows in contingency table) | @@ -156,16 +166,16 @@ Its format is one sample per line. Each line should contain the name of a sample ## Additional output ### Most frequent targets per each anchor -By default NOMAD will store 2 most frequent targets per each anchor in the resulting TSV files. This should be sufficient for splicing, but for RNA editing/missmatches 4 may be a better choice. It may be set with `--n_most_freq_targets` switch. If the number of targets for a given anchor is lower than specified value there will be a single `-` for each missing target. +By default SPLASH will store 2 most frequent targets per each anchor in the resulting TSV files. This should be sufficient for splicing, but for RNA editing/missmatches 4 may be a better choice. It may be set with `--n_most_freq_targets` switch. If the number of targets for a given anchor is lower than specified value there will be a single `-` for each missing target. ### SATC format -NOMAD stores intermediate and optional output files in SATC format (**S**ample **A**nchor **T**arget **C**ount). +SPLASH stores intermediate and optional output files in SATC format (**S**ample **A**nchor **T**arget **C**ount). ### Sample representation -The unique id is assigned to each sample. The ids are consecutive numbers starting with 0. The first sample from the input file gets id 0, the second one gets 1, and so on. By default NOMAD will store the mapping in `sample_name_to_id.mapping.txt` file, but this may be redefined with `--sample_name_to_id` parameter. This mapping file may be useful to access the data stored in SATC format. +The unique id is assigned to each sample. The ids are consecutive numbers starting with 0. The first sample from the input file gets id 0, the second one gets 1, and so on. By default SPLASH will store the mapping in `sample_name_to_id.mapping.txt` file, but this may be redefined with `--sample_name_to_id` parameter. This mapping file may be useful to access the data stored in SATC format. ### Output sample, anchor, target, count #### Textual When `--dump_sample_anchor_target_count_txt` switch is used there will be an additional output directory (named `result_dump` by default, but the `results` part may be redefined with `--outname_prefix` switch). This directory will contain a number of files (equal to the number of bins, default 128, may be redefined with `--n_bins` switch). The extension of these files is `.satc.dump`. Each line of these files is a tab-separated list of . This is the easiest way to be able to reproduce contingency tables used during computation. Each file contains some number of anchors, but it is assured that a specific anchor is present in a single file. Since these text files could be large, it may be proficient to use binary (SATC) files instead. #### Binary -When `--dump_sample_anchor_target_count_binary` switch is used there will be an additional output directory (named `result_satc` by default, but the `results` part may be redefined with `--outname_prefix` switch). This directory will contain a number of files (equal to the number of bins, default 128, may be redefined with `--n_bins` switch). The extension of these files is `.satc`. These are binary files in SATC format. Their content may be converted to textual representation with `satc_dump` program (part of the NOMAD package). Each file contains some number of anchors, but it is assured that a specific anchor is present in a single file. Since these text files could be large, it may be proficient to use binary (SATC) files instead, especially if one wants to investigate only some of all anchors. +When `--dump_sample_anchor_target_count_binary` switch is used there will be an additional output directory (named `result_satc` by default, but the `results` part may be redefined with `--outname_prefix` switch). This directory will contain a number of files (equal to the number of bins, default 128, may be redefined with `--n_bins` switch). The extension of these files is `.satc`. These are binary files in SATC format. Their content may be converted to textual representation with `satc_dump` program (part of the SPLASH package). Each file contains some number of anchors, but it is assured that a specific anchor is present in a single file. Since these text files could be large, it may be proficient to use binary (SATC) files instead, especially if one wants to investigate only some of all anchors. ### satc_dump To convert SATC files into textual representation one may use `satc_dump` program. The simples usage is ``` @@ -177,10 +187,10 @@ There are also additional parameters that may be useful, namely: * `--n_bins ` — if set to value different than 0 the input is interpreted as a list of bins (each bin in separate line, first list is bin_0, second line is bin_1, etc. (in case of ill-formed input results will be incorrect) * `--separately` — if set with n_bins != 0 output param will be treated as suffix name and there will be output for each bin - If `--sample_names` is not used in the output there will be sample ids instead of its names. NOMAD by default write mapping to `sample_name_to_id.mapping.txt` file (may be redefined with `--sample_name_to_id` switch of NOMAD. + If `--sample_names` is not used in the output there will be sample ids instead of its names. SPLASH by default write mapping to `sample_name_to_id.mapping.txt` file (may be redefined with `--sample_name_to_id` switch of SPLASH. ### Interpreting results - `plotGeneration.py` provides basic functionality to visualize contingency tables. To use, run NOMAD with the --dump_sample_anchor_target_count_binary flag. Then, `plotGeneration.py` can be run with inputs: + `plotGeneration.py` provides basic functionality to visualize contingency tables. To use, run SPLASH with the --dump_sample_anchor_target_count_binary flag. Then, `plotGeneration.py` can be run with inputs: - `--dsName` — dataset name to be used in title of plots - `outFolder` — path to save files to - `metadataPath` — path to .tsv file containing two columns titled `sampleName` and `metadata` @@ -188,7 +198,7 @@ There are also additional parameters that may be useful, namely: - `pvDfPath` — path to p-value .tsv file (result.after_correction.scores.tsv) - `sampleMappingTxt` — path to sample_name_to_id.mapping.txt file - `anchorFile` — file of anchors to be plotted, one anchor per line -- `satc_dump_file` — path to satc_dump utility file, within NOMAD /bin folder +- `satc_dump_file` — path to satc_dump utility file, within SPLASH /bin folder - `skipSATC` — optional flag, if .satc files have already been dumped for this set of anchors into the output folder, and the desire is to regenerate plots. The generated output is 1 file per anchor, each file containing 4 subplots. At bottom left is the raw I x J contingency table, where each column is a sample, and each row represents a target (low abundance targets not shown). Each sample’s target counts are normalized by $n_j$ and plotted. At bottom right, the targets for the contingency table are displayed in a I x k table (corresponding to the rows of the contingency table, visually aligned). Each target of length k is shown, with basepairs colored as in the below colorbar. The target sequence for each row is displayed on the right. At top left, the sample metadata is shown in a 1 x J table. Each entry corresponds to the column in the contingency table directly below it. The sample metadata is shown in a colorbar below. At top right, the column counts ($n_j$) are shown in a 1 x J table, where the colorbar below provides the scale. Again, columns are sorted as the contingency table. @@ -197,9 +207,9 @@ The generated output is 1 file per anchor, each file containing 4 subplots. At b The script `c_analysis.ipynb` shows how the saved c vectors can be loaded in for further analysis. `--dump_Cjs` must be enabled for this. ## Biological interpretation and classification of anchors -To facilitate downstream analysis of anchors, we provide a postprocessing script `NOMAD_extendor_classification.R`, that can be run on the anchors file generataed from the NOMAD run to classify anchors to biologically meaningful events such as alternative splicing, and base pair changes. `NOMAD_extendor_classification.R` needs the following inputs: +To facilitate downstream analysis of anchors, we provide a postprocessing script `SPLASH_extendor_classification.R`, that can be run on the anchors file generataed from the SPLASH run to classify anchors to biologically meaningful events such as alternative splicing, and base pair changes. `SPLASH_extendor_classification.R` needs the following inputs: -- `directory` — the output directory used for the NOMAD run +- `directory` — the output directory used for the SPLASH run - `which_anchors_file` — flag to decide which anchor file (after correction or all anchors) to use, could be "after_correction" or "all" - `effect_size_cutoff` — the effect size cutoff for significant anchors (default 0.2) - `num_samples_cutoff` — the minimum number of sampels for an anchor to be called (default 20) @@ -214,7 +224,7 @@ To facilitate downstream analysis of anchors, we provide a postprocessing script - `bowtie2_reference` — path to bowtie2 index for the reference genome - `paralogs_file` — path to file containing list of paralogous genes from reference genome -The script will generate a file `classified_anchors.tsv` in the same directory used for NOMAD run, containing significant anchors along with their biological classification and alignment information. +The script will generate a file `classified_anchors.tsv` in the same directory used for SPLASH run, containing significant anchors along with their biological classification and alignment information. ## Annotation files for human genome (based on T2T): The following three files are needed by the classification script for annotation anchors from human datasets and can be downloaded from the following links: @@ -224,7 +234,7 @@ The following three files are needed by the classification script for annotation ## References Marek Kokot, Roozbeh Dehghannasiri, Tavor Baharav, Julia Salzman, and Sebastian Deorowicz. -[NOMAD2 provides ultra-efficient, scalable, and unsupervised discovery on raw sequencing reads] (https://..) +[SPLASH2 provides ultra-efficient, scalable, and unsupervised discovery on raw sequencing reads] (https://..) bioRxiv (2023) Kaitlin Chaung, Tavor Baharav, Ivan Zheludev, Julia Salzman. [A statistical, reference-free algorithm subsumes myriad problems in genome science and enables novel discovery](https://doi.org/10.1101/2022.06.24.497555), bioRxiv (2022) diff --git a/build_release.py b/build_release.py index f8308fc..3978136 100755 --- a/build_release.py +++ b/build_release.py @@ -11,19 +11,21 @@ def replace_in_file(file_path, search_text, new_text): new_line = line.replace(search_text, new_text) print(new_line, end='') -def get_ver(nomad_path): - with open(nomad_path) as f: +def get_ver(splash_path): + with open(splash_path) as f: for line in f.readlines(): line = line.strip() - if "NOMAD_VERSION" in line: + if "SPLASH_VERSION" in line: return line.split("=")[-1].strip().split("\"")[1] - print("Error: cannot read NOMAD_VERSION") + print("Error: cannot read SPLASH_VERSION") sys.exit(1) def run_cmd(cmd): p = subprocess.Popen(cmd, shell=True) p.communicate() +run_cmd("git submodule init") +run_cmd("git submodule update") run_cmd("make clean") run_cmd("make -j32 release") @@ -37,12 +39,12 @@ def run_cmd(cmd): with open("bin/example/run-example.sh", "w") as f: f.write("#!/bin/bash\n") f.write("./download.py\n") - f.write("../nomad --bin_path .. input.txt\n") + f.write("../splash --bin_path .. input.txt\n") os.chmod("bin/example/run-example.sh", os.stat("bin/example/run-example.sh").st_mode | stat.S_IEXEC) -ver = get_ver("bin/nomad") +ver = get_ver("bin/splash") -run_cmd(f"cd bin; tar -c * | pigz > ../nomad-{ver}.linux.x64.tar.gz; cd ..;") +run_cmd(f"cd bin; tar -c * | pigz > ../splash-{ver}.linux.x64.tar.gz; cd ..;") run_cmd("rm -rf bin") diff --git a/example/analysis_scripts/c_analysis.ipynb b/example/analysis_scripts/c_analysis.ipynb index 005d70f..61fddab 100644 --- a/example/analysis_scripts/c_analysis.ipynb +++ b/example/analysis_scripts/c_analysis.ipynb @@ -30,7 +30,7 @@ "metadata": {}, "outputs": [], "source": [ - "NOMAD2_result_folder=''" + "SPLASH2_result_folder=''" ] }, { @@ -41,7 +41,7 @@ "outputs": [], "source": [ "### read in p-values, afetr_correction.scores.csv file\n", - "fname=NOMAD2_result_folder+'/result.after_correction.scores.tsv'\n", + "fname=SPLASH2_result_folder+'/result.after_correction.scores.tsv'\n", "df = pd.read_csv(fname,sep='\\t')" ] }, @@ -95,7 +95,7 @@ "dfArr = []\n", "\n", "##### path to intermediary_files output folder\n", - "fldrName= NOMAD2_result_folder+'/result_Cjs/'\n", + "fldrName= SPLASH2_result_folder+'/result_Cjs/'\n", "\n", "for fname in tqdm(glob.glob(fldrName+'bin*.cjs')):\n", " dfArr.append(\n", diff --git a/example/analysis_scripts/plotGeneration.py b/example/analysis_scripts/plotGeneration.py index 60fb190..40d6fe3 100644 --- a/example/analysis_scripts/plotGeneration.py +++ b/example/analysis_scripts/plotGeneration.py @@ -66,7 +66,7 @@ def get_args(): parser.add_argument( "--satc_dump_file", type=str, - help='Path to satc_dump utility file (within NOMAD2, /bin/satc_dump)' + help='Path to satc_dump utility file (within SPLASH2, /bin/satc_dump)' ) args = parser.parse_args() diff --git a/libs/refresh/deterministic_random.h b/libs/refresh/deterministic_random.h new file mode 100644 index 0000000..6fa55f0 --- /dev/null +++ b/libs/refresh/deterministic_random.h @@ -0,0 +1,129 @@ +#pragma once +#include +#include +#include + +#undef min +#undef max + +template +class det_uniform_int_distribution { +public: + // types + typedef IntType result_type; + typedef std::pair param_type; + + // constructors and reset functions + explicit det_uniform_int_distribution(IntType a = 0, IntType b = std::numeric_limits::max()); + explicit det_uniform_int_distribution(const param_type& parm); + void reset(); + + // generating functions + template + result_type operator()(URNG& g); + template + result_type operator()(URNG& g, const param_type& parm); + + // property functions + result_type a() const; + result_type b() const; + param_type param() const; + void param(const param_type& parm); + result_type min() const; + result_type max() const; + +private: + typedef typename std::make_unsigned::type diff_type; + + IntType lower; + IntType upper; +}; + +template +det_uniform_int_distribution::det_uniform_int_distribution(IntType a, IntType b) { + lower = a; + upper = b; +} + +template +det_uniform_int_distribution::det_uniform_int_distribution(const param_type& parm) { + param(parm); +} + +template +void det_uniform_int_distribution::reset() {} + +template +template +auto det_uniform_int_distribution::operator()(URNG& g) -> result_type { + return operator()(g, param()); +} + +template +template +auto det_uniform_int_distribution::operator()(URNG& g, const param_type& parm) -> result_type { + diff_type diff = (diff_type)parm.second - (diff_type)parm.first + 1; + if (diff == 0) // If the +1 overflows we are using the full range, just return g() + return g(); + + diff_type badDistLimit = std::numeric_limits::max() / diff; + do { + diff_type generatedRand = g(); + + if (generatedRand / diff < badDistLimit) + return (IntType)((generatedRand % diff) + (diff_type)parm.first); + } while (true); +} + +template +auto det_uniform_int_distribution::a() const -> result_type { + return lower; +} + +template +auto det_uniform_int_distribution::b() const -> result_type { + return upper; +} + +template +auto det_uniform_int_distribution::param() const -> param_type { + return param_type(lower, upper); +} + +template +void det_uniform_int_distribution::param(const param_type& parm) { + std::tie(lower, upper) = parm; + if (upper < lower) + throw std::exception(); +} + +template +auto det_uniform_int_distribution::min() const -> result_type { + return lower; +} + +template +auto det_uniform_int_distribution::max() const -> result_type { + return upper; +}; + + + + +template +void partial_shuffle(RandomIt first, RandomIt middle, RandomIt last, URBG&& g) +{ + typedef typename std::iterator_traits::difference_type diff_t; + typedef det_uniform_int_distribution distr_t; + typedef typename distr_t::param_type param_t; + + distr_t D; + diff_t n = middle - first; + diff_t N = last - first - 1; + for (diff_t i =0; i < n; ++i) { + using std::swap; + swap(first[i], first[D(g, param_t(i, N))]); + } +} + + diff --git a/libs/refresh/parallel-queues.h b/libs/refresh/parallel-queues.h index 9e250da..19158b0 100644 --- a/libs/refresh/parallel-queues.h +++ b/libs/refresh/parallel-queues.h @@ -187,6 +187,11 @@ namespace refresh { } } + bool check_completed() { + std::lock_guard lck(mtx); + return is_completed; + } + void cancel() { std::lock_guard lck(mtx); diff --git a/src/common/artifacts_filter.h b/src/common/artifacts_filter.h new file mode 100644 index 0000000..4f894da --- /dev/null +++ b/src/common/artifacts_filter.h @@ -0,0 +1,75 @@ +#ifndef _ARTIFACTS_FILTER_H +#define _ARTIFACTS_FILTER_H +#include "../common/satc_data.h" +#include +#include + +class ArtifactsFilter +{ + std::map> artifacts; //index is artifact len +public: + ArtifactsFilter(const std::string& path) //empty path means no filtering + { + if (path == "") + return; + + std::ifstream in(path); + + if (!in) + { + std::cerr << "Error: cannot open file " << path << "\n"; + exit(1); + } + std::string artifact; + + while (in >> artifact) + { + auto len = artifact.length(); + assert(len <= 32); + artifacts[len].insert(str_kmer_to_uint64_t(artifact)); + } + } + ArtifactsFilter() : + ArtifactsFilter("") + { + + } + + void Add(uint32_t len, const std::vector& new_artifacts) { + auto& art = artifacts[len]; + for (auto x : new_artifacts) + art.insert(x); + } + + bool ContainsArtifact(uint64_t anchor, uint32_t len) const + { + for (const auto& x : artifacts) + { + uint32_t art_len = x.first; + + //skip too long artifacts + if (art_len > len) + continue; + + const auto& art = x.second; + + uint64_t mask = ((1ull) << (2 * art_len)) - 1; + + auto no_parts = len - art_len + 1; + uint64_t check = anchor; + for (uint32_t i = 0; i < no_parts - 1; ++i) + { + if (art.count(check & mask)) + return true; + check >>= 2; + } + //last one + if (art.count(check & mask)) + return true; + } + return false; + } +}; + + +#endif diff --git a/src/common/hamming_filter.h b/src/common/hamming_filter.h new file mode 100644 index 0000000..6a2635e --- /dev/null +++ b/src/common/hamming_filter.h @@ -0,0 +1,73 @@ +#ifndef _HAMMING_FILTER_H +#define _HAMMING_FILTER_H +#include +#include +#include "target_count.h" + + +class HammingFilter +{ + uint32_t min_hamming_threshold; + + int calculateHamming(uint64_t a, uint64_t b) const { + + // generate mask with even bits + static uint64_t MASK_EVEN_BITS = []() { + uint64_t v = 0; + for (int i = 0; i < 32; ++i) { + v <<= 2; + v |= 1ULL; + } + return v; + }(); + + //a &= ~HOMOPOLYMER_MASK; + //b &= ~HOMOPOLYMER_MASK; + + uint64_t diffs_even = (a ^ b) & MASK_EVEN_BITS; + uint64_t diffs_odd = ((a >> 1) ^ (b >> 1)) & MASK_EVEN_BITS; + + uint64_t diffs = diffs_even | diffs_odd; + #ifdef _WIN32 + int cnt = __popcnt64(diffs); + #else + int cnt = __builtin_popcountll(diffs); + #endif + return cnt; + } + +public: + HammingFilter(uint32_t min_hamming_threshold) + : min_hamming_threshold(min_hamming_threshold) + { + + } + HammingFilter() : + HammingFilter(0) + { + + } + + bool ContainsDistantPair(const std::vector& targets) const + { + if (min_hamming_threshold == 0) + return true; + + if (targets.size() < 2) + return false; + + for (size_t i = 0; i < targets.size(); ++i) + { + for (size_t j = i+1; j < targets.size(); ++j) + { + uint64_t target_i = targets[i].target; + uint64_t target_j = targets[j].target; + if (static_cast(calculateHamming(target_i, target_j)) >= min_hamming_threshold) + return true; + } + } + + return false; + } +}; +#endif diff --git a/src/common/illumina_adapters_static.cpp b/src/common/illumina_adapters_static.cpp new file mode 100644 index 0000000..cf5dfd5 --- /dev/null +++ b/src/common/illumina_adapters_static.cpp @@ -0,0 +1,778 @@ +#include "illumina_adapters_static.h" + +std::vector IlluminaAdaptersStatic::Get12Mers() { + return { + 143048, // AAAGAGTGTAGA + 368117, // AACCGCTCTTCC + 411825, // AACGCAGAGTAC + 479953, // AACTCCAGTCAC + 551217, // AAGACGGCATAC + 561435, // AAGAGCACACGT + 563956, // AAGAGCGGTTCA + 564078, // AAGAGCGTCGTG + 564659, // AAGAGCTCGTAT + 572195, // AAGAGTGTAGAT + 598150, // AAGCAGAAGACG + 601779, // AAGCAGTGGTAT + 930214, // AATGATACGGCG + 940165, // AATGCCGAGACC + 1013422, // AATTCTCGGGTG + 1128280, // ACACATCTCCGA + 1128326, // ACACATCTGACG + 1148535, // ACACGACGCTCT + 1161089, // ACACGTCTGAAC + 1171413, // ACACTCTTTCCC + 1275273, // ACATCTCCGAGC + 1276007, // ACATCTGACGCT + 1333389, // ACCACCGAGATC + 1450865, // ACCGAGATCTAC + 1455980, // ACCGATCTCGTA + 1472470, // ACCGCTCTTCCG + 1599357, // ACGACGCTCTTC + 1609133, // ACGAGATCGGTC + 1647300, // ACGCAGAGTACA + 1695576, // ACGCTCTTCCGA + 1697158, // ACGCTGCCGACG + 1723490, // ACGGCATACGAG + 1728837, // ACGGCGACCACC + 1800221, // ACGTCTGAACTC + 1817055, // ACGTGTGCTCTT + 1959672, // ACTCTGCGTTGA + 1965404, // ACTCTTTCCCTA + 2001874, // ACTGGAGTTCAG + 2131603, // AGAAGACGGCAT + 2188150, // AGACCGATCTCG + 2204870, // AGACGGCATACG + 2210717, // AGACGTGTGCTC + 2245741, // AGAGCACACGTC + 2255826, // AGAGCGGTTCAG + 2256315, // AGAGCGTCGTGT + 2258638, // AGAGCTCGTATG + 2278634, // AGAGTACATGGG + 2288781, // AGAGTGTAGATC + 2320418, // AGATCGGAAGAG + 2321270, // AGATCGGTCTCG + 2322717, // AGATCTACACTC + 2324154, // AGATCTCGGTGG + 2341680, // AGATGTGTATAA + 2377438, // AGCACACGTCTG + 2392602, // AGCAGAAGACGG + 2400485, // AGCAGGAATGCC + 2407117, // AGCAGTGGTATC + 2442785, // AGCCCACGAGAC + 2538788, // AGCGGTTCAGCA + 2544187, // AGCGTCAGATGT + 2546610, // AGCGTCGTGTAG + 2583781, // AGCTCGTATGCC + 2636130, // AGGAATGCCGAG + 2753070, // AGGGAAAGAGTG + 3060545, // AGTGGTATCAAC + 3066077, // AGTGTAGATCTC + 3098734, // AGTTCAGACGTG + 3216245, // ATACACATCTCC + 3216248, // ATACACATCTGA + 3229599, // ATACCACTGCTT + 3246298, // ATACGAGATCGG + 3253780, // ATACGGCGACCA + 3414306, // ATCAACGCAGAG + 3572260, // ATCGGAAGAGCA + 3572262, // ATCGGAAGAGCG + 3572263, // ATCGGAAGAGCT + 3585897, // ATCGGTCTCGGC + 3609055, // ATCTACACTCTT + 3627157, // ATCTCCGAGCCC + 3632045, // ATCTCGGTGGTC + 3632357, // ATCTCGTATGCC + 3638905, // ATCTGACGCTGC + 3720856, // ATGATACGGCGA + 3760662, // ATGCCGAGACCG + 3763703, // ATGCCGTCTTCT + 3874278, // ATGTACTCTGCG + 3912456, // ATGTGTATAAGA + 4028896, // ATTCCTGCTGAA + 4053689, // ATTCTCGGGTGC + 4297260, // CAACGCAGAGTA + 4343841, // CAAGCAGAAGAC + 4484576, // CACACGTCTGAA + 4513122, // CACATCTCCGAG + 4513305, // CACATCTGACGC + 4557020, // CACCGAGATCTA + 4594143, // CACGACGCTCTT + 4644359, // CACGTCTGAACT + 4685655, // CACTCTTTCCCT + 4727204, // CAGAAGACGGCA + 4746983, // CAGACGTGTGCT + 4763962, // CAGAGTACATGG + 4779724, // CAGATGTGTATA + 4794425, // CAGCAGGAATGC + 4830350, // CAGCGTCAGATG + 4853336, // CAGGAATGCCGA + 4959440, // CAGTGGTATCAA + 5005878, // CATACGAGATCG + 5101093, // CATCTCCGAGCC + 5104030, // CATCTGACGCTG + 5162873, // CATGTACTCTGC + 5201528, // CATTCCTGCTGA + 5333559, // CCACCGAGATCT + 5451904, // CCATATAAGAAA + 5485022, // CCATGTACTCTG + 5557280, // CCCATATAAGAA + 5565559, // CCCATGTACTCT + 5706118, // CCCTACACGACG + 5801357, // CCGAGACCGATC + 5803460, // CCGAGATCTACA + 5805336, // CCGAGCCCACGA + 5823923, // CCGATCTCGTAT + 5889880, // CCGCTCTTCCGA + 5996007, // CCGTCTTCTGCT + 6047257, // CCTACACGACGC + 6191126, // CCTGCTGAACCG + 6374792, // CGACCACCGAGA + 6397429, // CGACGCTCTTCC + 6428215, // CGAGACCGATCT + 6436535, // CGAGATCGGTCT + 6436625, // CGAGATCTACAC + 6444130, // CGAGCCCACGAG + 6518478, // CGATCTCGTATG + 6589203, // CGCAGAGTACAT + 6665039, // CGCCGTATCATT + 6782307, // CGCTCTTCCGAT + 6788632, // CGCTGCCGACGA + 6824516, // CGGAAGAGCACA + 6824555, // CGGAAGAGCGGT + 6824557, // CGGAAGAGCGTC + 6824566, // CGGAAGAGCTCG + 6852332, // CGGAGATGTGTA + 6891218, // CGGCAGCGTCAG + 6893960, // CGGCATACGAGA + 6897017, // CGGCATTCCTGC + 6915350, // CGGCGACCACCG + 7005450, // CGGGTGCCAAGG + 7042707, // CGGTCTCGGCAT + 7056790, // CGGTGGTCGCCG + 7066186, // CGGTTCAGCAGG + 7136695, // CGTATGCCGTCT + 7152571, // CGTCAGATGTGT + 7185563, // CGTCGGCAGCGT + 7191338, // CGTCGTGTAGGG + 7200885, // CGTCTGAACTCC + 7206815, // CGTCTTCTGCTT + 7251816, // CGTGGGCTCGGA + 7260800, // CGTGTAGGGAAA + 7268221, // CGTGTGCTCTTC + 7310417, // CGTTGATACCAC + 7411815, // CTACACGACGCT + 7413245, // CTACACTCTTTC + 7702865, // CTCCGAGCCCAC + 7768302, // CTCGGAGATGTG + 7771095, // CTCGGCATTCCT + 7777872, // CTCGGGTGCCAA + 7781081, // CTCGGTGGTCGC + 7786075, // CTCGTATGCCGT + 7793270, // CTCGTGGGCTCG + 7838691, // CTCTGCGTTGAT + 7851076, // CTCTTATACACA + 7853623, // CTCTTCCGATCT + 7861617, // CTCTTTCCCTAC + 7870071, // CTGAACCGCTCT + 7871819, // CTGAACTCCAGT + 7890838, // CTGACGCTGCCG + 7978545, // CTGCGTTGATAC + 7987289, // CTGCTGAACCGC + 8007496, // CTGGAGTTCAGA + 8091596, // CTGTCTCTTATA + 8176717, // CTTATACACATC + 8345361, // CTTTCCCTACAC + 8424370, // GAAAGAGTGTAG + 8480637, // GAACCGCTCTTC + 8508596, // GAACTCCAGTCA + 8526412, // GAAGACGGCATA + 8528966, // GAAGAGCACACG + 8529597, // GAAGAGCGGTTC + 8529627, // GAAGAGCGTCGT + 8529772, // GAAGAGCTCGTA + 8623649, // GAATGCCGAGAC + 8641963, // GAATTCTCGGGT + 8721955, // GACCACCGAGAT + 8752603, // GACCGATCTCGT + 8812502, // GACGCTCTTCCG + 8812897, // GACGCTGCCGAC + 8819480, // GACGGCATACGA + 8842871, // GACGTGTGCTCT + 8889076, // GACTGGAGTTCA + 8935645, // GAGACCGATCTC + 8968925, // GAGATCGGTCTC + 8969287, // GAGATCTACACT + 8974028, // GAGATGTGTATA + 8982967, // GAGCACACGTCT + 8999304, // GAGCCCACGAGA + 9023305, // GAGCGGTTCAGC + 9025260, // GAGCGTCGTGTA + 9034553, // GAGCTCGTATGC + 9155127, // GAGTGTAGATCT + 9163291, // GAGTTCAGACGT + 9196007, // GATACCACTGCT + 9202053, // GATACGGCGACC + 9281673, // GATCGGAAGAGC + 9285082, // GATCGGTCTCGG + 9290871, // GATCTACACTCT + 9296619, // GATCTCGGTGGT + 9296697, // GATCTCGTATGC + 9366722, // GATGTGTATAAG + 9509752, // GCACACGTCTGA + 9570409, // GCAGAAGACGGC + 9579598, // GCAGAGTACATG + 9596195, // GCAGCGTCAGAT + 9601942, // GCAGGAATGCCG + 9628468, // GCAGTGGTATCA + 9640077, // GCATACGAGATC + 9688990, // GCATTCCTGCTG + 9838947, // GCCGAGACCGAT + 9887609, // GCCGTCTTCTGC + 9982306, // GCGACCACCGAG + 10155154, // GCGGTTCAGCAG + 10176750, // GCGTCAGATGTG + 10186442, // GCGTCGTGTAGG + 10216212, // GCGTTGATACCA + 10330683, // GCTCGGAGATGT + 10335126, // GCTCGTATGCCG + 10352013, // GCTCTTCCGATC + 10356125, // GCTGAACCGCTC + 10494700, // GGAAAGAGTGTA + 10520849, // GGAAGAGCACAC + 10521007, // GGAAGAGCGGTT + 10521014, // GGAAGAGCGTCG + 10521051, // GGAAGAGCTCGT + 10544520, // GGAATGCCGAGA + 10549098, // GGAATTCTCGGG + 10632115, // GGAGATGTGTAT + 10679430, // GGAGTTCAGACG + 10787656, // GGCAGCGTCAGA + 10798627, // GGCATACGAGAT + 10810855, // GGCATTCCTGCT + 10884184, // GGCGACCACCGA + 10971278, // GGCTCGGAGATG + 11012283, // GGGAAAGAGTGT + 11131427, // GGGCTCGGAGAT + 11325540, // GGTATCAACGCA + 11376333, // GGTCGCCGTATC + 11393615, // GGTCTCGGCATT + 11449947, // GGTGGTCGCCGT + 11487528, // GGTTCAGCAGGA + 11656815, // GTACTCTGCGTT + 11679595, // GTAGATCTCGGT + 11706402, // GTAGGGAAAGAG + 11733124, // GTATAAGAGACA + 11747730, // GTATCAACGCAG + 11769567, // GTATGCCGTCTT + 11833068, // GTCAGATGTGTA + 11950900, // GTCGCCGTATCA + 11965037, // GTCGGCAGCGTC + 11988136, // GTCGTGTAGGGA + 12020029, // GTCTCGGCATTC + 12021415, // GTCTCGTGGGCT + 12025028, // GTCTCTTATACA + 12026324, // GTCTGAACTCCA + 12050046, // GTCTTCTGCTTG + 12089903, // GTGACTGGAGTT + 12181336, // GTGCTCTTCCGA + 12230050, // GTGGGCTCGGAG + 12242182, // GTGGTATCAACG + 12245356, // GTGGTCGCCGTA + 12264310, // GTGTAGATCTCG + 12265986, // GTGTAGGGAAAG + 12267656, // GTGTATAAGAGA + 12295669, // GTGTGCTCTTCC + 12394939, // GTTCAGACGTGT + 12395680, // GTTCAGCAGGAA + 12464455, // GTTGATACCACT + 12815465, // TAATGATACGGC + 12864982, // TACACATCTCCG + 12864993, // TACACATCTGAC + 12870045, // TACACGACGCTC + 12875765, // TACACTCTTTCC + 12985195, // TACGAGATCGGT + 13015121, // TACGGCGACCAC + 13072830, // TACTCTGCGTTG + 13163950, // TAGATCTCGGTG + 13271179, // TAGGGAAAGAGT + 13378066, // TATAAGAGACAG + 13386973, // TATACACATCTC + 13386974, // TATACACATCTG + 13436488, // TATCAACGCAGA + 13523837, // TATGCCGTCTTC + 13657227, // TCAACGCAGAGT + 13668872, // TCAAGCAGAAGA + 13769657, // TCAGACGTGTGC + 13777843, // TCAGATGTGTAT + 13781518, // TCAGCAGGAATG + 14009441, // TCCCTACACGAC + 14034246, // TCCGAGCCCACG + 14130693, // TCCTGCTGAACC + 14249171, // TCGCCGTATCAT + 14289041, // TCGGAAGAGCAC + 14289050, // TCGGAAGAGCGG + 14289051, // TCGGAAGAGCGT + 14289053, // TCGGAAGAGCTC + 14295995, // TCGGAGATGTGT + 14305716, // TCGGCAGCGTCA + 14307166, // TCGGCATTCCTG + 14334274, // TCGGGTGCCAAG + 14343588, // TCGGTCTCGGCA + 14347109, // TCGGTGGTCGCC + 14367085, // TCGTATGCCGTC + 14379302, // TCGTCGGCAGCG + 14395866, // TCGTGGGCTCGG + 14398112, // TCGTGTAGGGAA + 14436223, // TCTACACTCTTT + 14508628, // TCTCCGAGCCCA + 14525685, // TCTCGGCATTCC + 14527380, // TCTCGGGTGCCA + 14528182, // TCTCGGTGGTCG + 14529430, // TCTCGTATGCCG + 14531229, // TCTCGTGGGCTC + 14545681, // TCTCTTATACAC + 14550866, // TCTGAACTCCAG + 14555621, // TCTGACGCTGCC + 14577548, // TCTGCGTTGATA + 14627091, // TCTTATACACAT + 14669252, // TCTTTCCCTACA + 14703071, // TGAACCGCTCTT + 14710061, // TGAACTCCAGTC + 14786136, // TGACGCTGCCGA + 14805181, // TGACTGGAGTTC + 14881913, // TGATACCACTGC + 14883425, // TGATACGGCGAC + 15042648, // TGCCGAGACCGA + 15054814, // TGCCGTCTTCTG + 15136965, // TGCGTTGATACC + 15170915, // TGCTCTTCCGAT + 15171943, // TGCTGAACCGCT + 15220186, // TGGAATTCTCGG + 15252769, // TGGAGTTCAGAC + 15365768, // TGGGCTCGGAGA + 15414297, // TGGTATCAACGC + 15426995, // TGGTCGCCGTAT + 15497115, // TGTACTCTGCGT + 15502810, // TGTAGATCTCGG + 15509512, // TGTAGGGAAAGA + 15516193, // TGTATAAGAGAC + 15589169, // TGTCTCTTATAC + 15628246, // TGTGCTCTTCCG + 15649826, // TGTGTATAAGAG + 15786778, // TTAATGATACGG + 15929655, // TTATACACATCT + 16000130, // TTCAAGCAGAAG + 16025326, // TTCAGACGTGTG + 16028291, // TTCAGCAGGAAT + 16085272, // TTCCCTACACGA + 16115585, // TTCCTGCTGAAC + 16214757, // TTCTCGGGTGCC + 16239850, // TTCTTATATGGG + 16303390, // TTGATACCACTG + 16529606, // TTTAATGATACG + 16582944, // TTTCAAGCAGAA + 16604230, // TTTCCCTACACG + 16642874, // TTTCTTATATGG + 16715313, // TTTTAATGATAC + 16728648, // TTTTCAAGCAGA + 16761740, // TTTTTAATGATA + 16765074, // TTTTTCAAGCAG + 16773347, // TTTTTTAATGAT + 16774180, // TTTTTTCAAGCA + 16776248, // TTTTTTTAATGA + 16776457, // TTTTTTTCAAGC + 16776974, // TTTTTTTTAATG + 16777026, // TTTTTTTTCAAG + 16777155, // TTTTTTTTTAAT + 16777168, // TTTTTTTTTCAA + 16777200, // TTTTTTTTTTAA + 16777204, // TTTTTTTTTTCA + 14436223, // TCTACACTCTTT + 10521007, // GGAAGAGCGGTT + 11656815, // GTACTCTGCGTT + 12089903, // GTGACTGGAGTT + 11769567, // GTATGCCGTCTT + 1817055, // ACGTGTGCTCTT + 14703071, // TGAACCGCTCTT + 4594143, // CACGACGCTCTT + 3246559, // ATACGAGCTCTT + 3609055, // ATCTACACTCTT + 7206815, // CGTCTTCTGCTT + 3229599, // ATACCACTGCTT + 6665039, // CGCCGTATCATT + 11393615, // GGTCTCGGCATT + 4547087, // CACCCGAGAATT + 14295995, // TCGGAGATGTGT + 7152571, // CGTCAGATGTGT + 2256315, // AGAGCGTCGTGT + 12394939, // GTTCAGACGTGT + 11012283, // GGGAAAGAGTGT + 10330683, // GCTCGGAGATGT + 2544187, // AGCGTCAGATGT + 9296619, // GATCTCGGTGGT + 11679595, // GTAGATCTCGGT + 12985195, // TACGAGATCGGT + 6824555, // CGGAAGAGCGGT + 8529627, // GAAGAGCGTCGT + 8752603, // GACCGATCTCGT + 15497115, // TGTACTCTGCGT + 14289051, // TCGGAAGAGCGT + 7185563, // CGTCGGCAGCGT + 7786075, // CTCGTATGCCGT + 11449947, // GGTGGTCGCCGT + 9163291, // GAGTTCAGACGT + 561435, // AAGAGCACACGT + 13657227, // TCAACGCAGAGT + 13271179, // TAGGGAAAGAGT + 7871819, // CTGAACTCCAGT + 3763703, // ATGCCGTCTTCT + 6436535, // CGAGATCGGTCT + 7136695, // CGTATGCCGTCT + 8982967, // GAGCACACGTCT + 8842871, // GACGTGTGCTCT + 7870071, // CTGAACCGCTCT + 1148535, // ACACGACGCTCT + 5005943, // CATACGAGCTCT + 5565559, // CCCATGTACTCT + 9290871, // GATCTACACTCT + 7853623, // CTCTTCCGATCT + 6428215, // CGAGACCGATCT + 9155127, // GAGTGTAGATCT + 5333559, // CCACCGAGATCT + 15929655, // TTATACACATCT + 4746983, // CAGACGTGTGCT + 5996007, // CCGTCTTCTGCT + 10810855, // GGCATTCCTGCT + 9196007, // GATACCACTGCT + 12021415, // GTCTCGTGGGCT + 15171943, // TGCTGAACCGCT + 1276007, // ACATCTGACGCT + 7411815, // CTACACGACGCT + 10798631, // GGCATACGAGCT + 7771095, // CTCGGCATTCCT + 4685655, // CACTCTTTCCCT + 12464455, // GTTGATACCACT + 8969287, // GAGATCTACACT + 4644359, // CACGTCTGAACT + 10632115, // GGAGATGTGTAT + 13777843, // TCAGATGTGTAT + 601779, // AAGCAGTGGTAT + 5823923, // CCGATCTCGTAT + 15426995, // TGGTCGCCGTAT + 7838691, // CTCTGCGTTGAT + 15170915, // TGCTCTTCCGAT + 6782307, // CGCTCTTCCGAT + 2588003, // AGCTCTTCCGAT + 9838947, // GCCGAGACCGAT + 572195, // AAGAGTGTAGAT + 11131427, // GGGCTCGGAGAT + 8721955, // GACCACCGAGAT + 10798627, // GGCATACGAGAT + 9596195, // GCAGCGTCAGAT + 14249171, // TCGCCGTATCAT + 7042707, // CGGTCTCGGCAT + 2131603, // AGAAGACGGCAT + 6589203, // CGCAGAGTACAT + 14627091, // TCTTATACACAT + 16028291, // TTCAGCAGGAAT + 9525379, // GCACCCGAGAAT + 13072830, // TACTCTGCGTTG + 12050046, // GTCTTCTGCTTG + 16025326, // TTCAGACGTGTG + 7768302, // CTCGGAGATGTG + 10176750, // GCGTCAGATGTG + 13163950, // TAGATCTCGGTG + 564078, // AAGAGCGTCGTG + 3098734, // AGTTCAGACGTG + 2753070, // AGGGAAAGAGTG + 15054814, // TGCCGTCTTCTG + 2377438, // AGCACACGTCTG + 5485022, // CCATGTACTCTG + 13386974, // TATACACATCTG + 9688990, // GCATTCCTGCTG + 5104030, // CATCTGACGCTG + 14307166, // TCGGCATTCCTG + 16303390, // TTGATACCACTG + 6518478, // CGATCTCGTATG + 10971278, // GGCTCGGAGATG + 4830350, // CAGCGTCAGATG + 9579598, // GCAGAGTACATG + 13781518, // TCAGCAGGAATG + 2324154, // AGATCTCGGTGG + 16642874, // TTTCTTATATGG + 4763962, // CAGAGTACATGG + 16239850, // TTCTTATATGGG + 2278634, // AGAGTACATGGG + 7191338, // CGTCGTGTAGGG + 9285082, // GATCGGTCTCGG + 15502810, // TGTAGATCTCGG + 14395866, // TCGTGGGCTCGG + 3246298, // ATACGAGATCGG + 14289050, // TCGGAAGAGCGG + 2392602, // AGCAGAAGACGG + 10186442, // GCGTCGTGTAGG + 7066186, // CGGTTCAGCAGG + 14528182, // TCTCGGTGGTCG + 10521014, // GGAAGAGCGTCG + 2321270, // AGATCGGTCTCG + 2188150, // AGACCGATCTCG + 12264310, // GTGTAGATCTCG + 7793270, // CTCGTGGGCTCG + 5005878, // CATACGAGATCG + 3874278, // ATGTACTCTGCG + 930214, // AATGATACGGCG + 3572262, // ATCGGAAGAGCG + 14379302, // TCGTCGGCAGCG + 15628246, // TGTGCTCTTCCG + 1472470, // ACCGCTCTTCCG + 8812502, // GACGCTCTTCCG + 6453206, // CGAGCTCTTCCG + 12864982, // TACACATCTCCG + 7890838, // CTGACGCTGCCG + 14529430, // TCTCGTATGCCG + 9601942, // GCAGGAATGCCG + 7056790, // CGGTGGTCGCCG + 6267990, // CCTTGGCACCCG + 3760662, // ATGCCGAGACCG + 6915350, // CGGCGACCACCG + 6191126, // CCTGCTGAACCG + 2204870, // AGACGGCATACG + 1128326, // ACACATCTGACG + 1697158, // ACGCTGCCGACG + 5706118, // CCCTACACGACG + 10679430, // GGAGTTCAGACG + 598150, // AAGCAGAAGACG + 14034246, // TCCGAGCCCACG + 16604230, // TTTCCCTACACG + 8528966, // GAAGAGCACACG + 12242182, // GTGGTATCAACG + 2546610, // AGCGTCGTGTAG + 8424370, // GAAAGAGTGTAG + 12230050, // GTGGGCTCGGAG + 4513122, // CACATCTCCGAG + 2636130, // AGGAATGCCGAG + 16401762, // TTGGCACCCGAG + 9982306, // GCGACCACCGAG + 1723490, // ACGGCATACGAG + 6444130, // CGAGCCCACGAG + 3414306, // ATCAACGCAGAG + 15649826, // TGTGTATAAGAG + 2320418, // AGATCGGAAGAG + 11706402, // GTAGGGAAAGAG + 2255826, // AGAGCGGTTCAG + 2001874, // ACTGGAGTTCAG + 6891218, // CGGCAGCGTCAG + 11747730, // GTATCAACGCAG + 10155154, // GCGGTTCAGCAG + 14550866, // TCTGAACTCCAG + 13378066, // TATAAGAGACAG + 9366722, // GATGTGTATAAG + 12265986, // GTGTAGGGAAAG + 7413245, // CTACACTCTTTC + 8529597, // GAAGAGCGGTTC + 14805181, // TGACTGGAGTTC + 13523837, // TATGCCGTCTTC + 7268221, // CGTGTGCTCTTC + 8480637, // GAACCGCTCTTC + 1599357, // ACGACGCTCTTC + 12986237, // TACGAGCTCTTC + 12020029, // GTCTCGGCATTC + 1411133, // ACCCGAGAATTC + 3632045, // ATCTCGGTGGTC + 1609133, // ACGAGATCGGTC + 6824557, // CGGAAGAGCGTC + 11965037, // GTCGGCAGCGTC + 14367085, // TCGTATGCCGTC + 2245741, // AGAGCACACGTC + 14710061, // TGAACTCCAGTC + 8968925, // GAGATCGGTCTC + 8935645, // GAGACCGATCTC + 3066077, // AGTGTAGATCTC + 13386973, // TATACACATCTC + 2210717, // AGACGTGTGCTC + 14531229, // TCTCGTGGGCTC + 10356125, // GCTGAACCGCTC + 12870045, // TACACGACGCTC + 9640093, // GCATACGAGCTC + 2322717, // AGATCTACACTC + 1800221, // ACGTCTGAACTC + 2407117, // AGCAGTGGTATC + 11376333, // GGTCGCCGTATC + 10352013, // GCTCTTCCGATC + 5801357, // CCGAGACCGATC + 2288781, // AGAGTGTAGATC + 1333389, // ACCACCGAGATC + 9640077, // GCATACGAGATC + 8176717, // CTTATACACATC + 13769657, // TCAGACGTGTGC + 9887609, // GCCGTCTTCTGC + 5162873, // CATGTACTCTGC + 3638905, // ATCTGACGCTGC + 6897017, // CGGCATTCCTGC + 14881913, // TGATACCACTGC + 9296697, // GATCTCGTATGC + 4794425, // CAGCAGGAATGC + 3585897, // ATCGGTCTCGGC + 9570409, // GCAGAAGACGGC + 7781081, // CTCGGTGGTCGC + 7987289, // CTGCTGAACCGC + 4513305, // CACATCTGACGC + 6047257, // CCTACACGACGC + 15414297, // TGGTATCAACGC + 1275273, // ACATCTCCGAGC + 6893961, // CGGCATACGAGC + 9281673, // GATCGGAAGAGC + 9023305, // GAGCGGTTCAGC + 12875765, // TACACTCTTTCC + 12295669, // GTGTGCTCTTCC + 368117, // AACCGCTCTTCC + 6397429, // CGACGCTCTTCC + 1613301, // ACGAGCTCTTCC + 14525685, // TCTCGGCATTCC + 5644533, // CCCGAGAATTCC + 3216245, // ATACACATCTCC + 7200885, // CGTCTGAACTCC + 14555621, // TCTGACGCTGCC + 3632357, // ATCTCGTATGCC + 2400485, // AGCAGGAATGCC + 14347109, // TCGGTGGTCGCC + 5101093, // CATCTCCGAGCC + 1171413, // ACACTCTTTCCC + 3627157, // ATCTCCGAGCCC + 15136965, // TGCGTTGATACC + 9202053, // GATACGGCGACC + 940165, // AATGCCGAGACC + 1728837, // ACGGCGACCACC + 14130693, // TCCTGCTGAACC + 411825, // AACGCAGAGTAC + 1450865, // ACCGAGATCTAC + 7861617, // CTCTTTCCCTAC + 15589169, // TGTCTCTTATAC + 7978545, // CTGCGTTGATAC + 551217, // AAGACGGCATAC + 12864993, // TACACATCTGAC + 14883425, // TGATACGGCGAC + 8812897, // GACGCTGCCGAC + 14009441, // TCCCTACACGAC + 8623649, // GAATGCCGAGAC + 2442785, // AGCCCACGAGAC + 15516193, // TGTATAAGAGAC + 15252769, // TGGAGTTCAGAC + 4343841, // CAAGCAGAAGAC + 479953, // AACTCCAGTCAC + 14289041, // TCGGAAGAGCAC + 7702865, // CTCCGAGCCCAC + 7310417, // CGTTGATACCAC + 13015121, // TACGGCGACCAC + 6436625, // CGAGATCTACAC + 8345361, // CTTTCCCTACAC + 14545681, // TCTCTTATACAC + 10520849, // GGAAGAGCACAC + 1161089, // ACACGTCTGAAC + 16115585, // TTCCTGCTGAAC + 3060545, // AGTGGTATCAAC + 9882940, // GCCGTATCATTA + 6852332, // CGGAGATGTGTA + 11833068, // GTCAGATGTGTA + 9025260, // GAGCGTCGTGTA + 10494700, // GGAAAGAGTGTA + 1455980, // ACCGATCTCGTA + 12245356, // GTGGTCGCCGTA + 4297260, // CAACGCAGAGTA + 4557020, // CACCGAGATCTA + 1965404, // ACTCTTTCCCTA + 8091596, // CTGTCTCTTATA + 8974028, // GAGATGTGTATA + 4779724, // CAGATGTGTATA + 14577548, // TCTGCGTTGATA + 8526412, // GAAGACGGCATA + 1959672, // ACTCTGCGTTGA + 14645752, // TCTTCTGCTTGA + 9509752, // GCACACGTCTGA + 3216248, // ATACACATCTGA + 5201528, // CATTCCTGCTGA + 11988136, // GTCGTGTAGGGA + 7251816, // CGTGGGCTCGGA + 11487528, // GGTTCAGCAGGA + 3720856, // ATGATACGGCGA + 12181336, // GTGCTCTTCCGA + 5889880, // CCGCTCTTCCGA + 1695576, // ACGCTCTTCCGA + 9035608, // GAGCTCTTCCGA + 1128280, // ACACATCTCCGA + 14786136, // TGACGCTGCCGA + 4853336, // CAGGAATGCCGA + 8294744, // CTTGGCACCCGA + 15042648, // TGCCGAGACCGA + 10884184, // GGCGACCACCGA + 8819480, // GACGGCATACGA + 6788632, // CGCTGCCGACGA + 5805336, // CCGAGCCCACGA + 16085272, // TTCCCTACACGA + 143048, // AAAGAGTGTAGA + 15365768, // TGGGCTCGGAGA + 10544520, // GGAATGCCGAGA + 15275400, // TGGCACCCGAGA + 6374792, // CGACCACCGAGA + 6893960, // CGGCATACGAGA + 8999304, // GAGCCCACGAGA + 12267656, // GTGTATAAGAGA + 8007496, // CTGGAGTTCAGA + 10787656, // GGCAGCGTCAGA + 13436488, // TATCAACGCAGA + 3912456, // ATGTGTATAAGA + 15509512, // TGTAGGGAAAGA + 563956, // AAGAGCGGTTCA + 8889076, // GACTGGAGTTCA + 14305716, // TCGGCAGCGTCA + 8508596, // GAACTCCAGTCA + 9628468, // GCAGTGGTATCA + 11950900, // GTCGCCGTATCA + 14343588, // TCGGTCTCGGCA + 4727204, // CAGAAGACGGCA + 11325540, // GGTATCAACGCA + 3572260, // ATCGGAAGAGCA + 2538788, // AGCGGTTCAGCA + 5800916, // CCGAGAATTCCA + 12026324, // GTCTGAACTCCA + 14508628, // TCTCCGAGCCCA + 10216212, // GCGTTGATACCA + 3253780, // ATACGGCGACCA + 1647300, // ACGCAGAGTACA + 5803460, // CCGAGATCTACA + 14669252, // TCTTTCCCTACA + 12025028, // GTCTCTTATACA + 11733124, // GTATAAGAGACA + 6824516, // CGGAAGAGCACA + 7851076, // CTCTTATACACA + 5977328, // CCGTATCATTAA + 2341680, // AGATGTGTATAA + 8251360, // CTTCTGCTTGAA + 4484576, // CACACGTCTGAA + 4028896, // ATTCCTGCTGAA + 14398112, // TCGTGTAGGGAA + 12395680, // GTTCAGCAGGAA + 10769952, // GGCACCCGAGAA + 5557280, // CCCATATAAGAA + 4959440, // CAGTGGTATCAA + 7132096, // CGTATCATTAAA + 16228224, // TTCTGCTTGAAA + 7260800, // CGTGTAGGGAAA + 5451904, // CCATATAAGAAA + 11751168, // GTATCATTAAAA + 14581248, // TCTGCTTGAAAA + 13450240, // TATCATTAAAAA + 7993344, // CTGCTTGAAAAA + 3469312, // ATCATTAAAAAA + 15196160, // TGCTTGAAAAAA + 13877248, // TCATTAAAAAAA + 10452992, // GCTTGAAAAAAA + 5177344, // CATTAAAAAAAA + 8257536, // CTTGAAAAAAAA + 3932160, // ATTAAAAAAAAA + 16252928, // TTGAAAAAAAAA + 15728640, // TTAAAAAAAAAA + 14680064, // TGAAAAAAAAAA + }; +} \ No newline at end of file diff --git a/src/common/illumina_adapters_static.h b/src/common/illumina_adapters_static.h new file mode 100644 index 0000000..b90a3d7 --- /dev/null +++ b/src/common/illumina_adapters_static.h @@ -0,0 +1,12 @@ +#ifndef _ILLUMINA_ADAPTERS_STATIC +#define _ILLUMINA_ADAPTERS_STATIC +#include +#include + +class IlluminaAdaptersStatic { + +public: + static std::vector Get12Mers(); +}; + +#endif // !_ILLUMINA_ADAPTERS_STATIC diff --git a/src/satc/kmc_api/kmc_file.cpp b/src/common/kmc_api/kmc_file.cpp similarity index 100% rename from src/satc/kmc_api/kmc_file.cpp rename to src/common/kmc_api/kmc_file.cpp diff --git a/src/satc/kmc_api/kmc_file.h b/src/common/kmc_api/kmc_file.h similarity index 100% rename from src/satc/kmc_api/kmc_file.h rename to src/common/kmc_api/kmc_file.h diff --git a/src/satc/kmc_api/kmer_api.cpp b/src/common/kmc_api/kmer_api.cpp similarity index 100% rename from src/satc/kmc_api/kmer_api.cpp rename to src/common/kmc_api/kmer_api.cpp diff --git a/src/satc/kmc_api/kmer_api.h b/src/common/kmc_api/kmer_api.h similarity index 100% rename from src/satc/kmc_api/kmer_api.h rename to src/common/kmc_api/kmer_api.h diff --git a/src/satc/kmc_api/kmer_defs.h b/src/common/kmc_api/kmer_defs.h similarity index 100% rename from src/satc/kmc_api/kmer_defs.h rename to src/common/kmc_api/kmer_defs.h diff --git a/src/satc/kmc_api/mmer.cpp b/src/common/kmc_api/mmer.cpp similarity index 100% rename from src/satc/kmc_api/mmer.cpp rename to src/common/kmc_api/mmer.cpp diff --git a/src/satc/kmc_api/mmer.h b/src/common/kmc_api/mmer.h similarity index 100% rename from src/satc/kmc_api/mmer.h rename to src/common/kmc_api/mmer.h diff --git a/src/common/satc_data.h b/src/common/satc_data.h index 7124920..aa323d6 100644 --- a/src/common/satc_data.h +++ b/src/common/satc_data.h @@ -9,6 +9,12 @@ #include "../../libs/refresh/zstd_file.h" +#ifdef _WIN32 +#define _bswap64(x) _byteswap_uint64(x) +#else +#define _bswap64(x) __builtin_bswap64(x) +#endif + #define USE_ZSTD_FOR_TEMPS template @@ -271,6 +277,31 @@ bool LoadBigEndian(std::vector::iterator& p, T& data, uint8_t data_size return true; } +inline uint32_t get_rev_compl_shift(uint32_t len) +{ + return (len + 31) / 32 * 32 - len; +} + +//instead of x >> 2*p there is (x>>p) >> p, because +//for p=32 we would have x >> 64 which is UB +inline uint64_t shr_2p(uint64_t x, uint64_t p) { + return (x >> p) >> p; +} + +inline uint64_t get_rev_compl(const uint64_t kmer, const uint32_t shift) { + uint64_t res; + + const uint64_t mask4 = 0b0000111100001111000011110000111100001111000011110000111100001111ull; + const uint64_t mask2 = 0b0011001100110011001100110011001100110011001100110011001100110011ull; + + uint64_t sf4 = ((kmer & mask4) << 4) + ((kmer >> 4) & mask4); + uint64_t sf2 = ((sf4 & mask2) << 2) + ((sf4 >> 2) & mask2); + + res = shr_2p((~_bswap64(sf2)), shift); + + return res; +} + inline uint64_t str_kmer_to_uint64_t(const std::string& kmer) { uint64_t res{}; static auto mapping = []() { @@ -351,14 +382,14 @@ struct Header { } }; -enum class RecFmt { SATC, NOMAD }; +enum class RecFmt { SATC, SPLASH }; struct RecFmtConv { inline static RecFmt from_string(const std::string& str) { if (str == "satc") return RecFmt::SATC; - else if (str == "nomad") - return RecFmt::NOMAD; + else if (str == "splash") + return RecFmt::SPLASH; else { std::cerr << "Error: cannot convert \"" << str << "\" to RecordPrintFormat\n"; exit(1); @@ -369,8 +400,8 @@ struct RecFmtConv { { case RecFmt::SATC: return "satc"; - case RecFmt::NOMAD: - return "nomad"; + case RecFmt::SPLASH: + return "splash"; default: std::cerr << "Error: unsupported, swich in RecordPrintFormat should be extended"; exit(1); @@ -509,7 +540,7 @@ struct Record oss << kmer_to_string(target, header.target_len_symbols) << "\t"; oss << count << "\n"; } - else if (format == RecFmt::NOMAD) { + else if (format == RecFmt::SPLASH) { oss << count << " "; oss << kmer_to_string(anchor, header.anchor_len_symbols); oss << kmer_to_string(target, header.target_len_symbols) << " "; diff --git a/src/common/target_count.h b/src/common/target_count.h new file mode 100644 index 0000000..db0c5b5 --- /dev/null +++ b/src/common/target_count.h @@ -0,0 +1,16 @@ +#ifndef _TARGET_COUNT_H +#define _TARGET_COUNT_H +#include + +struct TargetCount { + uint64_t target; + uint32_t count; + TargetCount(uint64_t target, uint32_t count) : + target(target), + count(count) + { + + } +}; + +#endif //_TARGET_COUNT_H diff --git a/src/common/version.h b/src/common/version.h index 8871377..de57976 100644 --- a/src/common/version.h +++ b/src/common/version.h @@ -1,7 +1,7 @@ #pragma once -#define NOMAD_VER "2.0.0" +#define SPLASH_VER "2.1.4" -inline void NOMAD_VER_PRINT(std::ostream& oss) { - oss << "Version: " << NOMAD_VER << "\n"; +inline void SPLASH_VER_PRINT(std::ostream& oss) { + oss << "Version: " << SPLASH_VER << "\n"; } diff --git a/src/satc/satc.cpp b/src/satc/satc.cpp index 06bb4b1..8e78ef6 100644 --- a/src/satc/satc.cpp +++ b/src/satc/satc.cpp @@ -4,11 +4,15 @@ #include #include #include "../common/version.h" -#include "kmc_api/kmer_api.h" -#include "kmc_api/kmc_file.h" +#include "../common/kmc_api/kmer_api.h" +#include "../common/kmc_api/kmc_file.h" #include "../common/murmur64.h" #include "../common/satc_data.h" #include "../common/poly_ACGT_filter.h" +#include "../common/artifacts_filter.h" +#include "../common/hamming_filter.h" +#include "../common/illumina_adapters_static.h" +#include "../common/target_count.h" struct SampleDesc { @@ -24,6 +28,9 @@ struct Params uint64_t n_bins{}; uint64_t anchor_sample_counts_threshold{}; //keep only anchors with counts > anchor_sample_counts_threshold uint64_t poly_ACGT_len{}; + uint64_t min_hamming_threshold{}; + std::string artifacts; + bool dont_filter_illumina_adapters = false; std::string out_base; void Print(std::ostream& oss) const { @@ -33,11 +40,14 @@ struct Params oss << "outbase : " << out_base << "\n"; oss << "anchor_sample_counts_threshold : " << anchor_sample_counts_threshold << "\n"; oss << "poly_ACGT_len : " << poly_ACGT_len << "\n"; + oss << "artifacts : " << artifacts << "\n"; + oss << "dont_filter_illumina_adapters : " << std::boolalpha << dont_filter_illumina_adapters << "\n"; + oss << "min_hamming_threshold : " << min_hamming_threshold << "\n"; oss << "input sample : " << input_sample.input_kmc_db_path << " " << input_sample.sample_id << "\n"; } static void Usage(char* prog_name) { std::cerr << "satc (sample anchor target count)\n"; - NOMAD_VER_PRINT(std::cerr); + SPLASH_VER_PRINT(std::cerr); std::cerr << "Usage: \n\t" << prog_name << " [options] \n"; std::cerr << "Positional parameters:\n" @@ -50,12 +60,17 @@ struct Params << " --target_len - target len\n" << " --n_bins - number of output bins\n" << " --poly_ACGT_len - all anchors containing polyACGT of this length will be filtered out (0 means no filtering)\n" - << " --anchor_sample_counts_threshold - keep only anchors with counts > anchor_sample_counts_threshold\n"; + << " --artifacts - path to artifacts, each anchor containing artifact will be filtered out\n" + << " --dont_filter_illumina_adapters - if used anchors containing Illumina adapters will not be filtered out\n" + << " --anchor_sample_counts_threshold - keep only anchors with counts > anchor_sample_counts_threshold\n" + << " --min_hamming_threshold - keep only anchors with a pair of targets that differ by >= min_hamming_threshold\n"; } }; struct Stats { uint64_t tot_poly_filtered_out{}; + uint64_t tot_artifacts_filtered_out{}; + uint64_t tot_hamming_distance_filtered_out{}; uint64_t tot_cnt_threshold_filtered_out{}; uint64_t tot_unique_anchors{}; uint64_t tot_out_recs{}; @@ -63,6 +78,8 @@ struct Stats { void print(std::ostream& oss) const { oss << "# poly filtered anchors : " << tot_poly_filtered_out << "\n"; + oss << "# artifacts filtered anchors : " << tot_artifacts_filtered_out << "\n"; + oss << "# hamming distance filtered anchors : " << tot_hamming_distance_filtered_out << "\n"; oss << "# filtered cnt threshold anchors : " << tot_cnt_threshold_filtered_out << "\n"; oss << "# unique anchors : " << tot_unique_anchors << "\n"; oss << "# out recs : " << tot_out_recs << "\n"; @@ -105,6 +122,14 @@ Params read_params(int argc, char** argv) std::string tmp = argv[++i]; res.poly_ACGT_len = std::stoull(tmp); } + else if (param == "--min_hamming_threshold") { + std::string tmp = argv[++i]; + res.min_hamming_threshold = std::stoull(tmp); + } + else if (param == "--artifacts") + res.artifacts = argv[++i]; + else if (param == "--dont_filter_illumina_adapters") + res.dont_filter_illumina_adapters = true; } if (i >= argc) { std::cerr << "Error: outbase missing\n"; @@ -160,17 +185,6 @@ void split(CKmerAPI& to_split, uint64_t& anchor, uint64_t& target, uint8_t ancho target = to_split.subkmer(to_split.get_len() - target_len_symbols, target_len_symbols); } -struct TargetCount { - uint64_t target; - uint32_t count; - TargetCount(uint64_t target, uint32_t count) : - target(target), - count(count) - { - - } -}; - void sort_merge_and_store(uint64_t anchor, std::vector& targets_in_current_anchor, const Header& header, @@ -178,6 +192,8 @@ void sort_merge_and_store(uint64_t anchor, std::vector& bins, uint64_t anchor_sample_counts_threshold, const PolyACGTFilter& poly_ACGT_filter, + const ArtifactsFilter& artifacts_filter, + const HammingFilter& hamming_filter, Stats& stats) { rec.anchor = anchor; if (targets_in_current_anchor.empty()) @@ -192,13 +208,23 @@ void sort_merge_and_store(uint64_t anchor, for (auto& x : targets_in_current_anchor) tot_count += x.count; + if (tot_count <= anchor_sample_counts_threshold) { + ++stats.tot_cnt_threshold_filtered_out; + return; + } + if (poly_ACGT_filter.IsPolyACGT(anchor, header.anchor_len_symbols)) { ++stats.tot_poly_filtered_out; return; } - if (tot_count <= anchor_sample_counts_threshold) { - ++stats.tot_cnt_threshold_filtered_out; + if (artifacts_filter.ContainsArtifact(anchor, header.anchor_len_symbols)) { + ++stats.tot_artifacts_filtered_out; + return; + } + + if (!hamming_filter.ContainsDistantPair(targets_in_current_anchor)) { + ++stats.tot_hamming_distance_filtered_out; return; } @@ -233,6 +259,8 @@ void process_kmc_db(const std::string& path, std::vector& bins, uint64_t anchor_sample_counts_threshold, const PolyACGTFilter& poly_ACGT_filter, + const ArtifactsFilter& artifacts_filter, + const HammingFilter& hamming_filter, Stats& stats) { CKMCFile kmc_db; @@ -279,7 +307,17 @@ void process_kmc_db(const std::string& path, if (prev_anchor == anchor) targets_in_current_anchor.emplace_back(target, count); else { - sort_merge_and_store(prev_anchor, targets_in_current_anchor, header, rec, bins, anchor_sample_counts_threshold, poly_ACGT_filter, stats); + sort_merge_and_store(prev_anchor, + targets_in_current_anchor, + header, + rec, + bins, + anchor_sample_counts_threshold, + poly_ACGT_filter, + artifacts_filter, + hamming_filter, + stats); + targets_in_current_anchor.clear(); targets_in_current_anchor.emplace_back(target, count); prev_anchor = anchor; @@ -291,7 +329,16 @@ void process_kmc_db(const std::string& path, } std::cerr << "\n"; - sort_merge_and_store(prev_anchor, targets_in_current_anchor, header, rec, bins, anchor_sample_counts_threshold, poly_ACGT_filter, stats); + sort_merge_and_store(prev_anchor, + targets_in_current_anchor, + header, + rec, + bins, + anchor_sample_counts_threshold, + poly_ACGT_filter, + artifacts_filter, + hamming_filter, + stats); targets_in_current_anchor.clear(); } @@ -361,8 +408,21 @@ int main(int argc, char** argv) Stats stats; PolyACGTFilter poly_ACGT_filter(params.poly_ACGT_len); - - process_kmc_db(params.input_sample.input_kmc_db_path, params.input_sample.sample_id, header, out_files, params.anchor_sample_counts_threshold, poly_ACGT_filter, stats); + ArtifactsFilter artifacts_filter(params.artifacts); + HammingFilter hamming_filter(params.min_hamming_threshold); + + if (!params.dont_filter_illumina_adapters) + artifacts_filter.Add(12, IlluminaAdaptersStatic::Get12Mers()); + + process_kmc_db(params.input_sample.input_kmc_db_path, + params.input_sample.sample_id, + header, + out_files, + params.anchor_sample_counts_threshold, + poly_ACGT_filter, + artifacts_filter, + hamming_filter, + stats); stats.print(std::cerr); } diff --git a/src/satc_dump/satc_dump.cpp b/src/satc_dump/satc_dump.cpp index 48cf535..e9a752b 100644 --- a/src/satc_dump/satc_dump.cpp +++ b/src/satc_dump/satc_dump.cpp @@ -30,7 +30,7 @@ struct Params { static void Usage(char* prog_name) { std::cerr << "satc_dump\n"; - NOMAD_VER_PRINT(std::cerr); + SPLASH_VER_PRINT(std::cerr); std::cerr << "Usage: \n\t" << prog_name << " [options] \n"; std::cerr << "or\n"; std::cerr << "\t" << prog_name << " --which-bin --n_bins \n"; @@ -42,7 +42,7 @@ struct Params { << "Options:\n" << " --anchor_list - path to text file containing anchors separated by whitespaces, only anchors from this file will be dumped\n" << " --sample_names - path for decode sample id, each line should contain \n" - << " --format - output format, available options: satc, nomad (default: satc)\n" + << " --format - output format, available options: satc, splash (default: satc)\n" << " --n_bins - if set to value different than 0 the input is interpreted as a list of bins (each bin in separate line, first list is bin_0, second line is bin_1, etc. (in case of ill-formed input results will be incorrect)\n" << " --separately - if set with n_bins != 0 output param will be treated as suffix name and there will be output for each bin\n"; } diff --git a/src/satc_merge/extra_stats.cpp b/src/satc_merge/extra_stats.cpp index 5f90a3f..cb6c39b 100644 --- a/src/satc_merge/extra_stats.cpp +++ b/src/satc_merge/extra_stats.cpp @@ -1,4 +1,5 @@ #include "extra_stats.h" +#include "../../libs/refresh/deterministic_random.h" #include #include #include @@ -109,8 +110,8 @@ void CExtraStats::determine_monte_carlo_trials(size_t no_trials) auto tot_count = cum_counts.back(); std::mt19937_64 mt; - std::uniform_int_distribution<> dist1(0, tot_count - 1); - std::uniform_int_distribution<> dist2(0, tot_count - 2); + det_uniform_int_distribution<> dist1(0, tot_count - 1); + det_uniform_int_distribution<> dist2(0, tot_count - 2); // std::multiset> unq_s; // !!! TODO: zmienic na unordered_multiset, albo od razu wyniki wpisywac do monte_carlo_trials diff --git a/src/satc_merge/get_train_mtx.h b/src/satc_merge/get_train_mtx.h index 9e440be..cc14210 100644 --- a/src/satc_merge/get_train_mtx.h +++ b/src/satc_merge/get_train_mtx.h @@ -9,53 +9,24 @@ using std::cerr; using std::endl; -namespace detail { - //my iteartor to not generate the range... - struct MyIt { - - using iterator_category = std::random_access_iterator_tag; - using value_type = size_t; - using reference = value_type&; - using difference_type = size_t; - - size_t v; - MyIt(size_t v) : v(v) {} - - - reference operator*() { - return v; - } - MyIt& operator++() { - ++v; - return *this; - } - difference_type operator-(const MyIt& rhs) const { - return v - rhs.v; - } +template +void my_sample(size_t max_val, std::vector& dest, size_t to_select, RNG&& eng) { + dest.resize(max_val); + std::iota(dest.begin(), dest.end(), 0ull); - bool operator==(const MyIt& rhs) { - return v == rhs.v; - } - }; -} + for (size_t i = 0; i < to_select; ++i) + std::swap(dest[i], dest[i + eng() % (dest.size() - i)]); -template<> struct std::iterator_traits -{ - using iterator_category = typename detail::MyIt::iterator_category; - using value_type = typename detail::MyIt::value_type; - using difference_type = typename detail::MyIt::difference_type; -}; - -template -void my_sample(size_t start, size_t end, OutIter dest, size_t N, RNG&& eng) { - std::sample(detail::MyIt(start), detail::MyIt(end), dest, N, eng); + dest.resize(to_select); + sort(dest.begin(), dest.end()); } template -//refresh::matrix_sparse get_train_mtx_2(const refresh::matrix_sparse& X, double opt_train_fraction, RNG&& eng) -refresh::matrix_sparse_compact get_train_mtx_2(const refresh::matrix_sparse_compact& X, double opt_train_fraction, RNG&& eng) +refresh::matrix_sparse_compact get_train_mtx_2( + const refresh::matrix_sparse_compact& X, + double opt_train_fraction, + RNG&& eng) { - //refresh::matrix_sparse res(X.rows(), X.cols()); refresh::matrix_sparse_compact res(X.rows(), X.cols()); res.reserve(X.data_size()); @@ -72,9 +43,8 @@ refresh::matrix_sparse_compact get_ tot_cs += p_col_end->second; size_t to_select = tot_cs * opt_train_fraction; - indices.resize(to_select + 1); // + 1 for a guard - my_sample(0, tot_cs, indices.begin(), to_select, eng); - indices.back() = std::numeric_limits::max(); // guard + my_sample(tot_cs, indices, to_select, eng); + indices.push_back(std::numeric_limits::max()); // guard double cs = 0; diff --git a/src/satc_merge/matrix_sparse_compact.h b/src/satc_merge/matrix_sparse_compact.h index 3f7d657..0e11c55 100644 --- a/src/satc_merge/matrix_sparse_compact.h +++ b/src/satc_merge/matrix_sparse_compact.h @@ -123,6 +123,11 @@ template class matrix_sparse_compact return *this; } + bool operator==(const matrix_sparse_compact& x) const + { + return n_rows == x.n_rows && n_cols == x.n_cols && data == x.data; + } + ~matrix_sparse_compact() {} @@ -351,7 +356,7 @@ template class matrix_sparse_compact if (p_src1->first == p_src2->first) { elem.second -= p_src2->second; - if(elem.second != (T)0) + if (elem.second != (T)0) *p_dest++ = elem; ++p_src1; ++p_src2; @@ -390,6 +395,60 @@ template class matrix_sparse_compact return *this; } + matrix_sparse_compact& operator+=(matrix_sparse_compact& x) + { + data_t new_elems; + + auto p_src1 = data.begin(); + auto p_src2 = x.data.begin(); + auto p_dest = data.begin(); + + auto p_end1 = data.end(); + auto p_end2 = x.data.end(); + + while (p_src1 != p_end1 && p_src2 != p_end2) + { + auto elem = *p_src1; + + if (p_src1->first == p_src2->first) + { + elem.second += p_src2->second; + if (elem.second != (T)0) + *p_dest++ = elem; + ++p_src1; + ++p_src2; + } + else if (p_src1->first < p_src2->first) + { + *p_dest++ = elem; + ++p_src1; + } + else + { + new_elems.emplace_back(*p_src2); + ++p_src2; + } + } + + if (p_src1 != p_end1) + while (p_src1 != p_end1) + *p_dest++ = *p_src1++; + else + new_elems.insert(new_elems.end(), p_src2, p_end2); + + data.erase(p_dest, p_end1); + + if (!new_elems.empty()) + { + data.insert(data.end(), new_elems.begin(), new_elems.end()); + + // TODO: consider using inplace_merge here + std::sort(data.begin(), data.end()); + } + + return *this; + } + template matrix_sparse_compact compact(std::vector& preserve_rows, std::vector& preserve_cols) const { diff --git a/src/satc_merge/pvals.cpp b/src/satc_merge/pvals.cpp index 0620cf2..df2d905 100644 --- a/src/satc_merge/pvals.cpp +++ b/src/satc_merge/pvals.cpp @@ -5,6 +5,7 @@ #include #include +#include "../../libs/refresh/deterministic_random.h" #include "pvals.h" #include "get_train_mtx.h" #include "helmert_decomposition.h" @@ -117,9 +118,6 @@ void normalize_vec_0_1(refresh::matrix_1d& vec) vec(i) = (vec(i) - _min) / dif; } -// --is_10X --n_most_freq_targets 10 --anchor_count_threshold 50 --anchor_unique_targets_threshold 1 --anchor_samples_threshold 1 stats/bin_1.txt_new out/S10_cnts_1 out/S11_cnts_1 out/S12_cnts_1 out/S13_cnts_1 -// --is_10X --n_most_freq_targets 10 --anchor_count_threshold 50 --anchor_unique_targets_threshold 1 --anchor_samples_threshold 1 stats/bin_1.txt_new c:/linux/nomad-pp-dev/example/S10_cnts_1.zst c:/linux/nomad-pp-dev/example/S11_cnts_1.zst c:/linux/nomad-pp-dev/example/S12_cnts_1.zst c:/linux/nomad-pp-dev/example/S13_cnts_1.zst -//double effectSize_cts(const refresh::matrix_sparse& X, const refresh::matrix_1d& cOpt, const refresh::matrix_1d& fOpt) { double effectSize_cts(const refresh::matrix_sparse_compact& X, const refresh::matrix_1d& cOpt, const refresh::matrix_1d& fOpt) { auto f_abs_sum = (X * abs(cOpt)).sum(); @@ -159,7 +157,7 @@ double effectSize_bin(const refresh::matrix_sparse_compact& X, const refresh::matrix_1d& cOpt, const refresh::matrix_1d& fOpt) +double computeAsympSPLASH(const refresh::matrix_sparse_compact& X, const refresh::matrix_1d& cOpt, const refresh::matrix_1d& fOpt) { if (cOpt.all_items_same()) return 1.0; @@ -203,7 +201,7 @@ double computeAsympNOMAD(const refresh::matrix_sparse_compact= v2) { @@ -349,10 +347,10 @@ double altMaximize(const refresh::matrix_sparse_compact& X_org, refresh::matrix_1d& cOpt, refresh::matrix_1d& fOpt, int no_tries, int opt_num_iters) void generate_alt_max_cf(const refresh::matrix_sparse_compact& X_org, refresh::matrix_1d& cOpt, refresh::matrix_1d& fOpt, int no_tries, int opt_num_iters) { - std::default_random_engine eng; - std::uniform_int_distribution dist_0_1(0, 1); + std::mt19937_64 eng; + det_uniform_int_distribution dist_0_1(0, 1); double minus_1_and_1[] = { -1, 1 }; std::vector non_zero_cols; @@ -422,8 +420,8 @@ void generate_alt_max_cf(const refresh::matrix_sparse_compact& X_org, const refresh::matrix_1d& cOpt, refresh::matrix_1d& fOpt) { - std::default_random_engine eng; - std::uniform_int_distribution dist_0_1(0, 1); + std::mt19937_64 eng; + det_uniform_int_distribution dist_0_1(0, 1); std::vector non_zero_rows; @@ -449,9 +447,9 @@ void generate_f_from_c(const refresh::matrix_sparse_compact fMax; @@ -503,10 +501,12 @@ void get_most_freq_targets( } } +double calc_pval_base( + const refresh::matrix_sparse_compact& sp_anch_contingency_table, + size_t num_rand_cf, + std::mt19937_64& eng) { -//double calc_pval_base(const refresh::matrix_sparse& sp_anch_contingency_table, size_t num_rand_cf, std::default_random_engine& eng) { -double calc_pval_base(const refresh::matrix_sparse_compact& sp_anch_contingency_table, size_t num_rand_cf, std::default_random_engine& eng) { - std::uniform_int_distribution dist_0_1(0, 1); + det_uniform_int_distribution dist_0_1(0, 1); double minus_1_and_1[] = { -1, 1 }; double zero_and_1[] = { 0, 1 }; double min_pval = std::numeric_limits::max(); @@ -528,11 +528,16 @@ double calc_pval_base(const refresh::matrix_sparse_compact& sp_anch_contingency_table, size_t num_rand_cf, std::default_random_engine& eng) { // Extended version, returning also old_pval_base and fOptR and cOptR -double calc_pval_base_ext(const refresh::matrix_sparse_compact& sp_anch_contingency_table, size_t num_rand_cf, std::default_random_engine& eng, - double &pval_base_old, refresh::matrix_1d &fOptR, refresh::matrix_1d &cOptR) { - std::uniform_int_distribution dist_0_1(0, 1); +double calc_pval_base_ext( + const refresh::matrix_sparse_compact& sp_anch_contingency_table, + size_t num_rand_cf, + std::mt19937_64& eng, + double &pval_base_old, + refresh::matrix_1d &fOptR, + refresh::matrix_1d &cOptR) { + + det_uniform_int_distribution dist_0_1(0, 1); double minus_1_and_1[] = { -1, 1 }; double zero_and_1[] = { 0, 1 }; double min_pval = std::numeric_limits::max(); @@ -574,12 +579,14 @@ void compute_stats( AnchorStats& anchor_stats, bool without_alt_max, bool with_effect_size_cts, + bool with_pval_asymp_opt, bool compute_also_old_base_pvals, uint32_t n_most_freq_targets, double opt_train_fraction, int opt_num_inits, int opt_num_iters, size_t num_rand_cf, + size_t num_splits, CjWriter& cj_writer, double max_pval_opt_for_Cjs, CBCToCellType* cbc_to_cell_type, @@ -596,8 +603,7 @@ void compute_stats( // refresh::matrix_sparse sp_anch_contingency_table(n_uniq_targets, n_unique_samples); refresh::matrix_sparse_compact sp_anch_contingency_table(n_uniq_targets, n_unique_samples); - std::default_random_engine eng; - //std::default_random_engine eng(std::random_device{}()); + std::mt19937_64 eng; SampleToColMapper mapper(unique_samples); @@ -614,7 +620,7 @@ void compute_stats( sp_anch_contingency_table.reserve(anchor.data.size()); - size_t row_id = 0; + size_t row_id = 0; for (const auto& e : anchor.data) { if (target != e.target) { //new target -> new row ++row_id; @@ -674,13 +680,55 @@ void compute_stats( anchor_stats.pval_opt = testPval(Xtest, cOpt, fOpt); + if (with_pval_asymp_opt) + anchor_stats.pval_asymp_opt = computeAsympSPLASH(Xtest, cOpt, fOpt); + anchor_stats.effect_size_bin = effectSize_bin(Xtest, cOpt, fOpt); if (with_effect_size_cts) anchor_stats.effect_size_cts = effectSize_cts(Xtest, cOpt, fOpt); - anchor_stats.pval_asymp_base = computeAsympNOMAD(Xtest, cOpt, fOpt); + if (num_splits > 1) { + //I create new cOpt because in the cOpt I will have the optimal one, which may be needed to print it + refresh::matrix_1d new_cOpt(Xtrain.cols()); + for (size_t split_no = 1; split_no < num_splits; ++split_no) + { + //reconstruct contingency table + sp_anch_contingency_table = std::move(Xtest += Xtrain); + + //make a new split + Xtrain = get_train_mtx_2(sp_anch_contingency_table, opt_train_fraction, eng); + sp_anch_contingency_table -= Xtrain; + auto Xtest = std::move(sp_anch_contingency_table); + + generate_alt_max_cf(Xtrain, new_cOpt, fOpt, opt_num_inits, opt_num_iters); + + if (with_pval_asymp_opt) { + auto new_pval_asymp_opt = computeAsympSPLASH(Xtest, new_cOpt, fOpt); + if (new_pval_asymp_opt < anchor_stats.pval_asymp_opt) + anchor_stats.pval_asymp_opt = new_pval_asymp_opt; + } + + auto new_pval_opt = testPval(Xtest, new_cOpt, fOpt); + if (new_pval_opt < anchor_stats.pval_opt) + { + anchor_stats.pval_opt = new_pval_opt; + anchor_stats.effect_size_bin = effectSize_bin(Xtest, new_cOpt, fOpt); + + if (with_effect_size_cts) + anchor_stats.effect_size_cts = effectSize_cts(Xtest, new_cOpt, fOpt); + + if (cj_writer) + cOpt = new_cOpt; + } + } + + anchor_stats.pval_opt *= num_splits; + if (with_pval_asymp_opt) { + anchor_stats.pval_asymp_opt *= num_splits; + } + } if (cj_writer && anchor_stats.pval_opt <= max_pval_opt_for_Cjs) { for (size_t col_id = 0; col_id < cOpt.size(); ++col_id) diff --git a/src/satc_merge/pvals.h b/src/satc_merge/pvals.h index a4d7738..8c5019a 100644 --- a/src/satc_merge/pvals.h +++ b/src/satc_merge/pvals.h @@ -21,7 +21,7 @@ struct AnchorStats { double effect_size_cts; double effect_size_bin; double effect_size_bin_old; - double pval_asymp_base; + double pval_asymp_opt; double entropy; double avg_no_homopolymer_targets; double avg_hamming_distance_max_target; @@ -43,7 +43,7 @@ struct AnchorStats { std::vector cell_types_ids; //unique ids of cell types for a given Xtrain contignency table AnchorStats(double pval_base, double pval_base_old, double pval_opt, - double effect_size_cts, double effect_size_bin, double effect_size_bin_old, double pval_asymp_base, double entropy, + double effect_size_cts, double effect_size_bin, double effect_size_bin_old, double pval_asymp_opt, double entropy, double avg_no_homopolymer_targets, double avg_hamming_distance_max_target, double avg_hamming_distance_all_pairs, double avg_edit_distance_max_target, double avg_edit_distance_all_pairs @@ -54,7 +54,7 @@ struct AnchorStats { effect_size_cts(effect_size_cts), effect_size_bin(effect_size_bin), effect_size_bin_old(effect_size_bin_old), - pval_asymp_base(pval_asymp_base), + pval_asymp_opt(pval_asymp_opt), entropy(entropy), avg_no_homopolymer_targets(avg_no_homopolymer_targets), avg_hamming_distance_max_target(avg_hamming_distance_max_target), @@ -70,7 +70,7 @@ struct AnchorStats { effect_size_cts(0), effect_size_bin(0), effect_size_bin_old(0), - pval_asymp_base(1), + pval_asymp_opt(1), entropy(0), avg_no_homopolymer_targets(0), avg_hamming_distance_max_target(0), @@ -322,12 +322,14 @@ void compute_stats( AnchorStats &anchor_stats, bool without_alt_max, bool with_effect_size_cts, + bool with_pval_asymp_opt, bool compute_also_old_base_pvals, uint32_t n_most_freq_targets, double opt_train_fraction, int opt_num_inits, int opt_num_iters, size_t num_rand_cf, + size_t num_splits, CjWriter& cj_writer, double max_pval_opt_for_Cjs, CBCToCellType* cbc_to_cell_type, diff --git a/src/satc_merge/satc_merge.cpp b/src/satc_merge/satc_merge.cpp index 226f6f3..79bffc1 100644 --- a/src/satc_merge/satc_merge.cpp +++ b/src/satc_merge/satc_merge.cpp @@ -33,37 +33,6 @@ class Timer { } }; -//merge all samples from single bin -enum class RunMode { CalcStats, JustMerge, JustMergeAndDump }; -std::string to_string(RunMode run_mode) { - switch (run_mode) - { - case RunMode::CalcStats: - return "calc_stats"; - case RunMode::JustMerge: - return "just_merge"; - case RunMode::JustMergeAndDump: - return "just_merge_and_dump"; - default: - std::cerr << "Error: unsupported RunMode\n"; - exit(1); - break; - } -} - -RunMode run_mode_from_string(const std::string& run_mode) { - if (run_mode == "calc_stats") - return RunMode::CalcStats; - else if (run_mode == "just_merge") - return RunMode::JustMerge; - else if (run_mode == "just_merge_and_dump") - return RunMode::JustMergeAndDump; - else { - std::cerr << "Error: unknown run mode " << run_mode << "\n"; - exit(1); - } -} - struct Params { uint64_t anchor_count_threshold{}; //remove all anchors for which the sum of counters across all samples and targets is <= anchor_count_threshold @@ -84,10 +53,14 @@ struct Params size_t num_rand_cf = 50; + size_t num_splits = 1; + bool without_alt_max = false; bool with_effect_size_cts = false; + bool with_pval_asymp_opt = false; + bool compute_also_old_base_pvals = false; std::string outpath; @@ -102,7 +75,9 @@ struct Params std::vector bins; - RunMode run_mode = RunMode::CalcStats; + std::string dump_sample_anchor_target_count_txt; + std::string dump_sample_anchor_target_count_binary; + RecFmt format = RecFmt::SATC; //only for JustMergeAndDump std::string cell_type_samplesheet; @@ -110,30 +85,31 @@ struct Params void print(std::ostream& oss) { oss << "Parameters:\n"; - oss << "\tanchor_count_threshold : " << anchor_count_threshold << "\n"; - oss << "\tanchor_unique_targets_threshold : " << anchor_unique_targets_threshold << "\n"; - oss << "\tanchor_samples_threshold : " << anchor_samples_threshold << "\n"; - oss << "\tis_10X : " << std::boolalpha << is_10X << "\n"; - oss << "\twithout_alt_max : " << std::boolalpha << without_alt_max << "\n"; - oss << "\twith_effect_size_cts : " << std::boolalpha << with_effect_size_cts << "\n"; - oss << "\tcompute_also_old_base_pvals : " << std::boolalpha << compute_also_old_base_pvals << "\n"; - oss << "\toutpath : " << outpath << "\n"; - oss << "\tanchor_list : " << anchor_list << "\n"; - oss << "\tcjs_out : " << cjs_out << "\n"; - oss << "\tmax_pval_opt_for_Cjs : " << max_pval_opt_for_Cjs << "\n"; - oss << "\tsample_names : " << sample_names << "\n"; - oss << "\tn_most_freq_targets : " << n_most_freq_targets << "\n"; - oss << "\tn_most_freq_targets_for_stats : " << n_most_freq_targets_for_stats << "\n"; - oss << "\topt_train_fraction : " << opt_train_fraction << "\n"; - oss << "\topt_num_inits : " << opt_num_inits << "\n"; - oss << "\topt_num_iters : " << opt_num_iters << "\n"; - oss << "\tnum_rand_cf : " << num_rand_cf << "\n"; - oss << "\tcell_type_samplesheet : " << cell_type_samplesheet << "\n"; - oss << "\tCjs_samplesheet : " << Cjs_samplesheet << "\n"; - - oss << "\trun_mode : " << to_string(run_mode) << "\n"; - if (run_mode == RunMode::JustMergeAndDump) - oss << "\t\t format : " << RecFmtConv::to_string(format) << "\n"; + oss << "\tanchor_count_threshold : " << anchor_count_threshold << "\n"; + oss << "\tanchor_unique_targets_threshold : " << anchor_unique_targets_threshold << "\n"; + oss << "\tanchor_samples_threshold : " << anchor_samples_threshold << "\n"; + oss << "\tis_10X : " << std::boolalpha << is_10X << "\n"; + oss << "\twithout_alt_max : " << std::boolalpha << without_alt_max << "\n"; + oss << "\twith_effect_size_cts : " << std::boolalpha << with_effect_size_cts << "\n"; + oss << "\twith_pval_asymp_opt : " << std::boolalpha << with_pval_asymp_opt << "\n"; + oss << "\tcompute_also_old_base_pvals : " << std::boolalpha << compute_also_old_base_pvals << "\n"; + oss << "\toutpath : " << outpath << "\n"; + oss << "\tanchor_list : " << anchor_list << "\n"; + oss << "\tcjs_out : " << cjs_out << "\n"; + oss << "\tmax_pval_opt_for_Cjs : " << max_pval_opt_for_Cjs << "\n"; + oss << "\tsample_names : " << sample_names << "\n"; + oss << "\tn_most_freq_targets : " << n_most_freq_targets << "\n"; + oss << "\tn_most_freq_targets_for_stats : " << n_most_freq_targets_for_stats << "\n"; + oss << "\topt_train_fraction : " << opt_train_fraction << "\n"; + oss << "\topt_num_inits : " << opt_num_inits << "\n"; + oss << "\topt_num_iters : " << opt_num_iters << "\n"; + oss << "\tnum_rand_cf : " << num_rand_cf << "\n"; + oss << "\tnum_splits : " << num_splits << "\n"; + oss << "\tcell_type_samplesheet : " << cell_type_samplesheet << "\n"; + oss << "\tCjs_samplesheet : " << Cjs_samplesheet << "\n"; + oss << "\t dump_sample_anchor_target_count_txt : " << dump_sample_anchor_target_count_txt << "\n"; + oss << "\t dump_sample_anchor_target_count_binary : " << dump_sample_anchor_target_count_binary << "\n"; + oss << "\t format : " << RecFmtConv::to_string(format) << "\n"; oss << "\tinput bins:\n"; for (const auto& bin : bins) oss << "\t\t" << bin << "\n"; @@ -141,7 +117,7 @@ struct Params static void Usage(char* prog_name) { std::cerr << "satc_merge\n"; - NOMAD_VER_PRINT(std::cerr); + SPLASH_VER_PRINT(std::cerr); std::cerr << "Usage: \n\t" << prog_name << " [options] \n"; std::cerr << "Positional parameters:\n" @@ -149,27 +125,30 @@ struct Params << " - file with list of paths of bins to be processed\n"; std::cerr << "Options:\n" - << " --is_10X - if used input treat as 10X data\n"// text file containing anchors separated by whitespaces, only anchors from this file will be dumped\n"; - << " --anchor_count_threshold - filter out all anchors for which the total count <= anchor_count_threshold\n" - << " --anchor_unique_targets_threshold - filter out all anchors for which the number of unique targets is <= anchor_unique_targets_threshold\n" - << " --n_most_freq_targets - in calc_stats mode output also n_most_freq_targets most frequent targets and their counts (default: 0)\n" - << " --n_most_freq_targets_for_stats - in calc_stats mode use at most n_most_freq_targets_for_stats for each contignency table, 0 means use all (default: 0)\n" - << " --opt_train_fraction (0.0;1.0) - in calc_stats mode use this fraction to create train X from contingency table\n" - << " --opt_num_inits - in calc_stats mode the number of altMaximize runs (default: 10)\n" - << " --opt_num_iters - in calc_stats mode the number of iteration in altMaximize (default: 50)\n" - << " --num_rand_cf - in calc_stats mode the number of rand cf (default: 50)\n" - << " --anchor_samples_threshold - filter out all anchors for which the number of unique samples is <= anchor_samples_threshold\n" - << " --anchor_list - path to text file containing anchors separated by whitespaces, only anchors from this file will be processed\n" - << " --cjs_out - path to output text file where Cjs will be stored\n" - << " --max_pval_opt_for_Cjs - dump only Cjs for anchors that have pval_opt <= max_pval_opt_for_Cjs\n" - << " --run_mode - what to do with merged anchors, available options: calc_stats, just_merge, just_merge_and_dump (default: calc_stats)\n" - << " --without_alt_max - disable alt max computation\n" - << " --with_effect_size_cts - compute effect_size_cts\n" - << " --compute_also_old_base_pvals - compute old base pvals\n" - << " --sample_names - path for decode sample id, each line should contain \n" - << " --cell_type_samplesheet - path for mapping barcode to cell type, is used Helmert-based supervised mode is turned on\n" - << " --Cjs_samplesheet - path for file with predefined Cjs for non-10X supervised mode\n" - << " --format - output format when 'just_merge_and_dump' is used as run_mode, available options: satc, nomad (default: satc)\n"; + << " --is_10X - if used input treat as 10X data\n"// text file containing anchors separated by whitespaces, only anchors from this file will be dumped\n"; + << " --anchor_count_threshold - filter out all anchors for which the total count <= anchor_count_threshold\n" + << " --anchor_unique_targets_threshold - filter out all anchors for which the number of unique targets is <= anchor_unique_targets_threshold\n" + << " --n_most_freq_targets - output also n_most_freq_targets most frequent targets and their counts (default: 0)\n" + << " --n_most_freq_targets_for_stats - use at most n_most_freq_targets_for_stats for each contingency table, 0 means use all (default: 0)\n" + << " --opt_train_fraction (0.0;1.0) - use this fraction to create train X from contingency table\n" + << " --opt_num_inits - the number of altMaximize runs (default: 10)\n" + << " --opt_num_iters - the number of iteration in altMaximize (default: 50)\n" + << " --num_rand_cf - the number of rand cf (default: 50)\n" + << " --num_splits - the number of contingency table splits (default: 1)\n" + << " --anchor_samples_threshold - filter out all anchors for which the number of unique samples is <= anchor_samples_threshold\n" + << " --anchor_list - path to text file containing anchors separated by whitespaces, only anchors from this file will be processed\n" + << " --cjs_out - path to output text file where Cjs will be stored\n" + << " --max_pval_opt_for_Cjs - dump only Cjs for anchors that have pval_opt <= max_pval_opt_for_Cjs\n" + << " --dump_sample_anchor_target_count_txt - dump merged anchors in textual representation\n" + << " --dump_sample_anchor_target_count_binary - dump merged anchors in textual satc format\n" + << " --without_alt_max - disable alt max computation\n" + << " --with_effect_size_cts - compute effect_size_cts\n" + << " --with_pval_asymp_opt - compute pval_asymp_opt\n" + << " --compute_also_old_base_pvals - compute old base pvals\n" + << " --sample_names - path for decode sample id, each line should contain \n" + << " --cell_type_samplesheet - path for mapping barcode to cell type, is used Helmert-based supervised mode is turned on\n" + << " --Cjs_samplesheet - path for file with predefined Cjs for non-10X supervised mode\n" + << " --format - output format when txt dump, available options: satc, splash (default: satc)\n"; } }; @@ -246,12 +225,19 @@ Params read_params(int argc, char** argv) std::string tmp = argv[++i]; res.num_rand_cf = std::stoull(tmp); } + if (param == "--num_splits") { + std::string tmp = argv[++i]; + res.num_splits = std::stoull(tmp); + } if (param == "--without_alt_max") { res.without_alt_max = true; } if (param == "--with_effect_size_cts") { res.with_effect_size_cts = true; } + if (param == "--with_pval_asymp_opt") { + res.with_pval_asymp_opt = true; + } if (param == "--compute_also_old_base_pvals") { res.compute_also_old_base_pvals = true; } @@ -277,9 +263,13 @@ Params read_params(int argc, char** argv) if (param == "--is_10X") { res.is_10X = true; } - if (param == "--run_mode") { - res.run_mode = run_mode_from_string(argv[++i]); + if (param == "--dump_sample_anchor_target_count_txt") { + res.dump_sample_anchor_target_count_txt = argv[++i]; } + if (param == "--dump_sample_anchor_target_count_binary") { + res.dump_sample_anchor_target_count_binary = argv[++i]; + } + if (param == "--format") res.format = RecFmtConv::from_string(argv[++i]); } @@ -426,7 +416,7 @@ class Bin { }; -void write_out_header(std::ofstream& out, bool without_alt_max, bool with_effect_size_cts, bool compute_also_old_base_pvals, bool is_removing_least_freq_targets_enabled, uint32_t n_most_freq_targets, CBCToCellType* cbc_to_cell_type, Non10XSupervised* non_10X_supervised) { +void write_out_header(std::ofstream& out, bool without_alt_max, bool with_effect_size_cts, bool with_pval_asymp_opt, bool compute_also_old_base_pvals, bool is_removing_least_freq_targets_enabled, uint32_t n_most_freq_targets, CBCToCellType* cbc_to_cell_type, Non10XSupervised* non_10X_supervised) { //mkokot_TODO: czy doimplementowac zakomentowane? out << "anchor" << "\t"; @@ -463,8 +453,10 @@ void write_out_header(std::ofstream& out, bool without_alt_max, bool with_effect } if (!without_alt_max) { out - << "effect_size_bin" << "\t" - << "pval_asymp_base" << "\t"; + << "effect_size_bin" << "\t"; + + if (with_pval_asymp_opt) + out << "pval_asymp_opt" << "\t"; } out //<< "effect_size_cts_SVD" << "\t" // TODO: ? @@ -512,6 +504,7 @@ void write_out_rec( uint64_t n_unique_samples, bool without_alt_max, bool with_effect_size_cts, + bool with_pval_asymp_opt, bool compute_also_old_base_pvals, bool is_removing_least_freq_targets_enabled, uint32_t n_most_freq_targets, @@ -595,9 +588,9 @@ void write_out_rec( out << anchor_stats.Cjs_num_rest << "\t"; } if (!without_alt_max) { - out - << anchor_stats.effect_size_bin << "\t" - << anchor_stats.pval_asymp_base << "\t"; + out << anchor_stats.effect_size_bin << "\t"; + if (with_pval_asymp_opt) + out << anchor_stats.pval_asymp_opt << "\t"; } out //<< "effect_size_cts_SVD" << "\t" // TODO: ? @@ -662,6 +655,7 @@ class StatsWriter : public IAnchorProcessor { std::ofstream out; bool without_alt_max; bool with_effect_size_cts; + bool with_pval_asymp_opt; bool compute_also_old_base_pvals; bool is_removing_least_freq_targets_enabled; uint32_t n_most_freq_targets; @@ -669,6 +663,7 @@ class StatsWriter : public IAnchorProcessor { int opt_num_inits; int opt_num_iters; size_t num_rand_cf; + size_t num_splits; CExtraStats extra_stats; AnchorStats anchor_stats; CjWriter cj_writer; @@ -684,6 +679,7 @@ class StatsWriter : public IAnchorProcessor { const std::string& outpath, bool without_alt_max, bool with_effect_size_cts, + bool with_pval_asymp_opt, bool compute_also_old_base_pvals, bool is_removing_least_freq_targets_enabled, uint32_t n_most_freq_targets, @@ -691,6 +687,7 @@ class StatsWriter : public IAnchorProcessor { int opt_num_inits, int opt_num_iters, size_t num_rand_cf, + size_t num_splits, const std::string& cjs_out, bool is_10X, size_t anchor_len_symbols, @@ -703,6 +700,7 @@ class StatsWriter : public IAnchorProcessor { // out(outpath), without_alt_max(without_alt_max), with_effect_size_cts(with_effect_size_cts), + with_pval_asymp_opt(with_pval_asymp_opt), compute_also_old_base_pvals(compute_also_old_base_pvals), is_removing_least_freq_targets_enabled(is_removing_least_freq_targets_enabled), n_most_freq_targets(n_most_freq_targets), @@ -710,6 +708,7 @@ class StatsWriter : public IAnchorProcessor { opt_num_inits(opt_num_inits), opt_num_iters(opt_num_iters), num_rand_cf(num_rand_cf), + num_splits(num_splits), cj_writer(cjs_out, is_10X, anchor_len_symbols, barcode_len_symbols, sample_names), max_pval_opt_for_Cjs(max_pval_opt_for_Cjs), cbc_to_cell_type(cbc_to_cell_type), @@ -726,6 +725,7 @@ class StatsWriter : public IAnchorProcessor { out, without_alt_max, with_effect_size_cts, + with_pval_asymp_opt, compute_also_old_base_pvals, is_removing_least_freq_targets_enabled, n_most_freq_targets, @@ -764,12 +764,14 @@ class StatsWriter : public IAnchorProcessor { anchor_stats, without_alt_max, with_effect_size_cts, + with_pval_asymp_opt, compute_also_old_base_pvals, n_most_freq_targets, opt_train_fraction, opt_num_inits, opt_num_iters, num_rand_cf, + num_splits, cj_writer, max_pval_opt_for_Cjs, cbc_to_cell_type, @@ -788,6 +790,7 @@ class StatsWriter : public IAnchorProcessor { unique_samples.size(), without_alt_max, with_effect_size_cts, + with_pval_asymp_opt, compute_also_old_base_pvals, is_removing_least_freq_targets_enabled, n_most_freq_targets, @@ -830,17 +833,7 @@ class SatcWriter : public IAnchorProcessor { out_header.serialize(out); } - void ProcessAnchor( - Anchor&& anchor, - bool is_10X, - size_t anchor_len_symbols, - size_t target_len_symbols, - size_t n_uniqe_targets, - size_t tot_cnt, - size_t n_uniqe_targets_before_filter, - size_t tot_cnt_before_filter, - const std::unordered_set& unique_samples - ) override { + void ProcessAnchor(const Anchor& anchor) { Record rec; rec.anchor = anchor.anchor; @@ -852,6 +845,20 @@ class SatcWriter : public IAnchorProcessor { rec.serialize(out, out_header); } + } + + void ProcessAnchor( + Anchor&& anchor, + bool is_10X, + size_t anchor_len_symbols, + size_t target_len_symbols, + size_t n_uniqe_targets, + size_t tot_cnt, + size_t n_uniqe_targets_before_filter, + size_t tot_cnt_before_filter, + const std::unordered_set& unique_samples + ) override { + ProcessAnchor(anchor); anchor.data.clear(); anchor.data.shrink_to_fit(); @@ -895,17 +902,9 @@ class SatcDumpWriter : public IAnchorProcessor { header.target_len_symbols = target_len_symbols; header.gap_len_symbols = gap_len_symbols; } - void ProcessAnchor( - Anchor&& anchor, - bool is_10X, - size_t anchor_len_symbols, - size_t target_len_symbols, - size_t n_uniqe_targets, - size_t tot_cnt, - size_t n_uniqe_targets_before_filter, - size_t tot_cnt_before_filter, - const std::unordered_set& unique_samples - ) override { + + void ProcessAnchor(const Anchor& anchor) + { Record rec; rec.anchor = anchor.anchor; @@ -917,12 +916,75 @@ class SatcDumpWriter : public IAnchorProcessor { rec.print(out, header, format, sample_name_decoder); } + } + + void ProcessAnchor( + Anchor&& anchor, + bool is_10X, + size_t anchor_len_symbols, + size_t target_len_symbols, + size_t n_uniqe_targets, + size_t tot_cnt, + size_t n_uniqe_targets_before_filter, + size_t tot_cnt_before_filter, + const std::unordered_set& unique_samples + ) override { + ProcessAnchor(anchor); anchor.data.clear(); anchor.data.shrink_to_fit(); } }; +class CombinedAnchorWriter : public IAnchorProcessor { + std::unique_ptr satc_dump_writer; + std::unique_ptr satc_writer; + std::unique_ptr stats_writer; + +public: + CombinedAnchorWriter( + std::unique_ptr&& satc_dump_writer, + std::unique_ptr&& satc_writer, + std::unique_ptr&& stats_writer) + : + satc_dump_writer(std::move(satc_dump_writer)), + satc_writer(std::move(satc_writer)), + stats_writer(std::move(stats_writer)) + { + + } + + void ProcessAnchor( + Anchor&& anchor, + bool is_10X, + size_t anchor_len_symbols, + size_t target_len_symbols, + size_t n_uniqe_targets, + size_t tot_cnt, + size_t n_uniqe_targets_before_filter, + size_t tot_cnt_before_filter, + const std::unordered_set& unique_samples + ) override { + if (satc_dump_writer) + satc_dump_writer->ProcessAnchor(anchor); + if (satc_writer) + satc_writer->ProcessAnchor(anchor); + + //must be last because it destroys anchor content + if (stats_writer) + stats_writer->ProcessAnchor(std::move(anchor), + is_10X, + anchor_len_symbols, + target_len_symbols, + n_uniqe_targets, + tot_cnt, + n_uniqe_targets_before_filter, + tot_cnt_before_filter, + unique_samples + ); + } +}; + bool get_top_anchors(std::vector>& bins, AcceptedAnchors& anchor_filter, std::vector& top_anchors) { for (size_t bin_id = 0; bin_id < bins.size(); ++bin_id) { uint64_t anchor; @@ -954,8 +1016,9 @@ bool get_top_anchors(std::vector>& bins, AcceptedAnchors& a } std::unique_ptr get_anchor_processor( - RunMode run_mode, const std::string& outpath, + const std::string& dump_sample_anchor_target_count_txt, + const std::string dump_sample_anchor_target_count_binary, RecFmt format, uint8_t sample_id_size_bytes, uint8_t barcode_size_bytes, @@ -969,6 +1032,7 @@ std::unique_ptr get_anchor_processor( const std::string& sample_names, bool without_alt_max, bool with_effect_size_cts, + bool with_pval_asymp_opt, bool compute_also_old_base_pvals, bool is_removing_least_freq_targets_enabled, uint32_t n_most_freq_targets, @@ -976,16 +1040,19 @@ std::unique_ptr get_anchor_processor( int opt_num_inits, int opt_num_iters, size_t num_rand_cf, + size_t num_splits, const std::string& cjs_out, bool is_10X, double max_pval_opt_for_Cjs, CBCToCellType* cbc_to_cell_type, Non10XSupervised* non_10X_supervised) { - switch (run_mode) - { - case RunMode::JustMerge: - return std::make_unique( - outpath, + + std::unique_ptr satc_dump_writer; + + if (dump_sample_anchor_target_count_txt != "") { + satc_dump_writer = std::make_unique( + dump_sample_anchor_target_count_txt, + format, sample_id_size_bytes, barcode_size_bytes, anchor_size_bytes, @@ -994,12 +1061,16 @@ std::unique_ptr get_anchor_processor( barcode_len_symbols, anchor_len_symbols, target_len_symbols, - gap_len_symbols + gap_len_symbols, + sample_names ); - case RunMode::JustMergeAndDump: - return std::make_unique( - outpath, - format, + } + + std::unique_ptr satc_writer; + + if (dump_sample_anchor_target_count_binary != "") { + satc_writer = std::make_unique( + dump_sample_anchor_target_count_binary, sample_id_size_bytes, barcode_size_bytes, anchor_size_bytes, @@ -1008,34 +1079,36 @@ std::unique_ptr get_anchor_processor( barcode_len_symbols, anchor_len_symbols, target_len_symbols, - gap_len_symbols, - sample_names + gap_len_symbols ); - case RunMode::CalcStats: - return std::make_unique( - outpath, - without_alt_max, - with_effect_size_cts, - compute_also_old_base_pvals, - is_removing_least_freq_targets_enabled, - n_most_freq_targets, - opt_train_fraction, - opt_num_inits, - opt_num_iters, - num_rand_cf, - cjs_out, - is_10X, - anchor_len_symbols, - barcode_len_symbols, - sample_names, - max_pval_opt_for_Cjs, - cbc_to_cell_type, - non_10X_supervised); - break; - default: - std::cerr << "Error: this is not supported, mode " << to_string(run_mode) << "\n"; - exit(1); } + + std::unique_ptr stats_writer = std::make_unique( + outpath, + without_alt_max, + with_effect_size_cts, + with_pval_asymp_opt, + compute_also_old_base_pvals, + is_removing_least_freq_targets_enabled, + n_most_freq_targets, + opt_train_fraction, + opt_num_inits, + opt_num_iters, + num_rand_cf, + num_splits, + cjs_out, + is_10X, + anchor_len_symbols, + barcode_len_symbols, + sample_names, + max_pval_opt_for_Cjs, + cbc_to_cell_type, + non_10X_supervised); + + return std::make_unique( + std::move(satc_dump_writer), + std::move(satc_writer), + std::move(stats_writer)); } void run_non_10X(const Params& params) { @@ -1080,8 +1153,10 @@ void run_non_10X(const Params& params) { max_sample_id_size_bytes = bin_i_header.sample_id_size_bytes; } - std::unique_ptr anchor_processor = get_anchor_processor(params.run_mode, + std::unique_ptr anchor_processor = get_anchor_processor( params.outpath, + params.dump_sample_anchor_target_count_txt, + params.dump_sample_anchor_target_count_binary, params.format, max_sample_id_size_bytes, bin0_header.barcode_size_bytes, @@ -1095,6 +1170,7 @@ void run_non_10X(const Params& params) { params.sample_names, params.without_alt_max, params.with_effect_size_cts, + params.with_pval_asymp_opt, params.compute_also_old_base_pvals, params.n_most_freq_targets_for_stats != 0, params.n_most_freq_targets, @@ -1102,6 +1178,7 @@ void run_non_10X(const Params& params) { params.opt_num_inits, params.opt_num_iters, params.num_rand_cf, + params.num_splits, params.cjs_out, false, params.max_pval_opt_for_Cjs, @@ -1248,8 +1325,9 @@ void run_10X(const Params& params) { timer.Start(); std::unique_ptr anchor_processor = get_anchor_processor( - params.run_mode, params.outpath, + params.dump_sample_anchor_target_count_txt, + params.dump_sample_anchor_target_count_binary, params.format, max_sample_id_size_bytes, header.barcode_size_bytes, @@ -1263,6 +1341,7 @@ void run_10X(const Params& params) { params.sample_names, params.without_alt_max, params.with_effect_size_cts, + params.with_pval_asymp_opt, params.compute_also_old_base_pvals, false, //mkokot_TODO: add support to remove least freq targets from contignency table for 10X params.n_most_freq_targets, @@ -1270,6 +1349,7 @@ void run_10X(const Params& params) { params.opt_num_inits, params.opt_num_iters, params.num_rand_cf, + params.num_splits, params.cjs_out, true, params.max_pval_opt_for_Cjs, diff --git a/src/sig_anch/sig_anch.cpp b/src/sig_anch/sig_anch.cpp index f83b8cd..856ddf6 100644 --- a/src/sig_anch/sig_anch.cpp +++ b/src/sig_anch/sig_anch.cpp @@ -129,7 +129,7 @@ void correct_pvals_stream(); bool usage() { cerr << "sig_anch\n"; - NOMAD_VER_PRINT(cerr); + SPLASH_VER_PRINT(cerr); cerr << "Usage: sig_anch [options]" << endl << "Options:" << endl << " --fdr_threshold " << endl diff --git a/src/nomad.py b/src/splash.py similarity index 77% rename from src/nomad.py rename to src/splash.py index 786038c..8e01be4 100755 --- a/src/nomad.py +++ b/src/splash.py @@ -18,11 +18,11 @@ def _split_lines(self, text, width): # this is the RawTextHelpFormatter._split_lines return argparse.HelpFormatter._split_lines(self, text, width) -NOMAD_VERSION="2.0.0" +SPLASH_VERSION="2.1.4" parser = argparse.ArgumentParser( - prog = "nomad", - description = "Welcome to NOMAD\nVersion: " + NOMAD_VERSION, + prog = "splash", + description = "Welcome to SPLASH\nVersion: " + SPLASH_VERSION, #epilog = 'Text at the bottom of help', #formatter_class=SmartFormatter formatter_class=argparse.ArgumentDefaultsHelpFormatter @@ -40,30 +40,39 @@ def _split_lines(self, text, width): group_filters_and_thresholds = parser.add_argument_group('Filters and thresholds') group_filters_and_thresholds.add_argument("--poly_ACGT_len", default=8, type=int, help="filter out all anchors containing poly(ACGT) of length at least (0 means no filtering)") +group_filters_and_thresholds.add_argument("--artifacts", default="", type=str, help="path to artifacts, each anchor containing artifact will be filtered out") +group_filters_and_thresholds.add_argument("--dont_filter_illumina_adapters", default=False, action='store_true', help="if used anchors containing Illumina adapters will not be filtered out") group_filters_and_thresholds.add_argument("--anchor_unique_targets_threshold", default=1, type=int, help="filter out all anchors for which the number of unique targets is <= anchor_unique_targets_threshold") group_filters_and_thresholds.add_argument("--anchor_count_threshold", default=50, type=int, help="filter out all anchors for which the total count <= anchor_count_threshold") group_filters_and_thresholds.add_argument("--anchor_samples_threshold", default=1, type=int, help="filter out all anchors for which the number of unique samples is <= anchor_samples_threshold") group_filters_and_thresholds.add_argument("--anchor_sample_counts_threshold", default=5, type=int, help="filter out anchor from sample if its count in this sample is <= anchor_sample_counts_threshold") group_filters_and_thresholds.add_argument("--n_most_freq_targets_for_stats", default=0, type=int, help="use at most n_most_freq_targets_for_stats for each contignency table, 0 means use all") group_filters_and_thresholds.add_argument("--fdr_threshold", default=0.05, type=float, help="keep anchors having corrected p-val below this value") +group_filters_and_thresholds.add_argument("--min_hamming_threshold", default=0, type=int, help="keep only anchors with a pair of targets that differ by >= min_hamming_threshold") group_additional_out = parser.add_argument_group('Additional output configuration') group_additional_out.add_argument("--dump_Cjs", default=False, action='store_true', help="output Cjs") group_additional_out.add_argument("--max_pval_opt_for_Cjs", default=0.10, type=float, help="dump only Cjs for anchors that have pval_opt <= max_pval_opt_for_Cjs") -group_additional_out.add_argument("--n_most_freq_targets", default=2, type=int, help="number of most frequent tragets printed per each anchor in stats mode") +group_additional_out.add_argument("--n_most_freq_targets", default=2, type=int, help="number of most frequent tragets printed per each anchor") group_additional_out.add_argument("--with_effect_size_cts", default=False, action='store_true', help="if set effect_size_cts will be computed") +group_additional_out.add_argument("--with_pval_asymp_opt", default=False, action='store_true', help="if set pval_asymp_opt will be computed") group_additional_out.add_argument("--sample_name_to_id", default="sample_name_to_id.mapping.txt", type=str, help="file name with mapping sample name <-> sammpe id") group_additional_out.add_argument("--dump_sample_anchor_target_count_txt", default=False, action='store_true', help="if set contignency tables will be generated in text format") group_additional_out.add_argument("--dump_sample_anchor_target_count_binary", default=False, action='store_true', help="if set contignency tables will be generated in binary (satc) format, to convert to text format later satc_dump program may be used, it may take optionally maping from id to sample_name (--sample_names param)") +group_additional_out.add_argument("--supervised_test_samplesheet", default="", help="if used script for finding/visualizing anchors with metadata-dependent variation will be run (forces --dump_sample_anchor_target_count_binary)") +group_additional_out.add_argument("--supervised_test_anchor_sample_fraction_cutoff", default=0.4, type=float, help="the cutoff for the minimum fraction of samples for each anchor") +group_additional_out.add_argument("--supervised_test_num_anchors", default=20000, type=int, help="maximum number of anchors to be tested example") group_tuning_stats = parser.add_argument_group('Tuning statistics computation') group_tuning_stats.add_argument("--opt_num_inits", default=10, type=int, help="the number of altMaximize runs") group_tuning_stats.add_argument("--opt_num_iters", default=50, type=int, help="the number of iteration in altMaximize") group_tuning_stats.add_argument("--num_rand_cf", default=50, type=int, help="the number of rand cf") +group_tuning_stats.add_argument("--num_splits", default=1, type=int, help="the number of contingency table splits") group_tuning_stats.add_argument("--opt_train_fraction", default=0.25, type=float, help="in calc_stats mode use this fraction to create train X from contingency table") +group_tuning_stats.add_argument("--without_alt_max", default=False, action='store_true', help="if set int alt max and related stats will not be computed") group_technical = parser.add_argument_group('Technical and performance-related') -group_technical.add_argument("--bin_path", default="./", type=str, help="path to a directory where satc, satc_dump, satc_merge, sig_anch, kmc, kmc_tools binaries are (if any not found there nomad will check if installed and use installed)") +group_technical.add_argument("--bin_path", default="bin", type=str, help="path to a directory where satc, satc_dump, satc_merge, sig_anch, kmc, kmc_tools binaries are (if any not found there splash will check if installed and use installed)") group_technical.add_argument("--tmp_dir", default="", type=str, help="path to a directory where temporary files will be stored") group_technical.add_argument("--n_threads_stage_1", default=4, type=int, help="number of threads for the first stage, too large value is not recomended because of intensive disk access here, but may be profitable if there is a lot of small size samples in the input") group_technical.add_argument("--n_threads_stage_1_internal", default=8, type=int, help="number of threads per each stage 1 thread") @@ -90,6 +99,8 @@ def _split_lines(self, text, width): target_len = args.target_len gap_len = args.gap_len poly_ACGT_len=args.poly_ACGT_len +artifacts = args.artifacts +dont_filter_illumina_adapters = args.dont_filter_illumina_adapters anchor_unique_targets_threshold = args.anchor_unique_targets_threshold anchor_count_threshold = args.anchor_count_threshold anchor_samples_threshold = args.anchor_samples_threshold @@ -99,15 +110,22 @@ def _split_lines(self, text, width): opt_num_inits=args.opt_num_inits opt_num_iters=args.opt_num_iters num_rand_cf=args.num_rand_cf +num_splits=args.num_splits opt_train_fraction=args.opt_train_fraction kmc_use_RAM_only_mode = args.kmc_use_RAM_only_mode kmc_max_mem_GB = args.kmc_max_mem_GB +without_alt_max = args.without_alt_max with_effect_size_cts = args.with_effect_size_cts +with_pval_asymp_opt = args.with_pval_asymp_opt sample_name_to_id = args.sample_name_to_id dump_sample_anchor_target_count_txt = args.dump_sample_anchor_target_count_txt dump_sample_anchor_target_count_binary = args.dump_sample_anchor_target_count_binary pvals_correction_col_name = args.pvals_correction_col_name fdr_threshold = args.fdr_threshold +min_hamming_threshold = args.min_hamming_threshold +supervised_test_samplesheet = args.supervised_test_samplesheet +supervised_test_anchor_sample_fraction_cutoff = args.supervised_test_anchor_sample_fraction_cutoff +supervised_test_num_anchors = args.supervised_test_num_anchors outname_prefix=args.outname_prefix dump_Cjs=args.dump_Cjs max_pval_opt_for_Cjs=args.max_pval_opt_for_Cjs @@ -116,15 +134,30 @@ def _split_lines(self, text, width): input_file=args.input_file tmp_dir=args.tmp_dir +if supervised_test_samplesheet != "": + if run_10X: + print("Error: --supervised_test_samplesheet cannot be used with --run_10X in the current release") + sys.exit(1) + if dump_sample_anchor_target_count_binary == False: + print("Warning: --supervised_test_samplesheet forces --dump_sample_anchor_target_count_binary") + dump_sample_anchor_target_count_binary = True + if shutil.which("Rscript") is None: + print("Error: Rscript must be installed to run splash with --supervised_test_samplesheet") + sys.exit(1) + + if n_most_freq_targets < 2: + print("Warning: --supervised_test_samplesheet requires at least 2 n_most_freq_targets, setting --n_most_freq_targets 2") + n_most_freq_targets = 2 + if tmp_dir == "": - tmp_dir="nomad-tmp-"+uuid.uuid4().hex + tmp_dir="splash-tmp-"+uuid.uuid4().hex def get_cur_time(): return strftime("%Y-%m-%d %H:%M:%S", localtime()) -print("Welcome to NOMAD") -print("Version: ", NOMAD_VERSION) +print("Welcome to SPLASH") +print("Version: ", SPLASH_VERSION) print("Current time:", get_cur_time(), flush=True) @@ -193,6 +226,8 @@ def get_prog_or_fail(prog): sig_anch=get_prog_or_fail("sig_anch") kmc=get_prog_or_fail("kmc") kmc_tools=get_prog_or_fail("kmc_tools") +if supervised_test_samplesheet != "": + supervised_test = get_prog_or_fail("supervised_test.R") was_error = False @@ -235,8 +270,8 @@ def run_cmd(cmd, out, err): for line in out.split('\n'): if line.startswith("Version:"): ver = line.split()[-1] - if ver != NOMAD_VERSION: - print(f"Error: {executable_path} version is {ver} while this script is {NOMAD_VERSION}. Please use the same version.") + if ver != SPLASH_VERSION: + print(f"Error: {executable_path} version is {ver} while this script is {SPLASH_VERSION}. Please use the same version.") sys.exit(1) else: version_is_OK=True @@ -248,7 +283,7 @@ def run_cmd(cmd, out, err): if not os.path.exists(logs_dir): os.makedirs(logs_dir) -with open(f"{logs_dir}/nomad-cmd.log", "w") as f: +with open(f"{logs_dir}/splash-cmd.log", "w") as f: f.writelines(" ".join(sys.argv)) inputs = [] @@ -276,6 +311,8 @@ def remove_kmc_output(path): print("Current time:", get_cur_time(), flush=True) def stage_1_task(id, input, out, err): + _artifacts_param = f"--artifacts {artifacts}" if artifacts != "" else "" + _dont_filter_illumina_adapters_param = "--dont_filter_illumina_adapters" if dont_filter_illumina_adapters else "" fname = input[0] sample_name = input[1] @@ -300,7 +337,17 @@ def stage_1_task(id, input, out, err): if clean_up: remove_kmc_output(f"{tmp_dir}/{sample_name}") - cmd = f"{satc} --anchor_len {anchor_len} --target_len {target_len} --n_bins {n_bins} --anchor_sample_counts_threshold {anchor_sample_counts_threshold} --poly_ACGT_len {poly_ACGT_len} {tmp_dir}/{sample_name} {tmp_dir}/{sample_name}.sorted {id}" + cmd = f"{satc} \ + --anchor_len {anchor_len} \ + --target_len {target_len} \ + --n_bins {n_bins} \ + --anchor_sample_counts_threshold {anchor_sample_counts_threshold} \ + --min_hamming_threshold {min_hamming_threshold} \ + --poly_ACGT_len {poly_ACGT_len} \ + {_artifacts_param} \ + {_dont_filter_illumina_adapters_param} \ + {tmp_dir}/{sample_name} \ + {tmp_dir}/{sample_name}.sorted {id}" run_cmd(cmd, out, err) if clean_up: @@ -367,10 +414,7 @@ def stage_1_worker(tid): # For each bin merge all samples # output: file {n_bins} files, where each have # its (sample, anchor, target, count) -# this stage may be (and is intended to be) used to compute pvals -# the current state of this may be enabled with --run_mode parameter -# in the case when --run_mode is set to just_merge_and_dump -# the format of the dump may be configured with --format parameter +# this stage may be is used to compute pvals and other stats ############################################################################### print("Stage 1 done") @@ -396,12 +440,26 @@ def stage_2_task(bin_id, out, err): for x in satc_merge_inputs: f.write(f"{x}\n") + _without_alt_max_param = "--without_alt_max" if without_alt_max else "" _with_effect_size_cts_param = "--with_effect_size_cts" if with_effect_size_cts else "" + _with_pval_asymp_opt_param = "--with_pval_asymp_opt" if with_pval_asymp_opt else "" _cjs_out_param = f"--cjs_out {Cjs_dir}/bin{bin_id}.cjs" if dump_Cjs else "" + _dump_sample_anchor_target_count_txt_param = "" + if dump_sample_anchor_target_count_txt: + dump_dir = f"{outname_prefix}_dumps" + _dump_sample_anchor_target_count_txt_param = f"--dump_sample_anchor_target_count_txt {dump_dir}/bin{bin_id}.satc.dump" + + _dump_sample_anchor_target_count_binary_param = "" + if dump_sample_anchor_target_count_binary: + satc_dir = f"{outname_prefix}_satc" + _dump_sample_anchor_target_count_binary_param = f"--dump_sample_anchor_target_count_binary {satc_dir}/bin{bin_id}.satc" + cmd=f"{satc_merge} \ + {_without_alt_max_param} \ {_with_effect_size_cts_param} \ + {_with_pval_asymp_opt_param} \ {_cjs_out_param} \ --max_pval_opt_for_Cjs {max_pval_opt_for_Cjs} \ --anchor_count_threshold {anchor_count_threshold} \ @@ -412,43 +470,16 @@ def stage_2_task(bin_id, out, err): --opt_num_inits {opt_num_inits} \ --opt_num_iters {opt_num_iters} \ --num_rand_cf {num_rand_cf} \ + --num_splits {num_splits} \ --opt_train_fraction {opt_train_fraction} \ - --run_mode calc_stats \ + {_dump_sample_anchor_target_count_txt_param} \ + {_dump_sample_anchor_target_count_binary_param} \ --sample_names {sample_name_to_id} \ + --format satc \ {anchor_list_param} \ {tmp_dir}/{outname_prefix}.bin{bin_id}.stats.tsv {in_file_name}" run_cmd(cmd, out, err) - if dump_sample_anchor_target_count_txt: - dump_dir = f"{outname_prefix}_dumps" - if not os.path.exists(dump_dir): - os.makedirs(dump_dir) - - cmd=f"{satc_merge} \ - --anchor_count_threshold {anchor_count_threshold} \ - --anchor_samples_threshold {anchor_samples_threshold} \ - --anchor_unique_targets_threshold {anchor_unique_targets_threshold} \ - --run_mode just_merge_and_dump \ - --sample_names {sample_name_to_id} \ - --format satc \ - {anchor_list_param} \ - {dump_dir}/bin{bin_id}.satc.dump {in_file_name}" - run_cmd(cmd, out, err) - - if dump_sample_anchor_target_count_binary: - satc_dir = f"{outname_prefix}_satc" - if not os.path.exists(satc_dir): - os.makedirs(satc_dir) - - cmd=f"{satc_merge} \ - --anchor_count_threshold {anchor_count_threshold} \ - --anchor_samples_threshold {anchor_samples_threshold} \ - --anchor_unique_targets_threshold {anchor_unique_targets_threshold} \ - --run_mode just_merge \ - {anchor_list_param} \ - {satc_dir}/bin{bin_id}.satc {in_file_name}" - run_cmd(cmd, out, err) - if clean_up: os.remove(in_file_name) for f in satc_merge_inputs: @@ -525,5 +556,26 @@ def stage_2_worker(tid): os.rmdir(tmp_dir) check_and_handle_error() print("Stage 3 done") -print("NOMAD finished!") + +if supervised_test_samplesheet != "": + print("Starting supervised_test") + print("Current time:", get_cur_time(), flush=True) + cmd = f"Rscript {supervised_test} \ + {n_bins} \ + {sample_name_to_id} \ + {satc_dump} \ + {outname_prefix}_satc \ + {outname_prefix}.after_correction.scores.tsv \ + . \ + {supervised_test_samplesheet} \ + supervised_test \ + {supervised_test_anchor_sample_fraction_cutoff} \ + {supervised_test_num_anchors}" + + stdoutfile = open(f"{logs_dir}/supervised_test.log", "w") + run_cmd(cmd, stdoutfile, stdoutfile) + stdoutfile.close() + print("supervised_test done") + +print("SPLASH finished!") print("Current time:", get_cur_time(), flush=True) diff --git a/src/supervised_test/supervised_test.R b/src/supervised_test/supervised_test.R new file mode 100644 index 0000000..d26a413 --- /dev/null +++ b/src/supervised_test/supervised_test.R @@ -0,0 +1,156 @@ +if (!require("data.table")) { + install.packages("data.table", dependencies = TRUE) + library(data.table) +} + +if (!require("glmnet")) { + install.packages("glmnet", dependencies = TRUE) + library(glmnet) +} + +if (!require("ggplot2")) { + install.packages("ggplot2", dependencies = TRUE) + library(ggplot2) +} + +if (!require("gridExtra")) { + install.packages("gridExtra", dependencies = TRUE) + library(gridExtra) +} + +# script only looks at the first column of the metadata file (for sample names) and the second column (for the assigned class/group/category/celltype to each sample) +# metadata file must have column names and they could be anything, after reading in metadata file, the two columns are named as "sample_name" and "group" +# An example metadata file: +#Run type +#SRR8788980 carcinoma +#SRR8788981 melanoma +#SRR8788982 lymphoma +#SRR8788983 carcinoma +#SRR8657060 carcinoma + + +###################################################### +############ INPUT PARAMETERS ######################## + +args <- commandArgs(trailingOnly = TRUE) +num_bins = as.numeric(args[1]) +sample_name_to_id_mapping = args[2] +satc_dump_path = args[3] +satc_dir = args[4] +anchors_file = args[5] +directory = args[6] # example: "/oak/stanford/groups/horence/Roozbeh/NOMAD_10x/runs/CCLE_all/" +metadata_file = args[7] # example: "/oak/stanford/groups/horence/Roozbeh/NOMAD_10x/utility_files/CCLE_metadata_modified.tsv" +run_name = args[8] # the output files and plots will be generated in ${directory}/${run_name}_supervised_metadata +anchor_sample_fraction_cutoff = as.numeric(args[9]) # the cutoff for the minimum fraction of samples for each anchor, suggested 0.4 +num_anchors_for_supervised_test = as.integer(args[10]) # maximum number of anchors to be tested example, suggested 20000 + + + +############## reading in metadata and sample_conversion files ######################### +metadata = fread(metadata_file) +sample_name_id_conversion = fread(sample_name_to_id_mapping,header = FALSE) +############################################################### + +################################################################ +##### select the anchors for the supervised GLM when no specific list of anchors is provided ################ +anchors = fread(anchors_file,select = c("anchor", "effect_size_bin", "number_nonzero_samples", "most_freq_target_1", "cnt_most_freq_target_1", "most_freq_target_2", "cnt_most_freq_target_2","avg_hamming_distance_max_target")) +max_sample_num = max(anchors$number_nonzero_samples,na.rm = TRUE) +anchors = anchors[number_nonzero_samples > (max_sample_num * anchor_sample_fraction_cutoff) & avg_hamming_distance_max_target > 5] ## keep only anchors that are in at least the given fraction of samples +setorder(anchors,-effect_size_bin) +selected_anchors = anchors[1:min(num_anchors_for_supervised_test,nrow(anchors)),] + +################################################################ +################################################################ +system(paste("mkdir ", directory,"/",run_name,"_supervised_metadata",sep="")) # this is the directory for the supervised analysis files +################################################################################## +####### satc_dump for getting counts for the selected anchors #################### +write.table(selected_anchors$anchor,paste(directory, "/",run_name,"_supervised_metadata/", "selected_anchors_GLMnet.txt",sep=""),sep="\t",row.names=FALSE,quote=FALSE,col.names = FALSE) +for (counter in 0:num_bins-1) { + system(paste(satc_dump_path, paste(" --anchor_list ", paste(directory, "/",run_name,"_supervised_metadata/", "selected_anchors_GLMnet.txt ", sep = ""), satc_dir, "/bin", counter, ".satc ", directory, "/",run_name,"_supervised_metadata/", "satcOut",counter,".tsv ", sep = ""))) +} +# now below I read in satcout files generated by satc_dump +satcOut_files = list.files(path = paste(directory,"/",run_name,"_supervised_metadata/",sep=""), pattern = "satcOut") +counts = data.table() +for (counter in 1:length(satcOut_files)) { + counts = rbind(counts, fread(paste(directory,"/",run_name,"_supervised_metadata/",satcOut_files[counter], sep = ""))) + print(counter) +} +setnames(counts,c("V1","V2","V3","V4"),c("sample_id","anchor","target","count")) +################################################################################### +################################################################################### + +################# merging counts with metadata ####################### +###################################################################### +metadata = metadata[,c(1,2)] +names(metadata) = c("sample_name", "group") +metadata = merge(metadata, sample_name_id_conversion, all.x = TRUE, all.y = FALSE, by.x="sample_name", by.y="V1") +metadata = metadata[!is.na(V2)] +counts = merge(counts, metadata, all.x = TRUE, all.y = FALSE, by.x = "sample_id", by.y = "V2") +###################################################################### +###################################################################### + +system(paste("mkdir ", directory,"/",run_name,"_supervised_metadata/plots",sep="")) +GLM_output_dt = data.table() # the data table that will keep glm information for the selected anchors +############################################################# +####### perform GLM for each selected anchor ################ +for (counter in 1:nrow(selected_anchors)){ + tryCatch({ + anchor_interest = selected_anchors$anchor[counter] + counts_anchor = counts[anchor==anchor_interest] # target counts for the selected anchor + counts_anchor[,target_count := sum(count),by=target] # compute total counts for each target + top_two_targets = counts_anchor[!duplicated(target)][order(-target_count)]$target[1:2] # find the top two targets + counts_anchor = counts_anchor[target %in% top_two_targets] + + #counts_anchor[,anchor_count_per_sample:=sum(count),by=paste(anchor,sample_name)] + counts_anchor[,anchor_count_per_sample:=sum(count),by=list(anchor,sample_name)] #mkokot_TODO: is this fix ok? + counts_anchor[,fraction:=count/anchor_count_per_sample,by=1:nrow(counts_anchor)] # compute target fraction per sample + counts_anchor_reshape = reshape(counts_anchor[,list(sample_name,target,fraction,count)], idvar="sample_name", timevar="target", direction="wide") + names(counts_anchor_reshape) = c("sample_name","target1_frac","target1_count","target2_frac","target2_count") + counts_anchor_reshape[is.na(target1_frac),target1_frac:=0] + counts_anchor_reshape[is.na(target2_frac),target2_frac:=0] + counts_anchor_reshape[is.na(target1_count),target1_count:=0] + counts_anchor_reshape[is.na(target2_count),target2_count:=0] + + sample_names = data.table(counts_anchor_reshape$sample_name) + sample_names = merge(sample_names,counts_anchor[!duplicated(sample_name),list(sample_name,group)],all.x=TRUE,all.y = FALSE,by.x="V1",by.y="sample_name") + sample_names[,class:=as.numeric(as.factor(group))] # class is the numeric conversion of the groups which is needed for the multinomnial GLM + + counts_anchor_reshape[,sample_name:=NULL] + + regression_formula = as.formula( "class ~ target1_frac + target2_frac") + + counts_anchor_reshape = cbind(counts_anchor_reshape,sample_names$group,sample_names$class) + colnames(counts_anchor_reshape)[5] = "group" + colnames(counts_anchor_reshape)[6] = "class" + counts_anchor_reshape = counts_anchor_reshape[!is.na(group)] + counts_anchor_reshape[,num_per_group:=.N,by=group] + counts_anchor_reshape[,weight:=1/num_per_group] + counts_anchor_reshape = counts_anchor_reshape[num_per_group>9] + + # below if there is at least two groups for the anchor we perform GLMnet multinomial regression + if (length(unique(counts_anchor_reshape$group))>1){ + x_glmnet = model.matrix(regression_formula, counts_anchor_reshape) + glmnet_model = cv.glmnet(x_glmnet, as.factor(counts_anchor_reshape$group), family = c("multinomial"), intercept = FALSE, alpha = 1, nlambda = 50, nfolds = 5, weights = counts_anchor_reshape$weight) + glmnet_coeffs = coef(glmnet_model) + + largest_GLM_coefficient = max(unlist(lapply(lapply(glmnet_coeffs,abs),max))) # this will give us the GLM coefficient with the highest magnitude + # if the largest coefficient >1 then we plot target counts fractions and add it to the output table + if (largest_GLM_coefficient > 1){ + metadata_group_names = paste(names(glmnet_coeffs), collapse = "||") # collapse all the group names for this into a string + GLM_coefficients = paste(lapply(glmnet_coeffs, paste, collapse = ":"),collapse = "||") # collapse all the GLM coefficients into a string + GLM_output_dt_anchor = cbind(selected_anchors[anchor==anchor_interest], metadata_group_names, GLM_coefficients, largest_GLM_coefficient) + GLM_output_dt = rbind(GLM_output_dt, GLM_output_dt_anchor) + + #generate and write boxplot/scatterplot for the anchor + pdf(file=paste(directory, "/",run_name,"_supervised_metadata/plots/", anchor_interest, ".pdf", sep = ""), width = 8, height = 6) + g1 = ggplot(counts_anchor_reshape, aes(y = target1_frac, x = as.factor(group), color = as.factor(group))) + geom_boxplot() + theme_bw() +ggtitle(anchor_interest) + g2 = ggplot(counts_anchor_reshape, aes(x = target1_count, y = target2_count, shape = as.factor(group), color = as.factor(group))) + geom_point() + theme_bw() + print(grid.arrange(g1, g2, nrow = 2)) + dev.off() + } + } + },error=function(e){cat("ERROR :",conditionMessage(e), "\n")}) +} +write.table(GLM_output_dt, paste(directory,"/",run_name,"_supervised_metadata/GLM_supervised_file.tsv", sep = ""), sep = "\t", row.names = FALSE, quote = FALSE) +###################################################################################### +######################################################################################