Before after scaling-2L #15

Open · wants to merge 5 commits into base: main
281 changes: 224 additions & 57 deletions Snakefile
@@ -10,7 +10,6 @@ rule link_bam:
shell:
"ln -s {input.src} {output}"

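An editorial aside on the ln -s call above: a symlink's target string is resolved relative to the directory containing the link, so if {input.src} is a path relative to the working directory, the resulting link dangles. A minimal Python sketch of the usual absolute-path workaround (the helper name is hypothetical, not part of the PR):

# Hypothetical helper: make the symlink target absolute so the link
# resolves no matter which directory it lives in.
import os

def safe_symlink(src, dst):
    os.symlink(os.path.abspath(src), dst)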

# filter bams
rule index_bam:
input:
@@ -28,26 +27,15 @@ rule filter_bam:
bai="data/input_alignments/{sample}.bai"
params:
min_mapping_qual=config["min_mapping_quality"],
chrom=config["chrom"]
output:
"data/filtered_alignments/{sample}.filtered.bam"
threads:
4
shell:
"samtools view -b -@{threads} -F 0x0200 -F 0x0100 -F 0x004 -q {params.min_mapping_qual} {input.bam} {params.chrom} > {output}"

# k-mer counting with jellyfish
rule extract_bam_reads:
input:
bam="data/filtered_alignments/{sample}.filtered.bam"
params:
chrom=config["chrom"],
kmer_size=config["kmer_size"]
output:
jf="data/kmer_counts/{sample}.jf"
threads:
6
4
shell:
"samtools fasta {input.bam} | jellyfish count -t {threads} -m {params.kmer_size} -s 1000M -C -o {output.jf} /dev/fd/0"
"samtools view -b -@{threads} -F 0x0200 -F 0x0100 -F 0x004 -q {params.min_mapping_qual} {input.bam} {params.chrom} | samtools fasta - | jellyfish count -t {threads} -m {params.kmer_size} -s 1000M -C -o {output.jf} /dev/fd/0"

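For readers unfamiliar with the flag values in the merged command above: the three -F options exclude QC-fail, secondary, and unmapped records, and are equivalent to one combined mask. A small sketch of the arithmetic (flag values are from the SAM specification):

# SAM FLAG bits excluded by the rule's three -F options:
QC_FAIL   = 0x200  # read fails platform/vendor quality checks
SECONDARY = 0x100  # secondary alignment
UNMAPPED  = 0x004  # read is unmapped

# One combined mask behaves identically to the three separate -F options:
assert QC_FAIL | SECONDARY | UNMAPPED == 0x304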

# jellyfish dump is single threaded
# but specify number of threads to avoid
@@ -60,80 +48,259 @@ rule dump_kmers:
threads:
6
shell:
"jellyfish dump -c -o {output.counts} {input.jf}"
"jellyfish dump -c -L 2 {input.jf} > {output.counts}"

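The new invocation writes one "kmer count" pair per line and, via -L 2, drops k-mers seen only once. A minimal sketch of a downstream parser for that two-column format:

# Iterate over jellyfish dump -c output: each line is "<kmer> <count>",
# with count-1 k-mers already removed by -L 2.
def iter_kmer_counts(path):
    with open(path) as handle:
        for line in handle:
            kmer, count = line.split()
            yield kmer, int(count)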
# find document frequencies
# note that pre-allocating a buffer (-S) makes
# a massive difference in run time
rule sort_kmers:
rule extract_features:
input:
counts="data/kmer_counts/{sample}.counts"
kmer_counts="data/kmer_counts/{sample}.counts",

params:
n_features=config["n_features"],
use_binary=config["use_binary_features"]
output:
sorted_kmers="data/doc_freq/{sample}.sorted.counts"
feature_matrix="data/feature_extraction/{sample}.features"
threads:
6
2
shell:
"cut -f 1 -d \" \" {input.counts} | sort -S 16G --parallel {threads} > {output.sorted_kmers}"
"scripts/feature_extractor.py --kmer-freq-fl {input.kmer_counts} --n-features {params.n_features} {params.use_binary} --feature-matrix {output.feature_matrix}"

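scripts/feature_extractor.py itself is not part of this diff; judging from the --n-features flag, it plausibly hashes each k-mer into a fixed-length count vector. A hypothetical sketch of that core step (function and argument names are illustrative, not the script's actual API):

# Hypothetical k-mer feature hashing: each k-mer lands in one of
# n_features buckets and its count accumulates there.
import numpy as np

def hash_features(kmer_counts, n_features):
    vec = np.zeros(n_features)
    for kmer, count in kmer_counts:  # ("ACGT...", 7) pairs
        # NOTE: Python's built-in hash() is salted per process; a stable
        # hash (e.g. via hashlib) would be needed for reproducible indices.
        vec[hash(kmer) % n_features] += count
    return vec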
# merge operation doesn't actually use
# multiple threads but using all threads
# lets us prevent other processes from running
# so that we can use all memory as buffer
rule create_pass_list:
input:
sorted_kmers=expand("data/doc_freq/{sample}.sorted.counts",
sample=config["samples"])
# log1p feature scaling combinations w/ PCA

rule log1p_binary_feature:
input:
kmer_counts="data/kmer_counts/{sample}.counts",
params:
batch_size=config["merge_batch_size"],
min_df=config["min_doc_freq"]
n_features=config["n_features"]
output:
pass_list="data/doc_freq/kmer_passlist.bf"
feature_matrix="data/log1p_features_before_after/{sample}.features"
threads:
24
4
shell:
"sort --batch-size {params.batch_size} -S 110G -T data/doc_freq/ -m {input.sorted_kmers} | uniq -c | scripts/create_bloomfilter.py --min-df {params.min_df} --output-bf {output.pass_list}"
"scripts/feature_extractor.py --kmer-freq-fl {input.kmer_counts} --n-features {params.n_features} --feature-matrix {output.feature_matrix} --feature-scaling-before log1p --feature-scaling-after binary"

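The three values cycled through by these rules' --feature-scaling-before/--feature-scaling-after flags (log1p, binary, counts) plausibly map to simple element-wise transforms, with "before" applied to the raw k-mer vector and "after" to the derived features; that placement is an assumption, since the script is not shown. A sketch:

# Assumed meaning of the three scaling options, element-wise on a vector x:
import numpy as np

SCALINGS = {
    "binary": lambda x: (x > 0).astype(float),  # presence/absence
    "counts": lambda x: x,                      # raw counts, no scaling
    "log1p":  np.log1p,                         # log(1 + x), compresses large counts
}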
# feature extraction
rule create_random_projection:
rule log1p_count_feature:
input:
kmer_counts="data/kmer_counts/{sample}.counts",
params:
n_features=config["n_features"],
n_rand_dim=config["n_rand_dim"],
n_samples=len(config["samples"])
n_features=config["n_features"]
output:
rand_proj="data/feature_extraction/rand_proj.joblib"
feature_matrix="data/log1p_features_count/{sample}.features"
threads:
4
shell:
"scripts/create_rand_proj.py --n-features {params.n_features} --n-components {params.n_rand_dim} --n-samples {params.n_samples} --output-fl {output.rand_proj}"
"scripts/feature_extractor.py --kmer-freq-fl {input.kmer_counts} --n-features {params.n_features} --feature-matrix {output.feature_matrix} --feature-scaling-before log1p --feature-scaling-after counts"

rule extract_features:
rule log1p_log1p_feature:
input:
kmer_counts="data/kmer_counts/{sample}.counts",
params:
n_features=config["n_features"]
output:
feature_matrix="data/log1p_features_log1p/{sample}.features"
threads:
4
shell:
"scripts/feature_extractor.py --kmer-freq-fl {input.kmer_counts} --n-features {params.n_features} --feature-matrix {output.feature_matrix} --feature-scaling-before log1p --feature-scaling-after log1p"


# Binarizer feature scaling combinations w/ PCA

rule binarizer_binary_feature:
input:
kmer_counts="data/kmer_counts/{sample}.counts",
rand_proj="data/feature_extraction/rand_proj.joblib",
pass_list="data/doc_freq/kmer_passlist.bf"
params:
n_features=config["n_features"],
use_binary=config["use_binary_features"]
n_features=config["n_features"]
output:
feature_matrix="data/feature_extraction/{sample}.features"
feature_matrix="data/binarizer_features_binary/{sample}.features"
threads:
3
2
shell:
"scripts/feature_extractor.py --rand-proj-fl {input.rand_proj} --passlist-bf {input.pass_list} --kmer-freq-fl {input.kmer_counts} --n-features {params.n_features} {params.use_binary} --feature-matrix {output.feature_matrix}"
"scripts/feature_extractor.py --kmer-freq-fl {input.kmer_counts} --n-features {params.n_features} --feature-matrix {output.feature_matrix} --feature-scaling-before binary --feature-scaling-after binary"

rule pca:
rule binarizer_count_feature:
input:
feature_matrices=expand("data/feature_extraction/{sample}.features",
kmer_counts="data/kmer_counts/{sample}.counts",
params:
n_features=config["n_features"]
output:
feature_matrix="data/binarizer_features_count/{sample}.features"
threads:
2
shell:
"scripts/feature_extractor.py --kmer-freq-fl {input.kmer_counts} --n-features {params.n_features} --feature-matrix {output.feature_matrix} --feature-scaling-before binary --feature-scaling-after counts"

rule binarizer_log1p_feature:
input:
kmer_counts="data/kmer_counts/{sample}.counts",
params:
n_features=config["n_features"]
output:
feature_matrix="data/binarizer_features_log1p/{sample}.features"
threads:
2
shell:
"scripts/feature_extractor.py --kmer-freq-fl {input.kmer_counts} --n-features {params.n_features} --feature-matrix {output.feature_matrix} --feature-scaling-before binary --feature-scaling-after log1p"

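The scaling rules above and below all read the same handful of config keys. A hypothetical config illustrating every key this Snakefile references (all values are placeholders; the project's real config file is not in the diff):

# Placeholder values only; shown as the dict Snakemake would load from YAML.
config = {
    "samples": ["sampleA", "sampleB"],  # expanded per-sample by the rules
    "min_mapping_quality": 30,          # -q threshold for samtools view
    "chrom": "chr1",                    # region passed to samtools view
    "kmer_size": 21,                    # -m for jellyfish count
    "n_features": 2 ** 20,              # length of the hashed feature vector
    "use_binary_features": "",          # flag string interpolated verbatim
    "groups_fl": "data/groups.tsv",     # sample-to-group table for PCA plots
}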
# Counts feature scaling combinations w/ PCA

rule counts_count_feature:
input:
kmer_counts="data/kmer_counts/{sample}.counts",
params:
n_features=config["n_features"]
output:
feature_matrix="data/count_features_counts/{sample}.features"
threads:
2
shell:
"scripts/feature_extractor.py --kmer-freq-fl {input.kmer_counts} --n-features {params.n_features} --feature-matrix {output.feature_matrix} --feature-scaling-before counts --feature-scaling-after counts"

rule counts_log1p_feature:
input:
kmer_counts="data/kmer_counts/{sample}.counts",
params:
n_features=config["n_features"]
output:
feature_matrix="data/count_features_log1p/{sample}.features"
threads:
2
shell:
"scripts/feature_extractor.py --kmer-freq-fl {input.kmer_counts} --n-features {params.n_features} --feature-matrix {output.feature_matrix} --feature-scaling-before counts --feature-scaling-after log1p"


rule counts_binary_feature:
input:
kmer_counts="data/kmer_counts/{sample}.counts",
params:
n_features=config["n_features"]
output:
feature_matrix="data/count_features_binary/{sample}.features"
threads:
2
shell:
"scripts/feature_extractor.py --kmer-freq-fl {input.kmer_counts} --n-features {params.n_features} --feature-matrix {output.feature_matrix} --feature-scaling-before counts --feature-scaling-after binary"

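The nine combination rules differ only in their two scaling flags and their output directory; if the directories followed one uniform naming scheme (they currently do not, e.g. log1p_features_before_after vs. count_features_counts), Snakemake wildcards could collapse them into a single rule. A sketch of that consolidation, offered as an editorial suggestion rather than part of this PR:

# Hypothetical consolidated rule; assumes outputs are first renamed to a
# uniform "{before}_features_{after}" directory scheme.
rule scaled_features:
    input:
        kmer_counts="data/kmer_counts/{sample}.counts"
    params:
        n_features=config["n_features"]
    output:
        feature_matrix="data/{before}_features_{after}/{sample}.features"
    threads:
        2
    shell:
        "scripts/feature_extractor.py --kmer-freq-fl {input.kmer_counts} "
        "--n-features {params.n_features} --feature-matrix {output.feature_matrix} "
        "--feature-scaling-before {wildcards.before} --feature-scaling-after {wildcards.after}"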

rule pca_log1p_before:
input:
feature_matrices=expand("data/log1p_features_before_after/{sample}.features",
sample=config["samples"])
params:
groups_fl=config["groups_fl"]
output:
plot="data/pca/pca_1_2.png"
plot="data/pca/pca_before.png"
threads:
4
shell:
"scripts/pca.py --feature-matrices {input.feature_matrices} --groups-fl {params.groups_fl} --plot-fl {output.plot}"

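scripts/pca.py is likewise not in the diff; its flags suggest it stacks the per-sample feature matrices, fits a two-component PCA, and writes a scatter plot colored by the groups file. A hypothetical sketch of that core (names are illustrative; the on-disk .features format is not shown):

# Hypothetical core of pca.py: one row per sample, 2-D PCA scatter.
import numpy as np
import matplotlib
matplotlib.use("Agg")  # headless rendering for cluster jobs
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

def plot_pca(feature_vectors, group_labels, plot_fl):
    X = np.vstack(feature_vectors)               # samples x features
    proj = PCA(n_components=2).fit_transform(X)
    plt.scatter(proj[:, 0], proj[:, 1], c=group_labels)  # numeric labels
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.savefig(plot_fl)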

rule pca_log1p_count:
input:
feature_matrices=expand("data/log1p_features_count/{sample}.features",
sample=config["samples"])
params:
groups_fl=config["groups_fl"]
output:
plot="data/pca/pca_log1p_count.png"
threads:
4
shell:
"scripts/pca.py --feature-matrices {input.feature_matrices} --groups-fl {params.groups_fl} --plot-fl {output.plot}"

rule pca_log1p_log1p:
input:
feature_matrices=expand("data/log1p_features_log1p/{sample}.features",
sample=config["samples"])
params:
groups_fl=config["groups_fl"]
output:
plot="data/pca/pca_log1p_log1p.png"
threads:
4
shell:
"scripts/pca.py --feature-matrices {input.feature_matrices} --groups-fl {params.groups_fl} --plot-fl {output.plot}"

rule pca_binarizer_binary:
input:
feature_matrices=expand("data/binarizer_features_binary/{sample}.features",
sample=config["samples"])
params:
groups_fl=config["groups_fl"]
output:
plot="data/pca/pca_binary_binarizer.png"
threads:
4
shell:
"scripts/pca.py --feature-matrices {input.feature_matrices} --groups-fl {params.groups_fl} --plot-fl {output.plot}"

rule pca_binarizer_log1p:
input:
feature_matrices=expand("data/binarizer_features_log1p/{sample}.features",
sample=config["samples"])
params:
groups_fl=config["groups_fl"]
output:
plot="data/pca/pca_binarizer_log1p.png"
threads:
4
shell:
"scripts/pca.py --feature-matrices {input.feature_matrices} --groups-fl {params.groups_fl} --plot-fl {output.plot}"

rule pca_binarizer_count:
input:
feature_matrices=expand("data/binarizer_features_count/{sample}.features",
sample=config["samples"])
params:
groups_fl=config["groups_fl"]
output:
plot="data/pca/pca_binarizer_count.png"
threads:
4
shell:
"scripts/pca.py --feature-matrices {input.feature_matrices} --groups-fl {params.groups_fl} --plot-fl {output.plot}"

rule pca_count_count:
input:
feature_matrices=expand("data/count_features_counts/{sample}.features",
sample=config["samples"])
params:
groups_fl=config["groups_fl"]
output:
plot="data/pca/pca_count_count.png"
threads:
4
shell:
"scripts/pca.py --feature-matrices {input.feature_matrices} --groups-fl {params.groups_fl} --plot-fl {output.plot}"

rule pca_count_log1p:
input:
feature_matrices=expand("data/count_features_log1p/{sample}.features",
sample=config["samples"])
params:
groups_fl=config["groups_fl"]
output:
plot="data/pca/pca_count_log1p.png"
threads:
4
shell:
"scripts/pca.py --feature-matrices {input.feature_matrices} --groups-fl {params.groups_fl} --plot-fl {output.plot}"

rule pca_count_binary:
input:
feature_matrices=expand("data/count_features_binary/{sample}.features",
sample=config["samples"])
params:
groups_fl=config["groups_fl"]
output:
plot="data/pca/pca_count_binarizer.png"
threads:
4
shell:
"scripts/pca.py --feature-matrices {input.feature_matrices} --groups-fl {params.groups_fl} --plot-fl {output.plot}"

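With per-combination outputs declared, any single plot can be requested as a target and Snakemake resolves the whole dependency chain back to the BAMs; standard usage, not a command from the PR:

# Build one PCA plot plus everything it depends on:
snakemake --cores 24 data/pca/pca_log1p_log1p.png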



# top-level rules
rule setup_inputs:
input: