forked from indrops/indrops
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Snakefile
157 lines (146 loc) · 9.38 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# --- inDrops single-cell RNA-seq Snakemake pipeline (forked from indrops/indrops) ---
# Drives indrops.py through its filter -> identify_abundant_barcodes ->
# sort -> quantify -> aggregate stages.
shell.executable("/bin/bash")
import itertools
# Workflow-global conda environment directive.
conda: "/home/adrien.dufour/NeuroDev_ADD/Envs/single_cell.yaml"
configfile: "/home/adrien.dufour/NeuroDev_ADD/SingleCell/indrops-master/project.yaml"
#shell.prefix("conda activate indrops")
# Absolute path to the indrops project YAML, passed verbatim to indrops.py.
YAMLPATH = "/home/adrien.dufour/NeuroDev_ADD/SingleCell/indrops-master/project.yaml"
# NOTE(review): FASTQ is never populated anywhere visible in this file, so
# every comprehension over it (the fastqc targets) is empty -- confirm where
# the raw fastq paths were meant to come from.
FASTQ = []
LIBRARY = []
# One worker index per default core; used by the (commented-out) aggregate_input().
WORKERS = range(config['cores']['default'])
# NOTE(review): WORKER appears unused in this file -- confirm before removing.
WORKER = ['1', '2', '3']
READS = ['R1', 'R2']
SPLIT = []
RUN = []
# Collect run names, split affixes and library names from project.yaml.
# BUG FIX: the original loop *assigned* RUN and SPLIT on each iteration, so
# only the last sequencing run survived even though both are initialised as
# lists above.  They are now accumulated.  For a single-run project the
# downstream behaviour is unchanged: expand() accepts a one-element list the
# same as a scalar, and Snakemake's shell formatter renders lists as
# space-separated strings (as already relied on for {LIBRARY}).
for each in config['sequencing_runs']:
    RUN.append(each['name'])
    # NOTE(review): still keeps only the last run's directory -- confirm.
    dir_lib = each['dir']
    for affix in each['split_affixes']:
        if affix not in SPLIT:  # dedupe while preserving order
            SPLIT.append(affix)
    LIBRARY.append(each['library_name'])
def aggregate_input(wildcards):
    """Return the per-worker quantification count files for one library.

    Intended as the (currently commented-out) input function of rule
    aggregate_umis.

    BUG FIX: the original iterated over WORKERS, i.e.
    range(config['cores']['default']), while the quantify rule writes one
    ``worker{i}_{n}.counts.tsv`` per worker in
    range(config['cores']['quantify_barcodes']).  Whenever the two core
    counts differ this listed missing or nonexistent files; the worker
    range now matches the file-name suffix.
    """
    n_workers = config['cores']['quantify_barcodes']
    return [
        os.path.join(
            config['project_dir'], wildcards.library, "quant_dir",
            "worker{i}_{n}.counts.tsv".format(i=i, n=n_workers),
        )
        for i in range(n_workers)
    ]
# Target rule: enumerates every artefact of the pipeline so that a bare
# `snakemake` invocation builds the whole project.
rule all:
    input:
        # NOTE(review): FASTQ is initialised to [] and never filled, so this
        # fastqc target list is empty -- confirm where FASTQ should be set.
        [os.path.join(config['project_dir'], 'fastqc', x.replace('.fastq', '_fastqc.html')) for x in FASTQ if 'R1' in x],
        # Per-part outputs of rule filter_reads.
        expand(os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}.fastq"), split=SPLIT, library=LIBRARY, run=RUN),
        expand(os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}.fastq.counts.pickle"), split=SPLIT, library=LIBRARY, run=RUN),
        expand(os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}metrics.yaml"), split=SPLIT, library=LIBRARY, run=RUN),
        # Per-library outputs of rule abundant_barcodes (split/run kwargs are
        # inert here since those wildcards do not appear in the pattern).
        expand(os.path.join(config['project_dir'], "{library}", "abundant_barcodes.pickle"), split=SPLIT, library=LIBRARY, run=RUN),
        expand(os.path.join(config['project_dir'], "{library}", "{library}.barcode_abundance_by_barcode.png"), split=SPLIT, library=LIBRARY, run=RUN),
        expand(os.path.join(config['project_dir'], "{library}", "{library}.barcode_abundance.png"), library=LIBRARY),
        expand(os.path.join(config['project_dir'], "{library}", "{library}.filtering_stats.csv"), library=LIBRARY),
        # Per-part outputs of rule sort_reads.
        expand(os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}.fastq.sorted.fastq.gz"), split=SPLIT, run=RUN, library=LIBRARY),
        expand(os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}.fastq.sorted.fastq.gz.index.pickle"), split=SPLIT, run=RUN, library=LIBRARY),
        # Per-worker outputs of rule quantify_barcodes; the file name embeds
        # the total worker count as a suffix ("worker{i}_{N}.*").
        # NOTE(review): allow_missing=True is a no-op here -- both {i} and
        # {library} are supplied, leaving no wildcard to preserve.
        expand(os.path.join(config['project_dir'], "{library}", "quant_dir", "worker{i}_" + str(config['cores']['quantify_barcodes']) + ".counts.tsv"), i=range(config['cores']['quantify_barcodes']), library=LIBRARY, allow_missing=True),
        expand(os.path.join(config['project_dir'], "{library}", "quant_dir", "worker{i}_" + str(config['cores']['quantify_barcodes']) + ".metrics.tsv"), i=range(config['cores']['quantify_barcodes']), library=LIBRARY, allow_missing=True),
        expand(os.path.join(config['project_dir'], "{library}", "quant_dir", "worker{i}_" + str(config['cores']['quantify_barcodes']) + ".ambig.counts.tsv"), i=range(config['cores']['quantify_barcodes']), library=LIBRARY, allow_missing=True),
        expand(os.path.join(config['project_dir'], "{library}", "quant_dir", "worker{i}_" + str(config['cores']['quantify_barcodes']) + ".ambig.partners"), i=range(config['cores']['quantify_barcodes']), library=LIBRARY, allow_missing=True),
        # Final per-library outputs of rule aggregate_umis.
        expand(os.path.join(config['project_dir'], "{library}", "{library}.bam"), library=LIBRARY),
        expand(os.path.join(config['project_dir'], "{library}", "{library}.bam.bai"), library=LIBRARY),
        expand(os.path.join(config['project_dir'], "{library}", "{library}.counts.tsv.gz"), library=LIBRARY),
        expand(os.path.join(config['project_dir'], "{library}", "{library}.quant_metrics.tsv.gz"), library=LIBRARY)
rule fastqc_biological_reads:
input:
expand(os.path.join("home/adrien.dufour/PROTECT/debug_data/", "{split}_R1.fastq"), split=SPLIT)
params:
outdir=os.path.join(config['project_dir'], 'fastqc')
output:
[os.path.join(config['project_dir'], 'fastqc', x.replace('.fastq', '_fastqc.html')) for x in FASTQ if 'R1' in x]
shell:
"fastqc {input} -o {params.outdir}"
rule filter_reads:
input:
fastq=FASTQ,
yaml=YAMLPATH
output:
os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}.fastq"),
os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}.fastq.counts.pickle"),
os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}metrics.yaml"),
conda:
"/home/adrien.dufour/NeuroDev_ADD/Envs/single_cell.yaml"
params:
workers=config['cores']['default']
log:
#"logs/{wildcards.run}_{wildcards.library}_{params.worker}_filter.log"
shell:
"""
for i in {{0..{params.workers}}}; do
python indrops.py {input.yaml} filter --runs {RUN} --libraries {LIBRARY} --total-workers {params.workers} --worker-index $i
done;
"""
# Resulting workload (a list of run parts), will be split among N --total-workers,
# where worker with --worker-index i will do steps (i, N+i, 2N+i, ...)
# Aggregate the per-part barcode-count pickles into the set of abundant
# barcodes per library, via `indrops.py identify_abundant_barcodes`, and
# emit the barcode-abundance plots and filtering-stats table.
rule abundant_barcodes:
    input:
        # Every counts pickle produced by rule filter_reads, across all
        # run/split parts of all libraries.
        expand(os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}.fastq.counts.pickle"), split=SPLIT, library=LIBRARY, run=RUN),
        yaml=YAMLPATH
    output:
        os.path.join(config['project_dir'], "{library}", "abundant_barcodes.pickle"),
        os.path.join(config['project_dir'], "{library}", "{library}.barcode_abundance_by_barcode.png"),
        os.path.join(config['project_dir'], "{library}", "{library}.barcode_abundance.png"),
        os.path.join(config['project_dir'], "{library}", "{library}.filtering_stats.csv")
    conda:
        "/home/adrien.dufour/NeuroDev_ADD/Envs/single_cell.yaml"
    shell:
        """
        python indrops.py {input.yaml} identify_abundant_barcodes --libraries {LIBRARY}
        """
rule sort_reads:
input:
expand(os.path.join(config['project_dir'], "{library}", "{library}.filtering_stats.csv"), library=LIBRARY),
yaml=YAMLPATH
output:
os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}.fastq.sorted.fastq.gz"),
os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}.fastq.sorted.fastq.gz.index.pickle")
conda:
"/home/adrien.dufour/NeuroDev_ADD/Envs/single_cell.yaml"
params:
workers=config['cores']['default']
log:
#"logs/{library}_{split}_sort.log"
shell:
"""
for i in {{0..{params.workers}}}; do
python indrops.py {input.yaml} sort --libraries {LIBRARY} --total-workers {params.workers} --worker-index $i
done;
"""
rule quantify_barcodes:
input:
expand(os.path.join(config['project_dir'], "{library}", "filtered_parts", "{library}_{run}_{split}.fastq.sorted.fastq.gz.index.pickle"), split=SPLIT, run=RUN, library=LIBRARY),
yaml=YAMLPATH,
output:
expand(os.path.join(config['project_dir'], "{library}", "quant_dir", "worker{i}_" + str(config['cores']['quantify_barcodes']) + ".counts.tsv"), i=range(config['cores']['quantify_barcodes']), library=LIBRARY, allow_missing=True),
expand(os.path.join(config['project_dir'], "{library}", "quant_dir", "worker{i}_" + str(config['cores']['quantify_barcodes']) + ".metrics.tsv"), i=range(config['cores']['quantify_barcodes']), library=LIBRARY, allow_missing=True),
expand(os.path.join(config['project_dir'], "{library}", "quant_dir", "worker{i}_" + str(config['cores']['quantify_barcodes']) + ".ambig.counts.tsv"), i=range(config['cores']['quantify_barcodes']), library=LIBRARY, allow_missing=True),
expand(os.path.join(config['project_dir'], "{library}", "quant_dir", "worker{i}_" + str(config['cores']['quantify_barcodes']) + ".ambig.partners"), i=range(config['cores']['quantify_barcodes']), library=LIBRARY, allow_missing=True)
conda:
"/home/adrien.dufour/NeuroDev_ADD/Envs/single_cell.yaml"
params:
cores=config['cores']['quantify_barcodes'],
max_idx=config['cores']['quantify_barcodes'] - 1
shell:
"""
for i in {{0..{params.cores}}}; do
python indrops.py {input.yaml} quantify --libraries {LIBRARY} --total-workers {params.cores} --worker-index $i
done;
"""
# Merge all per-worker quantification results into the final per-library
# BAM (+index) and gzipped counts/metrics tables, via `indrops.py aggregate`.
rule aggregate_umis:
    input:
        # NOTE(review): the input function below is disabled in favour of a
        # hard-coded expand over the quantify worker counts -- confirm which
        # is intended before re-enabling.
        #lambda wildcards: aggregate_input(wildcards),
        expand(os.path.join(config['project_dir'], "{library}", "quant_dir", "worker{i}_" + str(config['cores']['quantify_barcodes']) + ".counts.tsv"), i=range(config['cores']['quantify_barcodes']), library=LIBRARY, allow_missing=True),
        yaml=YAMLPATH
    params:
        workers=config['cores']['default']
    output:
        expand(os.path.join(config['project_dir'], "{library}", "{library}.bam"), library=LIBRARY),
        expand(os.path.join(config['project_dir'], "{library}", "{library}.bam.bai"), library=LIBRARY),
        expand(os.path.join(config['project_dir'], "{library}", "{library}.counts.tsv.gz"), library=LIBRARY),
        expand(os.path.join(config['project_dir'], "{library}", "{library}.quant_metrics.tsv.gz"), library=LIBRARY)
    conda:
        "/home/adrien.dufour/NeuroDev_ADD/Envs/single_cell.yaml"
    shell:
        """
        python indrops.py {input.yaml} aggregate --total-workers {params.workers} --libraries {LIBRARY}
        """