-
Notifications
You must be signed in to change notification settings - Fork 0
/
2_STAR.sh
executable file
·80 lines (68 loc) · 2.8 KB
/
2_STAR.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env bash
# Run this in the mutation conda environment!
WORKDIR="/mnt/data/mutation"
N_THREADS=20
# Build STAR genome index files
## Human
if [ ! -f "${WORKDIR}/annotation/gencode.v35.annotation.gtf" ]; then
echo "Unzipping human gencode annotation GTF file"
gunzip -k "${WORKDIR}/annotation/gencode.v35.annotation.gtf.gz"
fi
if [ ! -f "${WORKDIR}/annotation/GRCh38.primary_assembly.genome.fa" ]; then
echo "Unzipping human genome fasta file"
gunzip -k "${WORKDIR}/annotation/GRCh38.primary_assembly.genome.fa.gz"
fi
if [ ! -d "${WORKDIR}/annotation/STAR_index/human" ]; then
echo "Generating STAR index for human"
STAR \
--runThreadN "${N_THREADS}" \
--runMode genomeGenerate \
--genomeDir "${WORKDIR}/annotation/STAR_index/human" \
--genomeFastaFiles "${WORKDIR}/annotation/GRCh38.primary_assembly.genome.fa" \
--sjdbGTFfile "${WORKDIR}/annotation/gencode.v35.annotation.gtf"
fi
## Mouse
if [ ! -f "${WORKDIR}/annotation/gencode.vM26.annotation.gtf" ]; then
echo "Unzipping mouse gencode annotation GTF file"
gunzip -k "${WORKDIR}/annotation/gencode.vM26.annotation.gtf.gz"
fi
if [ ! -f "${WORKDIR}/annotation/GRCm39.primary_assembly.genome.fa" ]; then
echo "Unzipping mouse genome fasta file"
gunzip -k "${WORKDIR}/annotation/GRCm39.primary_assembly.genome.fa.gz"
fi
if [ ! -d "${WORKDIR}/annotation/STAR_index/mouse" ]; then
echo "Generating STAR index for mouse"
STAR \
--runThreadN "${N_THREADS}" \
--runMode genomeGenerate \
--genomeDir "${WORKDIR}/annotation/STAR_index/mouse" \
--genomeFastaFiles "${WORKDIR}/annotation/GRCm39.primary_assembly.genome.fa" \
--sjdbGTFfile "${WORKDIR}/annotation/gencode.vM26.annotation.gtf"
fi
# Gather samples to work on; we have single end reads
sample_IDs=$(awk -F, '{if ($4 == "Good") print $3}' "${WORKDIR}/design_matrix.csv")
# Map fastq reads to the genome
for sample_ID in $sample_IDs; do
echo "Working on ${sample_ID}"
sample_path="${WORKDIR}/fastq/${sample_ID}.fastq.gz"
# Map reads to the graft (human) reference genome
STAR \
--runThreadN "${N_THREADS}" \
--genomeDir "${WORKDIR}/annotation/STAR_index/human" \
--readFilesIn "${sample_path}" \
--twopassMode Basic \
--readFilesCommand zcat \
--outSAMattributes NM \
--outFileNamePrefix ${WORKDIR}/bam/human/${sample_ID} \
--outSAMtype BAM SortedByCoordinate
# Map reads to the host (mouse) reference genome
STAR \
--runThreadN "${N_THREADS}" \
--genomeDir "${WORKDIR}/annotation/STAR_index/mouse" \
--readFilesIn "${sample_path}" \
--twopassMode Basic \
--readFilesCommand zcat \
--outSAMattributes NM \
--outFileNamePrefix ${WORKDIR}/bam/mouse/${sample_ID} \
--outSAMtype BAM SortedByCoordinate
done