diff --git a/Genome-Reference-File-Retreival/All.chrs.GRCh38.genotypes.20170504.bcf.sh b/Genome-Reference-File-Retreival/All.chrs.GRCh38.genotypes.20170504.bcf.sh deleted file mode 100644 index 7d396d1..0000000 --- a/Genome-Reference-File-Retreival/All.chrs.GRCh38.genotypes.20170504.bcf.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!bin/bash - -#################################################### -# Set env variables and work directory #################################################### -export REFDIR="/media/drew/easystore/ReferenceGenomes" -export IDXDIR=$REFDIR/GCA_000001405.15_GRCh38_no_alt_analysis_set/ -export REFFA=$IDXDIR/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna -cd $IDXDIR - -#wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/supporting/GRCh38_positions/ALL.chr{{1..22},X,Y}_GRCh38.genotypes.20170504.vcf.gz{,.tbi} - -#for chr in {1..22} X Y; do -for chr in X Y; do - (bcftools view --no-version -h ALL.chr${chr}_GRCh38.genotypes.20170504.vcf.gz | \ - grep -v "^##contig= human_g1k_v37.fasta -samtools faidx human_g1k_v37.fasta - -# Genetic map -wget https://data.broadinstitute.org/alkesgroup/Eagle/downloads/tables/genetic_map_hg19_withX.txt.gz - -# 1000 Genomes project phase 3 -for chr in {1..22} X Y; do - bcftools view --no-version -Ou -c 2 ALL.chr${chr}.phase3*integrated_v[125][ab].20130502.genotypes.vcf.gz | \ - bcftools norm --no-version -Ou -m -any | \ - bcftools norm --no-version -Ob -o ALL.chr${chr}.phase3_integrated.20130502.genotypes.bcf -d none -f human_g1k_v37.fasta && \ - bcftools index -f ALL.chr${chr}.phase3_integrated.20130502.genotypes.bcf -done - -# List of common germline duplications and deletions -wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/integrated_sv_map/ALL.wgs.mergedSV.v8.20130502.svs.genotypes.vcf.gz{,.tbi} -bcftools query -i 'AC>1 && END-POS+1>10000 && SVTYPE!="INDEL" && (SVTYPE=="CNV" || SVTYPE=="DEL" || SVTYPE=="DUP")' \ - -f "%CHROM\t%POS0\t%END\t%SVTYPE\n" ALL.wgs.mergedSV.v8.20130502.svs.genotypes.vcf.gz > cnp.grch37.bed - -# Minimal divergence intervals from segmental duplications -wget -O- http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/genomicSuperDups.txt.gz | gzip -d | - awk '!($2=="chrX" && $8=="chrY" || $2=="chrY" && $8=="chrX") {print $2"\t"$3"\t"$4"\t"$30}' > genomicSuperDups.bed - -awk '{print $1,$2; print $1,$3}' genomicSuperDups.bed | \ - sort -k1,1 -k2,2n | uniq | \ - awk 'chrom==$1 {print chrom"\t"pos"\t"$2} {chrom=$1; pos=$2}' | \ - bedtools intersect -a genomicSuperDups.bed -b - | \ - bedtools sort | \ - bedtools groupby -c 4 -o min | \ - awk 'BEGIN {i=0; s[0]="+"; s[1]="-"} {if ($4!=x) i=(i+1)%2; x=$4; print $0"\t0\t"s[i]}' | \ - bedtools merge -s -c 4 -o distinct | \ - sed 's/^chr//' | grep -v gl | bgzip > dup.grch37.bed.gz && \ - tabix -f -p bed dup.grch37.bed.gz - -# Download cytoband file -wget -O cytoBand.hg19.txt.gz http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz - - -# Setup variables -ref="$REFDIR/human_g1k_v37.fasta" -mhc_reg="6:27486711-33448264" -kir_reg="19:54574747-55504099" -map="$REFDIR/genetic_map_hg19_withX.txt.gz" -kgp_pfx="$REFDIR/ALL.chr" -kgp_sfx=".phase3_integrated.20130502.genotypes" -rule="GRCh37" -cnp="$REFDIR/cnp.grch37.bed" -dup="$REFDIR/dup.grch37.bed.gz" -cyto="$REFDIR/cytoBand.hg19.txt.gz" diff --git a/MiSeqScripts/1.MiSeq-index-and-bwa-mem.sh b/MiSeqScripts/1.MiSeq-index-and-bwa-mem.sh index d7b2144..5e418e4 100644 --- a/MiSeqScripts/1.MiSeq-index-and-bwa-mem.sh +++ b/MiSeqScripts/1.MiSeq-index-and-bwa-mem.sh @@ -24,12 +24,12 @@ for pfx in 2019_09 2019_12; do datadir=${datadir[$pfx]} cd $miseqdir export WORK_DIR=./$datadir - mkdir -p ./MiSeq_Results_out + mkdir -p ./MiSeq_Results_out/1.RAW_BAMS touch ./sm.file.txt touch ./sm.txt export INPUT_FILE=./sm.file.txt export INPUTFILE=./sm.txt - export RESULTS=MiSeq_Results_out + export RESULTS=/MiSeq_Results_out/1.RAW_BAMS find -iname "*_R1_*.fastq.gz" > $INPUT_FILE for i in $(cat $INPUT_FILE); do basename -s "R1_001.fastq.gz" $i >> $INPUTFILE diff --git a/MiSeqScripts/2.MiSeq-remove_dups.sh b/MiSeqScripts/2.MiSeq-remove_dups.sh index b797f8e..fc50d4a 100644 --- a/MiSeqScripts/2.MiSeq-remove_dups.sh +++ b/MiSeqScripts/2.MiSeq-remove_dups.sh @@ -26,13 +26,16 @@ for pfx in 2019_09 2019_12; do export WORKDIR=$WORK_DIR/$miseqdir export INPUT_FILE=$WORKDIR/sm.txt cd $WORKDIR + mkdir -p $datadir/2.TMP_DUP_BAMS + export INPUT=$datadir/1.RAW_BAMS + export OUTPUT=$datadir/2.TMP_DUP_BAMS sm_arr=( $(cat $INPUT_FILE) ) n=${#sm_arr[@]} for i in $(seq 1 $n); do sm_arr=( $(cat $INPUT_FILE) ); \ sm=${sm_arr[(($i-1))]}; \ - java -jar $HOME/toolbin/picard.jar MarkDuplicates I=$datadir/$sm.raw.bam \ - O=$datadir/$sm.tmp.bam M=$datadir/$sm.dups_metrics.txt && \ - samtools index $datadir/$sm.tmp.bam + java -jar $HOME/toolbin/picard.jar MarkDuplicates I=$INPUT/$sm.raw.bam \ + O=$OUTPUT/$sm.tmp.bam M=$OUTPUT/$sm.dups_metrics.txt && \ + samtools index $OUTPUT/$sm.tmp.bam done done diff --git a/MiSeqScripts/3.MiSeq-recalibrate_basepairs.sh b/MiSeqScripts/3.MiSeq-recalibrate_basepairs.sh index 1c41a03..3a71bbb 100644 --- a/MiSeqScripts/3.MiSeq-recalibrate_basepairs.sh +++ b/MiSeqScripts/3.MiSeq-recalibrate_basepairs.sh @@ -12,29 +12,27 @@ export REFFA=$IDXDIR/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna export VCF1000G=$REFDIR/GRCh38/1000G_phase1.snps.high_confidence.hg38.vcf.gz export TBI1000G=$REFDIR/GRCh38/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi -for file in $IDXDIR; do - ln -s $file -done - declare -A miseqdir=( ["2019_09"]="2019_09" ["2019_12"]="2019_12" ) declare -A datadir=( ["2019_09"]="MiSeq_Results_out" ["2019_12"]="MiSeq_Results_out" ) for pfx in 2019_09 2019_12; do miseqdir=${miseqdir[$pfx]} datadir=${datadir[$pfx]} - export WORK_DIR=$WORKDIR/$miseqdir - export INPUT_FILE=$WORK_DIR/sm.txt - cd $WORK_DIR + cd $miseqdir + mkdir -p $datadir/3.GRP_BAMs + export OUTDIR=$datadir/3.GRP_BAMs + export INPUT_FILE=sm.txt + export INPUTDIR=$datadir/2.TMP_DUP_BAMs sm_arr=( $(cat $INPUT_FILE) ) n=${#sm_arr[@]} for i in $(seq 1 $n); do sm_arr=( $(cat $INPUT_FILE) ); sm=${sm_arr[(($i-1))]}; - $GATK BaseRecalibrator -R $REFFA -I $datadir/$sm.tmp.bam --known-sites $VCF1000G \ - -O $datadir/$sm.grp && \ - $GATK ApplyBQSR -R $REFFA -I $datadir/$sm.tmp.bam \ - --bqsr-recal-file $datadir/$sm.grp -O $datadir/$sm.bam && \ - samtools index $datadir/$sm.bam + $GATK BaseRecalibrator -R $REFFA -I $INPUTDIR/$sm.tmp.bam --known-sites $VCF1000G \ + -O $OUTDIR/$sm.grp && \ + $GATK ApplyBQSR -R $REFFA -I $INPUTDIR/$sm.tmp.bam \ + --bqsr-recal-file $OUTDIR/$sm.grp -O $OUTDIR/$sm.bam && \ + samtools index $OUTDIR/$sm.bam done done diff --git a/MiSeqScripts/4.MiSeq-coverage.sh b/MiSeqScripts/4.MiSeq-coverage.sh index 24118ba..96869da 100644 --- a/MiSeqScripts/4.MiSeq-coverage.sh +++ b/MiSeqScripts/4.MiSeq-coverage.sh @@ -1,35 +1,47 @@ #!bin/bash ########################################################################### -# MISEQ DATA -# PILOT DATA +# Export Env Variables ########################################################################### +export GATK=$HOME/toolbin/gatk-4.1.8.1/gatk export REFDIR=/media/drew/easystore/ReferenceGenomes/ -export BEDDIR=/media/drew/easystore/ReferenceGenomes/BEDs +export BEDDIR=$REFDIR/BEDs export BEDFILE=$BEDDIR/3215481_Covered.bed -export IDXDIR=$REFDIR/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set -export REFFILE=$IDXDIR/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna -export REFFAI=$IDXDIR/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai - -for file in $IDXDIR; do - ln -s $file -done - -#cd ~/Downloads/GoodCell-Resources/GuniosAnalysis/2019_09/LVB_fastq_Sept2019_concat_fastq -#find -iname "*_R1_0012.fastq.gz" | cut -d/ -f2 | cut -d_ -f1 >> ../sm.txt +export IDXDIR=$REFDIR/GCA_000001405.15_GRCh38_no_alt_analysis_set +export REFFA=$IDXDIR/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna +export REFFAI=$IDXDIR/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai +export WORKDIR=/media/drew/easystore/GoodCell-Resources/AnalysisBaseDir/MiSeq_Data ########################################################################### ## COMPUTE COVERAGE OVER TARGETS ## ########################################################################### -sm_arr=( $(cat sm.txt) ) -n=${#sm_arr[@]} -for sm in ${sm_arr[@]}; do - bedtools coverage -g $REFFAI -sorted -a $BEDFILE -b $sm.bam -mean | \ - cut -f5 > $sm.cov +declare -A miseqdir=( ["2019_09"]="2019_09" ["2019_12"]="2019_12" ) +declare -A datadir=( ["2019_09"]="MiSeq_Results_out/3.GRP_BAMs" ["2019_12"]="MiSeq_Results_out/3.GRP_BAMs" ) +declare -A coverage=( ["2019_09"]="MiSeq_Results_out/Coverage_out" ["2019_12"]="MiSeq_Results_out/Coverage_out" ) +declare -A v2s=( ["2019_09"]="MiSeq_Results_out/4.V2_BAMs" ["2019_12"]="MiSeq_Results_out/4.V2_BAMs" ) + +for pfx in 2019_09 2019_12; do + miseqdir=${miseqdir[$pfx]} + datadir=${datadir[$pfx]} + coverage=${coverage[$pfx]} + cd $WORKDIR/$miseqdir + export INPUT_FILE=sm.txt + export INPUTDIR=$datadir + export OUTPUT=$coverage + sm_arr=( $(cat $INPUT_FILE) ) + n=${#sm_arr[@]} + echo $n + for i in $(seq 1 $n); do + sm_arr=( $(cat $INPUT_FILE) ); + sm=${sm_arr[(($i-1))]}; + echo $sm + pwd + bedtools coverage -g $REFFAI -sorted -a $BEDFILE -b $INPUTDIR/$sm.bam -mean | \ + cut -f5 > $OUTPUT/$sm.cov + done + (echo -en "CHROM\tBEG\tEND\tNAME\t"; tr '\n' '\t' < sm.txt | sed 's/\t$/\n/'; \ + paste $BEDFILE $(cat sm.txt | sed 's/$/.cov/' | tr '\n' '\t' | sed 's/\t$/\n/')) \ + > $BEDDIR/3215481_Covered.GRCh38.tsv done -(echo -en "CHROM\tBEG\tEND\tNAME\t"; tr '\n' '\t' < sm.txt | sed 's/\t$/\n/'; \ - paste $BEDFILE $(cat sm.txt | sed 's/$/.cov/' | tr '\n' '\t' | sed 's/\t$/\n/')) \ - > 3215481_Covered.GRCh38.tsv -#/bin/rm $(cat sm.txt | sed 's/$/.cov/' | tr '\n' '\t' | sed 's/\t$/\n/') diff --git a/MiSeqScripts/4.MiSeq-coverage.sh.~1~ b/MiSeqScripts/4.MiSeq-coverage.sh.~1~ new file mode 100644 index 0000000..505b64a --- /dev/null +++ b/MiSeqScripts/4.MiSeq-coverage.sh.~1~ @@ -0,0 +1,44 @@ +#!bin/bash + +########################################################################### +# Export Env Variables +########################################################################### + +export GATK=$HOME/toolbin/gatk-4.1.8.1/gatk +export REFDIR=/media/drew/easystore/ReferenceGenomes/ +export BEDDIR=$REFDIR/BEDs +export BEDFILE=$BEDDIR/3215481_Covered.bed + +export IDXDIR=$REFDIR/GCA_000001405.15_GRCh38_no_alt_analysis_set +export REFFA=$IDXDIR/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna +export REFFAI=$IDXDIR/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai +export WORKDIR=/media/drew/easystore/GoodCell-Resources/AnalysisBaseDir/MiSeq_Data +########################################################################### +## COMPUTE COVERAGE OVER TARGETS ## +########################################################################### + +declare -A miseqdir=( ["2019_09"]="2019_09" ["2019_12"]="2019_12" ) +declare -A datadir=( ["2019_09"]="MiSeq_Results_out/3.GRP_BAMs" ["2019_12"]="MiSeq_Results_out/3.GRP_BAMs" ) +declare -A coverage=( ["2019_09"]="MiSeq_Results_out/Coverage_out" ["2019_12"]="MiSeq_Results_out/Coverage_out" ) +declare -A v2s=( ["2019_09"]="MiSeq_Results_out/4.V2_BAMs" ["2019_12"]="MiSeq_Results_out/4.V2_BAMs" ) + +for pfx in 2019_09 2019_12; do + miseqdir=${miseqdir[$pfx]} + datadir=${datadir[$pfx]} + coverage=${coverage[$pfx]} + cd $WORKDIR/$miseqdir + export INPUT_FILE=sm.txt + export INPUTDIR=$datadir + export OUTPUT=$coverage + sm_arr=( $(cat $INPUT_FILE) ) + n=${#sm_arr[@]} + for i in $(seq 1 $n); do + sm_arr=( $(cat $INPUT_FILE) ); + sm=${sm_arr[(($i-1))]}; + bedtools coverage -g $REFFAI -sorted -a $BEDFILE -b $INPUTDIR/$sm.bam -mean | \ + cut -f5 > $OUTPUT/$sm.cov + done + (echo -en "CHROM\tBEG\tEND\tNAME\t"; tr '\n' '\t' < sm.txt | sed 's/\t$/\n/'; \ + paste $BEDFILE $(cat sm.txt | sed 's/$/.cov/' | tr '\n' '\t' | sed 's/\t$/\n/')) \ + > $BEDDIR/3215481_Covered.GRCh38.tsv +done diff --git a/MiSeqScripts/4.MiSeq-coverage.sh~ b/MiSeqScripts/4.MiSeq-coverage.sh~ deleted file mode 100644 index a3389f8..0000000 --- a/MiSeqScripts/4.MiSeq-coverage.sh~ +++ /dev/null @@ -1,41 +0,0 @@ -#!bin/bash - -########################################################################### -# MISEQ DATA -# PILOT DATA -########################################################################### - -export REFDIR=$HOME/Downloads/GoodCell-Resources/GuniosAnalysis/ReferenceGenomes/ - -export IDX_DIR=$HOME/Downloads/GoodCell-Resources/GuniosAnalysis/ReferenceGenomes/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set - -export REF_FILE=$HOME/Downloads/GoodCell-Resources/GuniosAnalysis/ReferenceGenomes/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna - -for file in $IDX_DIR; do - ln -s $file -done - -cd ~/Downloads/GoodCell-Resources/GuniosAnalysis/2019_09/LVB_fastq_Sept2019_concat_fastq -find -iname "*_R1_0012.fastq.gz" | cut -d/ -f2 | cut -d_ -f1 >> ../sm.txt - -mkdir -P ../MiSeqResults/raw-sams -mkdir -P ../MiSeqResults/raw-bams -mkdir -P ../MiSeqResults/tmp-bams -mkdir -P ../MiSeqResults/idx-bams - -########################################################################### -## COMPUTE COVERAGE OVER TARGETS ## -########################################################################### - -ln -s $HOME/res/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai - -sm_arr=( $(cat sm.txt) ) -n=${#sm_arr[@]} -for sm in ${sm_arr[@]}; do - bedtools coverage -g GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai -sorted -a 3215481_Covered.GRCh38.bed -b $sm.bam -mean | \ - cut -f5 > $sm.cov -done -(echo -en "CHROM\tBEG\tEND\tNAME\t"; tr '\n' '\t' < sm.txt | sed 's/\t$/\n/'; \ -paste 3215481_Covered.GRCh38.bed $(cat sm.txt | sed 's/$/.cov/' | tr '\n' '\t' | sed 's/\t$/\n/')) \ - > 3215481_Covered.GRCh38.tsv -/bin/rm $(cat sm.txt | sed 's/$/.cov/' | tr '\n' '\t' | sed 's/\t$/\n/') diff --git a/MiSeqScripts/Lift-over-Targets.sh~ b/MiSeqScripts/Lift-over-Targets.sh~ deleted file mode 100644 index c04a1ef..0000000 --- a/MiSeqScripts/Lift-over-Targets.sh~ +++ /dev/null @@ -1,8 +0,0 @@ -#!bin/bash -export REFDIR=/media/drew/easystore/ReferenceGenomes -export BEDDIR=/media/drew/easystore/ReferenceGenomes/BEDs -export BEDFILE=$BEDDIR/3215481_Covered.bed -export LIFTOVER=$REFDIR/GRCh38/hg19ToHg38.over.chain.gz - -grep ^chr $BEDFILE | liftOver /dev/stdin $LIFTOVER hg19ToHg38.over.chain.gz \ - $BEDFILE /dev/stderr diff --git a/MiSeqScripts/MiSeq-fastq-alignment.sh b/MiSeqScripts/MiSeq-fastq-alignment.sh deleted file mode 100644 index 4cb798a..0000000 --- a/MiSeqScripts/MiSeq-fastq-alignment.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!bin/bash - -########################################################################### -# MISEQ DATA -# PILOT DATA -########################################################################### - -export REFDIR=$HOME/Downloads/GoodCell-Resources/GuniosAnalysis/ReferenceGenomes/ - -export IDX_DIR=$HOME/Downloads/GoodCell-Resources/GuniosAnalysis/ReferenceGenomes/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set - -export REF_FILE=$HOME/Downloads/GoodCell-Resources/GuniosAnalysis/ReferenceGenomes/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna - -for file in $IDX_DIR; do - ln -s $file -done - -cd ~/Downloads/GoodCell-Resources/GuniosAnalysis/2019_09/LVB_fastq_Sept2019_concat_fastq -find -iname "*_R1_0012.fastq.gz" | cut -d/ -f2 | cut -d_ -f1 >> ../sm.txt - -mkdir -P ../MiSeqResults/raw-sams -mkdir -P ../MiSeqResults/raw-bams -mkdir -P ../MiSeqResults/tmp-bams -mkdir -P ../MiSeqResults/idx-bams - -#for sfx in 1-1 1-2 1-3 2-1 2-2 2-3 2-4 3-1 3-2 3-3; do -# unzip raw/Batch$sfx.zip -#done - -#find -iname "*_R1_0[01][12].fastq.gz" | grep -v "MPC10\|NA12878\|NA18507" | cut -d/ -f4 | cut -d_ -f1 >> sm.txt - -## NEXTSEQ DATA -#unzip FASTQ_Part1.zip -#unzip FASTQ_Part2.zip -#find -iname "*_R1_001.fastq.gz" | cut -d/ -f2 | cut -d_ -f1 >> sm.txt - -########################################################################### -## FASTQ ALIGNMENT ## -########################################################################### - -sm_arr=( $(cat ../sm.txt) ) -n=${#sm_arr[@]} - -# pilot data and MiSeq data -for i in $(seq 1 $n); do - fastq_arr=( $(find -iname "*_R1_0012.fastq.gz" | grep -v "MPC10\|NA12878\|NA18507") ); \ - sm_arr=( $(cat ../sm.txt) ); \ - fastq_r1=${fastq_arr[(($i-1))]}; \ - fastq_r2=${fastq_r1%_R1_0012.fastq.gz}_R2_0012.fastq.gz; \ - sm=${sm_arr[(($i-1))]}; \ - str="@RG\tID:$sm\tPL:ILLUMINA\tPU:$sm\tLB:$sm\tSM:$sm"; \ - bwa mem -M -R "$str" $IDX_DIR $fastq_r1 $fastq_r2 -o ../MiSeqResults/raw-sam/$sm.raw.sam - samtools view -Sb -o ../MiSeqResults/raw-bam/$sm.raw.bam - samtools sort -T $sm -O BAM -o ../MiSeqResults/tmp-bam/$sm.tmp.bam ../MiSeqResults/raw-bam/$sm.raw.bam - samtools index -b -@ 2 ../MiSeqResults/raw-bam/$sm.raw.bam ../MiSeqResults/idx-bam/$sm.idx.bam -done - -########################################################################### -## REMOVE DUPLICATES ## -########################################################################### - -ln -s $HOME/bin/picard.jar - -sm_arr=( $(cat ../sm.txt) ) -n=${#sm_arr[@]} -for i in $(seq 1 $n); do - sm_arr=( $(cat ../sm.txt) ); \ - sm=${sm_arr[(($i-1))]}; \ - java \ - -jar picard.jar \ - MarkDuplicates \ - I=$sm.raw.bam \ - O=$sm.tmp.bam \ - M=$sm.txt && \ - samtools index $sm.tmp.bam -done diff --git a/MiSeqScripts/MiSeq-recalibrate.sh b/MiSeqScripts/MiSeq-recalibrate.sh deleted file mode 100644 index 14895dd..0000000 --- a/MiSeqScripts/MiSeq-recalibrate.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!bin/bash - -########################################################################### -# MISEQ DATA -# PILOT DATA -########################################################################### - -export REFDIR=$HOME/Downloads/GoodCell-Resources/GuniosAnalysis/ReferenceGenomes/ - -export IDX_DIR=$HOME/Downloads/GoodCell-Resources/GuniosAnalysis/ReferenceGenomes/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set - -export REF_FILE=$HOME/Downloads/GoodCell-Resources/GuniosAnalysis/ReferenceGenomes/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna - -for file in $IDX_DIR; do - ln -s $file -done - -cd ~/Downloads/GoodCell-Resources/GuniosAnalysis/2019_09/LVB_fastq_Sept2019_concat_fastq -find -iname "*_R1_0012.fastq.gz" | cut -d/ -f2 | cut -d_ -f1 >> ../sm.txt - -mkdir -P ../MiSeqResults/raw-sams -mkdir -P ../MiSeqResults/raw-bams -mkdir -P ../MiSeqResults/tmp-bams -mkdir -P ../MiSeqResults/idx-bams -########################################################################### -## RECALIBRATE BASE PAIRS ## -########################################################################### - -ln -s $HOME/bin/gatk-4.1.3.0 -ln -s $HOME/Downloads/GCA_000001405.15_GRCh38_no_alt_analysis_set.dict -ln -s $HOME/res/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz -ln -s $HOME/res/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi - -sm_arr=( $(cat sm.txt) ) -n=${#sm_arr[@]} -for i in $(seq 1 $n); do - sm_arr=( $(cat sm.txt) ); \ - sm=${sm_arr[(($i-1))]}; \ - gatk-4.1.3.0/gatk \ - BaseRecalibrator \ - -R GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \ - -I $sm.tmp.bam \ - --known-sites 1000G_phase1.snps.high_confidence.hg38.vcf.gz \ - -O $sm.grp && \ - gatk-4.1.3.0/gatk \ - ApplyBQSR \ - -R GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \ - -I $sm.tmp.bam \ - --bqsr-recal-file $sm.grp \ - -O $sm.bam && \ - samtools index $sm.bam -done diff --git a/misc/#CreateMaptsv.new.sh# b/misc/#CreateMaptsv.new.sh# new file mode 100644 index 0000000..78e6203 --- /dev/null +++ b/misc/#CreateMaptsv.new.sh# @@ -0,0 +1,39 @@ +#!bin/bash + +############################################################################## +# Set env variables and create array of microarrya data directories +############################################################################## + +export WORKDIR="/media/drew/easystore/AnalysisBaseDir/GSA_Data" +declare -A wdir=( ["20180117"]="2018_07" ["20200110"]="2020_01" ) + +############################################################################## +# create input file with the IDAT fles and from those create maps +############################################################################## + +for pfx in 20180117 20200110; do + wdir=${wdir[$pfx]} + touch $WORKDIR/$wdir/dir.txt + cd $WORKDIR/$wdir + find -iname "*_idat" | xargs ./dir.txt + for f in ./dir.txt; do + dir=$f + cd $dir + find -iname "*.idat" | xargs > $WORKDIR/$wdir/files.txt +done + + bcftools +gtc2vcf --gtcs GTCs -o $wdir.maps.tsv + + +for f in $WORKDIR/$wdir/files.txt; do + bcftools +gtc2vcf -i -g $f +done + +done + + + + + + + diff --git a/misc/.#CreateMaptsv.new.sh b/misc/.#CreateMaptsv.new.sh new file mode 120000 index 0000000..0115f3e --- /dev/null +++ b/misc/.#CreateMaptsv.new.sh @@ -0,0 +1 @@ +drew@drew-hp-pavilion.114235:1603135760 \ No newline at end of file diff --git a/MiSeqScripts/0.make-index-files b/misc/0.make-index-files similarity index 100% rename from MiSeqScripts/0.make-index-files rename to misc/0.make-index-files diff --git a/Genome-Reference-File-Retreival/1000genomes-GRCh38-and-others.sh b/misc/1000genomes-GRCh38-wget.sh similarity index 100% rename from Genome-Reference-File-Retreival/1000genomes-GRCh38-and-others.sh rename to misc/1000genomes-GRCh38-wget.sh diff --git a/misc/CreateMaptsv.new.sh b/misc/CreateMaptsv.new.sh new file mode 100644 index 0000000..c7d105d --- /dev/null +++ b/misc/CreateMaptsv.new.sh @@ -0,0 +1,54 @@ +#!bin/bash + +############################################################################## +# Set env variables +############################################################################## + +export date="20200810" +export wrkdr="/media/drew/easystore/AnalysisBaseDir/GSA_Data" +export REFDIR="/media/drew/easystore/ReferenceGenomes" +export REFGFF=$REFDIR/Ensembl/Homo_sapiens.GRCh38.fixed.101.gff3.gz +export REFVCF=$REFDIR/GRCh38/clinvar_$date.GRCh38.vcf.gz +export REFTBI=$REFDIR/GRCh38/clinvar_$date.GRCh38.vcf.gz.tbi + +############################################################################## +# create arrays +############################################################################## + +declare -A wdir=( ["20180117"]="2018_07" ["20200110"]="2020_01" ) +declare -A bpms=( ["20180117"]="/media/drew/easystore/GoodCell-Resources/AnalysisBaseDir/2018_07/GenomeStudio_Files/Manifest_Files/GSA-24v1-0_A2.bpm" ["20200110"]="/media/drew/easystore/ReferenceGenomes/GSA_24v2_0/GSA-24v2-0_A2.bpm" ) +declare -A egts=( ["20180117"]="/media/drew/easystore/ReferenceGenomes/GSA_24v1_0/GSA-24v1-0_A2_ClusterFile.egt" ["20200110"]="/media/drew/easystore/ReferenceGenomes/GSA_24v2_0/GSA-24v2-0_A2_ClusterFile.egt" ) +declare -A csvs=( ["20180117"]="/media/drew/easystore/GoodCell-Resources/AnalysisBaseDir/2018_07/GenomeStudio_Files/Manifest_Files/GSA-24v1-0_A2.csv" ["20200110"]="/media/drew/easystore/ReferenceGenomes/GSA_24v2_0/GSA-24v2-0_A2.csv" ) + +############################################################################## +# iterate over data +############################################################################## + +find -iname "*_idat" | xargs $wrkdr/$wdir/dir.txt +for f in $wrkdr/$wdir/dir.txt; do + dir=$f + cd $dir + touch $wrkdr/$wdir/files.txt + find -iname "*.idat" | xargs > $wrkdr/$wdir/files.txt +done + +for f in $wrkdr/$wdir/files.txt; do + bcftools +gtc2vcf -i -g $f +done + +for pfx in 20180117 20200110; do + wdir=${wdir[$pfx]} + bpm=${bpms[$pfx]} + egt=${egts[$pfx]} + csv=${csvs[${pfx]} + touch $wrkdr/$wdir/dir.txt + cd $wrkdr/$wdir + bcftools +gtc2vcf --gtcs GTCs -o $wdir.maps.tsv +done + + + + + + + diff --git a/Genome-Reference-File-Retreival/make_sm.txt.sh b/misc/make_sm.txt.sh similarity index 100% rename from Genome-Reference-File-Retreival/make_sm.txt.sh rename to misc/make_sm.txt.sh