diff --git a/README.md b/README.md index da8a56d6..4eeb4e23 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ vep --input_file --format vcf --output_file -o ` +`./scripts/convert_vep_vcf_to_tsv_capice.sh -i -o ` ### CAPICE CAPICE can be run by using the following command: diff --git a/resources/predict_input.tsv.gz b/resources/predict_input.tsv.gz index 00d9e5fe..450a5a4a 100644 Binary files a/resources/predict_input.tsv.gz and b/resources/predict_input.tsv.gz differ diff --git a/resources/train_input.tsv.gz b/resources/train_input.tsv.gz index 73436dd6..58a85ae6 100644 Binary files a/resources/train_input.tsv.gz and b/resources/train_input.tsv.gz differ diff --git a/scripts/convert_vep_vcf_to_tsv_capice.sh b/scripts/convert_vep_vcf_to_tsv_capice.sh index 516c1ea6..959fdab2 100755 --- a/scripts/convert_vep_vcf_to_tsv_capice.sh +++ b/scripts/convert_vep_vcf_to_tsv_capice.sh @@ -3,9 +3,6 @@ # Stops script if any error occurs. set -e -# Possibly variable variables -PRE_HEADER="%CHROM\t%POS\t%REF\t%ALT\t%Consequence\t%SYMBOL\t%SYMBOL_SOURCE\t%Gene\t%Feature\t%Feature_type\t%cDNA_position\t%CDS_position\t%Protein_position\t%Amino_acids\t%STRAND\t%SIFT\t%PolyPhen\t%EXON\t%INTRON\t%SpliceAI_pred_DP_AG\t%SpliceAI_pred_DP_AL\t%SpliceAI_pred_DP_DG\t%SpliceAI_pred_DP_DL\t%SpliceAI_pred_DS_AG\t%SpliceAI_pred_DS_AL\t%SpliceAI_pred_DS_DG\t%SpliceAI_pred_DS_DL\t%gnomAD_AF" - # Defines error echo. errcho() { echo "$@" 1>&2; } @@ -57,8 +54,11 @@ digestCommandLine() { if [[ ${TRAIN} == true ]] then - id="\t%ID" - PRE_HEADER="$PRE_HEADER$id" + HEADER="CHROM\tPOS\tID\tREF\tALT\t" + FORMAT="%CHROM\t%POS\t%ID\t%REF\t%ALT\t%CSQ\n" + else + HEADER="CHROM\tPOS\tREF\tALT\t" + FORMAT="%CHROM\t%POS\t%REF\t%ALT\t%CSQ\n" fi validateCommandLine @@ -125,12 +125,11 @@ processFile() { local output="${output%.gz}" # Strips '.gz' to better work with code below. local output_tmp="${output}.tmp" - local format="${PRE_HEADER}\n" - local args=() args+=("+split-vep") args+=("-d") - args+=("-f" "${format}") + args+=("-f" "${FORMAT}") + args+=("-A" "tab") args+=("-o" "${output_tmp}") args+=("${input}") @@ -140,7 +139,7 @@ processFile() { echo "BCFTools finished, building output file." - echo -e "${PRE_HEADER}" | cat - "${output_tmp}" > "${output}" && rm "${output_tmp}" + echo -e "${HEADER}$(bcftools +split-vep -l "${input}" | cut -f 2 | tr '\n' '\t' | sed 's/\t$//')" | cat - "${output_tmp}" > "${output}" && rm "${output_tmp}" echo "Output file ready, gzipping." diff --git a/src/molgenis/capice/utilities/load_file_postprocessor.py b/src/molgenis/capice/utilities/load_file_postprocessor.py index e347f3d2..9f3b5366 100644 --- a/src/molgenis/capice/utilities/load_file_postprocessor.py +++ b/src/molgenis/capice/utilities/load_file_postprocessor.py @@ -20,24 +20,11 @@ def process(self): dataset : pandas.DataFrame Processed dataset with corrected % sign and renamed columns. """ - self.log.debug('Starting correcting % sign.') - self._correct_percentage_sign() - self.log.debug('% sign corrected, starting renaming of columns.') + self.log.info('LoadFilePostProcessor starting.') self._col_renamer() self.log.info('LoadFilePostProcessor successful.') return self.dataset - def _correct_percentage_sign(self): - new_columns = [] - for column in self.dataset.columns: - if column.startswith('%'): - new_columns.append(column.split('%')[1]) - elif column.startswith('#'): - new_columns.append(column.split('#')[1]) - else: - new_columns.append(column) - self.dataset.columns = new_columns - def _col_renamer(self): """ Function to rename "Gene, Feature, SYMBOL, INTRON and EXON" to diff --git a/tests/capice/utilities/test_file_postprocessor.py b/tests/capice/utilities/test_file_postprocessor.py index 55fae6f2..9886d124 100644 --- a/tests/capice/utilities/test_file_postprocessor.py +++ b/tests/capice/utilities/test_file_postprocessor.py @@ -28,15 +28,15 @@ def test_load_file_pre_processor(self): print('Load file preprocessor.') data = pd.DataFrame( { - "%CHROM": [1, 2, 3], - "%POS": [100, 200, 300], - "%REF": ['A', 'T', 'G'], - "%ALT": ['T', 'G', 'A'], - "%SYMBOL_SOURCE": ['foo', 'foo', 'bar'], - "%Feature": ['bar', 'bar', 'buz'], - "%SYMBOL": ['g1', 'g2', 'g3'], - "%INTRON": [1, 0, 0], - "%EXON": [0, 1, 1] + "CHROM": [1, 2, 3], + "POS": [100, 200, 300], + "REF": ['A', 'T', 'G'], + "ALT": ['T', 'G', 'A'], + "SYMBOL_SOURCE": ['foo', 'foo', 'bar'], + "Feature": ['bar', 'bar', 'buz'], + "SYMBOL": ['g1', 'g2', 'g3'], + "INTRON": [1, 0, 0], + "EXON": [0, 1, 1] } ) diff --git a/tests/capice/utilities/test_load_file_postprocessor.py b/tests/capice/utilities/test_load_file_postprocessor.py index 1b720f94..c397fe53 100644 --- a/tests/capice/utilities/test_load_file_postprocessor.py +++ b/tests/capice/utilities/test_load_file_postprocessor.py @@ -10,13 +10,13 @@ def setUpClass(cls): print('Setting up.') df = pd.DataFrame( { - '#CHROM': [1], + 'CHROM': [1], 'POS': [123], 'REF': ['A'], 'ALT': ['G'], 'Gene': [123], 'SYMBOL_SOURCE': ['hgnc'], - '%Feature': ['NM1.123'], + 'Feature': ['NM1.123'], 'SYMBOL': ['ACDC'], 'INTRON': [5], 'EXON': [11], diff --git a/tests/resources/breakends_vep.tsv.gz b/tests/resources/breakends_vep.tsv.gz index 27a6d223..1cec16a9 100644 Binary files a/tests/resources/breakends_vep.tsv.gz and b/tests/resources/breakends_vep.tsv.gz differ diff --git a/tests/resources/edge_cases_vep.tsv.gz b/tests/resources/edge_cases_vep.tsv.gz index 39debf2b..362c1274 100644 Binary files a/tests/resources/edge_cases_vep.tsv.gz and b/tests/resources/edge_cases_vep.tsv.gz differ diff --git a/tests/resources/symbolic_alleles_vep.tsv.gz b/tests/resources/symbolic_alleles_vep.tsv.gz index 5bcc1b12..f56b4d8e 100644 Binary files a/tests/resources/symbolic_alleles_vep.tsv.gz and b/tests/resources/symbolic_alleles_vep.tsv.gz differ