Skip to content

Commit

Permalink
Merge branch 'master' into fix/uniform_extra_require_naming
Browse files Browse the repository at this point in the history
  • Loading branch information
svandenhoek committed Oct 4, 2022
2 parents 7d96fa2 + 34e0df7 commit 9a48cce
Show file tree
Hide file tree
Showing 10 changed files with 21 additions and 35 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ vep --input_file <path to your input file> --format vcf --output_file <path to y
```

Then you have to convert the VEP output to TSV using our own BCFTools script:
`/scripts/convert_vep_vcf_to_tsv_capice.sh -i </path/to/vep_output.vcf.gz> -o </path/to/capice_input.tsv.gz>`
`./scripts/convert_vep_vcf_to_tsv_capice.sh -i </path/to/vep_output.vcf.gz> -o </path/to/capice_input.tsv.gz>`

### CAPICE
CAPICE can be run by using the following command:
Expand Down
Binary file modified resources/predict_input.tsv.gz
Binary file not shown.
Binary file modified resources/train_input.tsv.gz
Binary file not shown.
17 changes: 8 additions & 9 deletions scripts/convert_vep_vcf_to_tsv_capice.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,6 @@
# Stops script if any error occurs.
set -e

# Settings that may need adjusting
PRE_HEADER="%CHROM\t%POS\t%REF\t%ALT\t%Consequence\t%SYMBOL\t%SYMBOL_SOURCE\t%Gene\t%Feature\t%Feature_type\t%cDNA_position\t%CDS_position\t%Protein_position\t%Amino_acids\t%STRAND\t%SIFT\t%PolyPhen\t%EXON\t%INTRON\t%SpliceAI_pred_DP_AG\t%SpliceAI_pred_DP_AL\t%SpliceAI_pred_DP_DG\t%SpliceAI_pred_DP_DL\t%SpliceAI_pred_DS_AG\t%SpliceAI_pred_DS_AL\t%SpliceAI_pred_DS_DG\t%SpliceAI_pred_DS_DL\t%gnomAD_AF"

# Echo variant for diagnostics: prints all of its arguments to stderr.
errcho() {
  echo "$@" >&2
}

Expand Down Expand Up @@ -57,8 +54,11 @@ digestCommandLine() {

if [[ ${TRAIN} == true ]]
then
id="\t%ID"
PRE_HEADER="$PRE_HEADER$id"
HEADER="CHROM\tPOS\tID\tREF\tALT\t"
FORMAT="%CHROM\t%POS\t%ID\t%REF\t%ALT\t%CSQ\n"
else
HEADER="CHROM\tPOS\tREF\tALT\t"
FORMAT="%CHROM\t%POS\t%REF\t%ALT\t%CSQ\n"
fi

validateCommandLine
Expand Down Expand Up @@ -125,12 +125,11 @@ processFile() {
local output="${output%.gz}" # Strips '.gz' to better work with code below.
local output_tmp="${output}.tmp"

local format="${PRE_HEADER}\n"

local args=()
args+=("+split-vep")
args+=("-d")
args+=("-f" "${format}")
args+=("-f" "${FORMAT}")
args+=("-A" "tab")
args+=("-o" "${output_tmp}")
args+=("${input}")

Expand All @@ -140,7 +139,7 @@ processFile() {

echo "BCFTools finished, building output file."

echo -e "${PRE_HEADER}" | cat - "${output_tmp}" > "${output}" && rm "${output_tmp}"
echo -e "${HEADER}$(bcftools +split-vep -l "${input}" | cut -f 2 | tr '\n' '\t' | sed 's/\t$//')" | cat - "${output_tmp}" > "${output}" && rm "${output_tmp}"

echo "Output file ready, gzipping."

Expand Down
15 changes: 1 addition & 14 deletions src/molgenis/capice/utilities/load_file_postprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,24 +20,11 @@ def process(self):
dataset : pandas.DataFrame
Processed dataset with corrected % sign and renamed columns.
"""
self.log.debug('Starting correcting % sign.')
self._correct_percentage_sign()
self.log.debug('% sign corrected, starting renaming of columns.')
self.log.info('LoadFilePostProcessor starting.')
self._col_renamer()
self.log.info('LoadFilePostProcessor successful.')
return self.dataset

def _correct_percentage_sign(self):
new_columns = []
for column in self.dataset.columns:
if column.startswith('%'):
new_columns.append(column.split('%')[1])
elif column.startswith('#'):
new_columns.append(column.split('#')[1])
else:
new_columns.append(column)
self.dataset.columns = new_columns

def _col_renamer(self):
"""
Function to rename "Gene, Feature, SYMBOL, INTRON and EXON" to
Expand Down
18 changes: 9 additions & 9 deletions tests/capice/utilities/test_file_postprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@ def test_load_file_pre_processor(self):
print('Load file preprocessor.')
data = pd.DataFrame(
{
"%CHROM": [1, 2, 3],
"%POS": [100, 200, 300],
"%REF": ['A', 'T', 'G'],
"%ALT": ['T', 'G', 'A'],
"%SYMBOL_SOURCE": ['foo', 'foo', 'bar'],
"%Feature": ['bar', 'bar', 'buz'],
"%SYMBOL": ['g1', 'g2', 'g3'],
"%INTRON": [1, 0, 0],
"%EXON": [0, 1, 1]
"CHROM": [1, 2, 3],
"POS": [100, 200, 300],
"REF": ['A', 'T', 'G'],
"ALT": ['T', 'G', 'A'],
"SYMBOL_SOURCE": ['foo', 'foo', 'bar'],
"Feature": ['bar', 'bar', 'buz'],
"SYMBOL": ['g1', 'g2', 'g3'],
"INTRON": [1, 0, 0],
"EXON": [0, 1, 1]
}
)

Expand Down
4 changes: 2 additions & 2 deletions tests/capice/utilities/test_load_file_postprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@ def setUpClass(cls):
print('Setting up.')
df = pd.DataFrame(
{
'#CHROM': [1],
'CHROM': [1],
'POS': [123],
'REF': ['A'],
'ALT': ['G'],
'Gene': [123],
'SYMBOL_SOURCE': ['hgnc'],
'%Feature': ['NM1.123'],
'Feature': ['NM1.123'],
'SYMBOL': ['ACDC'],
'INTRON': [5],
'EXON': [11],
Expand Down
Binary file modified tests/resources/breakends_vep.tsv.gz
Binary file not shown.
Binary file modified tests/resources/edge_cases_vep.tsv.gz
Binary file not shown.
Binary file modified tests/resources/symbolic_alleles_vep.tsv.gz
Binary file not shown.

0 comments on commit 9a48cce

Please sign in to comment.