Skip to content

Commit

Permalink
Add custom allele frequency (#119)
Browse files Browse the repository at this point in the history
* updated annotations.py

* Update annotations.snakefile

* updated annotations.md
  • Loading branch information
Marcel-Mueck authored Aug 7, 2024
1 parent 473872f commit 19b2ec8
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 59 deletions.
8 changes: 6 additions & 2 deletions deeprvat/annotations/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -1946,17 +1946,21 @@ def merge_af(annotations_path: str, af_df_path: str, out_file: str):
@cli.command()
@click.argument("annotations_path", type=click.Path(exists=True))
@click.argument("out_file", type=click.Path())
def calculate_maf(annotations_path: str, out_file: str):
@click.option("--af-column-name", type=str, default="af")
def calculate_maf(annotations_path: str, out_file: str, af_column_name: str):
"""
Calculate minor allele frequency (MAF) from allele frequency data in annotations.
Parameters:
- af-column-name(str): Name of the allele frequency column to calculate MAF from
- annotations_path (str): Path to the annotations file containing allele frequency data.
- out_file (str): Path to the output file to save the calculated MAF data.
"""
logger.info(f"reading annotation file {annotations_path}")
annotation_file = pd.read_parquet(annotations_path)
af = annotation_file["af"]
af = annotation_file[af_column_name]
annotation_file = annotation_file.drop(
columns=["UKB_AF_MB", "UKB_MAF"], errors="ignore"
)
Expand Down
14 changes: 12 additions & 2 deletions docs/annotations.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,15 +110,25 @@ Data for VEP plugins and the CADD cache are stored in `annotation data`.


## Running the pipeline on your own data
modify the path in the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml) s.t. they point to the output directory of the preprocessing pipeline run on your data.
Modify the path in the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml) s.t. they point to the output directory of the preprocessing pipeline run on your data.
You can add/remove VEP plugins in the `additional_vep_plugin_cmds` part of the config by adding /removing plugin commands to be added to the vep run command. You can omit absplice/deepSea by setting `include_absplice`/ `include_deepSEA` to `False`in the config. When you add/remove annotations you have to alter the values in `example/config/annotation_colnames_filling_values.yaml`. This file consist of the names of the columns of the tool used, the name to be used in the output data frame, the default value replacing all `NA` values as well as the data type, for example:
```shell
'CADD_RAW' :
- 'CADD_raw'
- 0
- float
```
Here `CADD_RAW` is the name of the column of the VEP output when the plugin is used, it is then renamed in the final annotation dataframe to `CADD_raw`, all `NA` values are set to `0` and the values are of type `float`.
Here `CADD_RAW` is the name of the column of the VEP output when the plugin is used, it is then renamed in the final annotation dataframe to `CADD_raw`, all `NA` values are set to `0` and the values are of type `float`.

You can also change the way the allele frequencies are calculated by adding `af_mode` key to the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml). By default, the allele frequencies are calculated from the data the annotation pipeline is run with. To use gnomade or gnomadg allele frequncies (from VEP ) instead, add
```shell
af_mode : 'af_gnomade'
```
or
```shell
af_mode : 'af_gnomadg'
```
to the config file.

## References

Expand Down
146 changes: 91 additions & 55 deletions pipelines/annotations.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -421,65 +421,100 @@ else:
shell:
'touch {output.chckpt}'

rule calculate_allele_frequency:
input:
genotype_file=genotype_file,
variants=variant_pq,
output:
allele_frequencies = anno_tmp_dir / "af_df.parquet",
resources:
mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
shell:
" ".join(
[
f"deeprvat_annotations",
"get-af-from-gt",
"{input.genotype_file}",
"{input.variants}",
"{output.allele_frequencies}",
]
)

if af_mode is None:
rule calculate_allele_frequency:
input:
genotype_file=genotype_file,
variants=variant_pq,
output:
allele_frequencies = anno_tmp_dir / "af_df.parquet",
resources:
mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
shell:
" ".join(
[
f"deeprvat_annotations",
"get-af-from-gt",
"{input.genotype_file}",
"{input.variants}",
"{output.allele_frequencies}",
]
)

rule merge_allele_frequency:
input:
allele_frequencies=rules.calculate_allele_frequency.output.allele_frequencies,
chckpt_absplice = anno_dir / 'chckpts' / 'merge_absplice_scores.chckpt',
chckpt_deepsea = anno_dir / 'chckpts' / 'merge_deepsea_pcas.chckpt',
ckckpt_concat_annotations = rules.concat_annotations.output.chckpt,

output:
#annotations = anno_dir / "annotations.parquet",
chckpt = anno_dir / 'chckpts' / 'merge_allele_frequency.chckpt'
params:
annotations_in = anno_dir / "annotations.parquet",
annotations_out = anno_dir / "annotations.parquet",
resources:
mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
shell:
" ".join(
[
f"deeprvat_annotations",
"merge-af",
"{params.annotations_in}",
"{input.allele_frequencies}",
"{params.annotations_out}",
]
)+" && touch {output.chckpt}"

rule merge_allele_frequency:
input:
allele_frequencies=rules.calculate_allele_frequency.output.allele_frequencies,
chckpt_absplice = anno_dir / 'chckpts' / 'merge_absplice_scores.chckpt',
chckpt_deepsea = anno_dir / 'chckpts' / 'merge_deepsea_pcas.chckpt',
ckckpt_concat_annotations = rules.concat_annotations.output.chckpt,

output:
#annotations = anno_dir / "annotations.parquet",
chckpt = anno_dir / 'chckpts' / 'merge_allele_frequency.chckpt'
params:
annotations_in = anno_dir / "annotations.parquet",
annotations_out = anno_dir / "annotations.parquet",
resources:
mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
shell:
" ".join(
[
f"deeprvat_annotations",
"merge-af",
"{params.annotations_in}",
"{input.allele_frequencies}",
"{params.annotations_out}",
]
)+" && touch {output.chckpt}"

rule calculate_MAF:
input:
chckpt = rules.merge_allele_frequency.output.chckpt,
output:
chckpt = anno_dir / 'chckpts' / 'calculate_MAF.chckpt'
params:
annotations_in = rules.merge_allele_frequency.params.annotations_out,
annotations_out = anno_dir / "annotations.parquet",
resources:
mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
shell:
" ".join([f"deeprvat_annotations", "calculate-maf", "{params.annotations_in}", "{params.annotations_out}"])+ " && touch {output.chckpt}"
rule calculate_MAF:
input:
chckpt = rules.merge_allele_frequency.output.chckpt,
output:
chckpt = anno_dir / 'chckpts' / 'calculate_MAF.chckpt'
params:
annotations_in = rules.merge_allele_frequency.params.annotations_out,
annotations_out = anno_dir / "annotations.parquet",
resources:
mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
shell:
" ".join([f"deeprvat_annotations", "calculate-maf", "{params.annotations_in}", "{params.annotations_out}"])+ " && touch {output.chckpt}"

elif(af_mode == 'af_gnomade'):
rule calculate_MAF:
input:
chckpt_absplice = anno_dir / 'chckpts' / 'merge_absplice_scores.chckpt',
chckpt_deepsea = anno_dir / 'chckpts' / 'merge_deepsea_pcas.chckpt',
ckckpt_concat_annotations = rules.concat_annotations.output.chckpt,
output:
chckpt = anno_dir / 'chckpts' / 'calculate_MAF.chckpt'
params:
annotations_in = anno_dir / "annotations.parquet",
annotations_out = anno_dir / "annotations.parquet",
resources:
mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
shell:
" ".join([f"deeprvat_annotations", "calculate-maf","--af-column-name gnomADe_AF" , "{params.annotations_in}", "{params.annotations_out}"])+ " && touch {output.chckpt}"

elif(af_mode == 'af_gnomadg'):
rule calculate_MAF:
input:
chckpt_absplice = anno_dir / 'chckpts' / 'merge_absplice_scores.chckpt',
chckpt_deepsea = anno_dir / 'chckpts' / 'merge_deepsea_pcas.chckpt',
ckckpt_concat_annotations = rules.concat_annotations.output.chckpt,
output:
chckpt = anno_dir / 'chckpts' / 'calculate_MAF.chckpt'
params:
annotations_in = anno_dir / "annotations.parquet",
annotations_out = anno_dir / "annotations.parquet",
resources:
mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
shell:
" ".join([f"deeprvat_annotations", "calculate-maf", "--af-column-name gnomADg_AF", "{params.annotations_in}", "{params.annotations_out}" ])+ " && touch {output.chckpt}"

else: print('af_mode unknown')


rule add_gene_ids:
Expand Down Expand Up @@ -566,3 +601,4 @@ rule select_rename_fill_columns:
) +" && touch {output.chckpt}"



0 comments on commit 19b2ec8

Please sign in to comment.