Add custom allele frequency (#119)

* updated annotations.py * Update annotations.snakefile * updated annotations.md
PMBio · Aug 7, 2024 · 19b2ec8 · 19b2ec8
1 parent 473872f
commit 19b2ec8
Show file tree

Hide file tree

Showing 3 changed files with 109 additions and 59 deletions.
diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py
@@ -1946,17 +1946,21 @@ def merge_af(annotations_path: str, af_df_path: str, out_file: str):
 @cli.command()
 @click.argument("annotations_path", type=click.Path(exists=True))
 @click.argument("out_file", type=click.Path())
-def calculate_maf(annotations_path: str, out_file: str):
+@click.option("--af-column-name", type=str, default="af")
+def calculate_maf(annotations_path: str, out_file: str, af_column_name: str):
     """
     Calculate minor allele frequency (MAF) from allele frequency data in annotations.
 
     Parameters:
+    - af-column-name(str): Name of the allele frequency column to calculate MAF from
     - annotations_path (str): Path to the annotations file containing allele frequency data.
     - out_file (str): Path to the output file to save the calculated MAF data.
+
+
     """
     logger.info(f"reading annotation file {annotations_path}")
     annotation_file = pd.read_parquet(annotations_path)
-    af = annotation_file["af"]
+    af = annotation_file[af_column_name]
     annotation_file = annotation_file.drop(
         columns=["UKB_AF_MB", "UKB_MAF"], errors="ignore"
     )

diff --git a/docs/annotations.md b/docs/annotations.md
@@ -110,15 +110,25 @@ Data for VEP plugins and the CADD cache are stored in `annotation data`.
 
 
 ## Running the pipeline on your own data 
-modify the path in the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml) s.t. they point to the output directory of the preprocessing pipeline run on your data. 
+Modify the path in the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml) s.t. they point to the output directory of the preprocessing pipeline run on your data. 
 You can add/remove VEP plugins in the `additional_vep_plugin_cmds` part of the config by adding /removing plugin commands to be added to the vep run command. You can omit absplice/deepSea by setting `include_absplice`/ `include_deepSEA` to `False`in the config. When you add/remove annotations you have to alter the values in `example/config/annotation_colnames_filling_values.yaml`. This file consist of  the names of the columns of the tool used, the name to be used in the output data frame, the default value replacing all `NA` values as well as the data type, for example:
 ```shell
   'CADD_RAW' : 
     - 'CADD_raw'
     - 0
     - float
 ```
-Here `CADD_RAW` is the name of the column of the VEP output when the plugin is used, it is then renamed in the final annotation dataframe to `CADD_raw`, all `NA` values are set to `0` and the values are of type `float`.
+Here `CADD_RAW` is the name of the column of the VEP output when the plugin is used, it is then renamed in the final annotation dataframe to `CADD_raw`, all `NA` values are set to `0` and the values are of type `float`. 
+
+You can also change the way the allele frequencies are calculated by adding `af_mode` key to the [config file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_annotation_config.yaml). By default, the allele frequencies are calculated from the data the annotation pipeline is run with. To use gnomade or gnomadg allele frequncies (from VEP ) instead, add 
+```shell
+af_mode : 'af_gnomade'
+```
+or 
+```shell
+af_mode : 'af_gnomadg'
+```
+to the config file.
 
 ## References
 

diff --git a/pipelines/annotations.snakefile b/pipelines/annotations.snakefile
@@ -421,65 +421,100 @@ else:
         shell: 
             'touch {output.chckpt}'
 
-rule calculate_allele_frequency:
-    input:
-        genotype_file=genotype_file,
-        variants=variant_pq,
-    output:
-        allele_frequencies = anno_tmp_dir / "af_df.parquet",
-    resources:
-        mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
-    shell:
-        " ".join(
-            [
-                f"deeprvat_annotations",
-                "get-af-from-gt",
-                "{input.genotype_file}",
-                "{input.variants}",
-                "{output.allele_frequencies}",
-            ]
-        )
 
+if af_mode is None:
+    rule calculate_allele_frequency:
+        input:
+            genotype_file=genotype_file,
+            variants=variant_pq,
+        output:
+            allele_frequencies = anno_tmp_dir / "af_df.parquet",
+        resources:
+            mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
+        shell:
+            " ".join(
+                [
+                    f"deeprvat_annotations",
+                    "get-af-from-gt",
+                    "{input.genotype_file}",
+                    "{input.variants}",
+                    "{output.allele_frequencies}",
+                ]
+            )
 
-rule merge_allele_frequency:
-    input:
-        allele_frequencies=rules.calculate_allele_frequency.output.allele_frequencies,
-        chckpt_absplice = anno_dir / 'chckpts' / 'merge_absplice_scores.chckpt',
-        chckpt_deepsea = anno_dir / 'chckpts' / 'merge_deepsea_pcas.chckpt',
-        ckckpt_concat_annotations = rules.concat_annotations.output.chckpt,
-
-    output:
-        #annotations = anno_dir / "annotations.parquet",
-        chckpt = anno_dir / 'chckpts' / 'merge_allele_frequency.chckpt'
-    params:
-        annotations_in = anno_dir / "annotations.parquet",
-        annotations_out = anno_dir / "annotations.parquet",
-    resources:
-        mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
-    shell:
-        " ".join(
-            [
-                f"deeprvat_annotations",
-                "merge-af",
-                "{params.annotations_in}",
-                "{input.allele_frequencies}",
-                "{params.annotations_out}",
-            ]
-        )+" && touch {output.chckpt}"
 
+    rule merge_allele_frequency:
+        input:
+            allele_frequencies=rules.calculate_allele_frequency.output.allele_frequencies,
+            chckpt_absplice = anno_dir / 'chckpts' / 'merge_absplice_scores.chckpt',
+            chckpt_deepsea = anno_dir / 'chckpts' / 'merge_deepsea_pcas.chckpt',
+            ckckpt_concat_annotations = rules.concat_annotations.output.chckpt,
+
+        output:
+            #annotations = anno_dir / "annotations.parquet",
+            chckpt = anno_dir / 'chckpts' / 'merge_allele_frequency.chckpt'
+        params:
+            annotations_in = anno_dir / "annotations.parquet",
+            annotations_out = anno_dir / "annotations.parquet",
+        resources:
+            mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
+        shell:
+            " ".join(
+                [
+                    f"deeprvat_annotations",
+                    "merge-af",
+                    "{params.annotations_in}",
+                    "{input.allele_frequencies}",
+                    "{params.annotations_out}",
+                ]
+            )+" && touch {output.chckpt}"
 
-rule calculate_MAF:
-    input:
-        chckpt = rules.merge_allele_frequency.output.chckpt,
-    output:
-        chckpt = anno_dir / 'chckpts' / 'calculate_MAF.chckpt'
-    params: 
-        annotations_in = rules.merge_allele_frequency.params.annotations_out,
-        annotations_out = anno_dir / "annotations.parquet",
-    resources:
-        mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
-    shell:
-        " ".join([f"deeprvat_annotations", "calculate-maf", "{params.annotations_in}", "{params.annotations_out}"])+ " && touch {output.chckpt}"
+    rule calculate_MAF:
+        input:
+            chckpt = rules.merge_allele_frequency.output.chckpt,
+        output:
+            chckpt = anno_dir / 'chckpts' / 'calculate_MAF.chckpt'
+        params: 
+            annotations_in = rules.merge_allele_frequency.params.annotations_out,
+            annotations_out = anno_dir / "annotations.parquet",
+        resources:
+            mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
+        shell:
+            " ".join([f"deeprvat_annotations", "calculate-maf", "{params.annotations_in}", "{params.annotations_out}"])+ " && touch {output.chckpt}"
+
+elif(af_mode == 'af_gnomade'):
+    rule calculate_MAF:
+        input:
+            chckpt_absplice = anno_dir / 'chckpts' / 'merge_absplice_scores.chckpt',
+            chckpt_deepsea = anno_dir / 'chckpts' / 'merge_deepsea_pcas.chckpt',
+            ckckpt_concat_annotations = rules.concat_annotations.output.chckpt,
+        output:
+            chckpt = anno_dir / 'chckpts' / 'calculate_MAF.chckpt'
+        params: 
+            annotations_in = anno_dir / "annotations.parquet",
+            annotations_out = anno_dir / "annotations.parquet",
+        resources:
+            mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
+        shell:
+            " ".join([f"deeprvat_annotations", "calculate-maf","--af-column-name gnomADe_AF" , "{params.annotations_in}", "{params.annotations_out}"])+ " && touch {output.chckpt}"
+
+elif(af_mode == 'af_gnomadg'):
+    rule calculate_MAF:
+        input:
+            chckpt_absplice = anno_dir / 'chckpts' / 'merge_absplice_scores.chckpt',
+            chckpt_deepsea = anno_dir / 'chckpts' / 'merge_deepsea_pcas.chckpt',
+            ckckpt_concat_annotations = rules.concat_annotations.output.chckpt,
+        output:
+            chckpt = anno_dir / 'chckpts' / 'calculate_MAF.chckpt'
+        params: 
+            annotations_in = anno_dir / "annotations.parquet",
+            annotations_out = anno_dir / "annotations.parquet",
+        resources:
+            mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
+        shell:
+            " ".join([f"deeprvat_annotations", "calculate-maf", "--af-column-name gnomADg_AF", "{params.annotations_in}", "{params.annotations_out}" ])+ " && touch {output.chckpt}"
+
+else: print('af_mode unknown')
 
 
 rule add_gene_ids:
@@ -566,3 +601,4 @@ rule select_rename_fill_columns:
         ) +" && touch {output.chckpt}"
 
 
+