Skip to content

Commit

Permalink
Merge pull request #748 from broadinstitute/jg/transcript_csq_filteri…
Browse files Browse the repository at this point in the history
…ng_struct

Modify `filter_vep_transcript_csqs_expr` so it can also accept hl.expr.StructExpression
  • Loading branch information
jkgoodrich authored Dec 20, 2024
2 parents 727887c + 02296b4 commit de65b83
Showing 1 changed file with 72 additions and 42 deletions.
114 changes: 72 additions & 42 deletions gnomad/utils/vep.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
# noqa: D100

import functools
import json
import logging
import operator
import os
import subprocess
from typing import Callable, List, Optional, Union

import hail as hl

from gnomad.resources.resource_utils import VersionedTableResource
from gnomad.utils.filtering import combine_functions

logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -860,19 +861,21 @@ def filter_vep_transcript_csqs(


def filter_vep_transcript_csqs_expr(
csq_expr: hl.expr.ArrayExpression,
csq_expr: Union[hl.expr.StructExpression, hl.expr.ArrayExpression],
synonymous: bool = False,
canonical: bool = False,
mane_select: bool = False,
ensembl_only: bool = False,
protein_coding: bool = False,
csqs: List[str] = None,
csqs: Optional[List[str]] = None,
keep_csqs: bool = True,
genes: Optional[List[str]] = None,
keep_genes: bool = True,
match_by_gene_symbol: bool = False,
additional_filtering_criteria: Optional[List[Callable]] = None,
) -> Union[hl.Table, hl.MatrixTable]:
additional_filtering_criteria: Optional[
List[Union[hl.expr.BooleanExpression, Callable]]
] = None,
) -> Union[hl.expr.BooleanExpression, hl.expr.ArrayExpression]:
"""
Filter VEP transcript consequences based on specified criteria, and optionally filter to variants where transcript consequences is not empty after filtering.
Expand All @@ -882,7 +885,7 @@ def filter_vep_transcript_csqs_expr(
is not already annotated on the `csq_expr` elements, the most severe
consequence will be added to the `csq_expr` for filtering.
:param csq_expr: ArrayExpression of VEP transcript consequences.
:param csq_expr: VEP transcript consequences StructExpression or ArrayExpression.
:param synonymous: Whether to filter to variants where the most severe consequence
is 'synonymous_variant'. Default is False.
:param canonical: Whether to filter to only canonical transcripts. Default is False.
Expand All @@ -908,50 +911,77 @@ def filter_vep_transcript_csqs_expr(
consequences by 'gene_symbol' instead of 'gene_id'. Default is False.
:param additional_filtering_criteria: Optional list of additional filtering
criteria to apply to the VEP transcript consequences.
:return: ArrayExpression of filtered VEP transcript consequences.
:return: BooleanExpression indicating whether the consequence should be filtered
or an ArrayExpression of the filtered VEP transcript consequences.
"""
criteria = [lambda csq: True]
is_struct = isinstance(csq_expr, hl.expr.StructExpression)
if synonymous:
logger.info("Filtering to most severe consequence of synonymous_variant...")
csqs = ["synonymous_variant"]

if csqs is not None:
if "most_severe_consequence" not in csq_expr.dtype.element_type.fields:
fields = csq_expr if is_struct else csq_expr.dtype.element_type.fields
if "most_severe_consequence" not in fields:
logger.info("Adding most_severe_consequence annotation...")
csq_expr = add_most_severe_consequence_to_consequence(csq_expr)

csqs = hl.literal(csqs)
if keep_csqs:
criteria.append(lambda csq: csqs.contains(csq.most_severe_consequence))
else:
criteria.append(lambda csq: ~csqs.contains(csq.most_severe_consequence))
if canonical:
logger.info("Filtering to canonical transcripts")
criteria.append(lambda csq: csq.canonical == 1)
if mane_select:
logger.info("Filtering to MANE Select transcripts...")
criteria.append(lambda csq: hl.is_defined(csq.mane_select))
if ensembl_only:
logger.info("Filtering to Ensembl transcripts...")
criteria.append(lambda csq: csq.transcript_id.startswith("ENST"))
if protein_coding:
logger.info("Filtering to protein coding transcripts...")
criteria.append(lambda csq: csq.biotype == "protein_coding")
if genes is not None:
logger.info("Filtering to genes of interest...")
genes = hl.literal(genes)
gene_field = "gene_symbol" if match_by_gene_symbol else "gene_id"
if keep_genes:
criteria.append(lambda csq: genes.contains(csq[gene_field]))
else:
criteria.append(lambda csq: ~genes.contains(csq[gene_field]))
if additional_filtering_criteria is not None:
logger.info("Filtering to variants with additional criteria...")
criteria = criteria + additional_filtering_criteria

if len(criteria) == 1:
logger.warning("No changes have been made to input transcript consequences!")

return csq_expr.filter(lambda x: combine_functions(criteria, x))
def _filter_vep_csq_expr(
csq: hl.expr.StructExpression,
additional_filtering_criteria: Optional[List[hl.expr.BooleanExpression]] = None,
) -> hl.expr.BooleanExpression:
"""
Filter VEP consequence StructExpression based on specified criteria.
:param csq: VEP consequence StructExpression.
:param additional_filtering_criteria: Optional list of additional filtering.
:return: BooleanExpression for filtering VEP consequence StructExpression.
"""
criteria = hl.bool(True)
if csqs is not None:
found = hl.literal(csqs).contains(csq.most_severe_consequence)
if keep_csqs:
criteria &= found
else:
criteria &= ~found
if canonical:
logger.info("Filtering to canonical transcripts")
criteria &= csq.canonical == 1
if mane_select:
logger.info("Filtering to MANE Select transcripts...")
criteria &= hl.is_defined(csq.mane_select)
if ensembl_only:
logger.info("Filtering to Ensembl transcripts...")
criteria &= csq.transcript_id.startswith("ENST")
if protein_coding:
logger.info("Filtering to protein coding transcripts...")
criteria &= csq.biotype == "protein_coding"
if genes is not None:
logger.info("Filtering to genes of interest...")
gene_field = "gene_symbol" if match_by_gene_symbol else "gene_id"
found = hl.literal(genes).contains(csq[gene_field])
if keep_genes:
criteria &= found
else:
criteria &= ~found
if additional_filtering_criteria is not None:
logger.info("Filtering to variants with additional criteria...")
criteria &= functools.reduce(operator.iand, additional_filtering_criteria)

return criteria

if is_struct:
return _filter_vep_csq_expr(csq_expr, additional_filtering_criteria)
else:
return csq_expr.filter(
lambda x: _filter_vep_csq_expr(
x,
(
[f(x) for f in additional_filtering_criteria]
if additional_filtering_criteria is not None
else None
),
)
)


def add_most_severe_csq_to_tc_within_vep_root(
Expand Down

0 comments on commit de65b83

Please sign in to comment.