Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mutational load function (SHM) #536

Open
wants to merge 24 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
9f8558f
Added mutational_load function to calculate differences between seque…
MKanetscheider Aug 9, 2024
4755736
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 9, 2024
4b020b6
Merge branch 'scverse:main' into mutational_load
MKanetscheider Aug 13, 2024
56a8594
Rewrote mutational_load function based on previous feedback and added…
MKanetscheider Aug 15, 2024
c599a39
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 15, 2024
5c7c92c
Fixed an issue with pre-commit
MKanetscheider Aug 15, 2024
c84708c
Merge branch 'mutational_load' of https://github.com/MKanetscheider/s…
MKanetscheider Aug 15, 2024
12ada2f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 15, 2024
a416e06
Further optimized mutational_load function and formating of docstring…
MKanetscheider Aug 18, 2024
c0e795c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 18, 2024
9793062
Fixed small issues with the code layout as suggested by grst
MKanetscheider Aug 20, 2024
d12f7b5
Merge branch 'scverse:main' into mutational_load
MKanetscheider Aug 26, 2024
906df48
Merge branch 'mutational_load' of https://github.com/MKanetscheider/s…
MKanetscheider Aug 26, 2024
196177e
Added a first beta-test case, which revealed some bugs that were also…
MKanetscheider Aug 29, 2024
11101a5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 29, 2024
cf53b72
Specified 'except' condition
MKanetscheider Aug 29, 2024
9c6a56c
Merge branch 'mutational_load' of https://github.com/MKanetscheider/s…
MKanetscheider Aug 29, 2024
e5c4d76
Merge branch 'main' into mutational_load
MKanetscheider Aug 30, 2024
ea80d69
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 30, 2024
e662e36
Merge branch 'main' into mutational_load
MKanetscheider Oct 15, 2024
ae9563b
Add notebook section about somatic hypermutation
grst Oct 17, 2024
4ad7d59
Merge branch 'main' into mutational_load
grst Nov 20, 2024
8abcd01
Update SHM description text in tutorial
grst Nov 20, 2024
42f79da
Merge branch 'main' into mutational_load
grst Nov 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,12 @@ V(D)J gene usage

tl.spectratype

Calculating mutations
^^^^^^^^^^^^^^^^^^^^^
.. autosummary::
:toctree: ./generated

tl.mutational_load

Plotting: `pl`
--------------
Expand Down
1 change: 1 addition & 0 deletions src/scirpy/tl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,6 @@
from ._diversity import alpha_diversity
from ._group_abundance import group_abundance
from ._ir_query import ir_query, ir_query_annotate, ir_query_annotate_df
from ._mutational_load import mutational_load
grst marked this conversation as resolved.
Show resolved Hide resolved
from ._repertoire_overlap import repertoire_overlap
from ._spectratype import spectratype
227 changes: 227 additions & 0 deletions src/scirpy/tl/_mutational_load.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
from collections.abc import Sequence
from typing import Literal, Union

import numpy as np
import pandas as pd

from scirpy.get import airr as get_airr
from scirpy.util import DataHandler


def simple_hamming_distance(sequence: str, germline: str, frequency: bool, ignore_chars: list[str]):
if not sequence or not germline or len(sequence) != len(germline):
raise ValueError("Sequences might not be IMGT aligned")

distance = 0
num_chars = len(sequence)

for l1, l2 in zip(sequence, germline):
if l1 in ignore_chars or l2 in ignore_chars:
num_chars -= 1

elif l1 != l2:
distance += 1

if num_chars == 0:
return np.nan # can be used as a flag for filtering
Comment on lines +26 to +27
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this basically returns None when no (non-ignored) characters were compared. Could you please elaborate what's the reasoning behind this instead of returning 0?


elif frequency:
return distance / num_chars

else:
return distance


@DataHandler.inject_param_docs()
def mutational_load(
adata: DataHandler.TYPE,
*,
airr_mod="airr",
airr_key="airr",
chain_idx_key="chain_indices",
sequence_alignment: str = "sequence_alignment",
germline_alignment: str = "germline_alignment_d_mask",
chains: Union[
Literal["VJ_1", "VDJ_1", "VJ_2", "VDJ_2"],
Sequence[Literal["VJ_1", "VDJ_1", "VJ_2", "VDJ_2"]],
] = "VDJ_1",
region: Literal["IMGT_V(D)J", "IMGT_V_segment", "subregion"] = "IMGT_VDJ",
junction_col: str = "junction",
frequency: bool = True,
ignore_chars: list[str] = "None",
inplace: bool = True,
) -> Union[None, pd.DataFrame]:
"""\
Calculates observable mutation by comparing sequence with germline alignment and counting differences
Needs germline alignment information, which can be obtained by using the interoperability to Dandelion (https://sc-dandelion.readthedocs.io/en/latest/notebooks/5_dandelion_diversity_and_mutation-10x_data.html)
Behaviour: N- (masked/decayed base calls) and .-(gaps) characters are ignored

Parameters
----------
{adata}
{airr_mod}
{airr_key}
chain_idx_key
key to select chain indices
sequence_alignment
Awkward array key to access sequence alignment information
germline_alignment
Awkward array key to access sequence alignment information -> best practice to select D gene-masked alignment
chains
One or multiple chains from which to use CDR3 sequences
region
Specify the way mutations of the V segment are calculated according to IMGT unique numbering scheme (https://doi.org/10.1016/S0145-305X(02)00039-3)
IMGT_V(D)J -> Includes the entire available sequence and germline alignment without any sub-regions/divisions
IMGT_V_segment -> Only V_segment as defined by the IMGT unique numbering scheme is included => Nucleotide 1 to 312
subregion -> same as IMGT_V(D)J, but distinguish subregions into CDR1-3 and FWR1-4
junction_col
Awkward array key to access junction region information
frequency
Specify to obtain either total or relative counts
ignore_chars
A list of characters to ignore while calculating differences:
N are masked or degraded nucleotide
. are gaps => beneficial to ignore as sometimes more gaps appear in sequence in respect to germline
{inplace}

Returns
-------
Depending on the value of inplace adds a column to adata or returns a pd.DataFrame
"""
params = DataHandler(adata, airr_mod, airr_key, chain_idx_key)
airr_df = get_airr(params, [sequence_alignment, germline_alignment, junction_col], chains)
if not inplace:
mutation_df = pd.DataFrame(index=airr_df.index)

if frequency:
frequency_string = "mu_freq"
else:
frequency_string = "mu_count"
if ignore_chars == "None":
ignore_chars = [".", "N"]

if region == "IMGT_V(D)J":
for chain in chains:
mutations = []
for row in range(len(airr_df)):
mutation = simple_hamming_distance(
airr_df.iloc[row].loc[f"{chain}_{sequence_alignment}"],
airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"],
frequency=frequency,
ignore_chars=ignore_chars,
)
mutations.append(mutation)

if inplace:
params.set_obs(f"{chain}_IMGT_V(D)J_{frequency_string}", mutations)

else:
mutation_df[f"{chain}_IMGT_V(D)J_{frequency_string}"] = mutations
return mutation_df

# calculate SHM up to nucleotide 312. Referring to the IMGT unique numbering scheme this includes:
# fwr1, cdr1, fwr2, cdr2, fwr3, but not cdr3 and fwr4
if region == "IMGT_V_segment":
for chain in chains:
mutations = []
for row in range(len(airr_df)):
v_region_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][:312]
v_region_sequence = airr_df.iloc[row].loc[f"{chain}_{sequence_alignment}"][:312]

mutation = simple_hamming_distance(
v_region_sequence, v_region_germline, frequency=frequency, ignore_chars=ignore_chars
)
mutations.append(mutation)

if inplace:
params.set_obs(f"{chain}_v_segment_{frequency_string}", mutations)

else:
mutation_df[f"{chain}_v_segment_{frequency_string}"] = mutations
return mutation_df

if region == "subregion":
subregion_df = get_airr(params, ["fwr1", "fwr2", "fwr3", "fwr4", "cdr1", "cdr2", "cdr3"], chains)
for chain in chains:
airr_df[f"{chain}_junction_len"] = [len(a) for a in airr_df[f"{chain}_junction"]]

mutation_dict = {"fwr1": [], "fwr2": [], "fwr3": [], "fwr4": [], "cdr1": [], "cdr2": [], "cdr3": []}
grst marked this conversation as resolved.
Show resolved Hide resolved

for row in range(len(airr_df)):
fwr1_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][:78]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where do the numbers of the indices come from? Can we be sure they will remain stable?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These indices come from the IMGT unique numbering scheme (https://pubmed.ncbi.nlm.nih.gov/12477501/). This scheme is a standard approach to ensure that we can compare different V-regions of different cells. The neat thing is that sequences are aligned in a way that fwr 1-3 and cdr1-2 are always on the same spot in the germline and sequence alignment that's why these fixed indices work. cdr3 and fwr4 can be inferred by knowing the junction length and total sequence length as it is used in my code.

cdr1_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][78:114]
fwr2_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][114:165]
cdr2_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][165:195]
fwr3_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][195:312]
cdr3_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][
312 : (312 + airr_df.iloc[row].loc[f"{chain}_junction_len"] - 6)
]
fwr4_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][
(312 + airr_df.iloc[row].loc[f"{chain}_junction_len"] - 6) :
]

mutation_dict["fwr1"].append(
simple_hamming_distance(
subregion_df.iloc[row].loc[f"{chain}_fwr1"],
fwr1_germline,
frequency=frequency,
ignore_chars=ignore_chars,
)
)
mutation_dict["cdr1"].append(
simple_hamming_distance(
subregion_df.iloc[row].loc[f"{chain}_cdr1"],
cdr1_germline,
frequency=frequency,
ignore_chars=ignore_chars,
)
)
mutation_dict["fwr2"].append(
simple_hamming_distance(
subregion_df.iloc[row].loc[f"{chain}_fwr2"],
fwr2_germline,
frequency=frequency,
ignore_chars=ignore_chars,
)
)
mutation_dict["cdr2"].append(
simple_hamming_distance(
subregion_df.iloc[row].loc[f"{chain}_cdr2"],
cdr2_germline,
frequency=frequency,
ignore_chars=ignore_chars,
)
)
mutation_dict["fwr3"].append(
simple_hamming_distance(
subregion_df.iloc[row].loc[f"{chain}_fwr3"],
fwr3_germline,
frequency=frequency,
ignore_chars=ignore_chars,
)
)
mutation_dict["cdr3"].append(
simple_hamming_distance(
subregion_df.iloc[row].loc[f"{chain}_cdr3"],
cdr3_germline,
frequency=frequency,
ignore_chars=ignore_chars,
)
)
mutation_dict["fwr4"].append(
simple_hamming_distance(
subregion_df.iloc[row].loc[f"{chain}_fwr4"],
fwr4_germline,
frequency=frequency,
ignore_chars=ignore_chars,
)
)
grst marked this conversation as resolved.
Show resolved Hide resolved

for key in mutation_dict:
if inplace:
params.set_obs(f"{chain}_{key}_{frequency_string}", mutation_dict[key])
if not inplace:
mutation_df[f"{chain}_{key}_{frequency_string}"] = mutation_dict[key]

if not inplace:
return mutation_df
Loading