Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mutational load function (SHM) #536

Open
wants to merge 24 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
9f8558f
Added mutational_load function to calculate differences between seque…
MKanetscheider Aug 9, 2024
4755736
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 9, 2024
4b020b6
Merge branch 'scverse:main' into mutational_load
MKanetscheider Aug 13, 2024
56a8594
Rewrote mutational_load function based on previous feedback and added…
MKanetscheider Aug 15, 2024
c599a39
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 15, 2024
5c7c92c
Fixed an issue with pre-commit
MKanetscheider Aug 15, 2024
c84708c
Merge branch 'mutational_load' of https://github.com/MKanetscheider/s…
MKanetscheider Aug 15, 2024
12ada2f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 15, 2024
a416e06
Further optimized mutational_load function and formating of docstring…
MKanetscheider Aug 18, 2024
c0e795c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 18, 2024
9793062
Fixed small issues with the code layout as suggested by grst
MKanetscheider Aug 20, 2024
d12f7b5
Merge branch 'scverse:main' into mutational_load
MKanetscheider Aug 26, 2024
906df48
Merge branch 'mutational_load' of https://github.com/MKanetscheider/s…
MKanetscheider Aug 26, 2024
196177e
Added a first beta-test case, which revealed some bugs that were also…
MKanetscheider Aug 29, 2024
11101a5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 29, 2024
cf53b72
Specified 'except' condition
MKanetscheider Aug 29, 2024
9c6a56c
Merge branch 'mutational_load' of https://github.com/MKanetscheider/s…
MKanetscheider Aug 29, 2024
e5c4d76
Merge branch 'main' into mutational_load
MKanetscheider Aug 30, 2024
ea80d69
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 30, 2024
e662e36
Merge branch 'main' into mutational_load
MKanetscheider Oct 15, 2024
ae9563b
Add notebook section about somatic hypermutation
grst Oct 17, 2024
4ad7d59
Merge branch 'main' into mutational_load
grst Nov 20, 2024
8abcd01
Update SHM description text in tutorial
grst Nov 20, 2024
42f79da
Merge branch 'main' into mutational_load
grst Nov 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/scirpy/tl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,6 @@
from ._diversity import alpha_diversity
from ._group_abundance import group_abundance
from ._ir_query import ir_query, ir_query_annotate, ir_query_annotate_df
from ._mutational_load import mutational_load
grst marked this conversation as resolved.
Show resolved Hide resolved
from ._repertoire_overlap import repertoire_overlap
from ._spectratype import spectratype
278 changes: 278 additions & 0 deletions src/scirpy/tl/_mutational_load.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,278 @@
from collections.abc import Sequence
from typing import Literal, Union

import pandas as pd

from scirpy.get import airr as get_airr
from scirpy.util import DataHandler


def simple_hamming_distance(sequence: str, germline: str, frequency=False):
if (sequence is None) or (germline is None):
return None
if len(sequence) != len(germline):
raise Exception("Sequences might not be IMGT aligned")

distance = 0
if frequency:
frac_counter = len(germline)
for i, c in enumerate(germline):
if (c == ".") or (c == "N"):
frac_counter -= 1
continue
elif (sequence[i] == ".") or (sequence[i] == "N"):
frac_counter -= 1
continue
elif c != sequence[i]:
distance += 1
if frac_counter == 0:
return frac_counter
else:
return distance / frac_counter

else:
for i, c in enumerate(germline):
if (c == ".") or (c == "N"):
continue
if (sequence[i] == ".") or (sequence[i] == "N"):
continue
elif c != sequence[i]:
distance += 1

return distance
grst marked this conversation as resolved.
Show resolved Hide resolved


@DataHandler.inject_param_docs()
def mutational_load(
adata: DataHandler.TYPE,
*,
airr_mod="airr",
airr_key="airr",
chain_idx_key="chain_indices",
sequence_alignment: str = "sequence_alignment",
germline_alignment: str = "germline_alignment_d_mask",
chains: Union[
Literal["VJ_1", "VDJ_1", "VJ_2", "VDJ_2"],
Sequence[Literal["VJ_1", "VDJ_1", "VJ_2", "VDJ_2"]],
] = "VDJ_1",
region: Literal["IMGT_V(D)J", "IMGT_V_segment", "subregion"] = "IMGT_VDJ",
junction_col: str = "junction",
frequency: bool = True,
inplace: bool = True,
) -> Union[None, pd.DataFrame]:
"""\
Calculates observable mutation by comparing sequence with germline alignment and counting differences
Behaviour: N- (masked/decayed base calls) and .-(gaps) characters are ignored

Parameters
----------
{adata}
{airr_mod}
{airr_key}
chain_idx_key
key to select chain indices
sequence_alignment
Awkward array key to access sequence alignment information
germline_alignment
Awkward array key to access sequence alignment information -> best practice to select D gene-masked alignment
chains
One or multiple chains from which to use CDR3 sequences
region
Specify the way mutations are calculated according to IMGT unique numbering scheme (https://doi.org/10.1016/S0145-305X(02)00039-3)
IMGT_V(D)J ->
IMGT_V_segment ->
subregion ->
junction_col
Awkward array key to access junction region information
frequency
Specify to obtain either total or relative counts
{inplace}

Returns
-------
Depending on the value of inplace adds a column to adata or returns a pd.DataFrame
"""
params = DataHandler(adata, airr_mod, airr_key, chain_idx_key)
airr_df = get_airr(params, [sequence_alignment, germline_alignment, junction_col], chains)
if not inplace:
mutation_df = pd.DataFrame(index=airr_df.index)

if region == "IMGT_V(D)J":
for chain in chains:
mutations = []
for row in range(len(airr_df)):
#
if frequency:
rel_mutation = simple_hamming_distance(
airr_df.iloc[row].loc[f"{chain}_{sequence_alignment}"],
airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"],
frequency=True,
)
mutations.append(rel_mutation)

else:
count_mutation = simple_hamming_distance(
airr_df.iloc[row].loc[f"{chain}_{sequence_alignment}"],
airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"],
)

mutations.append(count_mutation)

if inplace and frequency:
params.set_obs(f"{chain}_V(D)J_mu_freq", mutations)

if not inplace and frequency:
mutation_df[f"{chain}_V(D)J_mu_freq"] = mutations

if inplace and not frequency:
params.set_obs(f"{chain}_V(D)J_mu_count", mutations)

if not inplace and not frequency:
mutation_df[f"{chain}_V(D)J_mu_count"] = mutations

if not inplace:
return mutation_df

# calculate SHM up to nucleotide 312. Referring to the IMGT unique numbering scheme this includes:
# fwr1, cdr1, fwr2, cdr2, fwr3, but not cdr3 and fwr4
if region == "IMGT_V_segment":
for chain in chains:
mutations = []
for row in range(len(airr_df)):
v_region_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][:312]
v_region_sequence = airr_df.iloc[row].loc[f"{chain}_{sequence_alignment}"][:312]

if frequency:
rel_mutation = simple_hamming_distance(v_region_sequence, v_region_germline, frequency=True)
mutations.append(rel_mutation)
else:
count_mutation = simple_hamming_distance(v_region_sequence, v_region_germline)
mutations.append(count_mutation)

if inplace and frequency:
params.set_obs(f"{chain}_v_segment_mu_freq", mutations)

if not inplace and frequency:
mutation_df[f"{chain}_v_segment_mu_freq"] = mutations

if inplace and not frequency:
params.set_obs(f"{chain}_v_segment_mu_count", mutations)

if not inplace and not frequency:
mutation_df[f"{chain}_v_segment_mu_count"] = mutations

if not inplace:
return mutation_df

if region == "subregion":
subregion_df = get_airr(params, ["fwr1", "fwr2", "fwr3", "fwr4", "cdr1", "cdr2", "cdr3"], chains)
for chain in chains:
airr_df[f"{chain}_junction_len"] = [len(a) for a in airr_df[f"{chain}_junction"]]

mutation_fwr1 = []
mutation_fwr2 = []
mutation_fwr3 = []
mutation_fwr4 = []
mutation_cdr1 = []
mutation_cdr2 = []
mutation_cdr3 = []

for row in range(len(airr_df)):
fwr1_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][:78]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where do the numbers of the indices come from? Can we be sure they will remain stable?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These indices come from the IMGT unique numbering scheme (https://pubmed.ncbi.nlm.nih.gov/12477501/). This scheme is a standard approach to ensure that we can compare different V-regions of different cells. The neat thing is that sequences are aligned in a way that fwr 1-3 and cdr1-2 are always on the same spot in the germline and sequence alignment that's why these fixed indices work. cdr3 and fwr4 can be inferred by knowing the junction length and total sequence length as it is used in my code.

cdr1_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][78:114]
fwr2_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][114:165]
cdr2_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][165:195]
fwr3_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][195:312]
cdr3_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][
312 : (312 + airr_df.iloc[row].loc[f"{chain}_junction_len"] - 6)
]
fwr4_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][
(312 + airr_df.iloc[row].loc[f"{chain}_junction_len"] - 6) :
]

if frequency:
fwr1_mu_rel = simple_hamming_distance(
subregion_df.iloc[row].loc[f"{chain}_fwr1"], fwr1_germline, frequency=True
)
cdr1_mu_rel = simple_hamming_distance(
subregion_df.iloc[row].loc[f"{chain}_cdr1"], cdr1_germline, frequency=True
)
fwr2_mu_rel = simple_hamming_distance(
subregion_df.iloc[row].loc[f"{chain}_fwr2"], fwr2_germline, frequency=True
)
cdr2_mu_rel = simple_hamming_distance(
subregion_df.iloc[row].loc[f"{chain}_cdr2"], cdr2_germline, frequency=True
)
fwr3_mu_rel = simple_hamming_distance(
subregion_df.iloc[row].loc[f"{chain}_fwr3"], fwr3_germline, frequency=True
)
cdr3_mu_rel = simple_hamming_distance(
subregion_df.iloc[row].loc[f"{chain}_cdr3"], cdr3_germline, frequency=True
)
fwr4_mu_rel = simple_hamming_distance(
subregion_df.iloc[row].loc[f"{chain}_fwr4"], fwr4_germline, frequency=True
)

mutation_fwr1.append(fwr1_mu_rel)
mutation_fwr2.append(fwr2_mu_rel)
mutation_fwr3.append(fwr3_mu_rel)
mutation_fwr4.append(fwr4_mu_rel)
mutation_cdr1.append(cdr1_mu_rel)
mutation_cdr2.append(cdr2_mu_rel)
mutation_cdr3.append(cdr3_mu_rel)

else:
fwr1_mu_count = simple_hamming_distance(subregion_df.iloc[row].loc[f"{chain}_fwr1"], fwr1_germline)
cdr1_mu_count = simple_hamming_distance(subregion_df.iloc[row].loc[f"{chain}_cdr1"], cdr1_germline)
fwr2_mu_count = simple_hamming_distance(subregion_df.iloc[row].loc[f"{chain}_fwr2"], fwr2_germline)
cdr2_mu_count = simple_hamming_distance(subregion_df.iloc[row].loc[f"{chain}_cdr2"], cdr2_germline)
fwr3_mu_count = simple_hamming_distance(subregion_df.iloc[row].loc[f"{chain}_fwr3"], fwr3_germline)
cdr3_mu_count = simple_hamming_distance(subregion_df.iloc[row].loc[f"{chain}_cdr3"], cdr3_germline)
fwr4_mu_count = simple_hamming_distance(subregion_df.iloc[row].loc[f"{chain}_fwr4"], fwr4_germline)

mutation_fwr1.append(fwr1_mu_count)
mutation_fwr2.append(fwr2_mu_count)
mutation_fwr3.append(fwr3_mu_count)
mutation_fwr4.append(fwr4_mu_count)
mutation_cdr1.append(cdr1_mu_count)
mutation_cdr2.append(cdr2_mu_count)
mutation_cdr3.append(cdr3_mu_count)

if not inplace and frequency:
mutation_df[f"{chain}_fwr1_mu_freq"] = mutation_fwr1
mutation_df[f"{chain}_cdr1_mu_freq"] = mutation_cdr1
mutation_df[f"{chain}_fwr2_mu_freq"] = mutation_fwr2
mutation_df[f"{chain}_cdr2_mu_freq"] = mutation_cdr2
mutation_df[f"{chain}_fwr3_mu_freq"] = mutation_fwr3
mutation_df[f"{chain}_cdr3_mu_freq"] = mutation_cdr3
mutation_df[f"{chain}_fwr4_mu_freq"] = mutation_fwr4

if inplace and frequency:
params.set_obs(f"{chain}_fwr1_mu_freq", mutation_fwr1)
params.set_obs(f"{chain}_cdr1_mu_freq", mutation_cdr1)
params.set_obs(f"{chain}_fwr2_mu_freq", mutation_fwr2)
params.set_obs(f"{chain}_cdr2_mu_freq", mutation_cdr2)
params.set_obs(f"{chain}_fwr3_mu_freq", mutation_fwr3)
params.set_obs(f"{chain}_cdr3_mu_freq", mutation_cdr3)
params.set_obs(f"{chain}_fwr4_mu_freq", mutation_fwr4)

if inplace and not frequency:
params.set_obs(f"{chain}_fwr1_mu_count", mutation_fwr1)
params.set_obs(f"{chain}_cdr1_mu_count", mutation_cdr1)
params.set_obs(f"{chain}_fwr2_mu_count", mutation_fwr2)
params.set_obs(f"{chain}_cdr2_mu_count", mutation_cdr2)
params.set_obs(f"{chain}_fwr3_mu_count", mutation_fwr3)
params.set_obs(f"{chain}_cdr3_mu_count", mutation_cdr3)
params.set_obs(f"{chain}_fwr4_mu_count", mutation_fwr4)

if not inplace and not frequency:
mutation_df[f"{chain}_fwr1_mu_count"] = mutation_fwr1
mutation_df[f"{chain}_cdr1_mu_count"] = mutation_cdr1
mutation_df[f"{chain}_fwr2_mu_count"] = mutation_fwr2
mutation_df[f"{chain}_cdr2_mu_count"] = mutation_cdr2
mutation_df[f"{chain}_fwr3_mu_count"] = mutation_fwr3
mutation_df[f"{chain}_cdr3_mu_count"] = mutation_cdr3
mutation_df[f"{chain}_fwr4_mu_count"] = mutation_fwr4
grst marked this conversation as resolved.
Show resolved Hide resolved

if not inplace:
return mutation_df
Loading