Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the max genetic distance between parents at a breakpoint #163

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 38 additions & 1 deletion sc2ts/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,16 @@
from . import core
from . import lineages

def pairwise(iterable):
"""
s -> (s0,s1), (s1,s2), (s2, s3), ...
We can replace this with itertools.pairwise once using min Python 3.10
"""

a, b = itertools.tee(iterable)
next(b, None)
return zip(a, b)


@numba.njit
def _get_root_path(parent, node):
Expand Down Expand Up @@ -247,6 +257,15 @@ def is_parent_lineage_consistent(self):
== arg.parent_imputed_lineages
)

def fwd_bkwd_hmm_parents(self):
"""
Returns successive tuples of (forward parent 1, backward parent 1), etc
"""
for parents in itertools.zip_longest(
self.hmm_runs["forward"].parents, self.hmm_runs["backward"].parents
):
yield np.array(parents)

def asdict(self):
return dataclasses.asdict(self)

Expand Down Expand Up @@ -806,13 +825,30 @@ def export_recombinant_breakpoints(self):
break, and their MRCA.

Recombinants with multiple breaks are represented by multiple rows.
You can only list the breakpoints in recombination nodes that have 2 parents by
You can list the breakpoints in recombination nodes that have only 2 parents by
doing e.g. df.drop_duplicates('node', keep=False)
"""
recombs = self.combine_recombinant_info()
data = []
parents = set()
# much quicker to get all the haplotype calculations in a single sweep:
for rec in recombs:
if rec.is_path_length_consistent():
parents.update(u for fwd_bkwd in rec.fwd_bkwd_hmm_parents() for u in fwd_bkwd)
parents = {k: index for index, k in enumerate(parents)} # Convert to a mapping
H = self.ts.genotype_matrix(samples=list(parents.keys()), isolated_as_missing=False).T

for rec in recombs:
arg = rec.arg_info
fwd_bck_parents_max_dist = np.full(len(arg.parents) -1, np.nan)
if rec.is_path_length_consistent():
# Calculate the sequence difference between fwd and bkwd parents
seq_diff = [
np.sum(H[parents[fwd], :] != H[parents[bwd], :])
for fwd, bwd in rec.fwd_bkwd_hmm_parents()
]
fwd_bck_parents_max_dist = np.array(
[max(parent_pair) for parent_pair in pairwise(seq_diff)], dtype=int)
for j in range(len(arg.parents) - 1):
row = rec.data_summary()
mrca = arg.mrcas[j]
Expand All @@ -825,6 +861,7 @@ def export_recombinant_breakpoints(self):
row["right_parent_imputed_lineage"] = arg.parent_imputed_lineages[j + 1]
row["mrca"] = mrca
row["mrca_date"] = self.nodes_date[mrca]
row["fwd_bck_parents_max_dist"] = fwd_bck_parents_max_dist[j]
data.append(row)
return pd.DataFrame(sorted(data, key=operator.itemgetter("node")))

Expand Down