Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cross coalescence rates: advanced analysis tutorial. #277

Open
hyanwong opened this issue Jul 24, 2024 · 1 comment
Open

Cross coalescence rates: advanced analysis tutorial. #277

hyanwong opened this issue Jul 24, 2024 · 1 comment

Comments

@hyanwong
Copy link
Member

@nspope sent me this, which is a good core for a new tutorial on cross coalescence rates. Probably worth waiting until some of the core functionality (e.g. rates_from_ecdf) is implemented in tskit.

import stdpopsim
import numpy as np
import matplotlib.pyplot as plt

def rates_from_ecdf(weights, atoms, quantiles):
    """
    Estimate rates from weighted time-to-event data
    """
    assert weights.size == atoms.size
    assert quantiles.size > 2
    assert quantiles[0] == 0.0 and quantiles[-1] == 1.0

    # sort and filter inputs
    time_order = np.argsort(atoms)
    weights = weights[time_order]
    atoms = atoms[time_order]
    nonzero = weights > 0
    atoms = atoms[nonzero]
    weights = weights[nonzero]

    # find interior quantiles
    weights = np.append(0, weights)
    atoms = np.append(0, atoms)
    ecdf = np.cumsum(weights)
    indices = np.searchsorted(ecdf, ecdf[-1] * quantiles[1:-1], side='left')
    lower, upper = atoms[indices - 1], atoms[indices]
    ecdfl, ecdfu = ecdf[indices - 1] / ecdf[-1], ecdf[indices] / ecdf[-1]

    # interpolate ECDF
    assert np.all(ecdfu - ecdfl > 0)
    slope = (upper - lower) / (ecdfu - ecdfl)
    breaks = np.append(0, lower + slope * (quantiles[1:-1] - ecdfl))

    # calculate coalescence rates within intervals
    # this uses a Kaplan-Meier-type censored estimator: https://github.com/tskit-dev/tskit/pull/2119
    coalrate = np.full(quantiles.size - 1, np.nan)
    propcoal = np.diff(quantiles[:-1]) / (1 - quantiles[:-2])
    coalrate[:-1] = -np.log(1 - propcoal) / np.diff(breaks)
    last = indices[-1]
    coalrate[-1] = np.sum(weights[last:]) / np.dot(atoms[last:] - breaks[-1], weights[last:]) 

    return coalrate, breaks


# --- on data

import stdpopsim
engine = stdpopsim.get_engine("msprime")
homsap = stdpopsim.get_species("HomSap")
demogr = homsap.get_demographic_model("OutOfAfrica_4J17")
contig = homsap.get_contig("chr17", genetic_map='HapMapII_GRCh38', right=1e7)
sample = {"CEU" : 100, "YRI" : 100, "JPT" : 100, "CHB" : 100}
ts = engine.simulate(demogr, contig, sample, seed=1024).trim()

pop_idx = {p.metadata['name'] : i for i, p in enumerate(ts.populations())}
sample_sets = [
    np.flatnonzero(ts.nodes_population[:ts.num_samples] == i) 
    for i in range(ts.num_populations)
]
quantiles = np.linspace(0.0, 1.0, 25)
indexes = [(pop_idx["CEU"], pop_idx["CHB"])] # CEU vs CHB
node_coal_weight = ts.pair_coalescence_counts(sample_sets, indexes=indexes).flatten()
rates, breaks = rates_from_ecdf(node_coal_weight, ts.nodes_time, quantiles)


# sanity check: mean of cross-coalescence time distribution should equal branch-mode divergence
print(
    "Mean of cross-coalescence time distr: ",
    2 * node_coal_weight @ ts.nodes_time / np.sum(node_coal_weight),
    "Branch-mode divergence: ",
    ts.divergence(sample_sets, indexes=indexes, mode='branch'),
)


# sanity check: compare estimated rates to expectation under model
theory_rates = demogr.model.debug().coalescence_rate_trajectory(breaks, {"CEU":1, "CHB":1})[0]
plt.clf()
plt.step(breaks, rates, color='firebrick', label='estimated')
plt.step(breaks, theory_rates, color='black', label='theoretical')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Generations in past')
plt.ylabel('Cross-coalescence rate')
plt.legend()
plt.savefig('rate-check.png')


# as an aside: rates are a 1-to-1 transformation of the pair coalescence time distribution. We can plot this distribution using e.g. a weighted density estimate or histogram.
hist_density, hist_breaks = np.histogram(
    np.log10(ts.nodes_time[ts.num_samples:]), 
    weights=node_coal_weight[ts.num_samples:],
    bins=25,
    density=True,
)
plt.clf()
plt.fill_between(hist_breaks[:-1], hist_density, step='pre', alpha=0.5)
plt.xlabel('log10 generations in past')
plt.ylabel('Density')
plt.savefig('rate-hist.png')

# this also gives a very easy way to get coalescence rates if we have a fixed timegrid
# (but, rates will be noisier in bins with few nodes, and have to watch for nan)
hist_density, hist_breaks = np.histogram(
    ts.nodes_time, # note natural time scale
    weights=node_coal_weight,
    bins=np.logspace(2, 5, 25),
    density=False,
)
hist_density /= hist_density.sum()
hist_rates = -np.log(1 - hist_density / (1 - np.cumsum(hist_density))) / np.diff(hist_breaks)
hist_theory_rates = demogr.model.debug().coalescence_rate_trajectory(hist_breaks, {"CEU":1, "CHB":1})[0]
hist_theory_rates = hist_theory_rates[1:] # remove [0, first break]

plt.clf()
plt.step(hist_breaks[:-1], hist_rates, color='firebrick', label='estimated')
plt.step(hist_breaks[:-1], hist_theory_rates, color='black', label='theoretical')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Generations in past')
plt.ylabel('Cross-coalescence rate')
plt.legend()
plt.savefig('rate-check-2.png')
@nspope
Copy link

nspope commented Jul 24, 2024

just a note that

print(
    "Mean of cross-coalescence time distr: ",
    2 * node_coal_weight @ ts.nodes_time / np.sum(node_coal_weight),
    "Branch-mode divergence: ",
    ts.divergence(sample_sets, indexes=indexes, mode='branch'),
)

will only match if the tree sequence is trimmed first (which it isn't in this example)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants