Skip to content

Commit

Permalink
Improve H&K genedrop performance
Browse files Browse the repository at this point in the history
  • Loading branch information
timothymillar committed Dec 3, 2023
1 parent 9886852 commit d6df007
Show file tree
Hide file tree
Showing 3 changed files with 459 additions and 174 deletions.
129 changes: 6 additions & 123 deletions sgkit/stats/genedrop.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,138 +7,19 @@
from xarray import Dataset

from sgkit import variables
from sgkit.accelerate import numba_guvectorize, numba_jit
from sgkit.typing import ArrayLike
from sgkit.utils import (
conditional_merge_datasets,
create_dataset,
define_variable_if_absent,
)

from .pedigree import (
_compress_hamilton_kerr_parameters,
parent_indices,
topological_argsort,
)
from .pedigree import parent_indices

EST_DIPLOID = "diploid"
EST_HAMILTON_KERR = "Hamilton-Kerr"


@numba_guvectorize( # type: ignore
[
"void(int8[:,:], int64[:,:], uint32, int8[:,:])",
"void(int16[:,:], int64[:,:], uint32, int16[:,:])",
"void(int32[:,:], int64[:,:], uint32, int32[:,:])",
"void(int64[:,:], int64[:,:], uint32, int64[:,:])",
],
"(n,k),(n,p),()->(n,k)",
)
def genedrop_diploid(
genotypes: ArrayLike,
parent: ArrayLike,
seed: ArrayLike,
out: ArrayLike,
) -> None: # pragma: no cover
n_sample, n_parent = parent.shape
_, ploidy = genotypes.shape
if n_parent != 2:
raise ValueError("The parents dimension must be length 2")
if ploidy != 2:
raise ValueError("Genotypes are not diploid")
order = topological_argsort(parent)
np.random.seed(seed)
for i in range(n_sample):
t = order[i]
unknown_parent = 0
for j in range(n_parent):
p = parent[t, j]
if p < 0:
# founder
unknown_parent += 1
else:
idx = np.random.randint(2)
out[t, j] = out[p, idx]
if unknown_parent == 1:
raise ValueError("Pedigree contains half-founders")
elif unknown_parent == 2:
# copy founder
out[t, 0] = genotypes[t, 0]
out[t, 1] = genotypes[t, 1]


@numba_jit(nogil=True)
def _random_gamete_Hamilton_Kerr(
genotype: ArrayLike, ploidy: int, tau: int, lambda_: float
) -> ArrayLike:
if ploidy < len(genotype):
# remove fill values
genotype = genotype[genotype > -2]
if ploidy != len(genotype):
raise ValueError("Genotype ploidy does not match number of alleles")
if tau > ploidy:
# TODO: this can be an option for encoding somatic genome duplication (with suitable lambda)
raise NotImplementedError("Gamete tau cannot exceed parental ploidy")
gamete = np.random.choice(genotype, tau, replace=False)
if lambda_ > 0.0:
if tau == 2:
if np.random.rand() <= lambda_:
gamete[1] = gamete[0]
elif tau != 2:
raise NotImplementedError("Non-zero lambda is only implemented for tau = 2")
return gamete


@numba_guvectorize( # type: ignore
[
"void(int8[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int8[:,:])",
"void(int16[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int16[:,:])",
"void(int32[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int32[:,:])",
"void(int64[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int64[:,:])",
],
"(n,k),(n,p),(n,p),(n,p),()->(n,k)",
)
def genedrop_Hamilton_Kerr(
genotypes: ArrayLike,
parent: ArrayLike,
tau: ArrayLike,
lambda_: ArrayLike,
seed: int,
out: ArrayLike,
) -> None: # pragma: no cover
if parent.shape[1] != 2:
parent, tau, lambda_ = _compress_hamilton_kerr_parameters(parent, tau, lambda_)
n_sample, n_parent = parent.shape
_, max_ploidy = genotypes.shape
order = topological_argsort(parent)
np.random.seed(seed)
for i in range(n_sample):
t = order[i]
alleles = 0
unknown_alleles = 0
for j in range(n_parent):
p = parent[t, j]
tau_p = tau[t, j]
lambda_p = lambda_[t, j]
if p < 0:
unknown_alleles += tau_p
elif tau_p > 0:
ploidy_p = tau[p, 0] + tau[p, 1]
gamete = _random_gamete_Hamilton_Kerr(out[p], ploidy_p, tau_p, lambda_p)
out[t, alleles : alleles + tau_p] = gamete
alleles += tau_p
if unknown_alleles == 0:
if alleles < max_ploidy:
# pad with fill value
out[t, alleles:] = -2
elif unknown_alleles == alleles:
# founder
out[t] = genotypes[t]
else:
# partial founder
raise ValueError("Pedigree contains half-founders")


def simulate_genedrop(
ds: Dataset,
*,
Expand All @@ -165,7 +46,7 @@ def simulate_genedrop(
which is only suitable for pedigrees in which all samples are
diploids resulting from sexual reproduction.
The "Hamilton-Kerr" method is suitable for autopolyploid and
mixed-ploidy datasets following Kerr et al. 2012 and [2]
mixed-ploidy datasets following Kerr et al. 2012 [2] and
Hamilton and Kerr 2017 [3].
call_genotype
Input variable name holding call_genotype as defined by
Expand Down Expand Up @@ -217,8 +98,8 @@ def simulate_genedrop(
If the Hamilton-Kerr method is used and a sample has more than
two contributing parents.
ValueError
If the Hamilton-Kerr method is used and the number of alleles
in a genotype does not match the sum of tau values (i.e., ploidy).
If the Hamilton-Kerr method is used and the number of alleles in a
founder genotype does not match the sum of its tau values (i.e., ploidy).
NotImplementedError
If the Hamilton-Kerr method is used and a tau value exceeds the
parental ploidy.
Expand Down Expand Up @@ -295,6 +176,8 @@ def simulate_genedrop(
"Computation of the inverse additive relationship matrix for autopolyploid
and multiple-ploidy populations." Theoretical and Applied Genetics 131: 851-860.
"""
from .genedrop_numba_fns import genedrop_diploid, genedrop_Hamilton_Kerr

ds = define_variable_if_absent(ds, variables.parent, parent, parent_indices)
variables.validate(
ds, {parent: variables.parent_spec, call_genotype: variables.call_genotype_spec}
Expand Down
169 changes: 169 additions & 0 deletions sgkit/stats/genedrop_numba_fns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
import numpy as np

from sgkit.accelerate import numba_guvectorize, numba_jit
from sgkit.typing import ArrayLike

from .pedigree import _compress_hamilton_kerr_parameters, topological_argsort


@numba_guvectorize( # type: ignore
[
"void(int8[:,:], int64[:,:], uint32, int8[:,:])",
"void(int16[:,:], int64[:,:], uint32, int16[:,:])",
"void(int32[:,:], int64[:,:], uint32, int32[:,:])",
"void(int64[:,:], int64[:,:], uint32, int64[:,:])",
],
"(n,k),(n,p),()->(n,k)",
)
def genedrop_diploid(
genotypes: ArrayLike,
parent: ArrayLike,
seed: ArrayLike,
out: ArrayLike,
) -> None: # pragma: no cover
n_sample, n_parent = parent.shape
_, ploidy = genotypes.shape
if n_parent != 2:
raise ValueError("The parents dimension must be length 2")
if ploidy != 2:
raise ValueError("Genotypes are not diploid")
order = topological_argsort(parent)
np.random.seed(seed)
for i in range(n_sample):
t = order[i]
unknown_parent = 0
for j in range(n_parent):
p = parent[t, j]
if p < 0:
# founder
unknown_parent += 1
else:
idx = np.random.randint(2)
out[t, j] = out[p, idx]
if unknown_parent == 1:
raise ValueError("Pedigree contains half-founders")
elif unknown_parent == 2:
# copy founder
out[t, 0] = genotypes[t, 0]
out[t, 1] = genotypes[t, 1]


@numba_jit(nogil=True)
def _random_inheritance_Hamilton_Kerr(
genotypes: ArrayLike,
parent: ArrayLike,
tau: ArrayLike,
lambda_: ArrayLike,
marked: ArrayLike,
i: int,
):
_, n_parent = parent.shape
_, max_ploidy = genotypes.shape
next_allele = 0
ploidy_i = 0
for j in range(n_parent):
p = parent[i, j]
tau_p = tau[i, j]
ploidy_i += tau_p
if p < 0:
# unknown parent
continue
lambda_p = lambda_[i, j]
ploidy_p = tau[p, 0] + tau[p, 1]
if tau_p > ploidy_p:
raise NotImplementedError("Gamete tau cannot exceed parental ploidy.")
if lambda_p > 0.0:
if tau_p != 2:
raise NotImplementedError(
"Non-zero lambda is only implemented for tau = 2."
)
homozygous_gamete = np.random.rand() < lambda_p
else:
homozygous_gamete = False
if homozygous_gamete:
# diploid gamete with duplicated allele
uniform = np.random.rand()
choice = int(uniform * ploidy_p)
for k in range(max_ploidy):
parent_allele = genotypes[p, k]
if parent_allele < -1:
# non-allele
pass
elif choice > 0:
# not the chosen allele
choice -= 1
else:
# chosen allele is duplicated
genotypes[i, next_allele] = parent_allele
genotypes[i, next_allele + 1] = parent_allele
next_allele += 2
break
else:
# random alleles without replacement
marked[:] = False
for h in range(tau_p):
uniform = np.random.rand()
scale = ploidy_p - h
choice = int(uniform * scale)
k = 0
while choice >= 0:
parent_allele = genotypes[p, k]
if marked[k] > 0:
# already inherited
pass
elif parent_allele < -1:
# non-allele
pass
elif choice > 0:
# not the chosen allele
choice -= 1
else:
# chosen allele
genotypes[i, next_allele] = parent_allele
marked[k] = True
next_allele += 1
choice -= 1
k += 1
if next_allele == 0:
# full founder requires ploidy validation
alleles_i = 0
for k in range(max_ploidy):
if genotypes[i, k] >= -1:
alleles_i += 1
if alleles_i != ploidy_i:
raise ValueError("Genotype ploidy does not match number of alleles.")
elif next_allele != ploidy_i:
raise ValueError("Pedigree contains half-founders.")
elif next_allele < max_ploidy:
# pad with non-alleles
genotypes[i, next_allele:] = -2


@numba_guvectorize( # type: ignore
[
"void(int8[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int8[:,:])",
"void(int16[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int16[:,:])",
"void(int32[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int32[:,:])",
"void(int64[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int64[:,:])",
],
"(n,k),(n,p),(n,p),(n,p),()->(n,k)",
)
def genedrop_Hamilton_Kerr(
genotypes: ArrayLike,
parent: ArrayLike,
tau: ArrayLike,
lambda_: ArrayLike,
seed: int,
out: ArrayLike,
) -> None: # pragma: no cover
if parent.shape[1] != 2:
parent, tau, lambda_ = _compress_hamilton_kerr_parameters(parent, tau, lambda_)
out[:] = genotypes
n_sample, _ = parent.shape
_, max_ploidy = genotypes.shape
order = topological_argsort(parent)
marked = np.zeros(max_ploidy, dtype=np.bool8)
np.random.seed(seed)
for idx in range(n_sample):
i = order[idx]
_random_inheritance_Hamilton_Kerr(out, parent, tau, lambda_, marked, i)
Loading

0 comments on commit d6df007

Please sign in to comment.