Improve H&K genedrop performance

sgkit-dev · Dec 3, 2023 · d6df007 · d6df007
1 parent 9886852
commit d6df007
Show file tree

Hide file tree

Showing 3 changed files with 459 additions and 174 deletions.
diff --git a/sgkit/stats/genedrop.py b/sgkit/stats/genedrop.py
@@ -7,138 +7,19 @@
 from xarray import Dataset
 
 from sgkit import variables
-from sgkit.accelerate import numba_guvectorize, numba_jit
 from sgkit.typing import ArrayLike
 from sgkit.utils import (
     conditional_merge_datasets,
     create_dataset,
     define_variable_if_absent,
 )
 
-from .pedigree import (
-    _compress_hamilton_kerr_parameters,
-    parent_indices,
-    topological_argsort,
-)
+from .pedigree import parent_indices
 
 EST_DIPLOID = "diploid"
 EST_HAMILTON_KERR = "Hamilton-Kerr"
 
 
-@numba_guvectorize(  # type: ignore
-    [
-        "void(int8[:,:], int64[:,:], uint32, int8[:,:])",
-        "void(int16[:,:], int64[:,:], uint32, int16[:,:])",
-        "void(int32[:,:], int64[:,:], uint32, int32[:,:])",
-        "void(int64[:,:], int64[:,:], uint32, int64[:,:])",
-    ],
-    "(n,k),(n,p),()->(n,k)",
-)
-def genedrop_diploid(
-    genotypes: ArrayLike,
-    parent: ArrayLike,
-    seed: ArrayLike,
-    out: ArrayLike,
-) -> None:  # pragma: no cover
-    n_sample, n_parent = parent.shape
-    _, ploidy = genotypes.shape
-    if n_parent != 2:
-        raise ValueError("The parents dimension must be length 2")
-    if ploidy != 2:
-        raise ValueError("Genotypes are not diploid")
-    order = topological_argsort(parent)
-    np.random.seed(seed)
-    for i in range(n_sample):
-        t = order[i]
-        unknown_parent = 0
-        for j in range(n_parent):
-            p = parent[t, j]
-            if p < 0:
-                # founder
-                unknown_parent += 1
-            else:
-                idx = np.random.randint(2)
-                out[t, j] = out[p, idx]
-        if unknown_parent == 1:
-            raise ValueError("Pedigree contains half-founders")
-        elif unknown_parent == 2:
-            # copy founder
-            out[t, 0] = genotypes[t, 0]
-            out[t, 1] = genotypes[t, 1]
-
-
-@numba_jit(nogil=True)
-def _random_gamete_Hamilton_Kerr(
-    genotype: ArrayLike, ploidy: int, tau: int, lambda_: float
-) -> ArrayLike:
-    if ploidy < len(genotype):
-        # remove fill values
-        genotype = genotype[genotype > -2]
-    if ploidy != len(genotype):
-        raise ValueError("Genotype ploidy does not match number of alleles")
-    if tau > ploidy:
-        # TODO: this can be an option for encoding somatic genome duplication (with suitable lambda)
-        raise NotImplementedError("Gamete tau cannot exceed parental ploidy")
-    gamete = np.random.choice(genotype, tau, replace=False)
-    if lambda_ > 0.0:
-        if tau == 2:
-            if np.random.rand() <= lambda_:
-                gamete[1] = gamete[0]
-        elif tau != 2:
-            raise NotImplementedError("Non-zero lambda is only implemented for tau = 2")
-    return gamete
-
-
-@numba_guvectorize(  # type: ignore
-    [
-        "void(int8[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int8[:,:])",
-        "void(int16[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int16[:,:])",
-        "void(int32[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int32[:,:])",
-        "void(int64[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int64[:,:])",
-    ],
-    "(n,k),(n,p),(n,p),(n,p),()->(n,k)",
-)
-def genedrop_Hamilton_Kerr(
-    genotypes: ArrayLike,
-    parent: ArrayLike,
-    tau: ArrayLike,
-    lambda_: ArrayLike,
-    seed: int,
-    out: ArrayLike,
-) -> None:  # pragma: no cover
-    if parent.shape[1] != 2:
-        parent, tau, lambda_ = _compress_hamilton_kerr_parameters(parent, tau, lambda_)
-    n_sample, n_parent = parent.shape
-    _, max_ploidy = genotypes.shape
-    order = topological_argsort(parent)
-    np.random.seed(seed)
-    for i in range(n_sample):
-        t = order[i]
-        alleles = 0
-        unknown_alleles = 0
-        for j in range(n_parent):
-            p = parent[t, j]
-            tau_p = tau[t, j]
-            lambda_p = lambda_[t, j]
-            if p < 0:
-                unknown_alleles += tau_p
-            elif tau_p > 0:
-                ploidy_p = tau[p, 0] + tau[p, 1]
-                gamete = _random_gamete_Hamilton_Kerr(out[p], ploidy_p, tau_p, lambda_p)
-                out[t, alleles : alleles + tau_p] = gamete
-            alleles += tau_p
-        if unknown_alleles == 0:
-            if alleles < max_ploidy:
-                # pad with fill value
-                out[t, alleles:] = -2
-        elif unknown_alleles == alleles:
-            # founder
-            out[t] = genotypes[t]
-        else:
-            # partial founder
-            raise ValueError("Pedigree contains half-founders")
-
-
 def simulate_genedrop(
     ds: Dataset,
     *,
@@ -165,7 +46,7 @@ def simulate_genedrop(
         which is only suitable for pedigrees in which all samples are
         diploids resulting from sexual reproduction.
         The "Hamilton-Kerr" method is suitable for autopolyploid and
-        mixed-ploidy datasets following Kerr et al. 2012 and [2]
+        mixed-ploidy datasets following Kerr et al. 2012 [2] and
         Hamilton and Kerr 2017 [3].
     call_genotype
         Input variable name holding call_genotype as defined by
@@ -217,8 +98,8 @@ def simulate_genedrop(
         If the Hamilton-Kerr method is used and a sample has more than
         two contributing parents.
     ValueError
-        If the Hamilton-Kerr method is used and the number of alleles
-        in a genotype does not match the sum of tau values (i.e., ploidy).
+        If the Hamilton-Kerr method is used and the number of alleles in a
+        founder genotype does not match the sum of its tau values (i.e., ploidy).
     NotImplementedError
         If the Hamilton-Kerr method is used and a tau value exceeds the
         parental ploidy.
@@ -295,6 +176,8 @@ def simulate_genedrop(
     "Computation of the inverse additive relationship matrix for autopolyploid
     and multiple-ploidy populations." Theoretical and Applied Genetics 131: 851-860.
     """
+    from .genedrop_numba_fns import genedrop_diploid, genedrop_Hamilton_Kerr
+
     ds = define_variable_if_absent(ds, variables.parent, parent, parent_indices)
     variables.validate(
         ds, {parent: variables.parent_spec, call_genotype: variables.call_genotype_spec}

diff --git a/sgkit/stats/genedrop_numba_fns.py b/sgkit/stats/genedrop_numba_fns.py
@@ -0,0 +1,169 @@
+import numpy as np
+
+from sgkit.accelerate import numba_guvectorize, numba_jit
+from sgkit.typing import ArrayLike
+
+from .pedigree import _compress_hamilton_kerr_parameters, topological_argsort
+
+
+@numba_guvectorize(  # type: ignore
+    [
+        "void(int8[:,:], int64[:,:], uint32, int8[:,:])",
+        "void(int16[:,:], int64[:,:], uint32, int16[:,:])",
+        "void(int32[:,:], int64[:,:], uint32, int32[:,:])",
+        "void(int64[:,:], int64[:,:], uint32, int64[:,:])",
+    ],
+    "(n,k),(n,p),()->(n,k)",
+)
+def genedrop_diploid(
+    genotypes: ArrayLike,
+    parent: ArrayLike,
+    seed: ArrayLike,
+    out: ArrayLike,
+) -> None:  # pragma: no cover
+    n_sample, n_parent = parent.shape
+    _, ploidy = genotypes.shape
+    if n_parent != 2:
+        raise ValueError("The parents dimension must be length 2")
+    if ploidy != 2:
+        raise ValueError("Genotypes are not diploid")
+    order = topological_argsort(parent)
+    np.random.seed(seed)
+    for i in range(n_sample):
+        t = order[i]
+        unknown_parent = 0
+        for j in range(n_parent):
+            p = parent[t, j]
+            if p < 0:
+                # founder
+                unknown_parent += 1
+            else:
+                idx = np.random.randint(2)
+                out[t, j] = out[p, idx]
+        if unknown_parent == 1:
+            raise ValueError("Pedigree contains half-founders")
+        elif unknown_parent == 2:
+            # copy founder
+            out[t, 0] = genotypes[t, 0]
+            out[t, 1] = genotypes[t, 1]
+
+
+@numba_jit(nogil=True)
+def _random_inheritance_Hamilton_Kerr(
+    genotypes: ArrayLike,
+    parent: ArrayLike,
+    tau: ArrayLike,
+    lambda_: ArrayLike,
+    marked: ArrayLike,
+    i: int,
+):
+    _, n_parent = parent.shape
+    _, max_ploidy = genotypes.shape
+    next_allele = 0
+    ploidy_i = 0
+    for j in range(n_parent):
+        p = parent[i, j]
+        tau_p = tau[i, j]
+        ploidy_i += tau_p
+        if p < 0:
+            # unknown parent
+            continue
+        lambda_p = lambda_[i, j]
+        ploidy_p = tau[p, 0] + tau[p, 1]
+        if tau_p > ploidy_p:
+            raise NotImplementedError("Gamete tau cannot exceed parental ploidy.")
+        if lambda_p > 0.0:
+            if tau_p != 2:
+                raise NotImplementedError(
+                    "Non-zero lambda is only implemented for tau = 2."
+                )
+            homozygous_gamete = np.random.rand() < lambda_p
+        else:
+            homozygous_gamete = False
+        if homozygous_gamete:
+            # diploid gamete with duplicated allele
+            uniform = np.random.rand()
+            choice = int(uniform * ploidy_p)
+            for k in range(max_ploidy):
+                parent_allele = genotypes[p, k]
+                if parent_allele < -1:
+                    # non-allele
+                    pass
+                elif choice > 0:
+                    # not the chosen allele
+                    choice -= 1
+                else:
+                    # chosen allele is duplicated
+                    genotypes[i, next_allele] = parent_allele
+                    genotypes[i, next_allele + 1] = parent_allele
+                    next_allele += 2
+                    break
+        else:
+            # random alleles without replacement
+            marked[:] = False
+            for h in range(tau_p):
+                uniform = np.random.rand()
+                scale = ploidy_p - h
+                choice = int(uniform * scale)
+                k = 0
+                while choice >= 0:
+                    parent_allele = genotypes[p, k]
+                    if marked[k] > 0:
+                        # already inherited
+                        pass
+                    elif parent_allele < -1:
+                        # non-allele
+                        pass
+                    elif choice > 0:
+                        # not the chosen allele
+                        choice -= 1
+                    else:
+                        # chosen allele
+                        genotypes[i, next_allele] = parent_allele
+                        marked[k] = True
+                        next_allele += 1
+                        choice -= 1
+                    k += 1
+    if next_allele == 0:
+        # full founder requires ploidy validation
+        alleles_i = 0
+        for k in range(max_ploidy):
+            if genotypes[i, k] >= -1:
+                alleles_i += 1
+        if alleles_i != ploidy_i:
+            raise ValueError("Genotype ploidy does not match number of alleles.")
+    elif next_allele != ploidy_i:
+        raise ValueError("Pedigree contains half-founders.")
+    elif next_allele < max_ploidy:
+        # pad with non-alleles
+        genotypes[i, next_allele:] = -2
+
+
+@numba_guvectorize(  # type: ignore
+    [
+        "void(int8[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int8[:,:])",
+        "void(int16[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int16[:,:])",
+        "void(int32[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int32[:,:])",
+        "void(int64[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int64[:,:])",
+    ],
+    "(n,k),(n,p),(n,p),(n,p),()->(n,k)",
+)
+def genedrop_Hamilton_Kerr(
+    genotypes: ArrayLike,
+    parent: ArrayLike,
+    tau: ArrayLike,
+    lambda_: ArrayLike,
+    seed: int,
+    out: ArrayLike,
+) -> None:  # pragma: no cover
+    if parent.shape[1] != 2:
+        parent, tau, lambda_ = _compress_hamilton_kerr_parameters(parent, tau, lambda_)
+    out[:] = genotypes
+    n_sample, _ = parent.shape
+    _, max_ploidy = genotypes.shape
+    order = topological_argsort(parent)
+    marked = np.zeros(max_ploidy, dtype=np.bool8)
+    np.random.seed(seed)
+    for idx in range(n_sample):
+        i = order[idx]
+        _random_inheritance_Hamilton_Kerr(out, parent, tau, lambda_, marked, i)