Merge pull request #66 from EricHUANG970823/new-branch

Dunnett's test
maximtrp · Feb 17, 2024 · 31aabc5 · 31aabc5
2 parents 18bc6a7 + 7aee929
commit 31aabc5
Show file tree

Hide file tree

Showing 2 changed files with 81 additions and 4 deletions.
diff --git a/scikit_posthocs/__init__.py b/scikit_posthocs/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '0.8.1'
+__version__ = '0.8.2'
 
 from scikit_posthocs._global import global_simes_test, global_f_test
 from scikit_posthocs._omnibus import test_osrt, test_durbin, test_mackwolfe
@@ -10,8 +10,8 @@
     posthoc_nemenyi_friedman, posthoc_npm_test, posthoc_quade,
     posthoc_scheffe, posthoc_siegel_friedman, posthoc_tamhane,
     posthoc_ttest, posthoc_tukey, posthoc_tukey_hsd,
-    posthoc_vanwaerden, posthoc_wilcoxon, __convert_to_df,
-    __convert_to_block_df,
+    posthoc_vanwaerden, posthoc_wilcoxon, posthoc_dunnett,
+    __convert_to_df, __convert_to_block_df,
 )
 
 from scikit_posthocs._plotting import (

diff --git a/scikit_posthocs/_posthocs.py b/scikit_posthocs/_posthocs.py
@@ -5,7 +5,7 @@
 from statsmodels.sandbox.stats.multicomp import multipletests
 from statsmodels.stats.multicomp import pairwise_tukeyhsd
 from statsmodels.stats.libqsturng import psturng
-from pandas import DataFrame
+from pandas import DataFrame, Series, MultiIndex
 
 
 def __convert_to_df(
@@ -1561,6 +1561,83 @@ def compare_stats(i, j):
     return DataFrame(vs, index=groups, columns=groups)
 
 
+def posthoc_dunnett(a: Union[list, np.ndarray, DataFrame],
+                    val_col: str = None,
+                    group_col: str = None,
+                    control: str = None,
+                    sort: bool = False,
+                    to_matrix: bool = True) -> Series | DataFrame:
+    """
+    Dunnett's test [1, 2, 3] for multiple comparisons against a control group, used after parametric
+    ANOVA. The control group is specified by the `control` parameter.
+
+    Parameters
+    ----------
+    a : array_like or pandas DataFrame object
+        An array, any object exposing the array interface or a pandas
+        DataFrame. Array must be two-dimensional.
+
+    val_col : str, optional
+        Name of a DataFrame column that contains dependent variable values (test
+        or response variable). Values should have a non-nominal scale. Must be
+        specified if `a` is a pandas DataFrame object.
+
+    group_col : str, optional
+        Name of a DataFrame column that contains independent variable values
+        (grouping or predictor variable). Values should have a nominal scale
+        (categorical). Must be specified if `a` is a pandas DataFrame object.
+
+    control : str, optional
+        Name of the control group within the `group_col` column. Values should
+        have a nominal scale (categorical). Must be specified if `a` is a pandas
+
+    sort : bool, optional
+        Specifies whether to sort DataFrame by group_col or not. Recommended
+        unless you sort your data manually.
+
+    to_matrix: bool, optional
+        Specifies whether to return a DataFrame or a Series. If True, a DataFrame
+        is returned with some NaN values since it's not pairwise comparison.
+        Default is True.
+
+    Returns
+    -------
+    result : pandas.Series or pandas.DataFrame
+        P values.
+
+    References
+    ----------
+    .. [1] Charles W. Dunnett (1955). “A Multiple Comparison Procedure for Comparing Several Treatments with a Control.”
+    .. [2] https://en.wikipedia.org/wiki/Dunnett%27s_test
+    .. [3] https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.dunnett.html#id1
+    """
+    x, _val_col, _group_col = __convert_to_df(a, val_col, group_col)
+    x = x.sort_values(by=[_group_col], ascending=True) if sort else x
+    x = x.set_index(_group_col)[_val_col]
+    x_embedded = x.groupby(_group_col).agg(lambda y: y.dropna().tolist())
+    control_data = x_embedded.loc[control]
+    treatment_data = x_embedded.drop(control)
+
+    pvals = ss.dunnett(*treatment_data, control=control_data).pvalue
+
+    multi_index = MultiIndex.from_product([[control], treatment_data.index.tolist()])
+    dunnett_sr = Series(pvals, index=multi_index)
+
+    if not to_matrix:
+        return dunnett_sr
+
+    else:
+        levels = x.index.unique().tolist()
+        result_df = DataFrame(index=levels, columns=levels)
+
+        for pair in dunnett_sr.index:
+            a, b = pair
+            result_df.loc[a, b] = dunnett_sr[pair]
+            result_df.loc[b, a] = dunnett_sr[pair]
+        result_df.loc[control, control] = 1.0
+        return result_df
+
+
 def posthoc_ttest(
         a: Union[list, np.ndarray, DataFrame],
         val_col: str = None,