From eea601df6f0f20c999467ba1c99a1f70c90fefa3 Mon Sep 17 00:00:00 2001 From: deadsalmonbrain Date: Sun, 8 May 2022 21:38:51 +0300 Subject: [PATCH 1/5] added isEnoughData validation --- .../BaseStatisticTransformer.scala | 26 ++--- .../BasicStatInferenceParameters.scala | 11 +++ .../ai/salmonbrain/ruleofthumb/ExpData.scala | 4 +- .../MannWhitneyStatisticsTransformer.scala | 34 +++++-- .../ruleofthumb/MannWhitneyTest.scala | 97 ++++++++----------- .../WelchStatisticsTransformer.scala | 23 +++-- 6 files changed, 111 insertions(+), 84 deletions(-) diff --git a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/BaseStatisticTransformer.scala b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/BaseStatisticTransformer.scala index 8aca502..a6c2fc9 100644 --- a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/BaseStatisticTransformer.scala +++ b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/BaseStatisticTransformer.scala @@ -1,11 +1,12 @@ package ai.salmonbrain.ruleofthumb +import ai.salmonbrain.ruleofthumb.CentralTendency.CentralTendency import org.apache.commons.math3.stat.inference.TestUtils import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.DefaultParamsWritable +import org.apache.spark.sql.Encoders import org.apache.spark.sql.types.{ BooleanType, StringType, StructField, StructType } -import org.apache.spark.sql.{ Dataset, Encoders } trait BaseStatisticTransformer extends Transformer @@ -43,17 +44,18 @@ trait BaseStatisticTransformer ) } - protected def checkVariants(dataset: Dataset[_]): Unit = { - val expectedVariants = Set($(treatmentName), $(controlName)) - val observedVariants = dataset - .select($(variantColumn)) - .distinct() - .collect() - .map(row => row.getAs[String]($(variantColumn))) - .toSet - assert( - expectedVariants == observedVariants, - s"Variants must be named ${$(treatmentName)} and ${$(controlName)}" + protected def getInvalidStatResult(centralTendency: CentralTendency): StatResult = { + StatResult( + Double.NaN, + Double.NaN, + Long.MinValue, + Double.NaN, + Double.NaN, + Double.NaN, + Double.NaN, + Double.NaN, + Double.NaN, + centralTendency.toString ) } } diff --git a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/BasicStatInferenceParameters.scala b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/BasicStatInferenceParameters.scala index e414553..53d4bde 100644 --- a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/BasicStatInferenceParameters.scala +++ b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/BasicStatInferenceParameters.scala @@ -24,6 +24,13 @@ trait BasicStatInferenceParameters extends Params { ) setDefault(srmAlpha, 0.05) + val minValidSampleSize: Param[Int] = new Param[Int]( + this, + "minValidSampleSize", + "parameter for skip invalid groups" + ) + setDefault(minValidSampleSize, 10) + /** @group setParam */ def setAlpha(value: Double): this.type = set(alpha, value) @@ -36,4 +43,8 @@ trait BasicStatInferenceParameters extends Params { def setSrmAlpha(value: Double): this.type = set(srmAlpha, value) + /** @group setParam */ + def setMinValidSampleSize(value: Int): this.type = + set(minValidSampleSize, value) + } diff --git a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/ExpData.scala b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/ExpData.scala index 5bf1b6d..1aa018d 100644 --- a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/ExpData.scala +++ b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/ExpData.scala @@ -17,10 +17,12 @@ case class StatisticsReport( statResult: StatResult, alpha: Double, beta: Double, + minValidSampleSize: Int, srm: Boolean, controlSize: Long, treatmentSize: Long, - testType: String + testType: String, + isEnoughData: Boolean ) case class CI( diff --git a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyStatisticsTransformer.scala b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyStatisticsTransformer.scala index 833b133..812d8b0 100644 --- a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyStatisticsTransformer.scala +++ b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyStatisticsTransformer.scala @@ -19,7 +19,6 @@ class MannWhitneyStatisticsTransformer(override val uid: String) extends BaseSta def this() = this(Identifiable.randomUID("mannWhitneyStatisticsTransformer")) override def transform(dataset: Dataset[_]): DataFrame = { - checkVariants(dataset) dataset .groupBy( $(experimentColumn), @@ -35,7 +34,10 @@ class MannWhitneyStatisticsTransformer(override val uid: String) extends BaseSta ) .withColumn( "statisticsData", - doStatistic($(alpha), $(beta))(col($(controlName)), col($(treatmentName))) + doStatistic($(alpha), $(beta), $(minValidSampleSize))( + col($(controlName)), + col($(treatmentName)) + ) ) .drop("control", "treatment") } @@ -44,23 +46,37 @@ class MannWhitneyStatisticsTransformer(override val uid: String) extends BaseSta override def transformSchema(schema: StructType): StructType = schema - def doStatistic(alpha: Double, beta: Double): UserDefinedFunction = udf { + def doStatistic(alpha: Double, beta: Double, minValidSampleSize: Int): UserDefinedFunction = udf { ( control: mutable.WrappedArray[Double], treatment: mutable.WrappedArray[Double] ) => - val statResult = - MannWhitneyTest.mannWhitneyTest(control.toArray, treatment.toArray, alpha, beta) - val controlSize = control.length - val treatmentSize = treatment.length + val controlSize = Option(control).getOrElse(mutable.WrappedArray.empty).length + val treatmentSize = Option(treatment).getOrElse(mutable.WrappedArray.empty).length + val isEnoughData = math.min(controlSize, treatmentSize) >= minValidSampleSize + val (statResult, srmResult) = + if (isEnoughData) + ( + MannWhitneyTest.mannWhitneyTest( + control.toArray, + treatment.toArray, + alpha, + beta + ), + srm(controlSize, treatmentSize, $(srmAlpha)) + ) + else (getInvalidStatResult(CentralTendency.MEDIAN), false) + StatisticsReport( statResult, alpha, beta, - srm(controlSize, treatmentSize, $(srmAlpha)), + minValidSampleSize, + srmResult, controlSize, treatmentSize, - TestType.MANN_WHITNEY.toString + TestType.MANN_WHITNEY.toString, + isEnoughData ) } } diff --git a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyTest.scala b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyTest.scala index a51f5bb..e2437c9 100644 --- a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyTest.scala +++ b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyTest.scala @@ -6,7 +6,6 @@ import org.apache.commons.math3.stat.inference.MannWhitneyUTest object MannWhitneyTest extends BaseStatTest { val median = new Median() - val MINIMUM_SAMPLE_SIZE = 5 def mannWhitneyTest( control: Array[Double], @@ -15,63 +14,49 @@ object MannWhitneyTest extends BaseStatTest { beta: Double ): StatResult = { assert(alpha < 1 && beta < 1) - math.min(control.length, treatment.length) match { - case sampleSize if sampleSize >= MINIMUM_SAMPLE_SIZE => - val mannWhitneyUTest = new MannWhitneyUTest() - val uStatistic = mannWhitneyUTest.mannWhitneyU(control, treatment) - val pValue = mannWhitneyUTest.mannWhitneyUTest(control, treatment) - val controlMedian = median.evaluate(control) - val treatmentMedian = median.evaluate(treatment) - val treatmentMedianVariance = medianVariance(treatment) - val controlMedianVariance = medianVariance(control) - val std = math.sqrt(treatmentMedianVariance + controlMedianVariance) - val size = math.max(control.length, treatment.length) + val mannWhitneyUTest = new MannWhitneyUTest() + val uStatistic = mannWhitneyUTest.mannWhitneyU(control, treatment) + val pValue = mannWhitneyUTest.mannWhitneyUTest(control, treatment) + val controlMedian = median.evaluate(control) + val treatmentMedian = median.evaluate(treatment) + val treatmentMedianVariance = medianVariance(treatment) + val controlMedianVariance = medianVariance(control) + val std = math.sqrt(treatmentMedianVariance + controlMedianVariance) + val size = math.max(control.length, treatment.length) - val ci = CI( - controlMedian, - controlMedianVariance, - treatmentMedian, - treatmentMedianVariance, - std, - normalDistribution.inverseCumulativeProbability(alpha / 2), - normalDistribution.inverseCumulativeProbability(1 - alpha / 2), - size - ) - val sampleSize = sampleSizeEstimation( - alpha, - beta, - treatmentMedian, - controlMedian, - treatment.length, - control.length - ) + val ci = CI( + controlMedian, + controlMedianVariance, + treatmentMedian, + treatmentMedianVariance, + std, + normalDistribution.inverseCumulativeProbability(alpha / 2), + normalDistribution.inverseCumulativeProbability(1 - alpha / 2), + size + ) + + val sampleSize = sampleSizeEstimation( + alpha, + beta, + treatmentMedian, + controlMedian, + treatment.length, + control.length + ) + + StatResult( + uStatistic, + pValue, + sampleSize, + controlMedian, + treatmentMedian, + controlMedianVariance, + treatmentMedianVariance, + ci.lowerPercent, + ci.upperPercent, + CentralTendency.MEDIAN.toString + ) - StatResult( - uStatistic, - pValue, - sampleSize, - controlMedian, - treatmentMedian, - controlMedianVariance, - treatmentMedianVariance, - ci.lowerPercent, - ci.upperPercent, - CentralTendency.MEDIAN.toString - ) - case _ => - StatResult( - Double.NaN, - Double.NaN, - Long.MinValue, - Double.NaN, - Double.NaN, - Double.NaN, - Double.NaN, - Double.NaN, - Double.NaN, - CentralTendency.MEDIAN.toString - ) - } } /* diff --git a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/WelchStatisticsTransformer.scala b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/WelchStatisticsTransformer.scala index e5a1f6e..7df5b56 100644 --- a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/WelchStatisticsTransformer.scala +++ b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/WelchStatisticsTransformer.scala @@ -15,7 +15,6 @@ class WelchStatisticsTransformer(override val uid: String) extends BaseStatistic override def transform(dataset: Dataset[_]): DataFrame = { import dataset.sqlContext.implicits._ - checkVariants(dataset) dataset .groupBy( $(experimentColumn), @@ -33,27 +32,39 @@ class WelchStatisticsTransformer(override val uid: String) extends BaseStatistic count(col($(valueColumn))) as "length" ) ) - .withColumn("statisticsData", doStatistic($(alpha), $(beta))($"control", $"treatment")) + .withColumn( + "statisticsData", + doStatistic($(alpha), $(beta), $(minValidSampleSize))($"control", $"treatment") + ) .drop("control", "treatment") } - def doStatistic(alpha: Double, beta: Double): UserDefinedFunction = udf { + def doStatistic(alpha: Double, beta: Double, minValidSampleSize: Int): UserDefinedFunction = udf { ( control: Row, treatment: Row ) => - val statResult = WelchTTest.welchTTest(control, treatment, alpha, beta) val controlSize = control.getAs[Long]("length") val treatmentSize = treatment.getAs[Long]("length") + val isEnoughData = math.min(controlSize, treatmentSize) >= minValidSampleSize + val (statResult, srmResult) = + if (isEnoughData) + ( + WelchTTest.welchTTest(control, treatment, alpha, beta), + srm(controlSize.toInt, treatmentSize.toInt, $(srmAlpha)) + ) + else (getInvalidStatResult(CentralTendency.MEAN), false) StatisticsReport( statResult, alpha, beta, - srm(controlSize.toInt, treatmentSize.toInt, $(srmAlpha)), + minValidSampleSize, + srmResult, controlSize, treatmentSize, - TestType.WELCH.toString + TestType.WELCH.toString, + isEnoughData ) } } From a7179175e06f0371947871614aed426fc795ab1e Mon Sep 17 00:00:00 2001 From: deadsalmonbrain Date: Mon, 9 May 2022 15:50:43 +0300 Subject: [PATCH 2/5] added fix in python api --- python/ai/salmonbrain/params.py | 11 +++++++++++ python/setup.py | 2 +- ruleofthumb/build.gradle | 2 +- .../MannWhitneyStatisticsTransformer.scala | 5 +++-- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/python/ai/salmonbrain/params.py b/python/ai/salmonbrain/params.py index 7a99959..d347efe 100644 --- a/python/ai/salmonbrain/params.py +++ b/python/ai/salmonbrain/params.py @@ -142,11 +142,19 @@ class BasicStatInferenceParameters(Params): typeConverter=TypeConverters.toFloat, ) + minValidSampleSize = Param( + Params._dummy(), + "minValidSampleSize", + "parameter for skip invalid groups", + typeConverter=TypeConverters.toInt, + ) + def __init__(self): super(BasicStatInferenceParameters, self).__init__() self._setDefault(alpha=0.05) self._setDefault(beta=0.2) self._setDefault(srmAlpha=0.05) + self._setDefault(minValidSampleSize=10) def setAlpha(self, value): return self._set(alpha=value) @@ -156,3 +164,6 @@ def setBeta(self, value): def setSrmAlpha(self, value): return self._set(srmAlpha=value) + + def minValidSampleSize(self, value): + return self._set(minValidSampleSize=value) diff --git a/python/setup.py b/python/setup.py index 730c8ba..e26acea 100644 --- a/python/setup.py +++ b/python/setup.py @@ -9,7 +9,7 @@ setup( name="dead-salmon-brain", - version="0.0.6", + version="0.0.7", description="Dead Salmon Brain is a cluster computing system for analysing A/B experiments", license="Apache License v2.0", author="Dead Salmon Brain", diff --git a/ruleofthumb/build.gradle b/ruleofthumb/build.gradle index 821994e..6bb680a 100644 --- a/ruleofthumb/build.gradle +++ b/ruleofthumb/build.gradle @@ -28,7 +28,7 @@ application { mainClass = 'ai.salmonbrain.ruleofthumb.Main' } -version '0.0.6' +version '0.0.7' repositories { mavenCentral() diff --git a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyStatisticsTransformer.scala b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyStatisticsTransformer.scala index 812d8b0..2a03169 100644 --- a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyStatisticsTransformer.scala +++ b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyStatisticsTransformer.scala @@ -9,6 +9,7 @@ import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{ DataFrame, Dataset } import scala.collection.mutable +import scala.collection.mutable.WrappedArray.make /** * Transformer to apply Mann–Whitney U test @@ -51,8 +52,8 @@ class MannWhitneyStatisticsTransformer(override val uid: String) extends BaseSta control: mutable.WrappedArray[Double], treatment: mutable.WrappedArray[Double] ) => - val controlSize = Option(control).getOrElse(mutable.WrappedArray.empty).length - val treatmentSize = Option(treatment).getOrElse(mutable.WrappedArray.empty).length + val controlSize = Option(control).getOrElse(make[Double](Array())).length + val treatmentSize = Option(treatment).getOrElse(make[Double](Array())).length val isEnoughData = math.min(controlSize, treatmentSize) >= minValidSampleSize val (statResult, srmResult) = if (isEnoughData) From f393a4d47ec7448e3636bd18e3d735dcee7faf27 Mon Sep 17 00:00:00 2001 From: deadsalmonbrain Date: Mon, 9 May 2022 16:20:59 +0300 Subject: [PATCH 3/5] bug fix fill parameters --- python/ai/salmonbrain/params.py | 2 +- python/ai/salmonbrain/ruleofthumb.py | 4 ++++ python/tests/test_ruleofthumb.py | 6 +++--- .../salmonbrain/ruleofthumb/AutoStatisticsTransformer.scala | 1 + 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/python/ai/salmonbrain/params.py b/python/ai/salmonbrain/params.py index d347efe..653c9c8 100644 --- a/python/ai/salmonbrain/params.py +++ b/python/ai/salmonbrain/params.py @@ -165,5 +165,5 @@ def setBeta(self, value): def setSrmAlpha(self, value): return self._set(srmAlpha=value) - def minValidSampleSize(self, value): + def setMinValidSampleSize(self, value): return self._set(minValidSampleSize=value) diff --git a/python/ai/salmonbrain/ruleofthumb.py b/python/ai/salmonbrain/ruleofthumb.py index 7657d74..0e17757 100644 --- a/python/ai/salmonbrain/ruleofthumb.py +++ b/python/ai/salmonbrain/ruleofthumb.py @@ -122,6 +122,7 @@ def setParams( alpha=0.05, beta=0.2, srmAlpha=0.05, + minValidSampleSize=10, metricSourceColumn="metricSource", entityIdColumn="entityUid", experimentColumn="experimentUid", @@ -154,6 +155,7 @@ def __init__( alpha=0.05, beta=0.2, srmAlpha=0.05, + minValidSampleSize=10, metricSourceColumn="metricSource", entityIdColumn="entityUid", experimentColumn="experimentUid", @@ -190,6 +192,7 @@ def __init__( alpha=0.05, beta=0.2, srmAlpha=0.05, + minValidSampleSize=10, metricSourceColumn="metricSource", entityIdColumn="entityUid", experimentColumn="experimentUid", @@ -226,6 +229,7 @@ def __init__( alpha=0.05, beta=0.2, srmAlpha=0.05, + minValidSampleSize=10, metricSourceColumn="metricSource", entityIdColumn="entityUid", experimentColumn="experimentUid", diff --git a/python/tests/test_ruleofthumb.py b/python/tests/test_ruleofthumb.py index 52a7c96..7013910 100644 --- a/python/tests/test_ruleofthumb.py +++ b/python/tests/test_ruleofthumb.py @@ -126,7 +126,7 @@ def test_cumulativeMetricTransformer(data_sample: DataFrame): def test_welchStatisticsTransformer(data_sample: DataFrame): cum = CumulativeMetricTransformer() - welch = WelchStatisticsTransformer() + welch = WelchStatisticsTransformer(minValidSampleSize=3) result = welch.transform(cum.transform(data_sample)) p_values = [ @@ -137,7 +137,7 @@ def test_welchStatisticsTransformer(data_sample: DataFrame): def test_mannWhitneyStatisticsTransformer(data_sample: DataFrame): cum = CumulativeMetricTransformer() - welch = WelchStatisticsTransformer() + welch = WelchStatisticsTransformer(minValidSampleSize=3) result = welch.transform(cum.transform(data_sample)) p_values = [ @@ -148,7 +148,7 @@ def test_mannWhitneyStatisticsTransformer(data_sample: DataFrame): def test_autoStatisticsTransformer(data_sample: DataFrame): cum = CumulativeMetricTransformer() - auto = AutoStatisticsTransformer() + auto = AutoStatisticsTransformer(minValidSampleSize=3) result = auto.transform(cum.transform(data_sample)) p_values = [ diff --git a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/AutoStatisticsTransformer.scala b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/AutoStatisticsTransformer.scala index a718f44..6ed62f0 100644 --- a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/AutoStatisticsTransformer.scala +++ b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/AutoStatisticsTransformer.scala @@ -99,6 +99,7 @@ class AutoStatisticsTransformer(override val uid: String) extends BaseStatisticT .setAdditiveColumn($(additiveColumn)) .setAlpha($(alpha)) .setBeta($(beta)) + .setMinValidSampleSize($(minValidSampleSize)) .setDataProviderColumn($(metricSourceColumn)) .setEntityIdColumn($(entityIdColumn)) .setExperimentColumn($(experimentColumn)) From c18a7b6bb429a00e48469d1a988f08c867d5eccf Mon Sep 17 00:00:00 2001 From: deadsalmonbrain Date: Fri, 13 May 2022 10:04:12 +0300 Subject: [PATCH 4/5] change default method to compute nonparametric variance --- python/ai/salmonbrain/params.py | 11 ++ python/ai/salmonbrain/ruleofthumb.py | 4 + python/tests/test_ruleofthumb.py | 2 +- .../BaseStatisticTransformer.scala | 5 +- .../BasicStatInferenceParameters.scala | 11 ++ .../ai/salmonbrain/ruleofthumb/ExpData.scala | 3 +- .../MannWhitneyStatisticsTransformer.scala | 12 +- .../ruleofthumb/MannWhitneyTest.scala | 103 +++++++++++------- .../salmonbrain/ruleofthumb/WelchTTest.scala | 8 +- 9 files changed, 111 insertions(+), 48 deletions(-) diff --git a/python/ai/salmonbrain/params.py b/python/ai/salmonbrain/params.py index 653c9c8..436de04 100644 --- a/python/ai/salmonbrain/params.py +++ b/python/ai/salmonbrain/params.py @@ -149,12 +149,20 @@ class BasicStatInferenceParameters(Params): typeConverter=TypeConverters.toInt, ) + useLinearApproximationForVariance = Param( + Params._dummy(), + "useLinearApproximationForVariance", + "parameter for control variance computing method for nonparametric tests", + typeConverter=TypeConverters.toBoolean, + ) + def __init__(self): super(BasicStatInferenceParameters, self).__init__() self._setDefault(alpha=0.05) self._setDefault(beta=0.2) self._setDefault(srmAlpha=0.05) self._setDefault(minValidSampleSize=10) + self._setDefault(useLinearApproximationForVariance=False) def setAlpha(self, value): return self._set(alpha=value) @@ -167,3 +175,6 @@ def setSrmAlpha(self, value): def setMinValidSampleSize(self, value): return self._set(minValidSampleSize=value) + + def setUseLinearApproximationForVariance(self, value): + return self._set(useLinearApproximationForVariance=value) diff --git a/python/ai/salmonbrain/ruleofthumb.py b/python/ai/salmonbrain/ruleofthumb.py index 0e17757..71b449f 100644 --- a/python/ai/salmonbrain/ruleofthumb.py +++ b/python/ai/salmonbrain/ruleofthumb.py @@ -123,6 +123,7 @@ def setParams( beta=0.2, srmAlpha=0.05, minValidSampleSize=10, + useLinearApproximationForVariance=False, metricSourceColumn="metricSource", entityIdColumn="entityUid", experimentColumn="experimentUid", @@ -156,6 +157,7 @@ def __init__( beta=0.2, srmAlpha=0.05, minValidSampleSize=10, + useLinearApproximationForVariance=False, metricSourceColumn="metricSource", entityIdColumn="entityUid", experimentColumn="experimentUid", @@ -193,6 +195,7 @@ def __init__( beta=0.2, srmAlpha=0.05, minValidSampleSize=10, + useLinearApproximationForVariance=False, metricSourceColumn="metricSource", entityIdColumn="entityUid", experimentColumn="experimentUid", @@ -230,6 +233,7 @@ def __init__( beta=0.2, srmAlpha=0.05, minValidSampleSize=10, + useLinearApproximationForVariance=False, metricSourceColumn="metricSource", entityIdColumn="entityUid", experimentColumn="experimentUid", diff --git a/python/tests/test_ruleofthumb.py b/python/tests/test_ruleofthumb.py index 7013910..9ca105a 100644 --- a/python/tests/test_ruleofthumb.py +++ b/python/tests/test_ruleofthumb.py @@ -5,7 +5,7 @@ CumulativeMetricTransformer, WelchStatisticsTransformer, OutlierRemoveTransformer, - AutoStatisticsTransformer + AutoStatisticsTransformer, ) diff --git a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/BaseStatisticTransformer.scala b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/BaseStatisticTransformer.scala index a6c2fc9..d9997c2 100644 --- a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/BaseStatisticTransformer.scala +++ b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/BaseStatisticTransformer.scala @@ -48,14 +48,15 @@ trait BaseStatisticTransformer StatResult( Double.NaN, Double.NaN, - Long.MinValue, + -1, Double.NaN, Double.NaN, Double.NaN, Double.NaN, Double.NaN, Double.NaN, - centralTendency.toString + centralTendency.toString, + isZeroVariance = false ) } } diff --git a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/BasicStatInferenceParameters.scala b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/BasicStatInferenceParameters.scala index 53d4bde..2519dbb 100644 --- a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/BasicStatInferenceParameters.scala +++ b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/BasicStatInferenceParameters.scala @@ -31,6 +31,17 @@ trait BasicStatInferenceParameters extends Params { ) setDefault(minValidSampleSize, 10) + val useLinearApproximationForVariance: Param[Boolean] = new Param[Boolean]( + this, + "useLinearApproximationForVariance", + "parameter for control variance computing method for nonparametric tests" + ) + setDefault(useLinearApproximationForVariance, false) + + /** @group setParam */ + def setUseLinearApproximationForVariance(value: Boolean): this.type = + set(useLinearApproximationForVariance, value) + /** @group setParam */ def setAlpha(value: Double): this.type = set(alpha, value) diff --git a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/ExpData.scala b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/ExpData.scala index 1aa018d..c9cab85 100644 --- a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/ExpData.scala +++ b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/ExpData.scala @@ -68,7 +68,8 @@ case class StatResult( treatmentVariance: Double, percentageLeft: Double, percentageRight: Double, - centralTendencyType: String = CentralTendency.MEAN.toString + centralTendencyType: String, + isZeroVariance: Boolean ) case class Metric(metricName: String, metricValue: Double) diff --git a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyStatisticsTransformer.scala b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyStatisticsTransformer.scala index 2a03169..1e05a63 100644 --- a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyStatisticsTransformer.scala +++ b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyStatisticsTransformer.scala @@ -35,7 +35,7 @@ class MannWhitneyStatisticsTransformer(override val uid: String) extends BaseSta ) .withColumn( "statisticsData", - doStatistic($(alpha), $(beta), $(minValidSampleSize))( + doStatistic($(alpha), $(beta), $(minValidSampleSize), $(useLinearApproximationForVariance))( col($(controlName)), col($(treatmentName)) ) @@ -47,7 +47,12 @@ class MannWhitneyStatisticsTransformer(override val uid: String) extends BaseSta override def transformSchema(schema: StructType): StructType = schema - def doStatistic(alpha: Double, beta: Double, minValidSampleSize: Int): UserDefinedFunction = udf { + def doStatistic( + alpha: Double, + beta: Double, + minValidSampleSize: Int, + useLinearApproximationForVariance: Boolean + ): UserDefinedFunction = udf { ( control: mutable.WrappedArray[Double], treatment: mutable.WrappedArray[Double] @@ -62,7 +67,8 @@ class MannWhitneyStatisticsTransformer(override val uid: String) extends BaseSta control.toArray, treatment.toArray, alpha, - beta + beta, + useLinearApproximationForVariance ), srm(controlSize, treatmentSize, $(srmAlpha)) ) diff --git a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyTest.scala b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyTest.scala index e2437c9..b0d4f43 100644 --- a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyTest.scala +++ b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyTest.scala @@ -1,62 +1,87 @@ package ai.salmonbrain.ruleofthumb import org.apache.commons.math3.distribution.BinomialDistribution +import org.apache.commons.math3.stat.descriptive.moment.Variance import org.apache.commons.math3.stat.descriptive.rank.Median import org.apache.commons.math3.stat.inference.MannWhitneyUTest object MannWhitneyTest extends BaseStatTest { val median = new Median() + val variance = new Variance() def mannWhitneyTest( control: Array[Double], treatment: Array[Double], alpha: Double, - beta: Double + beta: Double, + useLinearApproximationForVariance: Boolean ): StatResult = { assert(alpha < 1 && beta < 1) - val mannWhitneyUTest = new MannWhitneyUTest() - val uStatistic = mannWhitneyUTest.mannWhitneyU(control, treatment) - val pValue = mannWhitneyUTest.mannWhitneyUTest(control, treatment) val controlMedian = median.evaluate(control) val treatmentMedian = median.evaluate(treatment) - val treatmentMedianVariance = medianVariance(treatment) - val controlMedianVariance = medianVariance(control) - val std = math.sqrt(treatmentMedianVariance + controlMedianVariance) - val size = math.max(control.length, treatment.length) - - val ci = CI( - controlMedian, - controlMedianVariance, - treatmentMedian, - treatmentMedianVariance, - std, - normalDistribution.inverseCumulativeProbability(alpha / 2), - normalDistribution.inverseCumulativeProbability(1 - alpha / 2), - size - ) - val sampleSize = sampleSizeEstimation( - alpha, - beta, - treatmentMedian, - controlMedian, - treatment.length, - control.length - ) + val (treatmentMedianVariance, controlMedianVariance) = + if (useLinearApproximationForVariance) + (medianVariance(treatment), medianVariance(control)) + else (variance.evaluate(treatment), variance.evaluate(control)) - StatResult( - uStatistic, - pValue, - sampleSize, - controlMedian, - treatmentMedian, - controlMedianVariance, - treatmentMedianVariance, - ci.lowerPercent, - ci.upperPercent, - CentralTendency.MEDIAN.toString - ) + (treatmentMedianVariance, controlMedianVariance) match { + case x if x._1 < EPS || x._2 < EPS => + StatResult( + Double.NaN, + Double.NaN, + -1, + controlMedian, + treatmentMedian, + controlMedianVariance, + treatmentMedianVariance, + Double.NaN, + Double.NaN, + CentralTendency.MEDIAN.toString, + isZeroVariance = true + ) + case _ => + val mannWhitneyUTest = new MannWhitneyUTest() + val uStatistic = mannWhitneyUTest.mannWhitneyU(control, treatment) + val pValue = mannWhitneyUTest.mannWhitneyUTest(control, treatment) + + val std = math.sqrt(treatmentMedianVariance + controlMedianVariance) + val size = math.max(control.length, treatment.length) + + val ci = CI( + controlMedian, + controlMedianVariance, + treatmentMedian, + treatmentMedianVariance, + std, + normalDistribution.inverseCumulativeProbability(alpha / 2), + normalDistribution.inverseCumulativeProbability(1 - alpha / 2), + size + ) + + val sampleSize = sampleSizeEstimation( + alpha, + beta, + treatmentMedian, + controlMedian, + treatment.length, + control.length + ) + StatResult( + uStatistic, + pValue, + sampleSize, + controlMedian, + treatmentMedian, + controlMedianVariance, + treatmentMedianVariance, + ci.lowerPercent, + ci.upperPercent, + CentralTendency.MEDIAN.toString, + isZeroVariance = false + ) + } } /* diff --git a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/WelchTTest.scala b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/WelchTTest.scala index 8c6340d..4102a0b 100644 --- a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/WelchTTest.scala +++ b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/WelchTTest.scala @@ -49,7 +49,9 @@ object WelchTTest extends BaseStatTest { controlVariance, treatmentVariance, Double.NaN, - Double.NaN + Double.NaN, + CentralTendency.MEAN.toString, + isZeroVariance = true ) case _ => val qt = controlVariance / controlSampleSize + treatmentVariance / treatmentSampleSize @@ -96,7 +98,9 @@ object WelchTTest extends BaseStatTest { controlVariance, treatmentVariance, ci.lowerPercent, - ci.upperPercent + ci.upperPercent, + CentralTendency.MEAN.toString, + isZeroVariance = false ) } } From eb4b9b904bca6940e639c56d802930482d5e3f3f Mon Sep 17 00:00:00 2001 From: deadsalmonbrain Date: Fri, 13 May 2022 10:11:09 +0300 Subject: [PATCH 5/5] change method to compute std for nonparametric --- .../scala/ai/salmonbrain/ruleofthumb/MannWhitneyTest.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyTest.scala b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyTest.scala index b0d4f43..d048238 100644 --- a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyTest.scala +++ b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyTest.scala @@ -45,7 +45,9 @@ object MannWhitneyTest extends BaseStatTest { val uStatistic = mannWhitneyUTest.mannWhitneyU(control, treatment) val pValue = mannWhitneyUTest.mannWhitneyUTest(control, treatment) - val std = math.sqrt(treatmentMedianVariance + controlMedianVariance) + val std = math.sqrt( + controlMedianVariance / control.length + treatmentMedianVariance / treatment.length + ) val size = math.max(control.length, treatment.length) val ci = CI(