Skip to content

Commit

Permalink
BREAKING: improved speed of median/quantile calculation, by only part…
Browse files Browse the repository at this point in the history
…ially sorting
  • Loading branch information
jkriege2 committed Mar 18, 2024
1 parent be48ed8 commit b2aad7c
Show file tree
Hide file tree
Showing 4 changed files with 159 additions and 29 deletions.
102 changes: 73 additions & 29 deletions lib/jkqtmath/jkqtpstatbasics.h
Original file line number Diff line number Diff line change
Expand Up @@ -811,7 +811,75 @@ inline double jkqtpstatMedianOfSortedVector(const TVector& data, size_t* Noutput



/*! \brief calculates the median of a given data range \a first ... \a last, this version partially sorts the range (i.e. no internal copy and checking for NAN-values is performed!)
\ingroup jkqtptools_math_statistics_basic
\tparam InputIt standard iterator type of \a first and \a last.
\param first iterator pointing to the first item in the dataset to use \f$ X_1 \f$
\param last iterator pointing behind the last item in the dataset to use \f$ X_N \f$
\return the median of the data returned between \a first and \a last (excluding invalid doubles).
If the given range \a first ... \a last is empty, NAN is returned
\note This operation implies uses std::nth_element() to calculate the median in linear time O(N)!
The given range is partially sorted after the call!
*/
template <class InputIt>
inline double jkqtpstatMedianAndPartialSort(InputIt first, InputIt last) {
// filter out any NAN-values:
const size_t n=std::distance(first,last);

// handle empty range
if (n<=0) {
return JKQTP_DOUBLE_NAN;
}

// calculate median using nth_element() in linear timme!
const auto middleItr = first + n / 2;
std::nth_element(first, middleItr, last);
if (n % 2 == 0) {
// since nth_element() performs partial sorting,
// "All of the elements before this new nth element are less than or equal to the elements after the new nth element." (from cppreference.com)
const auto leftMiddleItr = std::max_element(first, middleItr);
// the second elemnt to average is the maximum on the left of middleItr!
return (*leftMiddleItr + *middleItr) / 2.0;
} else {
return *middleItr;
}
}

/*! \brief calculates the median of a given data range \a first ... \a last
\ingroup jkqtptools_math_statistics_basic
\tparam InputIt standard iterator type of \a first and \a last.
\param first iterator pointing to the first item in the dataset to use \f$ X_1 \f$
\param last iterator pointing behind the last item in the dataset to use \f$ X_N \f$
\param[out] Noutput optionally returns the number of accumulated valid values in this variable
\return the median of the data returned between \a first and \a last (excluding invalid doubles).
If the given range \a first ... \a last is empty, NAN is returned
\note This operation implies an internal copy of the data, and then uses std::nth_element() to calculate the median in linear time O(N)!
\note Each value is the specified range is converted to a double using jkqtp_todouble().
Entries in the range that are invalid double (using JKQTPIsOKFloat() )
are ignored when calculating.
*/
template <class InputIt>
inline double jkqtpstatMedian(InputIt first, InputIt last, size_t* Noutput=nullptr) {
// filter out any NAN-values:
std::vector<double> dataFiltered;
jkqtpstatFilterGoodFloat(first, last, std::back_inserter(dataFiltered));
const size_t n=dataFiltered.size();

// handle empty range
if (n<=0) {
if (Noutput) *Noutput=0;
return JKQTP_DOUBLE_NAN;
}

// calculate median using nth_element() in linear timme!
return jkqtpstatMedianAndPartialSort(dataFiltered.begin(), dataFiltered.end());
}

/*! \brief calculates the Five-Number Statistical Summary (minimum, median, maximum and two user-defined quantiles (as well as derived from these the inter quartile range)) of a sorted vector
\ingroup jkqtptools_math_statistics_basic
Expand Down Expand Up @@ -1079,29 +1147,6 @@ inline JKQTPStat5NumberStatistics jkqtpstat5NumberStatistics(InputIt first, Inpu
}


/*! \brief calculates the median of a given data range \a first ... \a last
\ingroup jkqtptools_math_statistics_basic
\tparam InputIt standard iterator type of \a first and \a last.
\param first iterator pointing to the first item in the dataset to use \f$ X_1 \f$
\param last iterator pointing behind the last item in the dataset to use \f$ X_N \f$
\param[out] Noutput optionally returns the number of accumulated valid values in this variable
\return the median of the data returned between \a first and \a last (excluding invalid doubles).
If the given range \a first ... \a last is empty, NAN is returned
\note This operation implies an internal copy of the data, as well as sorting it!
\note Each value is the specified range is converted to a double using jkqtp_todouble().
Entries in the range that are invalid double (using JKQTPIsOKFloat() )
are ignored when calculating.
*/
template <class InputIt>
inline double jkqtpstatMedian(InputIt first, InputIt last, size_t* Noutput=nullptr) {
std::vector<double> dataFiltered;
jkqtpstatFilterGoodFloat(first, last, std::back_inserter(dataFiltered));
std::sort(dataFiltered.begin(), dataFiltered.end());
return jkqtpstatMedianOfSortedVector(dataFiltered, Noutput);
}


/*! \brief calculates the \a quantile -th quantile of a given data range \a first ... \a last
Expand All @@ -1125,13 +1170,14 @@ template <class InputIt>
inline double jkqtpstatQuantile(InputIt first, InputIt last, double quantile, size_t* Noutput=nullptr) {
std::vector<double> dataFiltered;
jkqtpstatFilterGoodFloat(first, last, std::back_inserter(dataFiltered));
std::sort(dataFiltered.begin(), dataFiltered.end());
if (dataFiltered.size()<=0) {
if (Noutput) *Noutput=0;
return JKQTP_DOUBLE_NAN;
} else {
if (Noutput) *Noutput=dataFiltered.size();
return dataFiltered[jkqtp_bounded<size_t>(0, static_cast<size_t>(quantile*static_cast<double>(dataFiltered.size()-1)), dataFiltered.size()-1)];
auto qelement=dataFiltered.begin()+jkqtp_bounded<size_t>(0, static_cast<size_t>(quantile*static_cast<double>(dataFiltered.size()-1)), dataFiltered.size()-1);
std::nth_element(dataFiltered.begin(), qelement, dataFiltered.end());
return *qelement;
}
}

Expand Down Expand Up @@ -1164,20 +1210,18 @@ template <class InputIt>
inline double jkqtpstatMAD(InputIt first, InputIt last, double* median=nullptr, size_t* Noutput=nullptr) {
std::vector<double> dataFiltered;
jkqtpstatFilterGoodFloat(first, last, std::back_inserter(dataFiltered));
std::sort(dataFiltered.begin(), dataFiltered.end());
if (dataFiltered.size()<=0) {
if (Noutput) *Noutput=0;
if (median) *median=JKQTP_DOUBLE_NAN;
return JKQTP_DOUBLE_NAN;
} else {
if (Noutput) *Noutput=dataFiltered.size();
double med=jkqtpstatMedianOfSortedVector(dataFiltered);
const double med=jkqtpstatMedianAndPartialSort(dataFiltered.begin(), dataFiltered.end());
if (median) *median=med;
for(double& v: dataFiltered) {
v=fabs(v-med);
}
std::sort(dataFiltered.begin(), dataFiltered.end());
return jkqtpstatMedianOfSortedVector(dataFiltered);
return jkqtpstatMedianAndPartialSort(dataFiltered.begin(), dataFiltered.end());
}
}

Expand Down
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,5 @@ find_package(Qt${QT_VERSION_MAJOR} COMPONENTS Test REQUIRED)

message( STATUS ".. BUILDING UNIT TESTS FOR JKQTCommon:" )
add_subdirectory(jkqtcommmon)
add_subdirectory(jkqtmath)

7 changes: 7 additions & 0 deletions tests/jkqtmath/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
cmake_minimum_required(VERSION 3.23)

set(CMAKE_INCLUDE_CURRENT_DIR ON)
set(CMAKE_AUTOMOC ON)

jkqtplotter_add_jkqtcommmon_test(jkqtpstatisticstools_test)

78 changes: 78 additions & 0 deletions tests/jkqtmath/jkqtpstatisticstools_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#include <QObject>
#include <QtTest>
#include "jkqtmath/jkqtpstatisticstools.h"

#ifndef QCOMPARE_EQ
#define QCOMPARE_EQ(A,B) if (!static_cast<bool>((A)==(B))) {qDebug()<<QTest::toString(A)<< "!=" << QTest::toString(B); } QVERIFY((A)==(B))
#endif
#ifndef QVERIFY_THROWS_NO_EXCEPTION
#define QVERIFY_THROWS_NO_EXCEPTION(B) B
#endif
#ifndef QVERIFY_THROWS_EXCEPTION
#define QVERIFY_THROWS_EXCEPTION(type, A) QVERIFY_EXCEPTION_THROWN(A, type)
#endif



class JKQTPStatisticsToolsTest : public QObject
{
Q_OBJECT

public:
inline JKQTPStatisticsToolsTest() {
}

inline ~JKQTPStatisticsToolsTest() {
}

private slots:
inline void test_jkqtpstatMedian() {
const std::vector<double> empty;
const std::vector<double> oneel = {1.23};
const std::vector<double> twoel = {2, 1};
const std::vector<double> threeel = {3, 2, 1};
const std::vector<double> fourel = {4, 2, 1, 3};
const std::vector<double> fourelnan = {4, 1, JKQTP_NAN, 2, JKQTP_NAN, JKQTP_NAN, 3};
QCOMPARE_EQ(JKQTPIsOKFloat(jkqtpstatMedian(empty.begin(), empty.end())),false);
QCOMPARE_EQ(jkqtpstatMedian(oneel.begin(), oneel.end()),1.23);
QCOMPARE_EQ(jkqtpstatMedian(twoel.begin(), twoel.end()),(1.0+2.0)/2.0);
QCOMPARE_EQ(jkqtpstatMedian(threeel.begin(), threeel.end()),2.0);
QCOMPARE_EQ(jkqtpstatMedian(fourel.begin(), fourel.end()),(2.0+3.0)/2.0);
QCOMPARE_EQ(jkqtpstatMedian(fourelnan.begin(), fourelnan.end()),(2.0+3.0)/2.0);
}

inline void test_jkqtpstatMedianOfSortedVector() {
const std::vector<double> empty;
const std::vector<double> oneel = {1.23};
const std::vector<double> twoel = {1, 2};
const std::vector<double> threeel = {1, 2, 3};
const std::vector<double> fourel = {1, 2, 3, 4};
const std::vector<double> fourelnan = {1, JKQTP_NAN, 2, JKQTP_NAN, 3, 4, JKQTP_NAN};
QCOMPARE_EQ(JKQTPIsOKFloat(jkqtpstatMedianOfSortedVector(empty)),false);
QCOMPARE_EQ(jkqtpstatMedianOfSortedVector(oneel),oneel[0]);
QCOMPARE_EQ(jkqtpstatMedianOfSortedVector(twoel),(twoel[0]+twoel[1])/2.0);
QCOMPARE_EQ(jkqtpstatMedianOfSortedVector(threeel),threeel[1]);
QCOMPARE_EQ(jkqtpstatMedianOfSortedVector(fourel),(fourel[1]+fourel[2])/2.0);
QCOMPARE_EQ(JKQTPIsOKFloat(jkqtpstatMedianOfSortedVector(fourelnan)),false);
}
inline void test_jkqtpstatQuantile() {
const std::vector<double> empty;
const std::vector<double> oneel = {1.23};
const std::vector<double> twoel = {2, 1};
const std::vector<double> threeel = {3, 2, 1};
const std::vector<double> fourel = {4, 2, 1, 3};
const std::vector<double> fourelnan = {4, 1, JKQTP_NAN, 2, JKQTP_NAN, JKQTP_NAN, 3};
QCOMPARE_EQ(JKQTPIsOKFloat(jkqtpstatQuantile(empty.begin(), empty.end(),0.25)), false);
QCOMPARE_EQ(jkqtpstatQuantile(oneel.begin(), oneel.end(),0.25),1.23);
QCOMPARE_EQ(jkqtpstatQuantile(twoel.begin(), twoel.end(),0.33),1);
QCOMPARE_EQ(jkqtpstatQuantile(threeel.begin(), threeel.end(), 0.33),1.0);
QCOMPARE_EQ(jkqtpstatQuantile(fourel.begin(), fourel.end(), 0.25),1);
QCOMPARE_EQ(jkqtpstatQuantile(fourelnan.begin(), fourelnan.end(), 0.25),1);
}

};


QTEST_APPLESS_MAIN(JKQTPStatisticsToolsTest)

#include "jkqtpstatisticstools_test.moc"

0 comments on commit b2aad7c

Please sign in to comment.