From df5bc6215abc00b150bfb39d5de2defd9c03e190 Mon Sep 17 00:00:00 2001
From: Benoit Coste <benoit.coste@protonmail.com>
Date: Fri, 17 Jul 2020 11:24:53 +0200
Subject: [PATCH] New function extract_dataframe that extract morphometrics as
 a dataframe (#841)

if the optional argument as_frame=True, otherwise the legacy return type (dict) is used

The dataframe looks like:

     name     neurite_type  ...  max_segment_midpoint_Y  max_segment_midpoint_Z
0  Neuron             axon  ...                0.000000               49.520306
1  Neuron  apical_dendrite  ...                0.000000               53.750948
4  simple             axon  ...               -2.000000                0.000000
5  simple  apical_dendrite  ...                     NaN                     NaN


And the 'neurite_type' key becomes optional for `extract_dataframe` and
`extract_stats`
---
 apps/morph_stats                      |  13 ++--
 neurom/apps/morph_stats.py            | 100 +++++++++++++++++++++++---
 neurom/apps/tests/test_morph_stats.py |  58 ++++++++++++++-
 neurom/tests/test_stats.py            |   4 +-
 setup.py                              |   1 +
 test_data/extracted-stats.csv         |   9 +++
 6 files changed, 160 insertions(+), 25 deletions(-)
 create mode 100644 test_data/extracted-stats.csv

diff --git a/apps/morph_stats b/apps/morph_stats
index 5333f56da..7563991cd 100755
--- a/apps/morph_stats
+++ b/apps/morph_stats
@@ -31,24 +31,19 @@
 import argparse
 import json
 import logging
-import os
 import sys
 
-import pkg_resources
-
 import neurom as nm
 from neurom import exceptions
 from neurom.apps import get_config
 from neurom.apps.morph_stats import (extract_stats, generate_flattened_dict,
-                                     get_header, sanitize_config, full_config)
+                                     get_header, sanitize_config, full_config,
+                                     EXAMPLE_CONFIG)
 from neurom.io.utils import get_files_by_path
 from neurom.utils import NeuromJSON
 
 L = logging.getLogger(__name__)
 
-CONFIG_PATH = pkg_resources.resource_filename(
-    'neurom', 'config')
-
 IGNORABLE_EXCEPTIONS = {
     'SomaError': exceptions.SomaError,
 }
@@ -58,7 +53,7 @@ def get_parser():
     """Parse command line arguments."""
     parser = argparse.ArgumentParser(
         description='Morphology statistics extractor',
-        epilog='Example config: {}'.format(os.path.join(CONFIG_PATH, 'morph_stats.yaml')))
+        epilog='Example config: {}'.format(EXAMPLE_CONFIG))
 
     parser.add_argument('datapath', nargs='?',
                         help='Path to a morphology data file or a directory')
@@ -98,7 +93,7 @@ def main(args):
         config = full_config()
     else:
         try:
-            config = get_config(args.config, os.path.join(CONFIG_PATH, 'morph_stats.yaml'))
+            config = get_config(args.config, EXAMPLE_CONFIG)
             config = sanitize_config(config)
         except exceptions.ConfigError as e:
             L.error(str(e))
diff --git a/neurom/apps/morph_stats.py b/neurom/apps/morph_stats.py
index f197f870c..385205ae9 100644
--- a/neurom/apps/morph_stats.py
+++ b/neurom/apps/morph_stats.py
@@ -30,21 +30,31 @@
 import logging
 from collections import defaultdict
 from itertools import product
+import os
+import pkg_resources
+
+
 import numpy as np
+import pandas as pd
+
 import neurom as nm
+from neurom.fst._core import FstNeuron
 from neurom.features import NEURONFEATURES, NEURITEFEATURES
-
 from neurom.exceptions import ConfigError
 
 L = logging.getLogger(__name__)
 
+EXAMPLE_CONFIG = os.path.join(pkg_resources.resource_filename(
+    'neurom', 'config'), 'morph_stats.yaml')
+
 
 def eval_stats(values, mode):
     """Extract a summary statistic from an array of list of values.
 
     Arguments:
         values: A numpy array of values
-        mode: A summary stat to extract. One of ['min', 'max', 'median', 'mean', 'std', 'raw']
+        mode: A summary stat to extract. One of:
+            ['min', 'max', 'median', 'mean', 'std', 'raw', 'total']
 
     Note: fails silently if values is empty, and None is returned
     """
@@ -73,12 +83,73 @@ def _stat_name(feat_name, stat_mode):
     return '%s_%s' % (stat_mode, feat_name)
 
 
+def extract_dataframe(neurons, config):
+    """Extract stats grouped by neurite type from neurons.
+
+    Arguments:
+        neurons: a neuron, population or neurite tree
+        config (dict): configuration dict. The keys are:
+            - neurite_type: a list of neurite types for which features are extracted
+              If not provided, all neurite_type will be used
+            - neurite: a dictionary {{neurite_feature: mode}} where:
+                - neurite_feature is a string from NEURITEFEATURES
+                - mode is an aggregation operation provided as a string such as:
+                  ['min', 'max', 'median', 'mean', 'std', 'raw', 'total']
+
+    Returns:
+        The extracted statistics
+
+    Note:
+        An example config can be found at:
+
+    {config_path}
+    """
+    if isinstance(neurons, FstNeuron):
+        neurons = [neurons]
+    config = config.copy()
+
+    # Only NEURITEFEATURES are considered since the dataframe is built by neurite_type
+    # NEURONFEATURES are discarded
+    if 'neuron' in config:
+        del config['neuron']
+
+    stats = {nrn.name: extract_stats(nrn, config) for nrn in neurons}
+    columns = list(next(iter(next(iter(stats.values())).values())).keys())
+
+    rows = [[name, neurite_type] + list(features.values())
+            for name, data in stats.items()
+            for neurite_type, features in data.items()]
+    return pd.DataFrame(columns=['name', 'neurite_type'] + columns,
+                        data=rows)
+
+
 def extract_stats(neurons, config):
-    """Extract stats from neurons."""
-    stats = defaultdict(dict)
+    """Extract stats from neurons.
+
+    Arguments:
+        neurons: a neuron, population or neurite tree
+        config (dict): configuration dict. The keys are:
+            - neurite_type: a list of neurite types for which features are extracted
+              If not provided, all neurite_type will be used
+            - neurite: a dictionary {{neurite_feature: mode}} where:
+                - neurite_feature is a string from NEURITEFEATURES
+                - mode is an aggregation operation provided as a string such as:
+                  ['min', 'max', 'median', 'mean', 'std', 'raw', 'total']
+
+    Returns:
+        The extracted statistics
+
+    Note:
+        An example config can be found at:
+
+    {config_path}
+    """
 
-    def _fill_compoundified(data, stat_name, stat):
-        """Insert the stat in the dict and eventually split it into XYZ components."""
+    def _fill_stats_dict(data, stat_name, stat):
+        """Insert the stat data in the dict.
+
+        And if the stats is a 3D array, splits it into XYZ components.
+        """
         if stat is None or not isinstance(stat, np.ndarray) or stat.shape not in ((3, ), ):
             data[stat_name] = stat
         else:
@@ -86,21 +157,24 @@ def _fill_compoundified(data, stat_name, stat):
                 compound_stat_name = stat_name + '_' + suffix
                 data[compound_stat_name] = stat[i]
 
+    stats = defaultdict(dict)
+
     for (feature_name, modes), neurite_type in product(config['neurite'].items(),
-                                                       config['neurite_type']):
+                                                       config.get('neurite_type',
+                                                                  _NEURITE_MAP.keys())):
         neurite_type = _NEURITE_MAP[neurite_type]
         for mode in modes:
             stat_name = _stat_name(feature_name, mode)
             stat = eval_stats(nm.get(feature_name, neurons, neurite_type=neurite_type), mode)
-            _fill_compoundified(stats[neurite_type.name], stat_name, stat)
+            _fill_stats_dict(stats[neurite_type.name], stat_name, stat)
 
-    for feature_name, modes in config['neuron'].items():
+    for feature_name, modes in config.get('neuron', {}).items():
         for mode in modes:
             stat_name = _stat_name(feature_name, mode)
             stat = eval_stats(nm.get(feature_name, neurons), mode)
-            _fill_compoundified(stats, stat_name, stat)
+            _fill_stats_dict(stats, stat_name, stat)
 
-    return stats
+    return dict(stats)
 
 
 def get_header(results):
@@ -161,3 +235,7 @@ def sanitize_config(config):
         config['neuron'] = {}
 
     return config
+
+
+extract_stats.__doc__ = extract_stats.__doc__.format(config_path=EXAMPLE_CONFIG)
+extract_dataframe.__doc__ = extract_dataframe.__doc__.format(config_path=EXAMPLE_CONFIG)
diff --git a/neurom/apps/tests/test_morph_stats.py b/neurom/apps/tests/test_morph_stats.py
index a2072d7a0..99e31d710 100644
--- a/neurom/apps/tests/test_morph_stats.py
+++ b/neurom/apps/tests/test_morph_stats.py
@@ -31,14 +31,18 @@
 import numpy as np
 from nose.tools import (assert_almost_equal, assert_equal,
                         assert_greater_equal, assert_raises, ok_)
+import pandas as pd
+from pandas.testing import assert_frame_equal
 
 import neurom as nm
 from neurom.apps import morph_stats as ms
 from neurom.exceptions import ConfigError
 from neurom.features import NEURITEFEATURES, NEURONFEATURES
 
+
 _path = os.path.dirname(os.path.abspath(__file__))
-DATA_PATH = os.path.join(_path, '../../../test_data/swc')
+DATA_PATH = os.path.join(_path, '../../../test_data')
+SWC_PATH = os.path.join(DATA_PATH, 'swc')
 
 
 REF_CONFIG = {
@@ -127,7 +131,7 @@ def test_eval_stats_applies_numpy_function():
 
 
 def test_extract_stats_single_neuron():
-    nrn = nm.load_neuron(os.path.join(DATA_PATH, 'Neuron.swc'))
+    nrn = nm.load_neuron(os.path.join(SWC_PATH, 'Neuron.swc'))
     res = ms.extract_stats(nrn, REF_CONFIG)
     assert_equal(set(res.keys()), set(REF_OUT.keys()))
     # Note: soma radius is calculated from the sphere that gives the area
@@ -140,6 +144,56 @@ def test_extract_stats_single_neuron():
             assert_almost_equal(res[k][kk], REF_OUT[k][kk], places=3)
 
 
+def test_extract_dataframe():
+    # Vanilla test
+    nrns = nm.load_neurons([os.path.join(SWC_PATH, name)
+                            for name in ['Neuron.swc', 'simple.swc']])
+    actual = ms.extract_dataframe(nrns, REF_CONFIG)
+    expected = pd.read_csv(os.path.join(DATA_PATH, 'extracted-stats.csv'), index_col=0)
+    assert_frame_equal(actual, expected)
+
+    # Test with a single neuron in the population
+    nrns = nm.load_neurons(os.path.join(SWC_PATH, 'Neuron.swc'))
+    actual = ms.extract_dataframe(nrns, REF_CONFIG)
+    assert_frame_equal(actual, expected[expected.name == 'Neuron'], check_dtype=False)
+
+    # Test with a config without the 'neuron' key
+    nrns = nm.load_neurons([os.path.join(SWC_PATH, name)
+                            for name in ['Neuron.swc', 'simple.swc']])
+    config = {'neurite': {'section_lengths': ['total']},
+              'neurite_type': ['AXON', 'APICAL_DENDRITE', 'BASAL_DENDRITE', 'ALL']}
+    actual = ms.extract_dataframe(nrns, config)
+    expected = expected[['name', 'neurite_type', 'total_section_length']]
+    assert_frame_equal(actual, expected)
+
+    # Test with a FstNeuron argument
+    nrn = nm.load_neuron(os.path.join(SWC_PATH, 'Neuron.swc'))
+    actual = ms.extract_dataframe(nrn, config)
+    assert_frame_equal(actual, expected[expected.name == 'Neuron'], check_dtype=False)
+
+    # Test with a List[FstNeuron] argument
+    nrns = [nm.load_neuron(os.path.join(SWC_PATH, name))
+            for name in ['Neuron.swc', 'simple.swc']]
+    actual = ms.extract_dataframe(nrns, config)
+    assert_frame_equal(actual, expected)
+
+    # Test without any neurite_type keys, it should pick the defaults
+    config = {'neurite': {'total_length_per_neurite': ['total']}}
+    actual = ms.extract_dataframe(nrns, config)
+    expected = pd.DataFrame(
+        columns=['name', 'neurite_type', 'total_total_length_per_neurite'],
+        data=[['Neuron', 'axon', 207.879752],
+              ['Neuron', 'basal_dendrite', 418.432416],
+              ['Neuron', 'apical_dendrite', 214.373046],
+              ['Neuron', 'all', 840.685214],
+              ['simple', 'axon', 15.000000],
+              ['simple', 'basal_dendrite', 16.000000],
+              ['simple', 'apical_dendrite', 0.000000],
+              ['simple', 'all', 31.000000]])
+    assert_frame_equal(actual, expected)
+
+
+
 def test_get_header():
     fake_results = {'fake_name0': REF_OUT,
                     'fake_name1': REF_OUT,
diff --git a/neurom/tests/test_stats.py b/neurom/tests/test_stats.py
index a60890553..29affee49 100644
--- a/neurom/tests/test_stats.py
+++ b/neurom/tests/test_stats.py
@@ -32,10 +32,9 @@
 these tests are only sanity checks.
 """
 
+import numpy as np
 from neurom import stats as st
 from nose import tools as nt
-import numpy as np
-import random
 
 np.random.seed(42)
 
@@ -70,7 +69,6 @@ def test_fit_normal_regression():
     nt.assert_almost_equal(fit_.errs[0], 0.021479979161, 12)
     nt.assert_almost_equal(fit_.errs[1], 0.7369569123250506, 12)
 
-
 def test_fit_default_is_normal():
     fit0_ = st.fit(NORMAL)
     fit1_ = st.fit(NORMAL, 'norm')
diff --git a/setup.py b/setup.py
index b110c9c34..3d8b6ef58 100644
--- a/setup.py
+++ b/setup.py
@@ -42,6 +42,7 @@
         'h5py>=2.7.1',
         'matplotlib>=3.2.1',
         'numpy>=1.8.0',
+        'pandas>=1.0.5',
         'pyyaml>=3.10',
         'scipy>=1.2.0',
         'tqdm>=4.8.4',
diff --git a/test_data/extracted-stats.csv b/test_data/extracted-stats.csv
new file mode 100644
index 000000000..642931522
--- /dev/null
+++ b/test_data/extracted-stats.csv
@@ -0,0 +1,9 @@
+,name,neurite_type,max_section_length,total_section_length,total_section_volume,max_section_branch_order,max_segment_midpoint_X,max_segment_midpoint_Y,max_segment_midpoint_Z
+0,Neuron,axon,11.018460736176685,207.8797522090813,276.7385765728952,10.0,0.0,0.0,49.52030596415
+1,Neuron,apical_dendrite,11.758281556059444,214.37304577550353,271.9412385728449,10.0,64.40167498405,0.0,53.750947521650005
+2,Neuron,basal_dendrite,11.652508126101711,418.43241643793476,556.2279268208382,10.0,64.00787233325,48.48197694465,51.575580778049996
+3,Neuron,all,11.758281556059444,840.6852144225195,1104.9077419665782,10.0,64.40167498405,48.48197694465,53.750947521650005
+4,simple,axon,6.0,15.0,24.08554367752175,1.0,3.0,-2.0,0.0
+5,simple,apical_dendrite,,0.0,0.0,,,,
+6,simple,basal_dendrite,6.0,16.0,27.227136331111538,1.0,3.0,5.0,0.0
+7,simple,all,6.0,31.0,51.312680008633286,1.0,3.0,5.0,0.0