From d7d3ba1841355256cfeee9926a069234e93e5052 Mon Sep 17 00:00:00 2001 From: Evan Peterson Date: Wed, 7 Aug 2019 14:00:13 -0600 Subject: [PATCH 01/20] Add another IDE to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index e2e0dab..99be1ac 100644 --- a/.gitignore +++ b/.gitignore @@ -102,6 +102,7 @@ ENV/ # IDE configurations .idea/ +.vscode/ # miscellaneous test_all_datasets.py From 3629e13c2e8c17855aa63d695a4ae2c664706a0c Mon Sep 17 00:00:00 2001 From: Evan Peterson Date: Wed, 7 Aug 2019 14:03:23 -0600 Subject: [PATCH 02/20] Add utilities needed for refactor --- metalearn/metafeatures/base.py | 176 ++++++++++++++++++++++++++++ metalearn/metafeatures/constants.py | 27 +++++ 2 files changed, 203 insertions(+) create mode 100644 metalearn/metafeatures/base.py create mode 100644 metalearn/metafeatures/constants.py diff --git a/metalearn/metafeatures/base.py b/metalearn/metafeatures/base.py new file mode 100644 index 0000000..4a7019c --- /dev/null +++ b/metalearn/metafeatures/base.py @@ -0,0 +1,176 @@ +import inspect +from abc import ABC, abstractmethod +from typing import List, Callable, Dict, Union, Optional, Any +import itertools + +from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup + +class ResourceComputer: + + def __init__( + self, + computer: Callable, + returns: List[str], + argmap: Optional[Dict[str,Any]] = {} + ) -> None: + """ + Decorates ``computer``, a resource computing function + with metadata about that function. + + Parameters + ---------- + computer + The function that computes the resources. + returns + The names of the resources that ``computer`` returns, specified in + the same order as ``computer`` returns them. + argmap + A custom map of ``computer``'s argument names to the global resource names + that will be passed as ``computer``'s arguments when ``computer`` is called. + """ + self._computer = computer + self.returns = returns + + self.argmap = {} + + # reversing is needed because `self.defaults` gives the default + # argument values corresponding to the *last* `n` arguments in the + # function signature. + reversed_args = self.args[::-1] + reversed_defaults = self.defaults[::-1] + arg_default_pairs = itertools.zip_longest(reversed_args, reversed_defaults) + + for local_name, default in arg_default_pairs: + # By default, just use the `computer` function's + # normal local argument names in the argmap, + # making sure to preserve default argument values + # when they are supplied. + if default is not None: + # The function has a default value for this arg; + # use that. + self.argmap[local_name] = default + else: + # This function has no default. Tell the system + # to pass in the global resource identified by + # this arg's ``local_name`` when calling this + # ``computer``. + self.argmap[local_name] = local_name + + for local_name, resource_name in argmap.items(): + # Now include any argument name or value overrides + # the developer has provided. Note: `resource_name` + # may be a global resource name (e.g. `"XSample"`) or + # a direct value for the argument (e.g. `5`) + self.argmap[local_name] = resource_name + + def __call__(self, *args, **kwargs): + """ + Allows a ``ResourceComputer`` instance to be callable. + Just forwards all arguments on to self._computer. + """ + return self._computer(*args, **kwargs) + + @property + def args(self) -> list: + """Returns a list of the positional parameter names of self._computer""" + return inspect.getfullargspec(self._computer).args + + @property + def defaults(self) -> list: + """ + From https://docs.python.org/3/library/inspect.html#inspect.getfullargspec + [Returns] an n-tuple of default argument values corresponding to the last `n` + positional parameters [of self._computer]. + """ + defaults = inspect.getfullargspec(self._computer).defaults + return [] if defaults is None else defaults + + @property + def name(self) -> str: + """Returns the function name of self._computer""" + return self._computer.__name__ + + +class MetafeatureComputer(ResourceComputer): + + def __init__( + self, + computer: Callable, + returns: List[str], # TODO: Add support for passing just a string, not a list? + problem_type: ProblemType, + groups: List[MetafeatureGroup], + argmap: Optional[Dict[str,str]] = {} + ) -> None: + """ + Decorates ``computer``, a metafeature computing function + with metadata about that function. + + Parameters + ---------- + computer + The function that computes the metafeatures. + returns + The names of the metafeatures that ``computer`` returns, specified in + the same order as ``computer`` returns them. + problem_type + The type of ML problem `computer`'s metafeatures can be computed for. + groups + The metafeature groups this computer's returned metafeatures belong to. + e.g. statistical, info-theoretic, simple, etc. + argmap + A custom map of ``computer``'s argument names to the global resource names + that will be passed as ``computer``'s arguments when ``computer`` is called. + """ + super(MetafeatureComputer, self).__init__(computer, returns, argmap) + self.groups = groups + self.problem_type = problem_type + + +class ResourceComputerMap: + def __init__(self, computers: Union[ResourceComputer,List[ResourceComputer],None] = None) -> None: + """ + Wraps a dictionary map of resource names to their computers. + Includes visibility into whether duplicate computers + are trying to become associated with a resource in the map e.g. + if a package developer has accidentally declared two computers + that return the same resource. + """ + self._map: Dict[str,ResourceComputer] = {} + if computers is not None: + self.add(computers) + + def __contains__(self, key): + """Called to implement membership test operators. e.g. `key in my_resouce_map`.""" + return key in self._map + + def add(self, computers: Union[ResourceComputer,List[ResourceComputer]]) -> None: + """ + Adds more resource name/resource computer key/value + pairs to a resource map, throwing an error on duplicates. + """ + if isinstance(computers, list): + for computer in computers: + self._add_one(computer) + elif isinstance(computers, ResourceComputer): + self._add_one(computers) + else: + raise ValueError("computers must be ResourceComputer or List[ResourceComputer]") + + def get(self, key: str = None) -> Union[Dict[str,ResourceComputer],ResourceComputer]: + """Used for getting the resource map.""" + if key is not None: + return self._map[key] + return self._map + + def _add_one(self, computer: ResourceComputer) -> None: + if not isinstance(computer, ResourceComputer): + raise ValueError(f"computer is not a ResourceComputer; it is a {type(computer)}") + + for resource_name in computer.returns: + if resource_name in self._map: + raise ValueError( + f"duplicate computer '{computer.name}' provided for resource '{resource_name}', " + f"which is already present in the resouce map, registered " + f"by computer '{self.get(resource_name).name}'" + ) + self._map[resource_name] = computer \ No newline at end of file diff --git a/metalearn/metafeatures/constants.py b/metalearn/metafeatures/constants.py new file mode 100644 index 0000000..b76cc22 --- /dev/null +++ b/metalearn/metafeatures/constants.py @@ -0,0 +1,27 @@ +from enum import Enum + +# Constant Enums +class ProblemType(Enum): + CLASSIFICATION = "classification" + REGRESSION = "regression" + ANY = "any" + +class MetafeatureGroup(Enum): + ALL = "all" + SIMPLE = "simple" + TEXT = "text" + STATISTICAL = "statistical" + INFO_THEORETIC = "info_theoretic" + LANDMARKING = "landmarking" + MODEL_BASED = "model_based" + TARGET_DEPENDENT = "target_dependent" + +# Constant strings +VALUE_KEY = 'value' +COMPUTE_TIME_KEY = 'compute_time' +NUMERIC = "NUMERIC" +TEXT = "TEXT" +CATEGORICAL = "CATEGORICAL" +NO_TARGETS = "NO_TARGETS" +NUMERIC_TARGETS = "NUMERIC_TARGETS" +TIMEOUT = "TIMEOUT" \ No newline at end of file From 6099500fc698b7a3fdcae681d8300b2788eeeda0 Mon Sep 17 00:00:00 2001 From: Evan Peterson Date: Wed, 7 Aug 2019 14:07:28 -0600 Subject: [PATCH 03/20] Do refactor of metafeatures.json --- MANIFEST.in | 1 - metalearn/__init__.py | 2 +- metalearn/metafeatures/common_operations.py | 10 +- .../decision_tree_metafeatures.py | 131 + .../general_resource_computers.py | 247 + .../information_theoretic_metafeatures.py | 202 +- .../metafeatures/landmarking_metafeatures.py | 128 +- metalearn/metafeatures/metafeatures.json | 4174 ----------------- metalearn/metafeatures/metafeatures.py | 287 +- metalearn/metafeatures/resources.py | 77 +- metalearn/metafeatures/simple_metafeatures.py | 118 +- .../metafeatures/statistical_metafeatures.py | 124 +- metalearn/metafeatures/text_metafeatures.py | 131 +- tests/benchmark_metafeatures.py | 3 +- tests/compare_with_openml.py | 3 +- tests/test_metafeatures.py | 30 +- 16 files changed, 1217 insertions(+), 4451 deletions(-) create mode 100644 metalearn/metafeatures/general_resource_computers.py delete mode 100644 metalearn/metafeatures/metafeatures.json diff --git a/MANIFEST.in b/MANIFEST.in index 7c0d500..1650733 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1 @@ -include metalearn/metafeatures/metafeatures.json include metalearn/metafeatures/metafeatures_schema.json \ No newline at end of file diff --git a/metalearn/__init__.py b/metalearn/__init__.py index 47929e8..3c7a77a 100644 --- a/metalearn/__init__.py +++ b/metalearn/__init__.py @@ -1,2 +1,2 @@ from .metafeatures.metafeatures import Metafeatures -from .metafeatures.resources import METAFEATURE_CONFIG, METAFEATURES_JSON_SCHEMA \ No newline at end of file +from .metafeatures.resources import METAFEATURES_JSON_SCHEMA \ No newline at end of file diff --git a/metalearn/metafeatures/common_operations.py b/metalearn/metafeatures/common_operations.py index aa313ed..59f7d9e 100644 --- a/metalearn/metafeatures/common_operations.py +++ b/metalearn/metafeatures/common_operations.py @@ -3,6 +3,8 @@ from scipy.stats import skew, kurtosis +import metalearn.metafeatures.constants as consts + def profile_distribution(data): """ Compute the mean, standard deviation, min, quartile1, quartile2, quartile3, and max of a vector @@ -16,7 +18,7 @@ def profile_distribution(data): features = dictionary containing the min, max, mean, and standard deviation """ if len(data) == 0: - return (data, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan) + return (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan) else: ddof = 1 if len(data) > 1 else 0 dist_mean = np.mean(data) @@ -24,13 +26,13 @@ def profile_distribution(data): dist_min, dist_quartile1, dist_quartile2, dist_quartile3, dist_max = np.percentile(data, [0,25,50,75,100]) dist_skew = skew(data) dist_kurtosis = kurtosis(data) - return (data, dist_mean, dist_stdev, dist_skew, dist_kurtosis, dist_min, dist_quartile1, dist_quartile2, dist_quartile3, dist_max) + return (dist_mean, dist_stdev, dist_skew, dist_kurtosis, dist_min, dist_quartile1, dist_quartile2, dist_quartile3, dist_max) def get_numeric_features(dataframe, column_types): - return [feature for feature in dataframe.columns if column_types[feature] == "NUMERIC"] + return [feature for feature in dataframe.columns if column_types[feature] == consts.NUMERIC] def get_categorical_features(dataframe, column_types): - return [feature for feature in dataframe.columns if column_types[feature] == "CATEGORICAL"] + return [feature for feature in dataframe.columns if column_types[feature] == consts.CATEGORICAL] def dtype_is_numeric(dtype): return "int" in str(dtype) or "float" in str(dtype) diff --git a/metalearn/metafeatures/decision_tree_metafeatures.py b/metalearn/metafeatures/decision_tree_metafeatures.py index 3622150..3f85cc7 100644 --- a/metalearn/metafeatures/decision_tree_metafeatures.py +++ b/metalearn/metafeatures/decision_tree_metafeatures.py @@ -3,6 +3,9 @@ from sklearn.tree import DecisionTreeClassifier from metalearn.metafeatures.common_operations import profile_distribution +from metalearn.metafeatures.base import ResourceComputer, MetafeatureComputer +from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup + class DecisionTree: @@ -61,20 +64,148 @@ def get_width(self): def get_decision_tree(X, Y, seed): return (DecisionTree(X, Y, seed),) +get_decision_tree = ResourceComputer( + get_decision_tree, + ["DecisionTree"], + { + "X": "XPreprocessed", + "Y": "YSample", + "seed": 9 + } +) + + def traverse_tree(tree): return (TraversedDecisionTree(tree),) +traverse_tree = ResourceComputer( + traverse_tree, + ["TraversedDecisionTree"], + { "tree": "DecisionTree" } +) + + def get_decision_tree_level_sizes(tree): return profile_distribution(tree.level_sizes) +get_decision_tree_level_sizes = MetafeatureComputer( + get_decision_tree_level_sizes, + [ + "MeanDecisionTreeLevelSize", + "StdevDecisionTreeLevelSize", + "SkewDecisionTreeLevelSize", + "KurtosisDecisionTreeLevelSize", + "MinDecisionTreeLevelSize", + "Quartile1DecisionTreeLevelSize", + "Quartile2DecisionTreeLevelSize", + "Quartile3DecisionTreeLevelSize", + "MaxDecisionTreeLevelSize" + ], + ProblemType.CLASSIFICATION, + [MetafeatureGroup.MODEL_BASED], + { + "tree": "TraversedDecisionTree" + } +) + + def get_decision_tree_branch_lengths(tree): return profile_distribution(tree.branch_lengths) +get_decision_tree_branch_lengths = MetafeatureComputer( + get_decision_tree_branch_lengths, + [ + "MeanDecisionTreeBranchLength", + "StdevDecisionTreeBranchLength", + "SkewDecisionTreeBranchLength", + "KurtosisDecisionTreeBranchLength", + "MinDecisionTreeBranchLength", + "Quartile1DecisionTreeBranchLength", + "Quartile2DecisionTreeBranchLength", + "Quartile3DecisionTreeBranchLength", + "MaxDecisionTreeBranchLength" + ], + ProblemType.CLASSIFICATION, + [MetafeatureGroup.MODEL_BASED], + { + "tree": "TraversedDecisionTree" + } +) + + def get_decision_tree_attributes(tree): return profile_distribution(tree.get_attributes()) +get_decision_tree_attributes = MetafeatureComputer( + get_decision_tree_attributes, + [ + "MeanDecisionTreeAttribute", + "StdevDecisionTreeAttribute", + "SkewDecisionTreeAttribute", + "KurtosisDecisionTreeAttribute", + "MinDecisionTreeAttribute", + "Quartile1DecisionTreeAttribute", + "Quartile2DecisionTreeAttribute", + "Quartile3DecisionTreeAttribute", + "MaxDecisionTreeAttribute" + ], + ProblemType.CLASSIFICATION, + [MetafeatureGroup.MODEL_BASED], + { + "tree": "DecisionTree" + } +) + + def get_decision_tree_general_info(tree): return tree.get_general_info() +get_decision_tree_general_info = MetafeatureComputer( + get_decision_tree_general_info, + [ + "DecisionTreeNodeCount", + "DecisionTreeLeafCount", + "DecisionTreeHeight", + ], + ProblemType.CLASSIFICATION, + [MetafeatureGroup.MODEL_BASED], + { + "tree": "DecisionTree" + } +) + + def get_decision_tree_width(tree): return (tree.get_width(),) + +get_decision_tree_width = MetafeatureComputer( + get_decision_tree_width, + ["DecisionTreeWidth"], + ProblemType.CLASSIFICATION, + [MetafeatureGroup.MODEL_BASED], + { + "tree": "TraversedDecisionTree" + } +) + + +""" +A list of all ResourceComputer +instances in this module. +""" +resource_computers = [ + get_decision_tree, + traverse_tree +] + +""" +A list of all MetafeatureComputer +instances in this module. +""" +metafeature_computers = [ + get_decision_tree_level_sizes, + get_decision_tree_branch_lengths, + get_decision_tree_attributes, + get_decision_tree_general_info, + get_decision_tree_width +] \ No newline at end of file diff --git a/metalearn/metafeatures/general_resource_computers.py b/metalearn/metafeatures/general_resource_computers.py new file mode 100644 index 0000000..7e1c144 --- /dev/null +++ b/metalearn/metafeatures/general_resource_computers.py @@ -0,0 +1,247 @@ +import numpy as np +import pandas as pd +from sklearn.model_selection import StratifiedShuffleSplit + +from metalearn.metafeatures.base import ResourceComputer +import metalearn.metafeatures.constants as consts + +def get_cv_seed(seed_base, seed_offset = 1): + return (seed_base + seed_offset,) + +get_cv_seed = ResourceComputer(get_cv_seed, ["cv_seed"]) + + +def sample_columns(X, sample_shape, seed): + if sample_shape[1] is None or X.shape[1] <= sample_shape[1]: + X_sample = X + else: + np.random.seed(seed) + sampled_column_indices = np.random.choice( + X.shape[1], size=sample_shape[1], replace=False + ) + sampled_columns = X.columns[sampled_column_indices] + X_sample = X[sampled_columns] + return (X_sample,) + +sample_columns = ResourceComputer( + sample_columns, + ["XSampledColumns"], + { "seed": 2 } +) + + +def sample_rows(X, Y, sample_shape, seed): + """ + Stratified uniform sampling of rows, according to the classes in Y. + Ensures there are enough samples from each class in Y for cross + validation. + """ + if sample_shape[0] is None or X.shape[0] <= sample_shape[0]: + X_sample, Y_sample = X, Y + elif Y is None: + np.random.seed(seed) + row_indices = np.random.choice( + X.shape[0], size=sample_shape[0], replace=False + ) + X_sample, Y_sample = X.iloc[row_indices], Y + else: + drop_size = X.shape[0] - sample_shape[0] + sample_size = sample_shape[0] + sss = StratifiedShuffleSplit( + n_splits=2, test_size=drop_size, train_size=sample_size, random_state=seed + ) + row_indices, _ = next(sss.split(X, Y)) + X_sample, Y_sample = X.iloc[row_indices], Y.iloc[row_indices] + return (X_sample, Y_sample) + +sample_rows = ResourceComputer( + sample_rows, + ["XSample","YSample"], + { "X": "XSampledColumns", "seed": 3 } +) + + +def get_preprocessed_data(X_sample, X_sampled_columns, column_types, seed): + series_array = [] + for feature in X_sample.columns: + is_text = False + feature_series = X_sample[feature].copy() + col = feature_series.values + dropped_nan_series = X_sampled_columns[feature].dropna( + axis=0,how='any' + ) + num_nan = np.sum(feature_series.isnull()) + np.random.seed(seed) + col[feature_series.isnull()] = np.random.choice( + dropped_nan_series, size=num_nan + ) + if column_types[feature_series.name] == consts.CATEGORICAL: + feature_series = pd.get_dummies(feature_series) + elif column_types[feature_series.name] == consts.TEXT: + is_text = True + if not is_text: + series_array.append(feature_series) + return (pd.concat(series_array, axis=1, copy=False),) + +get_preprocessed_data = ResourceComputer( + get_preprocessed_data, + ["XPreprocessed"], + { + "X_sample": "XSample", + "X_sampled_columns": "XSampledColumns", + "seed": 4 + } +) + + +def get_categorical_features_with_no_missing_values( + X_sample, column_types +): + categorical_features_with_no_missing_values = [] + for feature in X_sample.columns: + if column_types[feature] == consts.CATEGORICAL: + no_nan_series = X_sample[feature].dropna( + axis=0, how='any' + ) + categorical_features_with_no_missing_values.append( + no_nan_series + ) + return (categorical_features_with_no_missing_values,) + +get_categorical_features_with_no_missing_values = ResourceComputer( + get_categorical_features_with_no_missing_values, + ["NoNaNCategoricalFeatures"], + { "X_sample": "XSample" } +) + + +def get_categorical_features_and_class_with_no_missing_values( + X_sample, Y_sample, column_types +): + categorical_features_and_class_with_no_missing_values = [] + for feature in X_sample.columns: + if column_types[feature] == consts.CATEGORICAL: + df = pd.concat([X_sample[feature],Y_sample], axis=1).dropna( + axis=0, how='any' + ) + categorical_features_and_class_with_no_missing_values.append( + (df[feature],df[Y_sample.name]) + ) + return (categorical_features_and_class_with_no_missing_values,) + +get_categorical_features_and_class_with_no_missing_values = ResourceComputer( + get_categorical_features_and_class_with_no_missing_values, + ["NoNaNCategoricalFeaturesAndClass"], + { + "X_sample": "XSample", + "Y_sample": "YSample" + } +) + + +def get_numeric_features_with_no_missing_values( + X_sample, column_types +): + numeric_features_with_no_missing_values = [] + for feature in X_sample.columns: + if column_types[feature] == consts.NUMERIC: + no_nan_series = X_sample[feature].dropna( + axis=0, how='any' + ) + numeric_features_with_no_missing_values.append( + no_nan_series + ) + return (numeric_features_with_no_missing_values,) + +get_numeric_features_with_no_missing_values = ResourceComputer( + get_numeric_features_with_no_missing_values, + ["NoNaNNumericFeatures"], + { "X_sample": "XSample" } +) + + +def get_binned_numeric_features_with_no_missing_values( + numeric_features_array +): + binned_feature_array = [ + ( + pd.cut(feature, + round(feature.shape[0]**(1./3.))) + ) for feature in numeric_features_array + ] + return (binned_feature_array,) + +get_binned_numeric_features_with_no_missing_values = ResourceComputer( + get_binned_numeric_features_with_no_missing_values, + ["NoNaNBinnedNumericFeatures"], + { "numeric_features_array": "NoNaNNumericFeatures" } +) + + +def get_binned_numeric_features_and_class_with_no_missing_values( + X_sample, Y_sample, column_types +): + numeric_features_and_class_with_no_missing_values = [] + for feature in X_sample.columns: + if column_types[feature] == consts.NUMERIC: + # renaming avoids name collisions and problems when y does not have a name + df = pd.concat([X_sample[feature].rename('x'), Y_sample.rename('y')], axis=1) + df.dropna(axis=0, how='any', inplace=True) + numeric_features_and_class_with_no_missing_values.append( + (df['x'],df['y']) + ) + binned_feature_class_array = [ + ( + pd.cut(feature_class_pair[0], + round(feature_class_pair[0].shape[0]**(1./3.))), + feature_class_pair[1] + ) for feature_class_pair in numeric_features_and_class_with_no_missing_values + ] + return (binned_feature_class_array,) + +get_binned_numeric_features_and_class_with_no_missing_values = ResourceComputer( + get_binned_numeric_features_and_class_with_no_missing_values, + ["NoNaNBinnedNumericFeaturesAndClass"], + { + "X_sample": "XSample", + "Y_sample": "YSample" + } +) + + +def get_text_features_with_no_missing_values( + X_sample, column_types +): + text_features_with_no_missing_values = [] + for feature in X_sample.columns: + if column_types[feature] == consts.TEXT: + no_nan_series = X_sample[feature].dropna( + axis=0, how='any' + ) + text_features_with_no_missing_values.append( + no_nan_series + ) + return (text_features_with_no_missing_values,) + +get_text_features_with_no_missing_values = ResourceComputer( + get_text_features_with_no_missing_values, + ["NoNaNTextFeatures"], + { "X_sample": "XSample" } +) + +""" +A list of all ResourceComputer +instances in this module. +""" +resource_computers = [ + get_cv_seed, + sample_columns, + sample_rows, + get_preprocessed_data, + get_categorical_features_with_no_missing_values, + get_categorical_features_and_class_with_no_missing_values, + get_numeric_features_with_no_missing_values, + get_binned_numeric_features_with_no_missing_values, + get_binned_numeric_features_and_class_with_no_missing_values, + get_text_features_with_no_missing_values +] \ No newline at end of file diff --git a/metalearn/metafeatures/information_theoretic_metafeatures.py b/metalearn/metafeatures/information_theoretic_metafeatures.py index 20237ae..7309602 100644 --- a/metalearn/metafeatures/information_theoretic_metafeatures.py +++ b/metalearn/metafeatures/information_theoretic_metafeatures.py @@ -2,7 +2,9 @@ from scipy.stats import entropy from sklearn.metrics import mutual_info_score -from .common_operations import * +from metalearn.metafeatures.common_operations import * +from metalearn.metafeatures.base import MetafeatureComputer +from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup def get_entropy(col): return entropy(col.value_counts()) @@ -10,18 +12,152 @@ def get_entropy(col): def get_class_entropy(Y_sample): return (get_entropy(Y_sample),) +get_class_entropy = MetafeatureComputer( + get_class_entropy, + ["ClassEntropy"], + ProblemType.CLASSIFICATION, + [MetafeatureGroup.INFO_THEORETIC], + { + "Y_sample": "YSample" + } +) + + def get_attribute_entropy(feature_array): entropies = [get_entropy(feature) for feature in feature_array] return profile_distribution(entropies) +get_categorical_attribute_entropy = MetafeatureComputer( + get_attribute_entropy, + [ + "MeanCategoricalAttributeEntropy", + "StdevCategoricalAttributeEntropy", + "SkewCategoricalAttributeEntropy", + "KurtosisCategoricalAttributeEntropy", + "MinCategoricalAttributeEntropy", + "Quartile1CategoricalAttributeEntropy", + "Quartile2CategoricalAttributeEntropy", + "Quartile3CategoricalAttributeEntropy", + "MaxCategoricalAttributeEntropy" + ], + ProblemType.ANY, + [MetafeatureGroup.INFO_THEORETIC], + { + "feature_array": "NoNaNCategoricalFeatures" + } +) + +get_numeric_attribute_entropy = MetafeatureComputer( + get_attribute_entropy, + [ + "MeanNumericAttributeEntropy", + "StdevNumericAttributeEntropy", + "SkewNumericAttributeEntropy", + "KurtosisNumericAttributeEntropy", + "MinNumericAttributeEntropy", + "Quartile1NumericAttributeEntropy", + "Quartile2NumericAttributeEntropy", + "Quartile3NumericAttributeEntropy", + "MaxNumericAttributeEntropy" + ], + ProblemType.ANY, + [MetafeatureGroup.INFO_THEORETIC], + { + "feature_array": "NoNaNBinnedNumericFeatures" + } +) + + def get_joint_entropy(feature_class_array): entropies = [get_entropy(translate_into_tuples(feature_class_pair[0],feature_class_pair[1])) for feature_class_pair in feature_class_array] return profile_distribution(entropies) +get_categorical_joint_entropy = MetafeatureComputer( + get_joint_entropy, + [ + "MeanCategoricalJointEntropy", + "StdevCategoricalJointEntropy", + "SkewCategoricalJointEntropy", + "KurtosisCategoricalJointEntropy", + "MinCategoricalJointEntropy", + "Quartile1CategoricalJointEntropy", + "Quartile2CategoricalJointEntropy", + "Quartile3CategoricalJointEntropy", + "MaxCategoricalJointEntropy" + ], + ProblemType.CLASSIFICATION, + [MetafeatureGroup.INFO_THEORETIC], + { + "feature_class_array": "NoNaNCategoricalFeaturesAndClass" + } +) + +get_numeric_joint_entropy = MetafeatureComputer( + get_joint_entropy, + [ + "MeanNumericJointEntropy", + "StdevNumericJointEntropy", + "SkewNumericJointEntropy", + "KurtosisNumericJointEntropy", + "MinNumericJointEntropy", + "Quartile1NumericJointEntropy", + "Quartile2NumericJointEntropy", + "Quartile3NumericJointEntropy", + "MaxNumericJointEntropy" + ], + ProblemType.CLASSIFICATION, + [MetafeatureGroup.INFO_THEORETIC], + { + "feature_class_array": "NoNaNBinnedNumericFeaturesAndClass" + } +) + + def get_mutual_information(feature_class_array): mi_scores = [mutual_info_score(*feature_class_pair) for feature_class_pair in feature_class_array] return profile_distribution(mi_scores) +get_categorical_mutual_information = MetafeatureComputer( + get_mutual_information, + [ + "MeanCategoricalMutualInformation", + "StdevCategoricalMutualInformation", + "SkewCategoricalMutualInformation", + "KurtosisCategoricalMutualInformation", + "MinCategoricalMutualInformation", + "Quartile1CategoricalMutualInformation", + "Quartile2CategoricalMutualInformation", + "Quartile3CategoricalMutualInformation", + "MaxCategoricalMutualInformation" + ], + ProblemType.CLASSIFICATION, + [MetafeatureGroup.INFO_THEORETIC], + { + "feature_class_array": "NoNaNCategoricalFeaturesAndClass" + } +) + +get_numeric_mutual_information = MetafeatureComputer( + get_mutual_information, + [ + "MeanNumericMutualInformation", + "StdevNumericMutualInformation", + "SkewNumericMutualInformation", + "KurtosisNumericMutualInformation", + "MinNumericMutualInformation", + "Quartile1NumericMutualInformation", + "Quartile2NumericMutualInformation", + "Quartile3NumericMutualInformation", + "MaxNumericMutualInformation" + ], + ProblemType.CLASSIFICATION, + [MetafeatureGroup.INFO_THEORETIC], + { + "feature_class_array": "NoNaNBinnedNumericFeaturesAndClass" + } +) + + def get_equivalent_number_features(class_entropy, mutual_information): if mutual_information == 0: enf = np.nan @@ -29,6 +165,29 @@ def get_equivalent_number_features(class_entropy, mutual_information): enf = class_entropy / mutual_information return (enf,) +get_equivalent_number_categorical_features = MetafeatureComputer( + get_equivalent_number_features, + ["EquivalentNumberOfCategoricalFeatures"], + ProblemType.CLASSIFICATION, + [MetafeatureGroup.INFO_THEORETIC], + { + "class_entropy": "ClassEntropy", + "mutual_information": "MeanCategoricalMutualInformation" + } +) + +get_equivalent_number_numeric_features = MetafeatureComputer( + get_equivalent_number_features, + ["EquivalentNumberOfNumericFeatures"], + ProblemType.CLASSIFICATION, + [MetafeatureGroup.INFO_THEORETIC], + { + "class_entropy": "ClassEntropy", + "mutual_information": "MeanNumericMutualInformation" + } +) + + def get_noise_signal_ratio(attribute_entropy, mutual_information): if mutual_information == 0: nsr = np.nan @@ -36,5 +195,46 @@ def get_noise_signal_ratio(attribute_entropy, mutual_information): nsr = (attribute_entropy - mutual_information) / mutual_information return (nsr,) +get_categorical_noise_signal_ratio = MetafeatureComputer( + get_noise_signal_ratio, + ["CategoricalNoiseToSignalRatio"], + ProblemType.CLASSIFICATION, + [MetafeatureGroup.INFO_THEORETIC], + { + "attribute_entropy": "MeanCategoricalAttributeEntropy", + "mutual_information": "MeanCategoricalMutualInformation" + } +) + +get_numeric_noise_signal_ratio = MetafeatureComputer( + get_noise_signal_ratio, + ["NumericNoiseToSignalRatio"], + ProblemType.CLASSIFICATION, + [MetafeatureGroup.INFO_THEORETIC], + { + "attribute_entropy": "MeanNumericAttributeEntropy", + "mutual_information": "MeanNumericMutualInformation" + } +) + + def translate_into_tuples(col1, col2): return pd.Series([x for x in zip(col1, col2)]) + +""" +A list of all MetafeatureComputer +instances in this module. +""" +metafeature_computers = [ + get_class_entropy, + get_categorical_attribute_entropy, + get_numeric_attribute_entropy, + get_categorical_joint_entropy, + get_numeric_joint_entropy, + get_categorical_mutual_information, + get_numeric_mutual_information, + get_equivalent_number_categorical_features, + get_equivalent_number_numeric_features, + get_categorical_noise_signal_ratio, + get_numeric_noise_signal_ratio +] \ No newline at end of file diff --git a/metalearn/metafeatures/landmarking_metafeatures.py b/metalearn/metafeatures/landmarking_metafeatures.py index f9b9f3e..3dc069a 100644 --- a/metalearn/metafeatures/landmarking_metafeatures.py +++ b/metalearn/metafeatures/landmarking_metafeatures.py @@ -10,7 +10,9 @@ from sklearn.naive_bayes import GaussianNB from sklearn.tree import DecisionTreeClassifier -from .common_operations import * +from metalearn.metafeatures.common_operations import * +from metalearn.metafeatures.base import MetafeatureComputer +from metalearn.metafeatures.constants import MetafeatureGroup, ProblemType ''' @@ -42,12 +44,42 @@ def get_naive_bayes(X, Y, n_folds, cv_seed): pipeline = Pipeline([('naive_bayes', GaussianNB())]) return run_pipeline(X, Y, pipeline, n_folds, cv_seed) +get_naive_bayes = MetafeatureComputer( + get_naive_bayes, + [ + "NaiveBayesErrRate", + "NaiveBayesKappa" + ], + ProblemType.CLASSIFICATION, + [MetafeatureGroup.LANDMARKING], + { + "X": "XPreprocessed", + "Y": "YSample" + } +) + + def get_knn_1(X, Y, n_folds, cv_seed): pipeline = Pipeline([( 'knn_1', KNeighborsClassifier(n_neighbors = 1, n_jobs=1) )]) return run_pipeline(X, Y, pipeline, n_folds, cv_seed) +get_knn_1 = MetafeatureComputer( + get_knn_1, + [ + "kNN1NErrRate", + "kNN1NKappa" + ], + ProblemType.CLASSIFICATION, + [MetafeatureGroup.LANDMARKING], + { + "X": "XPreprocessed", + "Y": "YSample" + } +) + + def get_decision_stump(X, Y, seed, n_folds, cv_seed): pipeline = Pipeline([( 'decision_stump', DecisionTreeClassifier( @@ -56,6 +88,22 @@ def get_decision_stump(X, Y, seed, n_folds, cv_seed): )]) return run_pipeline(X, Y, pipeline, n_folds, cv_seed) +get_decision_stump = MetafeatureComputer( + get_decision_stump, + [ + "DecisionStumpErrRate", + "DecisionStumpKappa" + ], + ProblemType.CLASSIFICATION, + [MetafeatureGroup.LANDMARKING], + { + "X": "XPreprocessed", + "Y": "YSample", + "seed": 5 + } +) + + def get_random_tree(X, Y, depth, seed, n_folds, cv_seed): pipeline = Pipeline([( 'random_tree', DecisionTreeClassifier( @@ -65,8 +113,86 @@ def get_random_tree(X, Y, depth, seed, n_folds, cv_seed): )]) return run_pipeline(X, Y, pipeline, n_folds, cv_seed) +get_random_tree_depth_1 = MetafeatureComputer( + get_random_tree, + [ + "RandomTreeDepth1ErrRate", + "RandomTreeDepth1Kappa" + ], + ProblemType.CLASSIFICATION, + [MetafeatureGroup.LANDMARKING], + { + "X": "XPreprocessed", + "Y": "YSample", + "depth": 1, + "seed": 6 + } +) + +get_random_tree_depth_2 = MetafeatureComputer( + get_random_tree, + [ + "RandomTreeDepth2ErrRate", + "RandomTreeDepth2Kappa" + ], + ProblemType.CLASSIFICATION, + [MetafeatureGroup.LANDMARKING], + { + "X": "XPreprocessed", + "Y": "YSample", + "depth": 2, + "seed": 7 + } +) + +get_random_tree_depth_3 = MetafeatureComputer( + get_random_tree, + [ + "RandomTreeDepth3ErrRate", + "RandomTreeDepth3Kappa" + ], + ProblemType.CLASSIFICATION, + [MetafeatureGroup.LANDMARKING], + { + "X": "XPreprocessed", + "Y": "YSample", + "depth": 3, + "seed": 8 + } +) + + def get_lda(X, Y, n_folds, cv_seed): pipeline = Pipeline([( 'lda', LinearDiscriminantAnalysis() )]) return run_pipeline(X, Y, pipeline, n_folds, cv_seed) + +get_lda = MetafeatureComputer( + get_lda, + [ + "LinearDiscriminantAnalysisErrRate", + "LinearDiscriminantAnalysisKappa" + ], + ProblemType.CLASSIFICATION, + [MetafeatureGroup.LANDMARKING], + { + "X": "XPreprocessed", + "Y": "YSample" + } +) + + +""" +A list of all MetafeatureComputer +instances in this module. +""" +metafeature_computers = [ + get_naive_bayes, + get_knn_1, + get_decision_stump, + get_random_tree_depth_1, + get_random_tree_depth_2, + get_random_tree_depth_3, + get_lda +] \ No newline at end of file diff --git a/metalearn/metafeatures/metafeatures.json b/metalearn/metafeatures/metafeatures.json deleted file mode 100644 index ba00400..0000000 --- a/metalearn/metafeatures/metafeatures.json +++ /dev/null @@ -1,4174 +0,0 @@ -{ - "resources": { - "X_raw": { - "function": "", - "arguments": {} - }, - "X": { - "function": "", - "arguments": {} - }, - "Y": { - "function": "", - "arguments": {} - }, - "column_types": { - "function": "", - "arguments": {} - }, - "sample_shape": { - "function": "", - "arguments": {} - }, - "seed_base": { - "function": "", - "arguments": {} - }, - "n_folds": { - "function": "", - "arguments": {} - }, - "cv_seed": { - "function": "self._get_cv_seed", - "arguments": { - "seed_base": "seed_base", - "seed_offset": 1 - }, - "returns": [ - "cv_seed" - ] - }, - "XSampledColumns": { - "function": "self._sample_columns", - "arguments": { - "X": "X", - "sample_shape": "sample_shape", - "seed": 2 - }, - "returns": [ - "XSampledColumns" - ] - }, - "XSample": { - "function": "self._sample_rows", - "arguments": { - "X": "XSampledColumns", - "Y": "Y", - "sample_shape": "sample_shape", - "seed": 3 - }, - "returns": [ - "XSample", - "YSample" - ] - }, - "YSample": { - "function": "self._sample_rows", - "arguments": { - "X": "XSampledColumns", - "Y": "Y", - "sample_shape": "sample_shape", - "seed": 3 - }, - "returns": [ - "XSample", - "YSample" - ] - }, - "XPreprocessed": { - "function": "self._get_preprocessed_data", - "arguments": { - "X_sample": "XSample", - "X_sampled_columns": "XSampledColumns", - "column_types": "column_types", - "seed": 4 - }, - "returns": [ - "XPreprocessed" - ] - }, - "NoNaNCategoricalFeatures": { - "function": "self._get_categorical_features_with_no_missing_values", - "arguments": { - "X_sample": "XSample", - "column_types": "column_types" - }, - "returns": [ - "NoNaNCategoricalFeatures" - ] - }, - "NoNaNCategoricalFeaturesAndClass": { - "function": "self._get_categorical_features_and_class_with_no_missing_values", - "arguments": { - "X_sample": "XSample", - "Y_sample": "YSample", - "column_types": "column_types" - }, - "returns": [ - "NoNaNCategoricalFeaturesAndClass" - ] - }, - "NoNaNNumericFeatures": { - "function": "self._get_numeric_features_with_no_missing_values", - "arguments": { - "X_sample": "XSample", - "column_types": "column_types" - }, - "returns": [ - "NoNaNNumericFeatures" - ] - }, - "NoNaNBinnedNumericFeatures": { - "function": "self._get_binned_numeric_features_with_no_missing_values", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "NoNaNBinnedNumericFeatures" - ] - }, - "NoNaNBinnedNumericFeaturesAndClass": { - "function": "self._get_binned_numeric_features_and_class_with_no_missing_values", - "arguments": { - "X_sample": "XSample", - "Y_sample": "YSample", - "column_types": "column_types" - }, - "returns": [ - "NoNaNBinnedNumericFeaturesAndClass" - ] - }, - "NoNaNTextFeatures": { - "function": "self._get_text_features_with_no_missing_values", - "arguments": { - "X_sample": "XSample", - "column_types": "column_types" - }, - "returns": [ - "NoNaNTextFeatures" - ] - }, - "ArrayOfStringLengthsOfTextFeatures": { - "function": "get_string_lengths_array_from_text_features", - "arguments": { - "text_features_array": "NoNaNTextFeatures" - }, - "returns": [ - "ArrayOfStringLengthsOfTextFeatures" - ] - }, - "DecisionTree": { - "function": "get_decision_tree", - "arguments": { - "X": "XPreprocessed", - "Y": "YSample", - "seed": 9 - }, - "returns": [ - "DecisionTree" - ] - }, - "TraversedDecisionTree": { - "function": "traverse_tree", - "arguments": { - "tree": "DecisionTree" - }, - "returns": [ - "TraversedDecisionTree" - ] - } - }, - "metafeatures": { - "NumberOfInstances": { - "function": "get_dataset_stats", - "arguments": { - "X": "X_raw", - "column_types": "column_types" - }, - "returns": [ - "NumberOfInstances", - "NumberOfFeatures", - "NumberOfNumericFeatures", - "NumberOfCategoricalFeatures", - "RatioOfNumericFeatures", - "RatioOfCategoricalFeatures" - ] - }, - "NumberOfFeatures": { - "function": "get_dataset_stats", - "arguments": { - "X": "X_raw", - "column_types": "column_types" - }, - "returns": [ - "NumberOfInstances", - "NumberOfFeatures", - "NumberOfNumericFeatures", - "NumberOfCategoricalFeatures", - "RatioOfNumericFeatures", - "RatioOfCategoricalFeatures" - ] - }, - "NumberOfClasses": { - "function": "get_class_stats", - "arguments": { - "Y": "Y" - }, - "returns": [ - "NumberOfClasses", - "ClassProbabilities", - "MeanClassProbability", - "StdevClassProbability", - "SkewClassProbability", - "KurtosisClassProbability", - "MinClassProbability", - "Quartile1ClassProbability", - "Quartile2ClassProbability", - "Quartile3ClassProbability", - "MaxClassProbability", - "MinorityClassSize", - "MajorityClassSize" - ] - }, - "NumberOfNumericFeatures": { - "function": "get_dataset_stats", - "arguments": { - "X": "X_raw", - "column_types": "column_types" - }, - "returns": [ - "NumberOfInstances", - "NumberOfFeatures", - "NumberOfNumericFeatures", - "NumberOfCategoricalFeatures", - "RatioOfNumericFeatures", - "RatioOfCategoricalFeatures" - ] - }, - "NumberOfCategoricalFeatures": { - "function": "get_dataset_stats", - "arguments": { - "X": "X_raw", - "column_types": "column_types" - }, - "returns": [ - "NumberOfInstances", - "NumberOfFeatures", - "NumberOfNumericFeatures", - "NumberOfCategoricalFeatures", - "RatioOfNumericFeatures", - "RatioOfCategoricalFeatures" - ] - }, - "RatioOfNumericFeatures": { - "function": "get_dataset_stats", - "arguments": { - "X": "X_raw", - "column_types": "column_types" - }, - "returns": [ - "NumberOfInstances", - "NumberOfFeatures", - "NumberOfNumericFeatures", - "NumberOfCategoricalFeatures", - "RatioOfNumericFeatures", - "RatioOfCategoricalFeatures" - ] - }, - "RatioOfCategoricalFeatures": { - "function": "get_dataset_stats", - "arguments": { - "X": "X_raw", - "column_types": "column_types" - }, - "returns": [ - "NumberOfInstances", - "NumberOfFeatures", - "NumberOfNumericFeatures", - "NumberOfCategoricalFeatures", - "RatioOfNumericFeatures", - "RatioOfCategoricalFeatures" - ] - }, - "Dimensionality": { - "function": "get_dimensionality", - "arguments": { - "number_of_features": "NumberOfFeatures", - "number_of_instances": "NumberOfInstances" - }, - "returns": [ - "Dimensionality" - ] - }, - "NumberOfMissingValues": { - "function": "get_missing_values", - "arguments": { - "X": "X_raw" - }, - "returns": [ - "NumberOfMissingValues", - "RatioOfMissingValues", - "NumberOfInstancesWithMissingValues", - "RatioOfInstancesWithMissingValues", - "NumberOfFeaturesWithMissingValues", - "RatioOfFeaturesWithMissingValues" - ] - }, - "RatioOfMissingValues": { - "function": "get_missing_values", - "arguments": { - "X": "X_raw" - }, - "returns": [ - "NumberOfMissingValues", - "RatioOfMissingValues", - "NumberOfInstancesWithMissingValues", - "RatioOfInstancesWithMissingValues", - "NumberOfFeaturesWithMissingValues", - "RatioOfFeaturesWithMissingValues" - ] - }, - "NumberOfInstancesWithMissingValues": { - "function": "get_missing_values", - "arguments": { - "X": "X_raw" - }, - "returns": [ - "NumberOfMissingValues", - "RatioOfMissingValues", - "NumberOfInstancesWithMissingValues", - "RatioOfInstancesWithMissingValues", - "NumberOfFeaturesWithMissingValues", - "RatioOfFeaturesWithMissingValues" - ] - }, - "RatioOfInstancesWithMissingValues": { - "function": "get_missing_values", - "arguments": { - "X": "X_raw" - }, - "returns": [ - "NumberOfMissingValues", - "RatioOfMissingValues", - "NumberOfInstancesWithMissingValues", - "RatioOfInstancesWithMissingValues", - "NumberOfFeaturesWithMissingValues", - "RatioOfFeaturesWithMissingValues" - ] - }, - "NumberOfFeaturesWithMissingValues": { - "function": "get_missing_values", - "arguments": { - "X": "X_raw" - }, - "returns": [ - "NumberOfMissingValues", - "RatioOfMissingValues", - "NumberOfInstancesWithMissingValues", - "RatioOfInstancesWithMissingValues", - "NumberOfFeaturesWithMissingValues", - "RatioOfFeaturesWithMissingValues" - ] - }, - "RatioOfFeaturesWithMissingValues": { - "function": "get_missing_values", - "arguments": { - "X": "X_raw" - }, - "returns": [ - "NumberOfMissingValues", - "RatioOfMissingValues", - "NumberOfInstancesWithMissingValues", - "RatioOfInstancesWithMissingValues", - "NumberOfFeaturesWithMissingValues", - "RatioOfFeaturesWithMissingValues" - ] - }, - "MeanClassProbability": { - "function": "get_class_stats", - "arguments": { - "Y": "Y" - }, - "returns": [ - "NumberOfClasses", - "ClassProbabilities", - "MeanClassProbability", - "StdevClassProbability", - "SkewClassProbability", - "KurtosisClassProbability", - "MinClassProbability", - "Quartile1ClassProbability", - "Quartile2ClassProbability", - "Quartile3ClassProbability", - "MaxClassProbability", - "MinorityClassSize", - "MajorityClassSize" - ] - }, - "StdevClassProbability": { - "function": "get_class_stats", - "arguments": { - "Y": "Y" - }, - "returns": [ - "NumberOfClasses", - "ClassProbabilities", - "MeanClassProbability", - "StdevClassProbability", - "SkewClassProbability", - "KurtosisClassProbability", - "MinClassProbability", - "Quartile1ClassProbability", - "Quartile2ClassProbability", - "Quartile3ClassProbability", - "MaxClassProbability", - "MinorityClassSize", - "MajorityClassSize" - ] - }, - "SkewClassProbability": { - "function": "get_class_stats", - "arguments": { - "Y": "Y" - }, - "returns": [ - "NumberOfClasses", - "ClassProbabilities", - "MeanClassProbability", - "StdevClassProbability", - "SkewClassProbability", - "KurtosisClassProbability", - "MinClassProbability", - "Quartile1ClassProbability", - "Quartile2ClassProbability", - "Quartile3ClassProbability", - "MaxClassProbability", - "MinorityClassSize", - "MajorityClassSize" - ] - }, - "KurtosisClassProbability": { - "function": "get_class_stats", - "arguments": { - "Y": "Y" - }, - "returns": [ - "NumberOfClasses", - "ClassProbabilities", - "MeanClassProbability", - "StdevClassProbability", - "SkewClassProbability", - "KurtosisClassProbability", - "MinClassProbability", - "Quartile1ClassProbability", - "Quartile2ClassProbability", - "Quartile3ClassProbability", - "MaxClassProbability", - "MinorityClassSize", - "MajorityClassSize" - ] - }, - "MinClassProbability": { - "function": "get_class_stats", - "arguments": { - "Y": "Y" - }, - "returns": [ - "NumberOfClasses", - "ClassProbabilities", - "MeanClassProbability", - "StdevClassProbability", - "SkewClassProbability", - "KurtosisClassProbability", - "MinClassProbability", - "Quartile1ClassProbability", - "Quartile2ClassProbability", - "Quartile3ClassProbability", - "MaxClassProbability", - "MinorityClassSize", - "MajorityClassSize" - ] - }, - "Quartile1ClassProbability": { - "function": "get_class_stats", - "arguments": { - "Y": "Y" - }, - "returns": [ - "NumberOfClasses", - "ClassProbabilities", - "MeanClassProbability", - "StdevClassProbability", - "SkewClassProbability", - "KurtosisClassProbability", - "MinClassProbability", - "Quartile1ClassProbability", - "Quartile2ClassProbability", - "Quartile3ClassProbability", - "MaxClassProbability", - "MinorityClassSize", - "MajorityClassSize" - ] - }, - "Quartile2ClassProbability": { - "function": "get_class_stats", - "arguments": { - "Y": "Y" - }, - "returns": [ - "NumberOfClasses", - "ClassProbabilities", - "MeanClassProbability", - "StdevClassProbability", - "SkewClassProbability", - "KurtosisClassProbability", - "MinClassProbability", - "Quartile1ClassProbability", - "Quartile2ClassProbability", - "Quartile3ClassProbability", - "MaxClassProbability", - "MinorityClassSize", - "MajorityClassSize" - ] - }, - "Quartile3ClassProbability": { - "function": "get_class_stats", - "arguments": { - "Y": "Y" - }, - "returns": [ - "NumberOfClasses", - "ClassProbabilities", - "MeanClassProbability", - "StdevClassProbability", - "SkewClassProbability", - "KurtosisClassProbability", - "MinClassProbability", - "Quartile1ClassProbability", - "Quartile2ClassProbability", - "Quartile3ClassProbability", - "MaxClassProbability", - "MinorityClassSize", - "MajorityClassSize" - ] - }, - "MaxClassProbability": { - "function": "get_class_stats", - "arguments": { - "Y": "Y" - }, - "returns": [ - "NumberOfClasses", - "ClassProbabilities", - "MeanClassProbability", - "StdevClassProbability", - "SkewClassProbability", - "KurtosisClassProbability", - "MinClassProbability", - "Quartile1ClassProbability", - "Quartile2ClassProbability", - "Quartile3ClassProbability", - "MaxClassProbability", - "MinorityClassSize", - "MajorityClassSize" - ] - }, - "MinorityClassSize": { - "function": "get_class_stats", - "arguments": { - "Y": "Y" - }, - "returns": [ - "NumberOfClasses", - "ClassProbabilities", - "MeanClassProbability", - "StdevClassProbability", - "SkewClassProbability", - "KurtosisClassProbability", - "MinClassProbability", - "Quartile1ClassProbability", - "Quartile2ClassProbability", - "Quartile3ClassProbability", - "MaxClassProbability", - "MinorityClassSize", - "MajorityClassSize" - ] - }, - "MajorityClassSize": { - "function": "get_class_stats", - "arguments": { - "Y": "Y" - }, - "returns": [ - "NumberOfClasses", - "ClassProbabilities", - "MeanClassProbability", - "StdevClassProbability", - "SkewClassProbability", - "KurtosisClassProbability", - "MinClassProbability", - "Quartile1ClassProbability", - "Quartile2ClassProbability", - "Quartile3ClassProbability", - "MaxClassProbability", - "MinorityClassSize", - "MajorityClassSize" - ] - }, - "MeanCardinalityOfCategoricalFeatures": { - "function": "get_categorical_cardinalities", - "arguments": { - "X": "X", - "column_types": "column_types" - }, - "returns": [ - "CardinalitiesOfCategoricalFeatures", - "MeanCardinalityOfCategoricalFeatures", - "StdevCardinalityOfCategoricalFeatures", - "SkewCardinalityOfCategoricalFeatures", - "KurtosisCardinalityOfCategoricalFeatures", - "MinCardinalityOfCategoricalFeatures", - "Quartile1CardinalityOfCategoricalFeatures", - "Quartile2CardinalityOfCategoricalFeatures", - "Quartile3CardinalityOfCategoricalFeatures", - "MaxCardinalityOfCategoricalFeatures" - ] - }, - "StdevCardinalityOfCategoricalFeatures": { - "function": "get_categorical_cardinalities", - "arguments": { - "X": "X", - "column_types": "column_types" - }, - "returns": [ - "CardinalitiesOfCategoricalFeatures", - "MeanCardinalityOfCategoricalFeatures", - "StdevCardinalityOfCategoricalFeatures", - "SkewCardinalityOfCategoricalFeatures", - "KurtosisCardinalityOfCategoricalFeatures", - "MinCardinalityOfCategoricalFeatures", - "Quartile1CardinalityOfCategoricalFeatures", - "Quartile2CardinalityOfCategoricalFeatures", - "Quartile3CardinalityOfCategoricalFeatures", - "MaxCardinalityOfCategoricalFeatures" - ] - }, - "SkewCardinalityOfCategoricalFeatures": { - "function": "get_categorical_cardinalities", - "arguments": { - "X": "X", - "column_types": "column_types" - }, - "returns": [ - "CardinalitiesOfCategoricalFeatures", - "MeanCardinalityOfCategoricalFeatures", - "StdevCardinalityOfCategoricalFeatures", - "SkewCardinalityOfCategoricalFeatures", - "KurtosisCardinalityOfCategoricalFeatures", - "MinCardinalityOfCategoricalFeatures", - "Quartile1CardinalityOfCategoricalFeatures", - "Quartile2CardinalityOfCategoricalFeatures", - "Quartile3CardinalityOfCategoricalFeatures", - "MaxCardinalityOfCategoricalFeatures" - ] - }, - "KurtosisCardinalityOfCategoricalFeatures": { - "function": "get_categorical_cardinalities", - "arguments": { - "X": "X", - "column_types": "column_types" - }, - "returns": [ - "CardinalitiesOfCategoricalFeatures", - "MeanCardinalityOfCategoricalFeatures", - "StdevCardinalityOfCategoricalFeatures", - "SkewCardinalityOfCategoricalFeatures", - "KurtosisCardinalityOfCategoricalFeatures", - "MinCardinalityOfCategoricalFeatures", - "Quartile1CardinalityOfCategoricalFeatures", - "Quartile2CardinalityOfCategoricalFeatures", - "Quartile3CardinalityOfCategoricalFeatures", - "MaxCardinalityOfCategoricalFeatures" - ] - }, - "MinCardinalityOfCategoricalFeatures": { - "function": "get_categorical_cardinalities", - "arguments": { - "X": "X", - "column_types": "column_types" - }, - "returns": [ - "CardinalitiesOfCategoricalFeatures", - "MeanCardinalityOfCategoricalFeatures", - "StdevCardinalityOfCategoricalFeatures", - "SkewCardinalityOfCategoricalFeatures", - "KurtosisCardinalityOfCategoricalFeatures", - "MinCardinalityOfCategoricalFeatures", - "Quartile1CardinalityOfCategoricalFeatures", - "Quartile2CardinalityOfCategoricalFeatures", - "Quartile3CardinalityOfCategoricalFeatures", - "MaxCardinalityOfCategoricalFeatures" - ] - }, - "Quartile1CardinalityOfCategoricalFeatures": { - "function": "get_categorical_cardinalities", - "arguments": { - "X": "X", - "column_types": "column_types" - }, - "returns": [ - "CardinalitiesOfCategoricalFeatures", - "MeanCardinalityOfCategoricalFeatures", - "StdevCardinalityOfCategoricalFeatures", - "SkewCardinalityOfCategoricalFeatures", - "KurtosisCardinalityOfCategoricalFeatures", - "MinCardinalityOfCategoricalFeatures", - "Quartile1CardinalityOfCategoricalFeatures", - "Quartile2CardinalityOfCategoricalFeatures", - "Quartile3CardinalityOfCategoricalFeatures", - "MaxCardinalityOfCategoricalFeatures" - ] - }, - "Quartile2CardinalityOfCategoricalFeatures": { - "function": "get_categorical_cardinalities", - "arguments": { - "X": "X", - "column_types": "column_types" - }, - "returns": [ - "CardinalitiesOfCategoricalFeatures", - "MeanCardinalityOfCategoricalFeatures", - "StdevCardinalityOfCategoricalFeatures", - "SkewCardinalityOfCategoricalFeatures", - "KurtosisCardinalityOfCategoricalFeatures", - "MinCardinalityOfCategoricalFeatures", - "Quartile1CardinalityOfCategoricalFeatures", - "Quartile2CardinalityOfCategoricalFeatures", - "Quartile3CardinalityOfCategoricalFeatures", - "MaxCardinalityOfCategoricalFeatures" - ] - }, - "Quartile3CardinalityOfCategoricalFeatures": { - "function": "get_categorical_cardinalities", - "arguments": { - "X": "X", - "column_types": "column_types" - }, - "returns": [ - "CardinalitiesOfCategoricalFeatures", - "MeanCardinalityOfCategoricalFeatures", - "StdevCardinalityOfCategoricalFeatures", - "SkewCardinalityOfCategoricalFeatures", - "KurtosisCardinalityOfCategoricalFeatures", - "MinCardinalityOfCategoricalFeatures", - "Quartile1CardinalityOfCategoricalFeatures", - "Quartile2CardinalityOfCategoricalFeatures", - "Quartile3CardinalityOfCategoricalFeatures", - "MaxCardinalityOfCategoricalFeatures" - ] - }, - "MaxCardinalityOfCategoricalFeatures": { - "function": "get_categorical_cardinalities", - "arguments": { - "X": "X", - "column_types": "column_types" - }, - "returns": [ - "CardinalitiesOfCategoricalFeatures", - "MeanCardinalityOfCategoricalFeatures", - "StdevCardinalityOfCategoricalFeatures", - "SkewCardinalityOfCategoricalFeatures", - "KurtosisCardinalityOfCategoricalFeatures", - "MinCardinalityOfCategoricalFeatures", - "Quartile1CardinalityOfCategoricalFeatures", - "Quartile2CardinalityOfCategoricalFeatures", - "Quartile3CardinalityOfCategoricalFeatures", - "MaxCardinalityOfCategoricalFeatures" - ] - }, - "MeanCardinalityOfNumericFeatures": { - "function": "get_numeric_cardinalities", - "arguments": { - "X": "X", - "column_types": "column_types" - }, - "returns": [ - "CardinalitiesOfNumericFeatures", - "MeanCardinalityOfNumericFeatures", - "StdevCardinalityOfNumericFeatures", - "SkewCardinalityOfNumericFeatures", - "KurtosisCardinalityOfNumericFeatures", - "MinCardinalityOfNumericFeatures", - "Quartile1CardinalityOfNumericFeatures", - "Quartile2CardinalityOfNumericFeatures", - "Quartile3CardinalityOfNumericFeatures", - "MaxCardinalityOfNumericFeatures" - ] - }, - "StdevCardinalityOfNumericFeatures": { - "function": "get_numeric_cardinalities", - "arguments": { - "X": "X", - "column_types": "column_types" - }, - "returns": [ - "CardinalitiesOfNumericFeatures", - "MeanCardinalityOfNumericFeatures", - "StdevCardinalityOfNumericFeatures", - "SkewCardinalityOfNumericFeatures", - "KurtosisCardinalityOfNumericFeatures", - "MinCardinalityOfNumericFeatures", - "Quartile1CardinalityOfNumericFeatures", - "Quartile2CardinalityOfNumericFeatures", - "Quartile3CardinalityOfNumericFeatures", - "MaxCardinalityOfNumericFeatures" - ] - }, - "SkewCardinalityOfNumericFeatures": { - "function": "get_numeric_cardinalities", - "arguments": { - "X": "X", - "column_types": "column_types" - }, - "returns": [ - "CardinalitiesOfNumericFeatures", - "MeanCardinalityOfNumericFeatures", - "StdevCardinalityOfNumericFeatures", - "SkewCardinalityOfNumericFeatures", - "KurtosisCardinalityOfNumericFeatures", - "MinCardinalityOfNumericFeatures", - "Quartile1CardinalityOfNumericFeatures", - "Quartile2CardinalityOfNumericFeatures", - "Quartile3CardinalityOfNumericFeatures", - "MaxCardinalityOfNumericFeatures" - ] - }, - "KurtosisCardinalityOfNumericFeatures": { - "function": "get_numeric_cardinalities", - "arguments": { - "X": "X", - "column_types": "column_types" - }, - "returns": [ - "CardinalitiesOfNumericFeatures", - "MeanCardinalityOfNumericFeatures", - "StdevCardinalityOfNumericFeatures", - "SkewCardinalityOfNumericFeatures", - "KurtosisCardinalityOfNumericFeatures", - "MinCardinalityOfNumericFeatures", - "Quartile1CardinalityOfNumericFeatures", - "Quartile2CardinalityOfNumericFeatures", - "Quartile3CardinalityOfNumericFeatures", - "MaxCardinalityOfNumericFeatures" - ] - }, - "MinCardinalityOfNumericFeatures": { - "function": "get_numeric_cardinalities", - "arguments": { - "X": "X", - "column_types": "column_types" - }, - "returns": [ - "CardinalitiesOfNumericFeatures", - "MeanCardinalityOfNumericFeatures", - "StdevCardinalityOfNumericFeatures", - "SkewCardinalityOfNumericFeatures", - "KurtosisCardinalityOfNumericFeatures", - "MinCardinalityOfNumericFeatures", - "Quartile1CardinalityOfNumericFeatures", - "Quartile2CardinalityOfNumericFeatures", - "Quartile3CardinalityOfNumericFeatures", - "MaxCardinalityOfNumericFeatures" - ] - }, - "Quartile1CardinalityOfNumericFeatures": { - "function": "get_numeric_cardinalities", - "arguments": { - "X": "X", - "column_types": "column_types" - }, - "returns": [ - "CardinalitiesOfNumericFeatures", - "MeanCardinalityOfNumericFeatures", - "StdevCardinalityOfNumericFeatures", - "SkewCardinalityOfNumericFeatures", - "KurtosisCardinalityOfNumericFeatures", - "MinCardinalityOfNumericFeatures", - "Quartile1CardinalityOfNumericFeatures", - "Quartile2CardinalityOfNumericFeatures", - "Quartile3CardinalityOfNumericFeatures", - "MaxCardinalityOfNumericFeatures" - ] - }, - "Quartile2CardinalityOfNumericFeatures": { - "function": "get_numeric_cardinalities", - "arguments": { - "X": "X", - "column_types": "column_types" - }, - "returns": [ - "CardinalitiesOfNumericFeatures", - "MeanCardinalityOfNumericFeatures", - "StdevCardinalityOfNumericFeatures", - "SkewCardinalityOfNumericFeatures", - "KurtosisCardinalityOfNumericFeatures", - "MinCardinalityOfNumericFeatures", - "Quartile1CardinalityOfNumericFeatures", - "Quartile2CardinalityOfNumericFeatures", - "Quartile3CardinalityOfNumericFeatures", - "MaxCardinalityOfNumericFeatures" - ] - }, - "Quartile3CardinalityOfNumericFeatures": { - "function": "get_numeric_cardinalities", - "arguments": { - "X": "X", - "column_types": "column_types" - }, - "returns": [ - "CardinalitiesOfNumericFeatures", - "MeanCardinalityOfNumericFeatures", - "StdevCardinalityOfNumericFeatures", - "SkewCardinalityOfNumericFeatures", - "KurtosisCardinalityOfNumericFeatures", - "MinCardinalityOfNumericFeatures", - "Quartile1CardinalityOfNumericFeatures", - "Quartile2CardinalityOfNumericFeatures", - "Quartile3CardinalityOfNumericFeatures", - "MaxCardinalityOfNumericFeatures" - ] - }, - "MaxCardinalityOfNumericFeatures": { - "function": "get_numeric_cardinalities", - "arguments": { - "X": "X", - "column_types": "column_types" - }, - "returns": [ - "CardinalitiesOfNumericFeatures", - "MeanCardinalityOfNumericFeatures", - "StdevCardinalityOfNumericFeatures", - "SkewCardinalityOfNumericFeatures", - "KurtosisCardinalityOfNumericFeatures", - "MinCardinalityOfNumericFeatures", - "Quartile1CardinalityOfNumericFeatures", - "Quartile2CardinalityOfNumericFeatures", - "Quartile3CardinalityOfNumericFeatures", - "MaxCardinalityOfNumericFeatures" - ] - }, - "MeanMeansOfNumericFeatures": { - "function": "get_numeric_means", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "MeansOfNumericFeatures", - "MeanMeansOfNumericFeatures", - "StdevMeansOfNumericFeatures", - "SkewMeansOfNumericFeatures", - "KurtosisMeansOfNumericFeatures", - "MinMeansOfNumericFeatures", - "Quartile1MeansOfNumericFeatures", - "Quartile2MeansOfNumericFeatures", - "Quartile3MeansOfNumericFeatures", - "MaxMeansOfNumericFeatures" - ] - }, - "StdevMeansOfNumericFeatures": { - "function": "get_numeric_means", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "MeansOfNumericFeatures", - "MeanMeansOfNumericFeatures", - "StdevMeansOfNumericFeatures", - "SkewMeansOfNumericFeatures", - "KurtosisMeansOfNumericFeatures", - "MinMeansOfNumericFeatures", - "Quartile1MeansOfNumericFeatures", - "Quartile2MeansOfNumericFeatures", - "Quartile3MeansOfNumericFeatures", - "MaxMeansOfNumericFeatures" - ] - }, - "SkewMeansOfNumericFeatures": { - "function": "get_numeric_means", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "MeansOfNumericFeatures", - "MeanMeansOfNumericFeatures", - "StdevMeansOfNumericFeatures", - "SkewMeansOfNumericFeatures", - "KurtosisMeansOfNumericFeatures", - "MinMeansOfNumericFeatures", - "Quartile1MeansOfNumericFeatures", - "Quartile2MeansOfNumericFeatures", - "Quartile3MeansOfNumericFeatures", - "MaxMeansOfNumericFeatures" - ] - }, - "KurtosisMeansOfNumericFeatures": { - "function": "get_numeric_means", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "MeansOfNumericFeatures", - "MeanMeansOfNumericFeatures", - "StdevMeansOfNumericFeatures", - "SkewMeansOfNumericFeatures", - "KurtosisMeansOfNumericFeatures", - "MinMeansOfNumericFeatures", - "Quartile1MeansOfNumericFeatures", - "Quartile2MeansOfNumericFeatures", - "Quartile3MeansOfNumericFeatures", - "MaxMeansOfNumericFeatures" - ] - }, - "MinMeansOfNumericFeatures": { - "function": "get_numeric_means", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "MeansOfNumericFeatures", - "MeanMeansOfNumericFeatures", - "StdevMeansOfNumericFeatures", - "SkewMeansOfNumericFeatures", - "KurtosisMeansOfNumericFeatures", - "MinMeansOfNumericFeatures", - "Quartile1MeansOfNumericFeatures", - "Quartile2MeansOfNumericFeatures", - "Quartile3MeansOfNumericFeatures", - "MaxMeansOfNumericFeatures" - ] - }, - "MaxMeansOfNumericFeatures": { - "function": "get_numeric_means", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "MeansOfNumericFeatures", - "MeanMeansOfNumericFeatures", - "StdevMeansOfNumericFeatures", - "SkewMeansOfNumericFeatures", - "KurtosisMeansOfNumericFeatures", - "MinMeansOfNumericFeatures", - "Quartile1MeansOfNumericFeatures", - "Quartile2MeansOfNumericFeatures", - "Quartile3MeansOfNumericFeatures", - "MaxMeansOfNumericFeatures" - ] - }, - "Quartile1MeansOfNumericFeatures": { - "function": "get_numeric_means", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "MeansOfNumericFeatures", - "MeanMeansOfNumericFeatures", - "StdevMeansOfNumericFeatures", - "SkewMeansOfNumericFeatures", - "KurtosisMeansOfNumericFeatures", - "MinMeansOfNumericFeatures", - "Quartile1MeansOfNumericFeatures", - "Quartile2MeansOfNumericFeatures", - "Quartile3MeansOfNumericFeatures", - "MaxMeansOfNumericFeatures" - ] - }, - "Quartile2MeansOfNumericFeatures": { - "function": "get_numeric_means", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "MeansOfNumericFeatures", - "MeanMeansOfNumericFeatures", - "StdevMeansOfNumericFeatures", - "SkewMeansOfNumericFeatures", - "KurtosisMeansOfNumericFeatures", - "MinMeansOfNumericFeatures", - "Quartile1MeansOfNumericFeatures", - "Quartile2MeansOfNumericFeatures", - "Quartile3MeansOfNumericFeatures", - "MaxMeansOfNumericFeatures" - ] - }, - "Quartile3MeansOfNumericFeatures": { - "function": "get_numeric_means", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "MeansOfNumericFeatures", - "MeanMeansOfNumericFeatures", - "StdevMeansOfNumericFeatures", - "SkewMeansOfNumericFeatures", - "KurtosisMeansOfNumericFeatures", - "MinMeansOfNumericFeatures", - "Quartile1MeansOfNumericFeatures", - "Quartile2MeansOfNumericFeatures", - "Quartile3MeansOfNumericFeatures", - "MaxMeansOfNumericFeatures" - ] - }, - "MeanStdDevOfNumericFeatures": { - "function": "get_numeric_stdev", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "StdDevsOfNumericFeatures", - "MeanStdDevOfNumericFeatures", - "StdevStdDevOfNumericFeatures", - "SkewStdDevOfNumericFeatures", - "KurtosisStdDevOfNumericFeatures", - "MinStdDevOfNumericFeatures", - "Quartile1StdDevOfNumericFeatures", - "Quartile2StdDevOfNumericFeatures", - "Quartile3StdDevOfNumericFeatures", - "MaxStdDevOfNumericFeatures" - ] - }, - "StdevStdDevOfNumericFeatures": { - "function": "get_numeric_stdev", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "StdDevsOfNumericFeatures", - "MeanStdDevOfNumericFeatures", - "StdevStdDevOfNumericFeatures", - "SkewStdDevOfNumericFeatures", - "KurtosisStdDevOfNumericFeatures", - "MinStdDevOfNumericFeatures", - "Quartile1StdDevOfNumericFeatures", - "Quartile2StdDevOfNumericFeatures", - "Quartile3StdDevOfNumericFeatures", - "MaxStdDevOfNumericFeatures" - ] - }, - "SkewStdDevOfNumericFeatures": { - "function": "get_numeric_stdev", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "StdDevsOfNumericFeatures", - "MeanStdDevOfNumericFeatures", - "StdevStdDevOfNumericFeatures", - "SkewStdDevOfNumericFeatures", - "KurtosisStdDevOfNumericFeatures", - "MinStdDevOfNumericFeatures", - "Quartile1StdDevOfNumericFeatures", - "Quartile2StdDevOfNumericFeatures", - "Quartile3StdDevOfNumericFeatures", - "MaxStdDevOfNumericFeatures" - ] - }, - "KurtosisStdDevOfNumericFeatures": { - "function": "get_numeric_stdev", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "StdDevsOfNumericFeatures", - "MeanStdDevOfNumericFeatures", - "StdevStdDevOfNumericFeatures", - "SkewStdDevOfNumericFeatures", - "KurtosisStdDevOfNumericFeatures", - "MinStdDevOfNumericFeatures", - "Quartile1StdDevOfNumericFeatures", - "Quartile2StdDevOfNumericFeatures", - "Quartile3StdDevOfNumericFeatures", - "MaxStdDevOfNumericFeatures" - ] - }, - "MinStdDevOfNumericFeatures": { - "function": "get_numeric_stdev", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "StdDevsOfNumericFeatures", - "MeanStdDevOfNumericFeatures", - "StdevStdDevOfNumericFeatures", - "SkewStdDevOfNumericFeatures", - "KurtosisStdDevOfNumericFeatures", - "MinStdDevOfNumericFeatures", - "Quartile1StdDevOfNumericFeatures", - "Quartile2StdDevOfNumericFeatures", - "Quartile3StdDevOfNumericFeatures", - "MaxStdDevOfNumericFeatures" - ] - }, - "MaxStdDevOfNumericFeatures": { - "function": "get_numeric_stdev", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "StdDevsOfNumericFeatures", - "MeanStdDevOfNumericFeatures", - "StdevStdDevOfNumericFeatures", - "SkewStdDevOfNumericFeatures", - "KurtosisStdDevOfNumericFeatures", - "MinStdDevOfNumericFeatures", - "Quartile1StdDevOfNumericFeatures", - "Quartile2StdDevOfNumericFeatures", - "Quartile3StdDevOfNumericFeatures", - "MaxStdDevOfNumericFeatures" - ] - }, - "Quartile1StdDevOfNumericFeatures": { - "function": "get_numeric_stdev", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "StdDevsOfNumericFeatures", - "MeanStdDevOfNumericFeatures", - "StdevStdDevOfNumericFeatures", - "SkewStdDevOfNumericFeatures", - "KurtosisStdDevOfNumericFeatures", - "MinStdDevOfNumericFeatures", - "Quartile1StdDevOfNumericFeatures", - "Quartile2StdDevOfNumericFeatures", - "Quartile3StdDevOfNumericFeatures", - "MaxStdDevOfNumericFeatures" - ] - }, - "Quartile2StdDevOfNumericFeatures": { - "function": "get_numeric_stdev", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "StdDevsOfNumericFeatures", - "MeanStdDevOfNumericFeatures", - "StdevStdDevOfNumericFeatures", - "SkewStdDevOfNumericFeatures", - "KurtosisStdDevOfNumericFeatures", - "MinStdDevOfNumericFeatures", - "Quartile1StdDevOfNumericFeatures", - "Quartile2StdDevOfNumericFeatures", - "Quartile3StdDevOfNumericFeatures", - "MaxStdDevOfNumericFeatures" - ] - }, - "Quartile3StdDevOfNumericFeatures": { - "function": "get_numeric_stdev", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "StdDevsOfNumericFeatures", - "MeanStdDevOfNumericFeatures", - "StdevStdDevOfNumericFeatures", - "SkewStdDevOfNumericFeatures", - "KurtosisStdDevOfNumericFeatures", - "MinStdDevOfNumericFeatures", - "Quartile1StdDevOfNumericFeatures", - "Quartile2StdDevOfNumericFeatures", - "Quartile3StdDevOfNumericFeatures", - "MaxStdDevOfNumericFeatures" - ] - }, - "MeanSkewnessOfNumericFeatures": { - "function": "get_numeric_skewness", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "SkewnessesOfNumericFeatures", - "MeanSkewnessOfNumericFeatures", - "StdevSkewnessOfNumericFeatures", - "SkewSkewnessOfNumericFeatures", - "KurtosisSkewnessOfNumericFeatures", - "MinSkewnessOfNumericFeatures", - "Quartile1SkewnessOfNumericFeatures", - "Quartile2SkewnessOfNumericFeatures", - "Quartile3SkewnessOfNumericFeatures", - "MaxSkewnessOfNumericFeatures" - ] - }, - "StdevSkewnessOfNumericFeatures": { - "function": "get_numeric_skewness", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "SkewnessesOfNumericFeatures", - "MeanSkewnessOfNumericFeatures", - "StdevSkewnessOfNumericFeatures", - "SkewSkewnessOfNumericFeatures", - "KurtosisSkewnessOfNumericFeatures", - "MinSkewnessOfNumericFeatures", - "Quartile1SkewnessOfNumericFeatures", - "Quartile2SkewnessOfNumericFeatures", - "Quartile3SkewnessOfNumericFeatures", - "MaxSkewnessOfNumericFeatures" - ] - }, - "SkewSkewnessOfNumericFeatures": { - "function": "get_numeric_skewness", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "SkewnessesOfNumericFeatures", - "MeanSkewnessOfNumericFeatures", - "StdevSkewnessOfNumericFeatures", - "SkewSkewnessOfNumericFeatures", - "KurtosisSkewnessOfNumericFeatures", - "MinSkewnessOfNumericFeatures", - "Quartile1SkewnessOfNumericFeatures", - "Quartile2SkewnessOfNumericFeatures", - "Quartile3SkewnessOfNumericFeatures", - "MaxSkewnessOfNumericFeatures" - ] - }, - "KurtosisSkewnessOfNumericFeatures": { - "function": "get_numeric_skewness", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "SkewnessesOfNumericFeatures", - "MeanSkewnessOfNumericFeatures", - "StdevSkewnessOfNumericFeatures", - "SkewSkewnessOfNumericFeatures", - "KurtosisSkewnessOfNumericFeatures", - "MinSkewnessOfNumericFeatures", - "Quartile1SkewnessOfNumericFeatures", - "Quartile2SkewnessOfNumericFeatures", - "Quartile3SkewnessOfNumericFeatures", - "MaxSkewnessOfNumericFeatures" - ] - }, - "MinSkewnessOfNumericFeatures": { - "function": "get_numeric_skewness", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "SkewnessesOfNumericFeatures", - "MeanSkewnessOfNumericFeatures", - "StdevSkewnessOfNumericFeatures", - "SkewSkewnessOfNumericFeatures", - "KurtosisSkewnessOfNumericFeatures", - "MinSkewnessOfNumericFeatures", - "Quartile1SkewnessOfNumericFeatures", - "Quartile2SkewnessOfNumericFeatures", - "Quartile3SkewnessOfNumericFeatures", - "MaxSkewnessOfNumericFeatures" - ] - }, - "MaxSkewnessOfNumericFeatures": { - "function": "get_numeric_skewness", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "SkewnessesOfNumericFeatures", - "MeanSkewnessOfNumericFeatures", - "StdevSkewnessOfNumericFeatures", - "SkewSkewnessOfNumericFeatures", - "KurtosisSkewnessOfNumericFeatures", - "MinSkewnessOfNumericFeatures", - "Quartile1SkewnessOfNumericFeatures", - "Quartile2SkewnessOfNumericFeatures", - "Quartile3SkewnessOfNumericFeatures", - "MaxSkewnessOfNumericFeatures" - ] - }, - "Quartile1SkewnessOfNumericFeatures": { - "function": "get_numeric_skewness", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "SkewnessesOfNumericFeatures", - "MeanSkewnessOfNumericFeatures", - "StdevSkewnessOfNumericFeatures", - "SkewSkewnessOfNumericFeatures", - "KurtosisSkewnessOfNumericFeatures", - "MinSkewnessOfNumericFeatures", - "Quartile1SkewnessOfNumericFeatures", - "Quartile2SkewnessOfNumericFeatures", - "Quartile3SkewnessOfNumericFeatures", - "MaxSkewnessOfNumericFeatures" - ] - }, - "Quartile2SkewnessOfNumericFeatures": { - "function": "get_numeric_skewness", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "SkewnessesOfNumericFeatures", - "MeanSkewnessOfNumericFeatures", - "StdevSkewnessOfNumericFeatures", - "SkewSkewnessOfNumericFeatures", - "KurtosisSkewnessOfNumericFeatures", - "MinSkewnessOfNumericFeatures", - "Quartile1SkewnessOfNumericFeatures", - "Quartile2SkewnessOfNumericFeatures", - "Quartile3SkewnessOfNumericFeatures", - "MaxSkewnessOfNumericFeatures" - ] - }, - "Quartile3SkewnessOfNumericFeatures": { - "function": "get_numeric_skewness", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "SkewnessesOfNumericFeatures", - "MeanSkewnessOfNumericFeatures", - "StdevSkewnessOfNumericFeatures", - "SkewSkewnessOfNumericFeatures", - "KurtosisSkewnessOfNumericFeatures", - "MinSkewnessOfNumericFeatures", - "Quartile1SkewnessOfNumericFeatures", - "Quartile2SkewnessOfNumericFeatures", - "Quartile3SkewnessOfNumericFeatures", - "MaxSkewnessOfNumericFeatures" - ] - }, - "MeanKurtosisOfNumericFeatures": { - "function": "get_numeric_kurtosis", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "KurtosesOfNumericFeatures", - "MeanKurtosisOfNumericFeatures", - "StdevKurtosisOfNumericFeatures", - "SkewKurtosisOfNumericFeatures", - "KurtosisKurtosisOfNumericFeatures", - "MinKurtosisOfNumericFeatures", - "Quartile1KurtosisOfNumericFeatures", - "Quartile2KurtosisOfNumericFeatures", - "Quartile3KurtosisOfNumericFeatures", - "MaxKurtosisOfNumericFeatures" - ] - }, - "StdevKurtosisOfNumericFeatures": { - "function": "get_numeric_kurtosis", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "KurtosesOfNumericFeatures", - "MeanKurtosisOfNumericFeatures", - "StdevKurtosisOfNumericFeatures", - "SkewKurtosisOfNumericFeatures", - "KurtosisKurtosisOfNumericFeatures", - "MinKurtosisOfNumericFeatures", - "Quartile1KurtosisOfNumericFeatures", - "Quartile2KurtosisOfNumericFeatures", - "Quartile3KurtosisOfNumericFeatures", - "MaxKurtosisOfNumericFeatures" - ] - }, - "SkewKurtosisOfNumericFeatures": { - "function": "get_numeric_kurtosis", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "KurtosesOfNumericFeatures", - "MeanKurtosisOfNumericFeatures", - "StdevKurtosisOfNumericFeatures", - "SkewKurtosisOfNumericFeatures", - "KurtosisKurtosisOfNumericFeatures", - "MinKurtosisOfNumericFeatures", - "Quartile1KurtosisOfNumericFeatures", - "Quartile2KurtosisOfNumericFeatures", - "Quartile3KurtosisOfNumericFeatures", - "MaxKurtosisOfNumericFeatures" - ] - }, - "KurtosisKurtosisOfNumericFeatures": { - "function": "get_numeric_kurtosis", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "KurtosesOfNumericFeatures", - "MeanKurtosisOfNumericFeatures", - "StdevKurtosisOfNumericFeatures", - "SkewKurtosisOfNumericFeatures", - "KurtosisKurtosisOfNumericFeatures", - "MinKurtosisOfNumericFeatures", - "Quartile1KurtosisOfNumericFeatures", - "Quartile2KurtosisOfNumericFeatures", - "Quartile3KurtosisOfNumericFeatures", - "MaxKurtosisOfNumericFeatures" - ] - }, - "MinKurtosisOfNumericFeatures": { - "function": "get_numeric_kurtosis", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "KurtosesOfNumericFeatures", - "MeanKurtosisOfNumericFeatures", - "StdevKurtosisOfNumericFeatures", - "SkewKurtosisOfNumericFeatures", - "KurtosisKurtosisOfNumericFeatures", - "MinKurtosisOfNumericFeatures", - "Quartile1KurtosisOfNumericFeatures", - "Quartile2KurtosisOfNumericFeatures", - "Quartile3KurtosisOfNumericFeatures", - "MaxKurtosisOfNumericFeatures" - ] - }, - "MaxKurtosisOfNumericFeatures": { - "function": "get_numeric_kurtosis", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "KurtosesOfNumericFeatures", - "MeanKurtosisOfNumericFeatures", - "StdevKurtosisOfNumericFeatures", - "SkewKurtosisOfNumericFeatures", - "KurtosisKurtosisOfNumericFeatures", - "MinKurtosisOfNumericFeatures", - "Quartile1KurtosisOfNumericFeatures", - "Quartile2KurtosisOfNumericFeatures", - "Quartile3KurtosisOfNumericFeatures", - "MaxKurtosisOfNumericFeatures" - ] - }, - "Quartile1KurtosisOfNumericFeatures": { - "function": "get_numeric_kurtosis", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "KurtosesOfNumericFeatures", - "MeanKurtosisOfNumericFeatures", - "StdevKurtosisOfNumericFeatures", - "SkewKurtosisOfNumericFeatures", - "KurtosisKurtosisOfNumericFeatures", - "MinKurtosisOfNumericFeatures", - "Quartile1KurtosisOfNumericFeatures", - "Quartile2KurtosisOfNumericFeatures", - "Quartile3KurtosisOfNumericFeatures", - "MaxKurtosisOfNumericFeatures" - ] - }, - "Quartile2KurtosisOfNumericFeatures": { - "function": "get_numeric_kurtosis", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "KurtosesOfNumericFeatures", - "MeanKurtosisOfNumericFeatures", - "StdevKurtosisOfNumericFeatures", - "SkewKurtosisOfNumericFeatures", - "KurtosisKurtosisOfNumericFeatures", - "MinKurtosisOfNumericFeatures", - "Quartile1KurtosisOfNumericFeatures", - "Quartile2KurtosisOfNumericFeatures", - "Quartile3KurtosisOfNumericFeatures", - "MaxKurtosisOfNumericFeatures" - ] - }, - "Quartile3KurtosisOfNumericFeatures": { - "function": "get_numeric_kurtosis", - "arguments": { - "numeric_features_array": "NoNaNNumericFeatures" - }, - "returns": [ - "KurtosesOfNumericFeatures", - "MeanKurtosisOfNumericFeatures", - "StdevKurtosisOfNumericFeatures", - "SkewKurtosisOfNumericFeatures", - "KurtosisKurtosisOfNumericFeatures", - "MinKurtosisOfNumericFeatures", - "Quartile1KurtosisOfNumericFeatures", - "Quartile2KurtosisOfNumericFeatures", - "Quartile3KurtosisOfNumericFeatures", - "MaxKurtosisOfNumericFeatures" - ] - }, - "PredPCA1": { - "function": "get_pca", - "arguments": { - "X_preprocessed": "XPreprocessed" - }, - "returns": [ - "PredPCA1", - "PredPCA2", - "PredPCA3", - "PredEigen1", - "PredEigen2", - "PredEigen3", - "PredDet" - ] - }, - "PredPCA2": { - "function": "get_pca", - "arguments": { - "X_preprocessed": "XPreprocessed" - }, - "returns": [ - "PredPCA1", - "PredPCA2", - "PredPCA3", - "PredEigen1", - "PredEigen2", - "PredEigen3", - "PredDet" - ] - }, - "PredPCA3": { - "function": "get_pca", - "arguments": { - "X_preprocessed": "XPreprocessed" - }, - "returns": [ - "PredPCA1", - "PredPCA2", - "PredPCA3", - "PredEigen1", - "PredEigen2", - "PredEigen3", - "PredDet" - ] - }, - "PredEigen1": { - "function": "get_pca", - "arguments": { - "X_preprocessed": "XPreprocessed" - }, - "returns": [ - "PredPCA1", - "PredPCA2", - "PredPCA3", - "PredEigen1", - "PredEigen2", - "PredEigen3", - "PredDet" - ] - }, - "PredEigen2": { - "function": "get_pca", - "arguments": { - "X_preprocessed": "XPreprocessed" - }, - "returns": [ - "PredPCA1", - "PredPCA2", - "PredPCA3", - "PredEigen1", - "PredEigen2", - "PredEigen3", - "PredDet" - ] - }, - "PredEigen3": { - "function": "get_pca", - "arguments": { - "X_preprocessed": "XPreprocessed" - }, - "returns": [ - "PredPCA1", - "PredPCA2", - "PredPCA3", - "PredEigen1", - "PredEigen2", - "PredEigen3", - "PredDet" - ] - }, - "PredDet": { - "function": "get_pca", - "arguments": { - "X_preprocessed": "XPreprocessed" - }, - "returns": [ - "PredPCA1", - "PredPCA2", - "PredPCA3", - "PredEigen1", - "PredEigen2", - "PredEigen3", - "PredDet" - ] - }, - "ClassEntropy": { - "function": "get_class_entropy", - "arguments": { - "Y_sample": "YSample" - }, - "returns": [ - "ClassEntropy" - ] - }, - "MeanCategoricalAttributeEntropy": { - "function": "get_attribute_entropy", - "arguments": { - "feature_array": "NoNaNCategoricalFeatures" - }, - "returns": [ - "CategoricalAttributeEntropies", - "MeanCategoricalAttributeEntropy", - "StdevCategoricalAttributeEntropy", - "SkewCategoricalAttributeEntropy", - "KurtosisCategoricalAttributeEntropy", - "MinCategoricalAttributeEntropy", - "Quartile1CategoricalAttributeEntropy", - "Quartile2CategoricalAttributeEntropy", - "Quartile3CategoricalAttributeEntropy", - "MaxCategoricalAttributeEntropy" - ] - }, - "StdevCategoricalAttributeEntropy": { - "function": "get_attribute_entropy", - "arguments": { - "feature_array": "NoNaNCategoricalFeatures" - }, - "returns": [ - "CategoricalAttributeEntropies", - "MeanCategoricalAttributeEntropy", - "StdevCategoricalAttributeEntropy", - "SkewCategoricalAttributeEntropy", - "KurtosisCategoricalAttributeEntropy", - "MinCategoricalAttributeEntropy", - "Quartile1CategoricalAttributeEntropy", - "Quartile2CategoricalAttributeEntropy", - "Quartile3CategoricalAttributeEntropy", - "MaxCategoricalAttributeEntropy" - ] - }, - "SkewCategoricalAttributeEntropy": { - "function": "get_attribute_entropy", - "arguments": { - "feature_array": "NoNaNCategoricalFeatures" - }, - "returns": [ - "CategoricalAttributeEntropies", - "MeanCategoricalAttributeEntropy", - "StdevCategoricalAttributeEntropy", - "SkewCategoricalAttributeEntropy", - "KurtosisCategoricalAttributeEntropy", - "MinCategoricalAttributeEntropy", - "Quartile1CategoricalAttributeEntropy", - "Quartile2CategoricalAttributeEntropy", - "Quartile3CategoricalAttributeEntropy", - "MaxCategoricalAttributeEntropy" - ] - }, - "KurtosisCategoricalAttributeEntropy": { - "function": "get_attribute_entropy", - "arguments": { - "feature_array": "NoNaNCategoricalFeatures" - }, - "returns": [ - "CategoricalAttributeEntropies", - "MeanCategoricalAttributeEntropy", - "StdevCategoricalAttributeEntropy", - "SkewCategoricalAttributeEntropy", - "KurtosisCategoricalAttributeEntropy", - "MinCategoricalAttributeEntropy", - "Quartile1CategoricalAttributeEntropy", - "Quartile2CategoricalAttributeEntropy", - "Quartile3CategoricalAttributeEntropy", - "MaxCategoricalAttributeEntropy" - ] - }, - "MinCategoricalAttributeEntropy": { - "function": "get_attribute_entropy", - "arguments": { - "feature_array": "NoNaNCategoricalFeatures" - }, - "returns": [ - "CategoricalAttributeEntropies", - "MeanCategoricalAttributeEntropy", - "StdevCategoricalAttributeEntropy", - "SkewCategoricalAttributeEntropy", - "KurtosisCategoricalAttributeEntropy", - "MinCategoricalAttributeEntropy", - "Quartile1CategoricalAttributeEntropy", - "Quartile2CategoricalAttributeEntropy", - "Quartile3CategoricalAttributeEntropy", - "MaxCategoricalAttributeEntropy" - ] - }, - "Quartile1CategoricalAttributeEntropy": { - "function": "get_attribute_entropy", - "arguments": { - "feature_array": "NoNaNCategoricalFeatures" - }, - "returns": [ - "CategoricalAttributeEntropies", - "MeanCategoricalAttributeEntropy", - "StdevCategoricalAttributeEntropy", - "SkewCategoricalAttributeEntropy", - "KurtosisCategoricalAttributeEntropy", - "MinCategoricalAttributeEntropy", - "Quartile1CategoricalAttributeEntropy", - "Quartile2CategoricalAttributeEntropy", - "Quartile3CategoricalAttributeEntropy", - "MaxCategoricalAttributeEntropy" - ] - }, - "Quartile2CategoricalAttributeEntropy": { - "function": "get_attribute_entropy", - "arguments": { - "feature_array": "NoNaNCategoricalFeatures" - }, - "returns": [ - "CategoricalAttributeEntropies", - "MeanCategoricalAttributeEntropy", - "StdevCategoricalAttributeEntropy", - "SkewCategoricalAttributeEntropy", - "KurtosisCategoricalAttributeEntropy", - "MinCategoricalAttributeEntropy", - "Quartile1CategoricalAttributeEntropy", - "Quartile2CategoricalAttributeEntropy", - "Quartile3CategoricalAttributeEntropy", - "MaxCategoricalAttributeEntropy" - ] - }, - "Quartile3CategoricalAttributeEntropy": { - "function": "get_attribute_entropy", - "arguments": { - "feature_array": "NoNaNCategoricalFeatures" - }, - "returns": [ - "CategoricalAttributeEntropies", - "MeanCategoricalAttributeEntropy", - "StdevCategoricalAttributeEntropy", - "SkewCategoricalAttributeEntropy", - "KurtosisCategoricalAttributeEntropy", - "MinCategoricalAttributeEntropy", - "Quartile1CategoricalAttributeEntropy", - "Quartile2CategoricalAttributeEntropy", - "Quartile3CategoricalAttributeEntropy", - "MaxCategoricalAttributeEntropy" - ] - }, - "MaxCategoricalAttributeEntropy": { - "function": "get_attribute_entropy", - "arguments": { - "feature_array": "NoNaNCategoricalFeatures" - }, - "returns": [ - "CategoricalAttributeEntropies", - "MeanCategoricalAttributeEntropy", - "StdevCategoricalAttributeEntropy", - "SkewCategoricalAttributeEntropy", - "KurtosisCategoricalAttributeEntropy", - "MinCategoricalAttributeEntropy", - "Quartile1CategoricalAttributeEntropy", - "Quartile2CategoricalAttributeEntropy", - "Quartile3CategoricalAttributeEntropy", - "MaxCategoricalAttributeEntropy" - ] - }, - "MeanNumericAttributeEntropy": { - "function": "get_attribute_entropy", - "arguments": { - "feature_array": "NoNaNBinnedNumericFeatures" - }, - "returns": [ - "NumericAttributeEntropies", - "MeanNumericAttributeEntropy", - "StdevNumericAttributeEntropy", - "SkewNumericAttributeEntropy", - "KurtosisNumericAttributeEntropy", - "MinNumericAttributeEntropy", - "Quartile1NumericAttributeEntropy", - "Quartile2NumericAttributeEntropy", - "Quartile3NumericAttributeEntropy", - "MaxNumericAttributeEntropy" - ] - }, - "StdevNumericAttributeEntropy": { - "function": "get_attribute_entropy", - "arguments": { - "feature_array": "NoNaNBinnedNumericFeatures" - }, - "returns": [ - "NumericAttributeEntropies", - "MeanNumericAttributeEntropy", - "StdevNumericAttributeEntropy", - "SkewNumericAttributeEntropy", - "KurtosisNumericAttributeEntropy", - "MinNumericAttributeEntropy", - "Quartile1NumericAttributeEntropy", - "Quartile2NumericAttributeEntropy", - "Quartile3NumericAttributeEntropy", - "MaxNumericAttributeEntropy" - ] - }, - "SkewNumericAttributeEntropy": { - "function": "get_attribute_entropy", - "arguments": { - "feature_array": "NoNaNBinnedNumericFeatures" - }, - "returns": [ - "NumericAttributeEntropies", - "MeanNumericAttributeEntropy", - "StdevNumericAttributeEntropy", - "SkewNumericAttributeEntropy", - "KurtosisNumericAttributeEntropy", - "MinNumericAttributeEntropy", - "Quartile1NumericAttributeEntropy", - "Quartile2NumericAttributeEntropy", - "Quartile3NumericAttributeEntropy", - "MaxNumericAttributeEntropy" - ] - }, - "KurtosisNumericAttributeEntropy": { - "function": "get_attribute_entropy", - "arguments": { - "feature_array": "NoNaNBinnedNumericFeatures" - }, - "returns": [ - "NumericAttributeEntropies", - "MeanNumericAttributeEntropy", - "StdevNumericAttributeEntropy", - "SkewNumericAttributeEntropy", - "KurtosisNumericAttributeEntropy", - "MinNumericAttributeEntropy", - "Quartile1NumericAttributeEntropy", - "Quartile2NumericAttributeEntropy", - "Quartile3NumericAttributeEntropy", - "MaxNumericAttributeEntropy" - ] - }, - "MinNumericAttributeEntropy": { - "function": "get_attribute_entropy", - "arguments": { - "feature_array": "NoNaNBinnedNumericFeatures" - }, - "returns": [ - "NumericAttributeEntropies", - "MeanNumericAttributeEntropy", - "StdevNumericAttributeEntropy", - "SkewNumericAttributeEntropy", - "KurtosisNumericAttributeEntropy", - "MinNumericAttributeEntropy", - "Quartile1NumericAttributeEntropy", - "Quartile2NumericAttributeEntropy", - "Quartile3NumericAttributeEntropy", - "MaxNumericAttributeEntropy" - ] - }, - "Quartile1NumericAttributeEntropy": { - "function": "get_attribute_entropy", - "arguments": { - "feature_array": "NoNaNBinnedNumericFeatures" - }, - "returns": [ - "NumericAttributeEntropies", - "MeanNumericAttributeEntropy", - "StdevNumericAttributeEntropy", - "SkewNumericAttributeEntropy", - "KurtosisNumericAttributeEntropy", - "MinNumericAttributeEntropy", - "Quartile1NumericAttributeEntropy", - "Quartile2NumericAttributeEntropy", - "Quartile3NumericAttributeEntropy", - "MaxNumericAttributeEntropy" - ] - }, - "Quartile2NumericAttributeEntropy": { - "function": "get_attribute_entropy", - "arguments": { - "feature_array": "NoNaNBinnedNumericFeatures" - }, - "returns": [ - "NumericAttributeEntropies", - "MeanNumericAttributeEntropy", - "StdevNumericAttributeEntropy", - "SkewNumericAttributeEntropy", - "KurtosisNumericAttributeEntropy", - "MinNumericAttributeEntropy", - "Quartile1NumericAttributeEntropy", - "Quartile2NumericAttributeEntropy", - "Quartile3NumericAttributeEntropy", - "MaxNumericAttributeEntropy" - ] - }, - "Quartile3NumericAttributeEntropy": { - "function": "get_attribute_entropy", - "arguments": { - "feature_array": "NoNaNBinnedNumericFeatures" - }, - "returns": [ - "NumericAttributeEntropies", - "MeanNumericAttributeEntropy", - "StdevNumericAttributeEntropy", - "SkewNumericAttributeEntropy", - "KurtosisNumericAttributeEntropy", - "MinNumericAttributeEntropy", - "Quartile1NumericAttributeEntropy", - "Quartile2NumericAttributeEntropy", - "Quartile3NumericAttributeEntropy", - "MaxNumericAttributeEntropy" - ] - }, - "MaxNumericAttributeEntropy": { - "function": "get_attribute_entropy", - "arguments": { - "feature_array": "NoNaNBinnedNumericFeatures" - }, - "returns": [ - "NumericAttributeEntropies", - "MeanNumericAttributeEntropy", - "StdevNumericAttributeEntropy", - "SkewNumericAttributeEntropy", - "KurtosisNumericAttributeEntropy", - "MinNumericAttributeEntropy", - "Quartile1NumericAttributeEntropy", - "Quartile2NumericAttributeEntropy", - "Quartile3NumericAttributeEntropy", - "MaxNumericAttributeEntropy" - ] - }, - "MeanCategoricalJointEntropy": { - "function": "get_joint_entropy", - "arguments": { - "feature_class_array": "NoNaNCategoricalFeaturesAndClass" - }, - "returns": [ - "CategoricalJointEntropies", - "MeanCategoricalJointEntropy", - "StdevCategoricalJointEntropy", - "SkewCategoricalJointEntropy", - "KurtosisCategoricalJointEntropy", - "MinCategoricalJointEntropy", - "Quartile1CategoricalJointEntropy", - "Quartile2CategoricalJointEntropy", - "Quartile3CategoricalJointEntropy", - "MaxCategoricalJointEntropy" - ] - }, - "StdevCategoricalJointEntropy": { - "function": "get_joint_entropy", - "arguments": { - "feature_class_array": "NoNaNCategoricalFeaturesAndClass" - }, - "returns": [ - "CategoricalJointEntropies", - "MeanCategoricalJointEntropy", - "StdevCategoricalJointEntropy", - "SkewCategoricalJointEntropy", - "KurtosisCategoricalJointEntropy", - "MinCategoricalJointEntropy", - "Quartile1CategoricalJointEntropy", - "Quartile2CategoricalJointEntropy", - "Quartile3CategoricalJointEntropy", - "MaxCategoricalJointEntropy" - ] - }, - "SkewCategoricalJointEntropy": { - "function": "get_joint_entropy", - "arguments": { - "feature_class_array": "NoNaNCategoricalFeaturesAndClass" - }, - "returns": [ - "CategoricalJointEntropies", - "MeanCategoricalJointEntropy", - "StdevCategoricalJointEntropy", - "SkewCategoricalJointEntropy", - "KurtosisCategoricalJointEntropy", - "MinCategoricalJointEntropy", - "Quartile1CategoricalJointEntropy", - "Quartile2CategoricalJointEntropy", - "Quartile3CategoricalJointEntropy", - "MaxCategoricalJointEntropy" - ] - }, - "KurtosisCategoricalJointEntropy": { - "function": "get_joint_entropy", - "arguments": { - "feature_class_array": "NoNaNCategoricalFeaturesAndClass" - }, - "returns": [ - "CategoricalJointEntropies", - "MeanCategoricalJointEntropy", - "StdevCategoricalJointEntropy", - "SkewCategoricalJointEntropy", - "KurtosisCategoricalJointEntropy", - "MinCategoricalJointEntropy", - "Quartile1CategoricalJointEntropy", - "Quartile2CategoricalJointEntropy", - "Quartile3CategoricalJointEntropy", - "MaxCategoricalJointEntropy" - ] - }, - "MinCategoricalJointEntropy": { - "function": "get_joint_entropy", - "arguments": { - "feature_class_array": "NoNaNCategoricalFeaturesAndClass" - }, - "returns": [ - "CategoricalJointEntropies", - "MeanCategoricalJointEntropy", - "StdevCategoricalJointEntropy", - "SkewCategoricalJointEntropy", - "KurtosisCategoricalJointEntropy", - "MinCategoricalJointEntropy", - "Quartile1CategoricalJointEntropy", - "Quartile2CategoricalJointEntropy", - "Quartile3CategoricalJointEntropy", - "MaxCategoricalJointEntropy" - ] - }, - "Quartile1CategoricalJointEntropy": { - "function": "get_joint_entropy", - "arguments": { - "feature_class_array": "NoNaNCategoricalFeaturesAndClass" - }, - "returns": [ - "CategoricalJointEntropies", - "MeanCategoricalJointEntropy", - "StdevCategoricalJointEntropy", - "SkewCategoricalJointEntropy", - "KurtosisCategoricalJointEntropy", - "MinCategoricalJointEntropy", - "Quartile1CategoricalJointEntropy", - "Quartile2CategoricalJointEntropy", - "Quartile3CategoricalJointEntropy", - "MaxCategoricalJointEntropy" - ] - }, - "Quartile2CategoricalJointEntropy": { - "function": "get_joint_entropy", - "arguments": { - "feature_class_array": "NoNaNCategoricalFeaturesAndClass" - }, - "returns": [ - "CategoricalJointEntropies", - "MeanCategoricalJointEntropy", - "StdevCategoricalJointEntropy", - "SkewCategoricalJointEntropy", - "KurtosisCategoricalJointEntropy", - "MinCategoricalJointEntropy", - "Quartile1CategoricalJointEntropy", - "Quartile2CategoricalJointEntropy", - "Quartile3CategoricalJointEntropy", - "MaxCategoricalJointEntropy" - ] - }, - "Quartile3CategoricalJointEntropy": { - "function": "get_joint_entropy", - "arguments": { - "feature_class_array": "NoNaNCategoricalFeaturesAndClass" - }, - "returns": [ - "CategoricalJointEntropies", - "MeanCategoricalJointEntropy", - "StdevCategoricalJointEntropy", - "SkewCategoricalJointEntropy", - "KurtosisCategoricalJointEntropy", - "MinCategoricalJointEntropy", - "Quartile1CategoricalJointEntropy", - "Quartile2CategoricalJointEntropy", - "Quartile3CategoricalJointEntropy", - "MaxCategoricalJointEntropy" - ] - }, - "MaxCategoricalJointEntropy": { - "function": "get_joint_entropy", - "arguments": { - "feature_class_array": "NoNaNCategoricalFeaturesAndClass" - }, - "returns": [ - "CategoricalJointEntropies", - "MeanCategoricalJointEntropy", - "StdevCategoricalJointEntropy", - "SkewCategoricalJointEntropy", - "KurtosisCategoricalJointEntropy", - "MinCategoricalJointEntropy", - "Quartile1CategoricalJointEntropy", - "Quartile2CategoricalJointEntropy", - "Quartile3CategoricalJointEntropy", - "MaxCategoricalJointEntropy" - ] - }, - "MeanNumericJointEntropy": { - "function": "get_joint_entropy", - "arguments": { - "feature_class_array": "NoNaNBinnedNumericFeaturesAndClass" - }, - "returns": [ - "NumericJointEntropies", - "MeanNumericJointEntropy", - "StdevNumericJointEntropy", - "SkewNumericJointEntropy", - "KurtosisNumericJointEntropy", - "MinNumericJointEntropy", - "Quartile1NumericJointEntropy", - "Quartile2NumericJointEntropy", - "Quartile3NumericJointEntropy", - "MaxNumericJointEntropy" - ] - }, - "StdevNumericJointEntropy": { - "function": "get_joint_entropy", - "arguments": { - "feature_class_array": "NoNaNBinnedNumericFeaturesAndClass" - }, - "returns": [ - "NumericJointEntropies", - "MeanNumericJointEntropy", - "StdevNumericJointEntropy", - "SkewNumericJointEntropy", - "KurtosisNumericJointEntropy", - "MinNumericJointEntropy", - "Quartile1NumericJointEntropy", - "Quartile2NumericJointEntropy", - "Quartile3NumericJointEntropy", - "MaxNumericJointEntropy" - ] - }, - "SkewNumericJointEntropy": { - "function": "get_joint_entropy", - "arguments": { - "feature_class_array": "NoNaNBinnedNumericFeaturesAndClass" - }, - "returns": [ - "NumericJointEntropies", - "MeanNumericJointEntropy", - "StdevNumericJointEntropy", - "SkewNumericJointEntropy", - "KurtosisNumericJointEntropy", - "MinNumericJointEntropy", - "Quartile1NumericJointEntropy", - "Quartile2NumericJointEntropy", - "Quartile3NumericJointEntropy", - "MaxNumericJointEntropy" - ] - }, - "KurtosisNumericJointEntropy": { - "function": "get_joint_entropy", - "arguments": { - "feature_class_array": "NoNaNBinnedNumericFeaturesAndClass" - }, - "returns": [ - "NumericJointEntropies", - "MeanNumericJointEntropy", - "StdevNumericJointEntropy", - "SkewNumericJointEntropy", - "KurtosisNumericJointEntropy", - "MinNumericJointEntropy", - "Quartile1NumericJointEntropy", - "Quartile2NumericJointEntropy", - "Quartile3NumericJointEntropy", - "MaxNumericJointEntropy" - ] - }, - "MinNumericJointEntropy": { - "function": "get_joint_entropy", - "arguments": { - "feature_class_array": "NoNaNBinnedNumericFeaturesAndClass" - }, - "returns": [ - "NumericJointEntropies", - "MeanNumericJointEntropy", - "StdevNumericJointEntropy", - "SkewNumericJointEntropy", - "KurtosisNumericJointEntropy", - "MinNumericJointEntropy", - "Quartile1NumericJointEntropy", - "Quartile2NumericJointEntropy", - "Quartile3NumericJointEntropy", - "MaxNumericJointEntropy" - ] - }, - "Quartile1NumericJointEntropy": { - "function": "get_joint_entropy", - "arguments": { - "feature_class_array": "NoNaNBinnedNumericFeaturesAndClass" - }, - "returns": [ - "NumericJointEntropies", - "MeanNumericJointEntropy", - "StdevNumericJointEntropy", - "SkewNumericJointEntropy", - "KurtosisNumericJointEntropy", - "MinNumericJointEntropy", - "Quartile1NumericJointEntropy", - "Quartile2NumericJointEntropy", - "Quartile3NumericJointEntropy", - "MaxNumericJointEntropy" - ] - }, - "Quartile2NumericJointEntropy": { - "function": "get_joint_entropy", - "arguments": { - "feature_class_array": "NoNaNBinnedNumericFeaturesAndClass" - }, - "returns": [ - "NumericJointEntropies", - "MeanNumericJointEntropy", - "StdevNumericJointEntropy", - "SkewNumericJointEntropy", - "KurtosisNumericJointEntropy", - "MinNumericJointEntropy", - "Quartile1NumericJointEntropy", - "Quartile2NumericJointEntropy", - "Quartile3NumericJointEntropy", - "MaxNumericJointEntropy" - ] - }, - "Quartile3NumericJointEntropy": { - "function": "get_joint_entropy", - "arguments": { - "feature_class_array": "NoNaNBinnedNumericFeaturesAndClass" - }, - "returns": [ - "NumericJointEntropies", - "MeanNumericJointEntropy", - "StdevNumericJointEntropy", - "SkewNumericJointEntropy", - "KurtosisNumericJointEntropy", - "MinNumericJointEntropy", - "Quartile1NumericJointEntropy", - "Quartile2NumericJointEntropy", - "Quartile3NumericJointEntropy", - "MaxNumericJointEntropy" - ] - }, - "MaxNumericJointEntropy": { - "function": "get_joint_entropy", - "arguments": { - "feature_class_array": "NoNaNBinnedNumericFeaturesAndClass" - }, - "returns": [ - "NumericJointEntropies", - "MeanNumericJointEntropy", - "StdevNumericJointEntropy", - "SkewNumericJointEntropy", - "KurtosisNumericJointEntropy", - "MinNumericJointEntropy", - "Quartile1NumericJointEntropy", - "Quartile2NumericJointEntropy", - "Quartile3NumericJointEntropy", - "MaxNumericJointEntropy" - ] - }, - "MeanCategoricalMutualInformation": { - "function": "get_mutual_information", - "arguments": { - "feature_class_array": "NoNaNCategoricalFeaturesAndClass" - }, - "returns": [ - "CategoricalMutualInformation", - "MeanCategoricalMutualInformation", - "StdevCategoricalMutualInformation", - "SkewCategoricalMutualInformation", - "KurtosisCategoricalMutualInformation", - "MinCategoricalMutualInformation", - "Quartile1CategoricalMutualInformation", - "Quartile2CategoricalMutualInformation", - "Quartile3CategoricalMutualInformation", - "MaxCategoricalMutualInformation" - ] - }, - "StdevCategoricalMutualInformation": { - "function": "get_mutual_information", - "arguments": { - "feature_class_array": "NoNaNCategoricalFeaturesAndClass" - }, - "returns": [ - "CategoricalMutualInformation", - "MeanCategoricalMutualInformation", - "StdevCategoricalMutualInformation", - "SkewCategoricalMutualInformation", - "KurtosisCategoricalMutualInformation", - "MinCategoricalMutualInformation", - "Quartile1CategoricalMutualInformation", - "Quartile2CategoricalMutualInformation", - "Quartile3CategoricalMutualInformation", - "MaxCategoricalMutualInformation" - ] - }, - "SkewCategoricalMutualInformation": { - "function": "get_mutual_information", - "arguments": { - "feature_class_array": "NoNaNCategoricalFeaturesAndClass" - }, - "returns": [ - "CategoricalMutualInformation", - "MeanCategoricalMutualInformation", - "StdevCategoricalMutualInformation", - "SkewCategoricalMutualInformation", - "KurtosisCategoricalMutualInformation", - "MinCategoricalMutualInformation", - "Quartile1CategoricalMutualInformation", - "Quartile2CategoricalMutualInformation", - "Quartile3CategoricalMutualInformation", - "MaxCategoricalMutualInformation" - ] - }, - "KurtosisCategoricalMutualInformation": { - "function": "get_mutual_information", - "arguments": { - "feature_class_array": "NoNaNCategoricalFeaturesAndClass" - }, - "returns": [ - "CategoricalMutualInformation", - "MeanCategoricalMutualInformation", - "StdevCategoricalMutualInformation", - "SkewCategoricalMutualInformation", - "KurtosisCategoricalMutualInformation", - "MinCategoricalMutualInformation", - "Quartile1CategoricalMutualInformation", - "Quartile2CategoricalMutualInformation", - "Quartile3CategoricalMutualInformation", - "MaxCategoricalMutualInformation" - ] - }, - "MinCategoricalMutualInformation": { - "function": "get_mutual_information", - "arguments": { - "feature_class_array": "NoNaNCategoricalFeaturesAndClass" - }, - "returns": [ - "CategoricalMutualInformation", - "MeanCategoricalMutualInformation", - "StdevCategoricalMutualInformation", - "SkewCategoricalMutualInformation", - "KurtosisCategoricalMutualInformation", - "MinCategoricalMutualInformation", - "Quartile1CategoricalMutualInformation", - "Quartile2CategoricalMutualInformation", - "Quartile3CategoricalMutualInformation", - "MaxCategoricalMutualInformation" - ] - }, - "Quartile1CategoricalMutualInformation": { - "function": "get_mutual_information", - "arguments": { - "feature_class_array": "NoNaNCategoricalFeaturesAndClass" - }, - "returns": [ - "CategoricalMutualInformation", - "MeanCategoricalMutualInformation", - "StdevCategoricalMutualInformation", - "SkewCategoricalMutualInformation", - "KurtosisCategoricalMutualInformation", - "MinCategoricalMutualInformation", - "Quartile1CategoricalMutualInformation", - "Quartile2CategoricalMutualInformation", - "Quartile3CategoricalMutualInformation", - "MaxCategoricalMutualInformation" - ] - }, - "Quartile2CategoricalMutualInformation": { - "function": "get_mutual_information", - "arguments": { - "feature_class_array": "NoNaNCategoricalFeaturesAndClass" - }, - "returns": [ - "CategoricalMutualInformation", - "MeanCategoricalMutualInformation", - "StdevCategoricalMutualInformation", - "SkewCategoricalMutualInformation", - "KurtosisCategoricalMutualInformation", - "MinCategoricalMutualInformation", - "Quartile1CategoricalMutualInformation", - "Quartile2CategoricalMutualInformation", - "Quartile3CategoricalMutualInformation", - "MaxCategoricalMutualInformation" - ] - }, - "Quartile3CategoricalMutualInformation": { - "function": "get_mutual_information", - "arguments": { - "feature_class_array": "NoNaNCategoricalFeaturesAndClass" - }, - "returns": [ - "CategoricalMutualInformation", - "MeanCategoricalMutualInformation", - "StdevCategoricalMutualInformation", - "SkewCategoricalMutualInformation", - "KurtosisCategoricalMutualInformation", - "MinCategoricalMutualInformation", - "Quartile1CategoricalMutualInformation", - "Quartile2CategoricalMutualInformation", - "Quartile3CategoricalMutualInformation", - "MaxCategoricalMutualInformation" - ] - }, - "MaxCategoricalMutualInformation": { - "function": "get_mutual_information", - "arguments": { - "feature_class_array": "NoNaNCategoricalFeaturesAndClass" - }, - "returns": [ - "CategoricalMutualInformation", - "MeanCategoricalMutualInformation", - "StdevCategoricalMutualInformation", - "SkewCategoricalMutualInformation", - "KurtosisCategoricalMutualInformation", - "MinCategoricalMutualInformation", - "Quartile1CategoricalMutualInformation", - "Quartile2CategoricalMutualInformation", - "Quartile3CategoricalMutualInformation", - "MaxCategoricalMutualInformation" - ] - }, - "MeanNumericMutualInformation": { - "function": "get_mutual_information", - "arguments": { - "feature_class_array": "NoNaNBinnedNumericFeaturesAndClass" - }, - "returns": [ - "NumericMutualInformation", - "MeanNumericMutualInformation", - "StdevNumericMutualInformation", - "SkewNumericMutualInformation", - "KurtosisNumericMutualInformation", - "MinNumericMutualInformation", - "Quartile1NumericMutualInformation", - "Quartile2NumericMutualInformation", - "Quartile3NumericMutualInformation", - "MaxNumericMutualInformation" - ] - }, - "StdevNumericMutualInformation": { - "function": "get_mutual_information", - "arguments": { - "feature_class_array": "NoNaNBinnedNumericFeaturesAndClass" - }, - "returns": [ - "NumericMutualInformation", - "MeanNumericMutualInformation", - "StdevNumericMutualInformation", - "SkewNumericMutualInformation", - "KurtosisNumericMutualInformation", - "MinNumericMutualInformation", - "Quartile1NumericMutualInformation", - "Quartile2NumericMutualInformation", - "Quartile3NumericMutualInformation", - "MaxNumericMutualInformation" - ] - }, - "SkewNumericMutualInformation": { - "function": "get_mutual_information", - "arguments": { - "feature_class_array": "NoNaNBinnedNumericFeaturesAndClass" - }, - "returns": [ - "NumericMutualInformation", - "MeanNumericMutualInformation", - "StdevNumericMutualInformation", - "SkewNumericMutualInformation", - "KurtosisNumericMutualInformation", - "MinNumericMutualInformation", - "Quartile1NumericMutualInformation", - "Quartile2NumericMutualInformation", - "Quartile3NumericMutualInformation", - "MaxNumericMutualInformation" - ] - }, - "KurtosisNumericMutualInformation": { - "function": "get_mutual_information", - "arguments": { - "feature_class_array": "NoNaNBinnedNumericFeaturesAndClass" - }, - "returns": [ - "NumericMutualInformation", - "MeanNumericMutualInformation", - "StdevNumericMutualInformation", - "SkewNumericMutualInformation", - "KurtosisNumericMutualInformation", - "MinNumericMutualInformation", - "Quartile1NumericMutualInformation", - "Quartile2NumericMutualInformation", - "Quartile3NumericMutualInformation", - "MaxNumericMutualInformation" - ] - }, - "MinNumericMutualInformation": { - "function": "get_mutual_information", - "arguments": { - "feature_class_array": "NoNaNBinnedNumericFeaturesAndClass" - }, - "returns": [ - "NumericMutualInformation", - "MeanNumericMutualInformation", - "StdevNumericMutualInformation", - "SkewNumericMutualInformation", - "KurtosisNumericMutualInformation", - "MinNumericMutualInformation", - "Quartile1NumericMutualInformation", - "Quartile2NumericMutualInformation", - "Quartile3NumericMutualInformation", - "MaxNumericMutualInformation" - ] - }, - "Quartile1NumericMutualInformation": { - "function": "get_mutual_information", - "arguments": { - "feature_class_array": "NoNaNBinnedNumericFeaturesAndClass" - }, - "returns": [ - "NumericMutualInformation", - "MeanNumericMutualInformation", - "StdevNumericMutualInformation", - "SkewNumericMutualInformation", - "KurtosisNumericMutualInformation", - "MinNumericMutualInformation", - "Quartile1NumericMutualInformation", - "Quartile2NumericMutualInformation", - "Quartile3NumericMutualInformation", - "MaxNumericMutualInformation" - ] - }, - "Quartile2NumericMutualInformation": { - "function": "get_mutual_information", - "arguments": { - "feature_class_array": "NoNaNBinnedNumericFeaturesAndClass" - }, - "returns": [ - "NumericMutualInformation", - "MeanNumericMutualInformation", - "StdevNumericMutualInformation", - "SkewNumericMutualInformation", - "KurtosisNumericMutualInformation", - "MinNumericMutualInformation", - "Quartile1NumericMutualInformation", - "Quartile2NumericMutualInformation", - "Quartile3NumericMutualInformation", - "MaxNumericMutualInformation" - ] - }, - "Quartile3NumericMutualInformation": { - "function": "get_mutual_information", - "arguments": { - "feature_class_array": "NoNaNBinnedNumericFeaturesAndClass" - }, - "returns": [ - "NumericMutualInformation", - "MeanNumericMutualInformation", - "StdevNumericMutualInformation", - "SkewNumericMutualInformation", - "KurtosisNumericMutualInformation", - "MinNumericMutualInformation", - "Quartile1NumericMutualInformation", - "Quartile2NumericMutualInformation", - "Quartile3NumericMutualInformation", - "MaxNumericMutualInformation" - ] - }, - "MaxNumericMutualInformation": { - "function": "get_mutual_information", - "arguments": { - "feature_class_array": "NoNaNBinnedNumericFeaturesAndClass" - }, - "returns": [ - "NumericMutualInformation", - "MeanNumericMutualInformation", - "StdevNumericMutualInformation", - "SkewNumericMutualInformation", - "KurtosisNumericMutualInformation", - "MinNumericMutualInformation", - "Quartile1NumericMutualInformation", - "Quartile2NumericMutualInformation", - "Quartile3NumericMutualInformation", - "MaxNumericMutualInformation" - ] - }, - "EquivalentNumberOfCategoricalFeatures": { - "function": "get_equivalent_number_features", - "arguments": { - "class_entropy": "ClassEntropy", - "mutual_information": "MeanCategoricalMutualInformation" - }, - "returns": [ - "EquivalentNumberOfCategoricalFeatures" - ] - }, - "EquivalentNumberOfNumericFeatures": { - "function": "get_equivalent_number_features", - "arguments": { - "class_entropy": "ClassEntropy", - "mutual_information": "MeanNumericMutualInformation" - }, - "returns": [ - "EquivalentNumberOfNumericFeatures" - ] - }, - "CategoricalNoiseToSignalRatio": { - "function": "get_noise_signal_ratio", - "arguments": { - "attribute_entropy": "MeanCategoricalAttributeEntropy", - "mutual_information": "MeanCategoricalMutualInformation" - }, - "returns": [ - "CategoricalNoiseToSignalRatio" - ] - }, - "NumericNoiseToSignalRatio": { - "function": "get_noise_signal_ratio", - "arguments": { - "attribute_entropy": "MeanNumericAttributeEntropy", - "mutual_information": "MeanNumericMutualInformation" - }, - "returns": [ - "NumericNoiseToSignalRatio"] - }, - "NaiveBayesErrRate": { - "function": "get_naive_bayes", - "arguments": { - "X": "XPreprocessed", - "Y": "YSample", - "n_folds": "n_folds", - "cv_seed": "cv_seed" - }, - "returns": [ - "NaiveBayesErrRate", - "NaiveBayesKappa" - ] - }, - "NaiveBayesKappa": { - "function": "get_naive_bayes", - "arguments": { - "X": "XPreprocessed", - "Y": "YSample", - "n_folds": "n_folds", - "cv_seed": "cv_seed" - }, - "returns": [ - "NaiveBayesErrRate", - "NaiveBayesKappa" - ] - }, - "kNN1NErrRate": { - "function": "get_knn_1", - "arguments": { - "X": "XPreprocessed", - "Y": "YSample", - "n_folds": "n_folds", - "cv_seed": "cv_seed" - }, - "returns": [ - "kNN1NErrRate", - "kNN1NKappa" - ] - }, - "kNN1NKappa": { - "function": "get_knn_1", - "arguments": { - "X": "XPreprocessed", - "Y": "YSample", - "n_folds": "n_folds", - "cv_seed": "cv_seed" - }, - "returns": [ - "kNN1NErrRate", - "kNN1NKappa" - ] - }, - "DecisionStumpErrRate": { - "function": "get_decision_stump", - "arguments": { - "X": "XPreprocessed", - "Y": "YSample", - "seed": 5, - "n_folds": "n_folds", - "cv_seed": "cv_seed" - }, - "returns": [ - "DecisionStumpErrRate", - "DecisionStumpKappa" - ] - }, - "DecisionStumpKappa": { - "function": "get_decision_stump", - "arguments": { - "X": "XPreprocessed", - "Y": "YSample", - "seed": 5, - "n_folds": "n_folds", - "cv_seed": "cv_seed" - }, - "returns": [ - "DecisionStumpErrRate", - "DecisionStumpKappa" - ] - }, - "RandomTreeDepth1ErrRate": { - "function": "get_random_tree", - "arguments": { - "X": "XPreprocessed", - "Y": "YSample", - "depth": 1, - "seed": 6, - "n_folds": "n_folds", - "cv_seed": "cv_seed" - }, - "returns": [ - "RandomTreeDepth1ErrRate", - "RandomTreeDepth1Kappa" - ] - }, - "RandomTreeDepth1Kappa": { - "function": "get_random_tree", - "arguments": { - "X": "XPreprocessed", - "Y": "YSample", - "depth": 1, - "seed": 6, - "n_folds": "n_folds", - "cv_seed": "cv_seed" - }, - "returns": [ - "RandomTreeDepth1ErrRate", - "RandomTreeDepth1Kappa" - ] - }, - "RandomTreeDepth2ErrRate": { - "function": "get_random_tree", - "arguments": { - "X": "XPreprocessed", - "Y": "YSample", - "depth": 2, - "seed": 7, - "n_folds": "n_folds", - "cv_seed": "cv_seed" - }, - "returns": [ - "RandomTreeDepth2ErrRate", - "RandomTreeDepth2Kappa" - ] - }, - "RandomTreeDepth2Kappa": { - "function": "get_random_tree", - "arguments": { - "X": "XPreprocessed", - "Y": "YSample", - "depth": 2, - "seed": 7, - "n_folds": "n_folds", - "cv_seed": "cv_seed" - }, - "returns": [ - "RandomTreeDepth2ErrRate", - "RandomTreeDepth2Kappa" - ] - }, - "RandomTreeDepth3ErrRate": { - "function": "get_random_tree", - "arguments": { - "X": "XPreprocessed", - "Y": "YSample", - "depth": 3, - "seed": 8, - "n_folds": "n_folds", - "cv_seed": "cv_seed" - }, - "returns": [ - "RandomTreeDepth3ErrRate", - "RandomTreeDepth3Kappa" - ] - }, - "RandomTreeDepth3Kappa": { - "function": "get_random_tree", - "arguments": { - "X": "XPreprocessed", - "Y": "YSample", - "depth": 3, - "seed": 8, - "n_folds": "n_folds", - "cv_seed": "cv_seed" - }, - "returns": [ - "RandomTreeDepth3ErrRate", - "RandomTreeDepth3Kappa" - ] - }, - "LinearDiscriminantAnalysisErrRate": { - "function": "get_lda", - "arguments": { - "X": "XPreprocessed", - "Y": "YSample", - "n_folds": "n_folds", - "cv_seed": "cv_seed" - }, - "returns": [ - "LinearDiscriminantAnalysisErrRate", - "LinearDiscriminantAnalysisKappa" - ] - }, - "LinearDiscriminantAnalysisKappa": { - "function": "get_lda", - "arguments": { - "X": "XPreprocessed", - "Y": "YSample", - "n_folds": "n_folds", - "cv_seed": "cv_seed" - }, - "returns": [ - "LinearDiscriminantAnalysisErrRate", - "LinearDiscriminantAnalysisKappa" - ] - }, - "MeanMeansOfStringLengthOfTextFeatures": { - "function": "get_string_length_means", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "MeansOfStringLengthOfTextFeatures", - "MeanMeansOfStringLengthOfTextFeatures", - "StdevMeansOfStringLengthOfTextFeatures", - "SkewMeansOfStringLengthOfTextFeatures", - "KurtosisMeansOfStringLengthOfTextFeatures", - "MinMeansOfStringLengthOfTextFeatures", - "Quartile1MeansOfStringLengthOfTextFeatures", - "Quartile2MeansOfStringLengthOfTextFeatures", - "Quartile3MeansOfStringLengthOfTextFeatures", - "MaxMeansOfStringLengthOfTextFeatures" - ] - }, - "StdevMeansOfStringLengthOfTextFeatures": { - "function": "get_string_length_means", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "MeansOfStringLengthOfTextFeatures", - "MeanMeansOfStringLengthOfTextFeatures", - "StdevMeansOfStringLengthOfTextFeatures", - "SkewMeansOfStringLengthOfTextFeatures", - "KurtosisMeansOfStringLengthOfTextFeatures", - "MinMeansOfStringLengthOfTextFeatures", - "Quartile1MeansOfStringLengthOfTextFeatures", - "Quartile2MeansOfStringLengthOfTextFeatures", - "Quartile3MeansOfStringLengthOfTextFeatures", - "MaxMeansOfStringLengthOfTextFeatures" - ] - }, - "SkewMeansOfStringLengthOfTextFeatures": { - "function": "get_string_length_means", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "MeansOfStringLengthOfTextFeatures", - "MeanMeansOfStringLengthOfTextFeatures", - "StdevMeansOfStringLengthOfTextFeatures", - "SkewMeansOfStringLengthOfTextFeatures", - "KurtosisMeansOfStringLengthOfTextFeatures", - "MinMeansOfStringLengthOfTextFeatures", - "Quartile1MeansOfStringLengthOfTextFeatures", - "Quartile2MeansOfStringLengthOfTextFeatures", - "Quartile3MeansOfStringLengthOfTextFeatures", - "MaxMeansOfStringLengthOfTextFeatures" - ] - }, - "KurtosisMeansOfStringLengthOfTextFeatures": { - "function": "get_string_length_means", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "MeansOfStringLengthOfTextFeatures", - "MeanMeansOfStringLengthOfTextFeatures", - "StdevMeansOfStringLengthOfTextFeatures", - "SkewMeansOfStringLengthOfTextFeatures", - "KurtosisMeansOfStringLengthOfTextFeatures", - "MinMeansOfStringLengthOfTextFeatures", - "Quartile1MeansOfStringLengthOfTextFeatures", - "Quartile2MeansOfStringLengthOfTextFeatures", - "Quartile3MeansOfStringLengthOfTextFeatures", - "MaxMeansOfStringLengthOfTextFeatures" - ] - }, - "MinMeansOfStringLengthOfTextFeatures": { - "function": "get_string_length_means", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "MeansOfStringLengthOfTextFeatures", - "MeanMeansOfStringLengthOfTextFeatures", - "StdevMeansOfStringLengthOfTextFeatures", - "SkewMeansOfStringLengthOfTextFeatures", - "KurtosisMeansOfStringLengthOfTextFeatures", - "MinMeansOfStringLengthOfTextFeatures", - "Quartile1MeansOfStringLengthOfTextFeatures", - "Quartile2MeansOfStringLengthOfTextFeatures", - "Quartile3MeansOfStringLengthOfTextFeatures", - "MaxMeansOfStringLengthOfTextFeatures" - ] - }, - "MaxMeansOfStringLengthOfTextFeatures": { - "function": "get_string_length_means", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "MeansOfStringLengthOfTextFeatures", - "MeanMeansOfStringLengthOfTextFeatures", - "StdevMeansOfStringLengthOfTextFeatures", - "SkewMeansOfStringLengthOfTextFeatures", - "KurtosisMeansOfStringLengthOfTextFeatures", - "MinMeansOfStringLengthOfTextFeatures", - "Quartile1MeansOfStringLengthOfTextFeatures", - "Quartile2MeansOfStringLengthOfTextFeatures", - "Quartile3MeansOfStringLengthOfTextFeatures", - "MaxMeansOfStringLengthOfTextFeatures" - ] - }, - "Quartile1MeansOfStringLengthOfTextFeatures": { - "function": "get_string_length_means", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "MeansOfStringLengthOfTextFeatures", - "MeanMeansOfStringLengthOfTextFeatures", - "StdevMeansOfStringLengthOfTextFeatures", - "SkewMeansOfStringLengthOfTextFeatures", - "KurtosisMeansOfStringLengthOfTextFeatures", - "MinMeansOfStringLengthOfTextFeatures", - "Quartile1MeansOfStringLengthOfTextFeatures", - "Quartile2MeansOfStringLengthOfTextFeatures", - "Quartile3MeansOfStringLengthOfTextFeatures", - "MaxMeansOfStringLengthOfTextFeatures" - ] - }, - "Quartile2MeansOfStringLengthOfTextFeatures": { - "function": "get_string_length_means", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "MeansOfStringLengthOfTextFeatures", - "MeanMeansOfStringLengthOfTextFeatures", - "StdevMeansOfStringLengthOfTextFeatures", - "SkewMeansOfStringLengthOfTextFeatures", - "KurtosisMeansOfStringLengthOfTextFeatures", - "MinMeansOfStringLengthOfTextFeatures", - "Quartile1MeansOfStringLengthOfTextFeatures", - "Quartile2MeansOfStringLengthOfTextFeatures", - "Quartile3MeansOfStringLengthOfTextFeatures", - "MaxMeansOfStringLengthOfTextFeatures" - ] - }, - "Quartile3MeansOfStringLengthOfTextFeatures": { - "function": "get_string_length_means", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "MeansOfStringLengthOfTextFeatures", - "MeanMeansOfStringLengthOfTextFeatures", - "StdevMeansOfStringLengthOfTextFeatures", - "SkewMeansOfStringLengthOfTextFeatures", - "KurtosisMeansOfStringLengthOfTextFeatures", - "MinMeansOfStringLengthOfTextFeatures", - "Quartile1MeansOfStringLengthOfTextFeatures", - "Quartile2MeansOfStringLengthOfTextFeatures", - "Quartile3MeansOfStringLengthOfTextFeatures", - "MaxMeansOfStringLengthOfTextFeatures" - ] - }, - "MeanStdDevOfStringLengthOfTextFeatures": { - "function": "get_string_length_stdev", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "StdDevsOfStringLengthOfTextFeatures", - "MeanStdDevOfStringLengthOfTextFeatures", - "StdevStdDevOfStringLengthOfTextFeatures", - "SkewStdDevOfStringLengthOfTextFeatures", - "KurtosisStdDevOfStringLengthOfTextFeatures", - "MinStdDevOfStringLengthOfTextFeatures", - "Quartile1StdDevOfStringLengthOfTextFeatures", - "Quartile2StdDevOfStringLengthOfTextFeatures", - "Quartile3StdDevOfStringLengthOfTextFeatures", - "MaxStdDevOfStringLengthOfTextFeatures" - ] - }, - "StdevStdDevOfStringLengthOfTextFeatures": { - "function": "get_string_length_stdev", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "StdDevsOfStringLengthOfTextFeatures", - "MeanStdDevOfStringLengthOfTextFeatures", - "StdevStdDevOfStringLengthOfTextFeatures", - "SkewStdDevOfStringLengthOfTextFeatures", - "KurtosisStdDevOfStringLengthOfTextFeatures", - "MinStdDevOfStringLengthOfTextFeatures", - "Quartile1StdDevOfStringLengthOfTextFeatures", - "Quartile2StdDevOfStringLengthOfTextFeatures", - "Quartile3StdDevOfStringLengthOfTextFeatures", - "MaxStdDevOfStringLengthOfTextFeatures" - ] - }, - "SkewStdDevOfStringLengthOfTextFeatures": { - "function": "get_string_length_stdev", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "StdDevsOfStringLengthOfTextFeatures", - "MeanStdDevOfStringLengthOfTextFeatures", - "StdevStdDevOfStringLengthOfTextFeatures", - "SkewStdDevOfStringLengthOfTextFeatures", - "KurtosisStdDevOfStringLengthOfTextFeatures", - "MinStdDevOfStringLengthOfTextFeatures", - "Quartile1StdDevOfStringLengthOfTextFeatures", - "Quartile2StdDevOfStringLengthOfTextFeatures", - "Quartile3StdDevOfStringLengthOfTextFeatures", - "MaxStdDevOfStringLengthOfTextFeatures" - ] - }, - "KurtosisStdDevOfStringLengthOfTextFeatures": { - "function": "get_string_length_stdev", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "StdDevsOfStringLengthOfTextFeatures", - "MeanStdDevOfStringLengthOfTextFeatures", - "StdevStdDevOfStringLengthOfTextFeatures", - "SkewStdDevOfStringLengthOfTextFeatures", - "KurtosisStdDevOfStringLengthOfTextFeatures", - "MinStdDevOfStringLengthOfTextFeatures", - "Quartile1StdDevOfStringLengthOfTextFeatures", - "Quartile2StdDevOfStringLengthOfTextFeatures", - "Quartile3StdDevOfStringLengthOfTextFeatures", - "MaxStdDevOfStringLengthOfTextFeatures" - ] - }, - "MinStdDevOfStringLengthOfTextFeatures": { - "function": "get_string_length_stdev", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "StdDevsOfStringLengthOfTextFeatures", - "MeanStdDevOfStringLengthOfTextFeatures", - "StdevStdDevOfStringLengthOfTextFeatures", - "SkewStdDevOfStringLengthOfTextFeatures", - "KurtosisStdDevOfStringLengthOfTextFeatures", - "MinStdDevOfStringLengthOfTextFeatures", - "Quartile1StdDevOfStringLengthOfTextFeatures", - "Quartile2StdDevOfStringLengthOfTextFeatures", - "Quartile3StdDevOfStringLengthOfTextFeatures", - "MaxStdDevOfStringLengthOfTextFeatures" - ] - }, - "MaxStdDevOfStringLengthOfTextFeatures": { - "function": "get_string_length_stdev", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "StdDevsOfStringLengthOfTextFeatures", - "MeanStdDevOfStringLengthOfTextFeatures", - "StdevStdDevOfStringLengthOfTextFeatures", - "SkewStdDevOfStringLengthOfTextFeatures", - "KurtosisStdDevOfStringLengthOfTextFeatures", - "MinStdDevOfStringLengthOfTextFeatures", - "Quartile1StdDevOfStringLengthOfTextFeatures", - "Quartile2StdDevOfStringLengthOfTextFeatures", - "Quartile3StdDevOfStringLengthOfTextFeatures", - "MaxStdDevOfStringLengthOfTextFeatures" - ] - }, - "Quartile1StdDevOfStringLengthOfTextFeatures": { - "function": "get_string_length_stdev", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "StdDevsOfStringLengthOfTextFeatures", - "MeanStdDevOfStringLengthOfTextFeatures", - "StdevStdDevOfStringLengthOfTextFeatures", - "SkewStdDevOfStringLengthOfTextFeatures", - "KurtosisStdDevOfStringLengthOfTextFeatures", - "MinStdDevOfStringLengthOfTextFeatures", - "Quartile1StdDevOfStringLengthOfTextFeatures", - "Quartile2StdDevOfStringLengthOfTextFeatures", - "Quartile3StdDevOfStringLengthOfTextFeatures", - "MaxStdDevOfStringLengthOfTextFeatures" - ] - }, - "Quartile2StdDevOfStringLengthOfTextFeatures": { - "function": "get_string_length_stdev", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "StdDevsOfStringLengthOfTextFeatures", - "MeanStdDevOfStringLengthOfTextFeatures", - "StdevStdDevOfStringLengthOfTextFeatures", - "SkewStdDevOfStringLengthOfTextFeatures", - "KurtosisStdDevOfStringLengthOfTextFeatures", - "MinStdDevOfStringLengthOfTextFeatures", - "Quartile1StdDevOfStringLengthOfTextFeatures", - "Quartile2StdDevOfStringLengthOfTextFeatures", - "Quartile3StdDevOfStringLengthOfTextFeatures", - "MaxStdDevOfStringLengthOfTextFeatures" - ] - }, - "Quartile3StdDevOfStringLengthOfTextFeatures": { - "function": "get_string_length_stdev", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "StdDevsOfStringLengthOfTextFeatures", - "MeanStdDevOfStringLengthOfTextFeatures", - "StdevStdDevOfStringLengthOfTextFeatures", - "SkewStdDevOfStringLengthOfTextFeatures", - "KurtosisStdDevOfStringLengthOfTextFeatures", - "MinStdDevOfStringLengthOfTextFeatures", - "Quartile1StdDevOfStringLengthOfTextFeatures", - "Quartile2StdDevOfStringLengthOfTextFeatures", - "Quartile3StdDevOfStringLengthOfTextFeatures", - "MaxStdDevOfStringLengthOfTextFeatures" - ] - }, - "MeanSkewnessOfStringLengthOfTextFeatures": { - "function": "get_string_length_skewness", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "SkewnessesOfStringLengthOfTextFeatures", - "MeanSkewnessOfStringLengthOfTextFeatures", - "StdevSkewnessOfStringLengthOfTextFeatures", - "SkewSkewnessOfStringLengthOfTextFeatures", - "KurtosisSkewnessOfStringLengthOfTextFeatures", - "MinSkewnessOfStringLengthOfTextFeatures", - "Quartile1SkewnessOfStringLengthOfTextFeatures", - "Quartile2SkewnessOfStringLengthOfTextFeatures", - "Quartile3SkewnessOfStringLengthOfTextFeatures", - "MaxSkewnessOfStringLengthOfTextFeatures" - ] - }, - "StdevSkewnessOfStringLengthOfTextFeatures": { - "function": "get_string_length_skewness", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "SkewnessesOfStringLengthOfTextFeatures", - "MeanSkewnessOfStringLengthOfTextFeatures", - "StdevSkewnessOfStringLengthOfTextFeatures", - "SkewSkewnessOfStringLengthOfTextFeatures", - "KurtosisSkewnessOfStringLengthOfTextFeatures", - "MinSkewnessOfStringLengthOfTextFeatures", - "Quartile1SkewnessOfStringLengthOfTextFeatures", - "Quartile2SkewnessOfStringLengthOfTextFeatures", - "Quartile3SkewnessOfStringLengthOfTextFeatures", - "MaxSkewnessOfStringLengthOfTextFeatures" - ] - }, - "SkewSkewnessOfStringLengthOfTextFeatures": { - "function": "get_string_length_skewness", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "SkewnessesOfStringLengthOfTextFeatures", - "MeanSkewnessOfStringLengthOfTextFeatures", - "StdevSkewnessOfStringLengthOfTextFeatures", - "SkewSkewnessOfStringLengthOfTextFeatures", - "KurtosisSkewnessOfStringLengthOfTextFeatures", - "MinSkewnessOfStringLengthOfTextFeatures", - "Quartile1SkewnessOfStringLengthOfTextFeatures", - "Quartile2SkewnessOfStringLengthOfTextFeatures", - "Quartile3SkewnessOfStringLengthOfTextFeatures", - "MaxSkewnessOfStringLengthOfTextFeatures" - ] - }, - "KurtosisSkewnessOfStringLengthOfTextFeatures": { - "function": "get_string_length_skewness", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "SkewnessesOfStringLengthOfTextFeatures", - "MeanSkewnessOfStringLengthOfTextFeatures", - "StdevSkewnessOfStringLengthOfTextFeatures", - "SkewSkewnessOfStringLengthOfTextFeatures", - "KurtosisSkewnessOfStringLengthOfTextFeatures", - "MinSkewnessOfStringLengthOfTextFeatures", - "Quartile1SkewnessOfStringLengthOfTextFeatures", - "Quartile2SkewnessOfStringLengthOfTextFeatures", - "Quartile3SkewnessOfStringLengthOfTextFeatures", - "MaxSkewnessOfStringLengthOfTextFeatures" - ] - }, - "MinSkewnessOfStringLengthOfTextFeatures": { - "function": "get_string_length_skewness", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "SkewnessesOfStringLengthOfTextFeatures", - "MeanSkewnessOfStringLengthOfTextFeatures", - "StdevSkewnessOfStringLengthOfTextFeatures", - "SkewSkewnessOfStringLengthOfTextFeatures", - "KurtosisSkewnessOfStringLengthOfTextFeatures", - "MinSkewnessOfStringLengthOfTextFeatures", - "Quartile1SkewnessOfStringLengthOfTextFeatures", - "Quartile2SkewnessOfStringLengthOfTextFeatures", - "Quartile3SkewnessOfStringLengthOfTextFeatures", - "MaxSkewnessOfStringLengthOfTextFeatures" - ] - }, - "MaxSkewnessOfStringLengthOfTextFeatures": { - "function": "get_string_length_skewness", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "SkewnessesOfStringLengthOfTextFeatures", - "MeanSkewnessOfStringLengthOfTextFeatures", - "StdevSkewnessOfStringLengthOfTextFeatures", - "SkewSkewnessOfStringLengthOfTextFeatures", - "KurtosisSkewnessOfStringLengthOfTextFeatures", - "MinSkewnessOfStringLengthOfTextFeatures", - "Quartile1SkewnessOfStringLengthOfTextFeatures", - "Quartile2SkewnessOfStringLengthOfTextFeatures", - "Quartile3SkewnessOfStringLengthOfTextFeatures", - "MaxSkewnessOfStringLengthOfTextFeatures" - ] - }, - "Quartile1SkewnessOfStringLengthOfTextFeatures": { - "function": "get_string_length_skewness", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "SkewnessesOfStringLengthOfTextFeatures", - "MeanSkewnessOfStringLengthOfTextFeatures", - "StdevSkewnessOfStringLengthOfTextFeatures", - "SkewSkewnessOfStringLengthOfTextFeatures", - "KurtosisSkewnessOfStringLengthOfTextFeatures", - "MinSkewnessOfStringLengthOfTextFeatures", - "Quartile1SkewnessOfStringLengthOfTextFeatures", - "Quartile2SkewnessOfStringLengthOfTextFeatures", - "Quartile3SkewnessOfStringLengthOfTextFeatures", - "MaxSkewnessOfStringLengthOfTextFeatures" - ] - }, - "Quartile2SkewnessOfStringLengthOfTextFeatures": { - "function": "get_string_length_skewness", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "SkewnessesOfStringLengthOfTextFeatures", - "MeanSkewnessOfStringLengthOfTextFeatures", - "StdevSkewnessOfStringLengthOfTextFeatures", - "SkewSkewnessOfStringLengthOfTextFeatures", - "KurtosisSkewnessOfStringLengthOfTextFeatures", - "MinSkewnessOfStringLengthOfTextFeatures", - "Quartile1SkewnessOfStringLengthOfTextFeatures", - "Quartile2SkewnessOfStringLengthOfTextFeatures", - "Quartile3SkewnessOfStringLengthOfTextFeatures", - "MaxSkewnessOfStringLengthOfTextFeatures" - ] - }, - "Quartile3SkewnessOfStringLengthOfTextFeatures": { - "function": "get_string_length_skewness", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "SkewnessesOfStringLengthOfTextFeatures", - "MeanSkewnessOfStringLengthOfTextFeatures", - "StdevSkewnessOfStringLengthOfTextFeatures", - "SkewSkewnessOfStringLengthOfTextFeatures", - "KurtosisSkewnessOfStringLengthOfTextFeatures", - "MinSkewnessOfStringLengthOfTextFeatures", - "Quartile1SkewnessOfStringLengthOfTextFeatures", - "Quartile2SkewnessOfStringLengthOfTextFeatures", - "Quartile3SkewnessOfStringLengthOfTextFeatures", - "MaxSkewnessOfStringLengthOfTextFeatures" - ] - }, - "MeanKurtosisOfStringLengthOfTextFeatures": { - "function": "get_string_length_kurtosis", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "KurtosesOfStringLengthOfTextFeatures", - "MeanKurtosisOfStringLengthOfTextFeatures", - "StdevKurtosisOfStringLengthOfTextFeatures", - "SkewKurtosisOfStringLengthOfTextFeatures", - "KurtosisKurtosisOfStringLengthOfTextFeatures", - "MinKurtosisOfStringLengthOfTextFeatures", - "Quartile1KurtosisOfStringLengthOfTextFeatures", - "Quartile2KurtosisOfStringLengthOfTextFeatures", - "Quartile3KurtosisOfStringLengthOfTextFeatures", - "MaxKurtosisOfStringLengthOfTextFeatures" - ] - }, - "StdevKurtosisOfStringLengthOfTextFeatures": { - "function": "get_string_length_kurtosis", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "KurtosesOfStringLengthOfTextFeatures", - "MeanKurtosisOfStringLengthOfTextFeatures", - "StdevKurtosisOfStringLengthOfTextFeatures", - "SkewKurtosisOfStringLengthOfTextFeatures", - "KurtosisKurtosisOfStringLengthOfTextFeatures", - "MinKurtosisOfStringLengthOfTextFeatures", - "Quartile1KurtosisOfStringLengthOfTextFeatures", - "Quartile2KurtosisOfStringLengthOfTextFeatures", - "Quartile3KurtosisOfStringLengthOfTextFeatures", - "MaxKurtosisOfStringLengthOfTextFeatures" - ] - }, - "SkewKurtosisOfStringLengthOfTextFeatures": { - "function": "get_string_length_kurtosis", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "KurtosesOfStringLengthOfTextFeatures", - "MeanKurtosisOfStringLengthOfTextFeatures", - "StdevKurtosisOfStringLengthOfTextFeatures", - "SkewKurtosisOfStringLengthOfTextFeatures", - "KurtosisKurtosisOfStringLengthOfTextFeatures", - "MinKurtosisOfStringLengthOfTextFeatures", - "Quartile1KurtosisOfStringLengthOfTextFeatures", - "Quartile2KurtosisOfStringLengthOfTextFeatures", - "Quartile3KurtosisOfStringLengthOfTextFeatures", - "MaxKurtosisOfStringLengthOfTextFeatures" - ] - }, - "KurtosisKurtosisOfStringLengthOfTextFeatures": { - "function": "get_string_length_kurtosis", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "KurtosesOfStringLengthOfTextFeatures", - "MeanKurtosisOfStringLengthOfTextFeatures", - "StdevKurtosisOfStringLengthOfTextFeatures", - "SkewKurtosisOfStringLengthOfTextFeatures", - "KurtosisKurtosisOfStringLengthOfTextFeatures", - "MinKurtosisOfStringLengthOfTextFeatures", - "Quartile1KurtosisOfStringLengthOfTextFeatures", - "Quartile2KurtosisOfStringLengthOfTextFeatures", - "Quartile3KurtosisOfStringLengthOfTextFeatures", - "MaxKurtosisOfStringLengthOfTextFeatures" - ] - }, - "MinKurtosisOfStringLengthOfTextFeatures": { - "function": "get_string_length_kurtosis", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "KurtosesOfStringLengthOfTextFeatures", - "MeanKurtosisOfStringLengthOfTextFeatures", - "StdevKurtosisOfStringLengthOfTextFeatures", - "SkewKurtosisOfStringLengthOfTextFeatures", - "KurtosisKurtosisOfStringLengthOfTextFeatures", - "MinKurtosisOfStringLengthOfTextFeatures", - "Quartile1KurtosisOfStringLengthOfTextFeatures", - "Quartile2KurtosisOfStringLengthOfTextFeatures", - "Quartile3KurtosisOfStringLengthOfTextFeatures", - "MaxKurtosisOfStringLengthOfTextFeatures" - ] - }, - "MaxKurtosisOfStringLengthOfTextFeatures": { - "function": "get_string_length_kurtosis", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "KurtosesOfStringLengthOfTextFeatures", - "MeanKurtosisOfStringLengthOfTextFeatures", - "StdevKurtosisOfStringLengthOfTextFeatures", - "SkewKurtosisOfStringLengthOfTextFeatures", - "KurtosisKurtosisOfStringLengthOfTextFeatures", - "MinKurtosisOfStringLengthOfTextFeatures", - "Quartile1KurtosisOfStringLengthOfTextFeatures", - "Quartile2KurtosisOfStringLengthOfTextFeatures", - "Quartile3KurtosisOfStringLengthOfTextFeatures", - "MaxKurtosisOfStringLengthOfTextFeatures" - ] - }, - "Quartile1KurtosisOfStringLengthOfTextFeatures": { - "function": "get_string_length_kurtosis", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "KurtosesOfStringLengthOfTextFeatures", - "MeanKurtosisOfStringLengthOfTextFeatures", - "StdevKurtosisOfStringLengthOfTextFeatures", - "SkewKurtosisOfStringLengthOfTextFeatures", - "KurtosisKurtosisOfStringLengthOfTextFeatures", - "MinKurtosisOfStringLengthOfTextFeatures", - "Quartile1KurtosisOfStringLengthOfTextFeatures", - "Quartile2KurtosisOfStringLengthOfTextFeatures", - "Quartile3KurtosisOfStringLengthOfTextFeatures", - "MaxKurtosisOfStringLengthOfTextFeatures" - ] - }, - "Quartile2KurtosisOfStringLengthOfTextFeatures": { - "function": "get_string_length_kurtosis", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "KurtosesOfStringLengthOfTextFeatures", - "MeanKurtosisOfStringLengthOfTextFeatures", - "StdevKurtosisOfStringLengthOfTextFeatures", - "SkewKurtosisOfStringLengthOfTextFeatures", - "KurtosisKurtosisOfStringLengthOfTextFeatures", - "MinKurtosisOfStringLengthOfTextFeatures", - "Quartile1KurtosisOfStringLengthOfTextFeatures", - "Quartile2KurtosisOfStringLengthOfTextFeatures", - "Quartile3KurtosisOfStringLengthOfTextFeatures", - "MaxKurtosisOfStringLengthOfTextFeatures" - ] - }, - "Quartile3KurtosisOfStringLengthOfTextFeatures": { - "function": "get_string_length_kurtosis", - "arguments": { - "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" - }, - "returns": [ - "KurtosesOfStringLengthOfTextFeatures", - "MeanKurtosisOfStringLengthOfTextFeatures", - "StdevKurtosisOfStringLengthOfTextFeatures", - "SkewKurtosisOfStringLengthOfTextFeatures", - "KurtosisKurtosisOfStringLengthOfTextFeatures", - "MinKurtosisOfStringLengthOfTextFeatures", - "Quartile1KurtosisOfStringLengthOfTextFeatures", - "Quartile2KurtosisOfStringLengthOfTextFeatures", - "Quartile3KurtosisOfStringLengthOfTextFeatures", - "MaxKurtosisOfStringLengthOfTextFeatures" - ] - }, - "MeanDecisionTreeLevelSize": { - "function": "get_decision_tree_level_sizes", - "arguments": { - "tree": "TraversedDecisionTree" - }, - "returns": [ - "DecisionTreeLevelSizes", - "MeanDecisionTreeLevelSize", - "StdevDecisionTreeLevelSize", - "SkewDecisionTreeLevelSize", - "KurtosisDecisionTreeLevelSize", - "MinDecisionTreeLevelSize", - "Quartile1DecisionTreeLevelSize", - "Quartile2DecisionTreeLevelSize", - "Quartile3DecisionTreeLevelSize", - "MaxDecisionTreeLevelSize" - ] - }, - "StdevDecisionTreeLevelSize": { - "function": "get_decision_tree_level_sizes", - "arguments": { - "tree": "TraversedDecisionTree" - }, - "returns": [ - "DecisionTreeLevelSizes", - "MeanDecisionTreeLevelSize", - "StdevDecisionTreeLevelSize", - "SkewDecisionTreeLevelSize", - "KurtosisDecisionTreeLevelSize", - "MinDecisionTreeLevelSize", - "Quartile1DecisionTreeLevelSize", - "Quartile2DecisionTreeLevelSize", - "Quartile3DecisionTreeLevelSize", - "MaxDecisionTreeLevelSize" - ] - }, - "SkewDecisionTreeLevelSize": { - "function": "get_decision_tree_level_sizes", - "arguments": { - "tree": "TraversedDecisionTree" - }, - "returns": [ - "DecisionTreeLevelSizes", - "MeanDecisionTreeLevelSize", - "StdevDecisionTreeLevelSize", - "SkewDecisionTreeLevelSize", - "KurtosisDecisionTreeLevelSize", - "MinDecisionTreeLevelSize", - "Quartile1DecisionTreeLevelSize", - "Quartile2DecisionTreeLevelSize", - "Quartile3DecisionTreeLevelSize", - "MaxDecisionTreeLevelSize" - ] - }, - "KurtosisDecisionTreeLevelSize": { - "function": "get_decision_tree_level_sizes", - "arguments": { - "tree": "TraversedDecisionTree" - }, - "returns": [ - "DecisionTreeLevelSizes", - "MeanDecisionTreeLevelSize", - "StdevDecisionTreeLevelSize", - "SkewDecisionTreeLevelSize", - "KurtosisDecisionTreeLevelSize", - "MinDecisionTreeLevelSize", - "Quartile1DecisionTreeLevelSize", - "Quartile2DecisionTreeLevelSize", - "Quartile3DecisionTreeLevelSize", - "MaxDecisionTreeLevelSize" - ] - }, - "MinDecisionTreeLevelSize": { - "function": "get_decision_tree_level_sizes", - "arguments": { - "tree": "TraversedDecisionTree" - }, - "returns": [ - "DecisionTreeLevelSizes", - "MeanDecisionTreeLevelSize", - "StdevDecisionTreeLevelSize", - "SkewDecisionTreeLevelSize", - "KurtosisDecisionTreeLevelSize", - "MinDecisionTreeLevelSize", - "Quartile1DecisionTreeLevelSize", - "Quartile2DecisionTreeLevelSize", - "Quartile3DecisionTreeLevelSize", - "MaxDecisionTreeLevelSize" - ] - }, - "Quartile1DecisionTreeLevelSize": { - "function": "get_decision_tree_level_sizes", - "arguments": { - "tree": "TraversedDecisionTree" - }, - "returns": [ - "DecisionTreeLevelSizes", - "MeanDecisionTreeLevelSize", - "StdevDecisionTreeLevelSize", - "SkewDecisionTreeLevelSize", - "KurtosisDecisionTreeLevelSize", - "MinDecisionTreeLevelSize", - "Quartile1DecisionTreeLevelSize", - "Quartile2DecisionTreeLevelSize", - "Quartile3DecisionTreeLevelSize", - "MaxDecisionTreeLevelSize" - ] - }, - "Quartile2DecisionTreeLevelSize": { - "function": "get_decision_tree_level_sizes", - "arguments": { - "tree": "TraversedDecisionTree" - }, - "returns": [ - "DecisionTreeLevelSizes", - "MeanDecisionTreeLevelSize", - "StdevDecisionTreeLevelSize", - "SkewDecisionTreeLevelSize", - "KurtosisDecisionTreeLevelSize", - "MinDecisionTreeLevelSize", - "Quartile1DecisionTreeLevelSize", - "Quartile2DecisionTreeLevelSize", - "Quartile3DecisionTreeLevelSize", - "MaxDecisionTreeLevelSize" - ] - }, - "Quartile3DecisionTreeLevelSize": { - "function": "get_decision_tree_level_sizes", - "arguments": { - "tree": "TraversedDecisionTree" - }, - "returns": [ - "DecisionTreeLevelSizes", - "MeanDecisionTreeLevelSize", - "StdevDecisionTreeLevelSize", - "SkewDecisionTreeLevelSize", - "KurtosisDecisionTreeLevelSize", - "MinDecisionTreeLevelSize", - "Quartile1DecisionTreeLevelSize", - "Quartile2DecisionTreeLevelSize", - "Quartile3DecisionTreeLevelSize", - "MaxDecisionTreeLevelSize" - ] - }, - "MaxDecisionTreeLevelSize": { - "function": "get_decision_tree_level_sizes", - "arguments": { - "tree": "TraversedDecisionTree" - }, - "returns": [ - "DecisionTreeLevelSizes", - "MeanDecisionTreeLevelSize", - "StdevDecisionTreeLevelSize", - "SkewDecisionTreeLevelSize", - "KurtosisDecisionTreeLevelSize", - "MinDecisionTreeLevelSize", - "Quartile1DecisionTreeLevelSize", - "Quartile2DecisionTreeLevelSize", - "Quartile3DecisionTreeLevelSize", - "MaxDecisionTreeLevelSize" - ] - }, - "MeanDecisionTreeBranchLength": { - "function": "get_decision_tree_branch_lengths", - "arguments": { - "tree": "TraversedDecisionTree" - }, - "returns": [ - "DecisionTreeBranchLengths", - "MeanDecisionTreeBranchLength", - "StdevDecisionTreeBranchLength", - "SkewDecisionTreeBranchLength", - "KurtosisDecisionTreeBranchLength", - "MinDecisionTreeBranchLength", - "Quartile1DecisionTreeBranchLength", - "Quartile2DecisionTreeBranchLength", - "Quartile3DecisionTreeBranchLength", - "MaxDecisionTreeBranchLength" - ] - }, - "StdevDecisionTreeBranchLength": { - "function": "get_decision_tree_branch_lengths", - "arguments": { - "tree": "TraversedDecisionTree" - }, - "returns": [ - "DecisionTreeBranchLengths", - "MeanDecisionTreeBranchLength", - "StdevDecisionTreeBranchLength", - "SkewDecisionTreeBranchLength", - "KurtosisDecisionTreeBranchLength", - "MinDecisionTreeBranchLength", - "Quartile1DecisionTreeBranchLength", - "Quartile2DecisionTreeBranchLength", - "Quartile3DecisionTreeBranchLength", - "MaxDecisionTreeBranchLength" - ] - }, - "SkewDecisionTreeBranchLength": { - "function": "get_decision_tree_branch_lengths", - "arguments": { - "tree": "TraversedDecisionTree" - }, - "returns": [ - "DecisionTreeBranchLengths", - "MeanDecisionTreeBranchLength", - "StdevDecisionTreeBranchLength", - "SkewDecisionTreeBranchLength", - "KurtosisDecisionTreeBranchLength", - "MinDecisionTreeBranchLength", - "Quartile1DecisionTreeBranchLength", - "Quartile2DecisionTreeBranchLength", - "Quartile3DecisionTreeBranchLength", - "MaxDecisionTreeBranchLength" - ] - }, - "KurtosisDecisionTreeBranchLength": { - "function": "get_decision_tree_branch_lengths", - "arguments": { - "tree": "TraversedDecisionTree" - }, - "returns": [ - "DecisionTreeBranchLengths", - "MeanDecisionTreeBranchLength", - "StdevDecisionTreeBranchLength", - "SkewDecisionTreeBranchLength", - "KurtosisDecisionTreeBranchLength", - "MinDecisionTreeBranchLength", - "Quartile1DecisionTreeBranchLength", - "Quartile2DecisionTreeBranchLength", - "Quartile3DecisionTreeBranchLength", - "MaxDecisionTreeBranchLength" - ] - }, - "MinDecisionTreeBranchLength": { - "function": "get_decision_tree_branch_lengths", - "arguments": { - "tree": "TraversedDecisionTree" - }, - "returns": [ - "DecisionTreeBranchLengths", - "MeanDecisionTreeBranchLength", - "StdevDecisionTreeBranchLength", - "SkewDecisionTreeBranchLength", - "KurtosisDecisionTreeBranchLength", - "MinDecisionTreeBranchLength", - "Quartile1DecisionTreeBranchLength", - "Quartile2DecisionTreeBranchLength", - "Quartile3DecisionTreeBranchLength", - "MaxDecisionTreeBranchLength" - ] - }, - "Quartile1DecisionTreeBranchLength": { - "function": "get_decision_tree_branch_lengths", - "arguments": { - "tree": "TraversedDecisionTree" - }, - "returns": [ - "DecisionTreeBranchLengths", - "MeanDecisionTreeBranchLength", - "StdevDecisionTreeBranchLength", - "SkewDecisionTreeBranchLength", - "KurtosisDecisionTreeBranchLength", - "MinDecisionTreeBranchLength", - "Quartile1DecisionTreeBranchLength", - "Quartile2DecisionTreeBranchLength", - "Quartile3DecisionTreeBranchLength", - "MaxDecisionTreeBranchLength" - ] - }, - "Quartile2DecisionTreeBranchLength": { - "function": "get_decision_tree_branch_lengths", - "arguments": { - "tree": "TraversedDecisionTree" - }, - "returns": [ - "DecisionTreeBranchLengths", - "MeanDecisionTreeBranchLength", - "StdevDecisionTreeBranchLength", - "SkewDecisionTreeBranchLength", - "KurtosisDecisionTreeBranchLength", - "MinDecisionTreeBranchLength", - "Quartile1DecisionTreeBranchLength", - "Quartile2DecisionTreeBranchLength", - "Quartile3DecisionTreeBranchLength", - "MaxDecisionTreeBranchLength" - ] - }, - "Quartile3DecisionTreeBranchLength": { - "function": "get_decision_tree_branch_lengths", - "arguments": { - "tree": "TraversedDecisionTree" - }, - "returns": [ - "DecisionTreeBranchLengths", - "MeanDecisionTreeBranchLength", - "StdevDecisionTreeBranchLength", - "SkewDecisionTreeBranchLength", - "KurtosisDecisionTreeBranchLength", - "MinDecisionTreeBranchLength", - "Quartile1DecisionTreeBranchLength", - "Quartile2DecisionTreeBranchLength", - "Quartile3DecisionTreeBranchLength", - "MaxDecisionTreeBranchLength" - ] - }, - "MaxDecisionTreeBranchLength": { - "function": "get_decision_tree_branch_lengths", - "arguments": { - "tree": "TraversedDecisionTree" - }, - "returns": [ - "DecisionTreeBranchLengths", - "MeanDecisionTreeBranchLength", - "StdevDecisionTreeBranchLength", - "SkewDecisionTreeBranchLength", - "KurtosisDecisionTreeBranchLength", - "MinDecisionTreeBranchLength", - "Quartile1DecisionTreeBranchLength", - "Quartile2DecisionTreeBranchLength", - "Quartile3DecisionTreeBranchLength", - "MaxDecisionTreeBranchLength" - ] - }, - "DecisionTreeNodeCount": { - "function": "get_decision_tree_general_info", - "arguments": { - "tree": "DecisionTree" - }, - "returns": [ - "DecisionTreeNodeCount", - "DecisionTreeLeafCount", - "DecisionTreeHeight", - "DecisionTreeWidth" - ] - }, - "DecisionTreeLeafCount": { - "function": "get_decision_tree_general_info", - "arguments": { - "tree": "DecisionTree" - }, - "returns": [ - "DecisionTreeNodeCount", - "DecisionTreeLeafCount", - "DecisionTreeHeight", - "DecisionTreeWidth" - ] - }, - "DecisionTreeHeight": { - "function": "get_decision_tree_general_info", - "arguments": { - "tree": "DecisionTree" - }, - "returns": [ - "DecisionTreeNodeCount", - "DecisionTreeLeafCount", - "DecisionTreeHeight", - "DecisionTreeWidth" - ] - }, - "DecisionTreeWidth": { - "function": "get_decision_tree_width", - "arguments": { - "tree": "TraversedDecisionTree" - }, - "returns": [ - "DecisionTreeWidth" - ] - }, - "MeanDecisionTreeAttribute": { - "function": "get_decision_tree_attributes", - "arguments": { - "tree": "DecisionTree" - }, - "returns": [ - "DecisionTreeAttributes", - "MeanDecisionTreeAttribute", - "StdevDecisionTreeAttribute", - "SkewDecisionTreeAttribute", - "KurtosisDecisionTreeAttribute", - "MinDecisionTreeAttribute", - "Quartile1DecisionTreeAttribute", - "Quartile2DecisionTreeAttribute", - "Quartile3DecisionTreeAttribute", - "MaxDecisionTreeAttribute" - ] - }, - "StdevDecisionTreeAttribute": { - "function": "get_decision_tree_attributes", - "arguments": { - "tree": "DecisionTree" - }, - "returns": [ - "DecisionTreeAttributes", - "MeanDecisionTreeAttribute", - "StdevDecisionTreeAttribute", - "SkewDecisionTreeAttribute", - "KurtosisDecisionTreeAttribute", - "MinDecisionTreeAttribute", - "Quartile1DecisionTreeAttribute", - "Quartile2DecisionTreeAttribute", - "Quartile3DecisionTreeAttribute", - "MaxDecisionTreeAttribute" - ] - }, - "SkewDecisionTreeAttribute": { - "function": "get_decision_tree_attributes", - "arguments": { - "tree": "DecisionTree" - }, - "returns": [ - "DecisionTreeAttributes", - "MeanDecisionTreeAttribute", - "StdevDecisionTreeAttribute", - "SkewDecisionTreeAttribute", - "KurtosisDecisionTreeAttribute", - "MinDecisionTreeAttribute", - "Quartile1DecisionTreeAttribute", - "Quartile2DecisionTreeAttribute", - "Quartile3DecisionTreeAttribute", - "MaxDecisionTreeAttribute" - ] - }, - "KurtosisDecisionTreeAttribute": { - "function": "get_decision_tree_attributes", - "arguments": { - "tree": "DecisionTree" - }, - "returns": [ - "DecisionTreeAttributes", - "MeanDecisionTreeAttribute", - "StdevDecisionTreeAttribute", - "SkewDecisionTreeAttribute", - "KurtosisDecisionTreeAttribute", - "MinDecisionTreeAttribute", - "Quartile1DecisionTreeAttribute", - "Quartile2DecisionTreeAttribute", - "Quartile3DecisionTreeAttribute", - "MaxDecisionTreeAttribute" - ] - }, - "MinDecisionTreeAttribute": { - "function": "get_decision_tree_attributes", - "arguments": { - "tree": "DecisionTree" - }, - "returns": [ - "DecisionTreeAttributes", - "MeanDecisionTreeAttribute", - "StdevDecisionTreeAttribute", - "SkewDecisionTreeAttribute", - "KurtosisDecisionTreeAttribute", - "MinDecisionTreeAttribute", - "Quartile1DecisionTreeAttribute", - "Quartile2DecisionTreeAttribute", - "Quartile3DecisionTreeAttribute", - "MaxDecisionTreeAttribute" - ] - }, - "Quartile1DecisionTreeAttribute": { - "function": "get_decision_tree_attributes", - "arguments": { - "tree": "DecisionTree" - }, - "returns": [ - "DecisionTreeAttributes", - "MeanDecisionTreeAttribute", - "StdevDecisionTreeAttribute", - "SkewDecisionTreeAttribute", - "KurtosisDecisionTreeAttribute", - "MinDecisionTreeAttribute", - "Quartile1DecisionTreeAttribute", - "Quartile2DecisionTreeAttribute", - "Quartile3DecisionTreeAttribute", - "MaxDecisionTreeAttribute" - ] - }, - "Quartile2DecisionTreeAttribute": { - "function": "get_decision_tree_attributes", - "arguments": { - "tree": "DecisionTree" - }, - "returns": [ - "DecisionTreeAttributes", - "MeanDecisionTreeAttribute", - "StdevDecisionTreeAttribute", - "SkewDecisionTreeAttribute", - "KurtosisDecisionTreeAttribute", - "MinDecisionTreeAttribute", - "Quartile1DecisionTreeAttribute", - "Quartile2DecisionTreeAttribute", - "Quartile3DecisionTreeAttribute", - "MaxDecisionTreeAttribute" - ] - }, - "Quartile3DecisionTreeAttribute": { - "function": "get_decision_tree_attributes", - "arguments": { - "tree": "DecisionTree" - }, - "returns": [ - "DecisionTreeAttributes", - "MeanDecisionTreeAttribute", - "StdevDecisionTreeAttribute", - "SkewDecisionTreeAttribute", - "KurtosisDecisionTreeAttribute", - "MinDecisionTreeAttribute", - "Quartile1DecisionTreeAttribute", - "Quartile2DecisionTreeAttribute", - "Quartile3DecisionTreeAttribute", - "MaxDecisionTreeAttribute" - ] - }, - "MaxDecisionTreeAttribute": { - "function": "get_decision_tree_attributes", - "arguments": { - "tree": "DecisionTree" - }, - "returns": [ - "DecisionTreeAttributes", - "MeanDecisionTreeAttribute", - "StdevDecisionTreeAttribute", - "SkewDecisionTreeAttribute", - "KurtosisDecisionTreeAttribute", - "MinDecisionTreeAttribute", - "Quartile1DecisionTreeAttribute", - "Quartile2DecisionTreeAttribute", - "Quartile3DecisionTreeAttribute", - "MaxDecisionTreeAttribute" - ] - }, - "NumberOfTokens": { - "function": "get_mfs_for_tokens_split_by_space", - "arguments": { - "text_features_array": "NoNaNTextFeatures" - }, - "returns": [ - "NumberOfTokens", - "NumberOfDistinctTokens", - "NumberOfTokensContainingNumericChar", - "RatioOfDistinctTokens", - "RatioOfTokensContainingNumericChar" - ] - }, - "NumberOfDistinctTokens": { - "function": "get_mfs_for_tokens_split_by_space", - "arguments": { - "text_features_array": "NoNaNTextFeatures" - }, - "returns": [ - "NumberOfTokens", - "NumberOfDistinctTokens", - "NumberOfTokensContainingNumericChar", - "RatioOfDistinctTokens", - "RatioOfTokensContainingNumericChar" - ] - }, - "NumberOfTokensContainingNumericChar": { - "function": "get_mfs_for_tokens_split_by_space", - "arguments": { - "text_features_array": "NoNaNTextFeatures" - }, - "returns": [ - "NumberOfTokens", - "NumberOfDistinctTokens", - "NumberOfTokensContainingNumericChar", - "RatioOfDistinctTokens", - "RatioOfTokensContainingNumericChar" - ] - }, - "RatioOfDistinctTokens": { - "function": "get_mfs_for_tokens_split_by_space", - "arguments": { - "text_features_array": "NoNaNTextFeatures" - }, - "returns": [ - "NumberOfTokens", - "NumberOfDistinctTokens", - "NumberOfTokensContainingNumericChar", - "RatioOfDistinctTokens", - "RatioOfTokensContainingNumericChar" - ] - }, - "RatioOfTokensContainingNumericChar": { - "function": "get_mfs_for_tokens_split_by_space", - "arguments": { - "text_features_array": "NoNaNTextFeatures" - }, - "returns": [ - "NumberOfTokens", - "NumberOfDistinctTokens", - "NumberOfTokensContainingNumericChar", - "RatioOfDistinctTokens", - "RatioOfTokensContainingNumericChar" - ] - } - } -} diff --git a/metalearn/metafeatures/metafeatures.py b/metalearn/metafeatures/metafeatures.py index c87be61..b1a02d7 100644 --- a/metalearn/metafeatures/metafeatures.py +++ b/metalearn/metafeatures/metafeatures.py @@ -9,16 +9,11 @@ import numpy as np import pandas as pd from pandas import DataFrame, Series -from sklearn.model_selection import StratifiedShuffleSplit -from .resources import METAFEATURE_CONFIG -from .common_operations import * -from .simple_metafeatures import * -from .statistical_metafeatures import * -from .information_theoretic_metafeatures import * -from .landmarking_metafeatures import * -from .decision_tree_metafeatures import * -from .text_metafeatures import * +from metalearn.metafeatures.common_operations import * +from metalearn.metafeatures.resources import resources_info, metafeature_ids +from metalearn.metafeatures.base import ResourceComputer, MetafeatureComputer, ResourceComputerMap +import metalearn.metafeatures.constants as consts class Metafeatures(object): @@ -29,21 +24,9 @@ class Metafeatures(object): meta-learning applications. """ - VALUE_KEY = 'value' - COMPUTE_TIME_KEY = 'compute_time' - NUMERIC = "NUMERIC" - TEXT = "TEXT" - CATEGORICAL = "CATEGORICAL" - NO_TARGETS = "NO_TARGETS" - NUMERIC_TARGETS = "NUMERIC_TARGETS" - TIMEOUT = "TIMEOUT" - with open(METAFEATURE_CONFIG, 'r') as f: - _metadata = json.load(f) - IDS = list(_metadata["metafeatures"].keys()) - _resources_info = {} - _resources_info.update(_metadata["resources"]) - _resources_info.update(_metadata["metafeatures"]) + _resources_info: ResourceComputerMap = resources_info + IDS: List[str] = metafeature_ids @classmethod def list_metafeatures(cls, group="all"): @@ -139,65 +122,51 @@ def check_time(): X, Y, column_types, sample_shape, seed, n_folds ) - computed_metafeatures = {name: {self.VALUE_KEY: self.TIMEOUT, self.COMPUTE_TIME_KEY: 0} - for name in metafeature_ids} + computed_metafeatures = { + name: self._format_resource(consts.TIMEOUT, 0) + for name in metafeature_ids + } try: for metafeature_id in metafeature_ids: self._check_timeout() if verbose: print(metafeature_id) if self._resource_is_target_dependent(metafeature_id) and ( - Y is None or column_types[Y.name] == self.NUMERIC + Y is None or column_types[Y.name] == consts.NUMERIC ): if Y is None: - value = self.NO_TARGETS + value = consts.NO_TARGETS else: - value = self.NUMERIC_TARGETS + value = consts.NUMERIC_TARGETS compute_time = None else: value, compute_time = self._get_resource(metafeature_id) - computed_metafeatures[metafeature_id] = { - self.VALUE_KEY: value, - self.COMPUTE_TIME_KEY: compute_time - } + computed_metafeatures[metafeature_id] = self._format_resource(value, compute_time) except TimeoutError: pass return computed_metafeatures + + def _format_resource(self, value, compute_time): + """Formats the resource data as a dict""" + return { + consts.VALUE_KEY: value, + consts.COMPUTE_TIME_KEY: compute_time + } def _init_resources( self, X, Y, column_types, sample_shape, seed, n_folds ): + # Add the base resources to our resources hash self._resources = { - "X_raw": { - self.VALUE_KEY: X, - self.COMPUTE_TIME_KEY: 0. - }, - "X": { - self.VALUE_KEY: X.dropna(axis=1, how="all"), - self.COMPUTE_TIME_KEY: 0. - }, - "Y": { - self.VALUE_KEY: Y, - self.COMPUTE_TIME_KEY: 0. - }, - "column_types": { - self.VALUE_KEY: column_types, - self.COMPUTE_TIME_KEY: 0. - }, - "sample_shape": { - self.VALUE_KEY: sample_shape, - self.COMPUTE_TIME_KEY: 0. - }, - "seed_base": { - self.VALUE_KEY: seed, - self.COMPUTE_TIME_KEY: 0. - }, - "n_folds": { - self.VALUE_KEY: n_folds, - self.COMPUTE_TIME_KEY: 0. - } + "X_raw": self._format_resource(X, 0.), + "X": self._format_resource(X.dropna(axis=1, how="all"), 0.), + "Y": self._format_resource(Y, 0.), + "column_types": self._format_resource(column_types, 0.), + "sample_shape": self._format_resource(sample_shape, 0.), + "seed_base": self._format_resource(seed, 0.), + "n_folds": self._format_resource(n_folds, 0.) } @classmethod @@ -207,19 +176,14 @@ def _resource_is_target_dependent(cls, resource_id): elif resource_id=='XSample': return False else: - resource_info = cls._resources_info[resource_id] - function = resource_info["function"] - args = resource_info["arguments"] - for parameter, argument in args.items(): + resource_computer = cls._resources_info.get(resource_id) + for argument in resource_computer.argmap.values(): if (argument in cls._resources_info and cls._resource_is_target_dependent(argument) ): return True return False - def _get_cv_seed(self, seed_base, seed_offset): - return (seed_base + seed_offset,) - def _validate_compute_arguments( self, X, Y, column_types, metafeature_ids, exclude, sample_shape, seed, n_folds, verbose @@ -267,13 +231,13 @@ def _validate_column_types( f"Column type not specified for column {col}" ) col_type = column_types[col] - # todo: add self.TEXT to check. Additionally add self.TEXT to all tests that check for column types - if not col_type in [self.NUMERIC, self.CATEGORICAL, self.TEXT]: + # todo: add consts.TEXT to check. Additionally add consts.TEXT to all tests that check for column types + if not col_type in [consts.NUMERIC, consts.CATEGORICAL, consts.TEXT]: invalid_column_types[col] = col_type if len(invalid_column_types) > 0: raise ValueError( f"Invalid column types: {invalid_column_types}. Valid types " + - f"include {self.NUMERIC} and {self.CATEGORICAL} and {self.TEXT}." + f"include {consts.NUMERIC} and {consts.CATEGORICAL} and {consts.TEXT}." ) def _validate_metafeature_ids( @@ -331,7 +295,7 @@ def _validate_n_folds( raise ValueError(f"`n_folds` must be >= 2, but was {n_folds}") if (Y is not None and column_types is not None and - column_types[Y.name] != self.NUMERIC and + column_types[Y.name] != consts.NUMERIC and metafeature_ids is not None): # when computing landmarking metafeatures, there must be at least # n_folds instances of each class of Y @@ -360,16 +324,16 @@ def _infer_column_types(self, X, Y): column_types = {} for col_name in X.columns: if dtype_is_numeric(X[col_name].dtype): - column_types[col_name] = self.NUMERIC + column_types[col_name] = consts.NUMERIC else: - column_types[col_name] = self.CATEGORICAL + column_types[col_name] = consts.CATEGORICAL if not Y is None: if dtype_is_numeric(Y.dtype): - column_types[Y.name] = self.NUMERIC + column_types[Y.name] = consts.NUMERIC else: # todo: get number of unique values in col_name, compute unique/total ratio. Use ratio to infer type - column_types[Y.name] = self.CATEGORICAL + column_types[Y.name] = consts.CATEGORICAL return column_types def _get_metafeature_ids(self, exclude): @@ -381,34 +345,23 @@ def _get_metafeature_ids(self, exclude): def _get_resource(self, resource_id): self._check_timeout() if not resource_id in self._resources: - resource_info = self._resources_info[resource_id] - f_name = resource_info["function"] - f = self._get_function(f_name) + resource_computer = self._resources_info.get(resource_id) args, total_time = self._get_arguments(resource_id) - return_resources = resource_info["returns"] + return_resources = resource_computer.returns start_timestamp = time.perf_counter() - computed_resources = f(**args) + computed_resources = resource_computer(**args) compute_time = time.perf_counter() - start_timestamp total_time += compute_time for res_id, computed_resource in zip( return_resources, computed_resources ): - self._resources[res_id] = { - self.VALUE_KEY: computed_resource, - self.COMPUTE_TIME_KEY: total_time - } + self._resources[res_id] = self._format_resource(computed_resource, total_time) resource = self._resources[resource_id] - return resource[self.VALUE_KEY], resource[self.COMPUTE_TIME_KEY] - - def _get_function(self, f_name): - if f_name.startswith("self."): - return getattr(self, f_name[len("self."):]) - else: - return globals()[f_name] + return resource[consts.VALUE_KEY], resource[consts.COMPUTE_TIME_KEY] def _get_arguments(self, resource_id): - resource_info = self._resources_info[resource_id] - args = resource_info["arguments"] + resource_computer = self._resources_info.get(resource_id) + args = resource_computer.argmap resolved_parameters = {} total_time = 0.0 for parameter, argument in args.items(): @@ -424,153 +377,7 @@ def _get_arguments(self, resource_id): elif dtype_is_numeric(argument_type): compute_time = 0 else: - raise Exception("unhandled argument type") + raise Exception(f"unhandled argument type '{argument_type}'") resolved_parameters[parameter] = argument total_time += compute_time return (resolved_parameters, total_time) - - def _get_preprocessed_data(self, X_sample, X_sampled_columns, column_types, seed): - series_array = [] - for feature in X_sample.columns: - is_text = False - feature_series = X_sample[feature].copy() - col = feature_series.values - dropped_nan_series = X_sampled_columns[feature].dropna( - axis=0,how='any' - ) - num_nan = np.sum(feature_series.isnull()) - np.random.seed(seed) - col[feature_series.isnull()] = np.random.choice( - dropped_nan_series, size=num_nan - ) - if column_types[feature_series.name] == self.CATEGORICAL: - feature_series = pd.get_dummies(feature_series) - elif column_types[feature_series.name] == self.TEXT: - is_text = True - if not is_text: - series_array.append(feature_series) - return (pd.concat(series_array, axis=1, copy=False),) - - def _sample_columns(self, X, sample_shape, seed): - if sample_shape[1] is None or X.shape[1] <= sample_shape[1]: - X_sample = X - else: - np.random.seed(seed) - sampled_column_indices = np.random.choice( - X.shape[1], size=sample_shape[1], replace=False - ) - sampled_columns = X.columns[sampled_column_indices] - X_sample = X[sampled_columns] - return (X_sample,) - - def _sample_rows(self, X, Y, sample_shape, seed): - """ - Stratified uniform sampling of rows, according to the classes in Y. - Ensures there are enough samples from each class in Y for cross - validation. - """ - if sample_shape[0] is None or X.shape[0] <= sample_shape[0]: - X_sample, Y_sample = X, Y - elif Y is None: - np.random.seed(seed) - row_indices = np.random.choice( - X.shape[0], size=sample_shape[0], replace=False - ) - X_sample, Y_sample = X.iloc[row_indices], Y - else: - drop_size = X.shape[0] - sample_shape[0] - sample_size = sample_shape[0] - sss = StratifiedShuffleSplit( - n_splits=2, test_size=drop_size, train_size=sample_size, random_state=seed - ) - row_indices, _ = next(sss.split(X, Y)) - X_sample, Y_sample = X.iloc[row_indices], Y.iloc[row_indices] - return (X_sample, Y_sample) - - def _get_categorical_features_with_no_missing_values( - self, X_sample, column_types - ): - categorical_features_with_no_missing_values = [] - for feature in X_sample.columns: - if column_types[feature] == self.CATEGORICAL: - no_nan_series = X_sample[feature].dropna( - axis=0, how='any' - ) - categorical_features_with_no_missing_values.append( - no_nan_series - ) - return (categorical_features_with_no_missing_values,) - - def _get_categorical_features_and_class_with_no_missing_values( - self, X_sample, Y_sample, column_types - ): - categorical_features_and_class_with_no_missing_values = [] - for feature in X_sample.columns: - if column_types[feature] == self.CATEGORICAL: - df = pd.concat([X_sample[feature],Y_sample], axis=1).dropna( - axis=0, how='any' - ) - categorical_features_and_class_with_no_missing_values.append( - (df[feature],df[Y_sample.name]) - ) - return (categorical_features_and_class_with_no_missing_values,) - - def _get_numeric_features_with_no_missing_values( - self, X_sample, column_types - ): - numeric_features_with_no_missing_values = [] - for feature in X_sample.columns: - if column_types[feature] == self.NUMERIC: - no_nan_series = X_sample[feature].dropna( - axis=0, how='any' - ) - numeric_features_with_no_missing_values.append( - no_nan_series - ) - return (numeric_features_with_no_missing_values,) - - def _get_text_features_with_no_missing_values( - self, X_sample, column_types - ): - text_features_with_no_missing_values = [] - for feature in X_sample.columns: - if column_types[feature] == self.TEXT: - no_nan_series = X_sample[feature].dropna( - axis=0, how='any' - ) - text_features_with_no_missing_values.append( - no_nan_series - ) - return (text_features_with_no_missing_values,) - - def _get_binned_numeric_features_with_no_missing_values( - self, numeric_features_array - ): - binned_feature_array = [ - ( - pd.cut(feature, - round(feature.shape[0]**(1./3.))) - ) for feature in numeric_features_array - ] - return (binned_feature_array,) - - def _get_binned_numeric_features_and_class_with_no_missing_values( - self, X_sample, Y_sample, column_types - ): - numeric_features_and_class_with_no_missing_values = [] - for feature in X_sample.columns: - if column_types[feature] == self.NUMERIC: - # renaming avoids name collisions and problems when y does not have a name - df = pd.concat([X_sample[feature].rename('x'), Y_sample.rename('y')], axis=1) - df.dropna(axis=0, how='any', inplace=True) - numeric_features_and_class_with_no_missing_values.append( - (df['x'],df['y']) - ) - binned_feature_class_array = [ - ( - pd.cut(feature_class_pair[0], - round(feature_class_pair[0].shape[0]**(1./3.))), - feature_class_pair[1] - ) for feature_class_pair in numeric_features_and_class_with_no_missing_values - ] - return (binned_feature_class_array,) diff --git a/metalearn/metafeatures/resources.py b/metalearn/metafeatures/resources.py index 3553ca6..506d967 100644 --- a/metalearn/metafeatures/resources.py +++ b/metalearn/metafeatures/resources.py @@ -1,10 +1,71 @@ -""" -Exposes all the package's bundled data files as file path strings. -Needed to allow the data files to be successfully accessed across platforms -and install types. -""" - +from typing import List import pkg_resources -METAFEATURE_CONFIG = pkg_resources.resource_filename('metalearn', 'metafeatures/metafeatures.json') -METAFEATURES_JSON_SCHEMA = pkg_resources.resource_filename('metalearn', 'metafeatures/metafeatures_schema.json') \ No newline at end of file +from metalearn.metafeatures.base import ResourceComputerMap, MetafeatureComputer, ResourceComputer + +from metalearn.metafeatures.decision_tree_metafeatures import resource_computers as dt_resources +from metalearn.metafeatures.general_resource_computers import resource_computers as util_resources +from metalearn.metafeatures.text_metafeatures import resource_computers as text_resources + +from metalearn.metafeatures.simple_metafeatures import metafeature_computers as simple_metafeatures +from metalearn.metafeatures.statistical_metafeatures import metafeature_computers as statistical_metafeatures +from metalearn.metafeatures.information_theoretic_metafeatures import metafeature_computers as info_theoretic_metafeatures +from metalearn.metafeatures.landmarking_metafeatures import metafeature_computers as landmarking_metafeatures +from metalearn.metafeatures.text_metafeatures import metafeature_computers as text_metafeatures +from metalearn.metafeatures.decision_tree_metafeatures import metafeature_computers as dt_metafeatures + + +# Expose the `metafeatures_schema.json` file as a file path string. +# Needed to allow the file to be successfully accessed across platforms +# and install types. +METAFEATURES_JSON_SCHEMA = pkg_resources.resource_filename('metalearn', 'metafeatures/metafeatures_schema.json') + + +def _get_metafeature_ids(metafeature_computers: List[MetafeatureComputer]) -> List[str]: + """Returns a list of all metafeature IDs found in `metafeature_computers`""" + metafeature_ids = set() + for computer in metafeature_computers: + for name in computer.returns: + if name in metafeature_ids: + raise ValueError("there is already a MetafeatureComputer that returns the {name} metafeature.") + metafeature_ids.add(name) + return list(metafeature_ids) + + +resources_info = ResourceComputerMap() + +# Add all the ResourceComputers +resources_info.add(dt_resources) +resources_info.add(util_resources) +resources_info.add(text_resources) + +# Add noop resource computers for the base resources. +# Since they'll always be in the Metafeatures resource hash, +# they'll never be needed to be computed by a ResourceComputer, +# but they need to be in `resources_info` since `Metafeatures._get_arguments` +# and `Metafeatures._resource_is_target_dependent` requires them to be. +resources_info.add(ResourceComputer(lambda _: None, ["X_raw"])) +resources_info.add(ResourceComputer(lambda _: None, ["X"])) +resources_info.add(ResourceComputer(lambda _: None, ["Y"])) +resources_info.add(ResourceComputer(lambda _: None, ["column_types"])) +resources_info.add(ResourceComputer(lambda _: None, ["sample_shape"])) +resources_info.add(ResourceComputer(lambda _: None, ["seed_base"])) +resources_info.add(ResourceComputer(lambda _: None, ["n_folds"])) + +# Add all the MetafeatureComputers +resources_info.add(simple_metafeatures) +resources_info.add(statistical_metafeatures) +resources_info.add(info_theoretic_metafeatures) +resources_info.add(landmarking_metafeatures) +resources_info.add(text_metafeatures) +resources_info.add(dt_metafeatures) + +# Get all the metafeature ids +metafeature_ids = _get_metafeature_ids( + simple_metafeatures + + statistical_metafeatures + + info_theoretic_metafeatures + + landmarking_metafeatures + + text_metafeatures + + dt_metafeatures +) \ No newline at end of file diff --git a/metalearn/metafeatures/simple_metafeatures.py b/metalearn/metafeatures/simple_metafeatures.py index c262330..bc6b52f 100644 --- a/metalearn/metafeatures/simple_metafeatures.py +++ b/metalearn/metafeatures/simple_metafeatures.py @@ -1,7 +1,9 @@ import numpy as np from pandas import DataFrame -from .common_operations import * +from metalearn.metafeatures.common_operations import * +from metalearn.metafeatures.base import MetafeatureComputer +from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup def get_dataset_stats(X, column_types): number_of_instances = X.shape[0] @@ -12,10 +14,38 @@ def get_dataset_stats(X, column_types): ratio_of_categorical_features = categorical_features / number_of_features return (number_of_instances, number_of_features, numeric_features, categorical_features, ratio_of_numeric_features, ratio_of_categorical_features) +get_dataset_stats = MetafeatureComputer( + get_dataset_stats, + [ + "NumberOfInstances", + "NumberOfFeatures", + "NumberOfNumericFeatures", + "NumberOfCategoricalFeatures", + "RatioOfNumericFeatures", + "RatioOfCategoricalFeatures" + ], + ProblemType.ANY, + [MetafeatureGroup.SIMPLE], + { "X": "X_raw" } +) + + def get_dimensionality(number_of_features, number_of_instances): dimensionality = number_of_features / number_of_instances return (dimensionality,) +get_dimensionality = MetafeatureComputer( + get_dimensionality, + ["Dimensionality"], + ProblemType.ANY, + [MetafeatureGroup.SIMPLE], + { + "number_of_features": "NumberOfFeatures", + "number_of_instances": "NumberOfInstances" + } +) + + def get_missing_values(X): missing_values_by_instance = X.shape[1] - X.count(axis=1) missing_values_by_feature = X.shape[0] - X.count(axis=0) @@ -31,6 +61,22 @@ def get_missing_values(X): ratio_features_with_missing ) +get_missing_values = MetafeatureComputer( + get_missing_values, + [ + "NumberOfMissingValues", + "RatioOfMissingValues", + "NumberOfInstancesWithMissingValues", + "RatioOfInstancesWithMissingValues", + "NumberOfFeaturesWithMissingValues", + "RatioOfFeaturesWithMissingValues" + ], + ProblemType.ANY, + [MetafeatureGroup.SIMPLE], + { "X": "X_raw" } +) + + def get_class_stats(Y): classes = Y.unique() number_of_classes = classes.shape[0] @@ -40,10 +86,80 @@ def get_class_stats(Y): minority_class_size = min(counts) return (number_of_classes, *profile_distribution(probs), minority_class_size, majority_class_size) +get_class_stats = MetafeatureComputer( + computer=get_class_stats, + returns=[ + "NumberOfClasses", + "MeanClassProbability", + "StdevClassProbability", + "SkewClassProbability", + "KurtosisClassProbability", + "MinClassProbability", + "Quartile1ClassProbability", + "Quartile2ClassProbability", + "Quartile3ClassProbability", + "MaxClassProbability", + "MinorityClassSize", + "MajorityClassSize" + ], + problem_type=ProblemType.CLASSIFICATION, + groups=[MetafeatureGroup.SIMPLE] +) + + def get_categorical_cardinalities(X, column_types): cardinalities = [X[feature].unique().shape[0] for feature in get_categorical_features(X, column_types)] return profile_distribution(cardinalities) +get_categorical_cardinalities = MetafeatureComputer( + get_categorical_cardinalities, + [ + "MeanCardinalityOfCategoricalFeatures", + "StdevCardinalityOfCategoricalFeatures", + "SkewCardinalityOfCategoricalFeatures", + "KurtosisCardinalityOfCategoricalFeatures", + "MinCardinalityOfCategoricalFeatures", + "Quartile1CardinalityOfCategoricalFeatures", + "Quartile2CardinalityOfCategoricalFeatures", + "Quartile3CardinalityOfCategoricalFeatures", + "MaxCardinalityOfCategoricalFeatures" + ], + ProblemType.ANY, + [MetafeatureGroup.SIMPLE], +) + + def get_numeric_cardinalities(X, column_types): cardinalities = [X[feature].unique().shape[0] for feature in get_numeric_features(X, column_types)] return profile_distribution(cardinalities) + +get_numeric_cardinalities = MetafeatureComputer( + get_numeric_cardinalities, + [ + "MeanCardinalityOfNumericFeatures", + "StdevCardinalityOfNumericFeatures", + "SkewCardinalityOfNumericFeatures", + "KurtosisCardinalityOfNumericFeatures", + "MinCardinalityOfNumericFeatures", + "Quartile1CardinalityOfNumericFeatures", + "Quartile2CardinalityOfNumericFeatures", + "Quartile3CardinalityOfNumericFeatures", + "MaxCardinalityOfNumericFeatures" + ], + ProblemType.ANY, + [MetafeatureGroup.SIMPLE] +) + + +""" +A list of all MetafeatureComputer +instances in this module. +""" +metafeature_computers = [ + get_dataset_stats, + get_class_stats, + get_dimensionality, + get_missing_values, + get_categorical_cardinalities, + get_numeric_cardinalities +] \ No newline at end of file diff --git a/metalearn/metafeatures/statistical_metafeatures.py b/metalearn/metafeatures/statistical_metafeatures.py index f092e37..3fbfd2f 100644 --- a/metalearn/metafeatures/statistical_metafeatures.py +++ b/metalearn/metafeatures/statistical_metafeatures.py @@ -8,24 +8,112 @@ from sklearn.decomposition import PCA from sklearn.cross_decomposition import CCA -from .common_operations import * +from metalearn.metafeatures.common_operations import * +from metalearn.metafeatures.base import MetafeatureComputer +from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup +import metalearn.metafeatures.constants as consts + def get_numeric_means(numeric_features_array): means = [feature.mean() for feature in numeric_features_array] return profile_distribution(means) +get_numeric_means = MetafeatureComputer( + get_numeric_means, + [ + "MeanMeansOfNumericFeatures", + "StdevMeansOfNumericFeatures", + "SkewMeansOfNumericFeatures", + "KurtosisMeansOfNumericFeatures", + "MinMeansOfNumericFeatures", + "Quartile1MeansOfNumericFeatures", + "Quartile2MeansOfNumericFeatures", + "Quartile3MeansOfNumericFeatures", + "MaxMeansOfNumericFeatures" + ], + ProblemType.ANY, + [MetafeatureGroup.STATISTICAL], + { + "numeric_features_array": "NoNaNNumericFeatures" + } +) + + def get_numeric_stdev(numeric_features_array): stdevs = [feature.std() for feature in numeric_features_array] return profile_distribution(stdevs) +get_numeric_stdev = MetafeatureComputer( + get_numeric_stdev, + [ + "MeanStdDevOfNumericFeatures", + "StdevStdDevOfNumericFeatures", + "SkewStdDevOfNumericFeatures", + "KurtosisStdDevOfNumericFeatures", + "MinStdDevOfNumericFeatures", + "Quartile1StdDevOfNumericFeatures", + "Quartile2StdDevOfNumericFeatures", + "Quartile3StdDevOfNumericFeatures", + "MaxStdDevOfNumericFeatures" + ], + ProblemType.ANY, + [MetafeatureGroup.STATISTICAL], + { + "numeric_features_array": "NoNaNNumericFeatures" + } +) + + def get_numeric_skewness(numeric_features_array): skews = [feature.skew() for feature in numeric_features_array] return profile_distribution(skews) +get_numeric_skewness = MetafeatureComputer( + get_numeric_skewness, + [ + "MeanSkewnessOfNumericFeatures", + "StdevSkewnessOfNumericFeatures", + "SkewSkewnessOfNumericFeatures", + "KurtosisSkewnessOfNumericFeatures", + "MinSkewnessOfNumericFeatures", + "Quartile1SkewnessOfNumericFeatures", + "Quartile2SkewnessOfNumericFeatures", + "Quartile3SkewnessOfNumericFeatures", + "MaxSkewnessOfNumericFeatures" + ], + ProblemType.ANY, + [MetafeatureGroup.STATISTICAL], + { + "numeric_features_array": "NoNaNNumericFeatures" + } +) + + def get_numeric_kurtosis(numeric_features_array): kurtoses = [feature.kurtosis() for feature in numeric_features_array] return profile_distribution(kurtoses) +get_numeric_kurtosis = MetafeatureComputer( + get_numeric_kurtosis, + [ + "MeanKurtosisOfNumericFeatures", + "StdevKurtosisOfNumericFeatures", + "SkewKurtosisOfNumericFeatures", + "KurtosisKurtosisOfNumericFeatures", + "MinKurtosisOfNumericFeatures", + "Quartile1KurtosisOfNumericFeatures", + "Quartile2KurtosisOfNumericFeatures", + "Quartile3KurtosisOfNumericFeatures", + "MaxKurtosisOfNumericFeatures" + ], + ProblemType.ANY, + [MetafeatureGroup.STATISTICAL], + { + "numeric_features_array": "NoNaNNumericFeatures" + } +) + + def get_pca(X_preprocessed): num_components = min(3, X_preprocessed.shape[1]) pca_data = PCA(n_components=num_components) @@ -41,6 +129,25 @@ def get_pca(X_preprocessed): eigenvalues[i] = pred_eigen[i] return (variance_percentages[0], variance_percentages[1], variance_percentages[2], eigenvalues[0], eigenvalues[1], eigenvalues[2], pred_det) +get_pca = MetafeatureComputer( + get_pca, + [ + "PredPCA1", + "PredPCA2", + "PredPCA3", + "PredEigen1", + "PredEigen2", + "PredEigen3", + "PredDet" + ], + ProblemType.ANY, + [MetafeatureGroup.STATISTICAL], + { + "X_preprocessed": "XPreprocessed" + } +) + + def get_correlations(X_sample, column_types): correlations = get_canonical_correlations(X_sample, column_types) profile_distribution(correlations) @@ -65,7 +172,7 @@ def get_canonical_correlations(dataframe, column_types): ''' def preprocess(series): - if column_types[series.name] == 'CATEGORICAL': + if column_types[series.name] == consts.CATEGORICAL: series = pd.get_dummies(series) array = series.values.reshape(series.shape[0], -1) return array @@ -105,3 +212,16 @@ def preprocess(series): correlations.append(c) return correlations + + +""" +A list of all MetafeatureComputer +instances in this module. +""" +metafeature_computers = [ + get_numeric_means, + get_numeric_stdev, + get_numeric_skewness, + get_numeric_kurtosis, + get_pca +] \ No newline at end of file diff --git a/metalearn/metafeatures/text_metafeatures.py b/metalearn/metafeatures/text_metafeatures.py index 42ae718..694e843 100644 --- a/metalearn/metafeatures/text_metafeatures.py +++ b/metalearn/metafeatures/text_metafeatures.py @@ -4,28 +4,120 @@ import numpy as np import pandas as pd -from .common_operations import * +from metalearn.metafeatures.common_operations import * +from metalearn.metafeatures.base import ResourceComputer, MetafeatureComputer +from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup def get_string_lengths_array_from_text_features(text_features_array): lengths = [feature.apply(len) for feature in text_features_array] return (lengths,) +get_string_lengths_array_from_text_features = ResourceComputer( + get_string_lengths_array_from_text_features, + ["ArrayOfStringLengthsOfTextFeatures"], + { "text_features_array": "NoNaNTextFeatures" } +) + def get_string_length_means(string_lengths_array): means = [feature.mean() for feature in string_lengths_array] return profile_distribution(means) +get_string_length_means = MetafeatureComputer( + get_string_length_means, + [ + "MeanMeansOfStringLengthOfTextFeatures", + "StdevMeansOfStringLengthOfTextFeatures", + "SkewMeansOfStringLengthOfTextFeatures", + "KurtosisMeansOfStringLengthOfTextFeatures", + "MinMeansOfStringLengthOfTextFeatures", + "Quartile1MeansOfStringLengthOfTextFeatures", + "Quartile2MeansOfStringLengthOfTextFeatures", + "Quartile3MeansOfStringLengthOfTextFeatures", + "MaxMeansOfStringLengthOfTextFeatures" + ], + ProblemType.ANY, + [MetafeatureGroup.TEXT], + { + "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" + } +) + + def get_string_length_stdev(string_lengths_array): stdevs = [feature.std() for feature in string_lengths_array] return profile_distribution(stdevs) +get_string_length_stdev = MetafeatureComputer( + get_string_length_stdev, + [ + "MeanStdDevOfStringLengthOfTextFeatures", + "StdevStdDevOfStringLengthOfTextFeatures", + "SkewStdDevOfStringLengthOfTextFeatures", + "KurtosisStdDevOfStringLengthOfTextFeatures", + "MinStdDevOfStringLengthOfTextFeatures", + "Quartile1StdDevOfStringLengthOfTextFeatures", + "Quartile2StdDevOfStringLengthOfTextFeatures", + "Quartile3StdDevOfStringLengthOfTextFeatures", + "MaxStdDevOfStringLengthOfTextFeatures" + ], + ProblemType.ANY, + [MetafeatureGroup.TEXT], + { + "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" + } +) + + def get_string_length_skewness(string_lengths_array): skews = [feature.skew() for feature in string_lengths_array] return profile_distribution(skews) +get_string_length_skewness = MetafeatureComputer( + get_string_length_skewness, + [ + "MeanSkewnessOfStringLengthOfTextFeatures", + "StdevSkewnessOfStringLengthOfTextFeatures", + "SkewSkewnessOfStringLengthOfTextFeatures", + "KurtosisSkewnessOfStringLengthOfTextFeatures", + "MinSkewnessOfStringLengthOfTextFeatures", + "Quartile1SkewnessOfStringLengthOfTextFeatures", + "Quartile2SkewnessOfStringLengthOfTextFeatures", + "Quartile3SkewnessOfStringLengthOfTextFeatures", + "MaxSkewnessOfStringLengthOfTextFeatures" + ], + ProblemType.ANY, + [MetafeatureGroup.TEXT], + { + "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" + } +) + + def get_string_length_kurtosis(string_lengths_array): kurtoses = [feature.kurtosis() for feature in string_lengths_array] return profile_distribution(kurtoses) +get_string_length_kurtosis = MetafeatureComputer( + get_string_length_kurtosis, + [ + "MeanKurtosisOfStringLengthOfTextFeatures", + "StdevKurtosisOfStringLengthOfTextFeatures", + "SkewKurtosisOfStringLengthOfTextFeatures", + "KurtosisKurtosisOfStringLengthOfTextFeatures", + "MinKurtosisOfStringLengthOfTextFeatures", + "Quartile1KurtosisOfStringLengthOfTextFeatures", + "Quartile2KurtosisOfStringLengthOfTextFeatures", + "Quartile3KurtosisOfStringLengthOfTextFeatures", + "MaxKurtosisOfStringLengthOfTextFeatures" + ], + ProblemType.ANY, + [MetafeatureGroup.TEXT], + { + "string_lengths_array": "ArrayOfStringLengthsOfTextFeatures" + } +) + + def get_mfs_for_tokens_split_by_space(text_features_array, most_common_limit=10): def isnumeric(token): @@ -116,3 +208,40 @@ def filter_and_aggregate(tokens_series, f): # ) # if len(most_common_tokens) == most_common_limit: # break + +get_mfs_for_tokens_split_by_space = MetafeatureComputer( + get_mfs_for_tokens_split_by_space, + [ + "NumberOfTokens", + "NumberOfDistinctTokens", + "NumberOfTokensContainingNumericChar", + "RatioOfDistinctTokens", + "RatioOfTokensContainingNumericChar" + ], + ProblemType.ANY, + [MetafeatureGroup.TEXT], + { + "text_features_array": "NoNaNTextFeatures" + } +) + + +""" +A list of all ResourceComputer +instances in this module. +""" +resource_computers = [ + get_string_lengths_array_from_text_features +] + +""" +A list of all MetafeatureComputer +instances in this module. +""" +metafeature_computers = [ + get_string_length_means, + get_string_length_stdev, + get_string_length_skewness, + get_string_length_kurtosis, + get_mfs_for_tokens_split_by_space +] \ No newline at end of file diff --git a/tests/benchmark_metafeatures.py b/tests/benchmark_metafeatures.py index a4867b6..ad5c157 100644 --- a/tests/benchmark_metafeatures.py +++ b/tests/benchmark_metafeatures.py @@ -4,6 +4,7 @@ import numpy as np from metalearn import Metafeatures +import metalearn.metafeatures.constants as consts from tests.data.dataset import read_dataset from tests.config import CORRECTNESS_SEED, METADATA_PATH @@ -49,7 +50,7 @@ def run_metafeature_benchmark(benchmark_name, iters=100): total_compute_times.append(compute_timestamp - init_timestamp) for mf_id, result in computed_mfs.items(): metafeature_compute_times[mf_id].append( - result[Metafeatures.COMPUTE_TIME_KEY] + result[consts.COMPUTE_TIME_KEY] ) benchmark_data[dataset_metadata["filename"]] = { "init_time": { diff --git a/tests/compare_with_openml.py b/tests/compare_with_openml.py index 4888358..9ce135b 100644 --- a/tests/compare_with_openml.py +++ b/tests/compare_with_openml.py @@ -8,6 +8,7 @@ from tests.data.dataset import _read_arff_dataset from metalearn import Metafeatures +import metalearn.metafeatures.constants as consts from tests.config import OPENML_COMPARE_RESULTS_DIR @@ -115,7 +116,7 @@ def _compare_metafeatures(oml_dataset, tol, verbose): inconsistent_mfs = {} for our_mf_id, our_mf_result in our_mfs.items(): - our_mf_value = our_mf_result[Metafeatures.VALUE_KEY] + our_mf_value = our_mf_result[consts.VALUE_KEY] if our_mf_id in mf_id_map: oml_mf_id = mf_id_map[our_mf_id]["openmlName"] if oml_mf_id in oml_mfs: diff --git a/tests/test_metafeatures.py b/tests/test_metafeatures.py index f3e637a..0551442 100644 --- a/tests/test_metafeatures.py +++ b/tests/test_metafeatures.py @@ -12,7 +12,9 @@ import pandas as pd import numpy as np -from metalearn import Metafeatures, METAFEATURE_CONFIG, METAFEATURES_JSON_SCHEMA +from metalearn import Metafeatures, METAFEATURES_JSON_SCHEMA +import metalearn.metafeatures.constants as consts +from metalearn.metafeatures.resources import metafeature_ids from tests.config import CORRECTNESS_SEED, METADATA_PATH from tests.data.dataset import read_dataset from tests.data.compute_dataset_metafeatures import get_dataset_metafeatures_path @@ -71,12 +73,12 @@ def _check_correctness(self, computed_mfs, known_mfs, filename): fail_message = "Not all metafeatures matched previous results." for mf_id, result in computed_mfs.items(): - computed_value = result[Metafeatures.VALUE_KEY] + computed_value = result[consts.VALUE_KEY] if not any(isinstance(computed_value, type_) for type_ in [str, float, int]): self.fail( 'computed {} has invalid value {} with type {}'.format(mf_id, computed_value, type(computed_value)) ) - known_value = known_mfs[mf_id][Metafeatures.VALUE_KEY] + known_value = known_mfs[mf_id][consts.VALUE_KEY] correct = True if known_value is None: correct = False @@ -116,9 +118,7 @@ def _check_compare_metafeature_lists(self, computed_mfs, known_mfs, filename): test_failures = {} fail_message = "Metafeature lists do not match." - with open(METAFEATURE_CONFIG) as f: - master_mf_ids = json.load(f)["metafeatures"].keys() - master_mf_ids_set = set(master_mf_ids) + master_mf_ids_set = set(metafeature_ids) known_mf_ids_set = set({ x for x in known_mfs.keys() if "_Time" not in x @@ -218,8 +218,8 @@ def test_no_targets(self): ) for mf_name in target_dependent_metafeatures: known_mfs[mf_name] = { - Metafeatures.VALUE_KEY: Metafeatures.NO_TARGETS, - Metafeatures.COMPUTE_TIME_KEY: 0. + consts.VALUE_KEY: consts.NO_TARGETS, + consts.COMPUTE_TIME_KEY: 0. } required_checks = [ @@ -240,7 +240,7 @@ def test_numeric_targets(self): for dataset_filename, dataset in self.datasets.items(): metafeatures = Metafeatures() column_types = dataset["column_types"].copy() - column_types[dataset["Y"].name] = metafeatures.NUMERIC + column_types[dataset["Y"].name] = consts.NUMERIC computed_mfs = metafeatures.compute( X=dataset["X"], Y=pd.Series(np.random.rand(dataset["Y"].shape[0]), name=dataset["Y"].name), seed=CORRECTNESS_SEED, @@ -252,8 +252,8 @@ def test_numeric_targets(self): ) for mf_name in target_dependent_metafeatures: known_mfs[mf_name] = { - Metafeatures.VALUE_KEY: Metafeatures.NUMERIC_TARGETS, - Metafeatures.COMPUTE_TIME_KEY: 0. + consts.VALUE_KEY: consts.NUMERIC_TARGETS, + consts.COMPUTE_TIME_KEY: 0. } required_checks = [ @@ -417,7 +417,7 @@ def test_soft_timeout(self): f"Compute metafeatures exceeded timeout on '{dataset_filename}'" ) computed_mfs_timeout = {k: v for k, v in computed_mfs.items() - if v[Metafeatures.VALUE_KEY] != Metafeatures.TIMEOUT} + if v[consts.VALUE_KEY] != consts.TIMEOUT} known_mfs = dataset["known_metafeatures"] required_checks = [ (self._check_correctness, @@ -554,9 +554,9 @@ def test_request_and_exclude_metafeatures(self): self.assertEqual(str(cm.exception), expected_exception_string) def test_column_type_input(self): - column_types = {col: "NUMERIC" for col in self.dummy_features.columns} - column_types[self.dummy_features.columns[2]] = "CATEGORICAL" - column_types[self.dummy_target.name] = "CATEGORICAL" + column_types = {col: consts.NUMERIC for col in self.dummy_features.columns} + column_types[self.dummy_features.columns[2]] = consts.CATEGORICAL + column_types[self.dummy_target.name] = consts.CATEGORICAL # all valid try: Metafeatures().compute( From aeff076d0756d8ca98f9aa6f17808289107ee910 Mon Sep 17 00:00:00 2001 From: Evan Peterson Date: Wed, 7 Aug 2019 14:07:45 -0600 Subject: [PATCH 04/20] Add beginnings of a contributing guide --- CONTRIBUTING.md | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..2c0f17e --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,49 @@ +# Contributing to `metalearn` + +## Adding New Metafeatures + +`metalearn` uses a caching mechanism to cache expensive computations that may need to be used again within the package by another function. Both resources (e.g. the dataset itself or a preprocessed version of it) and metafeatures (e.g. entropy, number of features) are cached by the system. + +When adding a new metafeature to the package, the function that computes that metafeature needs to be registered in the `resources_info` variable in [./metalearn/metafeatures/resources.py](./metalearn/metafeatures/resources.py). Before the function can be registered though, it needs to be decorated with metadata by being passed through the `MetafeatureComputer` constructor (see example below). This allows the metafeatures returned by the function to be used intelligently by the package. + +Follow the example below to know how to write and register new metafeature(s). Note that a metafeature-computing function (e.g. `get_dataset_stats` as seen below) can compute and return more than one meta-feature. + +```python +# Import needed utilities +from metalearn.metafeatures.base import MetafeatureComputer +from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup + +# Declare the function that computes the metafeatures. +def get_dataset_stats(X, column_types): + # Calculate metafeatures. + number_of_instances = X.shape[0] + number_of_features = X.shape[1] + # Return a tuple (here it's two metafeatures). + return (number_of_instances, number_of_features) + +# Decorate the metafeature-computing function with data +# the package will use. +get_dataset_stats = MetafeatureComputer( + # Pass the function into the `MetafeatureComputer` + # decorator. + computer=get_dataset_stats, + # Give each metafeature returned by the function a + # name for the cache to use (order here must match the + # order they are returned in by `computer`). + returns=[ + "NumberOfInstances", + "NumberOfFeatures" + ], + # Associate a problem type with the new metafeatures. + problem_type=ProblemType.ANY, + # Associate one or more metafeature groups. + groups=[MetafeatureGroup.SIMPLE], + # Specify which values to pass to the function + # when calling it to compute the metafeatures. + # Here we are passing the cached resource called + # "X_raw" as the value for this function's "X" argument. + argmap={ "X": "X_raw" } +) +``` + +By convention, all the decorated metafeature-computing functions in a module are aggregated at the bottom of the module into a list called `metafeature_computers`, which is then imported by [./metalearn/metafeatures/resources.py](./metalearn/metafeatures/resources.py) and added to that module's `resources_info` variable. \ No newline at end of file From 044a863876aed96e4b4f4e5708be876fd82eefeb Mon Sep 17 00:00:00 2001 From: Evan Peterson Date: Mon, 12 Aug 2019 11:27:47 -0600 Subject: [PATCH 05/20] Remove error-prone default dictionary --- metalearn/metafeatures/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metalearn/metafeatures/base.py b/metalearn/metafeatures/base.py index 4a7019c..ef1dbd6 100644 --- a/metalearn/metafeatures/base.py +++ b/metalearn/metafeatures/base.py @@ -11,7 +11,7 @@ def __init__( self, computer: Callable, returns: List[str], - argmap: Optional[Dict[str,Any]] = {} + argmap: Optional[Dict[str,Any]] = None ) -> None: """ Decorates ``computer``, a resource computing function From 9ffebcce24379df9448af632cf90e456db6f889f Mon Sep 17 00:00:00 2001 From: Evan Peterson Date: Mon, 12 Aug 2019 11:28:12 -0600 Subject: [PATCH 06/20] Remove unused import --- metalearn/metafeatures/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/metalearn/metafeatures/base.py b/metalearn/metafeatures/base.py index ef1dbd6..1aaed7a 100644 --- a/metalearn/metafeatures/base.py +++ b/metalearn/metafeatures/base.py @@ -1,5 +1,4 @@ import inspect -from abc import ABC, abstractmethod from typing import List, Callable, Dict, Union, Optional, Any import itertools From e2dfb480eb90ef3adf1d83d933c76eda128177dc Mon Sep 17 00:00:00 2001 From: Evan Peterson Date: Mon, 12 Aug 2019 11:28:46 -0600 Subject: [PATCH 07/20] Simplify ResourceComputer constructor --- metalearn/metafeatures/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/metalearn/metafeatures/base.py b/metalearn/metafeatures/base.py index 1aaed7a..cf07c84 100644 --- a/metalearn/metafeatures/base.py +++ b/metalearn/metafeatures/base.py @@ -55,12 +55,12 @@ def __init__( # ``computer``. self.argmap[local_name] = local_name - for local_name, resource_name in argmap.items(): + if argmap is not None: # Now include any argument name or value overrides - # the developer has provided. Note: `resource_name` + # the developer has provided. Note: each value in `argmap` # may be a global resource name (e.g. `"XSample"`) or # a direct value for the argument (e.g. `5`) - self.argmap[local_name] = resource_name + self.argmap.update(argmap) def __call__(self, *args, **kwargs): """ From 507e568e031471fb496e64cbec1934ad63c0dca4 Mon Sep 17 00:00:00 2001 From: Evan Peterson Date: Mon, 12 Aug 2019 11:29:55 -0600 Subject: [PATCH 08/20] Make ResourceComputerMap be a dictionary --- metalearn/metafeatures/base.py | 41 +++++++++++++++----------- metalearn/metafeatures/metafeatures.py | 6 ++-- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/metalearn/metafeatures/base.py b/metalearn/metafeatures/base.py index cf07c84..f03972f 100644 --- a/metalearn/metafeatures/base.py +++ b/metalearn/metafeatures/base.py @@ -1,6 +1,7 @@ import inspect from typing import List, Callable, Dict, Union, Optional, Any import itertools +from collections.abc import MutableMapping from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup @@ -125,7 +126,7 @@ def __init__( self.problem_type = problem_type -class ResourceComputerMap: +class ResourceComputerMap(MutableMapping): def __init__(self, computers: Union[ResourceComputer,List[ResourceComputer],None] = None) -> None: """ Wraps a dictionary map of resource names to their computers. @@ -138,10 +139,6 @@ def __init__(self, computers: Union[ResourceComputer,List[ResourceComputer],None if computers is not None: self.add(computers) - def __contains__(self, key): - """Called to implement membership test operators. e.g. `key in my_resouce_map`.""" - return key in self._map - def add(self, computers: Union[ResourceComputer,List[ResourceComputer]]) -> None: """ Adds more resource name/resource computer key/value @@ -155,21 +152,31 @@ def add(self, computers: Union[ResourceComputer,List[ResourceComputer]]) -> None else: raise ValueError("computers must be ResourceComputer or List[ResourceComputer]") - def get(self, key: str = None) -> Union[Dict[str,ResourceComputer],ResourceComputer]: - """Used for getting the resource map.""" - if key is not None: - return self._map[key] - return self._map + def __getitem__(self, key: str = None) -> ResourceComputer: + """Used for getting a resource from the map.""" + return self._map[key] def _add_one(self, computer: ResourceComputer) -> None: if not isinstance(computer, ResourceComputer): raise ValueError(f"computer is not a ResourceComputer; it is a {type(computer)}") for resource_name in computer.returns: - if resource_name in self._map: - raise ValueError( - f"duplicate computer '{computer.name}' provided for resource '{resource_name}', " - f"which is already present in the resouce map, registered " - f"by computer '{self.get(resource_name).name}'" - ) - self._map[resource_name] = computer \ No newline at end of file + self.__setitem__(resource_name, computer) + + def __setitem__(self, resource_name: str, computer: ResourceComputer): + if resource_name in self._map: + raise ValueError( + f"duplicate computer '{computer.name}' provided for resource '{resource_name}', " + f"which is already present in the resouce map, registered " + f"by computer '{self._map[resource_name].name}'" + ) + self._map[resource_name] = computer + + def __iter__(self): + return iter(self._map) + + def __len__(self): + return len(self._map) + + def __delitem__(self, key: str): + raise TypeError("ResourceComputerMap does not support deletion of its ResourceComputers") \ No newline at end of file diff --git a/metalearn/metafeatures/metafeatures.py b/metalearn/metafeatures/metafeatures.py index b1a02d7..4e6aabe 100644 --- a/metalearn/metafeatures/metafeatures.py +++ b/metalearn/metafeatures/metafeatures.py @@ -176,7 +176,7 @@ def _resource_is_target_dependent(cls, resource_id): elif resource_id=='XSample': return False else: - resource_computer = cls._resources_info.get(resource_id) + resource_computer = cls._resources_info[resource_id] for argument in resource_computer.argmap.values(): if (argument in cls._resources_info and cls._resource_is_target_dependent(argument) @@ -345,7 +345,7 @@ def _get_metafeature_ids(self, exclude): def _get_resource(self, resource_id): self._check_timeout() if not resource_id in self._resources: - resource_computer = self._resources_info.get(resource_id) + resource_computer = self._resources_info[resource_id] args, total_time = self._get_arguments(resource_id) return_resources = resource_computer.returns start_timestamp = time.perf_counter() @@ -360,7 +360,7 @@ def _get_resource(self, resource_id): return resource[consts.VALUE_KEY], resource[consts.COMPUTE_TIME_KEY] def _get_arguments(self, resource_id): - resource_computer = self._resources_info.get(resource_id) + resource_computer = self._resources_info[resource_id] args = resource_computer.argmap resolved_parameters = {} total_time = 0.0 From 6049753155f368ec6ab6486d44557310d7d810cc Mon Sep 17 00:00:00 2001 From: Evan Peterson Date: Mon, 12 Aug 2019 11:40:39 -0600 Subject: [PATCH 09/20] Add clarification to readme --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2c0f17e..f05aa34 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,7 +4,7 @@ `metalearn` uses a caching mechanism to cache expensive computations that may need to be used again within the package by another function. Both resources (e.g. the dataset itself or a preprocessed version of it) and metafeatures (e.g. entropy, number of features) are cached by the system. -When adding a new metafeature to the package, the function that computes that metafeature needs to be registered in the `resources_info` variable in [./metalearn/metafeatures/resources.py](./metalearn/metafeatures/resources.py). Before the function can be registered though, it needs to be decorated with metadata by being passed through the `MetafeatureComputer` constructor (see example below). This allows the metafeatures returned by the function to be used intelligently by the package. +When adding a new metafeature to the package, the function that computes that metafeature needs to be registered in the `resources_info` variable in [./metalearn/metafeatures/resources.py](./metalearn/metafeatures/resources.py), and passed to the call made to `_get_metafeature_ids` in that module as well. Before the function can be registered and passed though, it needs to be decorated with metadata by being passed through the `MetafeatureComputer` constructor (see example below). This allows the metafeatures returned by the function to be used intelligently by the package. Follow the example below to know how to write and register new metafeature(s). Note that a metafeature-computing function (e.g. `get_dataset_stats` as seen below) can compute and return more than one meta-feature. From 37f426f069941a28cde3fa4b918ada44c6ccc236 Mon Sep 17 00:00:00 2001 From: Evan Peterson Date: Mon, 12 Aug 2019 11:50:55 -0600 Subject: [PATCH 10/20] Avoid inadvertent errors due to unsupported computer function signatures --- metalearn/metafeatures/base.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/metalearn/metafeatures/base.py b/metalearn/metafeatures/base.py index f03972f..78b8166 100644 --- a/metalearn/metafeatures/base.py +++ b/metalearn/metafeatures/base.py @@ -28,6 +28,20 @@ def __init__( A custom map of ``computer``'s argument names to the global resource names that will be passed as ``computer``'s arguments when ``computer`` is called. """ + + computer_args = inspect.getfullargspec(computer) + # TODO: If needed, add support for `computer` functions that + # use these types of arguments. + if ( + computer_args.varargs is not None or + computer_args.varkw is not None or + len(computer_args.kwonlyargs) > 0 + ): + raise ValueError(( + "ResourceComputer supports `computer` functions that " + "use positional arguments only in their function definition." + )) + self._computer = computer self.returns = returns @@ -62,6 +76,7 @@ def __init__( # may be a global resource name (e.g. `"XSample"`) or # a direct value for the argument (e.g. `5`) self.argmap.update(argmap) + def __call__(self, *args, **kwargs): """ From 7170d287a108e9016d640f471bda85b1ec9f2099 Mon Sep 17 00:00:00 2001 From: Evan Peterson Date: Mon, 12 Aug 2019 12:00:02 -0600 Subject: [PATCH 11/20] Improve module's separation of concerns --- metalearn/__init__.py | 2 +- metalearn/metafeatures/resources.py | 7 ------- metalearn/metafeatures/static_assets.py | 6 ++++++ 3 files changed, 7 insertions(+), 8 deletions(-) create mode 100644 metalearn/metafeatures/static_assets.py diff --git a/metalearn/__init__.py b/metalearn/__init__.py index 3c7a77a..56bbde5 100644 --- a/metalearn/__init__.py +++ b/metalearn/__init__.py @@ -1,2 +1,2 @@ from .metafeatures.metafeatures import Metafeatures -from .metafeatures.resources import METAFEATURES_JSON_SCHEMA \ No newline at end of file +from .metafeatures.static_assets import METAFEATURES_JSON_SCHEMA \ No newline at end of file diff --git a/metalearn/metafeatures/resources.py b/metalearn/metafeatures/resources.py index 506d967..e6a753b 100644 --- a/metalearn/metafeatures/resources.py +++ b/metalearn/metafeatures/resources.py @@ -1,5 +1,4 @@ from typing import List -import pkg_resources from metalearn.metafeatures.base import ResourceComputerMap, MetafeatureComputer, ResourceComputer @@ -15,12 +14,6 @@ from metalearn.metafeatures.decision_tree_metafeatures import metafeature_computers as dt_metafeatures -# Expose the `metafeatures_schema.json` file as a file path string. -# Needed to allow the file to be successfully accessed across platforms -# and install types. -METAFEATURES_JSON_SCHEMA = pkg_resources.resource_filename('metalearn', 'metafeatures/metafeatures_schema.json') - - def _get_metafeature_ids(metafeature_computers: List[MetafeatureComputer]) -> List[str]: """Returns a list of all metafeature IDs found in `metafeature_computers`""" metafeature_ids = set() diff --git a/metalearn/metafeatures/static_assets.py b/metalearn/metafeatures/static_assets.py new file mode 100644 index 0000000..e960edc --- /dev/null +++ b/metalearn/metafeatures/static_assets.py @@ -0,0 +1,6 @@ +import pkg_resources + +# Expose the `metafeatures_schema.json` file as a file path string. +# Needed to allow the file to be successfully accessed across platforms +# and install types. +METAFEATURES_JSON_SCHEMA = pkg_resources.resource_filename('metalearn', 'metafeatures/metafeatures_schema.json') \ No newline at end of file From a18265a5d95982f909c7e64ea36f5602ecfa8e3d Mon Sep 17 00:00:00 2001 From: Brandon Schoenfeld Date: Thu, 15 Aug 2019 14:02:46 -0600 Subject: [PATCH 12/20] simplified resource computer by removing default args; added newlines at eof --- metalearn/__init__.py | 2 +- metalearn/metafeatures/base.py | 113 +++++------------- .../general_resource_computers.py | 5 +- metalearn/metafeatures/static_assets.py | 2 +- metalearn/metafeatures/text_metafeatures.py | 5 +- 5 files changed, 39 insertions(+), 88 deletions(-) diff --git a/metalearn/__init__.py b/metalearn/__init__.py index 56bbde5..c52b3b9 100644 --- a/metalearn/__init__.py +++ b/metalearn/__init__.py @@ -1,2 +1,2 @@ from .metafeatures.metafeatures import Metafeatures -from .metafeatures.static_assets import METAFEATURES_JSON_SCHEMA \ No newline at end of file +from .metafeatures.static_assets import METAFEATURES_JSON_SCHEMA diff --git a/metalearn/metafeatures/base.py b/metalearn/metafeatures/base.py index 78b8166..85f6193 100644 --- a/metalearn/metafeatures/base.py +++ b/metalearn/metafeatures/base.py @@ -1,109 +1,58 @@ +from collections.abc import MutableMapping import inspect -from typing import List, Callable, Dict, Union, Optional, Any import itertools -from collections.abc import MutableMapping +from typing import List, Callable, Dict, Union, Optional, Any from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup + class ResourceComputer: + """ + Decorates ``computer``, a resource computing function with metadata about that function. + + Parameters + ---------- + computer + The function that computes the resources. + returns + The names of the resources that ``computer`` returns, specified in the same order as ``computer`` returns + them. + argmap + A custom map of ``computer``'s argument names to the global resource names that will be passed as + ``computer``'s arguments when ``computer`` is called. + """ def __init__( - self, - computer: Callable, - returns: List[str], - argmap: Optional[Dict[str,Any]] = None + self, computer: Callable, returns: List[str], argmap: Optional[Dict[str,Any]] = None ) -> None: - """ - Decorates ``computer``, a resource computing function - with metadata about that function. - - Parameters - ---------- - computer - The function that computes the resources. - returns - The names of the resources that ``computer`` returns, specified in - the same order as ``computer`` returns them. - argmap - A custom map of ``computer``'s argument names to the global resource names - that will be passed as ``computer``'s arguments when ``computer`` is called. - """ - - computer_args = inspect.getfullargspec(computer) - # TODO: If needed, add support for `computer` functions that - # use these types of arguments. + argspec = inspect.getfullargspec(computer) + # TODO: If needed, add support for `computer` functions that use these types of arguments. if ( - computer_args.varargs is not None or - computer_args.varkw is not None or - len(computer_args.kwonlyargs) > 0 + argspec.varargs is not None or argspec.varkw is not None or argspec.defaults is not None or + len(argspec.kwonlyargs) > 0 ): - raise ValueError(( - "ResourceComputer supports `computer` functions that " - "use positional arguments only in their function definition." - )) + raise ValueError('`computer` must use only positional arguments with no default values') - self._computer = computer + self.computer = computer self.returns = returns - - self.argmap = {} - - # reversing is needed because `self.defaults` gives the default - # argument values corresponding to the *last* `n` arguments in the - # function signature. - reversed_args = self.args[::-1] - reversed_defaults = self.defaults[::-1] - arg_default_pairs = itertools.zip_longest(reversed_args, reversed_defaults) - - for local_name, default in arg_default_pairs: - # By default, just use the `computer` function's - # normal local argument names in the argmap, - # making sure to preserve default argument values - # when they are supplied. - if default is not None: - # The function has a default value for this arg; - # use that. - self.argmap[local_name] = default - else: - # This function has no default. Tell the system - # to pass in the global resource identified by - # this arg's ``local_name`` when calling this - # ``computer``. - self.argmap[local_name] = local_name + self.argmap = {arg_name: arg_name for arg_name in argspec.args} if argmap is not None: - # Now include any argument name or value overrides - # the developer has provided. Note: each value in `argmap` - # may be a global resource name (e.g. `"XSample"`) or - # a direct value for the argument (e.g. `5`) + # override computer arg value with developer provided values + # Note each value in `argmap` is a global resource name (e.g. `"XSample"`) or a literal value (e.g. `5`) self.argmap.update(argmap) def __call__(self, *args, **kwargs): """ - Allows a ``ResourceComputer`` instance to be callable. - Just forwards all arguments on to self._computer. - """ - return self._computer(*args, **kwargs) - - @property - def args(self) -> list: - """Returns a list of the positional parameter names of self._computer""" - return inspect.getfullargspec(self._computer).args - - @property - def defaults(self) -> list: - """ - From https://docs.python.org/3/library/inspect.html#inspect.getfullargspec - [Returns] an n-tuple of default argument values corresponding to the last `n` - positional parameters [of self._computer]. + Allows a ``ResourceComputer`` instance to be callable. Just forwards all arguments on to self.computer. """ - defaults = inspect.getfullargspec(self._computer).defaults - return [] if defaults is None else defaults + return self.computer(*args, **kwargs) @property def name(self) -> str: - """Returns the function name of self._computer""" - return self._computer.__name__ + """Returns the function name of self.computer""" + return self.computer.__name__ class MetafeatureComputer(ResourceComputer): diff --git a/metalearn/metafeatures/general_resource_computers.py b/metalearn/metafeatures/general_resource_computers.py index 7e1c144..179c511 100644 --- a/metalearn/metafeatures/general_resource_computers.py +++ b/metalearn/metafeatures/general_resource_computers.py @@ -5,10 +5,11 @@ from metalearn.metafeatures.base import ResourceComputer import metalearn.metafeatures.constants as consts -def get_cv_seed(seed_base, seed_offset = 1): + +def get_cv_seed(seed_base, seed_offset): return (seed_base + seed_offset,) -get_cv_seed = ResourceComputer(get_cv_seed, ["cv_seed"]) +get_cv_seed = ResourceComputer(get_cv_seed, ["cv_seed"], {'seed_offset': 1}) def sample_columns(X, sample_shape, seed): diff --git a/metalearn/metafeatures/static_assets.py b/metalearn/metafeatures/static_assets.py index e960edc..c8473de 100644 --- a/metalearn/metafeatures/static_assets.py +++ b/metalearn/metafeatures/static_assets.py @@ -3,4 +3,4 @@ # Expose the `metafeatures_schema.json` file as a file path string. # Needed to allow the file to be successfully accessed across platforms # and install types. -METAFEATURES_JSON_SCHEMA = pkg_resources.resource_filename('metalearn', 'metafeatures/metafeatures_schema.json') \ No newline at end of file +METAFEATURES_JSON_SCHEMA = pkg_resources.resource_filename('metalearn', 'metafeatures/metafeatures_schema.json') diff --git a/metalearn/metafeatures/text_metafeatures.py b/metalearn/metafeatures/text_metafeatures.py index 694e843..2ecf301 100644 --- a/metalearn/metafeatures/text_metafeatures.py +++ b/metalearn/metafeatures/text_metafeatures.py @@ -118,7 +118,7 @@ def get_string_length_kurtosis(string_lengths_array): ) -def get_mfs_for_tokens_split_by_space(text_features_array, most_common_limit=10): +def get_mfs_for_tokens_split_by_space(text_features_array, most_common_limit): def isnumeric(token): try: @@ -221,7 +221,8 @@ def filter_and_aggregate(tokens_series, f): ProblemType.ANY, [MetafeatureGroup.TEXT], { - "text_features_array": "NoNaNTextFeatures" + "text_features_array": "NoNaNTextFeatures", + 'most_common_limit': 10, } ) From 6bf4019bbe662dc396dffa34740c688cb62b7169 Mon Sep 17 00:00:00 2001 From: Evan Peterson Date: Fri, 16 Aug 2019 09:37:30 -0600 Subject: [PATCH 13/20] Use assert keyword when doing assertion --- metalearn/metafeatures/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/metalearn/metafeatures/base.py b/metalearn/metafeatures/base.py index 85f6193..7d38025 100644 --- a/metalearn/metafeatures/base.py +++ b/metalearn/metafeatures/base.py @@ -128,8 +128,7 @@ def _add_one(self, computer: ResourceComputer) -> None: self.__setitem__(resource_name, computer) def __setitem__(self, resource_name: str, computer: ResourceComputer): - if resource_name in self._map: - raise ValueError( + assert resource_name not in self._map, ( f"duplicate computer '{computer.name}' provided for resource '{resource_name}', " f"which is already present in the resouce map, registered " f"by computer '{self._map[resource_name].name}'" From fb3cf234d9dca75adfc704d889da5f951a18d207 Mon Sep 17 00:00:00 2001 From: Brandon Schoenfeld Date: Fri, 16 Aug 2019 12:53:29 -0600 Subject: [PATCH 14/20] rename variable; newline at eof --- CONTRIBUTING.md | 2 +- MANIFEST.in | 2 +- metalearn/__init__.py | 2 +- metalearn/metafeatures/base.py | 2 +- metalearn/metafeatures/constants.py | 2 +- metalearn/metafeatures/decision_tree_metafeatures.py | 3 ++- metalearn/metafeatures/general_resource_computers.py | 2 +- .../metafeatures/information_theoretic_metafeatures.py | 3 ++- metalearn/metafeatures/landmarking_metafeatures.py | 2 +- metalearn/metafeatures/metafeatures_schema.json | 2 +- metalearn/metafeatures/resources.py | 2 +- metalearn/metafeatures/simple_metafeatures.py | 3 ++- metalearn/metafeatures/static_assets.py | 2 +- metalearn/metafeatures/statistical_metafeatures.py | 2 +- metalearn/metafeatures/text_metafeatures.py | 3 ++- tests/test_metafeatures.py | 6 +++--- 16 files changed, 22 insertions(+), 18 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f05aa34..3b8c168 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -46,4 +46,4 @@ get_dataset_stats = MetafeatureComputer( ) ``` -By convention, all the decorated metafeature-computing functions in a module are aggregated at the bottom of the module into a list called `metafeature_computers`, which is then imported by [./metalearn/metafeatures/resources.py](./metalearn/metafeatures/resources.py) and added to that module's `resources_info` variable. \ No newline at end of file +By convention, all the decorated metafeature-computing functions in a module are aggregated at the bottom of the module into a list called `metafeature_computers`, which is then imported by [./metalearn/metafeatures/resources.py](./metalearn/metafeatures/resources.py) and added to that module's `resources_info` variable. diff --git a/MANIFEST.in b/MANIFEST.in index 1650733..42aaeff 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1 @@ -include metalearn/metafeatures/metafeatures_schema.json \ No newline at end of file +include metalearn/metafeatures/metafeatures_schema.json diff --git a/metalearn/__init__.py b/metalearn/__init__.py index c52b3b9..227393b 100644 --- a/metalearn/__init__.py +++ b/metalearn/__init__.py @@ -1,2 +1,2 @@ from .metafeatures.metafeatures import Metafeatures -from .metafeatures.static_assets import METAFEATURES_JSON_SCHEMA +from .metafeatures.static_assets import METAFEATURES_JSON_SCHEMA_PATH diff --git a/metalearn/metafeatures/base.py b/metalearn/metafeatures/base.py index 7d38025..b530a2f 100644 --- a/metalearn/metafeatures/base.py +++ b/metalearn/metafeatures/base.py @@ -142,4 +142,4 @@ def __len__(self): return len(self._map) def __delitem__(self, key: str): - raise TypeError("ResourceComputerMap does not support deletion of its ResourceComputers") \ No newline at end of file + raise TypeError("ResourceComputerMap does not support deletion of its ResourceComputers") diff --git a/metalearn/metafeatures/constants.py b/metalearn/metafeatures/constants.py index b76cc22..6059cc3 100644 --- a/metalearn/metafeatures/constants.py +++ b/metalearn/metafeatures/constants.py @@ -24,4 +24,4 @@ class MetafeatureGroup(Enum): CATEGORICAL = "CATEGORICAL" NO_TARGETS = "NO_TARGETS" NUMERIC_TARGETS = "NUMERIC_TARGETS" -TIMEOUT = "TIMEOUT" \ No newline at end of file +TIMEOUT = "TIMEOUT" diff --git a/metalearn/metafeatures/decision_tree_metafeatures.py b/metalearn/metafeatures/decision_tree_metafeatures.py index 3f85cc7..ce1fb85 100644 --- a/metalearn/metafeatures/decision_tree_metafeatures.py +++ b/metalearn/metafeatures/decision_tree_metafeatures.py @@ -26,6 +26,7 @@ def get_general_info(self): def get_attributes(self): return [x for x in Counter(self.tree.feature).values() if x != -2] + class TraversedDecisionTree: def __init__(self, tree): @@ -208,4 +209,4 @@ def get_decision_tree_width(tree): get_decision_tree_attributes, get_decision_tree_general_info, get_decision_tree_width -] \ No newline at end of file +] diff --git a/metalearn/metafeatures/general_resource_computers.py b/metalearn/metafeatures/general_resource_computers.py index 179c511..abb6f6d 100644 --- a/metalearn/metafeatures/general_resource_computers.py +++ b/metalearn/metafeatures/general_resource_computers.py @@ -245,4 +245,4 @@ def get_text_features_with_no_missing_values( get_binned_numeric_features_with_no_missing_values, get_binned_numeric_features_and_class_with_no_missing_values, get_text_features_with_no_missing_values -] \ No newline at end of file +] diff --git a/metalearn/metafeatures/information_theoretic_metafeatures.py b/metalearn/metafeatures/information_theoretic_metafeatures.py index 7309602..33d1fa9 100644 --- a/metalearn/metafeatures/information_theoretic_metafeatures.py +++ b/metalearn/metafeatures/information_theoretic_metafeatures.py @@ -6,6 +6,7 @@ from metalearn.metafeatures.base import MetafeatureComputer from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup + def get_entropy(col): return entropy(col.value_counts()) @@ -237,4 +238,4 @@ def translate_into_tuples(col1, col2): get_equivalent_number_numeric_features, get_categorical_noise_signal_ratio, get_numeric_noise_signal_ratio -] \ No newline at end of file +] diff --git a/metalearn/metafeatures/landmarking_metafeatures.py b/metalearn/metafeatures/landmarking_metafeatures.py index 3dc069a..71b520f 100644 --- a/metalearn/metafeatures/landmarking_metafeatures.py +++ b/metalearn/metafeatures/landmarking_metafeatures.py @@ -195,4 +195,4 @@ def get_lda(X, Y, n_folds, cv_seed): get_random_tree_depth_2, get_random_tree_depth_3, get_lda -] \ No newline at end of file +] diff --git a/metalearn/metafeatures/metafeatures_schema.json b/metalearn/metafeatures/metafeatures_schema.json index f183706..6258c9e 100644 --- a/metalearn/metafeatures/metafeatures_schema.json +++ b/metalearn/metafeatures/metafeatures_schema.json @@ -29,4 +29,4 @@ "$ref": "#/definitions/dataset_metafeature" } } -} \ No newline at end of file +} diff --git a/metalearn/metafeatures/resources.py b/metalearn/metafeatures/resources.py index e6a753b..eac7c01 100644 --- a/metalearn/metafeatures/resources.py +++ b/metalearn/metafeatures/resources.py @@ -61,4 +61,4 @@ def _get_metafeature_ids(metafeature_computers: List[MetafeatureComputer]) -> Li landmarking_metafeatures + text_metafeatures + dt_metafeatures -) \ No newline at end of file +) diff --git a/metalearn/metafeatures/simple_metafeatures.py b/metalearn/metafeatures/simple_metafeatures.py index bc6b52f..c71ce16 100644 --- a/metalearn/metafeatures/simple_metafeatures.py +++ b/metalearn/metafeatures/simple_metafeatures.py @@ -5,6 +5,7 @@ from metalearn.metafeatures.base import MetafeatureComputer from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup + def get_dataset_stats(X, column_types): number_of_instances = X.shape[0] number_of_features = X.shape[1] @@ -162,4 +163,4 @@ def get_numeric_cardinalities(X, column_types): get_missing_values, get_categorical_cardinalities, get_numeric_cardinalities -] \ No newline at end of file +] diff --git a/metalearn/metafeatures/static_assets.py b/metalearn/metafeatures/static_assets.py index c8473de..3190696 100644 --- a/metalearn/metafeatures/static_assets.py +++ b/metalearn/metafeatures/static_assets.py @@ -3,4 +3,4 @@ # Expose the `metafeatures_schema.json` file as a file path string. # Needed to allow the file to be successfully accessed across platforms # and install types. -METAFEATURES_JSON_SCHEMA = pkg_resources.resource_filename('metalearn', 'metafeatures/metafeatures_schema.json') +METAFEATURES_JSON_SCHEMA_PATH = pkg_resources.resource_filename('metalearn', 'metafeatures/metafeatures_schema.json') diff --git a/metalearn/metafeatures/statistical_metafeatures.py b/metalearn/metafeatures/statistical_metafeatures.py index 3fbfd2f..e5b9943 100644 --- a/metalearn/metafeatures/statistical_metafeatures.py +++ b/metalearn/metafeatures/statistical_metafeatures.py @@ -224,4 +224,4 @@ def preprocess(series): get_numeric_skewness, get_numeric_kurtosis, get_pca -] \ No newline at end of file +] diff --git a/metalearn/metafeatures/text_metafeatures.py b/metalearn/metafeatures/text_metafeatures.py index 2ecf301..61692bc 100644 --- a/metalearn/metafeatures/text_metafeatures.py +++ b/metalearn/metafeatures/text_metafeatures.py @@ -8,6 +8,7 @@ from metalearn.metafeatures.base import ResourceComputer, MetafeatureComputer from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup + def get_string_lengths_array_from_text_features(text_features_array): lengths = [feature.apply(len) for feature in text_features_array] return (lengths,) @@ -245,4 +246,4 @@ def filter_and_aggregate(tokens_series, f): get_string_length_skewness, get_string_length_kurtosis, get_mfs_for_tokens_split_by_space -] \ No newline at end of file +] diff --git a/tests/test_metafeatures.py b/tests/test_metafeatures.py index 0551442..a82076e 100644 --- a/tests/test_metafeatures.py +++ b/tests/test_metafeatures.py @@ -12,7 +12,7 @@ import pandas as pd import numpy as np -from metalearn import Metafeatures, METAFEATURES_JSON_SCHEMA +from metalearn import Metafeatures, METAFEATURES_JSON_SCHEMA_PATH import metalearn.metafeatures.constants as consts from metalearn.metafeatures.resources import metafeature_ids from tests.config import CORRECTNESS_SEED, METADATA_PATH @@ -361,7 +361,7 @@ def test_compute_effects_on_compute(self): self._report_test_failures(test_failures, test_name) def test_output_format(self): - with open(METAFEATURES_JSON_SCHEMA) as f: + with open(METAFEATURES_JSON_SCHEMA_PATH) as f: mf_schema = json.load(f) for dataset_filename, dataset in self.datasets.items(): computed_mfs = Metafeatures().compute( @@ -377,7 +377,7 @@ def test_output_format(self): ) def test_output_json_compatibility(self): - with open(METAFEATURES_JSON_SCHEMA) as f: + with open(METAFEATURES_JSON_SCHEMA_PATH) as f: mf_schema = json.load(f) for dataset_filename, dataset in self.datasets.items(): computed_mfs = Metafeatures().compute( From a065dcbcd9349b1e59c7632d384c445240abc066 Mon Sep 17 00:00:00 2001 From: Brandon Schoenfeld Date: Fri, 16 Aug 2019 15:33:16 -0600 Subject: [PATCH 15/20] replace resource map with collectordict --- metalearn/metafeatures/base.py | 88 ++++++++----------- .../decision_tree_metafeatures.py | 10 +-- .../general_resource_computers.py | 6 +- .../information_theoretic_metafeatures.py | 6 +- .../metafeatures/landmarking_metafeatures.py | 6 +- metalearn/metafeatures/metafeatures.py | 4 +- metalearn/metafeatures/resources.py | 83 ++++++++--------- metalearn/metafeatures/simple_metafeatures.py | 6 +- .../metafeatures/statistical_metafeatures.py | 6 +- metalearn/metafeatures/text_metafeatures.py | 10 +-- tests/test_base.py | 31 +++++++ 11 files changed, 131 insertions(+), 125 deletions(-) create mode 100644 tests/test_base.py diff --git a/metalearn/metafeatures/base.py b/metalearn/metafeatures/base.py index b530a2f..56bf4c4 100644 --- a/metalearn/metafeatures/base.py +++ b/metalearn/metafeatures/base.py @@ -1,4 +1,4 @@ -from collections.abc import MutableMapping +from collections.abc import Mapping import inspect import itertools from typing import List, Callable, Dict, Union, Optional, Any @@ -90,56 +90,40 @@ def __init__( self.problem_type = problem_type -class ResourceComputerMap(MutableMapping): - def __init__(self, computers: Union[ResourceComputer,List[ResourceComputer],None] = None) -> None: - """ - Wraps a dictionary map of resource names to their computers. - Includes visibility into whether duplicate computers - are trying to become associated with a resource in the map e.g. - if a package developer has accidentally declared two computers - that return the same resource. - """ - self._map: Dict[str,ResourceComputer] = {} - if computers is not None: - self.add(computers) - - def add(self, computers: Union[ResourceComputer,List[ResourceComputer]]) -> None: - """ - Adds more resource name/resource computer key/value - pairs to a resource map, throwing an error on duplicates. - """ - if isinstance(computers, list): - for computer in computers: - self._add_one(computer) - elif isinstance(computers, ResourceComputer): - self._add_one(computers) - else: - raise ValueError("computers must be ResourceComputer or List[ResourceComputer]") - - def __getitem__(self, key: str = None) -> ResourceComputer: - """Used for getting a resource from the map.""" - return self._map[key] - - def _add_one(self, computer: ResourceComputer) -> None: - if not isinstance(computer, ResourceComputer): - raise ValueError(f"computer is not a ResourceComputer; it is a {type(computer)}") - - for resource_name in computer.returns: - self.__setitem__(resource_name, computer) - - def __setitem__(self, resource_name: str, computer: ResourceComputer): - assert resource_name not in self._map, ( - f"duplicate computer '{computer.name}' provided for resource '{resource_name}', " - f"which is already present in the resouce map, registered " - f"by computer '{self._map[resource_name].name}'" - ) - self._map[resource_name] = computer - +class collectordict(Mapping): + """ + A partially mutable mapping in which keys can be set at most one time. + A LookupError is raised if a key is set more than once. + For simplicity, all values must be set manually. + """ + + dict_cls = dict + + def __init__(self): + self._dict = self.dict_cls() + + def __getitem__(self, key): + return self._dict[key] + def __iter__(self): - return iter(self._map) - + return iter(self._dict) + def __len__(self): - return len(self._map) - - def __delitem__(self, key: str): - raise TypeError("ResourceComputerMap does not support deletion of its ResourceComputers") + return len(self._dict) + + def __setitem__(self, key, value): + if key in self._dict: + raise LookupError(f'{key} already exists') + self._dict[key] = value + + def update(self, mapping: Mapping): + for key, value in mapping.items(): + self[key] = value + + +def build_resources_info(*computers: ResourceComputer) -> collectordict: + resources_info = collectordict() + for computer in computers: + for resource_name in computer.returns: + resources_info[resource_name] = computer + return resources_info diff --git a/metalearn/metafeatures/decision_tree_metafeatures.py b/metalearn/metafeatures/decision_tree_metafeatures.py index ce1fb85..6e21d67 100644 --- a/metalearn/metafeatures/decision_tree_metafeatures.py +++ b/metalearn/metafeatures/decision_tree_metafeatures.py @@ -3,7 +3,7 @@ from sklearn.tree import DecisionTreeClassifier from metalearn.metafeatures.common_operations import profile_distribution -from metalearn.metafeatures.base import ResourceComputer, MetafeatureComputer +from metalearn.metafeatures.base import build_resources_info, ResourceComputer, MetafeatureComputer from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup @@ -194,19 +194,19 @@ def get_decision_tree_width(tree): A list of all ResourceComputer instances in this module. """ -resource_computers = [ +resources_info = build_resources_info( get_decision_tree, traverse_tree -] +) """ A list of all MetafeatureComputer instances in this module. """ -metafeature_computers = [ +metafeatures_info = build_resources_info( get_decision_tree_level_sizes, get_decision_tree_branch_lengths, get_decision_tree_attributes, get_decision_tree_general_info, get_decision_tree_width -] +) diff --git a/metalearn/metafeatures/general_resource_computers.py b/metalearn/metafeatures/general_resource_computers.py index abb6f6d..0dd4371 100644 --- a/metalearn/metafeatures/general_resource_computers.py +++ b/metalearn/metafeatures/general_resource_computers.py @@ -2,7 +2,7 @@ import pandas as pd from sklearn.model_selection import StratifiedShuffleSplit -from metalearn.metafeatures.base import ResourceComputer +from metalearn.metafeatures.base import build_resources_info, ResourceComputer import metalearn.metafeatures.constants as consts @@ -234,7 +234,7 @@ def get_text_features_with_no_missing_values( A list of all ResourceComputer instances in this module. """ -resource_computers = [ +resources_info = build_resources_info( get_cv_seed, sample_columns, sample_rows, @@ -245,4 +245,4 @@ def get_text_features_with_no_missing_values( get_binned_numeric_features_with_no_missing_values, get_binned_numeric_features_and_class_with_no_missing_values, get_text_features_with_no_missing_values -] +) diff --git a/metalearn/metafeatures/information_theoretic_metafeatures.py b/metalearn/metafeatures/information_theoretic_metafeatures.py index 33d1fa9..be66212 100644 --- a/metalearn/metafeatures/information_theoretic_metafeatures.py +++ b/metalearn/metafeatures/information_theoretic_metafeatures.py @@ -3,7 +3,7 @@ from sklearn.metrics import mutual_info_score from metalearn.metafeatures.common_operations import * -from metalearn.metafeatures.base import MetafeatureComputer +from metalearn.metafeatures.base import build_resources_info, MetafeatureComputer from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup @@ -226,7 +226,7 @@ def translate_into_tuples(col1, col2): A list of all MetafeatureComputer instances in this module. """ -metafeature_computers = [ +metafeatures_info = build_resources_info( get_class_entropy, get_categorical_attribute_entropy, get_numeric_attribute_entropy, @@ -238,4 +238,4 @@ def translate_into_tuples(col1, col2): get_equivalent_number_numeric_features, get_categorical_noise_signal_ratio, get_numeric_noise_signal_ratio -] +) diff --git a/metalearn/metafeatures/landmarking_metafeatures.py b/metalearn/metafeatures/landmarking_metafeatures.py index 71b520f..f115283 100644 --- a/metalearn/metafeatures/landmarking_metafeatures.py +++ b/metalearn/metafeatures/landmarking_metafeatures.py @@ -11,7 +11,7 @@ from sklearn.tree import DecisionTreeClassifier from metalearn.metafeatures.common_operations import * -from metalearn.metafeatures.base import MetafeatureComputer +from metalearn.metafeatures.base import build_resources_info, MetafeatureComputer from metalearn.metafeatures.constants import MetafeatureGroup, ProblemType @@ -187,7 +187,7 @@ def get_lda(X, Y, n_folds, cv_seed): A list of all MetafeatureComputer instances in this module. """ -metafeature_computers = [ +metafeatures_info = build_resources_info( get_naive_bayes, get_knn_1, get_decision_stump, @@ -195,4 +195,4 @@ def get_lda(X, Y, n_folds, cv_seed): get_random_tree_depth_2, get_random_tree_depth_3, get_lda -] +) diff --git a/metalearn/metafeatures/metafeatures.py b/metalearn/metafeatures/metafeatures.py index 4e6aabe..dbd1287 100644 --- a/metalearn/metafeatures/metafeatures.py +++ b/metalearn/metafeatures/metafeatures.py @@ -12,7 +12,7 @@ from metalearn.metafeatures.common_operations import * from metalearn.metafeatures.resources import resources_info, metafeature_ids -from metalearn.metafeatures.base import ResourceComputer, MetafeatureComputer, ResourceComputerMap +from metalearn.metafeatures.base import collectordict, ResourceComputer, MetafeatureComputer import metalearn.metafeatures.constants as consts @@ -25,7 +25,7 @@ class Metafeatures(object): """ - _resources_info: ResourceComputerMap = resources_info + _resources_info: collectordict = resources_info IDS: List[str] = metafeature_ids @classmethod diff --git a/metalearn/metafeatures/resources.py b/metalearn/metafeatures/resources.py index eac7c01..218fe7b 100644 --- a/metalearn/metafeatures/resources.py +++ b/metalearn/metafeatures/resources.py @@ -1,64 +1,55 @@ from typing import List -from metalearn.metafeatures.base import ResourceComputerMap, MetafeatureComputer, ResourceComputer +from metalearn.metafeatures.base import collectordict, MetafeatureComputer, ResourceComputer -from metalearn.metafeatures.decision_tree_metafeatures import resource_computers as dt_resources -from metalearn.metafeatures.general_resource_computers import resource_computers as util_resources -from metalearn.metafeatures.text_metafeatures import resource_computers as text_resources +from metalearn.metafeatures.decision_tree_metafeatures import resources_info as dt_resources +from metalearn.metafeatures.general_resource_computers import resources_info as util_resources +from metalearn.metafeatures.text_metafeatures import resources_info as text_resources -from metalearn.metafeatures.simple_metafeatures import metafeature_computers as simple_metafeatures -from metalearn.metafeatures.statistical_metafeatures import metafeature_computers as statistical_metafeatures -from metalearn.metafeatures.information_theoretic_metafeatures import metafeature_computers as info_theoretic_metafeatures -from metalearn.metafeatures.landmarking_metafeatures import metafeature_computers as landmarking_metafeatures -from metalearn.metafeatures.text_metafeatures import metafeature_computers as text_metafeatures -from metalearn.metafeatures.decision_tree_metafeatures import metafeature_computers as dt_metafeatures +from metalearn.metafeatures.simple_metafeatures import metafeatures_info as simple_metafeatures +from metalearn.metafeatures.statistical_metafeatures import metafeatures_info as statistical_metafeatures +from metalearn.metafeatures.information_theoretic_metafeatures import metafeatures_info as info_theoretic_metafeatures +from metalearn.metafeatures.landmarking_metafeatures import metafeatures_info as landmarking_metafeatures +from metalearn.metafeatures.text_metafeatures import metafeatures_info as text_metafeatures +from metalearn.metafeatures.decision_tree_metafeatures import metafeatures_info as dt_metafeatures -def _get_metafeature_ids(metafeature_computers: List[MetafeatureComputer]) -> List[str]: - """Returns a list of all metafeature IDs found in `metafeature_computers`""" - metafeature_ids = set() - for computer in metafeature_computers: - for name in computer.returns: - if name in metafeature_ids: - raise ValueError("there is already a MetafeatureComputer that returns the {name} metafeature.") - metafeature_ids.add(name) - return list(metafeature_ids) - - -resources_info = ResourceComputerMap() +resources_info = collectordict() # Add all the ResourceComputers -resources_info.add(dt_resources) -resources_info.add(util_resources) -resources_info.add(text_resources) +resources_info.update(dt_resources) +resources_info.update(util_resources) +resources_info.update(text_resources) # Add noop resource computers for the base resources. # Since they'll always be in the Metafeatures resource hash, # they'll never be needed to be computed by a ResourceComputer, # but they need to be in `resources_info` since `Metafeatures._get_arguments` # and `Metafeatures._resource_is_target_dependent` requires them to be. -resources_info.add(ResourceComputer(lambda _: None, ["X_raw"])) -resources_info.add(ResourceComputer(lambda _: None, ["X"])) -resources_info.add(ResourceComputer(lambda _: None, ["Y"])) -resources_info.add(ResourceComputer(lambda _: None, ["column_types"])) -resources_info.add(ResourceComputer(lambda _: None, ["sample_shape"])) -resources_info.add(ResourceComputer(lambda _: None, ["seed_base"])) -resources_info.add(ResourceComputer(lambda _: None, ["n_folds"])) +resources_info["X_raw"] = ResourceComputer(lambda _: None, ["X_raw"]) +resources_info["X"] = ResourceComputer(lambda _: None, ["X"]) +resources_info["Y"] = ResourceComputer(lambda _: None, ["Y"]) +resources_info["column_types"] = ResourceComputer(lambda _: None, ["column_types"]) +resources_info["sample_shape"] = ResourceComputer(lambda _: None, ["sample_shape"]) +resources_info["seed_base"] = ResourceComputer(lambda _: None, ["seed_base"]) +resources_info["n_folds"] = ResourceComputer(lambda _: None, ["n_folds"]) # Add all the MetafeatureComputers -resources_info.add(simple_metafeatures) -resources_info.add(statistical_metafeatures) -resources_info.add(info_theoretic_metafeatures) -resources_info.add(landmarking_metafeatures) -resources_info.add(text_metafeatures) -resources_info.add(dt_metafeatures) +resources_info.update(simple_metafeatures) +resources_info.update(statistical_metafeatures) +resources_info.update(info_theoretic_metafeatures) +resources_info.update(landmarking_metafeatures) +resources_info.update(text_metafeatures) +resources_info.update(dt_metafeatures) # Get all the metafeature ids -metafeature_ids = _get_metafeature_ids( - simple_metafeatures + - statistical_metafeatures + - info_theoretic_metafeatures + - landmarking_metafeatures + - text_metafeatures + - dt_metafeatures -) +metafeature_ids = [ + mf_id for mfs_info in [ + simple_metafeatures, + statistical_metafeatures, + info_theoretic_metafeatures, + landmarking_metafeatures, + text_metafeatures, + dt_metafeatures, + ] for mf_id in mfs_info.keys() +] diff --git a/metalearn/metafeatures/simple_metafeatures.py b/metalearn/metafeatures/simple_metafeatures.py index c71ce16..dea3b91 100644 --- a/metalearn/metafeatures/simple_metafeatures.py +++ b/metalearn/metafeatures/simple_metafeatures.py @@ -2,7 +2,7 @@ from pandas import DataFrame from metalearn.metafeatures.common_operations import * -from metalearn.metafeatures.base import MetafeatureComputer +from metalearn.metafeatures.base import build_resources_info, MetafeatureComputer from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup @@ -156,11 +156,11 @@ def get_numeric_cardinalities(X, column_types): A list of all MetafeatureComputer instances in this module. """ -metafeature_computers = [ +metafeatures_info = build_resources_info( get_dataset_stats, get_class_stats, get_dimensionality, get_missing_values, get_categorical_cardinalities, get_numeric_cardinalities -] +) diff --git a/metalearn/metafeatures/statistical_metafeatures.py b/metalearn/metafeatures/statistical_metafeatures.py index e5b9943..aedfa3f 100644 --- a/metalearn/metafeatures/statistical_metafeatures.py +++ b/metalearn/metafeatures/statistical_metafeatures.py @@ -9,7 +9,7 @@ from sklearn.cross_decomposition import CCA from metalearn.metafeatures.common_operations import * -from metalearn.metafeatures.base import MetafeatureComputer +from metalearn.metafeatures.base import build_resources_info, MetafeatureComputer from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup import metalearn.metafeatures.constants as consts @@ -218,10 +218,10 @@ def preprocess(series): A list of all MetafeatureComputer instances in this module. """ -metafeature_computers = [ +metafeatures_info = build_resources_info( get_numeric_means, get_numeric_stdev, get_numeric_skewness, get_numeric_kurtosis, get_pca -] +) diff --git a/metalearn/metafeatures/text_metafeatures.py b/metalearn/metafeatures/text_metafeatures.py index 61692bc..d840e5c 100644 --- a/metalearn/metafeatures/text_metafeatures.py +++ b/metalearn/metafeatures/text_metafeatures.py @@ -5,7 +5,7 @@ import pandas as pd from metalearn.metafeatures.common_operations import * -from metalearn.metafeatures.base import ResourceComputer, MetafeatureComputer +from metalearn.metafeatures.base import build_resources_info, ResourceComputer, MetafeatureComputer from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup @@ -232,18 +232,18 @@ def filter_and_aggregate(tokens_series, f): A list of all ResourceComputer instances in this module. """ -resource_computers = [ +resources_info = build_resources_info( get_string_lengths_array_from_text_features -] +) """ A list of all MetafeatureComputer instances in this module. """ -metafeature_computers = [ +metafeatures_info = build_resources_info( get_string_length_means, get_string_length_stdev, get_string_length_skewness, get_string_length_kurtosis, get_mfs_for_tokens_split_by_space -] +) diff --git a/tests/test_base.py b/tests/test_base.py new file mode 100644 index 0000000..9b104a8 --- /dev/null +++ b/tests/test_base.py @@ -0,0 +1,31 @@ +import unittest + +from metalearn.metafeatures.base import collectordict + + +class CollectorDictTestCase(unittest.TestCase): + + def test_no_init_args(self): + try: + cd = collectordict({'a': 1}) + self.fail('collectordict should have failed when passed init args') + except TypeError as e: + pass + + def test_no_duplicate_setter(self): + cd = collectordict() + cd[1] = 1 + try: + cd[1] = 2 + self.fail('collectordict should have raised an error when setting an existing key') + except LookupError as e: + pass + + def test_no_duplicates_in_update(self): + cd = collectordict() + cd[1] = 1 + try: + cd.update({1:2}) + self.fail('collectordict should have raised an error when updating with an existing key') + except LookupError as e: + pass From 9cacede1191431e86d594aa79eca9c9b83eed4f2 Mon Sep 17 00:00:00 2001 From: Brandon Schoenfeld Date: Fri, 16 Aug 2019 16:39:45 -0600 Subject: [PATCH 16/20] moved resources to Metafeatures class; added test for duplicates --- metalearn/metafeatures/metafeatures.py | 42 +++++++++++++++++--- metalearn/metafeatures/resources.py | 55 -------------------------- tests/test_metafeatures.py | 6 ++- 3 files changed, 40 insertions(+), 63 deletions(-) delete mode 100644 metalearn/metafeatures/resources.py diff --git a/metalearn/metafeatures/metafeatures.py b/metalearn/metafeatures/metafeatures.py index dbd1287..b4a10d1 100644 --- a/metalearn/metafeatures/metafeatures.py +++ b/metalearn/metafeatures/metafeatures.py @@ -10,11 +10,21 @@ import pandas as pd from pandas import DataFrame, Series -from metalearn.metafeatures.common_operations import * -from metalearn.metafeatures.resources import resources_info, metafeature_ids from metalearn.metafeatures.base import collectordict, ResourceComputer, MetafeatureComputer +from metalearn.metafeatures.common_operations import * import metalearn.metafeatures.constants as consts +from metalearn.metafeatures.decision_tree_metafeatures import resources_info as dt_resources +from metalearn.metafeatures.general_resource_computers import resources_info as general_resources +from metalearn.metafeatures.text_metafeatures import resources_info as text_resources + +from metalearn.metafeatures.decision_tree_metafeatures import metafeatures_info as dt_metafeatures +from metalearn.metafeatures.information_theoretic_metafeatures import metafeatures_info as info_theoretic_metafeatures +from metalearn.metafeatures.landmarking_metafeatures import metafeatures_info as landmarking_metafeatures +from metalearn.metafeatures.simple_metafeatures import metafeatures_info as simple_metafeatures +from metalearn.metafeatures.statistical_metafeatures import metafeatures_info as statistical_metafeatures +from metalearn.metafeatures.text_metafeatures import metafeatures_info as text_metafeatures + class Metafeatures(object): """ @@ -24,9 +34,29 @@ class Metafeatures(object): meta-learning applications. """ + _resources_info = collectordict() + _resources_info.update(dt_resources) + _resources_info.update(general_resources) + _resources_info.update(text_resources) + + # noop resource computers for the user-provided resources + # `_get_arguments` and `_resource_is_target_dependent` assumes ResourceComputer's + for resource_name in ["X_raw", "X", "Y", "column_types", "sample_shape", "seed_base", "n_folds"]: + _resources_info[resource_name] = ResourceComputer(lambda: None, [resource_name]) + + _mfs_info = [ + dt_metafeatures, + info_theoretic_metafeatures, + landmarking_metafeatures, + simple_metafeatures, + statistical_metafeatures, + text_metafeatures, + ] + + for mf_info in _mfs_info: + _resources_info.update(mf_info) - _resources_info: collectordict = resources_info - IDS: List[str] = metafeature_ids + IDS: List[str] = [mf_id for mfs_info in _mfs_info for mf_id in mfs_info.keys()] @classmethod def list_metafeatures(cls, group="all"): @@ -160,8 +190,8 @@ def _init_resources( ): # Add the base resources to our resources hash self._resources = { - "X_raw": self._format_resource(X, 0.), - "X": self._format_resource(X.dropna(axis=1, how="all"), 0.), + "X_raw": self._format_resource(X, 0.), # TODO: rename to X + "X": self._format_resource(X.dropna(axis=1, how="all"), 0.), # TODO: make resource computer; rename "Y": self._format_resource(Y, 0.), "column_types": self._format_resource(column_types, 0.), "sample_shape": self._format_resource(sample_shape, 0.), diff --git a/metalearn/metafeatures/resources.py b/metalearn/metafeatures/resources.py deleted file mode 100644 index 218fe7b..0000000 --- a/metalearn/metafeatures/resources.py +++ /dev/null @@ -1,55 +0,0 @@ -from typing import List - -from metalearn.metafeatures.base import collectordict, MetafeatureComputer, ResourceComputer - -from metalearn.metafeatures.decision_tree_metafeatures import resources_info as dt_resources -from metalearn.metafeatures.general_resource_computers import resources_info as util_resources -from metalearn.metafeatures.text_metafeatures import resources_info as text_resources - -from metalearn.metafeatures.simple_metafeatures import metafeatures_info as simple_metafeatures -from metalearn.metafeatures.statistical_metafeatures import metafeatures_info as statistical_metafeatures -from metalearn.metafeatures.information_theoretic_metafeatures import metafeatures_info as info_theoretic_metafeatures -from metalearn.metafeatures.landmarking_metafeatures import metafeatures_info as landmarking_metafeatures -from metalearn.metafeatures.text_metafeatures import metafeatures_info as text_metafeatures -from metalearn.metafeatures.decision_tree_metafeatures import metafeatures_info as dt_metafeatures - - -resources_info = collectordict() - -# Add all the ResourceComputers -resources_info.update(dt_resources) -resources_info.update(util_resources) -resources_info.update(text_resources) - -# Add noop resource computers for the base resources. -# Since they'll always be in the Metafeatures resource hash, -# they'll never be needed to be computed by a ResourceComputer, -# but they need to be in `resources_info` since `Metafeatures._get_arguments` -# and `Metafeatures._resource_is_target_dependent` requires them to be. -resources_info["X_raw"] = ResourceComputer(lambda _: None, ["X_raw"]) -resources_info["X"] = ResourceComputer(lambda _: None, ["X"]) -resources_info["Y"] = ResourceComputer(lambda _: None, ["Y"]) -resources_info["column_types"] = ResourceComputer(lambda _: None, ["column_types"]) -resources_info["sample_shape"] = ResourceComputer(lambda _: None, ["sample_shape"]) -resources_info["seed_base"] = ResourceComputer(lambda _: None, ["seed_base"]) -resources_info["n_folds"] = ResourceComputer(lambda _: None, ["n_folds"]) - -# Add all the MetafeatureComputers -resources_info.update(simple_metafeatures) -resources_info.update(statistical_metafeatures) -resources_info.update(info_theoretic_metafeatures) -resources_info.update(landmarking_metafeatures) -resources_info.update(text_metafeatures) -resources_info.update(dt_metafeatures) - -# Get all the metafeature ids -metafeature_ids = [ - mf_id for mfs_info in [ - simple_metafeatures, - statistical_metafeatures, - info_theoretic_metafeatures, - landmarking_metafeatures, - text_metafeatures, - dt_metafeatures, - ] for mf_id in mfs_info.keys() -] diff --git a/tests/test_metafeatures.py b/tests/test_metafeatures.py index a82076e..3ae1aa0 100644 --- a/tests/test_metafeatures.py +++ b/tests/test_metafeatures.py @@ -14,7 +14,6 @@ from metalearn import Metafeatures, METAFEATURES_JSON_SCHEMA_PATH import metalearn.metafeatures.constants as consts -from metalearn.metafeatures.resources import metafeature_ids from tests.config import CORRECTNESS_SEED, METADATA_PATH from tests.data.dataset import read_dataset from tests.data.compute_dataset_metafeatures import get_dataset_metafeatures_path @@ -118,7 +117,7 @@ def _check_compare_metafeature_lists(self, computed_mfs, known_mfs, filename): test_failures = {} fail_message = "Metafeature lists do not match." - master_mf_ids_set = set(metafeature_ids) + master_mf_ids_set = set(Metafeatures.IDS) known_mf_ids_set = set({ x for x in known_mfs.keys() if "_Time" not in x @@ -761,3 +760,6 @@ def test_y_no_name(self): Metafeatures().compute(X,y) except Exception as e: self.fail(e) + + def test_no_duplicate_mf_ids(self): + self.assertEqual(len(Metafeatures.IDS), len(set(Metafeatures.IDS)), 'Metafeatures has duplicate IDS') From 0be355ce65b67aa619538b633397de24dae3addd Mon Sep 17 00:00:00 2001 From: Brandon Schoenfeld Date: Fri, 16 Aug 2019 16:52:14 -0600 Subject: [PATCH 17/20] organization and docstrings --- metalearn/metafeatures/base.py | 63 +++++++++++++++++----------------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/metalearn/metafeatures/base.py b/metalearn/metafeatures/base.py index 56bf4c4..bf223a5 100644 --- a/metalearn/metafeatures/base.py +++ b/metalearn/metafeatures/base.py @@ -39,10 +39,9 @@ def __init__( if argmap is not None: # override computer arg value with developer provided values - # Note each value in `argmap` is a global resource name (e.g. `"XSample"`) or a literal value (e.g. `5`) + # Note each value in `argmap` is a global resource name (e.g. `'XSample'`) or a literal value (e.g. `5`) self.argmap.update(argmap) - - + def __call__(self, *args, **kwargs): """ Allows a ``ResourceComputer`` instance to be callable. Just forwards all arguments on to self.computer. @@ -56,36 +55,33 @@ def name(self) -> str: class MetafeatureComputer(ResourceComputer): + """ + Decorates ``computer``, a metafeature computing function + with metadata about that function. + + Parameters + ---------- + computer + The function that computes the metafeatures. + returns + The names of the metafeatures that ``computer`` returns, specified in + the same order as ``computer`` returns them. + problem_type + The type of ML problem `computer`'s metafeatures can be computed for. + groups + The metafeature groups this computer's returned metafeatures belong to. + e.g. statistical, info-theoretic, simple, etc. + argmap + A custom map of ``computer``'s argument names to the global resource names + that will be passed as ``computer``'s arguments when ``computer`` is called. + """ def __init__( - self, - computer: Callable, - returns: List[str], # TODO: Add support for passing just a string, not a list? - problem_type: ProblemType, - groups: List[MetafeatureGroup], - argmap: Optional[Dict[str,str]] = {} + self, computer: Callable, returns: List[str], problem_type: ProblemType, groups: List[MetafeatureGroup], + argmap: Optional[Dict[str,str]] = None ) -> None: - """ - Decorates ``computer``, a metafeature computing function - with metadata about that function. - - Parameters - ---------- - computer - The function that computes the metafeatures. - returns - The names of the metafeatures that ``computer`` returns, specified in - the same order as ``computer`` returns them. - problem_type - The type of ML problem `computer`'s metafeatures can be computed for. - groups - The metafeature groups this computer's returned metafeatures belong to. - e.g. statistical, info-theoretic, simple, etc. - argmap - A custom map of ``computer``'s argument names to the global resource names - that will be passed as ``computer``'s arguments when ``computer`` is called. - """ - super(MetafeatureComputer, self).__init__(computer, returns, argmap) + # TODO: Add support for passing a string to `returns`, not just a list? + super().__init__(computer, returns, argmap) self.groups = groups self.problem_type = problem_type @@ -93,8 +89,8 @@ def __init__( class collectordict(Mapping): """ A partially mutable mapping in which keys can be set at most one time. - A LookupError is raised if a key is set more than once. - For simplicity, all values must be set manually. + A LookupError is raised if a key is set more than once. Keys cannot be deleted. + For simplicity, all values must be set manually, not in __init__. """ dict_cls = dict @@ -122,6 +118,9 @@ def update(self, mapping: Mapping): def build_resources_info(*computers: ResourceComputer) -> collectordict: + """ + Combines multiple resource computers into a mapping of resource name to computer + """ resources_info = collectordict() for computer in computers: for resource_name in computer.returns: From 7187c33aea76189f11d0c353e44d7c6884f021ce Mon Sep 17 00:00:00 2001 From: Brandon Schoenfeld Date: Fri, 16 Aug 2019 16:54:04 -0600 Subject: [PATCH 18/20] style --- metalearn/metafeatures/constants.py | 34 ++++++++++++++--------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/metalearn/metafeatures/constants.py b/metalearn/metafeatures/constants.py index 6059cc3..dbc18b8 100644 --- a/metalearn/metafeatures/constants.py +++ b/metalearn/metafeatures/constants.py @@ -2,26 +2,26 @@ # Constant Enums class ProblemType(Enum): - CLASSIFICATION = "classification" - REGRESSION = "regression" - ANY = "any" + CLASSIFICATION = 'classification' + REGRESSION = 'regression' + ANY = 'any' class MetafeatureGroup(Enum): - ALL = "all" - SIMPLE = "simple" - TEXT = "text" - STATISTICAL = "statistical" - INFO_THEORETIC = "info_theoretic" - LANDMARKING = "landmarking" - MODEL_BASED = "model_based" - TARGET_DEPENDENT = "target_dependent" + ALL = 'all' + SIMPLE = 'simple' + TEXT = 'text' + STATISTICAL = 'statistical' + INFO_THEORETIC = 'info_theoretic' + LANDMARKING = 'landmarking' + MODEL_BASED = 'model_based' + TARGET_DEPENDENT = 'target_dependent' # Constant strings VALUE_KEY = 'value' COMPUTE_TIME_KEY = 'compute_time' -NUMERIC = "NUMERIC" -TEXT = "TEXT" -CATEGORICAL = "CATEGORICAL" -NO_TARGETS = "NO_TARGETS" -NUMERIC_TARGETS = "NUMERIC_TARGETS" -TIMEOUT = "TIMEOUT" +NUMERIC = 'NUMERIC' +TEXT = 'TEXT' +CATEGORICAL = 'CATEGORICAL' +NO_TARGETS = 'NO_TARGETS' +NUMERIC_TARGETS = 'NUMERIC_TARGETS' +TIMEOUT = 'TIMEOUT' From 666a6086e3a2c770d45cd6bea997c1b34d94decf Mon Sep 17 00:00:00 2001 From: Evan Peterson Date: Fri, 16 Aug 2019 17:06:32 -0600 Subject: [PATCH 19/20] Update contrib guide to reflect new changes --- CONTRIBUTING.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3b8c168..94dcf14 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,7 +4,7 @@ `metalearn` uses a caching mechanism to cache expensive computations that may need to be used again within the package by another function. Both resources (e.g. the dataset itself or a preprocessed version of it) and metafeatures (e.g. entropy, number of features) are cached by the system. -When adding a new metafeature to the package, the function that computes that metafeature needs to be registered in the `resources_info` variable in [./metalearn/metafeatures/resources.py](./metalearn/metafeatures/resources.py), and passed to the call made to `_get_metafeature_ids` in that module as well. Before the function can be registered and passed though, it needs to be decorated with metadata by being passed through the `MetafeatureComputer` constructor (see example below). This allows the metafeatures returned by the function to be used intelligently by the package. +When adding a new metafeature to the package, the function that computes that metafeature needs to be included in the `Metafeatures` class definition in the `Metafeatures._mfs_info` attribute in [./metalearn/metafeatures/metafeatures.py](./metalearn/metafeatures/metafeatures.py). Before the function can be included though, it needs to be decorated with metadata by being passed through the `MetafeatureComputer` constructor (see example below). This allows the metafeatures returned by the function to be used intelligently by the package. Follow the example below to know how to write and register new metafeature(s). Note that a metafeature-computing function (e.g. `get_dataset_stats` as seen below) can compute and return more than one meta-feature. @@ -46,4 +46,4 @@ get_dataset_stats = MetafeatureComputer( ) ``` -By convention, all the decorated metafeature-computing functions in a module are aggregated at the bottom of the module into a list called `metafeature_computers`, which is then imported by [./metalearn/metafeatures/resources.py](./metalearn/metafeatures/resources.py) and added to that module's `resources_info` variable. +By convention, all the decorated metafeature-computing functions in a module are aggregated at the bottom of the module into a call to `build_resources_info`, which is then imported by [./metalearn/metafeatures/metafeatures.py](./metalearn/metafeatures/metafeatures.py) and added to the `_mfs_info` attribute of that module's `Metafeatures` class. From 71382b95087db09d3a3464a90b47a86b786cbe9d Mon Sep 17 00:00:00 2001 From: Brandon Schoenfeld Date: Fri, 16 Aug 2019 17:28:41 -0600 Subject: [PATCH 20/20] style --- metalearn/metafeatures/metafeatures.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/metalearn/metafeatures/metafeatures.py b/metalearn/metafeatures/metafeatures.py index b4a10d1..2466fd6 100644 --- a/metalearn/metafeatures/metafeatures.py +++ b/metalearn/metafeatures/metafeatures.py @@ -390,11 +390,9 @@ def _get_resource(self, resource_id): return resource[consts.VALUE_KEY], resource[consts.COMPUTE_TIME_KEY] def _get_arguments(self, resource_id): - resource_computer = self._resources_info[resource_id] - args = resource_computer.argmap resolved_parameters = {} total_time = 0.0 - for parameter, argument in args.items(): + for parameter, argument in self._resources_info[resource_id].argmap.items(): argument_type = type(argument) if parameter == "seed": seed_base, compute_time = self._get_resource("seed_base") @@ -407,7 +405,7 @@ def _get_arguments(self, resource_id): elif dtype_is_numeric(argument_type): compute_time = 0 else: - raise Exception(f"unhandled argument type '{argument_type}'") + raise TypeError(f'unhandled argument type: {argument_type}') resolved_parameters[parameter] = argument total_time += compute_time return (resolved_parameters, total_time)