From 1dff16c08f84199d4ec3b89f1cd9f6724d5a45d9 Mon Sep 17 00:00:00 2001
From: santi1234567 <45318759+santi1234567@users.noreply.github.com>
Date: Tue, 30 Apr 2024 21:15:18 +0200
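Subject: [PATCH 1/6] rename knn_classifier to classifier

Also adds a `classifier_type` argument ("knn" or "mlp") to
`Classifier.__init__`. In this patch it is only validated; a KNN model is
still always built until PATCH 3/6 wires the argument up.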

---
 build_db.py                        |   4 +-
 classifier.py                      | 370 +++++++++++++++++++++++++++++
 compute_periods.py                 |   2 +-
 interactive.ipynb                  |   2 +-
 multi_classifier.py                |   2 +-
 prepare_training_data.py           |   4 +-
 tests/test_classifier_persister.py |   2 +-
 7 files changed, 378 insertions(+), 8 deletions(-)
 create mode 100755 classifier.py

diff --git a/build_db.py b/build_db.py
index 0726bc9..002df7b 100755
--- a/build_db.py
+++ b/build_db.py
@@ -4,7 +4,7 @@
 import json
 import sqlite3
 import argparse
-from knn_classifier import Classifier
+from classifier import Classifier
 from multi_classifier import MultiClassifier
 from prepare_training_data import CLIENTS
 
@@ -370,7 +370,7 @@ def main():
     if args.multi_classifier:
         classifier = MultiClassifier(data_dir)
     else:
-        print("loading single KNN classifier")
+        print("loading single classifier")
         classifier = Classifier(data_dir)
         print("loaded")
 
diff --git a/classifier.py b/classifier.py
new file mode 100755
index 0000000..072b4b9
--- /dev/null
+++ b/classifier.py
@@ -0,0 +1,370 @@
+#!/usr/bin/env python3
+
+import os
+import json
+import itertools
+import argparse
+import numpy as np
+import matplotlib.pyplot as plt
+import pickle
+
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.model_selection import cross_validate
+from feature_selection import *  # noqa F403
+from feature_selection import ALL_FEATURES
+from prepare_training_data import CLIENTS, classify_reward_by_graffiti
+
+K = 9
+WEIGHTS = "distance"
+
+MIN_GUESS_THRESHOLD = 0.20
+CONFIDENCE_THRESHOLD = 0.95
+
+DEFAULT_FEATURES = [
+    "percent_redundant_boost",
+    "difflib_rewards",
+    "difflib_slot",
+    "difflib_slot_rev",
+]
+
+DEFAULT_GRAFFITI_ONLY = ["Lodestar"]
+
+VIABLE_FEATURES = [
+    "percent_redundant_boost",
+    "percent_pairwise_ordered",
+    "difflib_rewards",
+    "difflib_slot_index",
+    "difflib_index_slot",
+    "difflib_slot_index_rev",
+    "difflib_index_slot_rev",
+    "difflib_slot",
+    "difflib_slot_rev",
+    "spearman_correlation",
+    "norm_reward",
+    "mean_density",
+    "percent_single_bit",
+    "difflib_slot_reward",
+    "difflib_slot_reward_rev",
+]
+
+
+def all_feature_vecs_with_dimension(dimension):
+    return sorted(map(list, itertools.combinations(VIABLE_FEATURES, dimension)))
+
+
+def all_client_groupings_with_dimension(enabled_clients, dimension):
+    return sorted(map(list, itertools.combinations(enabled_clients, dimension)))
+
+
+def into_feature_row(block_reward, features):
+    return [ALL_FEATURES[feature](block_reward) for feature in features]
+
+
+class Classifier:
+    def __init__(
+        self,
+        data_dir,
+        grouped_clients=[],
+        disabled_clients=[],
+        graffiti_only_clients=DEFAULT_GRAFFITI_ONLY,
+        features=DEFAULT_FEATURES,
+        enable_cv=False,
+        classifier_type='knn'
+    ):
+        graffiti_only_clients = set(graffiti_only_clients)
+
+        assert (
+            set(disabled_clients) & graffiti_only_clients == set()
+        ), "clients must not be both graffiti-only and disabled"
+        assert (
+            set(disabled_clients) & set(grouped_clients) == set()
+        ), "clients must not be both disabled and grouped"
+        assert (
+            set(grouped_clients) & graffiti_only_clients == set()
+        ), "clients must not be both graffiti-only and grouped"
+
+        assert (
+            classifier_type in ["knn", "mlp"]
+        ), "classifier_type must be knn or mlp"
+
+        feature_matrix = []
+        training_labels = []
+
+        enabled_clients = []
+        other_index = CLIENTS.index("Other")
+
+        for i, client in enumerate(CLIENTS):
+            if client in disabled_clients or client in graffiti_only_clients:
+                continue
+
+            client_dir = os.path.join(data_dir, client)
+
+            if os.path.exists(client_dir):
+                if client not in grouped_clients:
+                    enabled_clients.append(client)
+            else:
+                if client == "Other" and len(grouped_clients) > 0:
+                    enabled_clients.append(client)
+                continue
+
+            for reward_file in os.listdir(client_dir):
+                with open(os.path.join(client_dir, reward_file), "r") as f:
+                    block_reward = json.load(f)
+
+                feature_row = into_feature_row(block_reward, features)
+                feature_matrix.append(feature_row)
+
+                # print(f"{client}: {feature_row}")
+
+                if client in grouped_clients:
+                    training_labels.append(other_index)
+                else:
+                    training_labels.append(i)
+
+        feature_matrix = np.array(feature_matrix)
+
+        classifier = KNeighborsClassifier(n_neighbors=K, weights=WEIGHTS)
+
+        if enable_cv:
+            self.scores = cross_validate(
+                classifier, feature_matrix, training_labels, scoring="balanced_accuracy"
+            )
+        else:
+            self.scores = None
+
+        classifier.fit(feature_matrix, training_labels)
+
+        self.classifier = classifier
+        self.enabled_clients = enabled_clients
+        self.graffiti_only_clients = set(graffiti_only_clients)
+        self.features = features
+
+        self.feature_matrix = feature_matrix
+        self.training_labels = training_labels
+
+    def classify(self, block_reward):
+        graffiti_guess = classify_reward_by_graffiti(block_reward)
+
+        if graffiti_guess in self.graffiti_only_clients:
+            prob_by_client = {graffiti_guess: 1.0}
+            return (graffiti_guess, graffiti_guess, prob_by_client, graffiti_guess)
+
+        row = into_feature_row(block_reward, self.features)
+        res = self.classifier.predict_proba([row])
+
+        prob_by_client = {
+            client: res[0][i] for i, client in enumerate(self.enabled_clients)
+        }
+
+        multilabel = compute_multilabel(
+            compute_guess_list(prob_by_client, self.enabled_clients)
+        )
+
+        label = compute_best_guess(prob_by_client)
+
+        return (label, multilabel, prob_by_client, graffiti_guess)
+
+    def plot_feature_matrix(self, output_path):
+        fig = plt.figure()
+
+        ax = fig.add_subplot(projection="3d")
+
+        x = self.feature_matrix[:, 0]
+        y = self.feature_matrix[:, 1]
+        z = self.feature_matrix[:, 2]
+
+        scatter = ax.scatter(
+            x, y, z, c=self.training_labels, marker=".", alpha=0.25, cmap="Set1"
+        )
+
+        handles, _ = scatter.legend_elements()
+        labels = self.enabled_clients
+
+        legend1 = ax.legend(handles, labels, loc="best", title="Client")
+        ax.add_artist(legend1)
+
+        assert (
+            len(self.features) == 3
+        ), "must have exactly 3 features selected for plotting"
+        ax.set_xlabel(self.features[0])
+        ax.set_ylabel(self.features[1])
+        ax.set_zlabel(self.features[2])
+
+        if output_path is None:
+            fig.show()
+        else:
+            fig.savefig(output_path)
+
+
+def compute_guess_list(probability_map, enabled_clients) -> list:
+    guesses = []
+    for client in enabled_clients:
+        if probability_map[client] > CONFIDENCE_THRESHOLD:
+            return [client]
+        elif probability_map[client] > MIN_GUESS_THRESHOLD:
+            guesses.append(client)
+    return guesses
+
+
+def compute_multilabel(guess_list):
+    if len(guess_list) == 1:
+        return guess_list[0]
+    elif len(guess_list) == 2:
+        return f"{guess_list[0]} or {guess_list[1]}"
+    else:
+        return "Uncertain"
+
+
+def compute_best_guess(probability_map) -> str:
+    return max(
+        probability_map.keys(),
+        key=lambda client: probability_map[client],
+        default="Uncertain",
+    )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("Classifier testing and cross validation")
+
+    parser.add_argument("data_dir", help="training data directory")
+    parser.add_argument("--classify", help="data to classify")
+    parser.add_argument(
+        "--cv", action="store_true", dest="enable_cv", help="enable cross validation"
+    )
+    parser.add_argument(
+        "--cv-group", default=0, type=int, help="number of clients to group for CV"
+    )
+    parser.add_argument(
+        "--cv-num-features", type=int, help="feature dimensionality for CV"
+    )
+    parser.add_argument(
+        "--group", default=[], nargs="+", help="clients to group during classification"
+    )
+    parser.add_argument(
+        "--persist",
+        action="store_true",
+        dest="should_persist",
+        help="if provided, the model is persisted",
+    )
+    parser.add_argument(
+        "--disable",
+        default=[],
+        nargs="+",
+        help="clients to disable during cross validation",
+    )
+    parser.add_argument(
+        "--graffiti-only",
+        default=DEFAULT_GRAFFITI_ONLY,
+        nargs="+",
+        help="clients to classify based on graffiti only",
+    )
+    parser.add_argument(
+        "--plot",
+        type=str,
+        help="output plot of 3D training data vectors (only works with --classify)",
+    )
+    return parser.parse_args()
+
+
+def persist_classifier(classifier: Classifier, name: str) -> None:
+    try:
+        filename = f"{name}.pkl"
+        with open(filename, "wb") as fid:
+            pickle.dump(classifier, fid)
+    except Exception as e:
+        print(f"Failed to persist classifier due to {e}")
+
+
+def main():
+    args = parse_args()
+    data_dir = args.data_dir
+    classify_dir = args.classify
+    enable_cv = args.enable_cv
+    num_grouped = args.cv_group
+    num_features = args.cv_num_features
+    grouped_clients = args.group
+    should_persist = args.should_persist
+    graffiti_only = args.graffiti_only
+
+    disabled_clients = args.disable
+    enabled_clients = [
+        client
+        for client in CLIENTS
+        if client not in disabled_clients and client != "Other"
+    ]
+
+    if enable_cv:
+        best_score = 0.0
+        best_features = None
+
+        print("performing cross validation")
+        if num_features is None:
+            feature_vecs = [DEFAULT_FEATURES]
+        else:
+            feature_vecs = all_feature_vecs_with_dimension(num_features)
+
+        for grouped_clients in all_client_groupings_with_dimension(
+            enabled_clients, num_grouped
+        ):
+            for feature_vec in feature_vecs:
+                print(f"features: {feature_vec}")
+                classifier = Classifier(
+                    data_dir,
+                    grouped_clients=grouped_clients,
+                    disabled_clients=disabled_clients,
+                    graffiti_only_clients=graffiti_only,
+                    features=feature_vec,
+                    enable_cv=True,
+                )
+                print(f"enabled clients: {classifier.enabled_clients}")
+                print(f"classifier scores: {classifier.scores['test_score']}")
+
+                min_score = min(classifier.scores["test_score"])
+
+                if min_score > best_score:
+                    best_features = feature_vec
+                    best_score = min_score
+
+        print(f"best features found: {best_features}")
+        print(f"score: {best_score}")
+        return
+
+    assert classify_dir is not None, "classify dir required"
+    print(f"classifying all data in directory {classify_dir}")
+    print(f"grouped clients: {grouped_clients}")
+    classifier = Classifier(data_dir, grouped_clients=grouped_clients)
+
+    if args.plot is not None:
+        classifier.plot_feature_matrix(args.plot)
+        print("plot of training data written to {}".format(args.plot))
+
+    frequency_map = {}
+    total_blocks = 0
+
+    for input_file in os.listdir(classify_dir):
+        print(f"classifying rewards from file {input_file}")
+        with open(os.path.join(classify_dir, input_file), "r") as f:
+            block_rewards = json.load(f)
+
+        for block_reward in block_rewards:
+            _, multilabel, _, _ = classifier.classify(block_reward)
+
+            if multilabel not in frequency_map:
+                frequency_map[multilabel] = 0
+
+            frequency_map[multilabel] += 1
+
+        total_blocks += len(block_rewards)
+
+    print(f"total blocks processed: {total_blocks}")
+
+    if should_persist:
+        persist_classifier(classifier, "classifier")
+
+    for multilabel, num_blocks in sorted(frequency_map.items()):
+        percentage = round(num_blocks / total_blocks, 4)
+        print(f"{multilabel},{percentage}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/compute_periods.py b/compute_periods.py
index f2b708c..192e3ae 100644
--- a/compute_periods.py
+++ b/compute_periods.py
@@ -5,7 +5,7 @@
 import sqlite3
 import requests
 import statistics
-from knn_classifier import compute_best_guess
+from classifier import compute_best_guess
 from prepare_training_data import CLIENTS
 from build_db import block_row_to_obj
 
diff --git a/interactive.ipynb b/interactive.ipynb
index c6f591b..2ea53a5 100644
--- a/interactive.ipynb
+++ b/interactive.ipynb
@@ -7,7 +7,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from knn_classifier import Classifier, DEFAULT_FEATURES"
+    "from classifier import Classifier, DEFAULT_FEATURES"
    ]
   },
   {
diff --git a/multi_classifier.py b/multi_classifier.py
index 4ebfc82..d898734 100644
--- a/multi_classifier.py
+++ b/multi_classifier.py
@@ -1,6 +1,6 @@
 import os
 
-from knn_classifier import Classifier
+from classifier import Classifier
 
 
 def start_and_end_slot(sub_dir_name) -> (int, int):
diff --git a/prepare_training_data.py b/prepare_training_data.py
index c7e5941..a9a2c63 100755
--- a/prepare_training_data.py
+++ b/prepare_training_data.py
@@ -75,13 +75,13 @@ def process_file(
 
 
 def parse_args():
-    parser = argparse.ArgumentParser("create training data for the KNN classifier")
+    parser = argparse.ArgumentParser("create training data for the classifier")
 
     parser.add_argument(
         "raw_data_dir", help="input containing data to classify using graffiti"
     )
     parser.add_argument(
-        "proc_data_dir", help="output for processed data, suitable for KNN training"
+        "proc_data_dir", help="output for processed data, suitable for training"
     )
     parser.add_argument(
         "--disable",
diff --git a/tests/test_classifier_persister.py b/tests/test_classifier_persister.py
index 9de8d27..d2c8db4 100644
--- a/tests/test_classifier_persister.py
+++ b/tests/test_classifier_persister.py
@@ -2,7 +2,7 @@
 import json
 import os
 from typing import Any, Dict, List
-from knn_classifier import Classifier, persist_classifier
+from classifier import Classifier, persist_classifier
 from prepare_training_data import CLIENTS
 
 

From 58ee23249e1ed951db628f32156e0c39de91e686 Mon Sep 17 00:00:00 2001
From: santi1234567 <45318759+santi1234567@users.noreply.github.com>
Date: Tue, 30 Apr 2024 21:19:43 +0200
Subject: [PATCH 2/6] remove deprecated file

---
 knn_classifier.py | 365 ----------------------------------------------
 1 file changed, 365 deletions(-)
 delete mode 100755 knn_classifier.py

diff --git a/knn_classifier.py b/knn_classifier.py
deleted file mode 100755
index d43aa28..0000000
--- a/knn_classifier.py
+++ /dev/null
@@ -1,365 +0,0 @@
-#!/usr/bin/env python3
-
-import os
-import json
-import itertools
-import argparse
-import numpy as np
-import matplotlib.pyplot as plt
-import pickle
-
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.model_selection import cross_validate
-from feature_selection import *  # noqa F403
-from feature_selection import ALL_FEATURES
-from prepare_training_data import CLIENTS, classify_reward_by_graffiti
-
-K = 9
-WEIGHTS = "distance"
-
-MIN_GUESS_THRESHOLD = 0.20
-CONFIDENCE_THRESHOLD = 0.95
-
-DEFAULT_FEATURES = [
-    "percent_redundant_boost",
-    "difflib_rewards",
-    "difflib_slot",
-    "difflib_slot_rev",
-]
-
-DEFAULT_GRAFFITI_ONLY = ["Lodestar"]
-
-VIABLE_FEATURES = [
-    "percent_redundant_boost",
-    "percent_pairwise_ordered",
-    "difflib_rewards",
-    "difflib_slot_index",
-    "difflib_index_slot",
-    "difflib_slot_index_rev",
-    "difflib_index_slot_rev",
-    "difflib_slot",
-    "difflib_slot_rev",
-    "spearman_correlation",
-    "norm_reward",
-    "mean_density",
-    "percent_single_bit",
-    "difflib_slot_reward",
-    "difflib_slot_reward_rev",
-]
-
-
-def all_feature_vecs_with_dimension(dimension):
-    return sorted(map(list, itertools.combinations(VIABLE_FEATURES, dimension)))
-
-
-def all_client_groupings_with_dimension(enabled_clients, dimension):
-    return sorted(map(list, itertools.combinations(enabled_clients, dimension)))
-
-
-def into_feature_row(block_reward, features):
-    return [ALL_FEATURES[feature](block_reward) for feature in features]
-
-
-class Classifier:
-    def __init__(
-        self,
-        data_dir,
-        grouped_clients=[],
-        disabled_clients=[],
-        graffiti_only_clients=DEFAULT_GRAFFITI_ONLY,
-        features=DEFAULT_FEATURES,
-        enable_cv=False,
-    ):
-        graffiti_only_clients = set(graffiti_only_clients)
-
-        assert (
-            set(disabled_clients) & graffiti_only_clients == set()
-        ), "clients must not be both graffiti-only and disabled"
-        assert (
-            set(disabled_clients) & set(grouped_clients) == set()
-        ), "clients must not be both disabled and grouped"
-        assert (
-            set(grouped_clients) & graffiti_only_clients == set()
-        ), "clients must not be both graffiti-only and grouped"
-
-        feature_matrix = []
-        training_labels = []
-
-        enabled_clients = []
-        other_index = CLIENTS.index("Other")
-
-        for i, client in enumerate(CLIENTS):
-            if client in disabled_clients or client in graffiti_only_clients:
-                continue
-
-            client_dir = os.path.join(data_dir, client)
-
-            if os.path.exists(client_dir):
-                if client not in grouped_clients:
-                    enabled_clients.append(client)
-            else:
-                if client == "Other" and len(grouped_clients) > 0:
-                    enabled_clients.append(client)
-                continue
-
-            for reward_file in os.listdir(client_dir):
-                with open(os.path.join(client_dir, reward_file), "r") as f:
-                    block_reward = json.load(f)
-
-                feature_row = into_feature_row(block_reward, features)
-                feature_matrix.append(feature_row)
-
-                # print(f"{client}: {feature_row}")
-
-                if client in grouped_clients:
-                    training_labels.append(other_index)
-                else:
-                    training_labels.append(i)
-
-        feature_matrix = np.array(feature_matrix)
-
-        knn = KNeighborsClassifier(n_neighbors=K, weights=WEIGHTS)
-
-        if enable_cv:
-            self.scores = cross_validate(
-                knn, feature_matrix, training_labels, scoring="balanced_accuracy"
-            )
-        else:
-            self.scores = None
-
-        knn.fit(feature_matrix, training_labels)
-
-        self.knn = knn
-        self.enabled_clients = enabled_clients
-        self.graffiti_only_clients = set(graffiti_only_clients)
-        self.features = features
-
-        self.feature_matrix = feature_matrix
-        self.training_labels = training_labels
-
-    def classify(self, block_reward):
-        graffiti_guess = classify_reward_by_graffiti(block_reward)
-
-        if graffiti_guess in self.graffiti_only_clients:
-            prob_by_client = {graffiti_guess: 1.0}
-            return (graffiti_guess, graffiti_guess, prob_by_client, graffiti_guess)
-
-        row = into_feature_row(block_reward, self.features)
-        res = self.knn.predict_proba([row])
-
-        prob_by_client = {
-            client: res[0][i] for i, client in enumerate(self.enabled_clients)
-        }
-
-        multilabel = compute_multilabel(
-            compute_guess_list(prob_by_client, self.enabled_clients)
-        )
-
-        label = compute_best_guess(prob_by_client)
-
-        return (label, multilabel, prob_by_client, graffiti_guess)
-
-    def plot_feature_matrix(self, output_path):
-        fig = plt.figure()
-
-        ax = fig.add_subplot(projection="3d")
-
-        x = self.feature_matrix[:, 0]
-        y = self.feature_matrix[:, 1]
-        z = self.feature_matrix[:, 2]
-
-        scatter = ax.scatter(
-            x, y, z, c=self.training_labels, marker=".", alpha=0.25, cmap="Set1"
-        )
-
-        handles, _ = scatter.legend_elements()
-        labels = self.enabled_clients
-
-        legend1 = ax.legend(handles, labels, loc="best", title="Client")
-        ax.add_artist(legend1)
-
-        assert (
-            len(self.features) == 3
-        ), "must have exactly 3 features selected for plotting"
-        ax.set_xlabel(self.features[0])
-        ax.set_ylabel(self.features[1])
-        ax.set_zlabel(self.features[2])
-
-        if output_path is None:
-            fig.show()
-        else:
-            fig.savefig(output_path)
-
-
-def compute_guess_list(probability_map, enabled_clients) -> list:
-    guesses = []
-    for client in enabled_clients:
-        if probability_map[client] > CONFIDENCE_THRESHOLD:
-            return [client]
-        elif probability_map[client] > MIN_GUESS_THRESHOLD:
-            guesses.append(client)
-    return guesses
-
-
-def compute_multilabel(guess_list):
-    if len(guess_list) == 1:
-        return guess_list[0]
-    elif len(guess_list) == 2:
-        return f"{guess_list[0]} or {guess_list[1]}"
-    else:
-        return "Uncertain"
-
-
-def compute_best_guess(probability_map) -> str:
-    return max(
-        probability_map.keys(),
-        key=lambda client: probability_map[client],
-        default="Uncertain",
-    )
-
-
-def parse_args():
-    parser = argparse.ArgumentParser("KNN testing and cross validation")
-
-    parser.add_argument("data_dir", help="training data directory")
-    parser.add_argument("--classify", help="data to classify")
-    parser.add_argument(
-        "--cv", action="store_true", dest="enable_cv", help="enable cross validation"
-    )
-    parser.add_argument(
-        "--cv-group", default=0, type=int, help="number of clients to group for CV"
-    )
-    parser.add_argument(
-        "--cv-num-features", type=int, help="feature dimensionality for CV"
-    )
-    parser.add_argument(
-        "--group", default=[], nargs="+", help="clients to group during classification"
-    )
-    parser.add_argument(
-        "--persist",
-        action="store_true",
-        dest="should_persist",
-        help="if provided, the model is persisted",
-    )
-    parser.add_argument(
-        "--disable",
-        default=[],
-        nargs="+",
-        help="clients to disable during cross validation",
-    )
-    parser.add_argument(
-        "--graffiti-only",
-        default=DEFAULT_GRAFFITI_ONLY,
-        nargs="+",
-        help="clients to classify based on graffiti only",
-    )
-    parser.add_argument(
-        "--plot",
-        type=str,
-        help="output plot of 3D training data vectors (only works with --classify)",
-    )
-    return parser.parse_args()
-
-
-def persist_classifier(classifier: Classifier, name: str) -> None:
-    try:
-        filename = f"{name}.pkl"
-        with open(filename, "wb") as fid:
-            pickle.dump(classifier, fid)
-    except Exception as e:
-        print(f"Failed to persist classifier due to {e}")
-
-
-def main():
-    args = parse_args()
-    data_dir = args.data_dir
-    classify_dir = args.classify
-    enable_cv = args.enable_cv
-    num_grouped = args.cv_group
-    num_features = args.cv_num_features
-    grouped_clients = args.group
-    should_persist = args.should_persist
-    graffiti_only = args.graffiti_only
-
-    disabled_clients = args.disable
-    enabled_clients = [
-        client
-        for client in CLIENTS
-        if client not in disabled_clients and client != "Other"
-    ]
-
-    if enable_cv:
-        best_score = 0.0
-        best_features = None
-
-        print("performing cross validation")
-        if num_features is None:
-            feature_vecs = [DEFAULT_FEATURES]
-        else:
-            feature_vecs = all_feature_vecs_with_dimension(num_features)
-
-        for grouped_clients in all_client_groupings_with_dimension(
-            enabled_clients, num_grouped
-        ):
-            for feature_vec in feature_vecs:
-                print(f"features: {feature_vec}")
-                classifier = Classifier(
-                    data_dir,
-                    grouped_clients=grouped_clients,
-                    disabled_clients=disabled_clients,
-                    graffiti_only_clients=graffiti_only,
-                    features=feature_vec,
-                    enable_cv=True,
-                )
-                print(f"enabled clients: {classifier.enabled_clients}")
-                print(f"classifier scores: {classifier.scores['test_score']}")
-
-                min_score = min(classifier.scores["test_score"])
-
-                if min_score > best_score:
-                    best_features = feature_vec
-                    best_score = min_score
-
-        print(f"best features found: {best_features}")
-        print(f"score: {best_score}")
-        return
-
-    assert classify_dir is not None, "classify dir required"
-    print(f"classifying all data in directory {classify_dir}")
-    print(f"grouped clients: {grouped_clients}")
-    classifier = Classifier(data_dir, grouped_clients=grouped_clients)
-
-    if args.plot is not None:
-        classifier.plot_feature_matrix(args.plot)
-        print("plot of training data written to {}".format(args.plot))
-
-    frequency_map = {}
-    total_blocks = 0
-
-    for input_file in os.listdir(classify_dir):
-        print(f"classifying rewards from file {input_file}")
-        with open(os.path.join(classify_dir, input_file), "r") as f:
-            block_rewards = json.load(f)
-
-        for block_reward in block_rewards:
-            _, multilabel, _, _ = classifier.classify(block_reward)
-
-            if multilabel not in frequency_map:
-                frequency_map[multilabel] = 0
-
-            frequency_map[multilabel] += 1
-
-        total_blocks += len(block_rewards)
-
-    print(f"total blocks processed: {total_blocks}")
-
-    if should_persist:
-        persist_classifier(classifier, "knn_classifier")
-
-    for multilabel, num_blocks in sorted(frequency_map.items()):
-        percentage = round(num_blocks / total_blocks, 4)
-        print(f"{multilabel},{percentage}")
-
-
-if __name__ == "__main__":
-    main()

From 268a3d0b3161c7e11559ff2f299aaf1e2178c36f Mon Sep 17 00:00:00 2001
From: santi1234567 <45318759+santi1234567@users.noreply.github.com>
Date: Thu, 2 May 2024 14:53:35 +0200
Subject: [PATCH 3/6] support changing classifier type

---
 classifier.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/classifier.py b/classifier.py
index 072b4b9..ae5a446 100755
--- a/classifier.py
+++ b/classifier.py
@@ -9,12 +9,16 @@
 import pickle
 
 from sklearn.neighbors import KNeighborsClassifier
+from sklearn.neural_network import MLPClassifier
 from sklearn.model_selection import cross_validate
 from feature_selection import *  # noqa F403
 from feature_selection import ALL_FEATURES
 from prepare_training_data import CLIENTS, classify_reward_by_graffiti
 
 K = 9
+
+MLP_HIDDEN_LAYER_SIZES=(390, 870)
+
 WEIGHTS = "distance"
 
 MIN_GUESS_THRESHOLD = 0.20
@@ -69,7 +73,8 @@ def __init__(
         graffiti_only_clients=DEFAULT_GRAFFITI_ONLY,
         features=DEFAULT_FEATURES,
         enable_cv=False,
-        classifier_type='knn'
+        classifier_type='knn',
+        hidden_layer_sizes=MLP_HIDDEN_LAYER_SIZES 
     ):
         graffiti_only_clients = set(graffiti_only_clients)
 
@@ -123,7 +128,13 @@ def __init__(
 
         feature_matrix = np.array(feature_matrix)
 
-        classifier = KNeighborsClassifier(n_neighbors=K, weights=WEIGHTS)
+        if classifier_type == 'knn':
+            classifier = KNeighborsClassifier(n_neighbors=K, weights=WEIGHTS)
+        elif classifier_type == 'mlp':
+            classifier = MLPClassifier(
+                hidden_layer_sizes=hidden_layer_sizes, max_iter=1000
+            )
+        # Assert above makes sure that classifier_type is one of the valid types 
 
         if enable_cv:
             self.scores = cross_validate(

From 52a3545a7d9d9659ded17e6c0ebaa6cfa1ff906d Mon Sep 17 00:00:00 2001
From: santi1234567 <45318759+santi1234567@users.noreply.github.com>
Date: Thu, 2 May 2024 21:08:06 +0200
Subject: [PATCH 4/6] add classifier type flag

---
 classifier.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/classifier.py b/classifier.py
index ae5a446..6b0981b 100755
--- a/classifier.py
+++ b/classifier.py
@@ -251,6 +251,9 @@ def parse_args():
     parser.add_argument(
         "--group", default=[], nargs="+", help="clients to group during classification"
     )
+    parser.add_argument(
+        "--classifier-type", default="knn", choices=["knn", "mlp"], help="the type of classifier to use"
+    )
     parser.add_argument(
         "--persist",
         action="store_true",
@@ -296,7 +299,7 @@ def main():
     grouped_clients = args.group
     should_persist = args.should_persist
     graffiti_only = args.graffiti_only
-
+    classifier_type = args.classifier_type
     disabled_clients = args.disable
     enabled_clients = [
         client
@@ -326,6 +329,7 @@ def main():
                     graffiti_only_clients=graffiti_only,
                     features=feature_vec,
                     enable_cv=True,
+                    classifier_type=classifier_type
                 )
                 print(f"enabled clients: {classifier.enabled_clients}")
                 print(f"classifier scores: {classifier.scores['test_score']}")
@@ -343,7 +347,8 @@ def main():
     assert classify_dir is not None, "classify dir required"
     print(f"classifying all data in directory {classify_dir}")
     print(f"grouped clients: {grouped_clients}")
-    classifier = Classifier(data_dir, grouped_clients=grouped_clients)
+    classifier = Classifier(data_dir, grouped_clients=grouped_clients,
+                    classifier_type=classifier_type)
 
     if args.plot is not None:
         classifier.plot_feature_matrix(args.plot)

From 9c2c8a08665715543f853c0efbb9f4d21a914fcf Mon Sep 17 00:00:00 2001
From: santi1234567 <45318759+santi1234567@users.noreply.github.com>
Date: Thu, 2 May 2024 21:08:12 +0200
Subject: [PATCH 5/6] update readme

---
 README.md | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 8ec1b58..461e4e9 100644
--- a/README.md
+++ b/README.md
@@ -36,11 +36,16 @@ pip install -r requirements.txt
 pip install -r requirements-dev.txt
 ```
 
-### k-NN Classifier
+### The Classifier
 
-Blockprint's classifier is a k-nearest neighbours classifier in `knn_classifier.py`.
+Blockprint's classifier utilizes one of two machine learning algorithms:
 
-See `./knn_classifier.py --help` for command line options including cross
+- k-nearest neighbours (k-NN)
+- Multi-layer perceptron (MLP)
+
+Select one with the `--classifier-type` flag of `classifier.py` (`knn`, the default, or `mlp`).
+
+See `./classifier.py --help` for more command line options including cross
 validation (CV) and manual classification.
 
 ### Training the Classifier
@@ -81,10 +86,10 @@ testdata_proc
     └── 0x7fedb0da9699c93ce66966555c6719e1159ae7b3220c7053a08c8f50e2f3f56f.json
 ```
 
-You can then use this directory as the datadir argument to `./knn_classifier.py`:
+You can then use this directory as the datadir argument to `./classifier.py`:
 
 ```
-./knn_classifier.py testdata_proc --classify testdata
+./classifier.py testdata_proc --classify testdata
 ```
 
 If you then want to use the classifier to build an sqlite database:

From 3eac126b0d46d54f73963ba4d151882c7b76a003 Mon Sep 17 00:00:00 2001
From: santi1234567 <45318759+santi1234567@users.noreply.github.com>
Date: Tue, 14 May 2024 16:32:57 +0200
Subject: [PATCH 6/6] linting

---
 classifier.py     | 28 +++++++++++++++-------------
 interactive.ipynb |  2 +-
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/classifier.py b/classifier.py
index 6b0981b..9463b0f 100755
--- a/classifier.py
+++ b/classifier.py
@@ -17,7 +17,7 @@
 
 K = 9
 
-MLP_HIDDEN_LAYER_SIZES=(390, 870)
+MLP_HIDDEN_LAYER_SIZES = (390, 870)
 
 WEIGHTS = "distance"
 
@@ -73,8 +73,8 @@ def __init__(
         graffiti_only_clients=DEFAULT_GRAFFITI_ONLY,
         features=DEFAULT_FEATURES,
         enable_cv=False,
-        classifier_type='knn',
-        hidden_layer_sizes=MLP_HIDDEN_LAYER_SIZES 
+        classifier_type="knn",
+        hidden_layer_sizes=MLP_HIDDEN_LAYER_SIZES,
     ):
         graffiti_only_clients = set(graffiti_only_clients)
 
@@ -88,9 +88,7 @@ def __init__(
             set(grouped_clients) & graffiti_only_clients == set()
         ), "clients must not be both graffiti-only and grouped"
 
-        assert (
-            classifier_type in ["knn", "mlp"]
-        ), "classifier_type must be knn or mlp"
+        assert classifier_type in ["knn", "mlp"], "classifier_type must be knn or mlp"
 
         feature_matrix = []
         training_labels = []
@@ -128,13 +126,13 @@ def __init__(
 
         feature_matrix = np.array(feature_matrix)
 
-        if classifier_type == 'knn':
+        if classifier_type == "knn":
             classifier = KNeighborsClassifier(n_neighbors=K, weights=WEIGHTS)
-        elif classifier_type == 'mlp':
+        elif classifier_type == "mlp":
             classifier = MLPClassifier(
                 hidden_layer_sizes=hidden_layer_sizes, max_iter=1000
             )
-        # Assert above makes sure that classifier_type is one of the valid types 
+        # Assert above makes sure that classifier_type is one of the valid types
 
         if enable_cv:
             self.scores = cross_validate(
@@ -252,7 +250,10 @@ def parse_args():
         "--group", default=[], nargs="+", help="clients to group during classification"
     )
     parser.add_argument(
-        "--classifier-type", default="knn", choices=["knn", "mlp"], help="the type of classifier to use"
+        "--classifier-type",
+        default="knn",
+        choices=["knn", "mlp"],
+        help="the type of classifier to use",
     )
     parser.add_argument(
         "--persist",
@@ -329,7 +330,7 @@ def main():
                     graffiti_only_clients=graffiti_only,
                     features=feature_vec,
                     enable_cv=True,
-                    classifier_type=classifier_type
+                    classifier_type=classifier_type,
                 )
                 print(f"enabled clients: {classifier.enabled_clients}")
                 print(f"classifier scores: {classifier.scores['test_score']}")
@@ -347,8 +348,9 @@ def main():
     assert classify_dir is not None, "classify dir required"
     print(f"classifying all data in directory {classify_dir}")
     print(f"grouped clients: {grouped_clients}")
-    classifier = Classifier(data_dir, grouped_clients=grouped_clients,
-                    classifier_type=classifier_type)
+    classifier = Classifier(
+        data_dir, grouped_clients=grouped_clients, classifier_type=classifier_type
+    )
 
     if args.plot is not None:
         classifier.plot_feature_matrix(args.plot)
diff --git a/interactive.ipynb b/interactive.ipynb
index 2ea53a5..9facf34 100644
--- a/interactive.ipynb
+++ b/interactive.ipynb
@@ -19,7 +19,7 @@
    "source": [
     "datadir = \"data/mainnet/training/slots_3481601_to_3702784_bal2x\"\n",
     "disabled_clients = []\n",
-    "features = ['percent_redundant', 'percent_pairwise_ordered', 'norm_reward']\n",
+    "features = [\"percent_redundant\", \"percent_pairwise_ordered\", \"norm_reward\"]\n",
     "\n",
     "classifier = Classifier(datadir, disabled_clients=disabled_clients, features=features)"
    ]