From 1dff16c08f84199d4ec3b89f1cd9f6724d5a45d9 Mon Sep 17 00:00:00 2001 From: santi1234567 <45318759+santi1234567@users.noreply.github.com> Date: Tue, 30 Apr 2024 21:15:18 +0200 Subject: [PATCH 1/6] rename knn_classifier to classifier --- build_db.py | 4 +- classifier.py | 370 +++++++++++++++++++++++++++++ compute_periods.py | 2 +- interactive.ipynb | 2 +- multi_classifier.py | 2 +- prepare_training_data.py | 4 +- tests/test_classifier_persister.py | 2 +- 7 files changed, 378 insertions(+), 8 deletions(-) create mode 100755 classifier.py diff --git a/build_db.py b/build_db.py index 0726bc9..002df7b 100755 --- a/build_db.py +++ b/build_db.py @@ -4,7 +4,7 @@ import json import sqlite3 import argparse -from knn_classifier import Classifier +from classifier import Classifier from multi_classifier import MultiClassifier from prepare_training_data import CLIENTS @@ -370,7 +370,7 @@ def main(): if args.multi_classifier: classifier = MultiClassifier(data_dir) else: - print("loading single KNN classifier") + print("loading single classifier") classifier = Classifier(data_dir) print("loaded") diff --git a/classifier.py b/classifier.py new file mode 100755 index 0000000..072b4b9 --- /dev/null +++ b/classifier.py @@ -0,0 +1,370 @@ +#!/usr/bin/env python3 + +import os +import json +import itertools +import argparse +import numpy as np +import matplotlib.pyplot as plt +import pickle + +from sklearn.neighbors import KNeighborsClassifier +from sklearn.model_selection import cross_validate +from feature_selection import * # noqa F403 +from feature_selection import ALL_FEATURES +from prepare_training_data import CLIENTS, classify_reward_by_graffiti + +K = 9 +WEIGHTS = "distance" + +MIN_GUESS_THRESHOLD = 0.20 +CONFIDENCE_THRESHOLD = 0.95 + +DEFAULT_FEATURES = [ + "percent_redundant_boost", + "difflib_rewards", + "difflib_slot", + "difflib_slot_rev", +] + +DEFAULT_GRAFFITI_ONLY = ["Lodestar"] + +VIABLE_FEATURES = [ + "percent_redundant_boost", + "percent_pairwise_ordered", + "difflib_rewards", + "difflib_slot_index", + "difflib_index_slot", + "difflib_slot_index_rev", + "difflib_index_slot_rev", + "difflib_slot", + "difflib_slot_rev", + "spearman_correlation", + "norm_reward", + "mean_density", + "percent_single_bit", + "difflib_slot_reward", + "difflib_slot_reward_rev", +] + + +def all_feature_vecs_with_dimension(dimension): + return sorted(map(list, itertools.combinations(VIABLE_FEATURES, dimension))) + + +def all_client_groupings_with_dimension(enabled_clients, dimension): + return sorted(map(list, itertools.combinations(enabled_clients, dimension))) + + +def into_feature_row(block_reward, features): + return [ALL_FEATURES[feature](block_reward) for feature in features] + + +class Classifier: + def __init__( + self, + data_dir, + grouped_clients=[], + disabled_clients=[], + graffiti_only_clients=DEFAULT_GRAFFITI_ONLY, + features=DEFAULT_FEATURES, + enable_cv=False, + classifier_type='knn' + ): + graffiti_only_clients = set(graffiti_only_clients) + + assert ( + set(disabled_clients) & graffiti_only_clients == set() + ), "clients must not be both graffiti-only and disabled" + assert ( + set(disabled_clients) & set(grouped_clients) == set() + ), "clients must not be both disabled and grouped" + assert ( + set(grouped_clients) & graffiti_only_clients == set() + ), "clients must not be both graffiti-only and grouped" + + assert ( + classifier_type in ["knn", "mlp"] + ), "classifier_type must be knn or mlp" + + feature_matrix = [] + training_labels = [] + + enabled_clients = [] + other_index = CLIENTS.index("Other") + + for i, client in enumerate(CLIENTS): + if client in disabled_clients or client in graffiti_only_clients: + continue + + client_dir = os.path.join(data_dir, client) + + if os.path.exists(client_dir): + if client not in grouped_clients: + enabled_clients.append(client) + else: + if client == "Other" and len(grouped_clients) > 0: + enabled_clients.append(client) + continue + + for reward_file in os.listdir(client_dir): + with open(os.path.join(client_dir, reward_file), "r") as f: + block_reward = json.load(f) + + feature_row = into_feature_row(block_reward, features) + feature_matrix.append(feature_row) + + # print(f"{client}: {feature_row}") + + if client in grouped_clients: + training_labels.append(other_index) + else: + training_labels.append(i) + + feature_matrix = np.array(feature_matrix) + + classifier = KNeighborsClassifier(n_neighbors=K, weights=WEIGHTS) + + if enable_cv: + self.scores = cross_validate( + classifier, feature_matrix, training_labels, scoring="balanced_accuracy" + ) + else: + self.scores = None + + classifier.fit(feature_matrix, training_labels) + + self.classifier = classifier + self.enabled_clients = enabled_clients + self.graffiti_only_clients = set(graffiti_only_clients) + self.features = features + + self.feature_matrix = feature_matrix + self.training_labels = training_labels + + def classify(self, block_reward): + graffiti_guess = classify_reward_by_graffiti(block_reward) + + if graffiti_guess in self.graffiti_only_clients: + prob_by_client = {graffiti_guess: 1.0} + return (graffiti_guess, graffiti_guess, prob_by_client, graffiti_guess) + + row = into_feature_row(block_reward, self.features) + res = self.classifier.predict_proba([row]) + + prob_by_client = { + client: res[0][i] for i, client in enumerate(self.enabled_clients) + } + + multilabel = compute_multilabel( + compute_guess_list(prob_by_client, self.enabled_clients) + ) + + label = compute_best_guess(prob_by_client) + + return (label, multilabel, prob_by_client, graffiti_guess) + + def plot_feature_matrix(self, output_path): + fig = plt.figure() + + ax = fig.add_subplot(projection="3d") + + x = self.feature_matrix[:, 0] + y = self.feature_matrix[:, 1] + z = self.feature_matrix[:, 2] + + scatter = ax.scatter( + x, y, z, c=self.training_labels, marker=".", alpha=0.25, cmap="Set1" + ) + + handles, _ = scatter.legend_elements() + labels = self.enabled_clients + + legend1 = ax.legend(handles, labels, loc="best", title="Client") + ax.add_artist(legend1) + + assert ( + len(self.features) == 3 + ), "must have exactly 3 features selected for plotting" + ax.set_xlabel(self.features[0]) + ax.set_ylabel(self.features[1]) + ax.set_zlabel(self.features[2]) + + if output_path is None: + fig.show() + else: + fig.savefig(output_path) + + +def compute_guess_list(probability_map, enabled_clients) -> list: + guesses = [] + for client in enabled_clients: + if probability_map[client] > CONFIDENCE_THRESHOLD: + return [client] + elif probability_map[client] > MIN_GUESS_THRESHOLD: + guesses.append(client) + return guesses + + +def compute_multilabel(guess_list): + if len(guess_list) == 1: + return guess_list[0] + elif len(guess_list) == 2: + return f"{guess_list[0]} or {guess_list[1]}" + else: + return "Uncertain" + + +def compute_best_guess(probability_map) -> str: + return max( + probability_map.keys(), + key=lambda client: probability_map[client], + default="Uncertain", + ) + + +def parse_args(): + parser = argparse.ArgumentParser("Classifier testing and cross validation") + + parser.add_argument("data_dir", help="training data directory") + parser.add_argument("--classify", help="data to classify") + parser.add_argument( + "--cv", action="store_true", dest="enable_cv", help="enable cross validation" + ) + parser.add_argument( + "--cv-group", default=0, type=int, help="number of clients to group for CV" + ) + parser.add_argument( + "--cv-num-features", type=int, help="feature dimensionality for CV" + ) + parser.add_argument( + "--group", default=[], nargs="+", help="clients to group during classification" + ) + parser.add_argument( + "--persist", + action="store_true", + dest="should_persist", + help="if provided, the model is persisted", + ) + parser.add_argument( + "--disable", + default=[], + nargs="+", + help="clients to disable during cross validation", + ) + parser.add_argument( + "--graffiti-only", + default=DEFAULT_GRAFFITI_ONLY, + nargs="+", + help="clients to classify based on graffiti only", + ) + parser.add_argument( + "--plot", + type=str, + help="output plot of 3D training data vectors (only works with --classify)", + ) + return parser.parse_args() + + +def persist_classifier(classifier: Classifier, name: str) -> None: + try: + filename = f"{name}.pkl" + with open(filename, "wb") as fid: + pickle.dump(classifier, fid) + except Exception as e: + print(f"Failed to persist classifier due to {e}") + + +def main(): + args = parse_args() + data_dir = args.data_dir + classify_dir = args.classify + enable_cv = args.enable_cv + num_grouped = args.cv_group + num_features = args.cv_num_features + grouped_clients = args.group + should_persist = args.should_persist + graffiti_only = args.graffiti_only + + disabled_clients = args.disable + enabled_clients = [ + client + for client in CLIENTS + if client not in disabled_clients and client != "Other" + ] + + if enable_cv: + best_score = 0.0 + best_features = None + + print("performing cross validation") + if num_features is None: + feature_vecs = [DEFAULT_FEATURES] + else: + feature_vecs = all_feature_vecs_with_dimension(num_features) + + for grouped_clients in all_client_groupings_with_dimension( + enabled_clients, num_grouped + ): + for feature_vec in feature_vecs: + print(f"features: {feature_vec}") + classifier = Classifier( + data_dir, + grouped_clients=grouped_clients, + disabled_clients=disabled_clients, + graffiti_only_clients=graffiti_only, + features=feature_vec, + enable_cv=True, + ) + print(f"enabled clients: {classifier.enabled_clients}") + print(f"classifier scores: {classifier.scores['test_score']}") + + min_score = min(classifier.scores["test_score"]) + + if min_score > best_score: + best_features = feature_vec + best_score = min_score + + print(f"best features found: {best_features}") + print(f"score: {best_score}") + return + + assert classify_dir is not None, "classify dir required" + print(f"classifying all data in directory {classify_dir}") + print(f"grouped clients: {grouped_clients}") + classifier = Classifier(data_dir, grouped_clients=grouped_clients) + + if args.plot is not None: + classifier.plot_feature_matrix(args.plot) + print("plot of training data written to {}".format(args.plot)) + + frequency_map = {} + total_blocks = 0 + + for input_file in os.listdir(classify_dir): + print(f"classifying rewards from file {input_file}") + with open(os.path.join(classify_dir, input_file), "r") as f: + block_rewards = json.load(f) + + for block_reward in block_rewards: + _, multilabel, _, _ = classifier.classify(block_reward) + + if multilabel not in frequency_map: + frequency_map[multilabel] = 0 + + frequency_map[multilabel] += 1 + + total_blocks += len(block_rewards) + + print(f"total blocks processed: {total_blocks}") + + if should_persist: + persist_classifier(classifier, "classifier") + + for multilabel, num_blocks in sorted(frequency_map.items()): + percentage = round(num_blocks / total_blocks, 4) + print(f"{multilabel},{percentage}") + + +if __name__ == "__main__": + main() diff --git a/compute_periods.py b/compute_periods.py index f2b708c..192e3ae 100644 --- a/compute_periods.py +++ b/compute_periods.py @@ -5,7 +5,7 @@ import sqlite3 import requests import statistics -from knn_classifier import compute_best_guess +from classifier import compute_best_guess from prepare_training_data import CLIENTS from build_db import block_row_to_obj diff --git a/interactive.ipynb b/interactive.ipynb index c6f591b..2ea53a5 100644 --- a/interactive.ipynb +++ b/interactive.ipynb @@ -7,7 +7,7 @@ "metadata": {}, "outputs": [], "source": [ - "from knn_classifier import Classifier, DEFAULT_FEATURES" + "from classifier import Classifier, DEFAULT_FEATURES" ] }, { diff --git a/multi_classifier.py b/multi_classifier.py index 4ebfc82..d898734 100644 --- a/multi_classifier.py +++ b/multi_classifier.py @@ -1,6 +1,6 @@ import os -from knn_classifier import Classifier +from classifier import Classifier def start_and_end_slot(sub_dir_name) -> (int, int): diff --git a/prepare_training_data.py b/prepare_training_data.py index c7e5941..a9a2c63 100755 --- a/prepare_training_data.py +++ b/prepare_training_data.py @@ -75,13 +75,13 @@ def process_file( def parse_args(): - parser = argparse.ArgumentParser("create training data for the KNN classifier") + parser = argparse.ArgumentParser("create training data for the classifier") parser.add_argument( "raw_data_dir", help="input containing data to classify using graffiti" ) parser.add_argument( - "proc_data_dir", help="output for processed data, suitable for KNN training" + "proc_data_dir", help="output for processed data, suitable for training" ) parser.add_argument( "--disable", diff --git a/tests/test_classifier_persister.py b/tests/test_classifier_persister.py index 9de8d27..d2c8db4 100644 --- a/tests/test_classifier_persister.py +++ b/tests/test_classifier_persister.py @@ -2,7 +2,7 @@ import json import os from typing import Any, Dict, List -from knn_classifier import Classifier, persist_classifier +from classifier import Classifier, persist_classifier from prepare_training_data import CLIENTS From 58ee23249e1ed951db628f32156e0c39de91e686 Mon Sep 17 00:00:00 2001 From: santi1234567 <45318759+santi1234567@users.noreply.github.com> Date: Tue, 30 Apr 2024 21:19:43 +0200 Subject: [PATCH 2/6] remove deprecated file --- knn_classifier.py | 365 ---------------------------------------------- 1 file changed, 365 deletions(-) delete mode 100755 knn_classifier.py diff --git a/knn_classifier.py b/knn_classifier.py deleted file mode 100755 index d43aa28..0000000 --- a/knn_classifier.py +++ /dev/null @@ -1,365 +0,0 @@ -#!/usr/bin/env python3 - -import os -import json -import itertools -import argparse -import numpy as np -import matplotlib.pyplot as plt -import pickle - -from sklearn.neighbors import KNeighborsClassifier -from sklearn.model_selection import cross_validate -from feature_selection import * # noqa F403 -from feature_selection import ALL_FEATURES -from prepare_training_data import CLIENTS, classify_reward_by_graffiti - -K = 9 -WEIGHTS = "distance" - -MIN_GUESS_THRESHOLD = 0.20 -CONFIDENCE_THRESHOLD = 0.95 - -DEFAULT_FEATURES = [ - "percent_redundant_boost", - "difflib_rewards", - "difflib_slot", - "difflib_slot_rev", -] - -DEFAULT_GRAFFITI_ONLY = ["Lodestar"] - -VIABLE_FEATURES = [ - "percent_redundant_boost", - "percent_pairwise_ordered", - "difflib_rewards", - "difflib_slot_index", - "difflib_index_slot", - "difflib_slot_index_rev", - "difflib_index_slot_rev", - "difflib_slot", - "difflib_slot_rev", - "spearman_correlation", - "norm_reward", - "mean_density", - "percent_single_bit", - "difflib_slot_reward", - "difflib_slot_reward_rev", -] - - -def all_feature_vecs_with_dimension(dimension): - return sorted(map(list, itertools.combinations(VIABLE_FEATURES, dimension))) - - -def all_client_groupings_with_dimension(enabled_clients, dimension): - return sorted(map(list, itertools.combinations(enabled_clients, dimension))) - - -def into_feature_row(block_reward, features): - return [ALL_FEATURES[feature](block_reward) for feature in features] - - -class Classifier: - def __init__( - self, - data_dir, - grouped_clients=[], - disabled_clients=[], - graffiti_only_clients=DEFAULT_GRAFFITI_ONLY, - features=DEFAULT_FEATURES, - enable_cv=False, - ): - graffiti_only_clients = set(graffiti_only_clients) - - assert ( - set(disabled_clients) & graffiti_only_clients == set() - ), "clients must not be both graffiti-only and disabled" - assert ( - set(disabled_clients) & set(grouped_clients) == set() - ), "clients must not be both disabled and grouped" - assert ( - set(grouped_clients) & graffiti_only_clients == set() - ), "clients must not be both graffiti-only and grouped" - - feature_matrix = [] - training_labels = [] - - enabled_clients = [] - other_index = CLIENTS.index("Other") - - for i, client in enumerate(CLIENTS): - if client in disabled_clients or client in graffiti_only_clients: - continue - - client_dir = os.path.join(data_dir, client) - - if os.path.exists(client_dir): - if client not in grouped_clients: - enabled_clients.append(client) - else: - if client == "Other" and len(grouped_clients) > 0: - enabled_clients.append(client) - continue - - for reward_file in os.listdir(client_dir): - with open(os.path.join(client_dir, reward_file), "r") as f: - block_reward = json.load(f) - - feature_row = into_feature_row(block_reward, features) - feature_matrix.append(feature_row) - - # print(f"{client}: {feature_row}") - - if client in grouped_clients: - training_labels.append(other_index) - else: - training_labels.append(i) - - feature_matrix = np.array(feature_matrix) - - knn = KNeighborsClassifier(n_neighbors=K, weights=WEIGHTS) - - if enable_cv: - self.scores = cross_validate( - knn, feature_matrix, training_labels, scoring="balanced_accuracy" - ) - else: - self.scores = None - - knn.fit(feature_matrix, training_labels) - - self.knn = knn - self.enabled_clients = enabled_clients - self.graffiti_only_clients = set(graffiti_only_clients) - self.features = features - - self.feature_matrix = feature_matrix - self.training_labels = training_labels - - def classify(self, block_reward): - graffiti_guess = classify_reward_by_graffiti(block_reward) - - if graffiti_guess in self.graffiti_only_clients: - prob_by_client = {graffiti_guess: 1.0} - return (graffiti_guess, graffiti_guess, prob_by_client, graffiti_guess) - - row = into_feature_row(block_reward, self.features) - res = self.knn.predict_proba([row]) - - prob_by_client = { - client: res[0][i] for i, client in enumerate(self.enabled_clients) - } - - multilabel = compute_multilabel( - compute_guess_list(prob_by_client, self.enabled_clients) - ) - - label = compute_best_guess(prob_by_client) - - return (label, multilabel, prob_by_client, graffiti_guess) - - def plot_feature_matrix(self, output_path): - fig = plt.figure() - - ax = fig.add_subplot(projection="3d") - - x = self.feature_matrix[:, 0] - y = self.feature_matrix[:, 1] - z = self.feature_matrix[:, 2] - - scatter = ax.scatter( - x, y, z, c=self.training_labels, marker=".", alpha=0.25, cmap="Set1" - ) - - handles, _ = scatter.legend_elements() - labels = self.enabled_clients - - legend1 = ax.legend(handles, labels, loc="best", title="Client") - ax.add_artist(legend1) - - assert ( - len(self.features) == 3 - ), "must have exactly 3 features selected for plotting" - ax.set_xlabel(self.features[0]) - ax.set_ylabel(self.features[1]) - ax.set_zlabel(self.features[2]) - - if output_path is None: - fig.show() - else: - fig.savefig(output_path) - - -def compute_guess_list(probability_map, enabled_clients) -> list: - guesses = [] - for client in enabled_clients: - if probability_map[client] > CONFIDENCE_THRESHOLD: - return [client] - elif probability_map[client] > MIN_GUESS_THRESHOLD: - guesses.append(client) - return guesses - - -def compute_multilabel(guess_list): - if len(guess_list) == 1: - return guess_list[0] - elif len(guess_list) == 2: - return f"{guess_list[0]} or {guess_list[1]}" - else: - return "Uncertain" - - -def compute_best_guess(probability_map) -> str: - return max( - probability_map.keys(), - key=lambda client: probability_map[client], - default="Uncertain", - ) - - -def parse_args(): - parser = argparse.ArgumentParser("KNN testing and cross validation") - - parser.add_argument("data_dir", help="training data directory") - parser.add_argument("--classify", help="data to classify") - parser.add_argument( - "--cv", action="store_true", dest="enable_cv", help="enable cross validation" - ) - parser.add_argument( - "--cv-group", default=0, type=int, help="number of clients to group for CV" - ) - parser.add_argument( - "--cv-num-features", type=int, help="feature dimensionality for CV" - ) - parser.add_argument( - "--group", default=[], nargs="+", help="clients to group during classification" - ) - parser.add_argument( - "--persist", - action="store_true", - dest="should_persist", - help="if provided, the model is persisted", - ) - parser.add_argument( - "--disable", - default=[], - nargs="+", - help="clients to disable during cross validation", - ) - parser.add_argument( - "--graffiti-only", - default=DEFAULT_GRAFFITI_ONLY, - nargs="+", - help="clients to classify based on graffiti only", - ) - parser.add_argument( - "--plot", - type=str, - help="output plot of 3D training data vectors (only works with --classify)", - ) - return parser.parse_args() - - -def persist_classifier(classifier: Classifier, name: str) -> None: - try: - filename = f"{name}.pkl" - with open(filename, "wb") as fid: - pickle.dump(classifier, fid) - except Exception as e: - print(f"Failed to persist classifier due to {e}") - - -def main(): - args = parse_args() - data_dir = args.data_dir - classify_dir = args.classify - enable_cv = args.enable_cv - num_grouped = args.cv_group - num_features = args.cv_num_features - grouped_clients = args.group - should_persist = args.should_persist - graffiti_only = args.graffiti_only - - disabled_clients = args.disable - enabled_clients = [ - client - for client in CLIENTS - if client not in disabled_clients and client != "Other" - ] - - if enable_cv: - best_score = 0.0 - best_features = None - - print("performing cross validation") - if num_features is None: - feature_vecs = [DEFAULT_FEATURES] - else: - feature_vecs = all_feature_vecs_with_dimension(num_features) - - for grouped_clients in all_client_groupings_with_dimension( - enabled_clients, num_grouped - ): - for feature_vec in feature_vecs: - print(f"features: {feature_vec}") - classifier = Classifier( - data_dir, - grouped_clients=grouped_clients, - disabled_clients=disabled_clients, - graffiti_only_clients=graffiti_only, - features=feature_vec, - enable_cv=True, - ) - print(f"enabled clients: {classifier.enabled_clients}") - print(f"classifier scores: {classifier.scores['test_score']}") - - min_score = min(classifier.scores["test_score"]) - - if min_score > best_score: - best_features = feature_vec - best_score = min_score - - print(f"best features found: {best_features}") - print(f"score: {best_score}") - return - - assert classify_dir is not None, "classify dir required" - print(f"classifying all data in directory {classify_dir}") - print(f"grouped clients: {grouped_clients}") - classifier = Classifier(data_dir, grouped_clients=grouped_clients) - - if args.plot is not None: - classifier.plot_feature_matrix(args.plot) - print("plot of training data written to {}".format(args.plot)) - - frequency_map = {} - total_blocks = 0 - - for input_file in os.listdir(classify_dir): - print(f"classifying rewards from file {input_file}") - with open(os.path.join(classify_dir, input_file), "r") as f: - block_rewards = json.load(f) - - for block_reward in block_rewards: - _, multilabel, _, _ = classifier.classify(block_reward) - - if multilabel not in frequency_map: - frequency_map[multilabel] = 0 - - frequency_map[multilabel] += 1 - - total_blocks += len(block_rewards) - - print(f"total blocks processed: {total_blocks}") - - if should_persist: - persist_classifier(classifier, "knn_classifier") - - for multilabel, num_blocks in sorted(frequency_map.items()): - percentage = round(num_blocks / total_blocks, 4) - print(f"{multilabel},{percentage}") - - -if __name__ == "__main__": - main() From 268a3d0b3161c7e11559ff2f299aaf1e2178c36f Mon Sep 17 00:00:00 2001 From: santi1234567 <45318759+santi1234567@users.noreply.github.com> Date: Thu, 2 May 2024 14:53:35 +0200 Subject: [PATCH 3/6] support changing classifier type --- classifier.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/classifier.py b/classifier.py index 072b4b9..ae5a446 100755 --- a/classifier.py +++ b/classifier.py @@ -9,12 +9,16 @@ import pickle from sklearn.neighbors import KNeighborsClassifier +from sklearn.neural_network import MLPClassifier from sklearn.model_selection import cross_validate from feature_selection import * # noqa F403 from feature_selection import ALL_FEATURES from prepare_training_data import CLIENTS, classify_reward_by_graffiti K = 9 + +MLP_HIDDEN_LAYER_SIZES=(390, 870) + WEIGHTS = "distance" MIN_GUESS_THRESHOLD = 0.20 @@ -69,7 +73,8 @@ def __init__( graffiti_only_clients=DEFAULT_GRAFFITI_ONLY, features=DEFAULT_FEATURES, enable_cv=False, - classifier_type='knn' + classifier_type='knn', + hidden_layer_sizes=MLP_HIDDEN_LAYER_SIZES ): graffiti_only_clients = set(graffiti_only_clients) @@ -123,7 +128,13 @@ def __init__( feature_matrix = np.array(feature_matrix) - classifier = KNeighborsClassifier(n_neighbors=K, weights=WEIGHTS) + if classifier_type == 'knn': + classifier = KNeighborsClassifier(n_neighbors=K, weights=WEIGHTS) + elif classifier_type == 'mlp': + classifier = MLPClassifier( + hidden_layer_sizes=hidden_layer_sizes, max_iter=1000 + ) + # Assert above makes sure that classifier_type is one of the valid types if enable_cv: self.scores = cross_validate( From 52a3545a7d9d9659ded17e6c0ebaa6cfa1ff906d Mon Sep 17 00:00:00 2001 From: santi1234567 <45318759+santi1234567@users.noreply.github.com> Date: Thu, 2 May 2024 21:08:06 +0200 Subject: [PATCH 4/6] add classifier type flag --- classifier.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/classifier.py b/classifier.py index ae5a446..6b0981b 100755 --- a/classifier.py +++ b/classifier.py @@ -251,6 +251,9 @@ def parse_args(): parser.add_argument( "--group", default=[], nargs="+", help="clients to group during classification" ) + parser.add_argument( + "--classifier-type", default="knn", choices=["knn", "mlp"], help="the type of classifier to use" + ) parser.add_argument( "--persist", action="store_true", @@ -296,7 +299,7 @@ def main(): grouped_clients = args.group should_persist = args.should_persist graffiti_only = args.graffiti_only - + classifier_type = args.classifier_type disabled_clients = args.disable enabled_clients = [ client @@ -326,6 +329,7 @@ def main(): graffiti_only_clients=graffiti_only, features=feature_vec, enable_cv=True, + classifier_type=classifier_type ) print(f"enabled clients: {classifier.enabled_clients}") print(f"classifier scores: {classifier.scores['test_score']}") @@ -343,7 +347,8 @@ def main(): assert classify_dir is not None, "classify dir required" print(f"classifying all data in directory {classify_dir}") print(f"grouped clients: {grouped_clients}") - classifier = Classifier(data_dir, grouped_clients=grouped_clients) + classifier = Classifier(data_dir, grouped_clients=grouped_clients, + classifier_type=classifier_type) if args.plot is not None: classifier.plot_feature_matrix(args.plot) From 9c2c8a08665715543f853c0efbb9f4d21a914fcf Mon Sep 17 00:00:00 2001 From: santi1234567 <45318759+santi1234567@users.noreply.github.com> Date: Thu, 2 May 2024 21:08:12 +0200 Subject: [PATCH 5/6] update readme --- README.md | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 8ec1b58..461e4e9 100644 --- a/README.md +++ b/README.md @@ -36,11 +36,16 @@ pip install -r requirements.txt pip install -r requirements-dev.txt ``` -### k-NN Classifier +### The Classifier -Blockprint's classifier is a k-nearest neighbours classifier in `knn_classifier.py`. +Blockprint's classifier utilizes one of two machine learning algorithms: -See `./knn_classifier.py --help` for command line options including cross +- K-nearest neighbours +- Multi-layer Perceptron + +These can be chosen with the `--classifier-type` flag in `classifier.py`. + +See `./classifier.py --help` for more command line options including cross validation (CV) and manual classification. ### Training the Classifier @@ -81,10 +86,10 @@ testdata_proc └── 0x7fedb0da9699c93ce66966555c6719e1159ae7b3220c7053a08c8f50e2f3f56f.json ``` -You can then use this directory as the datadir argument to `./knn_classifier.py`: +You can then use this directory as the datadir argument to `./classifier.py`: ``` -./knn_classifier.py testdata_proc --classify testdata +./classifier.py testdata_proc --classify testdata ``` If you then want to use the classifier to build an sqlite database: From 3eac126b0d46d54f73963ba4d151882c7b76a003 Mon Sep 17 00:00:00 2001 From: santi1234567 <45318759+santi1234567@users.noreply.github.com> Date: Tue, 14 May 2024 16:32:57 +0200 Subject: [PATCH 6/6] linting --- classifier.py | 28 +++++++++++++++------------- interactive.ipynb | 2 +- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/classifier.py b/classifier.py index 6b0981b..9463b0f 100755 --- a/classifier.py +++ b/classifier.py @@ -17,7 +17,7 @@ K = 9 -MLP_HIDDEN_LAYER_SIZES=(390, 870) +MLP_HIDDEN_LAYER_SIZES = (390, 870) WEIGHTS = "distance" @@ -73,8 +73,8 @@ def __init__( graffiti_only_clients=DEFAULT_GRAFFITI_ONLY, features=DEFAULT_FEATURES, enable_cv=False, - classifier_type='knn', - hidden_layer_sizes=MLP_HIDDEN_LAYER_SIZES + classifier_type="knn", + hidden_layer_sizes=MLP_HIDDEN_LAYER_SIZES, ): graffiti_only_clients = set(graffiti_only_clients) @@ -88,9 +88,7 @@ def __init__( set(grouped_clients) & graffiti_only_clients == set() ), "clients must not be both graffiti-only and grouped" - assert ( - classifier_type in ["knn", "mlp"] - ), "classifier_type must be knn or mlp" + assert classifier_type in ["knn", "mlp"], "classifier_type must be knn or mlp" feature_matrix = [] training_labels = [] @@ -128,13 +126,13 @@ def __init__( feature_matrix = np.array(feature_matrix) - if classifier_type == 'knn': + if classifier_type == "knn": classifier = KNeighborsClassifier(n_neighbors=K, weights=WEIGHTS) - elif classifier_type == 'mlp': + elif classifier_type == "mlp": classifier = MLPClassifier( hidden_layer_sizes=hidden_layer_sizes, max_iter=1000 ) - # Assert above makes sure that classifier_type is one of the valid types + # Assert above makes sure that classifier_type is one of the valid types if enable_cv: self.scores = cross_validate( @@ -252,7 +250,10 @@ def parse_args(): "--group", default=[], nargs="+", help="clients to group during classification" ) parser.add_argument( - "--classifier-type", default="knn", choices=["knn", "mlp"], help="the type of classifier to use" + "--classifier-type", + default="knn", + choices=["knn", "mlp"], + help="the type of classifier to use", ) parser.add_argument( "--persist", @@ -329,7 +330,7 @@ def main(): graffiti_only_clients=graffiti_only, features=feature_vec, enable_cv=True, - classifier_type=classifier_type + classifier_type=classifier_type, ) print(f"enabled clients: {classifier.enabled_clients}") print(f"classifier scores: {classifier.scores['test_score']}") @@ -347,8 +348,9 @@ def main(): assert classify_dir is not None, "classify dir required" print(f"classifying all data in directory {classify_dir}") print(f"grouped clients: {grouped_clients}") - classifier = Classifier(data_dir, grouped_clients=grouped_clients, - classifier_type=classifier_type) + classifier = Classifier( + data_dir, grouped_clients=grouped_clients, classifier_type=classifier_type + ) if args.plot is not None: classifier.plot_feature_matrix(args.plot) diff --git a/interactive.ipynb b/interactive.ipynb index 2ea53a5..9facf34 100644 --- a/interactive.ipynb +++ b/interactive.ipynb @@ -19,7 +19,7 @@ "source": [ "datadir = \"data/mainnet/training/slots_3481601_to_3702784_bal2x\"\n", "disabled_clients = []\n", - "features = ['percent_redundant', 'percent_pairwise_ordered', 'norm_reward']\n", + "features = [\"percent_redundant\", \"percent_pairwise_ordered\", \"norm_reward\"]\n", "\n", "classifier = Classifier(datadir, disabled_clients=disabled_clients, features=features)" ]