From dcd00f9ee2015a3741ccf3f13af30c4d8d80b276 Mon Sep 17 00:00:00 2001 From: lbechberger Date: Tue, 12 Oct 2021 14:28:13 +0200 Subject: [PATCH] added grid search --- code/application/application.py | 2 +- code/classification/classifier.sge | 22 ++++++++++++++++++ code/classification/grid_search.sh | 28 +++++++++++++++++++++++ code/classification/run_classifier.py | 31 +++++++++++++++++++++----- data/classification/classifier.pickle | Bin 4654852 -> 4654847 bytes 5 files changed, 77 insertions(+), 6 deletions(-) create mode 100755 code/classification/classifier.sge create mode 100755 code/classification/grid_search.sh diff --git a/code/application/application.py b/code/application/application.py index 159aafaa..84ecb543 100644 --- a/code/application/application.py +++ b/code/application/application.py @@ -29,7 +29,7 @@ with open(args.dim_red_file, 'rb') as f_in: dimensionality_reduction = pickle.load(f_in) with open(args.classifier_file, 'rb') as f_in: - classifier = pickle.load(f_in) + classifier = pickle.load(f_in)["classifier"] # chain them together into a single pipeline pipeline = make_pipeline(preprocessing, feature_extraction, dimensionality_reduction, classifier) diff --git a/code/classification/classifier.sge b/code/classification/classifier.sge new file mode 100755 index 00000000..5b03d664 --- /dev/null +++ b/code/classification/classifier.sge @@ -0,0 +1,22 @@ +#!/bin/bash +#$ -N classifier +#$ -l mem=2G +#$ -cwd +#$ -pe default 2 +#$ -o $HOME +#$ -e $HOME +#$ -l h=*cippy* + +export PATH="$HOME/miniconda/bin:$PATH" +eval "$(conda shell.bash hook)" +conda activate MLinPractice + +# train classifier on training set +echo " training" +python -m code.classification.run_classifier data/dimensionality_reduction/training.pickle -e $* + +# evaluate classifier on validation set +echo " validation" +python -m code.classification.run_classifier data/dimensionality_reduction/validation.pickle -i $* + +conda deactivate \ No newline at end of file diff --git a/code/classification/grid_search.sh b/code/classification/grid_search.sh new file mode 100755 index 00000000..6897508f --- /dev/null +++ b/code/classification/grid_search.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +mkdir -p data/classification + +# specify hyperparameter values +values_of_k=("1 2 3 4 5 6 7 8 9 10") + + +# different execution modes +if [ $1 = local ] +then + echo "[local execution]" + cmd="code/classification/classifier.sge" +elif [ $1 = grid ] +then + echo "[grid execution]" + cmd="qsub code/classification/classifier.sge" +else + echo "[ERROR! Argument not supported!]" + exit 1 +fi + +# do the grid search +for k in $values_of_k +do + echo $k + $cmd 'data/classification/clf_'"$k"'.pickle' --knn $k -s 42 --accuracy --kappa +done \ No newline at end of file diff --git a/code/classification/run_classifier.py b/code/classification/run_classifier.py index b9d55245..414e0ce5 100644 --- a/code/classification/run_classifier.py +++ b/code/classification/run_classifier.py @@ -14,6 +14,7 @@ from sklearn.preprocessing import StandardScaler from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import make_pipeline +from mlflow import log_metric, log_param, set_tracking_uri # setting up CLI parser = argparse.ArgumentParser(description = "Classifier") @@ -26,37 +27,54 @@ parser.add_argument("--knn", type = int, help = "k nearest neighbor classifier with the specified value of k", default = None) parser.add_argument("-a", "--accuracy", action = "store_true", help = "evaluate using accuracy") parser.add_argument("-k", "--kappa", action = "store_true", help = "evaluate using Cohen's kappa") +parser.add_argument("--log_folder", help = "where to log the mlflow results", default = "data/classification/mlflow") args = parser.parse_args() # load data with open(args.input_file, 'rb') as f_in: data = pickle.load(f_in) +set_tracking_uri(args.log_folder) + if args.import_file is not None: # import a pre-trained classifier with open(args.import_file, 'rb') as f_in: - classifier = pickle.load(f_in) + input_dict = pickle.load(f_in) + + classifier = input_dict["classifier"] + for param, value in input_dict["params"].items(): + log_param(param, value) + + log_param("dataset", "validation") else: # manually set up a classifier if args.majority: # majority vote classifier print(" majority vote classifier") + log_param("classifier", "majority") + params = {"classifier": "majority"} classifier = DummyClassifier(strategy = "most_frequent", random_state = args.seed) elif args.frequency: # label frequency classifier print(" label frequency classifier") + log_param("classifier", "frequency") + params = {"classifier": "frequency"} classifier = DummyClassifier(strategy = "stratified", random_state = args.seed) elif args.knn is not None: print(" {0} nearest neighbor classifier".format(args.knn)) + log_param("classifier", "knn") + log_param("k", args.knn) + params = {"classifier": "knn", "k": args.knn} standardizer = StandardScaler() - knn_classifier = KNeighborsClassifier(args.knn) + knn_classifier = KNeighborsClassifier(args.knn, n_jobs = -1) classifier = make_pipeline(standardizer, knn_classifier) classifier.fit(data["features"], data["labels"].ravel()) + log_param("dataset", "training") # now classify the given data prediction = classifier.predict(data["features"]) @@ -66,13 +84,16 @@ if args.accuracy: evaluation_metrics.append(("accuracy", accuracy_score)) if args.kappa: - evaluation_metrics.append(("Cohen's kappa", cohen_kappa_score)) + evaluation_metrics.append(("Cohen_kappa", cohen_kappa_score)) # compute and print them for metric_name, metric in evaluation_metrics: - print(" {0}: {1}".format(metric_name, metric(data["labels"], prediction))) + metric_value = metric(data["labels"], prediction) + print(" {0}: {1}".format(metric_name, metric_value)) + log_metric(metric_name, metric_value) # export the trained classifier if the user wants us to do so if args.export_file is not None: + output_dict = {"classifier": classifier, "params": params} with open(args.export_file, 'wb') as f_out: - pickle.dump(classifier, f_out) \ No newline at end of file + pickle.dump(output_dict, f_out) \ No newline at end of file diff --git a/data/classification/classifier.pickle b/data/classification/classifier.pickle index 27bc682155271132b201a0354f8ab274889d03a2..012911f3aa6c7c8b0e931f7a9384449a3469cdad 100644 GIT binary patch delta 176 zcmWm0Nj8E3007{>zd>e-3?U7M&|nUUR4iP;9lY0jE!tarTTajov~($_$n&i}U!!pH zISM~SgrZ{N5~d_gn=xz7yakJvEK5mSu_|NDx(%DMwrtyxvn#KlsASK+1Bc3v96M1_ zRa1BB%()Afu3T%laqCXgy_N@$+MaZD^*sN){3bHdAPC<3{lWKJd(007X}OUam$A)ySRkfF>|Wm@F(kA<-Yye`n-+&_p$YL I6YF#;e=smS2><{9