Skip to content

Commit

Permalink
added grid search
Browse files Browse the repository at this point in the history
  • Loading branch information
lbechberger committed Oct 12, 2021
1 parent 428e765 commit dcd00f9
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 6 deletions.
2 changes: 1 addition & 1 deletion code/application/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
with open(args.dim_red_file, 'rb') as f_in:
dimensionality_reduction = pickle.load(f_in)
with open(args.classifier_file, 'rb') as f_in:
classifier = pickle.load(f_in)
classifier = pickle.load(f_in)["classifier"]

# chain them together into a single pipeline
pipeline = make_pipeline(preprocessing, feature_extraction, dimensionality_reduction, classifier)
Expand Down
22 changes: 22 additions & 0 deletions code/classification/classifier.sge
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
# SGE batch script: trains a classifier on the training set, then evaluates
# it on the validation set. All script arguments ($*) are forwarded to
# run_classifier; the caller is expected to pass the classifier pickle path
# first (consumed by -e / -i) followed by classifier/evaluation options.
# Invoked either directly (local mode) or via qsub (grid mode) by grid_search.sh.

# --- Sun Grid Engine directives ---
#$ -N classifier
#$ -l mem=2G
#$ -cwd
#$ -pe default 2
#$ -o $HOME
#$ -e $HOME
#$ -l h=*cippy*

# make conda available in this non-interactive shell and activate the project env
export PATH="$HOME/miniconda/bin:$PATH"
eval "$(conda shell.bash hook)"
conda activate MLinPractice

# train classifier on training set
# -e $* : first forwarded argument becomes the export path for the fitted model
# NOTE(review): $* is unquoted, so arguments containing spaces would break — confirm callers never pass such paths
echo " training"
python -m code.classification.run_classifier data/dimensionality_reduction/training.pickle -e $*

# evaluate classifier on validation set
# -i $* : first forwarded argument becomes the import path of the model trained above
echo " validation"
python -m code.classification.run_classifier data/dimensionality_reduction/validation.pickle -i $*

conda deactivate
28 changes: 28 additions & 0 deletions code/classification/grid_search.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash
# Grid search over the hyperparameter k of the kNN classifier.
#
# Usage: grid_search.sh (local|grid)
#   local -> run classifier.sge directly on this machine
#   grid  -> submit classifier.sge to the cluster via qsub
#
# For each k, classifier.sge trains on the training set (exporting the model
# to data/classification/clf_<k>.pickle) and evaluates on the validation set.

mkdir -p data/classification

# specify hyperparameter values
# (proper array: one element per value of k, instead of a single
# space-separated string that only worked via accidental word splitting)
values_of_k=(1 2 3 4 5 6 7 8 9 10)


# different execution modes
# "$1" is quoted so a missing argument falls through to the error branch
# instead of crashing the [ ... ] test with a syntax error
if [ "$1" = "local" ]
then
    echo "[local execution]"
    cmd="code/classification/classifier.sge"
elif [ "$1" = "grid" ]
then
    echo "[grid execution]"
    cmd="qsub code/classification/classifier.sge"
else
    echo "[ERROR! Argument not supported!]"
    exit 1
fi

# do the grid search: one training+validation run per value of k
# (fixed seed -s 42 keeps runs comparable across k)
for k in "${values_of_k[@]}"
do
    echo "$k"
    $cmd "data/classification/clf_$k.pickle" --knn "$k" -s 42 --accuracy --kappa
done
31 changes: 26 additions & 5 deletions code/classification/run_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from mlflow import log_metric, log_param, set_tracking_uri

# setting up CLI
parser = argparse.ArgumentParser(description = "Classifier")
Expand All @@ -26,37 +27,54 @@
parser.add_argument("--knn", type = int, help = "k nearest neighbor classifier with the specified value of k", default = None)
parser.add_argument("-a", "--accuracy", action = "store_true", help = "evaluate using accuracy")
parser.add_argument("-k", "--kappa", action = "store_true", help = "evaluate using Cohen's kappa")
parser.add_argument("--log_folder", help = "where to log the mlflow results", default = "data/classification/mlflow")
args = parser.parse_args()

# load data
with open(args.input_file, 'rb') as f_in:
data = pickle.load(f_in)

set_tracking_uri(args.log_folder)

if args.import_file is not None:
# import a pre-trained classifier
with open(args.import_file, 'rb') as f_in:
classifier = pickle.load(f_in)
input_dict = pickle.load(f_in)

classifier = input_dict["classifier"]
for param, value in input_dict["params"].items():
log_param(param, value)

log_param("dataset", "validation")

else: # manually set up a classifier

if args.majority:
# majority vote classifier
print(" majority vote classifier")
log_param("classifier", "majority")
params = {"classifier": "majority"}
classifier = DummyClassifier(strategy = "most_frequent", random_state = args.seed)

elif args.frequency:
# label frequency classifier
print(" label frequency classifier")
log_param("classifier", "frequency")
params = {"classifier": "frequency"}
classifier = DummyClassifier(strategy = "stratified", random_state = args.seed)


elif args.knn is not None:
print(" {0} nearest neighbor classifier".format(args.knn))
log_param("classifier", "knn")
log_param("k", args.knn)
params = {"classifier": "knn", "k": args.knn}
standardizer = StandardScaler()
knn_classifier = KNeighborsClassifier(args.knn)
knn_classifier = KNeighborsClassifier(args.knn, n_jobs = -1)
classifier = make_pipeline(standardizer, knn_classifier)

classifier.fit(data["features"], data["labels"].ravel())
log_param("dataset", "training")

# now classify the given data
prediction = classifier.predict(data["features"])
Expand All @@ -66,13 +84,16 @@
if args.accuracy:
evaluation_metrics.append(("accuracy", accuracy_score))
if args.kappa:
evaluation_metrics.append(("Cohen's kappa", cohen_kappa_score))
evaluation_metrics.append(("Cohen_kappa", cohen_kappa_score))

# compute and print them
for metric_name, metric in evaluation_metrics:
print(" {0}: {1}".format(metric_name, metric(data["labels"], prediction)))
metric_value = metric(data["labels"], prediction)
print(" {0}: {1}".format(metric_name, metric_value))
log_metric(metric_name, metric_value)

# export the trained classifier if the user wants us to do so
if args.export_file is not None:
output_dict = {"classifier": classifier, "params": params}
with open(args.export_file, 'wb') as f_out:
pickle.dump(classifier, f_out)
pickle.dump(output_dict, f_out)
Binary file modified data/classification/classifier.pickle
Binary file not shown.

0 comments on commit dcd00f9

Please sign in to comment.