diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 00000000..ca63d1dc --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,40 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Attach", + "type": "python", + "request": "attach", + "connect": { + "host": "localhost", + "port": 5678 + } + }, + { + "name": "Python: Module", + "type": "python", + "request": "launch", + "module": "code", + "cwd": "${workspaceFolder}", + }, + { + "name": "Python: Current File", + "type": "python", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal", + "cwd": "${workspaceFolder}", + // "pythonArgs": [ + // "-m", + // "src.feature_extraction.test.feature_extraction_test", + // "E:\\MyPC\\code\\git\\myforkMLiP\\MLinPractice\\src\\feature_extraction\\test\\feature_extraction_test.py" + // ], + // "env": { + // "PYTHONPATH": "${workspaceFolder}/code" + // } + } + ] +} \ No newline at end of file diff --git a/README.md b/README.md index f1c12d81..8d6e571f 100644 --- a/README.md +++ b/README.md @@ -27,16 +27,31 @@ In order to save some space on your local machine, you can run `conda clean -y - The installed libraries are used for machine learning (`scikit-learn`), visualizations (`matplotlib`), NLP (`nltk`), word embeddings (`gensim`), IDE (`spyder`), and data handling (`pandas`) -## Overall Pipeline +## Setup, Overall Pipeline & Tests -The overall pipeline can be executed with the script `code/pipeline.sh`, which executes all of the following shell scripts: -- The script `code/load_data.sh` downloads the raw csv files containing the tweets and their metadata. They are stored in the folder `data/raw/` (which will be created if it does not yet exist). +### Setup + +The shell script `code/setup.sh` needs to be run once before `code/pipeline.sh` or any other shell script can be executed. It downloads the necessary data by running the scripts `code/load_data.sh` and `code/load_nltk_data.sh`. +- The former script `code/load_data.sh` downloads the Data Science Tweets as raw csv files containing the tweets and their metadata. They are stored in the folder `data/raw/` (which will be created if it does not yet exist). +- The latter script `code/load_nltk_data.sh` downloads the necessary NLTK data sets, corpora, and models (see https://www.nltk.org/data.html for details). + +### Pipeline + +The overall pipeline can be executed with the script `code/pipeline.sh`, which executes all of the following shell scripts (but not `code/setup.sh`, which must have been run once beforehand): - The script `code/preprocessing.sh` executes all necessary preprocessing steps, including the creation of labels and splitting the data set. - The script `code/feature_extraction.sh` takes care of feature extraction. - The script `code/dimensionality_reduction.sh` takes care of dimensionality reduction. - The script `code/classification.sh` takes care of training and evaluating a classifier. - The script `code/application.sh` launches the application example. +### Tests + +To run the unit tests, use the following command: + +```shell +python -m unittest discover -s src -p '*_test.py' +``` + ## Preprocessing All python scripts and classes for the preprocessing of the input data can be found in `code/preprocessing/`. 
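+The individual preprocessing steps are implemented as scikit-learn-compatible transformers, which is what allows `run_preprocessing.py` to chain them with `make_pipeline` and export the fitted pipeline as a pickle file. Below is a minimal sketch of such a step, assuming scikit-learn's base classes; the class and column names are illustrative, not actual repository code:
+
+```python
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
+
+class UppercaseCounter(BaseEstimator, TransformerMixin):
+    """Illustrative preprocessor: adds a column counting uppercase characters."""
+
+    def __init__(self, input_column = "tweet", output_column = "tweet_uppercase_count"):
+        self.input_column = input_column
+        self.output_column = output_column
+
+    def fit(self, df, y = None):
+        # stateless step: nothing to learn from the training data
+        return self
+
+    def transform(self, df):
+        df = df.copy()
+        df[self.output_column] = df[self.input_column].apply(
+            lambda tweet: sum(1 for char in tweet if char.isupper()))
+        return df
+
+# behaves like any other scikit-learn transformer:
+example = pd.DataFrame({"tweet": ["Hello World", "no caps here"]})
+print(UppercaseCounter().fit_transform(example))
+```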
@@ -44,7 +59,7 @@ All python scripts and classes for the preprocessing of the input data can be fo ### Creating Labels The script `create_labels.py` assigns labels to the raw data points based on a threshold on a linear combination of the number of likes and retweets (i.e., a tweet is labeled as positive if `likes_weight * likes + retweet_weight * retweets` exceeds the threshold). It is executed as follows: -```python -m code.preprocessing.create_labels path/to/input_dir path/to/output.csv``` +```python -m src.preprocessing.create_labels path/to/input_dir path/to/output.csv``` Here, `input_dir` is the directory containing the original raw csv files, while `output.csv` is the single csv file where the output will be written. The script takes the following optional parameters: - `-l` or `--likes_weight` determines the relative weight of the number of likes a tweet has received. Defaults to 1. @@ -54,7 +69,7 @@ The script takes the following optional parameters: ### Classical Preprocessing The script `run_preprocessing.py` is used to run various preprocessing steps on the raw data, producing additional columns in the csv file. It is executed as follows: -```python -m code.preprocessing.run_preprocessing path/to/input.csv path/to/output.csv``` +```python -m src.preprocessing.run_preprocessing path/to/input.csv path/to/output.csv``` Here, `input.csv` is a csv file (ideally the output of `create_labels.py`), while `output.csv` is the csv file where the output will be written. The preprocessing steps to take can be configured with the following flags: - `-p` or `--punctuation`: A new column "tweet_no_punctuation" is created, where all punctuation is removed from the original tweet. (See `code/preprocessing/punctuation_remover.py` for more details) Moreover, the script accepts the following optional parameters: @@ -66,7 +81,7 @@ ### Splitting the Data Set The script `split_data.py` splits the overall preprocessed data into training, validation, and test sets. It can be invoked as follows: -```python -m code.preprocessing.split_data path/to/input.csv path/to/output_dir``` +```python -m src.preprocessing.split_data path/to/input.csv path/to/output_dir``` Here, `input.csv` is the input csv file to split (containing a column "label" with the label information, i.e., `create_labels.py` needs to be run beforehand) and `output_dir` is the directory where three individual csv files `training.csv`, `validation.csv`, and `test.csv` will be stored. The script takes the following optional parameters: - `-t` or `--test_size` determines the relative size of the test set and defaults to 0.2 (i.e., 20 % of the data). @@ -79,7 +94,7 @@ The script takes the following optional parameters: ## Feature Extraction All python scripts and classes for feature extraction can be found in `code/feature_extraction/`. The script `extract_features.py` takes care of the overall feature extraction process and can be invoked as follows: -```python -m code.feature_extraction.extract_features path/to/input.csv path/to/output.pickle``` +```python -m src.feature_extraction.extract_features path/to/input.csv path/to/output.pickle``` Here, `input.csv` is the respective training, validation, or test set file created by `split_data.py`. The file `output.pickle` will be used to store the results of the feature extraction process, namely a dictionary with the following entries: - `"features"`: a numpy array with the raw feature values (rows are training examples, columns are features) - `"feature_names"`: a list of feature names for the columns of the numpy array 
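+The extractors collected by `extract_features.py` (e.g., `CharacterLength`) share a fit/transform interface so that the `FeatureCollector` can fit them on the training set and stack their outputs into the `"features"` array. A minimal sketch of what such an extractor could look like (an illustrative reconstruction, not the repository's actual `CharacterLength` implementation):
+
+```python
+import numpy as np
+
+class TweetLengthFeature:
+    """Illustrative feature extractor: number of characters in a text column."""
+
+    def __init__(self, input_column):
+        self.input_column = input_column
+
+    def get_feature_name(self):
+        # used to label the corresponding column of the feature matrix
+        return self.input_column + "_char_length"
+
+    def fit(self, df):
+        # nothing to fit: the character count needs no training statistics
+        pass
+
+    def transform(self, df):
+        # one row per example, one column for this feature
+        return np.array([len(text) for text in df[self.input_column]]).reshape(-1, 1)
+```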
@@ -98,7 +113,7 @@ All python scripts and classes for dimensionality reduction can be found in `cod The script `reduce_dimensionality.py` takes care of the overall dimensionality reduction procedure and can be invoked as follows: -```python -m code.dimensionality_reduction.reduce_dimensionality path/to/input.pickle path/to/output.pickle``` +```python -m src.dimensionality_reduction.reduce_dimensionality path/to/input.pickle path/to/output.pickle``` Here, `input.pickle` is the respective training, validation, or test set file created by `extract_features.py`. The file `output.pickle` will be used to store the results of the dimensionality reduction process, containing `"features"` (which are the selected/projected ones) and `"labels"` (same as in the input file). @@ -118,19 +133,28 @@ All python scripts and classes for classification can be found in `code/classifi ### Train and Evaluate a Single Classifier The script `run_classifier.py` can be used to train and/or evaluate a given classifier. It can be executed as follows: -```python -m code.classification.run_classifier path/to/input.pickle``` +```python -m src.classification.run_classifier path/to/input.pickle``` Here, `input.pickle` is a pickle file of the respective data subset, produced by either `extract_features.py` or `reduce_dimensionality.py`. -By default, this data is used to train a classifier, which is specified by one of the following optional arguments: -- `-m` or `--majority`: Majority vote classifier that always predicts the majority class. -- `-f` or `--frequency`: Dummy classifier that makes predictions based on the label frequency in the training data. +By default, this data is used to train a **classifier**, which is specified by one of the following optional arguments: +- `-c` or `--classifier` followed by either `most_frequent` or `stratified`: + - `most_frequent` is a [_DummyClassifier_](https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html) which always predicts the most frequently occurring label in the training set. + - `stratified` is a [_DummyClassifier_](https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html) that makes predictions based on the label frequency in the training data (i.e., it respects the training set’s class distribution). + +**Evaluation metrics** are then computed for the classifier's predictions. Which metrics are used for evaluation is specified with the following optional arguments: +- `-m` or `--metrics` followed by one of the following options (default: `none`): `none`, `all`, or one of the individual metrics listed below. -The classifier is then evaluated, using the evaluation metrics as specified through the following optional arguments: -- `-a`or `--accuracy`: Classification accurracy (i.e., percentage of correctly classified examples). -- `-k`or `--kappa`: Cohen's kappa (i.e., adjusting accuracy for probability of random agreement). + - `accuracy`: Classification accuracy (i.e., the percentage of correctly classified examples). + - `kappa`: Cohen's kappa (i.e., accuracy adjusted for the probability of random agreement). 
+ - `precision` + - `recall` + - `f1` + - `jaccard` +For more details on the metrics used, see: https://scikit-learn.org/stable/modules/classes.html#classification-metrics -Moreover, the script support importing and exporting trained classifiers with the following optional arguments: +Moreover, the script supports **importing and exporting trained classifiers** with the following optional arguments: - `-i` or `--import_file`: Load a trained classifier from the given pickle file. All parameters that configure the classifier are ignored, and the classifier is not retrained. - `-e` or `--export_file`: Export the trained classifier into the given pickle file. @@ -143,5 +167,39 @@ All python code for the application demo can be found in `code/application/`. The script `application.py` provides a simple command line interface, where the user is asked to type in their prospective tweet, which is then analyzed using the trained ML pipeline. The script can be invoked as follows: -```python -m code.application.application path/to/preprocessing.pickle path/to/feature_extraction.pickle path/to/dimensionality_reduction.pickle path/to/classifier.pickle``` +```python -m src.application.application path/to/preprocessing.pickle path/to/feature_extraction.pickle path/to/dimensionality_reduction.pickle path/to/classifier.pickle``` The four pickle files correspond to the exported versions for the different pipeline steps as created by `run_preprocessing.py`, `extract_features.py`, `reduce_dimensionality.py`, and `run_classifier.py`, respectively, with the `-e` option. + +## Debugging in Visual Studio Code + +1. Run the file in debug mode, configured to wait for a client to attach, because otherwise it would just finish too quickly: + +``` +python -m debugpy --wait-for-client --listen 5678 .\src\feature_extraction\test\feature_extraction_test.py +``` + +2. Add a `launch.json` configuration that attaches the editor to the already running debug process: + +```json +... +"configurations": [ + { + "name": "Python: Attach", + "type": "python", + "request": "attach", + "connect": { + "host": "localhost", + "port": 5678 + } + }, +] +... +``` + +3. Start the attach debug configuration via the VS Code UI (the [F5] key or the `Run`/`Run and Debug` menu). + +## Running MLflow + +The runs logged by `run_classifier.py` can be inspected with the MLflow UI: + +``` +mlflow ui --backend-store-uri data/classification/mlflow +``` \ No newline at end of file diff --git a/code/application.sh b/code/application.sh deleted file mode 100755 index da31860e..00000000 --- a/code/application.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -# execute the application with all necessary pickle files -echo "Starting the application..." -python -m code.application.application data/preprocessing/pipeline.pickle data/feature_extraction/pipeline.pickle data/dimensionality_reduction/pipeline.pickle data/classification/classifier.pickle \ No newline at end of file diff --git a/code/application/application.py b/code/application/application.py deleted file mode 100644 index 84ecb543..00000000 --- a/code/application/application.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Console-based application for tweet classification. 
- -Created on Wed Sep 29 14:49:25 2021 - -@author: lbechberger -""" - -import argparse, pickle -import pandas as pd -from sklearn.pipeline import make_pipeline -from code.util import COLUMN_TWEET - -# setting up CLI -parser = argparse.ArgumentParser(description = "Application") -parser.add_argument("preprocessing_file", help = "path to the pickle file containing the preprocessing") -parser.add_argument("feature_file", help = "path to the pickle file containing the feature extraction") -parser.add_argument("dim_red_file", help = "path to the pickle file containing the dimensionality reduction") -parser.add_argument("classifier_file", help = "path to the pickle file containing the classifier") -args = parser.parse_args() - -# load all the pipeline steps -with open(args.preprocessing_file, 'rb') as f_in: - preprocessing = pickle.load(f_in) -with open(args.feature_file, 'rb') as f_in: - feature_extraction = pickle.load(f_in) -with open(args.dim_red_file, 'rb') as f_in: - dimensionality_reduction = pickle.load(f_in) -with open(args.classifier_file, 'rb') as f_in: - classifier = pickle.load(f_in)["classifier"] - -# chain them together into a single pipeline -pipeline = make_pipeline(preprocessing, feature_extraction, dimensionality_reduction, classifier) - -# headline output -print("Welcome to ViralTweeter v0.1!") -print("-----------------------------") -print("") - -while True: - # ask user for input - tweet = input("Please type in your tweet (type 'quit' to quit the program): ") - - # terminate if necessary - if tweet == "quit": - print("Okay, goodbye!") - break - - # if not terminated: create pandas DataFrame and put it through the pipeline - df = pd.DataFrame() - df[COLUMN_TWEET] = [tweet] - - prediction = pipeline.predict(df) - confidence = pipeline.predict_proba(df) - - print("Prediction: {0}, Confidence: {1}".format(prediction, confidence)) - print("") - diff --git a/code/classification.sh b/code/classification.sh deleted file mode 100755 index ceb7ac18..00000000 --- a/code/classification.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -# create directory if not yet existing -mkdir -p data/classification/ - -# run feature extraction on training set (may need to fit extractors) -echo " training set" -python -m code.classification.run_classifier data/dimensionality_reduction/training.pickle -e data/classification/classifier.pickle --knn 5 -s 42 --accuracy --kappa - -# run feature extraction on validation set (with pre-fit extractors) -echo " validation set" -python -m code.classification.run_classifier data/dimensionality_reduction/validation.pickle -i data/classification/classifier.pickle --accuracy --kappa - -# don't touch the test set, yet, because that would ruin the final generalization experiment! \ No newline at end of file diff --git a/code/classification/run_classifier.py b/code/classification/run_classifier.py deleted file mode 100644 index 414e0ce5..00000000 --- a/code/classification/run_classifier.py +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Train or evaluate a single classifier with its given set of hyperparameters. 
- -Created on Wed Sep 29 14:23:48 2021 - -@author: lbechberger -""" - -import argparse, pickle -from sklearn.dummy import DummyClassifier -from sklearn.metrics import accuracy_score, cohen_kappa_score -from sklearn.preprocessing import StandardScaler -from sklearn.neighbors import KNeighborsClassifier -from sklearn.pipeline import make_pipeline -from mlflow import log_metric, log_param, set_tracking_uri - -# setting up CLI -parser = argparse.ArgumentParser(description = "Classifier") -parser.add_argument("input_file", help = "path to the input pickle file") -parser.add_argument("-s", '--seed', type = int, help = "seed for the random number generator", default = None) -parser.add_argument("-e", "--export_file", help = "export the trained classifier to the given location", default = None) -parser.add_argument("-i", "--import_file", help = "import a trained classifier from the given location", default = None) -parser.add_argument("-m", "--majority", action = "store_true", help = "majority class classifier") -parser.add_argument("-f", "--frequency", action = "store_true", help = "label frequency classifier") -parser.add_argument("--knn", type = int, help = "k nearest neighbor classifier with the specified value of k", default = None) -parser.add_argument("-a", "--accuracy", action = "store_true", help = "evaluate using accuracy") -parser.add_argument("-k", "--kappa", action = "store_true", help = "evaluate using Cohen's kappa") -parser.add_argument("--log_folder", help = "where to log the mlflow results", default = "data/classification/mlflow") -args = parser.parse_args() - -# load data -with open(args.input_file, 'rb') as f_in: - data = pickle.load(f_in) - -set_tracking_uri(args.log_folder) - -if args.import_file is not None: - # import a pre-trained classifier - with open(args.import_file, 'rb') as f_in: - input_dict = pickle.load(f_in) - - classifier = input_dict["classifier"] - for param, value in input_dict["params"].items(): - log_param(param, value) - - log_param("dataset", "validation") - -else: # manually set up a classifier - - if args.majority: - # majority vote classifier - print(" majority vote classifier") - log_param("classifier", "majority") - params = {"classifier": "majority"} - classifier = DummyClassifier(strategy = "most_frequent", random_state = args.seed) - - elif args.frequency: - # label frequency classifier - print(" label frequency classifier") - log_param("classifier", "frequency") - params = {"classifier": "frequency"} - classifier = DummyClassifier(strategy = "stratified", random_state = args.seed) - - - elif args.knn is not None: - print(" {0} nearest neighbor classifier".format(args.knn)) - log_param("classifier", "knn") - log_param("k", args.knn) - params = {"classifier": "knn", "k": args.knn} - standardizer = StandardScaler() - knn_classifier = KNeighborsClassifier(args.knn, n_jobs = -1) - classifier = make_pipeline(standardizer, knn_classifier) - - classifier.fit(data["features"], data["labels"].ravel()) - log_param("dataset", "training") - -# now classify the given data -prediction = classifier.predict(data["features"]) - -# collect all evaluation metrics -evaluation_metrics = [] -if args.accuracy: - evaluation_metrics.append(("accuracy", accuracy_score)) -if args.kappa: - evaluation_metrics.append(("Cohen_kappa", cohen_kappa_score)) - -# compute and print them -for metric_name, metric in evaluation_metrics: - metric_value = metric(data["labels"], prediction) - print(" {0}: {1}".format(metric_name, metric_value)) - log_metric(metric_name, metric_value) - -# 
export the trained classifier if the user wants us to do so -if args.export_file is not None: - output_dict = {"classifier": classifier, "params": params} - with open(args.export_file, 'wb') as f_out: - pickle.dump(output_dict, f_out) \ No newline at end of file diff --git a/code/dimensionality_reduction.sh b/code/dimensionality_reduction.sh deleted file mode 100755 index b82230b5..00000000 --- a/code/dimensionality_reduction.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -# create directory if not yet existing -mkdir -p data/dimensionality_reduction/ - -# run dimensionality reduction on training set to fit the parameters -echo " training set" -python -m code.dimensionality_reduction.reduce_dimensionality data/feature_extraction/training.pickle data/dimensionality_reduction/training.pickle -e data/dimensionality_reduction/pipeline.pickle -m 1 --verbose - -# run feature extraction on validation set and test set (with pre-fit parameters) -echo " validation set" -python -m code.dimensionality_reduction.reduce_dimensionality data/feature_extraction/validation.pickle data/dimensionality_reduction/validation.pickle -i data/dimensionality_reduction/pipeline.pickle -echo " test set" -python -m code.dimensionality_reduction.reduce_dimensionality data/feature_extraction/test.pickle data/dimensionality_reduction/test.pickle -i data/dimensionality_reduction/pipeline.pickle diff --git a/code/dimensionality_reduction/reduce_dimensionality.py b/code/dimensionality_reduction/reduce_dimensionality.py deleted file mode 100644 index d2b27419..00000000 --- a/code/dimensionality_reduction/reduce_dimensionality.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Apply a dimensionality reduction technique. - -Created on Wed Sep 29 13:33:37 2021 - -@author: lbechberger -""" - -import argparse, pickle -from sklearn.feature_selection import SelectKBest, mutual_info_classif - - -# setting up CLI -parser = argparse.ArgumentParser(description = "Dimensionality reduction") -parser.add_argument("input_file", help = "path to the input pickle file") -parser.add_argument("output_file", help = "path to the output pickle file") -parser.add_argument("-e", "--export_file", help = "create a pipeline and export to the given location", default = None) -parser.add_argument("-i", "--import_file", help = "import an existing pipeline from the given location", default = None) -parser.add_argument("-m", "--mutual_information", type = int, help = "select K best features with Mutual Information", default = None) -parser.add_argument("--verbose", action = "store_true", help = "print information about feature selection process") -args = parser.parse_args() - -# load the data -with open(args.input_file, 'rb') as f_in: - input_data = pickle.load(f_in) - -features = input_data["features"] -labels = input_data["labels"] -feature_names = input_data["feature_names"] - -if args.import_file is not None: - # simply import an already fitted dimensionality reducer - with open(args.import_file, 'rb') as f_in: - dim_red = pickle.load(f_in) - -else: # need to set things up manually - - if args.mutual_information is not None: - # select K best based on Mutual Information - dim_red = SelectKBest(mutual_info_classif, k = args.mutual_information) - dim_red.fit(features, labels.ravel()) - - # resulting feature names based on support given by SelectKBest - def get_feature_names(kbest, names): - support = kbest.get_support() - result = [] - for name, selected in zip(names, support): - if selected: - result.append(name) - 
return result - - if args.verbose: - print(" SelectKBest with Mutual Information and k = {0}".format(args.mutual_information)) - print(" {0}".format(feature_names)) - print(" " + str(dim_red.scores_)) - print(" " + str(get_feature_names(dim_red, feature_names))) - pass - -# apply the dimensionality reduction to the given features -reduced_features = dim_red.transform(features) - -# store the results -output_data = {"features": reduced_features, - "labels": labels} -with open(args.output_file, 'wb') as f_out: - pickle.dump(output_data, f_out) - -# export the dimensionality reduction technique as pickle file if desired by user -if args.export_file is not None: - with open(args.export_file, 'wb') as f_out: - pickle.dump(dim_red, f_out) \ No newline at end of file diff --git a/code/examples.py b/code/examples.py deleted file mode 100644 index 69b2b3e3..00000000 --- a/code/examples.py +++ /dev/null @@ -1,212 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Thu Oct 7 09:20:09 2021 - -@author: ml -""" - -############################################################################### -######################## DATA VISUALIZATION ############################## -############################################################################### - -# plotting with pandas -import csv -import pandas as pd - -df = pd.read_csv("data/preprocessing/preprocessed.csv", quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n") - -df["language"].value_counts().plot(kind = 'bar') -df["language"].value_counts().plot(kind = 'bar', logy = True) - -df["date"] = df["date"].astype("datetime64") -df["label"].groupby(df["date"].dt.month).count().plot(kind = 'bar') - - -# plotting with matplotlib -import pickle -from matplotlib import pyplot as plt -import numpy as np - -with open("data/feature_extraction/training.pickle", "rb") as f_in: - data = pickle.load(f_in) - -features = data["features"] -labels = data["labels"] - -plt.hist(features) -plt.hist(features, range = [0,400]) - -pos = features[labels] -neg_index = np.array([not x for x in labels]) -neg = features[neg_index] - -bins = [0, 50, 100, 150, 200, 250, 300, 350, 400] - -plt.hist(pos, bins = bins) -plt.hist(neg, bins = bins) - -############################################################################### -######################## FEATURE EXTRACTION ############################## -############################################################################### - -# bigrams -import nltk -import string - -text = "John Wilkes Booth shot Abraham Lincoln. Abraham Lincoln was not shot inside the White House." -tokens = nltk.word_tokenize(text) -tokens = [token for token in tokens if token not in string.punctuation] - -bigrams = nltk.bigrams(tokens) -freq_dist = nltk.FreqDist(bigrams) -freq_list = [] -for bigram, freq in freq_dist.items(): - freq_list.append([bigram, freq]) -freq_list.sort(key = lambda x: x[1], reverse = True) -for i in range(len(freq_list)): - print(freq_list[i]) - - -# tf-idf -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.metrics.pairwise import cosine_similarity - -tweets = df["tweet"][:100] -vectorizer = TfidfVectorizer() -tf_idf_vectors = vectorizer.fit_transform(tweets).todense() - -print(tf_idf_vectors.shape) -print(vectorizer.get_feature_names()[142:145]) -print(tf_idf_vectors[66:71, 142:145]) - -tf_idf_similarities = cosine_similarity(tf_idf_vectors) -print(tf_idf_similarities[:5,:5]) - - -# NER -text = "John Wilkes Booth shot Abraham Lincoln. Abraham Lincoln was not shot inside the White House." 
-sentences = nltk.sent_tokenize(text) -for sentence in sentences: - words = nltk.word_tokenize(sentence) - pos_tagged = nltk.pos_tag(words) - ne_chunked = nltk.ne_chunk(pos_tagged) - print(ne_chunked) - - -# WordNet -dog_synsets = nltk.corpus.wordnet.synsets('dog') -for syn in dog_synsets: - words = [str(lemma.name()) for lemma in syn.lemmas()] - print(syn, words, syn.definition(), syn.hypernyms()) - print("") - - -# word2vec -import gensim.downloader as api - -embeddings = api.load('word2vec-google-news-300') -pairs = [('car', 'minivan'), ('car', 'airplane'), ('car', 'cereal')] - -for w1, w2 in pairs: - print("{0} - {1}: {2}".format(w1, w2, embeddings.similarity(w1, w2))) - -dog_vector = embeddings['dog'] - - -# one hot encoding -from sklearn.preprocessing import OneHotEncoder -import numpy as np - -features = np.array([["morning"], ["afternoon"], ["evening"], ["night"], ["afternoon"]]) -encoder = OneHotEncoder(sparse = False) -encoder.fit(features) -encoder.transform(features) - - -############################################################################### -##################### DIMENSIONALITY REDUCTION ########################### -############################################################################### - -from sklearn.datasets import load_breast_cancer -from sklearn.decomposition import PCA -from sklearn.linear_model import LogisticRegression -from sklearn.feature_selection import RFE, SelectKBest, mutual_info_classif, SelectFromModel -from sklearn.ensemble import RandomForestClassifier -import numpy as np - -data_set = load_breast_cancer() -X = data_set.data -y = data_set.target -print("Data Set: ", X.shape, y.shape) -print("Combinatorics of binary feature values:", 2**30) - - -# PCA -print("\nPCA") -print('---') -pca = PCA(random_state = 42) -pca.fit(X) -print("explained variance (percentage): ", pca.explained_variance_ratio_) -print('most important component: ', pca.components_[0]) -pca_transformed = pca.transform(X) -pca_transformed = pca_transformed[:,0:1] -print("after transformation: ", pca_transformed.shape, y.shape) -print("Compare: ", X[0], pca_transformed[0]) - - -# wrapper -print("\nWrapper") -print("-------") - -model = LogisticRegression(random_state = 42, max_iter = 10000) -rfe = RFE(model, n_features_to_select = 2) -rfe.fit(X,y) -print("Feature ranking according to RFE/LogReg:", rfe.ranking_) -index_of_first = np.where(rfe.ranking_ == 1)[0][0] -index_of_second = np.where(rfe.ranking_ == 2)[0][0] -print("Two most promising features: ", index_of_first, index_of_second) -wrapper_transformed = rfe.transform(X) -print("After transformation: ", wrapper_transformed.shape, y.shape) -print("compare: ", X[0], wrapper_transformed[0]) - - -# Filter -print("\n Filter") -print("------") -skb = SelectKBest(score_func = mutual_info_classif, k = 3) -skb.fit(X,y) -print("Feature scores according to MI: ", skb.scores_) -filter_transformed = skb.transform(X) -print("After transformation: ", filter_transformed.shape, y.shape) -print("Compare: ", X[0], filter_transformed[0]) - - - -# Embedded -print("\nEmbedded") -print("--------") -rf = RandomForestClassifier(n_estimators = 10, random_state=42) -rf.fit(X,y) -print("Feature imporance according to RF: ", rf.feature_importances_) -sfm = SelectFromModel(rf, threshold = 0.1, prefit = True) -embedded_transformed = sfm.transform(X) -print("After transformation: ", embedded_transformed.shape, y.shape) -print("Compare: ", X[0], embedded_transformed[0]) - - - - - - - - - - - - - - - - - diff --git a/code/feature_extraction.sh 
b/code/feature_extraction.sh deleted file mode 100755 index f494f835..00000000 --- a/code/feature_extraction.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -# create directory if not yet existing -mkdir -p data/feature_extraction/ - -# run feature extraction on training set (may need to fit extractors) -echo " training set" -python -m code.feature_extraction.extract_features data/preprocessing/split/training.csv data/feature_extraction/training.pickle -e data/feature_extraction/pipeline.pickle --char_length - -# run feature extraction on validation set and test set (with pre-fit extractors) -echo " validation set" -python -m code.feature_extraction.extract_features data/preprocessing/split/validation.csv data/feature_extraction/validation.pickle -i data/feature_extraction/pipeline.pickle -echo " test set" -python -m code.feature_extraction.extract_features data/preprocessing/split/test.csv data/feature_extraction/test.pickle -i data/feature_extraction/pipeline.pickle \ No newline at end of file diff --git a/code/feature_extraction/extract_features.py b/code/feature_extraction/extract_features.py deleted file mode 100644 index a3527acf..00000000 --- a/code/feature_extraction/extract_features.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Runs the specified collection of feature extractors. - -Created on Wed Sep 29 11:00:24 2021 - -@author: lbechberger -""" - -import argparse, csv, pickle -import pandas as pd -import numpy as np -from code.feature_extraction.character_length import CharacterLength -from code.feature_extraction.feature_collector import FeatureCollector -from code.util import COLUMN_TWEET, COLUMN_LABEL - - -# setting up CLI -parser = argparse.ArgumentParser(description = "Feature Extraction") -parser.add_argument("input_file", help = "path to the input csv file") -parser.add_argument("output_file", help = "path to the output pickle file") -parser.add_argument("-e", "--export_file", help = "create a pipeline and export to the given location", default = None) -parser.add_argument("-i", "--import_file", help = "import an existing pipeline from the given location", default = None) -parser.add_argument("-c", "--char_length", action = "store_true", help = "compute the number of characters in the tweet") -args = parser.parse_args() - -# load data -df = pd.read_csv(args.input_file, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n") - -if args.import_file is not None: - # simply import an exisiting FeatureCollector - with open(args.import_file, "rb") as f_in: - feature_collector = pickle.load(f_in) - -else: # need to create FeatureCollector manually - - # collect all feature extractors - features = [] - if args.char_length: - # character length of original tweet (without any changes) - features.append(CharacterLength(COLUMN_TWEET)) - - # create overall FeatureCollector - feature_collector = FeatureCollector(features) - - # fit it on the given data set (assumed to be training data) - feature_collector.fit(df) - - -# apply the given FeatureCollector on the current data set -# maps the pandas DataFrame to an numpy array -feature_array = feature_collector.transform(df) - -# get label array -label_array = np.array(df[COLUMN_LABEL]) -label_array = label_array.reshape(-1, 1) - -# store the results -results = {"features": feature_array, "labels": label_array, - "feature_names": feature_collector.get_feature_names()} -with open(args.output_file, 'wb') as f_out: - pickle.dump(results, f_out) - -# export the FeatureCollector as pickle file if 
desired by user -if args.export_file is not None: - with open(args.export_file, 'wb') as f_out: - pickle.dump(feature_collector, f_out) \ No newline at end of file diff --git a/code/preprocessing.sh b/code/preprocessing.sh deleted file mode 100755 index 61f83ea6..00000000 --- a/code/preprocessing.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -# create directory if not yet existing -mkdir -p data/preprocessing/split/ - -# install all NLTK models -python -m nltk.downloader all - -# add labels -echo " creating labels" -python -m code.preprocessing.create_labels data/raw/ data/preprocessing/labeled.csv - -# other preprocessing (removing punctuation etc.) -echo " general preprocessing" -python -m code.preprocessing.run_preprocessing data/preprocessing/labeled.csv data/preprocessing/preprocessed.csv --punctuation --tokenize -e data/preprocessing/pipeline.pickle - -# split the data set -echo " splitting the data set" -python -m code.preprocessing.split_data data/preprocessing/preprocessed.csv data/preprocessing/split/ -s 42 \ No newline at end of file diff --git a/code/preprocessing/create_labels.py b/code/preprocessing/create_labels.py deleted file mode 100644 index 21b1748d..00000000 --- a/code/preprocessing/create_labels.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Reads in the original csv files and creates labels for the data points. -Stores the result as a single pandas DataFrame in a pickle file. - -Created on Tue Sep 28 15:55:44 2021 - -@author: lbechberger -""" - -import os, argparse, csv -import pandas as pd -from code.util import COLUMN_LIKES, COLUMN_RETWEETS, COLUMN_LABEL - -# setting up CLI -parser = argparse.ArgumentParser(description = "Creation of Labels") -parser.add_argument("data_directory", help = "directory where the original csv files reside") -parser.add_argument("output_file", help = "path to the output csv file") -parser.add_argument("-l", '--likes_weight', type = int, help = "weight of likes", default = 1) -parser.add_argument("-r", '--retweet_weight', type = int, help = "weight of retweets", default = 1) -parser.add_argument("-t", '--threshold', type = int, help = "threshold to surpass for positive class", default = 50) -args = parser.parse_args() - -# get all csv files in data_directory -file_paths = [args.data_directory + f for f in os.listdir(args.data_directory) if f.endswith(".csv")] - -# load all csv files -dfs = [] -for file_path in file_paths: - dfs.append(pd.read_csv(file_path, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n")) - -# join all data into a single DataFrame -df = pd.concat(dfs) - -# compute new column "label" based on likes and retweets -df[COLUMN_LABEL] = (args.likes_weight * df[COLUMN_LIKES] + args.retweet_weight * df[COLUMN_RETWEETS]) > args.threshold - -# print statistics -print("Number of tweets: {0}".format(len(df))) -print("Label distribution:") -print(df[COLUMN_LABEL].value_counts(normalize = True)) - -# store the DataFrame into a csv file -df.to_csv(args.output_file, index = False, quoting = csv.QUOTE_NONNUMERIC, line_terminator = "\n") \ No newline at end of file diff --git a/code/preprocessing/run_preprocessing.py b/code/preprocessing/run_preprocessing.py deleted file mode 100644 index 72130a30..00000000 --- a/code/preprocessing/run_preprocessing.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Runs the specified collection of preprocessing steps - -Created on Tue Sep 28 16:43:18 2021 - -@author: lbechberger -""" - -import argparse, csv, pickle 
-import pandas as pd -from sklearn.pipeline import make_pipeline -from code.preprocessing.punctuation_remover import PunctuationRemover -from code.preprocessing.tokenizer import Tokenizer -from code.util import COLUMN_TWEET, SUFFIX_TOKENIZED - -# setting up CLI -parser = argparse.ArgumentParser(description = "Various preprocessing steps") -parser.add_argument("input_file", help = "path to the input csv file") -parser.add_argument("output_file", help = "path to the output csv file") -parser.add_argument("-p", "--punctuation", action = "store_true", help = "remove punctuation") -parser.add_argument("-t", "--tokenize", action = "store_true", help = "tokenize given column into individual words") -parser.add_argument("--tokenize_input", help = "input column to tokenize", default = COLUMN_TWEET) -parser.add_argument("-e", "--export_file", help = "create a pipeline and export to the given location", default = None) -args = parser.parse_args() - -# load data -df = pd.read_csv(args.input_file, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n") - -# collect all preprocessors -preprocessors = [] -if args.punctuation: - preprocessors.append(PunctuationRemover()) -if args.tokenize: - preprocessors.append(Tokenizer(args.tokenize_input, args.tokenize_input + SUFFIX_TOKENIZED)) - -# call all preprocessing steps -for preprocessor in preprocessors: - df = preprocessor.fit_transform(df) - -# store the results -df.to_csv(args.output_file, index = False, quoting = csv.QUOTE_NONNUMERIC, line_terminator = "\n") - -# create a pipeline if necessary and store it as pickle file -if args.export_file is not None: - pipeline = make_pipeline(*preprocessors) - with open(args.export_file, 'wb') as f_out: - pickle.dump(pipeline, f_out) \ No newline at end of file diff --git a/code/preprocessing/split_data.py b/code/preprocessing/split_data.py deleted file mode 100644 index 57bad668..00000000 --- a/code/preprocessing/split_data.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Splits the preprocessed data into training, validation, and test set. 
- -Created on Tue Sep 28 16:45:51 2021 - -@author: lbechberger -""" - -import os, argparse, csv -import pandas as pd -from sklearn.model_selection import train_test_split -from code.util import COLUMN_LABEL - -# setting up CLI -parser = argparse.ArgumentParser(description = "Splitting the data set") -parser.add_argument("input_file", help = "path to the input csv file") -parser.add_argument("output_folder", help = "path to the output folder") -parser.add_argument("-s", '--seed', type = int, help = "seed for the random number generator", default = None) -parser.add_argument("-t", '--test_size', type = float, help = "relative size of the test set", default = 0.2) -parser.add_argument("-v", '--validation_size', type = float, help = "relative size of the validation set", default = 0.2) -args = parser.parse_args() - -# load the data -df = pd.read_csv(args.input_file, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n") - -# split into (training & validation) and test set -X, X_test = train_test_split(df, test_size = args.test_size, random_state = args.seed, shuffle = True, stratify = df[COLUMN_LABEL]) - -# split remainder into training and validation -relative_validation_size = args.validation_size / (1 - args.test_size) -X_train, X_val = train_test_split(X, test_size = relative_validation_size, random_state = args.seed, shuffle = True, stratify = X[COLUMN_LABEL]) - -# store the three data sets separately -X_train.to_csv(os.path.join(args.output_folder, "training.csv"), index = False, quoting = csv.QUOTE_NONNUMERIC, line_terminator = "\n") -X_val.to_csv(os.path.join(args.output_folder, "validation.csv"), index = False, quoting = csv.QUOTE_NONNUMERIC, line_terminator = "\n") -X_test.to_csv(os.path.join(args.output_folder, "test.csv"), index = False, quoting = csv.QUOTE_NONNUMERIC, line_terminator = "\n") - -print("Training: {0} examples, Validation: {1} examples, Test: {2} examples".format(len(X_train), len(X_val), len(X_test))) \ No newline at end of file diff --git a/code/util.py b/code/util.py deleted file mode 100644 index 7d8794c7..00000000 --- a/code/util.py +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Utility file for collecting frequently used constants and helper functions. 
- -Created on Wed Sep 29 10:50:36 2021 - -@author: lbechberger -""" - -# column names for the original data frame -COLUMN_TWEET = "tweet" -COLUMN_LIKES = "likes_count" -COLUMN_RETWEETS = "retweets_count" - -# column names of novel columns for preprocessing -COLUMN_LABEL = "label" -COLUMN_PUNCTUATION = "tweet_no_punctuation" - -SUFFIX_TOKENIZED = "_tokenized" \ No newline at end of file diff --git a/data/classification/classifier.pickle b/data/classification/classifier.pickle index 012911f3..70f98197 100644 Binary files a/data/classification/classifier.pickle and b/data/classification/classifier.pickle differ diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/meta.yaml b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/meta.yaml new file mode 100644 index 00000000..1cb5f70d --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/artifacts +end_time: 1634906752479 +entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: 29048f3f5892425cb3622a4ad04a8c0b +run_uuid: 29048f3f5892425cb3622a4ad04a8c0b +source_name: '' +source_type: 4 +source_version: '' +start_time: 1634906752379 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Accuracy b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Accuracy new file mode 100644 index 00000000..36a0a585 --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Accuracy @@ -0,0 +1 @@ +1634906752471 0.8315562773619545 0 diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Cohen_kappa b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Cohen_kappa new file mode 100644 index 00000000..a24b8a34 --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Cohen_kappa @@ -0,0 +1 @@ +1634906752473 0.010411614999104146 0 diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/F1-Score b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/F1-Score new file mode 100644 index 00000000..352b4320 --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/F1-Score @@ -0,0 +1 @@ +1634906752476 0.10336402931779741 0 diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Jaccard b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Jaccard new file mode 100644 index 00000000..f3be7376 --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Jaccard @@ -0,0 +1 @@ +1634906752478 0.05449861276258423 0 diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Precision b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Precision new file mode 100644 index 00000000..2b5eff49 --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Precision @@ -0,0 +1 @@ +1634906752474 0.10361718161266013 0 diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Recall b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Recall new file mode 100644 index 00000000..bcc48219 --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/metrics/Recall @@ -0,0 +1 @@ +1634906752475 0.10311211098612673 0 diff --git 
a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/params/classifier b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/params/classifier new file mode 100644 index 00000000..b11cc475 --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/params/classifier @@ -0,0 +1 @@ +stratified \ No newline at end of file diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/params/dataset b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/params/dataset new file mode 100644 index 00000000..efc02160 --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/params/dataset @@ -0,0 +1 @@ +validation \ No newline at end of file diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.source.git.commit b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.source.git.commit new file mode 100644 index 00000000..73f681b9 --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +a07f531063b7ce83182c0226a382000c0df50b8d \ No newline at end of file diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.source.name b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.source.type b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.user b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/29048f3f5892425cb3622a4ad04a8c0b/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/meta.yaml b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/meta.yaml new file mode 100644 index 00000000..3c08d2d6 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/artifacts +end_time: 1634906750278 +entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: 4d5b6cef36004ae1af5f9aad10adf64a +run_uuid: 4d5b6cef36004ae1af5f9aad10adf64a +source_name: '' +source_type: 4 +source_version: '' +start_time: 1634906750021 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Accuracy b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Accuracy new file mode 100644 index 00000000..b42c9de2 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Accuracy @@ -0,0 +1 @@ +1634906750272 0.8298910229251989 0 diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Cohen_kappa 
b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Cohen_kappa new file mode 100644 index 00000000..9a759866 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Cohen_kappa @@ -0,0 +1 @@ +1634906750274 -0.0008600519779315974 0 diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/F1-Score b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/F1-Score new file mode 100644 index 00000000..2de8d9b5 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/F1-Score @@ -0,0 +1 @@ +1634906750276 0.09299407021617041 0 diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Jaccard b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Jaccard new file mode 100644 index 00000000..70b292b3 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Jaccard @@ -0,0 +1 @@ +1634906750277 0.0487644368398539 0 diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Precision b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Precision new file mode 100644 index 00000000..8eb2df09 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Precision @@ -0,0 +1 @@ +1634906750274 0.09337197580645161 0 diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Recall b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Recall new file mode 100644 index 00000000..8f2f77b3 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/metrics/Recall @@ -0,0 +1 @@ +1634906750275 0.09261921129929379 0 diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/params/classifier b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/params/classifier new file mode 100644 index 00000000..b11cc475 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/params/classifier @@ -0,0 +1 @@ +stratified \ No newline at end of file diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/params/dataset b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/params/dataset new file mode 100644 index 00000000..ce15c0a9 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/params/dataset @@ -0,0 +1 @@ +training \ No newline at end of file diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.source.git.commit b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.source.git.commit new file mode 100644 index 00000000..73f681b9 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +a07f531063b7ce83182c0226a382000c0df50b8d \ No newline at end of file diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.source.name b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.source.type 
b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.user b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/4d5b6cef36004ae1af5f9aad10adf64a/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/meta.yaml b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/meta.yaml new file mode 100644 index 00000000..31b2fc75 --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/artifacts +end_time: 1634906743997 +entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: 568feaea689947798516e2a96b7edc58 +run_uuid: 568feaea689947798516e2a96b7edc58 +source_name: '' +source_type: 4 +source_version: '' +start_time: 1634906743903 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Accuracy b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Accuracy new file mode 100644 index 00000000..00dbc64d --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Accuracy @@ -0,0 +1 @@ +1634906743991 0.9058395706821071 0 diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Cohen_kappa b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Cohen_kappa new file mode 100644 index 00000000..9b8c5db1 --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Cohen_kappa @@ -0,0 +1 @@ +1634906743992 0.0 0 diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/F1-Score b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/F1-Score new file mode 100644 index 00000000..37842d2e --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/F1-Score @@ -0,0 +1 @@ +1634906743995 0.0 0 diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Jaccard b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Jaccard new file mode 100644 index 00000000..37842d2e --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Jaccard @@ -0,0 +1 @@ +1634906743995 0.0 0 diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Precision b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Precision new file mode 100644 index 00000000..26d795f4 --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Precision @@ -0,0 +1 @@ +1634906743993 0.0 0 diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Recall b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Recall new file mode 100644 index 00000000..52fb372f --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/metrics/Recall @@ -0,0 +1 @@ +1634906743994 0.0 0 diff --git 
a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/params/classifier b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/params/classifier new file mode 100644 index 00000000..ede38720 --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/params/classifier @@ -0,0 +1 @@ +most_frequent \ No newline at end of file diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/params/dataset b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/params/dataset new file mode 100644 index 00000000..efc02160 --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/params/dataset @@ -0,0 +1 @@ +validation \ No newline at end of file diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.source.git.commit b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.source.git.commit new file mode 100644 index 00000000..73f681b9 --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +a07f531063b7ce83182c0226a382000c0df50b8d \ No newline at end of file diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.source.name b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.source.type b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.user b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/568feaea689947798516e2a96b7edc58/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/meta.yaml b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/meta.yaml new file mode 100644 index 00000000..61a3f71b --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/artifacts +end_time: 1634906771848 +entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: 62af76001a3e4770beda60181362e4e5 +run_uuid: 62af76001a3e4770beda60181362e4e5 +source_name: '' +source_type: 4 +source_version: '' +start_time: 1634906771681 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Accuracy b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Accuracy new file mode 100644 index 00000000..71645d47 --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Accuracy @@ -0,0 +1 @@ +1634906771842 0.9045509108882926 0 diff --git 
a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Cohen_kappa b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Cohen_kappa new file mode 100644 index 00000000..17ce5fa1 --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Cohen_kappa @@ -0,0 +1 @@ +1634906771843 0.04186413205264283 0 diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/F1-Score b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/F1-Score new file mode 100644 index 00000000..ce0c4be1 --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/F1-Score @@ -0,0 +1 @@ +1634906771846 0.05389326334208224 0 diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Jaccard b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Jaccard new file mode 100644 index 00000000..3c075306 --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Jaccard @@ -0,0 +1 @@ +1634906771847 0.0276928609962237 0 diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Precision b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Precision new file mode 100644 index 00000000..ad0a2c0e --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Precision @@ -0,0 +1 @@ +1634906771844 0.4041994750656168 0 diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Recall b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Recall new file mode 100644 index 00000000..cb94e5e5 --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/metrics/Recall @@ -0,0 +1 @@ +1634906771845 0.028871391076115485 0 diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/params/classifier b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/params/classifier new file mode 100644 index 00000000..f5035153 --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/params/classifier @@ -0,0 +1 @@ +randomforest \ No newline at end of file diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/params/dataset b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/params/dataset new file mode 100644 index 00000000..efc02160 --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/params/dataset @@ -0,0 +1 @@ +validation \ No newline at end of file diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/params/n b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/params/n new file mode 100644 index 00000000..9a037142 --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/params/n @@ -0,0 +1 @@ +10 \ No newline at end of file diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.source.git.commit b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.source.git.commit new file mode 100644 index 00000000..73f681b9 --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +a07f531063b7ce83182c0226a382000c0df50b8d \ No newline at end of file diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.source.name b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.source.name new 
file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.source.type b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.user b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/62af76001a3e4770beda60181362e4e5/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/meta.yaml b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/meta.yaml new file mode 100644 index 00000000..8e453c52 --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/artifacts +end_time: 1634906791009 +entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: 73628a4a7c194985bf8ad402d54d9e11 +run_uuid: 73628a4a7c194985bf8ad402d54d9e11 +source_name: '' +source_type: 4 +source_version: '' +start_time: 1634906778765 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Accuracy b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Accuracy new file mode 100644 index 00000000..e796f24b --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Accuracy @@ -0,0 +1 @@ +1634906790986 0.8436602645577367 0 diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Cohen_kappa b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Cohen_kappa new file mode 100644 index 00000000..e67e8622 --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Cohen_kappa @@ -0,0 +1 @@ +1634906790989 0.0993382191603408 0 diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/F1-Score b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/F1-Score new file mode 100644 index 00000000..592a11f3 --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/F1-Score @@ -0,0 +1 @@ +1634906790993 0.18577426373693726 0 diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Jaccard b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Jaccard new file mode 100644 index 00000000..088fbcdc --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Jaccard @@ -0,0 +1 @@ +1634906790993 0.10239864864864864 0 diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Precision b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Precision new file mode 100644 index 00000000..9eda6a53 --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Precision @@ -0,0 +1 @@ 
+1634906790990 0.18226097414311485 0 diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Recall b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Recall new file mode 100644 index 00000000..5014ca2c --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/metrics/Recall @@ -0,0 +1 @@ +1634906790991 0.18942566089619398 0 diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/params/classifier b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/params/classifier new file mode 100644 index 00000000..eecfc333 --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/params/classifier @@ -0,0 +1 @@ +knn \ No newline at end of file diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/params/dataset b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/params/dataset new file mode 100644 index 00000000..ce15c0a9 --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/params/dataset @@ -0,0 +1 @@ +training \ No newline at end of file diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/params/k b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/params/k new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/params/k @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.source.git.commit b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.source.git.commit new file mode 100644 index 00000000..73f681b9 --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +a07f531063b7ce83182c0226a382000c0df50b8d \ No newline at end of file diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.source.name b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.source.type b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.user b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/73628a4a7c194985bf8ad402d54d9e11/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/meta.yaml b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/meta.yaml new file mode 100644 index 00000000..89d99cca --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/artifacts +end_time: 1634906769598 
+entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: a7e9e6e2984448b39f2b82f11b4ed46c +run_uuid: a7e9e6e2984448b39f2b82f11b4ed46c +source_name: '' +source_type: 4 +source_version: '' +start_time: 1634906768317 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Accuracy b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Accuracy new file mode 100644 index 00000000..98abeb7c --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Accuracy @@ -0,0 +1 @@ +1634906769587 0.908393353104552 0 diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Cohen_kappa b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Cohen_kappa new file mode 100644 index 00000000..c2a029b6 --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Cohen_kappa @@ -0,0 +1 @@ +1634906769588 0.08152669872440343 0 diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/F1-Score b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/F1-Score new file mode 100644 index 00000000..6e983bad --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/F1-Score @@ -0,0 +1 @@ +1634906769591 0.09330227140361096 0 diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Jaccard b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Jaccard new file mode 100644 index 00000000..79473c13 --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Jaccard @@ -0,0 +1 @@ +1634906769592 0.048933960535157923 0 diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Precision b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Precision new file mode 100644 index 00000000..e69642af --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Precision @@ -0,0 +1 @@ +1634906769589 0.6852010265183918 0 diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Recall b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Recall new file mode 100644 index 00000000..3f9ff12a --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/metrics/Recall @@ -0,0 +1 @@ +1634906769590 0.05005937128929442 0 diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/params/classifier b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/params/classifier new file mode 100644 index 00000000..f5035153 --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/params/classifier @@ -0,0 +1 @@ +randomforest \ No newline at end of file diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/params/dataset b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/params/dataset new file mode 100644 index 00000000..ce15c0a9 --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/params/dataset @@ -0,0 +1 @@ +training \ No newline at end of file diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/params/n b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/params/n new file mode 100644 index 00000000..9a037142 --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/params/n @@ -0,0 +1 @@ +10 \ No 
newline at end of file diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.source.git.commit b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.source.git.commit new file mode 100644 index 00000000..73f681b9 --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +a07f531063b7ce83182c0226a382000c0df50b8d \ No newline at end of file diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.source.name b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.source.type b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.user b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/a7e9e6e2984448b39f2b82f11b4ed46c/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/meta.yaml b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/meta.yaml new file mode 100644 index 00000000..76595577 --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/artifacts +end_time: 1634906741758 +entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: af59d8fd6467448887013f0561655ca2 +run_uuid: af59d8fd6467448887013f0561655ca2 +source_name: '' +source_type: 4 +source_version: '' +start_time: 1634906741516 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Accuracy b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Accuracy new file mode 100644 index 00000000..8e439910 --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Accuracy @@ -0,0 +1 @@ +1634906741751 0.905845454973403 0 diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Cohen_kappa b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Cohen_kappa new file mode 100644 index 00000000..4b5cb0b2 --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Cohen_kappa @@ -0,0 +1 @@ +1634906741752 0.0 0 diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/F1-Score b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/F1-Score new file mode 100644 index 00000000..a32d8f40 --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/F1-Score @@ -0,0 +1 @@ +1634906741755 0.0 0 diff --git 
a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Jaccard b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Jaccard new file mode 100644 index 00000000..e3a1e26b --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Jaccard @@ -0,0 +1 @@ +1634906741756 0.0 0 diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Precision b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Precision new file mode 100644 index 00000000..07a50e00 --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Precision @@ -0,0 +1 @@ +1634906741753 0.0 0 diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Recall b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Recall new file mode 100644 index 00000000..e5d55367 --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/metrics/Recall @@ -0,0 +1 @@ +1634906741754 0.0 0 diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/params/classifier b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/params/classifier new file mode 100644 index 00000000..ede38720 --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/params/classifier @@ -0,0 +1 @@ +most_frequent \ No newline at end of file diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/params/dataset b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/params/dataset new file mode 100644 index 00000000..ce15c0a9 --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/params/dataset @@ -0,0 +1 @@ +training \ No newline at end of file diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.source.git.commit b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.source.git.commit new file mode 100644 index 00000000..73f681b9 --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +a07f531063b7ce83182c0226a382000c0df50b8d \ No newline at end of file diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.source.name b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.source.type b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.user b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/af59d8fd6467448887013f0561655ca2/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/meta.yaml 
b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/meta.yaml new file mode 100644 index 00000000..60c7576f --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/meta.yaml @@ -0,0 +1,15 @@ +artifact_uri: data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/artifacts +end_time: 1634906794982 +entry_point_name: '' +experiment_id: '0' +lifecycle_stage: active +name: '' +run_id: b0bb43858340487191c30da4b1a7218e +run_uuid: b0bb43858340487191c30da4b1a7218e +source_name: '' +source_type: 4 +source_version: '' +start_time: 1634906793160 +status: 3 +tags: [] +user_id: Krext diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Accuracy b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Accuracy new file mode 100644 index 00000000..77dc0de8 --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Accuracy @@ -0,0 +1 @@ +1634906794975 0.8351045050134162 0 diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Cohen_kappa b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Cohen_kappa new file mode 100644 index 00000000..7b368705 --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Cohen_kappa @@ -0,0 +1 @@ +1634906794977 0.06410899327093145 0 diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/F1-Score b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/F1-Score new file mode 100644 index 00000000..68123d0f --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/F1-Score @@ -0,0 +1 @@ +1634906794979 0.15534858486300748 0 diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Jaccard b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Jaccard new file mode 100644 index 00000000..535ef0da --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Jaccard @@ -0,0 +1 @@ +1634906794980 0.0842156862745098 0 diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Precision b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Precision new file mode 100644 index 00000000..7aa4cad5 --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Precision @@ -0,0 +1 @@ +1634906794978 0.15004366812227074 0 diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Recall b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Recall new file mode 100644 index 00000000..df747ad3 --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/metrics/Recall @@ -0,0 +1 @@ +1634906794979 0.16104236970378702 0 diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/params/classifier b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/params/classifier new file mode 100644 index 00000000..eecfc333 --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/params/classifier @@ -0,0 +1 @@ +knn \ No newline at end of file diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/params/dataset b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/params/dataset new file mode 100644 index 00000000..efc02160 --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/params/dataset @@ -0,0 +1 @@ +validation \ No newline 
at end of file diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/params/k b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/params/k new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/params/k @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.source.git.commit b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.source.git.commit new file mode 100644 index 00000000..73f681b9 --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.source.git.commit @@ -0,0 +1 @@ +a07f531063b7ce83182c0226a382000c0df50b8d \ No newline at end of file diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.source.name b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.source.name new file mode 100644 index 00000000..a50988a9 --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.source.name @@ -0,0 +1 @@ +E:\MyPC\code\git\myforkMLiP\MLinPractice\src\classification\run_classifier.py \ No newline at end of file diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.source.type b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.source.type new file mode 100644 index 00000000..0c2c1fe9 --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.source.type @@ -0,0 +1 @@ +LOCAL \ No newline at end of file diff --git a/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.user b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.user new file mode 100644 index 00000000..d10f720c --- /dev/null +++ b/data/classification/mlflow/0/b0bb43858340487191c30da4b1a7218e/tags/mlflow.user @@ -0,0 +1 @@ +Krext \ No newline at end of file diff --git a/data/classification/mlflow/0/meta.yaml b/data/classification/mlflow/0/meta.yaml new file mode 100644 index 00000000..0c88f710 --- /dev/null +++ b/data/classification/mlflow/0/meta.yaml @@ -0,0 +1,4 @@ +artifact_location: data/classification/mlflow/0 +experiment_id: '0' +lifecycle_stage: active +name: Default diff --git a/data/dimensionality_reduction/pipeline.pickle b/data/dimensionality_reduction/pipeline.pickle index 566baf5a..89e42eb8 100644 Binary files a/data/dimensionality_reduction/pipeline.pickle and b/data/dimensionality_reduction/pipeline.pickle differ diff --git a/data/dimensionality_reduction/test.pickle b/data/dimensionality_reduction/test.pickle index 40ffb175..c25cc843 100644 Binary files a/data/dimensionality_reduction/test.pickle and b/data/dimensionality_reduction/test.pickle differ diff --git a/data/dimensionality_reduction/training.pickle b/data/dimensionality_reduction/training.pickle index f87bd9b5..f58360a4 100644 Binary files a/data/dimensionality_reduction/training.pickle and b/data/dimensionality_reduction/training.pickle differ diff --git a/data/dimensionality_reduction/validation.pickle b/data/dimensionality_reduction/validation.pickle index d3ced73e..d192e83a 100644 Binary files a/data/dimensionality_reduction/validation.pickle and b/data/dimensionality_reduction/validation.pickle differ diff --git a/data/feature_extraction/pipeline.pickle b/data/feature_extraction/pipeline.pickle index e7be5a45..dc587831 100644 Binary files a/data/feature_extraction/pipeline.pickle and 
b/data/feature_extraction/pipeline.pickle differ diff --git a/data/feature_extraction/test.pickle b/data/feature_extraction/test.pickle index a96dba62..caa35ec5 100644 Binary files a/data/feature_extraction/test.pickle and b/data/feature_extraction/test.pickle differ diff --git a/data/feature_extraction/training.pickle b/data/feature_extraction/training.pickle index df5d0d53..9eb41950 100644 Binary files a/data/feature_extraction/training.pickle and b/data/feature_extraction/training.pickle differ diff --git a/data/feature_extraction/validation.pickle b/data/feature_extraction/validation.pickle index f3c5ced4..c1616ae1 100644 Binary files a/data/feature_extraction/validation.pickle and b/data/feature_extraction/validation.pickle differ diff --git a/code/__init__.py b/src/__init__.py similarity index 100% rename from code/__init__.py rename to src/__init__.py diff --git a/src/application.sh b/src/application.sh new file mode 100755 index 00000000..789a114a --- /dev/null +++ b/src/application.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +# execute the application with all necessary pickle files +echo "Starting the application..." +python -m src.application.application data/preprocessing/pipeline.pickle data/feature_extraction/pipeline.pickle data/dimensionality_reduction/pipeline.pickle data/classification/classifier.pickle \ No newline at end of file diff --git a/code/application/__init__.py b/src/application/__init__.py similarity index 100% rename from code/application/__init__.py rename to src/application/__init__.py diff --git a/src/application/application.py b/src/application/application.py new file mode 100644 index 00000000..ebaae0bd --- /dev/null +++ b/src/application/application.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Console-based application for tweet classification. 
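+
+Invoked with the four pickled pipeline steps as arguments (same call as in src/application.sh):
+python -m src.application.application data/preprocessing/pipeline.pickle data/feature_extraction/pipeline.pickle data/dimensionality_reduction/pipeline.pickle data/classification/classifier.pickle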
+""" + +import argparse, pickle +import pandas as pd +from sklearn.pipeline import make_pipeline +from src.util import COLUMN_TWEET + + +def main(): + # setting up CLI + parser = argparse.ArgumentParser(description = "Application") + parser.add_argument("preprocessing_file", help = "path to the pickle file containing the preprocessing") + parser.add_argument("feature_file", help = "path to the pickle file containing the feature extraction") + parser.add_argument("dim_red_file", help = "path to the pickle file containing the dimensionality reduction") + parser.add_argument("classifier_file", help = "path to the pickle file containing the classifier") + args = parser.parse_args() + + # load all the pipeline steps + with open(args.preprocessing_file, 'rb') as f_in: + preprocessing = pickle.load(f_in) + with open(args.feature_file, 'rb') as f_in: + feature_extraction = pickle.load(f_in) + with open(args.dim_red_file, 'rb') as f_in: + dimensionality_reduction = pickle.load(f_in) + with open(args.classifier_file, 'rb') as f_in: + classifier = pickle.load(f_in)["classifier"] + + # chain them together into a single pipeline + pipeline = make_pipeline(preprocessing, feature_extraction, dimensionality_reduction, classifier) + + # headline output + print("Welcome to ViralTweeter v0.1!") + print("-----------------------------") + print("") + + while True: + # ask user for input + tweet = input("Please type in your tweet (type 'quit' to quit the program): ") + + # terminate if necessary + if tweet == "quit": + print("Okay, goodbye!") + break + + # if not terminated: create pandas DataFrame and put it through the pipeline + df = pd.DataFrame() + df[COLUMN_TWEET] = [tweet] + + prediction = pipeline.predict(df) + confidence = pipeline.predict_proba(df) + + print("Prediction: {0}, Confidence: {1}".format(prediction, confidence)) + print("") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/classification.sh b/src/classification.sh new file mode 100755 index 00000000..66ca39d9 --- /dev/null +++ b/src/classification.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# create directory if not yet existing +mkdir -p data/classification/ + +# run feature extraction on training set (may need to fit extractors) + +# echo " training set" + +# uncomment one of these lines to train a specific classifier +python -m src.classification.run_classifier data/dimensionality_reduction/training.pickle -e data/classification/classifier.pickle -s 42 --knn 1 --metrics all +# python -m src.classification.run_classifier data/dimensionality_reduction/training.pickle -e data/classification/classifier.pickle -s 42 --randomforest 10 --metrics all +# python -m src.classification.run_classifier data/dimensionality_reduction/training.pickle -e data/classification/classifier.pickle -s 42 --dummyclassifier stratified --metrics all +# python -m src.classification.run_classifier data/dimensionality_reduction/training.pickle -e data/classification/classifier.pickle -s 42 --dummyclassifier most_frequent --metrics all + +# run feature extraction on validation set (with pre-fit extractors) +# echo " validation set" +python -m src.classification.run_classifier data/dimensionality_reduction/validation.pickle -i data/classification/classifier.pickle --metrics all + +# don't touch the test set, yet, because that would ruin the final generalization experiment! 
\ No newline at end of file diff --git a/code/classification/__init__.py b/src/classification/__init__.py similarity index 100% rename from code/classification/__init__.py rename to src/classification/__init__.py diff --git a/code/classification/classifier.sge b/src/classification/classifier.sge similarity index 63% rename from code/classification/classifier.sge rename to src/classification/classifier.sge index 5b03d664..80d44491 100755 --- a/code/classification/classifier.sge +++ b/src/classification/classifier.sge @@ -13,10 +13,10 @@ conda activate MLinPractice # train classifier on training set echo " training" -python -m code.classification.run_classifier data/dimensionality_reduction/training.pickle -e $* +python -m src.classification.run_classifier data/dimensionality_reduction/training.pickle -e $* # evaluate classifier on validation set echo " validation" -python -m code.classification.run_classifier data/dimensionality_reduction/validation.pickle -i $* +python -m src.classification.run_classifier data/dimensionality_reduction/validation.pickle -i $* conda deactivate \ No newline at end of file diff --git a/code/classification/grid_search.sh b/src/classification/grid_search.sh similarity index 82% rename from code/classification/grid_search.sh rename to src/classification/grid_search.sh index 6897508f..59700720 100755 --- a/code/classification/grid_search.sh +++ b/src/classification/grid_search.sh @@ -10,11 +10,11 @@ values_of_k=("1 2 3 4 5 6 7 8 9 10") if [ $1 = local ] then echo "[local execution]" - cmd="code/classification/classifier.sge" + cmd="src/classification/classifier.sge" elif [ $1 = grid ] then echo "[grid execution]" - cmd="qsub code/classification/classifier.sge" + cmd="qsub src/classification/classifier.sge" else echo "[ERROR! Argument not supported!]" exit 1 diff --git a/src/classification/run_classifier.py b/src/classification/run_classifier.py new file mode 100644 index 00000000..e65ed551 --- /dev/null +++ b/src/classification/run_classifier.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Train or evaluate a single classifier with its given set of hyperparameters. 
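+
+Example invocation (flags as used in src/classification.sh):
+python -m src.classification.run_classifier data/dimensionality_reduction/training.pickle -e data/classification/classifier.pickle -s 42 --knn 1 --metrics all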
+""" + +import argparse, pickle, os +from pathlib import Path +from sys import float_info +from typing import Any, Callable, List, Tuple +from sklearn.dummy import DummyClassifier +from sklearn.preprocessing import StandardScaler +from sklearn.neighbors import KNeighborsClassifier +from sklearn.ensemble import RandomForestClassifier +from sklearn.pipeline import make_pipeline +from mlflow import log_metric, log_param, set_tracking_uri + +from sklearn.metrics import accuracy_score, cohen_kappa_score, precision_score, recall_score, f1_score, jaccard_score + +METR_ACC = "accuracy" +METR_KAPPA = "kappa" +METR_PREC = "precision" +METR_REC = "recall" +METR_F1 = "f1" +METR_JAC = "jaccard" + +def main(): + # setting up CLI + parser = argparse.ArgumentParser(description = "Classifier") + parser.add_argument("input_file", help = "path to the input pickle file") + parser.add_argument("-s", '--seed', type = int, help = "seed for the random number generator", default = None) + parser.add_argument("-e", "--export_file", help = "export the trained classifier to the given location", default = None) + parser.add_argument("-i", "--import_file", help = "import a trained classifier from the given location", default = None) + + parser.add_argument("-d", '--dummyclassifier', choices=["most_frequent", "stratified"], default=None) + parser.add_argument("--knn", type = int, help = "k nearest neighbor classifier with the specified value of k", default = None) + parser.add_argument("-r", "--randomforest", type = int, help = "Random Forest classifier with the specified number of estimators (trees)", default = None) + + metrics_choices = ["none", "all", METR_ACC, METR_KAPPA, METR_PREC, METR_REC, METR_F1, METR_JAC] + parser.add_argument("-m", "--metrics", choices=metrics_choices, default="none") + + parser.add_argument("--log_folder", help = "where to log the mlflow results", default = "data/classification/mlflow") + args = parser.parse_args() + + # load data + with open(args.input_file, 'rb') as f_in: + data = pickle.load(f_in) + + if args.log_folder is not None: + set_tracking_uri(args.log_folder) + + if args.import_file is not None: + # import a pre-trained classifier + with open(args.import_file, 'rb') as f_in: + input_dict = pickle.load(f_in) + + classifier = input_dict["classifier"] + for param, value in input_dict["params"].items(): + log_param(param, value) + + log_param("dataset", "validation") + + else: + # manually set up a classifier + if args.dummyclassifier == "most_frequent": + # majority vote classifier + print(" always most_frequent label (Dummy Classifier)") + log_param("classifier", "most_frequent") + params = {"classifier": "most_frequent"} + classifier = DummyClassifier(strategy = "most_frequent", random_state = args.seed) + + elif args.dummyclassifier == "stratified": + # label frequency classifier + print(" label frequency classifier") + log_param("classifier", "stratified") + params = {"classifier": "stratified"} + classifier = DummyClassifier(strategy = "stratified", random_state = args.seed) + + elif args.randomforest is not None: + print(" random forest classifier") + log_param("classifier", "randomforest") + log_param("n", args.randomforest) + params = {"classifier": "randomforest", "n": args.randomforest} + classifier = RandomForestClassifier(n_estimators = args.randomforest, random_state = args.seed) + + elif args.knn is not None: + print(" {0} nearest neighbor classifier".format(args.knn)) + log_param("classifier", "knn") + log_param("k", args.knn) + params = {"classifier": "knn", "k": 
args.knn}
+            standardizer = StandardScaler()
+            knn_classifier = KNeighborsClassifier(args.knn, n_jobs = -1)
+            classifier = make_pipeline(standardizer, knn_classifier)
+
+        classifier.fit(data["features"], data["labels"].ravel())
+        log_param("dataset", "training")
+
+    prediction = classifier.predict(data["features"])
+
+    evaluation_metrics = select_metrics_based_on_args(args.metrics)
+    computed_metrics = compute_metrics(evaluation_metrics, data, prediction)
+
+    print_input_file_name(args.input_file)     # e.g. "training set"
+    print_formatted_metrics(computed_metrics)  # e.g. "Accuracy: 0.908"
+    log_metrics(computed_metrics)
+    # export the trained classifier if the user wants us to do so
+    if args.export_file is not None:
+        output_dict = {"classifier": classifier, "params": params}
+        with open(args.export_file, 'wb') as f_out:
+            pickle.dump(output_dict, f_out)
+
+
+def print_input_file_name(input_file):
+    print(" " + Path(input_file).stem + " set")
+
+
+def select_metrics_based_on_args(metrics: str):
+    evaluation_metrics: List[Tuple[str, Callable[[Any, Any], float]]] = []
+
+    if metrics == METR_ACC or metrics == "all":
+        evaluation_metrics.append(("Accuracy", accuracy_score))
+
+    if metrics == METR_KAPPA or metrics == "all":
+        evaluation_metrics.append(("Cohen_kappa", cohen_kappa_score))
+
+    if metrics == METR_PREC or metrics == "all":
+        evaluation_metrics.append(("Precision", precision_score))
+
+    if metrics == METR_REC or metrics == "all":
+        evaluation_metrics.append(("Recall", recall_score))
+
+    if metrics == METR_F1 or metrics == "all":
+        evaluation_metrics.append(("F1-Score", f1_score))
+
+    if metrics == METR_JAC or metrics == "all":
+        evaluation_metrics.append(("Jaccard", jaccard_score))
+
+    return evaluation_metrics
+
+
+def compute_metrics(evaluation_metrics, data, prediction):
+    computed_metrics: List[Tuple[str, float]] = []
+
+    for metric_name, metric in evaluation_metrics:
+        metric_score = metric(data["labels"], prediction)
+        computed_metrics.append((metric_name, metric_score))
+
+    return computed_metrics
+
+
+def print_formatted_metrics(computed_metrics):
+    for metric_name, metric_score in computed_metrics:
+        number_of_decimals = 3
+        rounded_score = round(metric_score, number_of_decimals)
+        print(f"\t{metric_name}: {rounded_score}")
+
+
+def log_metrics(computed_metrics):
+    for metric_name, metric_score in computed_metrics:
+        log_metric(metric_name, metric_score)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/code/create_environment.sge b/src/create_environment.sge
similarity index 100%
rename from code/create_environment.sge
rename to src/create_environment.sge
diff --git a/src/dimensionality_reduction.sh b/src/dimensionality_reduction.sh
new file mode 100755
index 00000000..9ea2e334
--- /dev/null
+++ b/src/dimensionality_reduction.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+# create directory if not yet existing
+mkdir -p data/dimensionality_reduction/
+
+# run dimensionality reduction on training set to fit the parameters
+echo " training set"
+python -m src.dimensionality_reduction.reduce_dimensionality data/feature_extraction/training.pickle data/dimensionality_reduction/training.pickle -e data/dimensionality_reduction/pipeline.pickle -m 3 --verbose
+
+# run dimensionality reduction on validation set and test set (with pre-fit parameters)
+echo " validation set"
+python -m src.dimensionality_reduction.reduce_dimensionality data/feature_extraction/validation.pickle data/dimensionality_reduction/validation.pickle -i data/dimensionality_reduction/pipeline.pickle
+echo " test set"
+python -m
src.dimensionality_reduction.reduce_dimensionality data/feature_extraction/test.pickle data/dimensionality_reduction/test.pickle -i data/dimensionality_reduction/pipeline.pickle
diff --git a/code/dimensionality_reduction/__init__.py b/src/dimensionality_reduction/__init__.py
similarity index 100%
rename from code/dimensionality_reduction/__init__.py
rename to src/dimensionality_reduction/__init__.py
diff --git a/src/dimensionality_reduction/reduce_dimensionality.py b/src/dimensionality_reduction/reduce_dimensionality.py
new file mode 100644
index 00000000..561c1e99
--- /dev/null
+++ b/src/dimensionality_reduction/reduce_dimensionality.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Apply a dimensionality reduction technique.
+"""
+
+import argparse, pickle
+from sklearn.feature_selection import SelectKBest, mutual_info_classif
+
+
+def main():
+    # setting up CLI
+    parser = argparse.ArgumentParser(description = "Dimensionality reduction")
+    parser.add_argument("input_file", help = "path to the input pickle file")
+    parser.add_argument("output_file", help = "path to the output pickle file")
+    parser.add_argument("-e", "--export_file", help = "create a pipeline and export to the given location", default = None)
+    parser.add_argument("-i", "--import_file", help = "import an existing pipeline from the given location", default = None)
+    parser.add_argument("-m", "--mutual_information", type = int, help = "select K best features with Mutual Information", default = None)
+    parser.add_argument("--verbose", action = "store_true", help = "print information about feature selection process")
+    args = parser.parse_args()
+
+    # load the data
+    with open(args.input_file, 'rb') as f_in:
+        input_data = pickle.load(f_in)
+
+    features = input_data["features"]
+    labels = input_data["labels"]
+    feature_names = input_data["feature_names"]
+
+    if args.import_file is not None:
+        # simply import an already fitted dimensionality reducer
+        with open(args.import_file, 'rb') as f_in:
+            dim_red = pickle.load(f_in)
+
+    else: # need to set things up manually
+
+        if args.mutual_information is not None:
+            # select K best based on Mutual Information
+            dim_red = SelectKBest(mutual_info_classif, k = args.mutual_information)
+            dim_red.fit(features, labels.ravel())
+
+            # resulting feature names based on support given by SelectKBest
+            def get_feature_names(kbest, names):
+                support = kbest.get_support()
+                result = []
+                for name, selected in zip(names, support):
+                    if selected:
+                        result.append(name)
+                return result
+
+            if args.verbose:
+                print("    SelectKBest with Mutual Information and k = {0}".format(args.mutual_information))
+                print("    {0}".format(feature_names))
+                print("    " + str(dim_red.scores_))
+                print("    " + str(get_feature_names(dim_red, feature_names)))
+
+    # apply the dimensionality reduction to the given features
+    reduced_features = dim_red.transform(features)
+
+    # print("reduced_features \n --- \n ", reduced_features)
+
+    # store the results
+    output_data = {"features": reduced_features,
+                   "labels": labels}
+    with open(args.output_file, 'wb') as f_out:
+        pickle.dump(output_data, f_out)
+
+    # export the dimensionality reduction technique as pickle file if desired by user
+    if args.export_file is not None:
+        with open(args.export_file, 'wb') as f_out:
+            pickle.dump(dim_red, f_out)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/src/examples/example_data_viz.py b/src/examples/example_data_viz.py
new file mode 100644
index 00000000..d9d6aae9
---
/dev/null
+++ b/src/examples/example_data_viz.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Example: exploratory data visualization with pandas and matplotlib
+"""
+
+# plotting with pandas
+import csv
+import pandas as pd
+
+df = pd.read_csv("data/preprocessing/preprocessed.csv", quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n")
+
+df["language"].value_counts().plot(kind = 'bar')
+df["language"].value_counts().plot(kind = 'bar', logy = True)
+
+df["date"] = df["date"].astype("datetime64")
+df["label"].groupby(df["date"].dt.month).count().plot(kind = 'bar')
+
+
+# plotting with matplotlib
+import pickle
+from matplotlib import pyplot as plt
+import numpy as np
+
+with open("data/feature_extraction/training.pickle", "rb") as f_in:
+    data = pickle.load(f_in)
+
+features = data["features"]
+labels = data["labels"]
+
+plt.hist(features)
+plt.hist(features, range = [0,400])
+
+pos = features[labels]
+neg_index = np.array([not x for x in labels])
+neg = features[neg_index]
+
+bins = [0, 50, 100, 150, 200, 250, 300, 350, 400]
+
+plt.hist(pos, bins = bins)
+plt.hist(neg, bins = bins)
diff --git a/src/examples/example_dim_reduction.py b/src/examples/example_dim_reduction.py
new file mode 100644
index 00000000..8c739bb2
--- /dev/null
+++ b/src/examples/example_dim_reduction.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Examples for different kinds of dimensionality reduction
+"""
+
+from sklearn.datasets import load_breast_cancer
+from sklearn.decomposition import PCA
+from sklearn.linear_model import LogisticRegression
+from sklearn.feature_selection import RFE, SelectKBest, mutual_info_classif, SelectFromModel
+from sklearn.ensemble import RandomForestClassifier
+import numpy as np
+
+data_set = load_breast_cancer()
+X = data_set.data
+y = data_set.target
+print("Data Set: ", X.shape, y.shape)
+print("Combinatorics of binary feature values:", 2**30)
+
+
+# PCA
+print("\nPCA")
+print('---')
+pca = PCA(random_state = 42)
+pca.fit(X)
+print("explained variance (percentage): ", pca.explained_variance_ratio_)
+print('most important component: ', pca.components_[0])
+pca_transformed = pca.transform(X)
+pca_transformed = pca_transformed[:,0:1]
+print("after transformation: ", pca_transformed.shape, y.shape)
+print("Compare: ", X[0], pca_transformed[0])
+
+
+# wrapper
+print("\nWrapper")
+print("-------")
+
+model = LogisticRegression(random_state = 42, max_iter = 10000)
+rfe = RFE(model, n_features_to_select = 2)
+rfe.fit(X,y)
+print("Feature ranking according to RFE/LogReg:", rfe.ranking_)
+index_of_first = np.where(rfe.ranking_ == 1)[0][0]
+index_of_second = np.where(rfe.ranking_ == 2)[0][0]
+print("Two most promising features: ", index_of_first, index_of_second)
+wrapper_transformed = rfe.transform(X)
+# note: RFE assigns rank 1 to all selected features, so with n_features_to_select = 2
+# both chosen columns share rank 1; selecting columns [index_of_first, index_of_second]
+# (ranks 1 and 2) is therefore not guaranteed to reproduce rfe.transform(X):
+# wrapper_transformed = X[:,[index_of_first,index_of_second]]
+# (this variant appeared in lbechberger's preparatory dimensionalityReduction branch)
+print("After transformation: ", wrapper_transformed.shape, y.shape)
+print("compare: ", X[0], wrapper_transformed[0])
+
+
+# Filter
+print("\nFilter")
+print("------")
+# mutual information (related to entropy and information gain when comparing data)
+skb = SelectKBest(score_func = mutual_info_classif, k = 3)
+skb.fit(X,y)
+print("Feature scores according to MI: ", skb.scores_)
+filter_transformed = skb.transform(X)
+print("After transformation: ", filter_transformed.shape, y.shape)
+print("Compare: ", X[0], filter_transformed[0])
+
+
+# Embedded
+print("\nEmbedded")
+print("--------")
+rf = RandomForestClassifier(n_estimators = 10, random_state=42)
+rf.fit(X,y)
+print("Feature importance according to RF: ", rf.feature_importances_)
+sfm = SelectFromModel(rf, threshold = 0.1, prefit = True)
+embedded_transformed = sfm.transform(X)
+print("After transformation: ", embedded_transformed.shape, y.shape)
+print("Compare: ", X[0], embedded_transformed[0])
diff --git a/src/examples/example_feature_extraction.py b/src/examples/example_feature_extraction.py
new file mode 100644
index 00000000..52ff25c4
--- /dev/null
+++ b/src/examples/example_feature_extraction.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Some examples for feature extraction:
+- bigrams
+- TF-IDF
+- WordNet synonyms
+- word embeddings from the gensim word2vec-google-news-300 model
+- one-hot encoding of categorical data (day and night are similar in nature,
+  but end up on opposite ends of the encoded array)
+"""
+
+# bigrams
+import nltk
+import string
+import csv
+import pandas as pd
+
+df = pd.read_csv("data/preprocessing/preprocessed.csv", quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n")
+
+text = "John Wilkes Booth shot Abraham Lincoln. Abraham Lincoln was not shot inside the White House."
+tokens = nltk.word_tokenize(text)
+tokens = [token for token in tokens if token not in string.punctuation]
+
+bigrams = nltk.bigrams(tokens)
+freq_dist = nltk.FreqDist(bigrams)
+freq_list = []
+for bigram, freq in freq_dist.items():
+    freq_list.append([bigram, freq])
+freq_list.sort(key = lambda x: x[1], reverse = True)
+for i in range(len(freq_list)):
+    print(freq_list[i])
+
+
+# tf-idf
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+tweets = df["tweet"][:100]
+vectorizer = TfidfVectorizer()
+tf_idf_vectors = vectorizer.fit_transform(tweets).todense()
+
+print(tf_idf_vectors.shape)
+print(vectorizer.get_feature_names()[142:145])
+print(tf_idf_vectors[66:71, 142:145])
+
+tf_idf_similarities = cosine_similarity(tf_idf_vectors)
+print(tf_idf_similarities[:5,:5])
+
+
+# NER
+text = "John Wilkes Booth shot Abraham Lincoln. Abraham Lincoln was not shot inside the White House."
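+# sentence-split, POS-tag, and chunk named entities; the chunker should mark
+# names like "Abraham Lincoln" as PERSON (exact labels depend on the NLTK version)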
+sentences = nltk.sent_tokenize(text)
+for sentence in sentences:
+    words = nltk.word_tokenize(sentence)
+    pos_tagged = nltk.pos_tag(words)
+    ne_chunked = nltk.ne_chunk(pos_tagged)
+    print(ne_chunked)
+
+
+# WordNet
+dog_synsets = nltk.corpus.wordnet.synsets('dog')
+for syn in dog_synsets:
+    words = [str(lemma.name()) for lemma in syn.lemmas()]
+    print(syn, words, syn.definition(), syn.hypernyms())
+    print("")
+
+
+# word2vec
+import gensim.downloader as api
+
+embeddings = api.load('word2vec-google-news-300')
+pairs = [('car', 'minivan'), ('car', 'airplane'), ('car', 'cereal')]
+
+for w1, w2 in pairs:
+    print("{0} - {1}: {2}".format(w1, w2, embeddings.similarity(w1, w2)))
+
+dog_vector = embeddings['dog']
+
+
+# one hot encoding
+from sklearn.preprocessing import OneHotEncoder
+import numpy as np
+
+features = np.array([["morning"], ["afternoon"], ["evening"], ["night"], ["afternoon"]])
+encoder = OneHotEncoder(sparse = False)
+encoder.fit(features)
+encoder.transform(features)
diff --git a/src/examples/examples_parsing_with_ast.py b/src/examples/examples_parsing_with_ast.py
new file mode 100644
index 00000000..11b3e4f7
--- /dev/null
+++ b/src/examples/examples_parsing_with_ast.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Example: parsing a double-quoted list of strings.
+
+A string that wraps a list of strings is parsed back into an actual
+list of strings via ast.literal_eval.
+"""
+
+import csv
+import ast
+import pandas as pd
+
+df = pd.read_csv("data/preprocessing/preprocessed.csv", quoting=csv.QUOTE_NONNUMERIC, lineterminator="\n")
+tokenized_string = df["tweet_tokenized"][0]
+tokenized_list = ast.literal_eval(tokenized_string)
\ No newline at end of file
diff --git a/src/feature_extraction.sh b/src/feature_extraction.sh
new file mode 100755
index 00000000..a1f6d6f4
--- /dev/null
+++ b/src/feature_extraction.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+# create directory if not yet existing
+mkdir -p data/feature_extraction/
+
+# run feature extraction on training set (may need to fit extractors)
+echo " training set"
+python -m src.feature_extraction.extract_features data/preprocessing/split/training.csv data/feature_extraction/training.pickle -e data/feature_extraction/pipeline.pickle --char_length
+
+# run feature extraction on validation set and test set (with pre-fit extractors)
+echo " validation set"
+python -m src.feature_extraction.extract_features data/preprocessing/split/validation.csv data/feature_extraction/validation.pickle -i data/feature_extraction/pipeline.pickle
+echo " test set"
+python -m src.feature_extraction.extract_features data/preprocessing/split/test.csv data/feature_extraction/test.pickle -i data/feature_extraction/pipeline.pickle
\ No newline at end of file
diff --git a/code/feature_extraction/__init__.py b/src/feature_extraction/__init__.py
similarity index 100%
rename from code/feature_extraction/__init__.py
rename to src/feature_extraction/__init__.py
diff --git a/code/feature_extraction/bigrams.py b/src/feature_extraction/bigrams.py
similarity index 88%
rename from code/feature_extraction/bigrams.py
rename to src/feature_extraction/bigrams.py
index 6c0c4b3a..f7be51ec 100644
--- a/code/feature_extraction/bigrams.py
+++ b/src/feature_extraction/bigrams.py
@@ -8,7 +8,7 @@
 import ast
 import nltk
 
-from code.feature_extraction.feature_extractor import FeatureExtractor
+from src.feature_extraction.feature_extractor import FeatureExtractor
 
 class BigramFeature(FeatureExtractor):
diff --git a/code/feature_extraction/character_length.py
b/src/feature_extraction/character_length.py
similarity index 82%
rename from code/feature_extraction/character_length.py
rename to src/feature_extraction/character_length.py
index 0349bf94..32285508 100644
--- a/code/feature_extraction/character_length.py
+++ b/src/feature_extraction/character_length.py
@@ -1,15 +1,11 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-Simple feature that counts the number of characters in the given column.
-
-Created on Wed Sep 29 12:29:25 2021
-
-@author: lbechberger
+Simple feature that counts the number of characters in the given column (e.g. the tweet column).
 """
 
 import numpy as np
-from code.feature_extraction.feature_extractor import FeatureExtractor
+from src.feature_extraction.feature_extractor import FeatureExtractor
 
 # class for extracting the character-based length as a feature
 class CharacterLength(FeatureExtractor):
diff --git a/src/feature_extraction/counter_fe.py b/src/feature_extraction/counter_fe.py
new file mode 100644
index 00000000..df8e01d5
--- /dev/null
+++ b/src/feature_extraction/counter_fe.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Counts the number of elements in a list-valued column (e.g. mentions or photos).
+"""
+
+import numpy as np
+import ast
+from src.feature_extraction.feature_extractor import FeatureExtractor
+
+# class for extracting the number of list elements as a feature
+class CounterFE(FeatureExtractor):
+
+    # constructor
+    def __init__(self, input_column):
+        super().__init__([input_column], f"{input_column}_count")
+
+
+    # don't need to fit, so don't overwrite _set_variables()
+
+
+    def _get_values(self, inputs):
+        """
+        Parses the string in every cell of the column/series as a list
+        and stores the length of that list in the output column
+        """
+
+        evaluated = inputs[0].apply(ast.literal_eval)
+        result = np.array(evaluated.str.len())
+        result = result.reshape(-1,1)
+        return result
diff --git a/src/feature_extraction/extract_features.py b/src/feature_extraction/extract_features.py
new file mode 100644
index 00000000..67ea4890
--- /dev/null
+++ b/src/feature_extraction/extract_features.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Runs the specified collection of feature extractors.
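+
+Usage (paths as in feature_extraction.sh; -e exports the fitted pipeline, -i imports a pre-fit one):
+python -m src.feature_extraction.extract_features path/to/input.csv path/to/output.pickle [-e pipeline.pickle] [-i pipeline.pickle] [--char_length]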
+"""
+
+import argparse, csv, pickle
+import pandas as pd
+import numpy as np
+from src.feature_extraction.character_length import CharacterLength
+from src.feature_extraction.counter_fe import CounterFE
+from src.feature_extraction.feature_collector import FeatureCollector
+from src.util import COLUMN_MENTIONS, COLUMN_PHOTOS, COLUMN_TWEET, COLUMN_LABEL
+
+
+def main():
+    # setting up CLI
+    parser = argparse.ArgumentParser(description = "Feature Extraction")
+    parser.add_argument("input_file", help = "path to the input csv file")
+    parser.add_argument("output_file", help = "path to the output pickle file")
+    parser.add_argument("-e", "--export_file", help = "create a pipeline and export to the given location", default = None)
+    parser.add_argument("-i", "--import_file", help = "import an existing pipeline from the given location", default = None)
+    parser.add_argument("-c", "--char_length", action = "store_true", help = "compute the number of characters in the tweet")
+    args = parser.parse_args()
+
+    # load data
+    df = pd.read_csv(args.input_file, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n")
+
+    if args.import_file is not None:
+        # simply import an existing FeatureCollector
+        with open(args.import_file, "rb") as f_in:
+            feature_collector = pickle.load(f_in)
+
+    else: # need to create FeatureCollector manually
+
+        # collect all feature extractors
+        features = []
+        if args.char_length:
+            features.append(CharacterLength(COLUMN_TWEET))
+        features.append(CounterFE(COLUMN_MENTIONS))
+        features.append(CounterFE(COLUMN_PHOTOS))
+
+        # create overall FeatureCollector
+        feature_collector = FeatureCollector(features)
+
+        # fit it on the given data set (assumed to be training data)
+        feature_collector.fit(df)
+
+
+    # apply the given FeatureCollector on the current data set
+    # maps the pandas DataFrame to a numpy array
+    feature_array = feature_collector.transform(df)
+
+    # get label array
+    label_array = np.array(df[COLUMN_LABEL])
+    label_array = label_array.reshape(-1, 1)
+
+    # print("features\n ---\n", feature_array)
+
+    # store the results
+    results = {"features": feature_array, "labels": label_array,
+               "feature_names": feature_collector.get_feature_names()}
+    with open(args.output_file, 'wb') as f_out:
+        pickle.dump(results, f_out)
+
+    # export the FeatureCollector as pickle file if desired by user
+    if args.export_file is not None:
+        with open(args.export_file, 'wb') as f_out:
+            pickle.dump(feature_collector, f_out)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/code/feature_extraction/feature_collector.py b/src/feature_extraction/feature_collector.py
similarity index 89%
rename from code/feature_extraction/feature_collector.py
rename to src/feature_extraction/feature_collector.py
index d2fca494..f637f3cf 100644
--- a/code/feature_extraction/feature_collector.py
+++ b/src/feature_extraction/feature_collector.py
@@ -2,14 +2,11 @@
 # -*- coding: utf-8 -*-
 """
 Collects the feature values from many different feature extractors.
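+Acts as a FeatureExtractor itself, operating on the union of all extractors' input columns.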
-
-Created on Wed Sep 29 12:36:01 2021
-
-@author: lbechberger
 """
 
 import numpy as np
-from code.feature_extraction.feature_extractor import FeatureExtractor
+from src.feature_extraction.feature_extractor import FeatureExtractor
 
 # extend FeatureExtractor for the sake of simplicity
 class FeatureCollector(FeatureExtractor):
@@ -26,7 +22,7 @@ def __init__(self, features):
             input_columns += feature.get_input_columns()
 
         # remove duplicate columns
-        input_colums = list(set(input_columns))
+        input_columns = list(set(input_columns))
 
         # call constructor of super class
         super().__init__(input_columns, "FeatureCollector")
diff --git a/code/feature_extraction/feature_extractor.py b/src/feature_extraction/feature_extractor.py
similarity index 95%
rename from code/feature_extraction/feature_extractor.py
rename to src/feature_extraction/feature_extractor.py
index e8db5d84..c2302d57 100644
--- a/code/feature_extraction/feature_extractor.py
+++ b/src/feature_extraction/feature_extractor.py
@@ -2,12 +2,8 @@
 # -*- coding: utf-8 -*-
 """
 Base class for all of our feature extractors.
-
-Created on Wed Sep 29 12:22:13 2021
-
-@author: lbechberger
 """
-
+import numpy as np
 from sklearn.base import BaseEstimator, TransformerMixin
 
 # base class for all feature extractors
@@ -53,7 +49,7 @@ def fit(self, df):
     # get feature values based on input column and internal variables
     # should return a numpy array
     # to be implemented by subclass!
-    def _get_values(self, inputs):
+    def _get_values(self, inputs) -> np.ndarray:
         pass
 
     # transform function: transforms pandas DataFrame to numpy array of feature values
diff --git a/code/preprocessing/__init__.py b/src/feature_extraction/test/__init__.py
similarity index 100%
rename from code/preprocessing/__init__.py
rename to src/feature_extraction/test/__init__.py
diff --git a/test/feature_extraction/bigrams_test.py b/src/feature_extraction/test/bigrams_test.py
similarity index 92%
rename from test/feature_extraction/bigrams_test.py
rename to src/feature_extraction/test/bigrams_test.py
index 29abfdae..81e3e12f 100644
--- a/test/feature_extraction/bigrams_test.py
+++ b/src/feature_extraction/test/bigrams_test.py
@@ -1,15 +1,13 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-Created on Thu Oct 7 14:51:00 2021
-
-@author: ml
+Test the bigram feature
 """
 
 import unittest
 import pandas as pd
 import nltk
-from code.feature_extraction.bigrams import BigramFeature
+from src.feature_extraction.bigrams import BigramFeature
 
 class BigramFeatureTest(unittest.TestCase):
diff --git a/src/feature_extraction/test/feature_extraction_test.py b/src/feature_extraction/test/feature_extraction_test.py
new file mode 100644
index 00000000..e7f74f24
--- /dev/null
+++ b/src/feature_extraction/test/feature_extraction_test.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Tests feature extraction
+"""
+
+import csv
+import logging
+import unittest
+import pandas as pd
+import numpy as np
+from src.feature_extraction.counter_fe import CounterFE
+
+class CountFeatureTest(unittest.TestCase):
+
+    def setUp(self):
+        self.INPUT_COLUMN = "mockcolumn"
+        self.count_feature_extractor = CounterFE(self.INPUT_COLUMN)
+
+        self.df = pd.DataFrame()
+        self.df[self.INPUT_COLUMN] = [
+            "[{'screen_name': 'zeebusiness', 'name': 'zee business', 'id': '140798905'}, {'screen_name': 'amishdevgan', 'name': 'amish devgan', 'id': '163817624'}]",
+            "[]",
+            "[{'screen_name': 'zeebusiness', 'name': 'zee business', 'id': '140798905'}]"
+        ]
+
+
+    def test_feature_name(self):
+        self.assertEqual(self.count_feature_extractor.get_feature_name(), "mockcolumn_count")
+
+
+    def test_counting(self):
+
+        self.count_feature_extractor.fit(self.df)
+
+        actual_feature = self.count_feature_extractor.transform(self.df)
+        EXPECTED = np.array(pd.DataFrame({"mockcolumn_count": [2, 0, 1]}))
+
+        isEqual = np.array_equal(actual_feature, EXPECTED, equal_nan=False)
+        self.assertTrue(isEqual)
+
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
diff --git a/code/load_data.sh b/src/load_data.sh
similarity index 100%
rename from code/load_data.sh
rename to src/load_data.sh
diff --git a/src/load_nltk_data.sh b/src/load_nltk_data.sh
new file mode 100644
index 00000000..60b202a2
--- /dev/null
+++ b/src/load_nltk_data.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+# install all NLTK models https://www.nltk.org/data.html
+echo "installing nltk data sets, corpora and models"
+# TODO this could be optimized. Not everything needs to be downloaded
+python -m nltk.downloader all
\ No newline at end of file
diff --git a/code/pipeline.sh b/src/pipeline.sh
similarity index 52%
rename from code/pipeline.sh
rename to src/pipeline.sh
index 8cfef559..691bef9b 100755
--- a/code/pipeline.sh
+++ b/src/pipeline.sh
@@ -1,13 +1,11 @@
 #!/bin/bash
 
 # overall pipeline for the ML experiments
-echo "loading data"
-code/load_data.sh
 echo "preprocessing"
-code/preprocessing.sh
+src/preprocessing.sh
 echo "feature extraction"
-code/feature_extraction.sh
+src/feature_extraction.sh
 echo "dimensionality reduction"
-code/dimensionality_reduction.sh
+src/dimensionality_reduction.sh
 echo "classification"
-code/classification.sh
\ No newline at end of file
+src/classification.sh
\ No newline at end of file
diff --git a/src/preprocessing.sh b/src/preprocessing.sh
new file mode 100755
index 00000000..93037282
--- /dev/null
+++ b/src/preprocessing.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# create directory if not yet existing
+mkdir -p data/preprocessing/split/
+
+# add labels
+echo " creating labels"
+python -m src.preprocessing.create_labels data/raw/ data/preprocessing/labeled.csv
+
+# other preprocessing (removing punctuation etc.)
+echo " general preprocessing"
+python -m src.preprocessing.run_preprocessing data/preprocessing/labeled.csv data/preprocessing/preprocessed.csv --punctuation --tokenize --other -e data/preprocessing/pipeline.pickle
+
+# split the data set
+echo " splitting the data set"
+python -m src.preprocessing.split_data data/preprocessing/preprocessed.csv data/preprocessing/split/ -s 42
\ No newline at end of file
diff --git a/test/__init__.py b/src/preprocessing/__init__.py
similarity index 100%
rename from test/__init__.py
rename to src/preprocessing/__init__.py
diff --git a/src/preprocessing/create_labels.py b/src/preprocessing/create_labels.py
new file mode 100644
index 00000000..881452a3
--- /dev/null
+++ b/src/preprocessing/create_labels.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Reads in the original csv files and creates labels for the data points.
+Stores the result as a single csv file.
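+Example: python -m src.preprocessing.create_labels data/raw/ data/preprocessing/labeled.csv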
+"""
+
+import os, argparse, csv
+import pandas as pd
+from src.util import COLUMN_LIKES, COLUMN_RETWEETS, COLUMN_LABEL
+
+
+def main():
+    # setting up CLI
+    parser = argparse.ArgumentParser(description = "Creation of Labels")
+    parser.add_argument("data_directory", help = "directory where the original csv files reside")
+    parser.add_argument("output_file", help = "path to the output csv file")
+    parser.add_argument("-l", '--likes_weight', type = int, help = "weight of likes", default = 1)
+    parser.add_argument("-r", '--retweet_weight', type = int, help = "weight of retweets", default = 1)
+    parser.add_argument("-t", '--threshold', type = int, help = "threshold to surpass for positive class", default = 50)
+    args = parser.parse_args()
+
+    # get all csv files in data_directory
+    file_paths = [args.data_directory + f for f in os.listdir(args.data_directory) if f.endswith(".csv")]
+
+    # load all csv files
+    dfs = []
+    for file_path in file_paths:
+        dfs.append(pd.read_csv(file_path, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n", dtype={"quote_url": str, "place": str, "tweet": str, "language": str}))
+
+    # join all data into a single DataFrame
+    df = pd.concat(dfs)
+
+    # compute new column "label" based on likes and retweets
+    df[COLUMN_LABEL] = (args.likes_weight * df[COLUMN_LIKES] + args.retweet_weight * df[COLUMN_RETWEETS]) > args.threshold
+
+    # print statistics
+    print("Number of tweets: {0}".format(len(df)))
+    print("Label distribution:")
+    print(df[COLUMN_LABEL].value_counts(normalize = True))
+
+    # store the DataFrame into a csv file
+    df.to_csv(args.output_file, index = False, quoting = csv.QUOTE_NONNUMERIC, line_terminator = "\n")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/code/preprocessing/preprocessor.py b/src/preprocessing/preprocessor.py
similarity index 96%
rename from code/preprocessing/preprocessor.py
rename to src/preprocessing/preprocessor.py
index a5abd445..6514b99d 100644
--- a/code/preprocessing/preprocessor.py
+++ b/src/preprocessing/preprocessor.py
@@ -2,10 +2,7 @@
 # -*- coding: utf-8 -*-
 """
 Superclass for all preprocessors.
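+Implements the sklearn fit/transform interface via BaseEstimator and TransformerMixin.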
-
-Created on Tue Sep 28 17:06:35 2021
-
-@author: lbechberger
 """
 
 from sklearn.base import BaseEstimator, TransformerMixin
diff --git a/test/feature_extraction/__init__.py b/src/preprocessing/preprocessors/__init__.py
similarity index 100%
rename from test/feature_extraction/__init__.py
rename to src/preprocessing/preprocessors/__init__.py
diff --git a/src/preprocessing/preprocessors/column_dropper.py b/src/preprocessing/preprocessors/column_dropper.py
new file mode 100644
index 00000000..abd9280f
--- /dev/null
+++ b/src/preprocessing/preprocessors/column_dropper.py
@@ -0,0 +1,20 @@
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
+
+class ColumnDropper(BaseEstimator, TransformerMixin):
+
+    def __init__(self, cols):
+
+        if not isinstance(cols, list):
+            self.cols = [cols]
+        else:
+            self.cols = cols
+
+    def fit(self, df):
+        # there is nothing to fit
+        return self
+
+    def transform(self, X_df: pd.DataFrame):
+        df = X_df.drop(columns=self.cols)
+        return df
+
diff --git a/src/preprocessing/preprocessors/non_english_remover.py b/src/preprocessing/preprocessors/non_english_remover.py
new file mode 100644
index 00000000..5309deaf
--- /dev/null
+++ b/src/preprocessing/preprocessors/non_english_remover.py
@@ -0,0 +1,13 @@
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
+
+class NonEnglishRemover(BaseEstimator, TransformerMixin):
+
+    def fit(self, df):
+        # there is nothing to fit
+        return self
+
+    def transform(self, X_df: pd.DataFrame):
+        df = X_df[X_df.language == 'en']
+        return df
+
diff --git a/code/preprocessing/punctuation_remover.py b/src/preprocessing/punctuation_remover.py
similarity index 83%
rename from code/preprocessing/punctuation_remover.py
rename to src/preprocessing/punctuation_remover.py
index 0f026b0e..2359e9a9 100644
--- a/code/preprocessing/punctuation_remover.py
+++ b/src/preprocessing/punctuation_remover.py
@@ -2,15 +2,12 @@
 # -*- coding: utf-8 -*-
 """
 Preprocessor that removes punctuation from the original tweet text.
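+Reads the raw tweet column and writes the result to the new column "tweet_no_punctuation".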
-
-Created on Wed Sep 29 09:45:56 2021
-
-@author: lbechberger
 """
 
 import string
-from code.preprocessing.preprocessor import Preprocessor
-from code.util import COLUMN_TWEET, COLUMN_PUNCTUATION
+from src.preprocessing.preprocessor import Preprocessor
+from src.util import COLUMN_TWEET, COLUMN_PUNCTUATION
 
 # removes punctuation from the original tweet
 # inspired by https://stackoverflow.com/a/45600350
diff --git a/src/preprocessing/run_preprocessing.py b/src/preprocessing/run_preprocessing.py
new file mode 100644
index 00000000..a733d82f
--- /dev/null
+++ b/src/preprocessing/run_preprocessing.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Runs the specified collection of preprocessing steps.
+"""
+
+import argparse, csv, pickle
+import pandas as pd
+from sklearn.pipeline import make_pipeline
+from src.preprocessing.preprocessors.column_dropper import ColumnDropper
+from src.preprocessing.preprocessors.non_english_remover import NonEnglishRemover
+from src.preprocessing.punctuation_remover import PunctuationRemover
+from src.preprocessing.tweet_clean import TweetClean
+from src.preprocessing.tokenizer import Tokenizer
+from src.util import COLUMN_TWEET, SUFFIX_TOKENIZED
+
+
+def main():
+    # setting up CLI
+    parser = argparse.ArgumentParser(description = "Various preprocessing steps")
+    parser.add_argument("input_file", help = "path to the input csv file")
+    parser.add_argument("output_file", help = "path to the output csv file")
+    parser.add_argument("-p", "--punctuation", action = "store_true", help = "remove punctuation")
+    parser.add_argument("-t", "--tokenize", action = "store_true", help = "tokenize given column into individual words")
+    parser.add_argument("-o", "--other", action = "store_true", help = "remove non-english tweets and unnecessary columns")
+    parser.add_argument("--tokenize_input", help = "input column to tokenize", default = COLUMN_TWEET)
+    parser.add_argument("-e", "--export_file", help = "create a pipeline and export to the given location", default = None)
+    args = parser.parse_args()
+
+    # load data
+    # note: eval parses the stringified lists in these columns; ast.literal_eval would be a safer alternative
+    df = pd.read_csv(args.input_file,
+                     quoting=csv.QUOTE_NONNUMERIC,
+                     lineterminator="\n",
+                     verbose=False,
+                     dtype={"quote_url": object, "place": object, "tweet": object, "language": object, "thumbnail": object},
+                     converters={'mentions': eval, 'photos': eval, 'urls': eval})
+
+    # collect all preprocessors
+    preprocessors = []
+    if args.punctuation:
+        preprocessors.append(PunctuationRemover())
+        preprocessors.append(TweetClean("tweet", "tweet_clean"))
+    if args.tokenize:
+        preprocessors.append(Tokenizer(args.tokenize_input, args.tokenize_input + SUFFIX_TOKENIZED))
+    if args.other:
+        DROP_COLS = [
+            "id", "conversation_id", "created_at", "timezone", "user_id", "name", "place",
+            "replies_count", "retweets_count", "likes_count", "language",
+            # "cashtag": only a few records have this filled; might be useless
+            # the columns below always have the same value for all records
+            "retweet", "near", "geo", "source", "user_rt_id", "user_rt", "retweet_id",
+            "retweet_date", "translate", "trans_src", 'trans_dest\r']
+
+        preprocessors.append(NonEnglishRemover())
+        preprocessors.append(ColumnDropper(DROP_COLS))
+
+    # call all preprocessing steps
+    for preprocessor in preprocessors:
+        df = preprocessor.fit_transform(df)
+
+    # store the results
+    df.to_csv(args.output_file, index = False, quoting = csv.QUOTE_NONNUMERIC, line_terminator = "\n")
+
+    # create a pipeline if necessary and store it as pickle file
+    if args.export_file is not None:
+        pipeline = make_pipeline(*preprocessors)
+        with open(args.export_file, 'wb') as f_out:
+            pickle.dump(pipeline, f_out)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/preprocessing/split_data.py b/src/preprocessing/split_data.py
new file mode 100644
index 00000000..337b5406
--- /dev/null
+++ b/src/preprocessing/split_data.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Splits the preprocessed data into training, validation, and test set.
+"""
+
+import os, argparse, csv
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from src.util import COLUMN_LABEL
+
+
+def main():
+    # setting up CLI
+    parser = argparse.ArgumentParser(description = "Splitting the data set")
+    parser.add_argument("input_file", help = "path to the input csv file")
+    parser.add_argument("output_folder", help = "path to the output folder")
+    parser.add_argument("-s", '--seed', type = int, help = "seed for the random number generator", default = None)
+    parser.add_argument("-t", '--test_size', type = float, help = "relative size of the test set", default = 0.2)
+    parser.add_argument("-v", '--validation_size', type = float, help = "relative size of the validation set", default = 0.2)
+    args = parser.parse_args()
+
+    # load the data
+    df = pd.read_csv(args.input_file, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n", dtype={"quote_url": object, "tweet": object, "thumbnail": object},)
+
+    # split into (training & validation) and test set
+    X, X_test = train_test_split(df, test_size = args.test_size, random_state = args.seed, shuffle = True, stratify = df[COLUMN_LABEL])
+
+    # split remainder into training and validation
+    relative_validation_size = args.validation_size / (1 - args.test_size)
+    X_train, X_val = train_test_split(X, test_size = relative_validation_size, random_state = args.seed, shuffle = True, stratify = X[COLUMN_LABEL])
+
+    # store the three data sets separately
+    X_train.to_csv(os.path.join(args.output_folder, "training.csv"), index = False, quoting = csv.QUOTE_NONNUMERIC, line_terminator = "\n")
+    X_val.to_csv(os.path.join(args.output_folder, "validation.csv"), index = False, quoting = csv.QUOTE_NONNUMERIC, line_terminator = "\n")
+    X_test.to_csv(os.path.join(args.output_folder, "test.csv"), index = False, quoting = csv.QUOTE_NONNUMERIC, line_terminator = "\n")
+
+    print("Training: {0} examples, Validation: {1} examples, Test: {2} examples".format(len(X_train), len(X_val), len(X_test)))
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/test/preprocessing/__init__.py b/src/preprocessing/test/__init__.py
similarity index 100%
rename from test/preprocessing/__init__.py
rename to src/preprocessing/test/__init__.py
diff --git a/test/preprocessing/tokenizer_test.py b/src/preprocessing/test/tokenizer_test.py
similarity index 63%
rename from test/preprocessing/tokenizer_test.py
rename to src/preprocessing/test/tokenizer_test.py
index 1e008029..9216001e 100644
--- a/test/preprocessing/tokenizer_test.py
+++ b/src/preprocessing/test/tokenizer_test.py
@@ -1,14 +1,14 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-"""
-Created on Thu Oct 7 14:30:41 2021
-
-@author: ml
+"""
+Tests the Tokenizer class
 """
 
 import unittest
 import pandas as pd
-from code.preprocessing.tokenizer import Tokenizer
+from src.preprocessing.tokenizer import Tokenizer
+from src.util import fm
 
 class TokenizerTest(unittest.TestCase):
@@ -27,15 +27,17 @@ def test_output_column(self):
         self.assertEqual(self.tokenizer._output_column, self.OUTPUT_COLUMN)
 
     def test_tokenization_single_sentence(self):
-        input_text = "This is an example sentence"
-        output_text = "['This', 'is', 'an', 'example', 'sentence']"
+        input_sentence = "This is an example sentence"
+        expected_output_text = "['This', 'is', 'an', 'example', 'sentence']"
 
         input_df = pd.DataFrame()
-        input_df[self.INPUT_COLUMN] = [input_text]
+        input_df[self.INPUT_COLUMN] = [input_sentence]
 
         tokenized = self.tokenizer.fit_transform(input_df)
-        self.assertEqual(tokenized[self.OUTPUT_COLUMN][0], output_text)
-
+
+        msg = fm("a sentence as a string", "return a list of words as a string")
+        self.assertEqual(tokenized[self.OUTPUT_COLUMN][0], expected_output_text, msg)
+
 
 if __name__ == '__main__':
     unittest.main()
\ No newline at end of file
diff --git a/src/preprocessing/test/tweet_cleaner_test.py b/src/preprocessing/test/tweet_cleaner_test.py
new file mode 100644
index 00000000..a7bc4176
--- /dev/null
+++ b/src/preprocessing/test/tweet_cleaner_test.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Tests the TweetClean preprocessor
+"""
+
+import unittest
+import pandas as pd
+from src.preprocessing.tweet_clean import TweetClean
+from src.util import fm
+
+class TweetCleanerTest(unittest.TestCase):
+
+    def setUp(self):
+        self.INPUT_COLUMN = "some_column"
+        self.OUTPUT_COLUMN = "some_column_cleaned"
+        self.cleaner = TweetClean(self.INPUT_COLUMN, self.OUTPUT_COLUMN)
+
+    def _apply_transform(self, input_sentence):
+        input_df = pd.DataFrame()
+        input_df[self.INPUT_COLUMN] = [input_sentence]
+
+        cleaned = self.cleaner.transform(input_df)
+        cleaned_column = cleaned[self.OUTPUT_COLUMN][0]
+        return cleaned_column
+
+    def test_punctuation_removal(self):
+        input_sentence = "This is an example sentence. SENTENCE!! And another sentence . And more"
+        expected_output_text = "This is an example sentence SENTENCE And another sentence And more"
+
+        msg = fm("sentences with punctuation", "returns sentences without punctuation")
+        self.assertEqual(self._apply_transform(input_sentence), expected_output_text, msg)
+
+    def test_url_removal1(self):
+        input_sentence = "This url will be removed https://example.org hopefully!"
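+        # the cleaner is expected to drop both the URL and the trailing "!"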
+        expected_output_text = "This url will be removed hopefully"
+
+        msg = fm("sentences with urls", "returns sentence without urls")
+        self.assertEqual(self._apply_transform(input_sentence), expected_output_text, msg)
+
+    def test_url_removal2(self):
+        input_sentence = "http://t.co/DOFVEUCiBV Big Data needs data science but data science doesn't need big data"
+        expected_output_text = "Big Data needs data science but data science doesnt need big data"
+
+        msg = fm("sentences with urls", "returns sentence without urls")
+        self.assertEqual(self._apply_transform(input_sentence), expected_output_text, msg)
+
+    def test_cleaning(self):
+        input_sentence = "#DataScience is greater than the sum of its parts https://t.co/lMcc9OJwWr #BigData #Analytics | RT @Ronald_vanLoon https://t.co/UT8RFLoAy4"
+        expected_output_text = "is greater than the sum of its parts RT"
+
+        msg = fm("sentences with urls and hashtags", "returns cleaned sentence")
+        self.assertEqual(self._apply_transform(input_sentence), expected_output_text, msg)
+
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
diff --git a/code/preprocessing/tokenizer.py b/src/preprocessing/tokenizer.py
similarity index 94%
rename from code/preprocessing/tokenizer.py
rename to src/preprocessing/tokenizer.py
index 94191502..e9940561 100644
--- a/code/preprocessing/tokenizer.py
+++ b/src/preprocessing/tokenizer.py
@@ -8,7 +8,7 @@
 @author: lbechberger
 """
 
-from code.preprocessing.preprocessor import Preprocessor
+from src.preprocessing.preprocessor import Preprocessor
 import nltk
 
 class Tokenizer(Preprocessor):
diff --git a/src/preprocessing/tweet_clean.py b/src/preprocessing/tweet_clean.py
new file mode 100644
index 00000000..3be696fa
--- /dev/null
+++ b/src/preprocessing/tweet_clean.py
@@ -0,0 +1,47 @@
+
+"""
+Removes @user handles, #hashtags and links from the tweet column.
+"""
+
+import pandas as pd
+import re
+from src.preprocessing.preprocessor import Preprocessor
+
+class TweetClean(Preprocessor):
+
+    def __init__(self, input_column, output_column):
+        """Initialize the TweetClean preprocessor with the given input and output column."""
+        super().__init__([input_column], output_column)
+
+    def _get_values(self, inputs):
+        column = inputs[0]
+
+        # removes hashtags
+        # remove text after # with space at the end
+        column = column.apply(lambda x : re.sub("#[A-Za-z0-9_\$\?\'\;\:\@\%\&\.\,]+\s","",x))
+        # remove text starting with # and at the end of sentence
+        column = column.apply(lambda x : re.sub("#[A-Za-z0-9_\$\?\'\;\:\@\%\&\.\,]+","",x))
+
+        # removes username handles (e.g. @someusername)
+        # remove text after @ with space at the end
+        column = column.apply(lambda x : re.sub("@[A-Za-z0-9_\$\?\'\;\:\@\%\&\.\,]+\s+","",x))
+        # remove text starting with @ and at the end of sentence
+        column = column.apply(lambda x : re.sub("@[A-Za-z0-9_\$\?\'\;\:\@\%\&\.\,]+","",x))
+
+        # removes URLs starting with http or https
+        # (URL followed by whitespace first, then URLs at the end of the sentence)
+        column = column.apply(lambda x : re.sub("http\S+\s",r'',x))
+        column = column.apply(lambda x : re.sub("http\S+",r'',x))
+
+        # remove all remaining non-alphanumeric characters (emojis, punctuation etc.);
+        # we will not be needing the punctuation remover after this
+        column = column.apply(lambda x : re.sub("[^a-zA-Z0-9 ]+","",x))
+
+        # collapse multiple spaces into a single space
+        column = column.apply(lambda x : re.sub("\s+"," ",x))
+
+        # removes the space at the beginning and end of the sentence
+        column = column.apply(lambda x : x.strip())
+
+        return column
+
diff --git a/src/setup.sh b/src/setup.sh
new file mode 100644
index 00000000..b40b74d9
--- /dev/null
+++ b/src/setup.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+echo "loading data"
+src/load_data.sh
+
+echo "loading nltk data"
+src/load_nltk_data.sh
+
diff --git a/src/util.py b/src/util.py
new file mode 100644
index 00000000..c3477b9d
--- /dev/null
+++ b/src/util.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Utility file for collecting frequently used constants and helper functions.
+"""
+
+# column names for the original data frame
+COLUMN_TWEET = "tweet"
+COLUMN_MENTIONS = "mentions"
+COLUMN_PHOTOS = "photos"
+COLUMN_MENTIONS_COUNT = "mentions_count"
+COLUMN_LIKES = "likes_count"
+COLUMN_RETWEETS = "retweets_count"
+
+# column names of novel columns for preprocessing
+COLUMN_LABEL = "label"
+COLUMN_PUNCTUATION = "tweet_no_punctuation"
+
+SUFFIX_TOKENIZED = "_tokenized"
+
+
+
+def fm(given: str, should: str):
+    """
+    Formats the passed 'given' and 'should' as a string across two lines.
+    Can be used for documenting assertions in tests
+    and helps with understanding what went wrong in failed tests.
+
+    Example:
+    given: 'a sentence as a string',
+    should: 'return a list of words as a string'
+    """
+    return _format_message(given, should)
+
+
+def _format_message(given: str, should: str):
+    """
+    See docstring of the fm function. Another example:
+    given: 'no arguments',
+    should: 'return 0'
+    """
+    return f"\n given: {given},\n should: {should}"
\ No newline at end of file