From 1e3dbff288bb4f9b512f8ab4c370bc6b9ba1b665 Mon Sep 17 00:00:00 2001 From: magmueller Date: Sat, 9 Oct 2021 12:52:06 +0200 Subject: [PATCH 01/33] added SVM classifier --- code/classification.sh | 4 ++-- code/classification/run_classifier.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/code/classification.sh b/code/classification.sh index ac20e09e..b136e348 100755 --- a/code/classification.sh +++ b/code/classification.sh @@ -5,10 +5,10 @@ mkdir -p data/classification/ # run feature extraction on training set (may need to fit extractors) echo " training set" -python -m code.classification.run_classifier data/dimensionality_reduction/training.pickle -e data/classification/classifier.pickle --majority -s 42 --accuracy --kappa +python -m code.classification.run_classifier data/dimensionality_reduction/training.pickle -e data/classification/classifier.pickle --svm --accuracy --kappa # run feature extraction on validation set (with pre-fit extractors) echo " validation set" -python -m code.classification.run_classifier data/dimensionality_reduction/validation.pickle -i data/classification/classifier.pickle --accuracy --kappa +#python -m code.classification.run_classifier data/dimensionality_reduction/validation.pickle -i data/classification/classifier.pickle --accuracy --kappa # don't touch the test set, yet, because that would ruin the final generalization experiment! \ No newline at end of file diff --git a/code/classification/run_classifier.py b/code/classification/run_classifier.py index ffdeef55..50e3c60f 100644 --- a/code/classification/run_classifier.py +++ b/code/classification/run_classifier.py @@ -11,6 +11,10 @@ import argparse, pickle from sklearn.dummy import DummyClassifier from sklearn.metrics import accuracy_score, cohen_kappa_score +from sklearn.svm import SVC +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler + # setting up CLI parser = argparse.ArgumentParser(description = "Classifier") @@ -20,6 +24,7 @@ parser.add_argument("-i", "--import_file", help = "import a trained classifier from the given location", default = None) parser.add_argument("-m", "--majority", action = "store_true", help = "majority class classifier") parser.add_argument("-f", "--frequency", action = "store_true", help = "label frequency classifier") +parser.add_argument("-v", "--svm", action = "store_true", help = "SVM classifier") parser.add_argument("-a", "--accuracy", action = "store_true", help = "evaluate using accuracy") parser.add_argument("-k", "--kappa", action = "store_true", help = "evaluate using Cohen's kappa") args = parser.parse_args() @@ -45,6 +50,14 @@ print(" label frequency classifier") classifier = DummyClassifier(strategy = "stratified", random_state = args.seed) classifier.fit(data["features"], data["labels"]) + + elif args.svm: + print(" SVM classifier") + classifier = make_pipeline(StandardScaler(), SVC()) + classifier.fit(data["features"], data["labels"]) + print("") + + # now classify the given data prediction = classifier.predict(data["features"]) From efd3421298f5fc23f964c3781b23f600cbc132bb Mon Sep 17 00:00:00 2001 From: magmueller Date: Sat, 9 Oct 2021 13:19:26 +0200 Subject: [PATCH 02/33] added to classification.sh --- code/classification.sh | 1 - code/classification/run_classifier.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/code/classification.sh b/code/classification.sh index b136e348..a5556405 100755 --- a/code/classification.sh +++ b/code/classification.sh @@ -6,7 
+6,6 @@ mkdir -p data/classification/ # run feature extraction on training set (may need to fit extractors) echo " training set" python -m code.classification.run_classifier data/dimensionality_reduction/training.pickle -e data/classification/classifier.pickle --svm --accuracy --kappa - # run feature extraction on validation set (with pre-fit extractors) echo " validation set" #python -m code.classification.run_classifier data/dimensionality_reduction/validation.pickle -i data/classification/classifier.pickle --accuracy --kappa diff --git a/code/classification/run_classifier.py b/code/classification/run_classifier.py index 50e3c60f..94a404d2 100644 --- a/code/classification/run_classifier.py +++ b/code/classification/run_classifier.py @@ -28,7 +28,7 @@ parser.add_argument("-a", "--accuracy", action = "store_true", help = "evaluate using accuracy") parser.add_argument("-k", "--kappa", action = "store_true", help = "evaluate using Cohen's kappa") args = parser.parse_args() - +#args, unk = parser.parse_known_args() # load data with open(args.input_file, 'rb') as f_in: data = pickle.load(f_in) From 637faec2f4b71e9535370793142a9683ce3ce0b4 Mon Sep 17 00:00:00 2001 From: magmueller Date: Sat, 9 Oct 2021 14:04:05 +0200 Subject: [PATCH 03/33] add knn --- code/classification.sh | 2 +- code/classification/run_classifier.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/code/classification.sh b/code/classification.sh index 5ae3c8b5..b2880091 100755 --- a/code/classification.sh +++ b/code/classification.sh @@ -5,7 +5,7 @@ mkdir -p data/classification/ # run feature extraction on training set (may need to fit extractors) echo " training set" -python -m code.classification.run_classifier data/dimensionality_reduction/training.pickle -e data/classification/classifier.pickle --svm --accuracy --kappa +python -m code.classification.run_classifier data/dimensionality_reduction/training.pickle -e data/classification/classifier.pickle --svm --knn 4 --accuracy --kappa # run feature extraction on validation set (with pre-fit extractors) echo " validation set" python -m code.classification.run_classifier data/dimensionality_reduction/validation.pickle -i data/classification/classifier.pickle --accuracy --kappa diff --git a/code/classification/run_classifier.py b/code/classification/run_classifier.py index 8f9b062d..fbce2618 100644 --- a/code/classification/run_classifier.py +++ b/code/classification/run_classifier.py @@ -46,13 +46,17 @@ # majority vote classifier print(" majority vote classifier") classifier = DummyClassifier(strategy = "most_frequent", random_state = args.seed) - classifier.fit(data["features"], data["labels"]) elif args.frequency: # label frequency classifier print(" label frequency classifier") classifier = DummyClassifier(strategy = "stratified", random_state = args.seed) - classifier.fit(data["features"], data["labels"]) + elif args.knn is not None: + print(" {0} nearest neighbor classifier".format(args.knn)) + standardizer = StandardScaler() + knn_classifier = KNeighborsClassifier(args.knn) + classifier = make_pipeline(standardizer, knn_classifier) +classifier.fit(data["features"], data["labels"].ravel()) # now classify the given data prediction = classifier.predict(data["features"]) From 97c7e69abe7a4919ddd68b34709956e3272c9641 Mon Sep 17 00:00:00 2001 From: magmueller Date: Sat, 9 Oct 2021 14:29:50 +0200 Subject: [PATCH 04/33] add svm --- code/classification/run_classifier.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git 
a/code/classification/run_classifier.py b/code/classification/run_classifier.py index 8f9b062d..e599fbdf 100644 --- a/code/classification/run_classifier.py +++ b/code/classification/run_classifier.py @@ -52,6 +52,11 @@ print(" label frequency classifier") classifier = DummyClassifier(strategy = "stratified", random_state = args.seed) classifier.fit(data["features"], data["labels"]) + elif args.svm: + print(" SVM classifier") + classifier = make_pipeline(StandardScaler(), SCV()) + + classifier.fit(data["features"], data["labels"]) # now classify the given data prediction = classifier.predict(data["features"]) From 8f319629e8a868604b8a22cec63884905b1236fb Mon Sep 17 00:00:00 2001 From: magmueller Date: Sat, 9 Oct 2021 14:39:43 +0200 Subject: [PATCH 05/33] spelling error --- code/classification/run_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/classification/run_classifier.py b/code/classification/run_classifier.py index 95ed5871..ccfdaf48 100644 --- a/code/classification/run_classifier.py +++ b/code/classification/run_classifier.py @@ -59,7 +59,7 @@ elif args.svm: print(" SVM classifier") - classifier = make_pipeline(StandardScaler(), SCV()) + classifier = make_pipeline(StandardScaler(), SVC()) From 58fbed58905a1ad0673c6d05880516ab4165e98c Mon Sep 17 00:00:00 2001 From: magmueller Date: Sat, 9 Oct 2021 15:01:01 +0200 Subject: [PATCH 06/33] use not all data --- code/classification/run_classifier.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/code/classification/run_classifier.py b/code/classification/run_classifier.py index ccfdaf48..20f65f1f 100644 --- a/code/classification/run_classifier.py +++ b/code/classification/run_classifier.py @@ -29,6 +29,7 @@ parser.add_argument("--knn", type = int, help = "k nearest neighbor classifier with the specified value of k", default = None) parser.add_argument("-a", "--accuracy", action = "store_true", help = "evaluate using accuracy") parser.add_argument("-k", "--kappa", action = "store_true", help = "evaluate using Cohen's kappa") +parser.add_argument("--small", type = int, help = "not use all data but just subset", default = None) args = parser.parse_args() #args, unk = parser.parse_known_args() # load data @@ -62,10 +63,20 @@ classifier = make_pipeline(StandardScaler(), SVC()) +if args.small is not None: + # if limit is given + max_length = len(data['features']) + limit = min(args.small, max_length) + classifier.fit(data["features"][:limit], data["labels"].ravel()[:limit]) + # now classify the given data + prediction = classifier.predict(data["features"][:limit]) +else: + #else use all data + classifier.fit(data["features"], data["labels"].ravel()) + # now classify the given data + prediction = classifier.predict(data["features"]) + -classifier.fit(data["features"], data["labels"].ravel()) -# now classify the given data -prediction = classifier.predict(data["features"]) # collect all evaluation metrics evaluation_metrics = [] From c09128cec44070227254e300043a6f59e14d742e Mon Sep 17 00:00:00 2001 From: magmueller Date: Sat, 9 Oct 2021 15:09:40 +0200 Subject: [PATCH 07/33] safer limit --- code/classification/run_classifier.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/code/classification/run_classifier.py b/code/classification/run_classifier.py index 20f65f1f..09e97cf1 100644 --- a/code/classification/run_classifier.py +++ b/code/classification/run_classifier.py @@ -67,14 +67,14 @@ # if limit is given max_length = len(data['features']) 
limit = min(args.small, max_length) - classifier.fit(data["features"][:limit], data["labels"].ravel()[:limit]) - # now classify the given data - prediction = classifier.predict(data["features"][:limit]) -else: - #else use all data - classifier.fit(data["features"], data["labels"].ravel()) - # now classify the given data - prediction = classifier.predict(data["features"]) + # go through data and limit it + for key, value in data.items(): + data[key] = value[:limit] + + +classifier.fit(data["features"], data["labels"].ravel()) +# now classify the given data +prediction = classifier.predict(data["features"]) From f078050989f915c58c0bebdbb51f8456465f8860 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= Date: Mon, 11 Oct 2021 17:07:38 +0200 Subject: [PATCH 08/33] implementet hash feature --- code/classification/run_classifier.py | 9 +++--- code/feature_extraction/extract_features.py | 6 +++- code/feature_extraction/hash_vector.py | 33 +++++++++++++++++++++ 3 files changed, 43 insertions(+), 5 deletions(-) create mode 100644 code/feature_extraction/hash_vector.py diff --git a/code/classification/run_classifier.py b/code/classification/run_classifier.py index 09e97cf1..823fd37c 100644 --- a/code/classification/run_classifier.py +++ b/code/classification/run_classifier.py @@ -30,6 +30,7 @@ parser.add_argument("-a", "--accuracy", action = "store_true", help = "evaluate using accuracy") parser.add_argument("-k", "--kappa", action = "store_true", help = "evaluate using Cohen's kappa") parser.add_argument("--small", type = int, help = "not use all data but just subset", default = None) + args = parser.parse_args() #args, unk = parser.parse_known_args() # load data @@ -51,16 +52,16 @@ # label frequency classifier print(" label frequency classifier") classifier = DummyClassifier(strategy = "stratified", random_state = args.seed) - + elif args.svm: + print(" SVM classifier") + classifier = make_pipeline(StandardScaler(), SVC(probability=True)) elif args.knn is not None: print(" {0} nearest neighbor classifier".format(args.knn)) standardizer = StandardScaler() knn_classifier = KNeighborsClassifier(args.knn) classifier = make_pipeline(standardizer, knn_classifier) - elif args.svm: - print(" SVM classifier") - classifier = make_pipeline(StandardScaler(), SVC()) + if args.small is not None: diff --git a/code/feature_extraction/extract_features.py b/code/feature_extraction/extract_features.py index a3527acf..9c86505c 100644 --- a/code/feature_extraction/extract_features.py +++ b/code/feature_extraction/extract_features.py @@ -12,6 +12,7 @@ import pandas as pd import numpy as np from code.feature_extraction.character_length import CharacterLength +from code.feature_extraction.hash_vector import HashVector from code.feature_extraction.feature_collector import FeatureCollector from code.util import COLUMN_TWEET, COLUMN_LABEL @@ -23,6 +24,7 @@ parser.add_argument("-e", "--export_file", help = "create a pipeline and export to the given location", default = None) parser.add_argument("-i", "--import_file", help = "import an existing pipeline from the given location", default = None) parser.add_argument("-c", "--char_length", action = "store_true", help = "compute the number of characters in the tweet") +parser.add_argument("--hash_vec", action = "store_true", help = "compute the hash vector of the tweet") args = parser.parse_args() # load data @@ -40,7 +42,9 @@ if args.char_length: # character length of original tweet (without any changes) features.append(CharacterLength(COLUMN_TWEET)) - + if 
args.hash_vec: + # character length of original tweet (without any changes) + features.append(HashVector(COLUMN_TWEET)) # create overall FeatureCollector feature_collector = FeatureCollector(features) diff --git a/code/feature_extraction/hash_vector.py b/code/feature_extraction/hash_vector.py new file mode 100644 index 00000000..aaf3628e --- /dev/null +++ b/code/feature_extraction/hash_vector.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Simple feature that counts the number of characters in the given column. + +Created on Wed Sep 29 12:29:25 2021 + +@author: lbechberger +""" + +import numpy as np +from code.feature_extraction.feature_extractor import FeatureExtractor +from sklearn.feature_extraction.text import HashingVectorizer +import pdb + +# class for extracting the character-based length as a feature +class HashVector(FeatureExtractor): + + # constructor + def __init__(self, input_column): + super().__init__([input_column], "{0}_hashvector".format(input_column)) + + # don't need to fit, so don't overwrite _set_variables() + + # compute the word length based on the inputs + def _get_values(self, inputs): + # inputs is list of text documents + # create the transform + #pdb.set_trace() + vectorizer = HashingVectorizer(n_features=30) + # encode document + vector = vectorizer.fit_transform(inputs[0]) + return vector.toarray() From 3fa2beb1cd0afad5be6d4250251614a48ae395a1 Mon Sep 17 00:00:00 2001 From: avocardio Date: Mon, 11 Oct 2021 10:25:38 -0500 Subject: [PATCH 09/33] new docu. file --- documentation.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 documentation.md diff --git a/documentation.md b/documentation.md new file mode 100644 index 00000000..8c7384e8 --- /dev/null +++ b/documentation.md @@ -0,0 +1 @@ +# Machine Learning in Practice This is the forked repository for group "ML freunde" consisting of Magnus Müller, Samuel Hagemann, Maximilian Kalcher. \ No newline at end of file From 521132a0686f761ecbfc27d93119850e538816f7 Mon Sep 17 00:00:00 2001 From: max <74978236+avocardio@users.noreply.github.com> Date: Mon, 11 Oct 2021 10:28:14 -0500 Subject: [PATCH 10/33] Update Documentation --- Documentation.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation.md b/Documentation.md index de0a5b0f..eb212141 100644 --- a/Documentation.md +++ b/Documentation.md @@ -1,5 +1,7 @@ # Documentation Example +This is the forked repository for Magnus Müller, Maximilian Kalcher and Samuel Hagemann. + Some introductory sentence(s). Data set and task are relatively fixed, so probably you don't have much to say about them (unless you modifed them). If you haven't changed the application much, there's also not much to say about @@ -94,4 +96,4 @@ selected setup: How well does it generalize to the test set? Which hyperparameter settings are how important for the results? How good are we? Can this be used in practice or are we still too bad? -Anything else we may have learned? \ No newline at end of file +Anything else we may have learned? 
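An aside on the HashVector feature introduced in PATCH 08/33: it delegates all of the work to sklearn's HashingVectorizer, which maps each raw tweet to a fixed-width numeric vector without fitting any vocabulary. The following sketch is illustrative only and is not part of any patch in this series; the example tweets are made up, and n_features=30 simply mirrors the value hard-coded in code/feature_extraction/hash_vector.py at this point in the history (later patches read the width from HASH_VECTOR_N_FEATURES in code/util.py instead).

```
# Illustrative sketch, not part of the patch series: what the HashVector
# feature roughly does with sklearn's HashingVectorizer.
from sklearn.feature_extraction.text import HashingVectorizer

tweets = ["I love Machine Learning", "Machine learning in practice"]

vectorizer = HashingVectorizer(n_features=30)   # stateless: nothing to fit
vectors = vectorizer.fit_transform(tweets)      # sparse matrix, shape (2, 30)

# each tweet becomes one fixed-length, l2-normalised vector of signed
# hashed token counts, independent of how many distinct words exist
print(vectors.toarray().shape)
```

Because the transform is stateless and deterministic, the extractor's comment that it "doesn't need to fit" holds: the same tweet always hashes to the same vector, and the output width never depends on the training data.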
From 6c8c4e3d2e20854b7c6c86a34891b6610f79643d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= Date: Mon, 11 Oct 2021 19:33:00 +0200 Subject: [PATCH 11/33] added hash vecor, but cohens kappa still 0.0 --- code/dimensionality_reduction/reduce_dimensionality.py | 2 ++ code/feature_extraction/extract_features.py | 6 +++++- code/feature_extraction/hash_vector.py | 6 ++++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/code/dimensionality_reduction/reduce_dimensionality.py b/code/dimensionality_reduction/reduce_dimensionality.py index d2b27419..7d4da260 100644 --- a/code/dimensionality_reduction/reduce_dimensionality.py +++ b/code/dimensionality_reduction/reduce_dimensionality.py @@ -40,6 +40,7 @@ if args.mutual_information is not None: # select K best based on Mutual Information dim_red = SelectKBest(mutual_info_classif, k = args.mutual_information) + dim_red.fit(features, labels.ravel()) # resulting feature names based on support given by SelectKBest @@ -64,6 +65,7 @@ def get_feature_names(kbest, names): # store the results output_data = {"features": reduced_features, "labels": labels} + with open(args.output_file, 'wb') as f_out: pickle.dump(output_data, f_out) diff --git a/code/feature_extraction/extract_features.py b/code/feature_extraction/extract_features.py index 9c86505c..841d4d6d 100644 --- a/code/feature_extraction/extract_features.py +++ b/code/feature_extraction/extract_features.py @@ -43,14 +43,17 @@ # character length of original tweet (without any changes) features.append(CharacterLength(COLUMN_TWEET)) if args.hash_vec: - # character length of original tweet (without any changes) + # hash of original tweet (without any changes) features.append(HashVector(COLUMN_TWEET)) + + # create overall FeatureCollector feature_collector = FeatureCollector(features) # fit it on the given data set (assumed to be training data) feature_collector.fit(df) + # apply the given FeatureCollector on the current data set # maps the pandas DataFrame to an numpy array @@ -63,6 +66,7 @@ # store the results results = {"features": feature_array, "labels": label_array, "feature_names": feature_collector.get_feature_names()} + with open(args.output_file, 'wb') as f_out: pickle.dump(results, f_out) diff --git a/code/feature_extraction/hash_vector.py b/code/feature_extraction/hash_vector.py index aaf3628e..6f1207ed 100644 --- a/code/feature_extraction/hash_vector.py +++ b/code/feature_extraction/hash_vector.py @@ -11,7 +11,8 @@ import numpy as np from code.feature_extraction.feature_extractor import FeatureExtractor from sklearn.feature_extraction.text import HashingVectorizer -import pdb + +from code.util import HASH_VECTOR_N_FEATURES # class for extracting the character-based length as a feature class HashVector(FeatureExtractor): @@ -27,7 +28,8 @@ def _get_values(self, inputs): # inputs is list of text documents # create the transform #pdb.set_trace() - vectorizer = HashingVectorizer(n_features=30) + vectorizer = HashingVectorizer(n_features=HASH_VECTOR_N_FEATURES) # encode document vector = vectorizer.fit_transform(inputs[0]) + return vector.toarray() From b5b598d458e2b3e465f4637a852b90bca2494889 Mon Sep 17 00:00:00 2001 From: max <74978236+avocardio@users.noreply.github.com> Date: Mon, 11 Oct 2021 17:21:44 -0500 Subject: [PATCH 12/33] Wrong Docu --- documentation.md | 1 - 1 file changed, 1 deletion(-) delete mode 100644 documentation.md diff --git a/documentation.md b/documentation.md deleted file mode 100644 index 8c7384e8..00000000 --- a/documentation.md +++ /dev/null @@ 
-1 +0,0 @@ -# Machine Learning in Practice This is the forked repository for group "ML freunde" consisting of Magnus Müller, Samuel Hagemann, Maximilian Kalcher. \ No newline at end of file From d0d66c8ee17f828a33f7ab393f2defa1e36b65ac Mon Sep 17 00:00:00 2001 From: max <74978236+avocardio@users.noreply.github.com> Date: Mon, 11 Oct 2021 17:57:09 -0500 Subject: [PATCH 13/33] Documentation update --- Documentation.md | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/Documentation.md b/Documentation.md index eb212141..4d626386 100644 --- a/Documentation.md +++ b/Documentation.md @@ -2,46 +2,52 @@ This is the forked repository for Magnus Müller, Maximilian Kalcher and Samuel Hagemann. -Some introductory sentence(s). Data set and task are relatively fixed, so +Our task involved building and documenting a real-life application of machine learning. +We were given a dataset of N tweets from the years X until Y and had to build a classifier that would detect whether a tweet would go viral. +The measure for it being viral was when the sum of likes and retweets were bigger than 50. + +The dataset was very variable and we had a lot of features to work with, which gave us the freedom to choose and experiment with these freely. + +At the end, our classifier is implemented into an 'application', callable by terminal, which gives the likeliness of an input tweet being viral, having used the dataset as training. + +//Some introductory sentence(s). Data set and task are relatively fixed, so probably you don't have much to say about them (unless you modifed them). If you haven't changed the application much, there's also not much to say about that. The following structure thus only covers preprocessing, feature extraction, dimensionality reduction, classification, and evaluation. -## Evaluation +## Preprocessing + +Before using the data or some aspects of it, it is important to process some of it beforehand so our chosen features can be extracted smoothly. +Many tweets had different kind of punctuation, ..., emojis, and some of them even were written in different languages. ### Design Decisions -Which evaluation metrics did you use and why? -Which baselines did you use and why? +After looking at the dataset closely, we chose to keep the core words of the sentence, ... ### Results -How do the baselines perform with respect to the evaluation metrics? +Maybe show a short example what your preprocessing does. ### Interpretation -Is there anything we can learn from these results? - -## Preprocessing +Probably, no real interpretation possible, so feel free to leave this section out. -I'm following the "Design Decisions - Results - Interpretation" structure here, -but you can also just use one subheading per preprocessing step to organize -things (depending on what you do, that may be better structured). +## Evaluation ### Design Decisions -Which kind of preprocessing steps did you implement? Why are they necessary -and/or useful down the road? +Which evaluation metrics did you use and why? +Which baselines did you use and why? ### Results -Maybe show a short example what your preprocessing does. +How do the baselines perform with respect to the evaluation metrics? ### Interpretation -Probably, no real interpretation possible, so feel free to leave this section out. +Is there anything we can learn from these results? 
## Feature Extraction From 4e1faf8074fa7945df7575b9c3bd54c87ab515bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= Date: Tue, 12 Oct 2021 20:00:15 +0200 Subject: [PATCH 14/33] test for hash vector --- code/util.py | 5 +++- test/feature_extraction/hash_vector_test.py | 33 +++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 test/feature_extraction/hash_vector_test.py diff --git a/code/util.py b/code/util.py index 7d8794c7..b2392b7c 100644 --- a/code/util.py +++ b/code/util.py @@ -17,4 +17,7 @@ COLUMN_LABEL = "label" COLUMN_PUNCTUATION = "tweet_no_punctuation" -SUFFIX_TOKENIZED = "_tokenized" \ No newline at end of file +SUFFIX_TOKENIZED = "_tokenized" + +# number of features for hash vector +HASH_VECTOR_N_FEATURES = 20 \ No newline at end of file diff --git a/test/feature_extraction/hash_vector_test.py b/test/feature_extraction/hash_vector_test.py new file mode 100644 index 00000000..1adb0d69 --- /dev/null +++ b/test/feature_extraction/hash_vector_test.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Oct 7 14:51:00 2021 + +@author: ml +""" + +import unittest +import pandas as pd +import nltk +from code.feature_extraction.hash_vector import HashVector + +class HashVectorTest(unittest.TestCase): + + def setUp(self): + self.INPUT_COLUMN = "input" + self.hash_vector_feature = HashVector(self.INPUT_COLUMN) + self.df = pd.DataFrame() + self.df[self.INPUT_COLUMN] = ['["This", "is", "a", "tweet", "This", "is", "also", "a", "test"]', '["This", "is", "a", "tweet", "This", "is", "also", "a", "test"]'] + + def test_input_columns(self): + self.assertEqual(self.hash_vector_feature._input_columns, [self.INPUT_COLUMN]) + + def test_feature_name(self): + self.assertEqual(self.hash_vector_feature.get_feature_name(), self.INPUT_COLUMN + "_hashvector") + + + + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From 645d2bf453e360585ace30dbcc02ddbe82c6fe56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= Date: Tue, 12 Oct 2021 21:13:45 +0200 Subject: [PATCH 15/33] updated readme and add first try to documentation.md --- Documentation.md | 14 +++++++++++--- README.md | 1 + 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/Documentation.md b/Documentation.md index 4d626386..424e0699 100644 --- a/Documentation.md +++ b/Documentation.md @@ -51,17 +51,24 @@ Is there anything we can learn from these results? ## Feature Extraction -Again, either structure among decision-result-interpretation or based on feature, -up to you. +Again, either structure under decision-result interpretation or based on features, +is up to you. + + ### Design Decisions Which features did you implement? What's their motivation and how are they computed? +We want to try something we didn't hear in the lecture. Therefore, we used the HashingVectorizer from sklearn to create an individual hash for each tweet. For a sentence like 'I love Machine Learning', the output can look like [0.4, 0.3, 0.9, 0, 0.21], with length n representing the number of features. It's not very intuitive to humans why this works, but after a long time of version conflicts and other problems, we enjoyed the simplicity of using sklearn. + ### Results Can you say something about how the feature values are distributed? Maybe show some plots? 
+When we finally ran it successfully with 25 features, we tried it with the SVM classifier, but that took too much time (nearly endless), so we used KNN with 4 NN on a 20000 sample subset and for the first time our Cohen kappa went from 0.0 to 0.1 and after some tuning (using more data) to 0.3. + + ### Interpretation Can we already guess which features may be more useful than others? @@ -86,12 +93,13 @@ Can we somehow make sense of the dimensionality reduction results? Which features are the most important ones and why may that be the case? ## Classification - +First of all we add a new argument: --small 1000 which would just use 1000s tweets. ### Design Decisions Which classifier(s) did you use? Which hyperparameter(s) (with their respective candidate values) did you look at? What were your reasons for this? +- SVM ### Results The big finale begins: What are the evaluation results you obtained with your diff --git a/README.md b/README.md index f1c12d81..e297e91e 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,7 @@ Moreover, the script support importing and exporting fitted feature extractors w - `-i` or `--import_file`: Load a configured and fitted feature extraction from the given pickle file. Ignore all parameters that configure the features to extract. - `-e` or `--export_file`: Export the configured and fitted feature extraction into the given pickle file. +- `--small 1000`: use just 1000 tweets. ## Dimensionality Reduction All python scripts and classes for dimensionality reduction can be found in `code/dimensionality_reduction/`. From 70f3928b292fe72116326fadc421aacd4b143658 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= Date: Tue, 12 Oct 2021 21:17:42 +0200 Subject: [PATCH 16/33] spelling mistaks --- Documentation.md | 2 ++ README.md | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Documentation.md b/Documentation.md index 424e0699..56c41c6b 100644 --- a/Documentation.md +++ b/Documentation.md @@ -62,6 +62,8 @@ Which features did you implement? What's their motivation and how are they compu We want to try something we didn't hear in the lecture. Therefore, we used the HashingVectorizer from sklearn to create an individual hash for each tweet. For a sentence like 'I love Machine Learning', the output can look like [0.4, 0.3, 0.9, 0, 0.21], with length n representing the number of features. It's not very intuitive to humans why this works, but after a long time of version conflicts and other problems, we enjoyed the simplicity of using sklearn. +Usage: `--hash_vec` +and for number of features for hash vector edit HASH_VECTOR_N_FEATURES in util.py ### Results Can you say something about how the feature values are distributed? Maybe show some plots? diff --git a/README.md b/README.md index e297e91e..45a2c4dd 100644 --- a/README.md +++ b/README.md @@ -91,8 +91,9 @@ The features to be extracted can be configured with the following optional param Moreover, the script support importing and exporting fitted feature extractors with the following optional arguments: - `-i` or `--import_file`: Load a configured and fitted feature extraction from the given pickle file. Ignore all parameters that configure the features to extract. - `-e` or `--export_file`: Export the configured and fitted feature extraction into the given pickle file. +- `--hash_vec`: use HashingVectorizer from sklearn. +and for number of features for hash vector edit HASH_VECTOR_N_FEATURES in util.py -- `--small 1000`: use just 1000 tweets. 
## Dimensionality Reduction All python scripts and classes for dimensionality reduction can be found in `code/dimensionality_reduction/`. @@ -129,7 +130,7 @@ By default, this data is used to train a classifier, which is specified by one o The classifier is then evaluated, using the evaluation metrics as specified through the following optional arguments: - `-a`or `--accuracy`: Classification accurracy (i.e., percentage of correctly classified examples). - `-k`or `--kappa`: Cohen's kappa (i.e., adjusting accuracy for probability of random agreement). - +- `--small 1000`: use just 1000 tweets. Moreover, the script support importing and exporting trained classifiers with the following optional arguments: - `-i` or `--import_file`: Load a trained classifier from the given pickle file. Ignore all parameters that configure the classifier to use and don't retrain the classifier. From dd87c7b3f916bcfb260297db1825fda3bd8b37a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= Date: Tue, 12 Oct 2021 23:32:10 +0200 Subject: [PATCH 17/33] filter out all languages except from english, maybe later: Translate Todo: Tests --- Documentation.md | 6 +++++ README.md | 2 ++ code/preprocessing/language_remover.py | 29 +++++++++++++++++++++++++ code/preprocessing/run_preprocessing.py | 24 +++++++++++++++----- code/util.py | 1 + 5 files changed, 56 insertions(+), 6 deletions(-) create mode 100644 code/preprocessing/language_remover.py diff --git a/Documentation.md b/Documentation.md index 56c41c6b..865d6d61 100644 --- a/Documentation.md +++ b/Documentation.md @@ -25,10 +25,16 @@ Many tweets had different kind of punctuation, ..., emojis, and some of them eve ### Design Decisions After looking at the dataset closely, we chose to keep the core words of the sentence, ... +- use just englisch tweets ### Results Maybe show a short example what your preprocessing does. +language summary: +({'en': 282035, 'it': 4116, 'es': 3272, 'fr': 2781, 'de': 714, 'id': 523, 'nl': 480, 'pt': 364, 'ca': 275, 'ru': 204, 'th': 157, 'ar': 126, 'tl': 108, 'tr': 84, 'hr': 68, 'da': 66, 'ro': 60, 'ja': 58, 'sv': 42, 'et': 29, 'pl': 25, 'bg': 24, 'af': 23, 'no': 21, 'fi': 20, 'so': 16, 'ta': 16, 'hi': 11, 'mk': 11, 'he': 9, 'sw': 9, 'lt': 7, 'uk': 6, 'sl': 6, 'te': 5, 'zh-cn': 5, 'lv': 5, 'ko': 5, 'bn': 4, 'el': 4, 'fa': 3, 'vi': 2, 'mr': 2, 'ml': 2, 'hu': 2, 'kn': 1, 'cs': 1, 'gu': 1, 'sk': 1, 'ur': 1, 'sq': 1}) +Total: +295811 +English tweets are 95%. So we can delete (maybe later translate) 5% of disrupting data. ### Interpretation diff --git a/README.md b/README.md index 45a2c4dd..24964e9a 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,8 @@ conda install -y -q -c conda-forge gensim=4.1.2 conda install -y -q -c conda-forge spyder=5.1.5 conda install -y -q -c conda-forge pandas=1.1.5 conda install -y -q -c conda-forge mlflow=1.20.2 +conda install -y -q -c conda-forge spacy +conda install -c conda-forge langdetect ``` You can double-check that all of these packages have been installed by running `conda list` inside of your virtual environment. The Spyder IDE can be started by typing `~/miniconda/envs/MLinPractice/bin/spyder` in your terminal window (assuming you use miniconda, which is installed right in your home directory). 
diff --git a/code/preprocessing/language_remover.py b/code/preprocessing/language_remover.py new file mode 100644 index 00000000..5466c1c1 --- /dev/null +++ b/code/preprocessing/language_remover.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +import string +from code.preprocessing.preprocessor import Preprocessor +from langdetect import detect +from code.util import COLUMN_TWEET, COLUMN_LANGUAGE + +class LanguageRemover(Preprocessor): + + # constructor + def __init__(self, input_column = COLUMN_TWEET, output_column = COLUMN_LANGUAGE): #, language_to_keep = 'en' + # input column "tweet", new output column + super().__init__([input_column], output_column) + #self.language_to_keep = language_to_keep + + # set internal variables based on input columns + #def _set_variables(self, inputs): + # store punctuation for later reference + #self._punctuation = "[{}]".format(string.punctuation) + #self.nlp = spacy.load('en') # 1 + #self.nlp.add_pipe(LanguageDetector(), name='language_detector', last=True) #2 + + + # get preprocessed column based on data frame and internal variables + def _get_values(self, inputs): + column = [detect(tweet) for tweet in inputs[0]] + return column \ No newline at end of file diff --git a/code/preprocessing/run_preprocessing.py b/code/preprocessing/run_preprocessing.py index 72130a30..ac5ac3a9 100644 --- a/code/preprocessing/run_preprocessing.py +++ b/code/preprocessing/run_preprocessing.py @@ -12,8 +12,9 @@ import pandas as pd from sklearn.pipeline import make_pipeline from code.preprocessing.punctuation_remover import PunctuationRemover +from code.preprocessing.language_remover import LanguageRemover from code.preprocessing.tokenizer import Tokenizer -from code.util import COLUMN_TWEET, SUFFIX_TOKENIZED +from code.util import COLUMN_TWEET, SUFFIX_TOKENIZED, COLUMN_LANGUAGE # setting up CLI parser = argparse.ArgumentParser(description = "Various preprocessing steps") @@ -23,6 +24,7 @@ parser.add_argument("-t", "--tokenize", action = "store_true", help = "tokenize given column into individual words") parser.add_argument("--tokenize_input", help = "input column to tokenize", default = COLUMN_TWEET) parser.add_argument("-e", "--export_file", help = "create a pipeline and export to the given location", default = None) +parser.add_argument("--language", help = "just use tweets with this language ", default = None) args = parser.parse_args() # load data @@ -30,18 +32,28 @@ # collect all preprocessors preprocessors = [] -if args.punctuation: - preprocessors.append(PunctuationRemover()) -if args.tokenize: - preprocessors.append(Tokenizer(args.tokenize_input, args.tokenize_input + SUFFIX_TOKENIZED)) +#if args.punctuation: +# preprocessors.append(PunctuationRemover()) +#if args.tokenize: +# preprocessors.append(Tokenizer(args.tokenize_input, args.tokenize_input + SUFFIX_TOKENIZED)) +if args.language is not None: + preprocessors.append(LanguageRemover()) + # call all preprocessing steps for preprocessor in preprocessors: df = preprocessor.fit_transform(df) +if args.language is not None: + # filter out one language + before = len(df) + df = df[df[COLUMN_LANGUAGE]==args.language] + after = len(df) + print("Filtered out: {0}".format(begore-after)) + # store the results df.to_csv(args.output_file, index = False, quoting = csv.QUOTE_NONNUMERIC, line_terminator = "\n") - +#pdb.set_trace() # create a pipeline if necessary and store it as pickle file if args.export_file is not None: pipeline = make_pipeline(*preprocessors) diff --git a/code/util.py b/code/util.py 
index b2392b7c..6da7852a 100644 --- a/code/util.py +++ b/code/util.py @@ -16,6 +16,7 @@ # column names of novel columns for preprocessing COLUMN_LABEL = "label" COLUMN_PUNCTUATION = "tweet_no_punctuation" +COLUMN_LANGUAGE = "language" SUFFIX_TOKENIZED = "_tokenized" From f881a5792c10df09d003c88e63600c7d1893e509 Mon Sep 17 00:00:00 2001 From: avocardio Date: Wed, 13 Oct 2021 11:05:37 -0500 Subject: [PATCH 18/33] preprocess start --- new_preprocessing.py | 60 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 new_preprocessing.py diff --git a/new_preprocessing.py b/new_preprocessing.py new file mode 100644 index 00000000..b9ea5248 --- /dev/null +++ b/new_preprocessing.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Oct 11 18:09:54 2021 + +@author: maximilian +""" + +import pandas as pd +import csv +import string +import nltk + +df = pd.read_csv("data/preprocessing/preprocessed.csv", quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n", low_memory=False) + +tokenized = [] + +for tweet in df['tweet'][0:6]: + sentences = nltk.sent_tokenize(tweet) + tokenized_tweet = [] + for sentence in sentences: + words = nltk.word_tokenize(sentence) + tokenized_tweet += words + + tokenized.append(tokenized_tweet) + +# --- + +def punctuation(rows): + + punct = set(string.punctuation) - {'#'} + + for row in rows: + for x in row: + if x in punct: + row.remove(x) + + return rows + +def emoji(rows): + + for row in rows: + for x in row: + if x.startswith('U+'): + x.encode('utf-16', 'surrogatepass') + x.decode('utf-16') + x.encode("raw_unicode_escape") + x.decode("latin_1") + + +for x in df['tweet_tokenized'][8].replace('\\',''): + if x.startswith('U+'): + x.encode('utf-16', 'surrogatepass') + x.decode('utf-16') + x.encode("raw_unicode_escape") + x.decode("latin_1") + + + + From 35bffee73bb45292b4b23bb56b30ce6507cc8543 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= Date: Wed, 13 Oct 2021 20:24:16 +0200 Subject: [PATCH 19/33] preproccesing works now --- code/preprocessing/punctuation_remover.py | 8 +++-- code/preprocessing/run_preprocessing.py | 41 +++++++++++++++-------- code/preprocessing/stopwords.py | 39 +++++++++++++++++++++ code/preprocessing/tokenizer.py | 12 +++++-- 4 files changed, 80 insertions(+), 20 deletions(-) create mode 100644 code/preprocessing/stopwords.py diff --git a/code/preprocessing/punctuation_remover.py b/code/preprocessing/punctuation_remover.py index 0f026b0e..6999e1fd 100644 --- a/code/preprocessing/punctuation_remover.py +++ b/code/preprocessing/punctuation_remover.py @@ -17,9 +17,9 @@ class PunctuationRemover(Preprocessor): # constructor - def __init__(self): - # input column "tweet", new output column - super().__init__([COLUMN_TWEET], COLUMN_PUNCTUATION) + def __init__(self, inputcol, outputcol): + # input column, new output column + super().__init__([inputcol], outputcol) # set internal variables based on input columns def _set_variables(self, inputs): @@ -30,4 +30,6 @@ def _set_variables(self, inputs): def _get_values(self, inputs): # replace punctuation with empty string column = inputs[0].str.replace(self._punctuation, "") + #import pdb + #pdb.set_trace() return column \ No newline at end of file diff --git a/code/preprocessing/run_preprocessing.py b/code/preprocessing/run_preprocessing.py index ac5ac3a9..b20ba83b 100644 --- a/code/preprocessing/run_preprocessing.py +++ b/code/preprocessing/run_preprocessing.py @@ -12,6 +12,7 @@ import pandas as pd from sklearn.pipeline import 
make_pipeline from code.preprocessing.punctuation_remover import PunctuationRemover +from code.preprocessing.stopwords import StopwordsRemover from code.preprocessing.language_remover import LanguageRemover from code.preprocessing.tokenizer import Tokenizer from code.util import COLUMN_TWEET, SUFFIX_TOKENIZED, COLUMN_LANGUAGE @@ -21,8 +22,9 @@ parser.add_argument("input_file", help = "path to the input csv file") parser.add_argument("output_file", help = "path to the output csv file") parser.add_argument("-p", "--punctuation", action = "store_true", help = "remove punctuation") +parser.add_argument("-s", "--stopwords", action = "store_true", help = "remove stopwords") parser.add_argument("-t", "--tokenize", action = "store_true", help = "tokenize given column into individual words") -parser.add_argument("--tokenize_input", help = "input column to tokenize", default = COLUMN_TWEET) +#parser.add_argument("--tokenize_input", help = "input column to tokenize", default = 'output') parser.add_argument("-e", "--export_file", help = "create a pipeline and export to the given location", default = None) parser.add_argument("--language", help = "just use tweets with this language ", default = None) args = parser.parse_args() @@ -30,26 +32,34 @@ # load data df = pd.read_csv(args.input_file, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n") +preprocess_col = 'preprocess_col' # collect all preprocessors preprocessors = [] -#if args.punctuation: -# preprocessors.append(PunctuationRemover()) -#if args.tokenize: -# preprocessors.append(Tokenizer(args.tokenize_input, args.tokenize_input + SUFFIX_TOKENIZED)) -if args.language is not None: - preprocessors.append(LanguageRemover()) - +if args.punctuation: + preprocessors.append(PunctuationRemover("tweet", preprocess_col)) +if args.stopwords: + preprocessors.append(StopwordsRemover(preprocess_col, preprocess_col)) +if args.tokenize: + preprocessors.append(Tokenizer(preprocess_col, preprocess_col + SUFFIX_TOKENIZED)) -# call all preprocessing steps -for preprocessor in preprocessors: - df = preprocessor.fit_transform(df) +# no need to detect languages, because it is already given +# if args.language is not None: +# preprocessors.append(LanguageRemover()) +#import pdb +#pdb.set_trace() if args.language is not None: # filter out one language before = len(df) - df = df[df[COLUMN_LANGUAGE]==args.language] + df = df[df['language']==args.language] after = len(df) - print("Filtered out: {0}".format(begore-after)) + print("Filtered out: {0}".format(before-after)) + +# call all preprocessing steps +for preprocessor in preprocessors: + df = preprocessor.fit_transform(df) + + # store the results df.to_csv(args.output_file, index = False, quoting = csv.QUOTE_NONNUMERIC, line_terminator = "\n") @@ -58,4 +68,7 @@ if args.export_file is not None: pipeline = make_pipeline(*preprocessors) with open(args.export_file, 'wb') as f_out: - pickle.dump(pipeline, f_out) \ No newline at end of file + pickle.dump(pipeline, f_out) + + + diff --git a/code/preprocessing/stopwords.py b/code/preprocessing/stopwords.py new file mode 100644 index 00000000..dad45285 --- /dev/null +++ b/code/preprocessing/stopwords.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Preprocessor that removes punctuation from the original tweet text. 
+Created on Wed Sep 29 09:45:56 2021 +@author: lbechberger +""" +import string +from code.preprocessing.preprocessor import Preprocessor +from code.util import COLUMN_TWEET, COLUMN_PUNCTUATION +from nltk.corpus import stopwords +import pandas as pd +STOPWORDS = set(stopwords.words('english')) +# removes punctuation from the original tweet +# inspired by https://stackoverflow.com/a/45600350 +class StopwordsRemover(Preprocessor): + + # constructor + def __init__(self, inputcol, outputcol): + # input column "tweet", new output column + super().__init__([inputcol], outputcol) + + # set internal variables based on input columns + #def _set_variables(self, inputs): + # store punctuation for later reference + # self._punctuation = "[{}]".format(string.punctuation) + + + # get preprocessed column based on data frame and internal variables + def _get_values(self, inputs): + # replace punctuation with empty string + # replace stop words with empty string + # replace duplicate words with empty string + column = inputs[0].str #.replace(self._punctuation, "") + column = [' '.join([word for word in tweet if word.lower() not in STOPWORDS]) for tweet in column.split()] + column = pd.Series(column) + #import pdb + #pdb.set_trace() + return column \ No newline at end of file diff --git a/code/preprocessing/tokenizer.py b/code/preprocessing/tokenizer.py index 94191502..85420b2d 100644 --- a/code/preprocessing/tokenizer.py +++ b/code/preprocessing/tokenizer.py @@ -24,14 +24,20 @@ def _get_values(self, inputs): """Tokenize the tweet.""" tokenized = [] - + import pdb for tweet in inputs[0]: - sentences = nltk.sent_tokenize(tweet) + #pdb.set_trace() + if type(tweet) is float: + # if tweet is nan, maybe because of stopword remove + sentences = nltk.sent_tokenize('') + else: + sentences = nltk.sent_tokenize(tweet) tokenized_tweet = [] for sentence in sentences: words = nltk.word_tokenize(sentence) tokenized_tweet += words tokenized.append(str(tokenized_tweet)) - + + #pdb.set_trace() return tokenized \ No newline at end of file From 92c73219a8b30c824c41aaaf66a47525e00ca817 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= Date: Wed, 13 Oct 2021 21:09:08 +0200 Subject: [PATCH 20/33] now the outputfile looks correct --- code/preprocessing/run_preprocessing.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/code/preprocessing/run_preprocessing.py b/code/preprocessing/run_preprocessing.py index b20ba83b..4c92cd0b 100644 --- a/code/preprocessing/run_preprocessing.py +++ b/code/preprocessing/run_preprocessing.py @@ -46,21 +46,25 @@ # if args.language is not None: # preprocessors.append(LanguageRemover()) -#import pdb -#pdb.set_trace() + if args.language is not None: # filter out one language before = len(df) df = df[df['language']==args.language] after = len(df) print("Filtered out: {0}".format(before-after)) + df.reset_index(drop=True, inplace=True) + + # call all preprocessing steps for preprocessor in preprocessors: df = preprocessor.fit_transform(df) - - +# drop useless line which makes problems with csv +del df['trans_dest\r'] +import pdb +pdb.set_trace() # store the results df.to_csv(args.output_file, index = False, quoting = csv.QUOTE_NONNUMERIC, line_terminator = "\n") #pdb.set_trace() From 95efc6cd024982ccc7c201feac885467bc41660e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= Date: Wed, 13 Oct 2021 21:29:18 +0200 Subject: [PATCH 21/33] edit documentation --- Documentation.md | 7 +++++++ code/preprocessing/run_preprocessing.py | 5 ----- 2 files 
changed, 7 insertions(+), 5 deletions(-) diff --git a/Documentation.md b/Documentation.md index 865d6d61..f1db8209 100644 --- a/Documentation.md +++ b/Documentation.md @@ -25,7 +25,10 @@ Many tweets had different kind of punctuation, ..., emojis, and some of them eve ### Design Decisions After looking at the dataset closely, we chose to keep the core words of the sentence, ... +- remove stopwords like 'a' or 'is' +- remove punctation - use just englisch tweets +- tokenize ### Results @@ -36,6 +39,10 @@ Total: 295811 English tweets are 95%. So we can delete (maybe later translate) 5% of disrupting data. +Lenght of all tweets: +- before preprocessing: 52686072 +- after preprocessing (just englisch + punctation + stopwords): 39666607 +39666607/52686072 = 0.75 ### Interpretation Probably, no real interpretation possible, so feel free to leave this section out. diff --git a/code/preprocessing/run_preprocessing.py b/code/preprocessing/run_preprocessing.py index 4c92cd0b..21a633b3 100644 --- a/code/preprocessing/run_preprocessing.py +++ b/code/preprocessing/run_preprocessing.py @@ -46,7 +46,6 @@ # if args.language is not None: # preprocessors.append(LanguageRemover()) - if args.language is not None: # filter out one language before = len(df) @@ -55,16 +54,12 @@ print("Filtered out: {0}".format(before-after)) df.reset_index(drop=True, inplace=True) - - # call all preprocessing steps for preprocessor in preprocessors: df = preprocessor.fit_transform(df) # drop useless line which makes problems with csv del df['trans_dest\r'] -import pdb -pdb.set_trace() # store the results df.to_csv(args.output_file, index = False, quoting = csv.QUOTE_NONNUMERIC, line_terminator = "\n") #pdb.set_trace() From b2bf9bbeba3cc87567a2526a0c65f04ec0cf1203 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= Date: Wed, 13 Oct 2021 22:10:27 +0200 Subject: [PATCH 22/33] edit other files for test run --- code/feature_extraction/extract_features.py | 2 +- code/feature_extraction/hash_vector.py | 14 ++++++++------ code/util.py | 4 ++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/code/feature_extraction/extract_features.py b/code/feature_extraction/extract_features.py index 841d4d6d..fae6d04d 100644 --- a/code/feature_extraction/extract_features.py +++ b/code/feature_extraction/extract_features.py @@ -14,7 +14,7 @@ from code.feature_extraction.character_length import CharacterLength from code.feature_extraction.hash_vector import HashVector from code.feature_extraction.feature_collector import FeatureCollector -from code.util import COLUMN_TWEET, COLUMN_LABEL +from code.util import COLUMN_TWEET, COLUMN_LABEL, COLUMN_PREPROCESS # setting up CLI diff --git a/code/feature_extraction/hash_vector.py b/code/feature_extraction/hash_vector.py index 6f1207ed..852140be 100644 --- a/code/feature_extraction/hash_vector.py +++ b/code/feature_extraction/hash_vector.py @@ -15,21 +15,23 @@ from code.util import HASH_VECTOR_N_FEATURES # class for extracting the character-based length as a feature + + class HashVector(FeatureExtractor): - + # constructor def __init__(self, input_column): super().__init__([input_column], "{0}_hashvector".format(input_column)) - + # don't need to fit, so don't overwrite _set_variables() - + # compute the word length based on the inputs def _get_values(self, inputs): # inputs is list of text documents # create the transform - #pdb.set_trace() - vectorizer = HashingVectorizer(n_features=HASH_VECTOR_N_FEATURES) + # pdb.set_trace() + vectorizer = 
HashingVectorizer(n_features=HASH_VECTOR_N_FEATURES, + strip_accents='ascii', stop_words='english', ngram_range=(2, 2)) # encode document vector = vectorizer.fit_transform(inputs[0]) - return vector.toarray() diff --git a/code/util.py b/code/util.py index 6da7852a..37fe5bd7 100644 --- a/code/util.py +++ b/code/util.py @@ -17,8 +17,8 @@ COLUMN_LABEL = "label" COLUMN_PUNCTUATION = "tweet_no_punctuation" COLUMN_LANGUAGE = "language" - +COLUMN_PREPROCESS = 'preprocess_col' SUFFIX_TOKENIZED = "_tokenized" # number of features for hash vector -HASH_VECTOR_N_FEATURES = 20 \ No newline at end of file +HASH_VECTOR_N_FEATURES = 2**10 \ No newline at end of file From c879c9db2abc31aff70040fcb03562261af48a87 Mon Sep 17 00:00:00 2001 From: avocardio Date: Wed, 13 Oct 2021 16:02:16 -0500 Subject: [PATCH 23/33] deleted file --- new_preprocessing.py | 60 -------------------------------------------- 1 file changed, 60 deletions(-) delete mode 100644 new_preprocessing.py diff --git a/new_preprocessing.py b/new_preprocessing.py deleted file mode 100644 index b9ea5248..00000000 --- a/new_preprocessing.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Mon Oct 11 18:09:54 2021 - -@author: maximilian -""" - -import pandas as pd -import csv -import string -import nltk - -df = pd.read_csv("data/preprocessing/preprocessed.csv", quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n", low_memory=False) - -tokenized = [] - -for tweet in df['tweet'][0:6]: - sentences = nltk.sent_tokenize(tweet) - tokenized_tweet = [] - for sentence in sentences: - words = nltk.word_tokenize(sentence) - tokenized_tweet += words - - tokenized.append(tokenized_tweet) - -# --- - -def punctuation(rows): - - punct = set(string.punctuation) - {'#'} - - for row in rows: - for x in row: - if x in punct: - row.remove(x) - - return rows - -def emoji(rows): - - for row in rows: - for x in row: - if x.startswith('U+'): - x.encode('utf-16', 'surrogatepass') - x.decode('utf-16') - x.encode("raw_unicode_escape") - x.decode("latin_1") - - -for x in df['tweet_tokenized'][8].replace('\\',''): - if x.startswith('U+'): - x.encode('utf-16', 'surrogatepass') - x.decode('utf-16') - x.encode("raw_unicode_escape") - x.decode("latin_1") - - - - From 7e806da8941a6bc23dae58de74886d7383995129 Mon Sep 17 00:00:00 2001 From: avocardio Date: Wed, 13 Oct 2021 20:52:11 -0500 Subject: [PATCH 24/33] fix --- code/preprocessing/run_preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/preprocessing/run_preprocessing.py b/code/preprocessing/run_preprocessing.py index 21a633b3..e59db961 100644 --- a/code/preprocessing/run_preprocessing.py +++ b/code/preprocessing/run_preprocessing.py @@ -30,7 +30,7 @@ args = parser.parse_args() # load data -df = pd.read_csv(args.input_file, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n") +df = pd.read_csv(args.input_file, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n",low_memory=False) preprocess_col = 'preprocess_col' # collect all preprocessors From e729203c27df30c068024832b477ece693d6db71 Mon Sep 17 00:00:00 2001 From: avocardio Date: Thu, 14 Oct 2021 00:25:30 -0500 Subject: [PATCH 25/33] small changes / fixes --- code/preprocessing.sh | 12 ++++++------ code/preprocessing/create_labels.py | 2 +- code/preprocessing/punctuation_remover.py | 6 +++++- code/preprocessing/run_preprocessing.py | 4 +++- code/preprocessing/split_data.py | 2 +- code/preprocessing/stopwords.py | 10 +++++++--- 6 files changed, 23 insertions(+), 13 deletions(-) diff --git 
a/code/preprocessing.sh b/code/preprocessing.sh index 61f83ea6..07d47dc3 100755 --- a/code/preprocessing.sh +++ b/code/preprocessing.sh @@ -1,19 +1,19 @@ #!/bin/bash # create directory if not yet existing -mkdir -p data/preprocessing/split/ +#mkdir -p data/preprocessing/split/ # install all NLTK models -python -m nltk.downloader all +#python -m nltk.downloader all # add labels -echo " creating labels" +echo -e "\n -> creating labels\n" python -m code.preprocessing.create_labels data/raw/ data/preprocessing/labeled.csv # other preprocessing (removing punctuation etc.) -echo " general preprocessing" -python -m code.preprocessing.run_preprocessing data/preprocessing/labeled.csv data/preprocessing/preprocessed.csv --punctuation --tokenize -e data/preprocessing/pipeline.pickle +echo -e "\n -> general preprocessing\n" +python -m code.preprocessing.run_preprocessing data/preprocessing/labeled.csv data/preprocessing/preprocessed.csv --punctuation --stopwords --tokenize --language en -e data/preprocessing/pipeline.pickle # split the data set -echo " splitting the data set" +echo -e "\n -> splitting the data set\n" python -m code.preprocessing.split_data data/preprocessing/preprocessed.csv data/preprocessing/split/ -s 42 \ No newline at end of file diff --git a/code/preprocessing/create_labels.py b/code/preprocessing/create_labels.py index 21b1748d..860a5fe0 100644 --- a/code/preprocessing/create_labels.py +++ b/code/preprocessing/create_labels.py @@ -28,7 +28,7 @@ # load all csv files dfs = [] for file_path in file_paths: - dfs.append(pd.read_csv(file_path, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n")) + dfs.append(pd.read_csv(file_path, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n", low_memory=False)) # join all data into a single DataFrame df = pd.concat(dfs) diff --git a/code/preprocessing/punctuation_remover.py b/code/preprocessing/punctuation_remover.py index 6999e1fd..f56a5648 100644 --- a/code/preprocessing/punctuation_remover.py +++ b/code/preprocessing/punctuation_remover.py @@ -11,6 +11,10 @@ import string from code.preprocessing.preprocessor import Preprocessor from code.util import COLUMN_TWEET, COLUMN_PUNCTUATION +import pdb + +punct = set(string.punctuation).union(string.digits).union('—') +#print(str(''.join(punct))) # removes punctuation from the original tweet # inspired by https://stackoverflow.com/a/45600350 @@ -24,7 +28,7 @@ def __init__(self, inputcol, outputcol): # set internal variables based on input columns def _set_variables(self, inputs): # store punctuation for later reference - self._punctuation = "[{}]".format(string.punctuation) + self._punctuation = "[{}]".format(string.punctuation+string.digits+'’'+'—'+'”') # get preprocessed column based on data frame and internal variables def _get_values(self, inputs): diff --git a/code/preprocessing/run_preprocessing.py b/code/preprocessing/run_preprocessing.py index e59db961..e4745467 100644 --- a/code/preprocessing/run_preprocessing.py +++ b/code/preprocessing/run_preprocessing.py @@ -10,6 +10,7 @@ import argparse, csv, pickle import pandas as pd +from tqdm import tqdm from sklearn.pipeline import make_pipeline from code.preprocessing.punctuation_remover import PunctuationRemover from code.preprocessing.stopwords import StopwordsRemover @@ -31,6 +32,7 @@ # load data df = pd.read_csv(args.input_file, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n",low_memory=False) +df = df[0:1000] preprocess_col = 'preprocess_col' # collect all preprocessors @@ -55,7 +57,7 @@ df.reset_index(drop=True, inplace=True) # 
call all preprocessing steps -for preprocessor in preprocessors: +for preprocessor in tqdm(preprocessors): df = preprocessor.fit_transform(df) # drop useless line which makes problems with csv diff --git a/code/preprocessing/split_data.py b/code/preprocessing/split_data.py index 57bad668..88f0ff63 100644 --- a/code/preprocessing/split_data.py +++ b/code/preprocessing/split_data.py @@ -23,7 +23,7 @@ args = parser.parse_args() # load the data -df = pd.read_csv(args.input_file, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n") +df = pd.read_csv(args.input_file, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n", low_memory=False) # split into (training & validation) and test set X, X_test = train_test_split(df, test_size = args.test_size, random_state = args.seed, shuffle = True, stratify = df[COLUMN_LABEL]) diff --git a/code/preprocessing/stopwords.py b/code/preprocessing/stopwords.py index dad45285..07dbf16d 100644 --- a/code/preprocessing/stopwords.py +++ b/code/preprocessing/stopwords.py @@ -10,7 +10,11 @@ from code.util import COLUMN_TWEET, COLUMN_PUNCTUATION from nltk.corpus import stopwords import pandas as pd + STOPWORDS = set(stopwords.words('english')) +LINKS = set('https') +EMOJIS = set('U+') + # removes punctuation from the original tweet # inspired by https://stackoverflow.com/a/45600350 class StopwordsRemover(Preprocessor): @@ -28,11 +32,11 @@ def __init__(self, inputcol, outputcol): # get preprocessed column based on data frame and internal variables def _get_values(self, inputs): - # replace punctuation with empty string - # replace stop words with empty string - # replace duplicate words with empty string + # replace stopwords with empty string column = inputs[0].str #.replace(self._punctuation, "") column = [' '.join([word for word in tweet if word.lower() not in STOPWORDS]) for tweet in column.split()] + # column = [' '.join([word for word in tweet if word.startswith('https') is False]) for tweet in column.split()] + # column = [' '.join([word for word in tweet if word.encode('unicode-escape').startswith('U00') is False]) for tweet in column.split()] column = pd.Series(column) #import pdb #pdb.set_trace() From 9b5d2138e66ee087c65bd25d5d2a8dd53b8546b7 Mon Sep 17 00:00:00 2001 From: avocardio Date: Thu, 14 Oct 2021 00:33:00 -0500 Subject: [PATCH 26/33] commented out small dataset --- code/preprocessing/run_preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/preprocessing/run_preprocessing.py b/code/preprocessing/run_preprocessing.py index e4745467..5c310b47 100644 --- a/code/preprocessing/run_preprocessing.py +++ b/code/preprocessing/run_preprocessing.py @@ -32,7 +32,7 @@ # load data df = pd.read_csv(args.input_file, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n",low_memory=False) -df = df[0:1000] +#df = df[0:1000] preprocess_col = 'preprocess_col' # collect all preprocessors From 3a13c43dedffa2ce8b727596cb03d31fa8c9879d Mon Sep 17 00:00:00 2001 From: avocardio Date: Thu, 14 Oct 2021 22:01:39 -0500 Subject: [PATCH 27/33] renamed file, added emoji / link remover --- code/preprocessing.sh | 2 +- code/preprocessing/punctuation_remover.py | 4 ++-- code/preprocessing/run_preprocessing.py | 12 ++++++------ .../{stopwords.py => string_remover.py} | 19 +++++++++++-------- 4 files changed, 20 insertions(+), 17 deletions(-) rename code/preprocessing/{stopwords.py => string_remover.py} (74%) diff --git a/code/preprocessing.sh b/code/preprocessing.sh index 07d47dc3..b381f36e 100755 --- a/code/preprocessing.sh +++ b/code/preprocessing.sh 
@@ -12,7 +12,7 @@ python -m code.preprocessing.create_labels data/raw/ data/preprocessing/labeled. # other preprocessing (removing punctuation etc.) echo -e "\n -> general preprocessing\n" -python -m code.preprocessing.run_preprocessing data/preprocessing/labeled.csv data/preprocessing/preprocessed.csv --punctuation --stopwords --tokenize --language en -e data/preprocessing/pipeline.pickle +python -m code.preprocessing.run_preprocessing data/preprocessing/labeled.csv data/preprocessing/preprocessed.csv --punctuation --strings --tokenize --language en -e data/preprocessing/pipeline.pickle # split the data set echo -e "\n -> splitting the data set\n" diff --git a/code/preprocessing/punctuation_remover.py b/code/preprocessing/punctuation_remover.py index f56a5648..b8e0258c 100644 --- a/code/preprocessing/punctuation_remover.py +++ b/code/preprocessing/punctuation_remover.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ -Preprocessor that removes punctuation from the original tweet text. +Preprocessor that removes punctuation & digits from the original tweet text. Created on Wed Sep 29 09:45:56 2021 @@ -28,7 +28,7 @@ def __init__(self, inputcol, outputcol): # set internal variables based on input columns def _set_variables(self, inputs): # store punctuation for later reference - self._punctuation = "[{}]".format(string.punctuation+string.digits+'’'+'—'+'”') + self._punctuation = "[{}]".format(string.punctuation+string.digits+'’'+'—'+'”'+'➡️') # get preprocessed column based on data frame and internal variables def _get_values(self, inputs): diff --git a/code/preprocessing/run_preprocessing.py b/code/preprocessing/run_preprocessing.py index 5c310b47..b584499a 100644 --- a/code/preprocessing/run_preprocessing.py +++ b/code/preprocessing/run_preprocessing.py @@ -13,7 +13,7 @@ from tqdm import tqdm from sklearn.pipeline import make_pipeline from code.preprocessing.punctuation_remover import PunctuationRemover -from code.preprocessing.stopwords import StopwordsRemover +from code.preprocessing.string_remover import StringRemover from code.preprocessing.language_remover import LanguageRemover from code.preprocessing.tokenizer import Tokenizer from code.util import COLUMN_TWEET, SUFFIX_TOKENIZED, COLUMN_LANGUAGE @@ -23,7 +23,7 @@ parser.add_argument("input_file", help = "path to the input csv file") parser.add_argument("output_file", help = "path to the output csv file") parser.add_argument("-p", "--punctuation", action = "store_true", help = "remove punctuation") -parser.add_argument("-s", "--stopwords", action = "store_true", help = "remove stopwords") +parser.add_argument("-s", "--strings", action = "store_true", help = "remove stopwords, links and emojis") parser.add_argument("-t", "--tokenize", action = "store_true", help = "tokenize given column into individual words") #parser.add_argument("--tokenize_input", help = "input column to tokenize", default = 'output') parser.add_argument("-e", "--export_file", help = "create a pipeline and export to the given location", default = None) @@ -32,15 +32,15 @@ # load data df = pd.read_csv(args.input_file, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n",low_memory=False) -#df = df[0:1000] +df = df[0:1000] preprocess_col = 'preprocess_col' # collect all preprocessors preprocessors = [] if args.punctuation: preprocessors.append(PunctuationRemover("tweet", preprocess_col)) -if args.stopwords: - preprocessors.append(StopwordsRemover(preprocess_col, preprocess_col)) +if args.strings: + 
preprocessors.append(StringRemover(preprocess_col, preprocess_col)) if args.tokenize: preprocessors.append(Tokenizer(preprocess_col, preprocess_col + SUFFIX_TOKENIZED)) @@ -53,7 +53,7 @@ before = len(df) df = df[df['language']==args.language] after = len(df) - print("Filtered out: {0}".format(before-after)) + print("Filtered out: {0} (not 'en')".format(before-after)) df.reset_index(drop=True, inplace=True) # call all preprocessing steps diff --git a/code/preprocessing/stopwords.py b/code/preprocessing/string_remover.py similarity index 74% rename from code/preprocessing/stopwords.py rename to code/preprocessing/string_remover.py index 07dbf16d..9a5dad34 100644 --- a/code/preprocessing/stopwords.py +++ b/code/preprocessing/string_remover.py @@ -12,12 +12,10 @@ import pandas as pd STOPWORDS = set(stopwords.words('english')) -LINKS = set('https') -EMOJIS = set('U+') # removes punctuation from the original tweet # inspired by https://stackoverflow.com/a/45600350 -class StopwordsRemover(Preprocessor): +class StringRemover(Preprocessor): # constructor def __init__(self, inputcol, outputcol): @@ -32,12 +30,17 @@ def __init__(self, inputcol, outputcol): # get preprocessed column based on data frame and internal variables def _get_values(self, inputs): - # replace stopwords with empty string column = inputs[0].str #.replace(self._punctuation, "") - column = [' '.join([word for word in tweet if word.lower() not in STOPWORDS]) for tweet in column.split()] + + # replace stopwords with empty string + # column = [' '.join([word for word in tweet if word.lower() not in STOPWORDS]) for tweet in column.split()] + + # replace links with empty string # column = [' '.join([word for word in tweet if word.startswith('https') is False]) for tweet in column.split()] - # column = [' '.join([word for word in tweet if word.encode('unicode-escape').startswith('U00') is False]) for tweet in column.split()] + + # replace emojis with empty string + column = [' '.join([word for word in tweet if str(word.encode('unicode-escape').decode('ASCII')).__contains__('\\') is False]) for tweet in column.split()] + column = pd.Series(column) - #import pdb - #pdb.set_trace() + return column \ No newline at end of file From 373008961462ef77f89f649c10f472c00b379964 Mon Sep 17 00:00:00 2001 From: avocardio Date: Thu, 14 Oct 2021 22:05:48 -0500 Subject: [PATCH 28/33] small mistake --- code/preprocessing/run_preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/preprocessing/run_preprocessing.py b/code/preprocessing/run_preprocessing.py index b584499a..74f04757 100644 --- a/code/preprocessing/run_preprocessing.py +++ b/code/preprocessing/run_preprocessing.py @@ -22,7 +22,7 @@ parser = argparse.ArgumentParser(description = "Various preprocessing steps") parser.add_argument("input_file", help = "path to the input csv file") parser.add_argument("output_file", help = "path to the output csv file") -parser.add_argument("-p", "--punctuation", action = "store_true", help = "remove punctuation") +parser.add_argument("-p", "--punctuation", action = "store_true", help = "remove punctuation and special characters") parser.add_argument("-s", "--strings", action = "store_true", help = "remove stopwords, links and emojis") parser.add_argument("-t", "--tokenize", action = "store_true", help = "tokenize given column into individual words") #parser.add_argument("--tokenize_input", help = "input column to tokenize", default = 'output') From cc5d908c064f4f305fcf5c02b34dff066fa2b94b Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Magnus=20M=C3=BCller?= Date: Fri, 15 Oct 2021 16:24:43 +0200 Subject: [PATCH 29/33] preprocessing done, edit string remover it works now!!! --- code/preprocessing/string_remover.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/code/preprocessing/string_remover.py b/code/preprocessing/string_remover.py index 9a5dad34..7ee1559f 100644 --- a/code/preprocessing/string_remover.py +++ b/code/preprocessing/string_remover.py @@ -33,14 +33,15 @@ def _get_values(self, inputs): column = inputs[0].str #.replace(self._punctuation, "") # replace stopwords with empty string - # column = [' '.join([word for word in tweet if word.lower() not in STOPWORDS]) for tweet in column.split()] - + column = [' '.join([word for word in tweet if word.lower() not in STOPWORDS]) for tweet in column.split()] + column = pd.Series(column) # replace links with empty string - # column = [' '.join([word for word in tweet if word.startswith('https') is False]) for tweet in column.split()] - + column = [' '.join([word for word in tweet if word.startswith('https') is False]) for tweet in column.str.split()] + column = pd.Series(column) # replace emojis with empty string - column = [' '.join([word for word in tweet if str(word.encode('unicode-escape').decode('ASCII')).__contains__('\\') is False]) for tweet in column.split()] + column = [' '.join([word for word in tweet if str(word.encode('unicode-escape').decode('ASCII')).__contains__('\\') is False]) for tweet in column.str.split()] column = pd.Series(column) - + import pdb + pdb.set_trace() return column \ No newline at end of file From 2e36ac8a4de71bb374edf85069259a2df5806ac9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Magnus=20M=C3=BCller?= Date: Fri, 15 Oct 2021 17:01:51 +0200 Subject: [PATCH 30/33] prettier --- code/preprocessing/run_preprocessing.py | 4 ++-- code/preprocessing/string_remover.py | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/code/preprocessing/run_preprocessing.py b/code/preprocessing/run_preprocessing.py index 74f04757..78181775 100644 --- a/code/preprocessing/run_preprocessing.py +++ b/code/preprocessing/run_preprocessing.py @@ -32,7 +32,7 @@ # load data df = pd.read_csv(args.input_file, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n",low_memory=False) -df = df[0:1000] + preprocess_col = 'preprocess_col' # collect all preprocessors @@ -64,7 +64,7 @@ del df['trans_dest\r'] # store the results df.to_csv(args.output_file, index = False, quoting = csv.QUOTE_NONNUMERIC, line_terminator = "\n") -#pdb.set_trace() + # create a pipeline if necessary and store it as pickle file if args.export_file is not None: pipeline = make_pipeline(*preprocessors) diff --git a/code/preprocessing/string_remover.py b/code/preprocessing/string_remover.py index 7ee1559f..7d11d045 100644 --- a/code/preprocessing/string_remover.py +++ b/code/preprocessing/string_remover.py @@ -42,6 +42,4 @@ def _get_values(self, inputs): column = [' '.join([word for word in tweet if str(word.encode('unicode-escape').decode('ASCII')).__contains__('\\') is False]) for tweet in column.str.split()] column = pd.Series(column) - import pdb - pdb.set_trace() return column \ No newline at end of file From 1a9a399b7e286a3fa232b0a28177b3721fd229b1 Mon Sep 17 00:00:00 2001 From: Samuel Date: Sat, 16 Oct 2021 17:59:58 +0200 Subject: [PATCH 31/33] Added Photo Feature --- codes/feature_extraction.sh | 14 ++++ codes/feature_extraction/extract_features.py | 80 ++++++++++++++++++++ codes/feature_extraction/photo_bool.py | 33 ++++++++ codes/util.py | 
25 ++++++ 4 files changed, 152 insertions(+) create mode 100755 codes/feature_extraction.sh create mode 100644 codes/feature_extraction/extract_features.py create mode 100644 codes/feature_extraction/photo_bool.py create mode 100644 codes/util.py diff --git a/codes/feature_extraction.sh b/codes/feature_extraction.sh new file mode 100755 index 00000000..9597907d --- /dev/null +++ b/codes/feature_extraction.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# create directory if not yet existing +mkdir -p data/feature_extraction/ + +# run feature extraction on training set (may need to fit extractors) +echo " training set" +python -m codes.feature_extraction.extract_features data/preprocessing/split/training.csv data/feature_extraction/training.pickle -e data/feature_extraction/pipeline.pickle --char_length --photo_bool + +# run feature extraction on validation set and test set (with pre-fit extractors) +echo " validation set" +python -m codes.feature_extraction.extract_features data/preprocessing/split/validation.csv data/feature_extraction/validation.pickle -i data/feature_extraction/pipeline.pickle +echo " test set" +python -m codes.feature_extraction.extract_features data/preprocessing/split/test.csv data/feature_extraction/test.pickle -i data/feature_extraction/pipeline.pickle diff --git a/codes/feature_extraction/extract_features.py b/codes/feature_extraction/extract_features.py new file mode 100644 index 00000000..c4ea4c2a --- /dev/null +++ b/codes/feature_extraction/extract_features.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Runs the specified collection of feature extractors. + +Created on Wed Sep 29 11:00:24 2021 + +@author: lbechberger +""" + +import argparse, csv, pickle +import pandas as pd +import numpy as np +from codes.feature_extraction.character_length import CharacterLength +from codes.feature_extraction.hash_vector import HashVector +from codes.feature_extraction.feature_collector import FeatureCollector +from codes.feature_extraction.photo_bool import PhotoBool +from codes.util import COLUMN_TWEET, COLUMN_LABEL, COLUMN_PREPROCESS, COLUMN_PHOTOS + + +# setting up CLI +parser = argparse.ArgumentParser(description = "Feature Extraction") +parser.add_argument("input_file", help = "path to the input csv file") +parser.add_argument("output_file", help = "path to the output pickle file") +parser.add_argument("-e", "--export_file", help = "create a pipeline and export to the given location", default = None) +parser.add_argument("-i", "--import_file", help = "import an existing pipeline from the given location", default = None) +parser.add_argument("-c", "--char_length", action = "store_true", help = "compute the number of characters in the tweet") +parser.add_argument("--hash_vec", action = "store_true", help = "compute the hash vector of the tweet") +parser.add_argument("--photo_bool", action= "store_true", help= "XXXXXXXXXXXXXXXXXXXXXXXXXX") +args = parser.parse_args() + +# load data +df = pd.read_csv(args.input_file, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n") + +if args.import_file is not None: + # simply import an existing FeatureCollector + with open(args.import_file, "rb") as f_in: + feature_collector = pickle.load(f_in) + +else: # need to create FeatureCollector manually + + # collect all feature extractors + features = [] + if args.char_length: + # character length of original tweet (without any changes) + features.append(CharacterLength(COLUMN_TWEET)) + if args.hash_vec: + # hash of original tweet (without any changes) +
features.append(HashVector(COLUMN_TWEET)) + if args.photo_bool: + # do photos exist or not + features.append(PhotoBool(COLUMN_PHOTOS)) + + # create overall FeatureCollector + feature_collector = FeatureCollector(features) + + # fit it on the given data set (assumed to be training data) + feature_collector.fit(df) + + + +# apply the given FeatureCollector on the current data set +# maps the pandas DataFrame to a numpy array +feature_array = feature_collector.transform(df) + +# get label array +label_array = np.array(df[COLUMN_LABEL]) +label_array = label_array.reshape(-1, 1) + +# store the results +results = {"features": feature_array, "labels": label_array, + "feature_names": feature_collector.get_feature_names()} + +with open(args.output_file, 'wb') as f_out: + pickle.dump(results, f_out) + +# export the FeatureCollector as pickle file if desired by user +if args.export_file is not None: + with open(args.export_file, 'wb') as f_out: + pickle.dump(feature_collector, f_out) diff --git a/codes/feature_extraction/photo_bool.py b/codes/feature_extraction/photo_bool.py new file mode 100644 index 00000000..93942b13 --- /dev/null +++ b/codes/feature_extraction/photo_bool.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Simple feature that tells whether photos are present or not. + +Created on Wed Sep 29 12:29:25 2021 + +@author: shagemann +""" + +import numpy as np +from codes.feature_extraction.feature_extractor import FeatureExtractor + +# class for extracting the photo-bool as a feature
class PhotoBool(FeatureExtractor): + + # constructor + def __init__(self, input_column): + super().__init__([input_column], "{0}_photo_bool".format(input_column)) + + # don't need to fit, so don't overwrite _set_variables() + + # return 0 if there are no photos, 1 otherwise + def _get_values(self, inputs): + values = [] + for index, row in inputs[0].iteritems(): + if len(row) > 2: + values.append(1) + else: + values.append(0) + result = np.array(values) + result = result.reshape(-1,1) + return result diff --git a/codes/util.py b/codes/util.py new file mode 100644 index 00000000..0b5f4963 --- /dev/null +++ b/codes/util.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Utility file for collecting frequently used constants and helper functions.
+ +Created on Wed Sep 29 10:50:36 2021 + +@author: lbechberger +""" + +# column names for the original data frame +COLUMN_TWEET = "tweet" +COLUMN_LIKES = "likes_count" +COLUMN_RETWEETS = "retweets_count" +COLUMN_PHOTOS = "photos" + +# column names of novel columns for preprocessing +COLUMN_LABEL = "label" +COLUMN_PUNCTUATION = "tweet_no_punctuation" +COLUMN_LANGUAGE = "language" +COLUMN_PREPROCESS = 'preprocess_col' +SUFFIX_TOKENIZED = "_tokenized" + +# number of features for hash vector +HASH_VECTOR_N_FEATURES = 2**3 From 2b8377d28879f29bb308be32037a8d904dbe7ff5 Mon Sep 17 00:00:00 2001 From: Samuel Date: Sun, 17 Oct 2021 19:06:11 +0200 Subject: [PATCH 32/33] Added replies count feature; added 'help' description --- codes/feature_extraction.sh | 2 +- codes/feature_extraction/extract_features.py | 9 ++++-- codes/feature_extraction/replies_count.py | 30 ++++++++++++++++++++ codes/util.py | 1 + 4 files changed, 39 insertions(+), 3 deletions(-) create mode 100644 codes/feature_extraction/replies_count.py diff --git a/codes/feature_extraction.sh b/codes/feature_extraction.sh index 9597907d..e6b7ea3c 100755 --- a/codes/feature_extraction.sh +++ b/codes/feature_extraction.sh @@ -5,7 +5,7 @@ mkdir -p data/feature_extraction/ # run feature extraction on training set (may need to fit extractors) echo " training set" -python -m codes.feature_extraction.extract_features data/preprocessing/split/training.csv data/feature_extraction/training.pickle -e data/feature_extraction/pipeline.pickle --char_length --photo_bool +python -m codes.feature_extraction.extract_features data/preprocessing/split/training.csv data/feature_extraction/training.pickle -e data/feature_extraction/pipeline.pickle --char_length --photo_bool --replies_count # run feature extraction on validation set and test set (with pre-fit extractors) echo " validation set" diff --git a/codes/feature_extraction/extract_features.py b/codes/feature_extraction/extract_features.py index c4ea4c2a..f3526564 100644 --- a/codes/feature_extraction/extract_features.py +++ b/codes/feature_extraction/extract_features.py @@ -15,7 +15,8 @@ from codes.feature_extraction.hash_vector import HashVector from codes.feature_extraction.feature_collector import FeatureCollector from codes.feature_extraction.photo_bool import PhotoBool -from codes.util import COLUMN_TWEET, COLUMN_LABEL, COLUMN_PREPROCESS, COLUMN_PHOTOS +from codes.feature_extraction.replies_count import RepliesCount +from codes.util import COLUMN_TWEET, COLUMN_LABEL, COLUMN_PREPROCESS, COLUMN_PHOTOS, COLUMN_REPLIES # setting up CLI @@ -26,7 +27,8 @@ parser.add_argument("-i", "--import_file", help = "import an existing pipeline from the given location", default = None) parser.add_argument("-c", "--char_length", action = "store_true", help = "compute the number of characters in the tweet") parser.add_argument("--hash_vec", action = "store_true", help = "compute the hash vector of the tweet") -parser.add_argument("--photo_bool", action= "store_true", help= "XXXXXXXXXXXXXXXXXXXXXXXXXX") +parser.add_argument("--photo_bool", action= "store_true", help= "tells whether the tweet contains photos or not") +parser.add_argument("--replies_count", action="store_true", help="compute the number of replies of the tweet") args = parser.parse_args() # load data @@ -50,6 +52,9 @@ if args.photo_bool: # do photos exist or not features.append(PhotoBool(COLUMN_PHOTOS)) + if args.replies_count: + # how many replies does the tweet have + features.append(RepliesCount(COLUMN_REPLIES)) # create overall FeatureCollector 
feature_collector = FeatureCollector(features) diff --git a/codes/feature_extraction/replies_count.py b/codes/feature_extraction/replies_count.py new file mode 100644 index 00000000..dd72c4f7 --- /dev/null +++ b/codes/feature_extraction/replies_count.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Simple feature that tells how many replies a tweet has. + +Created on Wed Sep 29 12:29:25 2021 + +@author: shagemann +""" + +import numpy as np +from codes.feature_extraction.feature_extractor import FeatureExtractor + +# class for extracting the replies count as a feature +class RepliesCount(FeatureExtractor): + + # constructor + def __init__(self, input_column): + super().__init__([input_column], "{0}_replies_count".format(input_column)) + + # don't need to fit, so don't overwrite _set_variables() + + # use the replies count column as a feature + def _get_values(self, inputs): + values = [] + for index, row in inputs[0].iteritems(): + values.append(row) + result = np.array(values) + result = result.reshape(-1,1) + return result diff --git a/codes/util.py b/codes/util.py index 0b5f4963..d845bded 100644 --- a/codes/util.py +++ b/codes/util.py @@ -13,6 +13,7 @@ COLUMN_LIKES = "likes_count" COLUMN_RETWEETS = "retweets_count" COLUMN_PHOTOS = "photos" +COLUMN_REPLIES = "replies_count" # column names of novel columns for preprocessing COLUMN_LABEL = "label" From b74f3e9fe19b266295593aa079906922114da724 Mon Sep 17 00:00:00 2001 From: max <74978236+avocardio@users.noreply.github.com> Date: Sun, 17 Oct 2021 19:42:29 -0500 Subject: [PATCH 33/33] Appends int(row) instead of row, same as video_bool --- codes/feature_extraction/replies_count.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codes/feature_extraction/replies_count.py b/codes/feature_extraction/replies_count.py index dd72c4f7..ada34fd4 100644 --- a/codes/feature_extraction/replies_count.py +++ b/codes/feature_extraction/replies_count.py @@ -24,7 +24,7 @@ def __init__(self, input_column): def _get_values(self, inputs): values = [] for index, row in inputs[0].iteritems(): - values.append(row) + values.append(int(row)) result = np.array(values) result = result.reshape(-1,1) return result
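
For illustration, a minimal self-contained sketch of what the two new extractors compute on made-up values; it mirrors the PhotoBool length check and the int(row) cast from the final patch rather than importing the codes package, and the sample column contents are assumptions, not actual data.

import numpy as np
import pandas as pd

# toy rows mirroring the relevant raw-data columns (values are made up)
df = pd.DataFrame({
    "photos": ["[]", "['https://pbs.twimg.com/media/abc.jpg']"],
    "replies_count": [0.0, 7.0],  # counts may be parsed as floats, hence the int(row) cast
})

# PhotoBool logic: the photos column holds a stringified list, so anything
# longer than the two characters of "[]" counts as "has photos"
photo_feature = np.array([1 if len(row) > 2 else 0 for row in df["photos"]]).reshape(-1, 1)

# RepliesCount logic after the last patch: cast each value to int
replies_feature = np.array([int(row) for row in df["replies_count"]]).reshape(-1, 1)

print(photo_feature.ravel())    # [0 1]
print(replies_feature.ravel())  # [0 7]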