From 1a9d767b97ffdf572069a69c20ba39147130771c Mon Sep 17 00:00:00 2001
From: Inga Ulusoy
Date: Tue, 28 May 2024 09:11:30 +0200
Subject: [PATCH] update tests part 1

---
 .github/workflows/Part2_3/example1.py         |  34 ++++
 .github/workflows/Part2_3/example2.py         |  21 ++
 .github/workflows/Part2_3/example3.py         |  25 +++
 .github/workflows/Part2_3/example4.py         | 187 +++++++++++++++++
 .../workflows/Part2_3/example_jupyter.ipynb   | 100 ++++++++++
 {chapter1 => chapter2_3}/example1.py          |   0
 {chapter1 => chapter2_3}/example2.py          |   0
 {chapter1 => chapter2_3}/example3.py          |   0
 chapter2_3/example4.py                        | 188 ++++++++++++++++++
 chapter2_3/example_jupyter.ipynb              | 103 ++++++++++
 ...est_fixmes.py => test_linter_formatter.py} |   9 +-
 11 files changed, 664 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/Part2_3/example1.py
 create mode 100644 .github/workflows/Part2_3/example2.py
 create mode 100644 .github/workflows/Part2_3/example3.py
 create mode 100644 .github/workflows/Part2_3/example4.py
 create mode 100644 .github/workflows/Part2_3/example_jupyter.ipynb
 rename {chapter1 => chapter2_3}/example1.py (100%)
 rename {chapter1 => chapter2_3}/example2.py (100%)
 rename {chapter1 => chapter2_3}/example3.py (100%)
 create mode 100644 chapter2_3/example4.py
 create mode 100644 chapter2_3/example_jupyter.ipynb
 rename tests/{test_fixmes.py => test_linter_formatter.py} (89%)

diff --git a/.github/workflows/Part2_3/example1.py b/.github/workflows/Part2_3/example1.py
new file mode 100644
index 0000000..2fb0df9
--- /dev/null
+++ b/.github/workflows/Part2_3/example1.py
@@ -0,0 +1,34 @@
+import os
+import glob
+
+
+# find all png files in a folder
+def find_files(path=None, pattern="*.png", recursive=True, limit=20) -> list:
+    """Find image files on the file system.
+
+    :param path:
+        The base directory where we are looking for the images. Defaults to None, which uses the XDG data directory if set or the current working directory otherwise.
+    :param pattern:
+        The naming pattern that the filename should match. Defaults to
+        "*.png". Can be used to allow other patterns or to only include
+        specific prefixes or suffixes.
+    :param recursive:
+        Whether to recurse into subdirectories.
+    :param limit:
+        The maximum number of images to be found. Defaults to 20.
+        To return all images, set to None.
+    """
+    if path is None:
+        path = os.environ.get("XDG_DATA_HOME", ".")
+
+    result = list(glob.glob(f"{path}/{pattern}", recursive=recursive))
+
+    if limit is not None:
+        result = result[:limit]
+
+    return result
+
+
+if __name__ == "__main__":
+    files = find_files(path="./data/")
+    print("Found files {}".format(files))
diff --git a/.github/workflows/Part2_3/example2.py b/.github/workflows/Part2_3/example2.py
new file mode 100644
index 0000000..7f1c00b
--- /dev/null
+++ b/.github/workflows/Part2_3/example2.py
@@ -0,0 +1,21 @@
+import numpy as np
+
+
+def area_circ(r_in):
+    """Calculate the area of a circle with given radius.
+
+    :Input: The radius of the circle (float, >=0).
+    :Returns: The area of the circle (float)."""
+    if r_in < 0:
+        raise ValueError("The radius must be >= 0.")
+    Kreis = np.pi * r_in**2
+    print(
+        """The area of a circle with radius r = {:3.2f}cm is A = {:4.2f}cm2.""".format(
+            r_in, Kreis
+        )
+    )
+    return Kreis
+
+
+if __name__ == "__main__":
+    _ = area_circ(5.0)
diff --git a/.github/workflows/Part2_3/example3.py b/.github/workflows/Part2_3/example3.py
new file mode 100644
index 0000000..c2122ec
--- /dev/null
+++ b/.github/workflows/Part2_3/example3.py
@@ -0,0 +1,25 @@
+def validate_data_dict(data_dict):
+    if not data_dict:
+        raise ValueError("data_dict is empty")
+    for something, otherthing in data_dict.items():
+        if not otherthing:
+            raise ValueError(f"The dict content under {something} is empty.")
+        if not isinstance(otherthing, dict):
+            raise ValueError(
+                f"The content of {something} is not a dict but {type(otherthing)}."
+            )
+
+    validation_list = ["data", "file_type", "sofa", "paragraph"]
+    missing_cats = []
+    for category in validation_list:
+        if category not in list(otherthing.keys()):
+            missing_cats.append(category)
+
+    if missing_cats:
+        raise ValueError(f"Data dict is missing categories: {missing_cats}")
+
+
+if __name__ == "__main__":
+    data_dict = {}
+    data_dict = {"test": {"testing": "just testing"}}
+    validate_data_dict(data_dict)
diff --git a/.github/workflows/Part2_3/example4.py b/.github/workflows/Part2_3/example4.py
new file mode 100644
index 0000000..12cbf92
--- /dev/null
+++ b/.github/workflows/Part2_3/example4.py
@@ -0,0 +1,187 @@
+from collections import defaultdict
+
+import numpy as np
+import pandas as pd
+
+
+map_expressions = {
+    "KAT1MoralisierendesSegment": "KAT1-Moralisierendes Segment",
+    "Moralwerte": "KAT2-Moralwerte",
+    "KAT2Subjektive_Ausdrcke": "KAT2-Subjektive Ausdrücke",
+    "Protagonistinnen2": "KAT3-Gruppe",
+    "Protagonistinnen": "KAT3-Rolle",
+    "Protagonistinnen3": "KAT3-own/other",
+    "KommunikativeFunktion": "KAT4-Kommunikative Funktion",
+    "Forderung": "KAT5-Forderung explizit",
+    "KAT5Ausformulierung": "KAT5-Forderung implizit",
+    "Kommentar": "KOMMENTAR",
+}
+
+
+def validate_data_dict(data_dict):
+    if not data_dict:
+        raise ValueError("data_dict is empty")
+    for data_file_name, data_file in data_dict.items():
+        validation_list = ["data", "file_type", "sofa", "paragraph"]
+        missing_cats = []
+        for category in validation_list:
+            if category not in list(data_file.keys()):
+                missing_cats.append(category)
+
+        if missing_cats:
+            raise ValueError(f"Data dict is missing categories: {missing_cats}")
+
+
+class AnalyseOccurrence:
+    """Contains statistical information methods about the data."""
+
+    def __init__(
+        self,
+        data_dict: dict,
+        mode: str = "instances",
+        file_names: str = None,
+    ) -> None:
+
+        validate_data_dict(data_dict)
+
+        self.mode = mode
+        self.data_dict = data_dict
+        self.mode_dict = {
+            "instances": self.report_instances,
+            "spans": self.report_spans,
+            "span_index": self.report_index,
+        }
+        self.file_names = self._initialize_files(file_names)
+        self.instance_dict = self._initialize_dict()
+        # call the analysis method
+        self.mode_dict[self.mode]()
+        # map the df columns to the expressions given
+        self.map_categories()
+
+    def _initialize_files(self, file_names: str) -> list:
+        """Helper method to get file names in list."""
+        # get the file names from the global dict of dicts
+        if file_names is None:
+            file_names = list(self.data_dict.keys())
+        # or use the file names that were passed explicitly
+        elif isinstance(file_names, str):
+            file_names = [file_names]
+        return file_names
+
+    def _initialize_dict(self) -> defaultdict:
+        """Helper method to initialize dict."""
+        return defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
+
+    def _initialize_df(self):
+        """Helper method to initialize data frame."""
+        self.df = pd.DataFrame(self.instance_dict)
+        self.df.index = self.df.index.set_names((["Main Category", "Sub Category"]))
+
+    def _get_categories(self, span_dict, file_name):
+        """Helper method to initialize a dict with the given main and sub categories."""
+        for main_cat_key, main_cat_value in span_dict.items():
+            for sub_cat_key, sub_cat_value in main_cat_value.items():
+                # the tuple index makes it easy to convert the dict into a pandas dataframe
+                self.instance_dict[file_name][(main_cat_key, sub_cat_key)] = len(
+                    sub_cat_value
+                )
+        return self.instance_dict
+
+    def _add_total(self):
+        """Helper method to set additional headers in data frame."""
+        self.df.loc[("total instances", "with invalid"), :] = self.df.sum(axis=0).values
+        self.df.loc[("total instances", "without invalid"), :] = (
+            self.df.loc[("total instances", "with invalid"), :].values
+            - self.df.loc["KAT1MoralisierendesSegment", "Keine Moralisierung"].values
+        )
+
+    def _clean_df(self):
+        """Helper method to sort data frame and clean up values."""
+        self.df = self.df.sort_values(
+            by=[
+                "Main Category",
+                "Sub Category",
+                # self.file_names[0],
+            ],
+            ascending=True,
+        )
+        # fill NaN with 0 for instances or None for spans
+        if self.mode == "instances":
+            self.df = self.df.fillna(0)
+        if self.mode == "spans":
+            self.df = self.df.replace({np.nan: None})
+        # remove quotes - not sure if this is necessary
+        # self.df = self.df.applymap(lambda x: x.replace('"','') if isinstance(x, str) else x)
+
+    def report_instances(self):
+        """Reports number of occurrences of a category per text source."""
+        # instances reports the number of occurrences
+        # filename: main_cat: sub_cat: instances
+        for file_name in self.file_names:
+            span_dict = self.data_dict[file_name]["data"]
+            # initialize total instances rows for easier setting later.
+            # only for mode instances
+            self.instance_dict[file_name][("total instances", "with invalid")] = 0
+            self.instance_dict[file_name][("total instances", "without invalid")] = 0
+            self.instance_dict = self._get_categories(span_dict, file_name)
+        # initialize data frame
+        self._initialize_df()
+        # add rows for total instances
+        # only do this for mode instances
+        self._add_total()
+
+    def report_spans(self):
+        """Reports spans of a category per text source."""
+        # span reports the spans of the annotations separated by separator-token
+        self.instance_dict = self._get_categories(
+            self.data_dict[self.file_names[0]]["data"], self.file_names[0]
+        )
+        self._initialize_df()
+        self.df[:] = self.df[:].astype("object")
+        for file_name in self.file_names:
+            span_dict = self.data_dict[file_name]["data"]
+            span_text = self.data_dict[file_name]["sofa"]
+            for main_cat_key, main_cat_value in span_dict.items():
+                for sub_cat_key in main_cat_value.keys():
+                    # save the span begin and end character index for further analysis
+                    # span_dict[main_cat_key][sub_cat_key] =
+                    # find the text for each span
+                    span_annotated_text = [
+                        span_text[span["begin"] : span["end"]]
+                        for span in span_dict[main_cat_key][sub_cat_key]
+                    ]
+                    # clean the spans from #
+                    span_annotated_text = [
+                        span.replace("#", "") for span in span_annotated_text
+                    ]
+                    # clean the spans from "
+                    # span_annotated_text = [
+                    #     span.replace('"', "") for span in span_annotated_text
+                    # ]
+                    # convert list to &-separated spans
+                    span_annotated_text = " & ".join(span_annotated_text)
+                    self.df.at[
+                        (main_cat_key, sub_cat_key),
+                        file_name,
+                    ] = span_annotated_text
+
+    def report_index(self):
+        self.report_instances()
+        self.df[:] = self.df[:].astype("object")
+        for file_name in self.file_names:
+            span_dict = self.data_dict[file_name]["data"]
+            for main_cat_key, main_cat_value in span_dict.items():
+                for sub_cat_key in main_cat_value.keys():
+                    # report the beginning and end of each span as a tuple
+                    span_list = [
+                        (span["begin"], span["end"])
+                        for span in span_dict[main_cat_key][sub_cat_key]
+                    ]
+                    self.df.at[
+                        (main_cat_key, sub_cat_key),
+                        file_name,
+                    ] = span_list
+
+    def map_categories(self):
+        self.df = self.df.rename(map_expressions)
+        self._clean_df()
diff --git a/.github/workflows/Part2_3/example_jupyter.ipynb b/.github/workflows/Part2_3/example_jupyter.ipynb
new file mode 100644
index 0000000..36014e6
--- /dev/null
+++ b/.github/workflows/Part2_3/example_jupyter.ipynb
@@ -0,0 +1,100 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# This is a test for formatting jupyter notebooks using black"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "import glob"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# provide some badly formatted code\n",
+    "# find all png files in a folder\n",
+    "def find_files(path=None, pattern=\"*.png\", recursive=True, limit=20) -> list:\n",
+    "    \"\"\"Find image files on the file system\n",
+    "\n",
+    "    :param path:\n",
+    "        The base directory where we are looking for the images. Defaults to None, which uses the XDG data directory if set or the current working directory otherwise.\n",
+    "    :param pattern:\n",
+    "        The naming pattern that the filename should match. Defaults to\n",
+    "        \"*.png\". Can be used to allow other patterns or to only include\n",
+    "        specific prefixes or suffixes.\n",
+    "    :param recursive:\n",
+    "        Whether to recurse into subdirectories.\n",
+    "    :param limit:\n",
+    "        The maximum number of images to be found. Defaults to 20.\n",
+    "        To return all images, set to None.\n",
+    "    \"\"\"\n",
+    "    if path is None:\n",
+    "        path = os.environ.get(\"XDG_DATA_HOME\", \".\")\n",
+    "\n",
+    "    result = list(glob.glob(f\"{path}/{pattern}\", recursive=recursive))\n",
+    "\n",
+    "    if limit is not None:\n",
+    "        result = result[:limit]\n",
+    "\n",
+    "    return result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mylist = find_files(path=\"../data/\")\n",
+    "\n",
+    "print(\"Found files {}\".format(mylist))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "6ed5ef3eb539eaf448e543cae5130aec76966f2d42b674aa1c9bdf28f2d85483"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/chapter1/example1.py b/chapter2_3/example1.py
similarity index 100%
rename from chapter1/example1.py
rename to chapter2_3/example1.py
diff --git a/chapter1/example2.py b/chapter2_3/example2.py
similarity index 100%
rename from chapter1/example2.py
rename to chapter2_3/example2.py
diff --git a/chapter1/example3.py b/chapter2_3/example3.py
similarity index 100%
rename from chapter1/example3.py
rename to chapter2_3/example3.py
diff --git a/chapter2_3/example4.py b/chapter2_3/example4.py
new file mode 100644
index 0000000..343c193
--- /dev/null
+++ b/chapter2_3/example4.py
@@ -0,0 +1,188 @@
+
+from collections import defaultdict
+import numpy as np
+import pandas as pd
+
+map_expressions = {
+    "KAT1MoralisierendesSegment": "KAT1-Moralisierendes Segment",
+    "Moralwerte": "KAT2-Moralwerte",
+    "KAT2Subjektive_Ausdrcke": "KAT2-Subjektive Ausdrücke",
+    "Protagonistinnen2": "KAT3-Gruppe",
+    "Protagonistinnen": "KAT3-Rolle",
+    "Protagonistinnen3": "KAT3-own/other",
+    "KommunikativeFunktion": "KAT4-Kommunikative Funktion",
+    "Forderung": "KAT5-Forderung explizit",
+    "KAT5Ausformulierung": "KAT5-Forderung implizit",
+    "Kommentar": "KOMMENTAR",
+}
+
+def validate_data_dict(data_dict):
+    if not data_dict:
+        raise ValueError("data_dict is empty")
+    for data_file_name, data_file in data_dict.items():
+        validation_list = ["data", "file_type", "sofa", "paragraph"]
+        missing_cats = []
+        for category in validation_list:
+            if category not in list(data_file.keys()):
+                missing_cats.append(category)
+
+        if missing_cats:
+            raise ValueError(f"Data dict is missing categories: {missing_cats}")
+
+
+class AnalyseOccurrence:
+    """Contains statistical information methods about the data."""
+
+    def __init__(
+        self,
+        data_dict: dict,
+        mode: str = "instances",
+        file_names: str = None,
+    ) -> None:
+
+        validate_data_dict(data_dict)
+
+        self.mode = mode
+        self.data_dict = data_dict
+        self.mode_dict = {
+            "instances": self.report_instances,
+            "spans": self.report_spans,
+            "span_index": self.report_index,
+        }
+        self.file_names = self._initialize_files(file_names)
+        self.instance_dict = self._initialize_dict()
+        # call the analysis method
+        self.mode_dict[self.mode]()
+        # map the df columns to the expressions given
+        self.map_categories()
+
+    def _initialize_files(self, file_names: str) -> list:
+        """Helper method to get file names in list."""
+        # get the file names from the global dict of dicts
+        if file_names is None:
+            file_names = list(self.data_dict.keys())
+        # or use the file names that were passed explicitly
+        elif isinstance(file_names, str):
+            file_names = [file_names]
+        return file_names
+
+    def _initialize_dict(self) -> defaultdict:
+        """Helper method to initialize dict."""
+        return defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
+
+    def _initialize_df(self):
+        """Helper method to initialize data frame."""
+        self.df = pd.DataFrame(self.instance_dict)
+        self.df.index = self.df.index.set_names((["Main Category", "Sub Category"]))
+
+    def _get_categories(self, span_dict, file_name):
+        """Helper method to initialize a dict with the given main and sub categories."""
+        for main_cat_key, main_cat_value in span_dict.items():
+            for sub_cat_key, sub_cat_value in main_cat_value.items():
+                # the tuple index makes it easy to convert the dict into a pandas dataframe
+                self.instance_dict[file_name][(main_cat_key, sub_cat_key)] = len(
+                    sub_cat_value
+                )
+        return self.instance_dict
+
+    def _add_total(self):
+        """Helper method to set additional headers in data frame."""
+        self.df.loc[("total instances", "with invalid"), :] = self.df.sum(axis=0).values
+        self.df.loc[("total instances", "without invalid"), :] = (
+            self.df.loc[("total instances", "with invalid"), :].values
+            - self.df.loc["KAT1MoralisierendesSegment", "Keine Moralisierung"].values
+        )
+
+    def _clean_df(self):
+        """Helper method to sort data frame and clean up values."""
+        self.df = self.df.sort_values(
+            by=[
+                "Main Category",
+                "Sub Category",
+                # self.file_names[0],
+            ],
+            ascending=True,
+        )
+        # fill NaN with 0 for instances or None for spans
+        if self.mode == "instances":
+            self.df = self.df.fillna(0)
+        if self.mode == "spans":
+            self.df = self.df.replace({np.nan: None})
+        # remove quotes - not sure if this is necessary
+        # self.df = self.df.applymap(lambda x: x.replace('"','') if isinstance(x, str) else x)
+
+    def report_instances(self):
+        """Reports number of occurrences of a category per text source."""
+        # instances reports the number of occurrences
+        # filename: main_cat: sub_cat: instances
+        for file_name in self.file_names:
+            span_dict = self.data_dict[file_name]["data"]
+            # initialize total instances rows for easier setting later.
+            # only for mode instances
+            self.instance_dict[file_name][("total instances", "with invalid")] = 0
+            self.instance_dict[file_name][("total instances", "without invalid")] = 0
+            self.instance_dict = self._get_categories(span_dict, file_name)
+        # initialize data frame
+        self._initialize_df()
+        # add rows for total instances
+        # only do this for mode instances
+        self._add_total()
+
+    def report_spans(self):
+        """Reports spans of a category per text source."""
+        # span reports the spans of the annotations separated by separator-token
+        self.instance_dict = self._get_categories(
+            self.data_dict[self.file_names[0]]["data"], self.file_names[0]
+        )
+        self._initialize_df()
+        self.df[:] = self.df[:].astype("object")
+        for file_name in self.file_names:
+            span_dict = self.data_dict[file_name]["data"]
+            span_text = self.data_dict[file_name]["sofa"]
+            for main_cat_key, main_cat_value in span_dict.items():
+                for sub_cat_key in main_cat_value.keys():
+                    # save the span begin and end character index for further analysis
+                    # span_dict[main_cat_key][sub_cat_key] =
+                    # find the text for each span
+                    span_annotated_text = [
+                        span_text[span["begin"] : span["end"]]
+                        for span in span_dict[main_cat_key][sub_cat_key]
+                    ]
+                    # clean the spans from #
+                    span_annotated_text = [
+                        span.replace("#", "") for span in span_annotated_text
+                    ]
+                    # clean the spans from "
+                    # span_annotated_text = [
+                    #     span.replace('"', "") for span in span_annotated_text
+                    # ]
+                    # convert list to &-separated spans
+                    span_annotated_text = " & ".join(span_annotated_text)
+                    self.df.at[
+                        (main_cat_key, sub_cat_key),
+                        file_name,
+                    ] = span_annotated_text
+
+    def report_index(self):
+        self.report_instances()
+        self.df[:] = self.df[:].astype("object")
+        for file_name in self.file_names:
+            span_dict = self.data_dict[file_name]["data"]
+            for main_cat_key, main_cat_value in span_dict.items():
+                for sub_cat_key in main_cat_value.keys():
+                    # report the beginning and end of each span as a tuple
+                    span_list = [
+                        (span["begin"], span["end"])
+                        for span in span_dict[main_cat_key][sub_cat_key]
+                    ]
+                    self.df.at[
+                        (main_cat_key, sub_cat_key),
+                        file_name,
+                    ] = span_list
+
+    def map_categories(self):
+        self.df = self.df.rename(map_expressions)
+        self._clean_df()
+
+
+
diff --git a/chapter2_3/example_jupyter.ipynb b/chapter2_3/example_jupyter.ipynb
new file mode 100644
index 0000000..0b99fd6
--- /dev/null
+++ b/chapter2_3/example_jupyter.ipynb
@@ -0,0 +1,103 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# This is a test for formatting jupyter notebooks using black"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "import glob"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#provide some badly formatted code\n",
+    "# find all png files in a folder\n",
+    "def find_files(path=None , pattern=\"*.png\" , recursive=True , limit = 20) -> list:\n",
+    "    \"\"\"Find image files on the file system\n",
+    "\n",
+    "    :param path:\n",
+    "        The base directory where we are looking for the images. Defaults to None, which uses the XDG data directory if set or the current working directory otherwise.\n",
+    "    :param pattern:\n",
+    "        The naming pattern that the filename should match. Defaults to\n",
+    "        \"*.png\". Can be used to allow other patterns or to only include\n",
+    "        specific prefixes or suffixes.\n",
+    "    :param recursive:\n",
+    "        Whether to recurse into subdirectories.\n",
+    "    :param limit:\n",
+    "        The maximum number of images to be found. Defaults to 20.\n",
+    "        To return all images, set to None.\n",
+    "    \"\"\"\n",
+    "    if path is None:\n",
+    "        path = os.environ.get(\"XDG_DATA_HOME\", \".\")\n",
+    "\n",
+    "    result=list(glob.glob(f\"{path}/{pattern}\", recursive=recursive))\n",
+    "\n",
+    "    if limit is not None:\n",
+    "        result = result[: limit]\n",
+    "\n",
+    "    return result\n",
+    "\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "mylist = find_files(path=\"../data/\")\n",
+    "\n",
+    "print(\"Found files {}\".format(mylist))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "6ed5ef3eb539eaf448e543cae5130aec76966f2d42b674aa1c9bdf28f2d85483"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tests/test_fixmes.py b/tests/test_linter_formatter.py
similarity index 89%
rename from tests/test_fixmes.py
rename to tests/test_linter_formatter.py
index 39ac7f8..b321b9c 100644
--- a/tests/test_fixmes.py
+++ b/tests/test_linter_formatter.py
@@ -7,7 +7,7 @@ def test_flake8():
     # Get the repository directory
     current_dir = Path(__file__).resolve().parents[1]
-    input_file_path = current_dir / "chapter1"
+    input_file_path = current_dir / "chapter2_3"
     # run flake8 on the example files
     command = "flake8 {}".format(input_file_path)
     failure = 0
@@ -27,7 +27,7 @@ def test_german_name():
     # Kreis in example 2
     current_dir = Path(__file__).resolve().parents[1]
-    input_file = current_dir / "chapter1" / "example2.py"
+    input_file = current_dir / "chapter2_3" / "example2.py"
    # figure out if the word "Kreis" is in the file
     with open(input_file, "r") as f:
         file_content = f.read()
@@ -37,7 +37,7 @@ def test_intrinsic_function():
     # check variable name in example3
     current_dir = Path(__file__).resolve().parents[1]
-    input_file = current_dir / "chapter1" / "example3.py"
+    input_file = current_dir / "chapter2_3" / "example3.py"
     # make sure the intrinsic "list" function is not used as a variable name
     with open(input_file, "r") as f:
         file_content = f.read()
@@ -52,3 +52,6 @@
             print(file_content[i:i+5])
             failure = 1
     assert failure == 0
+
+def test_jupyter_notebook():
+    pass
\ No newline at end of file
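
Note: test_jupyter_notebook is left as a bare stub in this patch ("update
tests part 1"). A minimal sketch of what the check could look like, following
the repository-relative path convention of the other tests in this file; it
assumes black is installed with its Jupyter extra (black[jupyter]) so that it
can parse .ipynb files, and the subprocess invocation is an illustrative
choice rather than part of this patch:

import subprocess
from pathlib import Path


def test_jupyter_notebook():
    # run black in --check mode on the example notebook; black exits
    # with a non-zero code if the file would need to be reformatted
    current_dir = Path(__file__).resolve().parents[1]
    input_file = current_dir / "chapter2_3" / "example_jupyter.ipynb"
    result = subprocess.run(
        ["black", "--check", str(input_file)],
        capture_output=True,
        text=True,
    )
    # in line with the other tests, the exercise file is expected to be
    # correctly formatted once the formatter has been applied
    assert result.returncode == 0

Using --check keeps the test read-only: black only reports whether it would
reformat the notebook and does not modify the file on disk.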