generated from ssciwr-courses/pbp-pep-fixmes
Showing 11 changed files with 654 additions and 3 deletions.
@@ -0,0 +1,34 @@
import os
import glob


# find all png files in a folder
def find_files(path=None, pattern="*.png", recursive=True, limit=20) -> list:
    """Find image files on the file system.

    :param path:
        The base directory where we are looking for the images. Defaults to
        None, which uses the XDG data directory if set or the current working
        directory otherwise.
    :param pattern:
        The naming pattern that the filename should match. Defaults to
        "*.png". Can be used to allow other patterns or to only include
        specific prefixes or suffixes.
    :param recursive:
        Whether to recurse into subdirectories.
    :param limit:
        The maximum number of images to be found. Defaults to 20.
        To return all images, set to None.
    """
    if path is None:
        path = os.environ.get("XDG_DATA_HOME", ".")

    # note: glob only descends into subdirectories if the pattern contains "**"
    result = list(glob.glob(f"{path}/{pattern}", recursive=recursive))

    if limit is not None:
        result = result[:limit]

    return result


if __name__ == "__main__":
    # avoid shadowing the built-in name "list" with the result
    found_files = find_files(path="./data/")
    print("Found files {}".format(found_files))
@@ -0,0 +1,21 @@
import numpy as np


def area_circ(r_in):
    """Calculate the area of a circle with given radius.

    :param r_in: The radius of the circle (float, >= 0).
    :returns: The area of the circle (float).
    """
    if r_in < 0:
        raise ValueError("The radius must be >= 0.")
    # "Kreis" is German for "circle"; the variable holds the computed area
    Kreis = np.pi * r_in**2
    print(
        "The area of a circle with radius r = {:3.2f}cm is A = {:4.2f}cm2.".format(
            r_in, Kreis
        )
    )
    return Kreis


if __name__ == "__main__":
    _ = area_circ(5.0)
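A quick, illustrative check of the error handling (a sketch assuming area_circ is imported from the module above and pytest is available; the test names are invented):

import pytest


def test_area_circ_rejects_negative_radius():
    with pytest.raises(ValueError):
        area_circ(-1.0)


def test_area_circ_value():
    # pi * 1.0**2
    assert area_circ(1.0) == pytest.approx(3.14159265, rel=1e-6)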
@@ -0,0 +1,25 @@
def validate_data_dict(data_dict):
    if not data_dict:
        raise ValueError("data_dict is empty")
    for data_file_name, data_file in data_dict.items():
        if not data_file:
            raise ValueError(f"The dict content under {data_file_name} is empty.")
        if not isinstance(data_file, dict):
            raise ValueError(
                f"The content of {data_file_name} is not a dict but {type(data_file)}."
            )

        # do not call the variable "list": that shadows the built-in and breaks
        # the membership check below
        required_categories = ["data", "file_type", "sofa", "paragraph"]
        missing_cats = []
        for category in required_categories:
            if category not in data_file:
                missing_cats.append(category)

        if missing_cats:
            raise ValueError(f"Data dict is missing categories: {missing_cats}")


if __name__ == "__main__":
    data_dict = {"test": {"testing": "just testing"}}
    # note: this example raises ValueError because the required categories are missing
    validate_data_dict(data_dict)
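A small sketch of how the validator behaves, assuming it is imported from the module above; the file names and placeholder values are invented and only mirror the required keys (data, file_type, sofa, paragraph):

valid = {
    "report.xmi": {
        "data": {},  # annotation spans would go here
        "file_type": "xmi",
        "sofa": "",  # the document text
        "paragraph": [],
    }
}
validate_data_dict(valid)  # passes silently

try:
    validate_data_dict({"broken.xmi": {"data": {}}})
except ValueError as error:
    print(error)  # lists the missing categories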
@@ -0,0 +1,181 @@
from collections import defaultdict

import numpy as np
import pandas as pd

map_expressions = {
    "KAT1MoralisierendesSegment": "KAT1-Moralisierendes Segment",
    "Moralwerte": "KAT2-Moralwerte",
    "KAT2Subjektive_Ausdrcke": "KAT2-Subjektive Ausdrücke",
    "Protagonistinnen2": "KAT3-Gruppe",
    "Protagonistinnen": "KAT3-Rolle",
    "Protagonistinnen3": "KAT3-own/other",
    "KommunikativeFunktion": "KAT4-Kommunikative Funktion",
    "Forderung": "KAT5-Forderung explizit",
    "KAT5Ausformulierung": "KAT5-Forderung implizit",
    "Kommentar": "KOMMENTAR",
}


def validate_data_dict(data_dict):
    if not data_dict:
        raise ValueError("data_dict is empty")
    for data_file_name, data_file in data_dict.items():
        validation_list = ["data", "file_type", "sofa", "paragraph"]
        missing_cats = []
        for category in validation_list:
            if category not in list(data_file.keys()):
                missing_cats.append(category)

        if missing_cats:
            raise ValueError(f"Data dict is missing categories: {missing_cats}")


class AnalyseOccurrence:
    """Contains statistical information methods about the data."""

    def __init__(
        self,
        data_dict: dict,
        mode: str = "instances",
        file_names: str = None,
    ) -> None:
        validate_data_dict(data_dict)

        self.mode = mode
        self.data_dict = data_dict
        self.mode_dict = {
            "instances": self.report_instances,
            "spans": self.report_spans,
            "span_index": self.report_index,
        }
        self.file_names = self._initialize_files(file_names)
        self.instance_dict = self._initialize_dict()
        # call the analysis method
        self.mode_dict[self.mode]()
        # map the df columns to the expressions given
        self.map_categories()

    def _initialize_files(self, file_names: str) -> list:
        """Helper method to get file names in list."""
        # get the file names from the global dict of dicts
        if file_names is None:
            file_names = list(self.data_dict.keys())
        # or use the file names that were passed explicitly
        elif isinstance(file_names, str):
            file_names = [file_names]
        return file_names

    def _initialize_dict(self) -> defaultdict:
        """Helper method to initialize dict."""
        return defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    def _initialize_df(self):
        """Helper method to initialize data frame."""
        self.df = pd.DataFrame(self.instance_dict)
        self.df.index = self.df.index.set_names(["Main Category", "Sub Category"])

    def _get_categories(self, span_dict, file_name):
        """Helper method to initialize a dict with the given main and sub categories."""
        for main_cat_key, main_cat_value in span_dict.items():
            for sub_cat_key, sub_cat_value in main_cat_value.items():
                # the tuple index makes it easy to convert the dict into a pandas dataframe
                self.instance_dict[file_name][(main_cat_key, sub_cat_key)] = len(
                    sub_cat_value
                )
        return self.instance_dict

    def _add_total(self):
        """Helper method to set additional headers in data frame."""
        self.df.loc[("total instances", "with invalid"), :] = self.df.sum(axis=0).values
        self.df.loc[("total instances", "without invalid"), :] = (
            self.df.loc[("total instances", "with invalid"), :].values
            - self.df.loc["KAT1MoralisierendesSegment", "Keine Moralisierung"].values
        )

    def _clean_df(self):
        """Helper method to sort data frame and clean up values."""
        self.df = self.df.sort_values(
            by=[
                "Main Category",
                "Sub Category",
                # self.file_names[0],
            ],
            ascending=True,
        )
        # fill NaN with 0 for instances or None for spans
        if self.mode == "instances":
            self.df = self.df.fillna(0)
        if self.mode == "spans":
            self.df = self.df.replace({np.nan: None})
        # remove quotes - not sure if this is necessary
        # self.df = self.df.applymap(lambda x: x.replace('"','') if isinstance(x, str) else x)

    def report_instances(self):
        """Reports number of occurrences of a category per text source."""
        # instances reports the number of occurrences
        # filename: main_cat: sub_cat: instances
        for file_name in self.file_names:
            span_dict = self.data_dict[file_name]["data"]
            # initialize total instances rows for easier setting later.
            # only for mode instances
            self.instance_dict[file_name][("total instances", "with invalid")] = 0
            self.instance_dict[file_name][("total instances", "without invalid")] = 0
            self.instance_dict = self._get_categories(span_dict, file_name)
        # initialize data frame
        self._initialize_df()
        # add rows for total instances
        # only do this for mode instances
        self._add_total()

    def report_spans(self):
        """Reports spans of a category per text source."""
        # span reports the spans of the annotations separated by separator-token
        self.instance_dict = self._get_categories(
            self.data_dict[self.file_names[0]]["data"], self.file_names[0]
        )
        self._initialize_df()
        self.df[:] = self.df[:].astype("object")
        for file_name in self.file_names:
            span_dict = self.data_dict[file_name]["data"]
            span_text = self.data_dict[file_name]["sofa"]
            for main_cat_key, main_cat_value in span_dict.items():
                for sub_cat_key in main_cat_value.keys():
                    # save the span begin and end character index for further analysis
                    # span_dict[main_cat_key][sub_cat_key] =
                    # find the text for each span
                    span_annotated_text = [
                        span_text[span["begin"] : span["end"]]
                        for span in span_dict[main_cat_key][sub_cat_key]
                    ]
                    # clean the spans from #
                    span_annotated_text = [
                        span.replace("#", "") for span in span_annotated_text
                    ]
                    # clean the spans from "
                    # span_annotated_text = [
                    #     span.replace('"', "") for span in span_annotated_text
                    # ]
                    # convert list to &-separated spans
                    span_annotated_text = " & ".join(span_annotated_text)
                    self.df.at[
                        (main_cat_key, sub_cat_key),
                        file_name,
                    ] = span_annotated_text

    def report_index(self):
        """Reports the begin/end character index of each span per text source."""
        self.report_instances()
        self.df[:] = self.df[:].astype("object")
        for file_name in self.file_names:
            span_dict = self.data_dict[file_name]["data"]
            for main_cat_key, main_cat_value in span_dict.items():
                for sub_cat_key in main_cat_value.keys():
                    # report the beginning and end of each span as a tuple
                    span_list = [
                        (span["begin"], span["end"])
                        for span in span_dict[main_cat_key][sub_cat_key]
                    ]
                    self.df.at[
                        (main_cat_key, sub_cat_key),
                        file_name,
                    ] = span_list

    def map_categories(self):
        """Maps the technical category names to human-readable expressions."""
        self.df = self.df.rename(map_expressions)
        self._clean_df()
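A minimal, illustrative driver for AnalyseOccurrence (a sketch assuming the module above is importable; the file name, text, spans and sub-categories below are invented and only mirror the structure the class expects, i.e. data/file_type/sofa/paragraph per file and begin/end offsets per span):

example_dict = {
    "doc1.xmi": {
        "file_type": "xmi",
        "paragraph": [],
        "sofa": "Das ist ein Beispieltext.",
        "data": {
            "KAT1MoralisierendesSegment": {
                "Keine Moralisierung": [{"begin": 0, "end": 3}],
                "Moralisierung": [{"begin": 4, "end": 7}, {"begin": 8, "end": 11}],
            },
        },
    }
}

# count annotations per (main category, sub category) and file
occurrences = AnalyseOccurrence(example_dict, mode="instances")
print(occurrences.df)

# show the annotated text snippets, joined by " & "
spans = AnalyseOccurrence(example_dict, mode="spans")
print(spans.df)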
@@ -0,0 +1,100 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# This is a test for formatting jupyter notebooks using black"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import glob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# provide some badly formatted code\n",
    "# find all png files in a folder\n",
    "def find_files(path=None, pattern=\"*.png\", recursive=True, limit=20) -> list:\n",
    "    \"\"\"Find image files on the file system\n",
    "\n",
    "    :param path:\n",
    "        The base directory where we are looking for the images. Defaults to None, which uses the XDG data directory if set or the current working directory otherwise.\n",
    "    :param pattern:\n",
    "        The naming pattern that the filename should match. Defaults to\n",
    "        \"*.png\". Can be used to allow other patterns or to only include\n",
    "        specific prefixes or suffixes.\n",
    "    :param recursive:\n",
    "        Whether to recurse into subdirectories.\n",
    "    :param limit:\n",
    "        The maximum number of images to be found. Defaults to 20.\n",
    "        To return all images, set to None.\n",
    "    \"\"\"\n",
    "    if path is None:\n",
    "        path = os.environ.get(\"XDG_DATA_HOME\", \".\")\n",
    "\n",
    "    result = list(glob.glob(f\"{path}/{pattern}\", recursive=recursive))\n",
    "\n",
    "    if limit is not None:\n",
    "        result = result[:limit]\n",
    "\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mylist = find_files(path=\"../data/\")\n",
    "\n",
    "print(\"Found files {}\".format(mylist))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  },
  "vscode": {
   "interpreter": {
    "hash": "6ed5ef3eb539eaf448e543cae5130aec76966f2d42b674aa1c9bdf28f2d85483"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
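Side note on the notebook above: recent black releases can format notebooks directly when installed with the jupyter extra (pip install "black[jupyter]", then black notebook.ipynb). A short sketch of black's programmatic API, assuming black is installed (the badly formatted sample string is made up):

import black

ugly = "def find_files( path=None,pattern='*.png' ):\n    return [ ]\n"
# format_str applies the same rules the CLI uses to code cells
print(black.format_str(ugly, mode=black.Mode()))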
File renamed without changes.
File renamed without changes.
File renamed without changes.