Skip to content

Commit

Permalink
update tests part 1
Browse files Browse the repository at this point in the history
  • Loading branch information
iulusoy committed May 28, 2024
1 parent 675f0b1 commit 1a9d767
Show file tree
Hide file tree
Showing 11 changed files with 654 additions and 3 deletions.
34 changes: 34 additions & 0 deletions .github/workflows/Part2_3/example1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import os
import glob


# find all png files in a folder
# find all png files in a folder
def find_files(path=None, pattern="*.png", recursive=True, limit=20) -> list:
    """Find image files on the file system.

    :param path:
        The base directory where we are looking for the images. Defaults
        to None, which uses the XDG data directory if set or the current
        working directory otherwise.
    :param pattern:
        The naming pattern that the filename should match. Defaults to
        "*.png". Can be used to allow other patterns or to only include
        specific prefixes or suffixes.
    :param recursive:
        Whether to recurse into subdirectories.
    :param limit:
        The maximum number of images to be found. Defaults to 20.
        To return all images, set to None.
    :returns: A list of matching file paths.
    """
    if path is None:
        path = os.environ.get("XDG_DATA_HOME", ".")

    # glob only descends into subdirectories when the pattern contains
    # "**" AND recursive=True; the previous "{path}/{pattern}" form never
    # recursed regardless of the flag, contradicting the docstring.
    if recursive:
        search = os.path.join(path, "**", pattern)
    else:
        search = os.path.join(path, pattern)
    # glob.glob already returns a list; no extra list(...) wrap needed
    result = glob.glob(search, recursive=recursive)

    if limit is not None:
        result = result[:limit]

    return result


if __name__ == "__main__":
    # The original bound the result to the name "list", shadowing the
    # builtin; use a descriptive name instead.
    found_files = find_files(path="./data/")
    print("Found files {}".format(found_files))
21 changes: 21 additions & 0 deletions .github/workflows/Part2_3/example2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import numpy as np


def area_circ(r_in):
    """Calculate the area of a circle with given radius.

    :Input: The radius of the circle (float, >=0).
    :Returns: The area of the circle (float).
    :Raises: ValueError if the radius is negative.
    """
    if r_in < 0:
        raise ValueError("The radius must be >= 0.")
    # renamed from the German "Kreis" ("circle") for English naming consistency
    area = np.pi * r_in**2
    print(
        """The area of a circle with radius r = {:3.2f}cm is A = {:4.2f}cm2.""".format(
            r_in, area
        )
    )
    return area


if __name__ == "__main__":
    # demo run: compute (and print) the area for r = 5 cm
    area_circ(5.0)
25 changes: 25 additions & 0 deletions .github/workflows/Part2_3/example3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
def validate_data_dict(data_dict):
    """Validate the structure of a dict of per-file data dicts.

    :param data_dict: Mapping of file name -> per-file dict. Each per-file
        dict must be non-empty and contain the categories "data",
        "file_type", "sofa" and "paragraph".
    :raises ValueError: If the dict is empty, an entry is empty or not a
        dict, or an entry is missing required categories.
    """
    if not data_dict:
        raise ValueError("data_dict is empty")
    # Do not shadow the builtin "list": the original assigned a list to the
    # name "list" and then called list(...) on the next line, which raised
    # "TypeError: 'list' object is not callable".
    required_categories = ["data", "file_type", "sofa", "paragraph"]
    for something, otherthing in data_dict.items():
        if not otherthing:
            raise ValueError(f"The dict content under {something} is empty.")
        if not isinstance(otherthing, dict):
            raise ValueError(
                f"The content of {something} is not a dict but {type(otherthing)}."
            )
        # Check the categories for every entry; the original checked only
        # after the loop, so just the last entry was validated.
        missing_cats = [c for c in required_categories if c not in otherthing]
        if missing_cats:
            raise ValueError(f"Data dict is missing categories: {missing_cats}")


if __name__ == "__main__":
    # NOTE(review): the first assignment is dead code — it is immediately
    # overwritten by the literal on the next line.
    data_dict = {}
    data_dict = {"test": {"testing": "just testing"}}
    # NOTE(review): this dict lacks the required categories ("data",
    # "file_type", "sofa", "paragraph"), so validation is expected to raise
    # here — confirm this demo is meant to fail.
    validate_data_dict(data_dict)
181 changes: 181 additions & 0 deletions .github/workflows/Part2_3/example4.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
# Mapping from raw annotation layer names to the human-readable category
# labels used as row names in the result data frame (applied via
# AnalyseOccurrence.map_categories).
map_expressions = {
    "KAT1MoralisierendesSegment": "KAT1-Moralisierendes Segment",
    "Moralwerte": "KAT2-Moralwerte",
    "KAT2Subjektive_Ausdrcke": "KAT2-Subjektive Ausdrücke",
    "Protagonistinnen2": "KAT3-Gruppe",
    "Protagonistinnen": "KAT3-Rolle",
    "Protagonistinnen3": "KAT3-own/other",
    "KommunikativeFunktion": "KAT4-Kommunikative Funktion",
    "Forderung": "KAT5-Forderung explizit",
    "KAT5Ausformulierung": "KAT5-Forderung implizit",
    "Kommentar": "KOMMENTAR",
}


def validate_data_dict(data_dict):
    """Check that every entry of a non-empty data dict has the required keys.

    Raises a ValueError if *data_dict* is empty, or if any per-file entry
    lacks one of the categories "data", "file_type", "sofa", "paragraph".
    """
    if not data_dict:
        raise ValueError("data_dict is empty")
    required = ["data", "file_type", "sofa", "paragraph"]
    for file_name, file_content in data_dict.items():
        missing_cats = [cat for cat in required if cat not in file_content.keys()]
        if missing_cats:
            raise ValueError(f"Data dict is missing categories: {missing_cats}")


class AnalyseOccurrence:
    """Contains statistical information methods about the data.

    Builds a pandas data frame (``self.df``) indexed by the pair
    ("Main Category", "Sub Category") with one column per file. The cell
    content depends on ``mode``:

    * ``"instances"`` — the number of annotation spans per category,
    * ``"spans"`` — the annotated text snippets joined by " & ",
    * ``"span_index"`` — a list of (begin, end) character-index tuples.

    NOTE(review): this class uses ``defaultdict``, ``pd`` (pandas) and
    ``np`` (numpy), but no imports are visible in this file — confirm the
    module-level imports exist.
    """

    def __init__(
        self,
        data_dict: dict,
        mode: str = "instances",
        file_names: str = None,
    ) -> None:
        """Validate the input and immediately run the selected analysis.

        :param data_dict: Mapping of file name -> per-file dict with at
            least the keys "data", "file_type", "sofa", "paragraph".
        :param mode: One of "instances", "spans" or "span_index".
        :param file_names: A single file name (str) to analyse, or None to
            analyse every file in ``data_dict``.
        """

        validate_data_dict(data_dict)

        self.mode = mode
        self.data_dict = data_dict
        # dispatch table mapping each mode string to its analysis method
        self.mode_dict = {
            "instances": self.report_instances,
            "spans": self.report_spans,
            "span_index": self.report_index,
        }
        self.file_names = self._initialize_files(file_names)
        self.instance_dict = self._initialize_dict()
        # call the analysis method
        self.mode_dict[self.mode]()
        # map the df columns to the expressions given
        self.map_categories()

    def _initialize_files(self, file_names: str) -> list:
        """Helper method to get file names in list."""
        # get the file names from the global dict of dicts
        if file_names is None:
            file_names = list(self.data_dict.keys())
        # or use the file names that were passed explicitly
        elif isinstance(file_names, str):
            file_names = [file_names]
        return file_names

    def _initialize_dict(self) -> defaultdict:
        """Helper method to initialize dict."""
        # nested defaultdicts so that missing keys default to 0 counts
        return defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    def _initialize_df(self):
        """Helper method to initialize data frame."""
        self.df = pd.DataFrame(self.instance_dict)
        # the (main_cat, sub_cat) tuple keys become a two-level MultiIndex
        self.df.index = self.df.index.set_names((["Main Category", "Sub Category"]))

    def _get_categories(self, span_dict, file_name):
        """Helper method to initialize a dict with the given main and sub categories.

        Stores the span count per (main category, sub category) pair for
        the given file in ``self.instance_dict`` and returns it.
        """
        for main_cat_key, main_cat_value in span_dict.items():
            for sub_cat_key, sub_cat_value in main_cat_value.items():
                # the tuple index makes it easy to convert the dict into a pandas dataframe
                self.instance_dict[file_name][(main_cat_key, sub_cat_key)] = len(
                    sub_cat_value
                )
        return self.instance_dict

    def _add_total(self):
        """Helper method to set additional headers in data frame."""
        # column-wise sum over all categories = total number of instances
        self.df.loc[("total instances", "with invalid"), :] = self.df.sum(axis=0).values
        # subtract the "Keine Moralisierung" (no moralization) row to get
        # the total without invalid instances
        self.df.loc[("total instances", "without invalid"), :] = (
            self.df.loc[("total instances", "with invalid"), :].values
            - self.df.loc["KAT1MoralisierendesSegment", "Keine Moralisierung"].values
        )

    def _clean_df(self):
        """Helper method to sort data frame and clean up values."""
        self.df = self.df.sort_values(
            by=[
                "Main Category",
                "Sub Category",
                # self.file_names[0],
            ],
            ascending=True,
        )
        # fill NaN with 0 for instances or None for spans
        if self.mode == "instances":
            self.df = self.df.fillna(0)
        if self.mode == "spans":
            self.df = self.df.replace({np.nan: None})
        # remove quotes - not sure if this is necessary
        # self.df = self.df.applymap(lambda x: x.replace('"','') if isinstance(x, str) else x)

    def report_instances(self):
        """Reports number of occurrences of a category per text source."""
        # instances reports the number of occurrences
        # filename: main_cat: sub_cat: instances
        for file_name in self.file_names:
            span_dict = self.data_dict[file_name]["data"]
            # initialize total instances rows for easier setting later.
            # only for mode instances
            self.instance_dict[file_name][("total instances", "with invalid")] = 0
            self.instance_dict[file_name][("total instances", "without invalid")] = 0
            self.instance_dict = self._get_categories(span_dict, file_name)
        # initialize data frame
        self._initialize_df()
        # add rows for total instances
        # only do this for mode instances
        self._add_total()

    def report_spans(self):
        """Reports spans of a category per text source."""
        # span reports the spans of the annotations separated by separator-token
        # seed the frame's index from the first file's categories
        self.instance_dict = self._get_categories(
            self.data_dict[self.file_names[0]]["data"], self.file_names[0]
        )
        self._initialize_df()
        # cast to object so cells can hold strings instead of counts
        self.df[:] = self.df[:].astype("object")
        for file_name in self.file_names:
            span_dict = self.data_dict[file_name]["data"]
            # "sofa" holds the source text the span indices refer to
            span_text = self.data_dict[file_name]["sofa"]
            for main_cat_key, main_cat_value in span_dict.items():
                for sub_cat_key in main_cat_value.keys():
                    # save the span begin and end character index for further analysis
                    # span_dict[main_cat_key][sub_cat_key] =
                    # find the text for each span
                    span_annotated_text = [
                        span_text[span["begin"] : span["end"]]
                        for span in span_dict[main_cat_key][sub_cat_key]
                    ]
                    # clean the spans from #
                    span_annotated_text = [
                        span.replace("#", "") for span in span_annotated_text
                    ]
                    # clean the spans from "
                    # span_annotated_text = [
                    #     span.replace('"', "") for span in span_annotated_text
                    # ]
                    # convert list to &-separated spans
                    span_annotated_text = " & ".join(span_annotated_text)
                    self.df.at[
                        (main_cat_key, sub_cat_key),
                        file_name,
                    ] = span_annotated_text

    def report_index(self):
        """Reports the character indices of each span per text source."""
        # reuse the instances frame, then overwrite cells with index lists
        self.report_instances()
        self.df[:] = self.df[:].astype("object")
        for file_name in self.file_names:
            span_dict = self.data_dict[file_name]["data"]
            for main_cat_key, main_cat_value in span_dict.items():
                for sub_cat_key in main_cat_value.keys():
                    # report the beginning and end of each span as a tuple
                    span_list = [
                        (span["begin"], span["end"])
                        for span in span_dict[main_cat_key][sub_cat_key]
                    ]
                    self.df.at[
                        (main_cat_key, sub_cat_key),
                        file_name,
                    ] = span_list

    def map_categories(self):
        """Rename the raw category rows to readable labels and clean the frame."""
        self.df = self.df.rename(map_expressions)
        self._clean_df()
100 changes: 100 additions & 0 deletions .github/workflows/Part2_3/example_jupyter.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# This is a test for formatting jupyter notebooks using black"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import glob"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# provide some badly formatted code\n",
"# find all png files in a folder\n",
"def find_files(path=None, pattern=\"*.png\", recursive=True, limit=20) -> list:\n",
" \"\"\"Find image files on the file system\n",
"\n",
" :param path:\n",
" The base directory where we are looking for the images. Defaults to None, which uses the XDG data directory if set or the current working directory otherwise.\n",
" :param pattern:\n",
" The naming pattern that the filename should match. Defaults to\n",
" \"*.png\". Can be used to allow other patterns or to only include\n",
" specific prefixes or suffixes.\n",
" :param recursive:\n",
" Whether to recurse into subdirectories.\n",
" :param limit:\n",
" The maximum number of images to be found. Defaults to 20.\n",
" To return all images, set to None.\n",
" \"\"\"\n",
" if path is None:\n",
" path = os.environ.get(\"XDG_DATA_HOME\", \".\")\n",
"\n",
" result = list(glob.glob(f\"{path}/{pattern}\", recursive=recursive))\n",
"\n",
" if limit is not None:\n",
" result = result[:limit]\n",
"\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mylist = find_files(path=\"../data/\")\n",
"\n",
"print(\"Found files {}\".format(mylist))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"vscode": {
"interpreter": {
"hash": "6ed5ef3eb539eaf448e543cae5130aec76966f2d42b674aa1c9bdf28f2d85483"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
File renamed without changes.
File renamed without changes.
File renamed without changes.
Loading

0 comments on commit 1a9d767

Please sign in to comment.