diff --git a/src/framework/processing/py/port/api/props.py b/src/framework/processing/py/port/api/props.py index 989a3bb3..42bf8c3b 100644 --- a/src/framework/processing/py/port/api/props.py +++ b/src/framework/processing/py/port/api/props.py @@ -4,39 +4,46 @@ import pandas as pd -class Translations(TypedDict): - """Typed dict containing text that is display in a speficic language - - Attributes: - en: English string to display - nl: Dutch string to display +@dataclass +class Translations: + """ + Typed dict containing text that is displayed in a specific language. """ - en: str nl: str @dataclass class Translatable: - """Wrapper class for Translations""" - + """ + Wrapper class for Translations. + """ translations: Translations def toDict(self): + """ + Convert the object to a dictionary. + + Returns: + dict: A dictionary representation of the object. + """ return self.__dict__.copy() @dataclass class PropsUIHeader: - """Page header - - Attributes: - title: title of the page """ - + Page header. + """ title: Translatable def toDict(self): + """ + Convert the object to a dictionary. + + Returns: + dict: A dictionary representation of the object. + """ dict = {} dict["__type__"] = "PropsUIHeader" dict["title"] = self.title.toDict() @@ -45,13 +52,16 @@ def toDict(self): @dataclass class PropsUIFooter: - """Page footer - - Attributes: - progressPercentage: float indicating the progress in the flow """ - + Page footer. + """ def toDict(self): + """ + Convert the object to a dictionary. + + Returns: + dict: A dictionary representation of the object. + """ dict = {} dict["__type__"] = "PropsUIFooter" return dict @@ -59,22 +69,28 @@ def toDict(self): @dataclass class PropsUIPromptConfirm: - """Retry submitting a file page + """ + Retry submitting a file page. Prompt the user if they want to submit a new file. This can be used in case a file could not be processed. 
Attributes: - text: message to display - ok: message to display if the user wants to try again - cancel: message to display if the user wants to continue regardless + text (Translatable): Message to display. + ok (Translatable): Message to display if the user wants to try again. + cancel (Translatable): Message to display if the user wants to continue regardless. """ - text: Translatable ok: Translatable cancel: Translatable def toDict(self): + """ + Convert the object to a dictionary. + + Returns: + dict: A dictionary representation of the object. + """ dict = {} dict["__type__"] = "PropsUIPromptConfirm" dict["text"] = self.text.toDict() @@ -85,15 +101,18 @@ def toDict(self): @dataclass class PropsUIPromptConsentFormTable: - """Table to be shown to the participant prior to donation + """ + Table to be shown to the participant prior to donation. Attributes: - id: a unique string to itentify the table after donation - title: title of the table - data_frame: table to be shown - visualizations: optional visualizations to be shown. (see TODO for input format) + id (str): A unique string to identify the table after donation. + title (Translatable): Title of the table. + data_frame (pd.DataFrame): Table to be shown. + description (Optional[Translatable]): Optional description of the table. + visualizations (Optional[list]): Optional visualizations to be shown. + folded (Optional[bool]): Whether the table should be initially folded. + delete_option (Optional[bool]): Whether to show a delete option for the table. """ - id: str title: Translatable data_frame: pd.DataFrame @@ -103,6 +122,12 @@ class PropsUIPromptConsentFormTable: delete_option: Optional[bool] = True def toDict(self): + """ + Convert the object to a dictionary. + + Returns: + dict: A dictionary representation of the object. 
+ """ dict = {} dict["__type__"] = "PropsUIPromptConsentFormTable" dict["id"] = self.id @@ -117,13 +142,16 @@ def toDict(self): @dataclass class PropsUIPromptConsentForm: - """Tables to be shown to the participant prior to donation + """ + Tables to be shown to the participant prior to donation. Attributes: - tables: a list of tables - meta_tables: a list of optional tables, for example for logging data + tables (list[PropsUIPromptConsentFormTable]): A list of tables. + meta_tables (list[PropsUIPromptConsentFormTable]): A list of optional tables, for example for logging data. + description (Optional[Translatable]): Optional description of the consent form. + donate_question (Optional[Translatable]): Optional donation question. + donate_button (Optional[Translatable]): Optional text for the donate button. """ - tables: list[PropsUIPromptConsentFormTable] meta_tables: list[PropsUIPromptConsentFormTable] description: Optional[Translatable] = None @@ -131,18 +159,36 @@ class PropsUIPromptConsentForm: donate_button: Optional[Translatable] = None def translate_tables(self): + """ + Translate the tables to a list of dictionaries. + + Returns: + list: A list of dictionaries representing the tables. + """ output = [] for table in self.tables: output.append(table.toDict()) return output def translate_meta_tables(self): + """ + Translate the meta tables to a list of dictionaries. + + Returns: + list: A list of dictionaries representing the meta tables. + """ output = [] for table in self.meta_tables: output.append(table.toDict()) return output def toDict(self): + """ + Convert the object to a dictionary. + + Returns: + dict: A dictionary representation of the object. + """ dict = {} dict["__type__"] = "PropsUIPromptConsentForm" dict["tables"] = self.translate_tables() @@ -155,17 +201,23 @@ def toDict(self): @dataclass class PropsUIPromptFileInput: - """Prompt the user to submit a file + """ + Prompt the user to submit a file. 
Attributes: - description: text with an explanation - extensions: accepted mime types, example: "application/zip, text/plain" + description (Translatable): Text with an explanation. + extensions (str): Accepted mime types, example: "application/zip, text/plain". """ - description: Translatable extensions: str def toDict(self): + """ + Convert the object to a dictionary. + + Returns: + dict: A dictionary representation of the object. + """ dict = {} dict["__type__"] = "PropsUIPromptFileInput" dict["description"] = self.description.toDict() @@ -175,17 +227,23 @@ def toDict(self): @dataclass class PropsUIPromptFileInputMultiple: - """Prompt the user to submit multiple files + """ + Prompt the user to submit multiple files. Attributes: - description: text with an explanation - extensions: accepted mime types, example: "application/zip, text/plain" + description (Translatable): Text with an explanation. + extensions (str): Accepted mime types, example: "application/zip, text/plain". """ - description: Translatable extensions: str def toDict(self): + """ + Convert the object to a dictionary. + + Returns: + dict: A dictionary representation of the object. + """ dict = {} dict["__type__"] = "PropsUIPromptFileInputMultiple" dict["description"] = self.description.toDict() @@ -195,56 +253,68 @@ def toDict(self): @dataclass class PropsUIPromptProgress: - """Prompt the user information during the extraction + """ + Prompt the user information during the extraction. Attributes: - description: text with an explanation - message: can be used to show extraction progress + description (Translatable): Text with an explanation. + message (str): Can be used to show extraction progress. + percentage (Optional[int]): Optional percentage of progress. """ - description: Translatable message: str percentage: Optional[int] = None def toDict(self): + """ + Convert the object to a dictionary. + + Returns: + dict: A dictionary representation of the object. 
+ """ dict = {} dict["__type__"] = "PropsUIPromptProgress" dict["description"] = self.description.toDict() dict["message"] = self.message dict["percentage"] = self.percentage - return dict class RadioItem(TypedDict): - """Radio button + """ + Radio button. Attributes: - id: id of radio button - value: text to be displayed + id (int): ID of radio button. + value (str): Text to be displayed. """ - id: int value: str @dataclass class PropsUIPromptRadioInput: - """Radio group + """ + Radio group. - This radio group can be used get a mutiple choice answer from a user + This radio group can be used to get a multiple choice answer from a user. Attributes: - title: title of the radio group - description: short description of the radio group - items: a list of radio buttons + title (Translatable): Title of the radio group. + description (Translatable): Short description of the radio group. + items (list[RadioItem]): A list of radio buttons. """ - title: Translatable description: Translatable items: list[RadioItem] def toDict(self): + """ + Convert the object to a dictionary. + + Returns: + dict: A dictionary representation of the object. + """ dict = {} dict["__type__"] = "PropsUIPromptRadioInput" dict["title"] = self.title.toDict() @@ -256,12 +326,22 @@ def toDict(self): @dataclass class PropsUIQuestionOpen: """ - NO DOCS YET + Open-ended question. + + Attributes: + id (int): Question ID. + question (Translatable): The question text. """ id: int question: Translatable def toDict(self): + """ + Convert the object to a dictionary. + + Returns: + dict: A dictionary representation of the object. + """ dict = {} dict["__type__"] = "PropsUIQuestionOpen" dict["id"] = self.id @@ -272,13 +352,24 @@ def toDict(self): @dataclass class PropsUIQuestionMultipleChoiceCheckbox: """ - NO DOCS YET + Multiple choice question with checkboxes. + + Attributes: + id (int): Question ID. + question (Translatable): The question text. + choices (list[Translatable]): List of choices. 
""" id: int question: Translatable choices: list[Translatable] def toDict(self): + """ + Convert the object to a dictionary. + + Returns: + dict: A dictionary representation of the object. + """ dict = {} dict["__type__"] = "PropsUIQuestionMultipleChoiceCheckbox" dict["id"] = self.id @@ -290,13 +381,24 @@ def toDict(self): @dataclass class PropsUIQuestionMultipleChoice: """ - NO DOCS YET + Multiple choice question with radio buttons. + + Attributes: + id (int): Question ID. + question (Translatable): The question text. + choices (list[Translatable]): List of choices. """ id: int question: Translatable choices: list[Translatable] def toDict(self): + """ + Convert the object to a dictionary. + + Returns: + dict: A dictionary representation of the object. + """ dict = {} dict["__type__"] = "PropsUIQuestionMultipleChoice" dict["id"] = self.id @@ -308,12 +410,23 @@ def toDict(self): @dataclass class PropsUIPromptQuestionnaire: """ - NO DOCS YET + Questionnaire containing multiple questions. + + Attributes: + description (Translatable): Description of the questionnaire. + questions (list[PropsUIQuestionMultipleChoice | PropsUIQuestionMultipleChoiceCheckbox | PropsUIQuestionOpen]): + List of questions in the questionnaire. """ description: Translatable questions: list[PropsUIQuestionMultipleChoice | PropsUIQuestionMultipleChoiceCheckbox | PropsUIQuestionOpen] def toDict(self): + """ + Convert the object to a dictionary. + + Returns: + dict: A dictionary representation of the object. + """ dict = {} dict["__type__"] = "PropsUIPromptQuestionnaire" dict["description"] = self.description.toDict() @@ -323,14 +436,17 @@ def toDict(self): @dataclass class PropsUIPageDonation: - """A multi-purpose page that gets shown to the user + """ + A multi-purpose page that gets shown to the user. 
Attributes: - platform: the platform name the user is curently in the process of donating data from - header: page header - body: main body of the page, see the individual classes for an explanation + platform (str): The platform name the user is currently in the process of donating data from. + header (PropsUIHeader): Page header. + body (PropsUIPromptRadioInput | PropsUIPromptConsentForm | PropsUIPromptFileInput | + PropsUIPromptFileInputMultiple | PropsUIPromptConfirm | PropsUIPromptQuestionnaire): + Main body of the page. + footer (Optional[PropsUIFooter]): Optional page footer. """ - platform: str header: PropsUIHeader body: ( @@ -344,6 +460,12 @@ class PropsUIPageDonation: footer: Optional[PropsUIFooter] = None def toDict(self): + """ + Convert the object to a dictionary. + + Returns: + dict: A dictionary representation of the object. + """ dict = {} dict["__type__"] = "PropsUIPageDonation" dict["platform"] = self.platform @@ -354,9 +476,16 @@ def toDict(self): class PropsUIPageEnd: - """An ending page to show the user they are done""" - + """ + An ending page to show the user they are done. + """ def toDict(self): + """ + Convert the object to a dictionary. + + Returns: + dict: A dictionary representation of the object. 
+ """ dict = {} dict["__type__"] = "PropsUIPageEnd" return dict diff --git a/src/framework/processing/py/port/helpers/extraction_helpers.py b/src/framework/processing/py/port/helpers/extraction_helpers.py new file mode 100644 index 00000000..6f25291d --- /dev/null +++ b/src/framework/processing/py/port/helpers/extraction_helpers.py @@ -0,0 +1,546 @@ +""" +This module contains helper functions that can be used during the data extraction process +""" +import math +import re +import logging +from datetime import datetime, timezone +from typing import Any, Callable +from pathlib import Path +import zipfile +import csv +import io +import json + +import pandas as pd +import numpy as np + + +logger = logging.getLogger(__name__) + + +def dict_denester(inp: dict[Any, Any] | list[Any], new: dict[Any, Any] | None = None, name: str = "", run_first: bool = True) -> dict[Any, Any]: + """ + Denests a dictionary or list, returning a new flattened dictionary. + + Args: + inp (dict[Any, Any] | list[Any]): The input dictionary or list to be denested. + new (dict[Any, Any] | None, optional): The dictionary to store denested key-value pairs. Defaults to None. + name (str, optional): The current key name in the denesting process. Defaults to "". + run_first (bool, optional): Flag to indicate if this is the first run of the function. Defaults to True. + + Returns: + dict[Any, Any]: A new denested dictionary. 
+ + Examples: + >>> nested_dict = {"a": {"b": {"c": 1}}, "d": [2, 3]} + >>> dict_denester(nested_dict) + {"a-b-c": 1, "d-0": 2, "d-1": 3} + """ + if run_first: + new = {} + + if isinstance(inp, dict): + for k, v in inp.items(): + if isinstance(v, (dict, list)): + dict_denester(v, new, f"{name}-{str(k)}", run_first=False) + else: + newname = f"{name}-{k}" + new.update({newname[1:]: v}) # type: ignore + + elif isinstance(inp, list): + for i, item in enumerate(inp): + dict_denester(item, new, f"{name}-{i}", run_first=False) + + else: + new.update({name[1:]: inp}) # type: ignore + + return new # type: ignore + + +def find_item(d: dict[Any, Any], key_to_match: str) -> str: + """ + Finds the least nested value in a denested dictionary whose key contains the given key_to_match. + + Args: + d (dict[Any, Any]): A denested dictionary to search in. + key_to_match (str): The substring to match in the keys. + + Returns: + str: The value of the least nested key containing key_to_match. + Returns an empty string if no match is found. + + Raises: + Exception: Logs an error message if an exception occurs during the search. + + Examples: + >>> d = {"asd-asd-asd": 1, "asd-asd": 2, "qwe": 3} + >>> find_item(d, "asd") + "2" + """ + out = "" + pattern = r"{}".format(f"^.*{key_to_match}.*$") + depth = math.inf + + try: + for k, v in d.items(): + if re.match(pattern, k): + depth_current_match = k.count("-") + if depth_current_match < depth: + depth = depth_current_match + out = str(v) + except Exception as e: + logger.error(e) + + return out + + +def find_items(d: dict[Any, Any], key_to_match: str) -> list: + """ + Finds all values in a denested dictionary whose keys contain the given key_to_match. + + Args: + d (dict[Any, Any]): A denested dictionary to search in. + key_to_match (str): The substring to match in the keys. + + Returns: + list: A list of all values whose keys contain key_to_match. + + Raises: + Exception: Logs an error message if an exception occurs during the search. 
+ + Examples: + >>> d = {"asd-1": "a", "asd-2": "b", "qwe": "c"} + >>> find_items(d, "asd") + ["a", "b"] + """ + out = [] + pattern = r"{}".format(f"^.*{key_to_match}.*$") + + try: + for k, v in d.items(): + if re.match(pattern, k): + out.append(str(v)) + except Exception as e: + logger.error("bork bork: %s", e) + + return out + + +def json_dumper(zfile: str) -> pd.DataFrame: + """ + Reads all JSON files in a zip file, flattens them, and combines them into a single DataFrame. + + Args: + zfile (str): Path to the zip file containing JSON files. + + Returns: + pd.DataFrame: A DataFrame containing flattened data from all JSON files in the zip. + + Raises: + Exception: Logs an error message if an exception occurs during the process. + + Examples: + >>> df = json_dumper("data.zip") + >>> print(df.head()) + """ + out = pd.DataFrame() + datapoints = [] + + try: + with zipfile.ZipFile(zfile, "r") as zf: + for f in zf.namelist(): + logger.debug("Contained in zip: %s", f) + fp = Path(f) + if fp.suffix == ".json": + b = io.BytesIO(zf.read(f)) + d = dict_denester(unzipddp.read_json_from_bytes(b)) + for k, v in d.items(): + datapoints.append({ + "file name": fp.name, + "key": k, + "value": v + }) + + out = pd.DataFrame(datapoints) + + except Exception as e: + logger.error("Exception was caught: %s", e) + + return out + + +def fix_ascii_string(input: str) -> str: + """ + Fixes the string encoding by removing non-ASCII characters. + + Args: + input (str): The input string that needs to be fixed. + + Returns: + str: The fixed string with only ASCII characters, or the original string if an exception occurs. + + Examples: + >>> fix_ascii_string("Hello, 世界!") + "Hello, !" + """ + try: + fixed_string = input.encode("ascii", 'ignore').decode() + return fixed_string + except Exception: + return input + + +def replace_months(input_string: str) -> str: + """ + Replaces Dutch month abbreviations with English equivalents in the input string. 
+ + Args: + input_string (str): The input string containing potential Dutch month abbreviations. + + Returns: + str: The input string with Dutch month abbreviations replaced by English equivalents. + + Examples: + >>> replace_months("15 mei 2023") + "15 may 2023" + """ + + month_mapping = { + 'mrt': 'mar', + 'mei': 'may', + 'okt': 'oct', + } + + for dutch_month, english_month in month_mapping.items(): + if dutch_month in input_string: + replaced_string = input_string.replace(dutch_month, english_month, 1) + return replaced_string + + return input_string + + +def epoch_to_iso(epoch_timestamp: str | int | float) -> str: + """ + Convert epoch timestamp to an ISO 8601 string, assuming UTC. + + Args: + epoch_timestamp (str | int): The epoch timestamp to convert. + + Returns: + str: The ISO 8601 formatted string, or the original input if conversion fails. + + Raises: + Exception: Logs an error message if conversion fails. + + Examples: + >>> epoch_to_iso(1632139200) + "2021-09-20T12:00:00+00:00" + """ + out = str(epoch_timestamp) + try: + epoch_timestamp = int(float(epoch_timestamp)) + out = datetime.fromtimestamp(epoch_timestamp, tz=timezone.utc).isoformat() + except (OverflowError, OSError, ValueError, TypeError) as e: + logger.error("Could not convert epoch time timestamp, %s", e) + + return out + + +def sort_isotimestamp_empty_timestamp_last(timestamp_series: pd.Series) -> pd.Series: + """ + Creates a key for sorting a pandas Series of ISO timestamps, placing empty timestamps last. + + Args: + timestamp_series (pd.Series): A pandas Series containing ISO formatted timestamps. + + Returns: + pd.Series: A Series of sorting keys, with -timestamp for valid dates and infinity for invalid/empty dates. 
+ + Examples: + >>> df = df.sort_values(by="Date", key=sort_isotimestamp_empty_timestamp_last) + """ + def convert_timestamp(timestamp): + + out = np.inf + try: + if isinstance(timestamp, str) and len(timestamp) > 0: + dt = datetime.fromisoformat(timestamp) + out = -dt.timestamp() + except Exception as e: + logger.debug("Cannot convert timestamp: %s", e) + + return out + + return timestamp_series.apply(convert_timestamp) + + +def fix_latin1_string(input: str) -> str: + """ + Fixes the string encoding by attempting to encode it using the 'latin1' encoding and then decoding it. + + Args: + input (str): The input string that needs to be fixed. + + Returns: + str: The fixed string after encoding and decoding, or the original string if an exception occurs. + + Examples: + >>> fix_latin1_string("café") + "café" + """ + try: + fixed_string = input.encode("latin1").decode() + return fixed_string + except Exception: + return input + + +class FileNotFoundInZipError(Exception): + """ + The File you are looking for is not present in a zipfile + """ + + +def extract_file_from_zip(zfile: str, file_to_extract: str) -> io.BytesIO: + """ + Extracts a specific file from a zipfile and returns it as a BytesIO buffer. + + Args: + zfile (str): Path to the zip file. + file_to_extract (str): Name or path of the file to extract from the zip. + + Returns: + io.BytesIO: A BytesIO buffer containing the extracted file's content of the first file found. + Returns an empty BytesIO if the file is not found or an error occurs. + + Raises: + FileNotFoundInZipError: Logs an error if the specified file is not found in the zip. + zipfile.BadZipFile: Logs an error if the zip file is invalid. + Exception: Logs any other unexpected errors. 
+ + Examples: + >>> extracted_file = extract_file_from_zip("archive.zip", "data.txt") + >>> content = extracted_file.getvalue().decode('utf-8') + """ + + file_to_extract_bytes = io.BytesIO() + + try: + with zipfile.ZipFile(zfile, "r") as zf: + file_found = False + + for f in zf.namelist(): + logger.debug("Contained in zip: %s", f) + if re.match(rf"^.*{re.escape(file_to_extract)}$", f): + file_to_extract_bytes = io.BytesIO(zf.read(f)) + file_found = True + break + + if not file_found: + raise FileNotFoundInZipError("File not found in zip") + + except zipfile.BadZipFile as e: + logger.error("BadZipFile: %s", e) + except FileNotFoundInZipError as e: + logger.error("File not found: %s: %s", file_to_extract, e) + except Exception as e: + logger.error("Exception was caught: %s", e) + + finally: + return file_to_extract_bytes + + +def _json_reader_bytes(json_bytes: bytes, encoding: str) -> Any: + """ + Reads JSON data from bytes using the specified encoding. + This function should not be used directly. + + Args: + json_bytes (bytes): The JSON data in bytes. + encoding (str): The encoding to use for decoding the bytes. + + Returns: + Any: The parsed JSON data. + + Examples: + >>> data = _json_reader_bytes(b'{"key": "value"}', "utf-8") + >>> print(data) + {'key': 'value'} + """ + json_str = json_bytes.decode(encoding) + result = json.loads(json_str) + return result + + +def _json_reader_file(json_file: str, encoding: str) -> Any: + """ + Reads JSON data from a file using the specified encoding. + This function should not be used directly. + + Args: + json_file (str): Path to the JSON file. + encoding (str): The encoding to use for reading the file. + + Returns: + Any: The parsed JSON data. 
+ + Examples: + >>> data = _json_reader_file("data.json", "utf-8") + >>> print(data) + {'key': 'value'} + """ + with open(json_file, 'r', encoding=encoding) as f: + result = json.load(f) + return result + + +def _read_json(json_input: Any, json_reader: Callable[[Any, str], Any]) -> dict[Any, Any] | list[Any]: + """ + Reads JSON input using the provided json_reader function, trying different encodings. + This function should not be used directly. + + Args: + json_input (Any): The JSON input (can be bytes or file path). + json_reader (Callable[[Any, str], Any]): A function to read the JSON input. + + Returns: + dict[Any, Any] | list[Any]: The parsed JSON data as a dictionary or list. + Returns an empty dictionary if parsing fails. + + Raises: + TypeError: Logs an error if the parsed result is not a dict or list. + json.JSONDecodeError: Logs an error if JSON decoding fails. + Exception: Logs any other unexpected errors. + + Examples: + >>> data = _read_json(b'{"key": "value"}', _json_reader_bytes) + >>> print(data) + {'key': 'value'} + """ + + out: dict[Any, Any] | list[Any] = {} + + encodings = ["utf8", "utf-8-sig"] + for encoding in encodings: + try: + result = json_reader(json_input, encoding) + + if not isinstance(result, (dict, list)): + raise TypeError("Did not convert bytes to a list or dict, but to another type instead") + + out = result + logger.debug("Succesfully converted json bytes with encoding: %s", encoding) + break + + except json.JSONDecodeError: + logger.error("Cannot decode json with encoding: %s", encoding) + except TypeError as e: + logger.error("%s, could not convert json bytes", e) + break + except Exception as e: + logger.error("%s, could not convert json bytes", e) + break + + return out + + +def read_json_from_bytes(json_bytes: io.BytesIO) -> dict[Any, Any] | list[Any]: + """ + Reads JSON data from a BytesIO buffer. + + Args: + json_bytes (io.BytesIO): A BytesIO buffer containing JSON data. 
+ + Returns: + dict[Any, Any] | list[Any]: The parsed JSON data as a dictionary or list. + Returns an empty dictionary if parsing fails. + + Examples: + >>> buffer = io.BytesIO(b'{"key": "value"}') + >>> data = read_json_from_bytes(buffer) + >>> print(data) + {'key': 'value'} + """ + out: dict[Any, Any] | list[Any] = {} + try: + b = json_bytes.read() + out = _read_json(b, _json_reader_bytes) + except Exception as e: + logger.error("%s, could not convert json bytes", e) + + return out + + +def read_json_from_file(json_file: str) -> dict[Any, Any] | list[Any]: + """ + Reads JSON data from a file. + + Args: + json_file (str): Path to the JSON file. + + Returns: + dict[Any, Any] | list[Any]: The parsed JSON data as a dictionary or list. + Returns an empty dictionary if parsing fails. + + Examples: + >>> data = read_json_from_file("data.json") + >>> print(data) + {'key': 'value'} + """ + out = _read_json(json_file, _json_reader_file) + return out + + +def read_csv_from_bytes(json_bytes: io.BytesIO) -> list[dict[Any, Any]]: + """ + Reads CSV data from a BytesIO buffer and returns it as a list of dictionaries. + + Args: + json_bytes (io.BytesIO): A BytesIO buffer containing CSV data. + + Returns: + list[dict[Any, Any]]: A list of dictionaries, where each dictionary represents a row in the CSV. + Returns an empty list if parsing fails. 
+ + Examples: + >>> buffer = io.BytesIO(b'name,age\\nAlice,30\\nBob,25') + >>> data = read_csv_from_bytes(buffer) + >>> print(data) + [{'name': 'Alice', 'age': '30'}, {'name': 'Bob', 'age': '25'}] + """ + out: list[dict[Any, Any]] = [] + + try: + stream = io.TextIOWrapper(json_bytes, encoding="utf-8") + reader = csv.DictReader(stream) + for row in reader: + out.append(row) + logger.debug("succesfully converted csv bytes with encoding utf8") + + except Exception as e: + logger.error("%s, could not convert csv bytes", e) + + finally: + return out + + +def read_csv_from_bytes_to_df(json_bytes: io.BytesIO) -> pd.DataFrame: + """ + Reads CSV data from a BytesIO buffer and returns it as a pandas DataFrame. + + Args: + json_bytes (io.BytesIO): A BytesIO buffer containing CSV data. + + Returns: + pd.DataFrame: A pandas DataFrame containing the CSV data. + + Examples: + >>> buffer = io.BytesIO(b'name,age\\nAlice,30\\nBob,25') + >>> df = read_csv_from_bytes_to_df(buffer) + >>> print(df) + name age + 0 Alice 30 + 1 Bob 25 + """ + return pd.DataFrame(read_csv_from_bytes(json_bytes)) diff --git a/src/framework/processing/py/port/helpers/port_helpers.py b/src/framework/processing/py/port/helpers/port_helpers.py new file mode 100644 index 00000000..8694123b --- /dev/null +++ b/src/framework/processing/py/port/helpers/port_helpers.py @@ -0,0 +1,174 @@ +import port.api.props as props +from port.api.commands import ( + CommandSystemDonate, + CommandUIRender, + CommandSystemExit, +) + + +def render_page( + header_text: props.Translatable, + body: ( + props.PropsUIPromptRadioInput + | props.PropsUIPromptConsentForm + | props.PropsUIPromptFileInput + | props.PropsUIPromptFileInputMultiple + | props.PropsUIPromptConfirm + | props.PropsUIPromptQuestionnaire + ) +) -> CommandUIRender: + """ + Renders the UI components for a donation page. + + This function assembles various UI components including a header, body, and footer + to create a complete donation page. 
It uses the provided header text and body content + to customize the page. + + Args: + header_text (props.Translatable): The text to be displayed in the header. + This should be a translatable object to support multiple languages. + body ( + props.PropsUIPromptRadioInput | + props.PropsUIPromptConsentForm | + props.PropsUIPromptFileInput | + props.PropsUIPromptFileInputMultiple | + props.PropsUIPromptConfirm | + props.PropsUIPromptQuestionnaire + ): The main content of the page. It must be compatible with `props.PropsUIPageDonation`. + + Returns: + CommandUIRender: A render command object containing the fully assembled page. Must be yielded. + """ + header = props.PropsUIHeader(header_text) + footer = props.PropsUIFooter() + page = props.PropsUIPageDonation("does not matter", header, body, footer) + return CommandUIRender(page) + + +def generate_retry_prompt(platform_name: str) -> props.PropsUIPromptConfirm: + """ + Generates a confirmation prompt for retrying file processing. + + This function creates a bilingual (English and Dutch) confirmation prompt + when a file from a specific platform cannot be processed. It allows the user + to either try again with a different file or continue with the current file. + + Args: + platform_name (str): The name of the platform associated with the file + that couldn't be processed. This is inserted into the prompt text. + + Returns: + props.PropsUIPromptConfirm: A confirmation prompt object containing + the message, and labels for the "OK" (try again) and "Cancel" (continue) buttons. + """ + text = props.Translatable({ + "en": f"Unfortunately, we cannot process your {platform_name} file. Continue, if you are sure that you selected the right file. Try again to select a different file.", + "nl": f"Helaas, kunnen we uw {platform_name} bestand niet verwerken. Weet u zeker dat u het juiste bestand heeft gekozen? Ga dan verder. Probeer opnieuw als u een ander bestand wilt kiezen." 
+ }) + ok = props.Translatable({ + "en": "Try again", + "nl": "Probeer opnieuw" + }) + cancel = props.Translatable({ + "en": "Continue", + "nl": "Verder" + }) + return props.PropsUIPromptConfirm(text, ok, cancel) + + +def generate_file_prompt(extensions: str) -> props.PropsUIPromptFileInput: + """ + Generates a file input prompt for selecting a file for a platform. + + This function creates a bilingual (English and Dutch) file input prompt + that instructs the user to select a file they've received from a platform + and stored on their device. + + Args: + extensions (str): A collection of allowed MIME types. + For example: "application/zip, text/plain, application/json" + + Returns: + props.PropsUIPromptFileInput: A file input prompt object containing + the description text and allowed file extensions. + """ + description = props.Translatable({ + "en": "Please follow the download instructions and choose the file that you stored on your device.", + "nl": "Volg de download instructies en kies het bestand dat u opgeslagen heeft op uw apparaat.", + }) + return props.PropsUIPromptFileInput(description, extensions) + + +def generate_review_data_prompt(description: props.Translatable, table_list: list[props.PropsUIPromptConsentFormTable]) -> props.PropsUIPromptConsentForm: + """ + Generates a data review form with a list of tables and a description, including default donate question and button. + The participant can review these tables before they are sent to the researcher. + + Args: + table_list (list[props.PropsUIPromptConsentFormTable]): A list of consent form tables to be included in the prompt. + description (props.Translatable): A translatable description text for the consent prompt. + + Returns: + props.PropsUIPromptConsentForm: A structured consent form object containing the provided table list, description, + and default values for donate question and button.
+ """ + donate_question = props.Translatable({ + "en": "Do you want to share this data for research?", + "nl": "Wilt u deze gegevens delen voor onderzoek?" + }) + + donate_button = props.Translatable({ + "en": "Yes, share for research", + "nl": "Ja, deel voor onderzoek" + }) + + return props.PropsUIPromptConsentForm( + table_list, + meta_tables=[], + description=description, + donate_question=donate_question, + donate_button=donate_button + ) + + +def donate(key: str, json_string: str) -> CommandSystemDonate: + """ + Initiates a donation process using the provided key and data. + + This function triggers the donation process by passing a key and a JSON-formatted string + that contains donation information. + + Args: + key (str): The key associated with the donation process. The key will be used in the file name. + json_string (str): A JSON-formatted string containing the donated data. + + Returns: + CommandSystemDonate: A system command that initiates the donation process. Must be yielded. + """ + return CommandSystemDonate(key, json_string) + + +def exit(code: int, info: str) -> CommandSystemExit: + """ + Exits Next with the provided exit code and additional information. + This if the code reaches this function, it will return to the task list in Next. + + Args: + code (int): The exit code representing the type or status of the exit. + info (str): A string containing additional information about the exit. + + Returns: + CommandSystemExit: A system command that initiates the exit process in Next. + + Examples: + yield exit(0, "Success") + """ + return CommandSystemExit(code, info) + + +def render_end_page(): + """ + Renders a thank you page, must be yielded. 
+ """ + page = props.PropsUIPageEnd() + return CommandUIRender(page) diff --git a/src/framework/processing/py/port/helpers/validate.py b/src/framework/processing/py/port/helpers/validate.py new file mode 100644 index 00000000..461342cd --- /dev/null +++ b/src/framework/processing/py/port/helpers/validate.py @@ -0,0 +1,232 @@ +""" +Contains classes to deal with input validation of DDPs + +The idea of this module is to provide a uniform way to assign a validation status to a DDP validation +Which can be used and acted upon +""" + +from dataclasses import dataclass, field +from pathlib import Path +from enum import Enum +import zipfile + +import logging + +logger = logging.getLogger(__name__) + + +class Language(Enum): + """ + Enumeration of supported languages. + """ + EN = 1 + NL = 2 + UNKNOWN = 3 + + +class DDPFiletype(Enum): + """ + Enumeration of supported DDP file types. + """ + JSON = 1 + HTML = 2 + CSV = 3 + TXT = 4 + UNKOWN = 5 + + +@dataclass +class DDPCategory: + """ + Represents characteristics that define a DDP (Data Delivery Package) category. + + Args: + id (str): Unique identifier for the DDP category. + ddp_filetype (DDPFiletype): The file type of the DDP. + language (Language): The language of the DDP. + known_files (List[str]): A list of known files associated with this DDP category. + + Examples: + >>> category = DDPCategory("cat1", DDPFiletype.JSON, Language.EN, ["file1.json", "file2.json"]) + >>> print(category.id) + cat1 + >>> print(category.language) + + """ + id: str + ddp_filetype: DDPFiletype + language: Language + known_files: list[str] + + +@dataclass +class StatusCode: + """ + Represents a status code that can be used to set a DDP status. + + Args: + id (int): The numeric identifier of the status code. + description (str): A brief description of what the status code represents. 
+ + Examples: + >>> status = StatusCode(0, "Success") + >>> print(status.id) + 0 + >>> print(status.description) + Success + """ + id: int + description: str + + +@dataclass +class ValidateInput: + """ + A class for validating input data against predefined categories and status codes. + + Args: + all_status_codes (List[StatusCode]): A list of valid status codes. + all_ddp_categories (List[DDPCategory]): A list of valid DDP categories. + current_status_code (Optional[StatusCode]): The current status code. Defaults to None. + current_ddp_category (Optional[DDPCategory]): The current DDP category. Defaults to None. + + Attributes: + ddp_categories_lookup (Dict[str, DDPCategory]): A lookup dictionary for DDP categories. + status_codes_lookup (Dict[int, StatusCode]): A lookup dictionary for status codes. + + Examples: + >>> status_codes = [StatusCode(id=0, description="Success"), StatusCode(id=1, description="Error")] + >>> ddp_categories = [DDPCategory(id="cat1", ddp_filetype=DDPFiletype.JSON, language=Language.EN, known_files=["file1.txt", "file2.txt"])] + >>> validator = ValidateInput(all_status_codes=status_codes, all_ddp_categories=ddp_categories) + """ + + all_status_codes: list[StatusCode] + all_ddp_categories: list[DDPCategory] + current_status_code: StatusCode | None = None + current_ddp_category: DDPCategory | None = None + + ddp_categories_lookup: dict[str, DDPCategory] = field(init=False) + status_codes_lookup: dict[int, StatusCode] = field(init=False) + + def infer_ddp_category(self, file_list_input: list[str]) -> bool: + """ + Compares a list of files to a list of known files and infers the DDPCategory. + + Args: + file_list_input (List[str]): A list of input files to compare against known files. + + Returns: + bool: True if a valid DDP category is inferred, False otherwise. It sets the current_status_code + and current_ddp_category to either the DDP catogory match, or to an unknown category. 
+ + Examples: + >>> validator.infer_ddp_category(["file1.txt", "file2.txt"]) + """ + prop_category = {} + for id, category in self.ddp_categories_lookup.items(): + n_files_found = [ + 1 if f in category.known_files else 0 for f in file_list_input + ] + prop_category[id] = sum(n_files_found) / len(category.known_files) * 100 + + if max(prop_category.values()) >= 5: + highest = max(prop_category, key=prop_category.get) # type: ignore + self.ddp_category = self.ddp_categories_lookup[highest] + self.set_current_status_code_by_id(0) + logger.info("Detected DDP category: %s", self.ddp_category.id) + return True + else: + logger.info("Not a valid input; not enough files matched when performing input validation") + self.set_current_status_code_by_id(1) + self.ddp_category = DDPCategory(id = "unknown", ddp_filetype=DDPFiletype.UNKOWN, language=Language.UNKNOWN, known_files=[]) + return False + + def set_current_status_code_by_id(self, id: int) -> None: + """ + Set the status code based on the provided ID. + + Args: + id (int): The ID of the status code to set. + + Examples: + >>> validator.set_current_status_code_by_id(0) + """ + self.current_status_code = self.status_codes_lookup.get(id, None) + + def get_status_code_id(self) -> int: + """ + Return the current assigned status code ID. Note: zero is always used for OK. + Non-zero otherwise. + + Returns: + int: The ID of the current status code, or 1 if no status code is set. 
+ + Examples: + >>> validator.get_status_code_id() + """ + if self.current_status_code == None: + return 1 + else: + return self.current_status_code.id + + def __post_init__(self) -> None: + for status_code, ddp_category in zip(self.all_status_codes, self.all_ddp_categories): + assert isinstance(status_code, StatusCode), "Input is not of class StatusCode" + assert isinstance(ddp_category, DDPCategory), "Input is not of class DDPCategory" + + self.ddp_categories_lookup = { + category.id: category for category in self.all_ddp_categories + } + self.status_codes_lookup = { + status_code.id: status_code for status_code in self.all_status_codes + } + + +def validate_zip(ddp_categories: list[DDPCategory], path_to_zip: str) -> ValidateInput: + """ + Validates a DDP zip file against a list of DDP categories. + + This function attempts to open and read the contents of a zip file, then uses + the ValidateInput class to infer the DDP category based on the files in the zip. + If the zip file is invalid or cannot be read, it sets an error status code (an integer greather than 0). + + Args: + ddp_categories (List[DDPCategory]): A list of valid DDP categories to compare against. + path_to_zip (str): The file path to the zip file to be validated. + + Returns: + ValidateInput: An instance of ValidateInput containing the validation results. + + Raises: + zipfile.BadZipFile: This exception is caught internally and results in an error status code. 
+ + Examples: + >>> categories = [DDPCategory(id="cat1", ddp_filetype=DDPFiletype.JSON, language=Language.EN, known_files=["file1.txt", "file2.txt"])] + >>> result = validate_zip(categories, "path/to/valid.zip") + >>> result.get_status_code_id() + 0 + + >>> result = validate_zip(categories, "path/to/invalid.zip") + >>> result.get_status_code_id() + 1 + """ + + status_codes = [ + StatusCode(id=0, description="Detected a zip from the DDPCategory list"), + StatusCode(id=1, description="Undetected zip or bad zipfile"), + ] + validate = ValidateInput(status_codes, ddp_categories) + + try: + paths = [] + with zipfile.ZipFile(path_to_zip, "r") as zf: + for f in zf.namelist(): + p = Path(f) + logger.debug("Found: %s in zip", p.name) + paths.append(p.name) + + validate.infer_ddp_category(paths) + except zipfile.BadZipFile: + validate.set_current_status_code_by_id(1) + + return validate diff --git a/src/framework/processing/py/port/platforms/chatgpt.py b/src/framework/processing/py/port/platforms/chatgpt.py new file mode 100644 index 00000000..3984dcfd --- /dev/null +++ b/src/framework/processing/py/port/platforms/chatgpt.py @@ -0,0 +1,173 @@ +""" +ChatGPT + +This module contains an example flow of a ChatGPT data donation study +""" +import logging + +import pandas as pd + +import port.api.props as props +import port.helpers.extraction_helpers as eh +import port.helpers.port_helpers as ph +import port.helpers.validate as validate + +from port.helpers.validate import ( + DDPCategory, + DDPFiletype, + Language, +) + +logger = logging.getLogger(__name__) + +DDP_CATEGORIES = [ + DDPCategory( + id="json", + ddp_filetype=DDPFiletype.JSON, + language=Language.EN, + known_files=[ + "chat.html", + "conversations.json", + "message_feedback.json", + "model_comparisons.json", + "user.json" + ] + ) +] + + +def conversations_to_df(chatgpt_zip: str) -> pd.DataFrame: + b = eh.extract_file_from_zip(chatgpt_zip, "conversations.json") + conversations = eh.read_json_from_bytes(b) + + datapoints 
= [] + out = pd.DataFrame() + + try: + for conversation in conversations: + title = conversation["title"] + for _, turn in conversation["mapping"].items(): + + denested_d = eh.dict_denester(turn) + is_hidden = eh.find_item(denested_d, "is_visually_hidden_from_conversation") + if is_hidden != "True": + role = eh.find_item(denested_d, "role") + message = "".join(eh.find_items(denested_d, "part")) + model = eh.find_item(denested_d, "-model_slug") + time = eh.epoch_to_iso(eh.find_item(denested_d, "create_time")) + + datapoint = { + "conversation title": title, + "role": role, + "message": message, + "model": model, + "time": time, + } + if role != "": + datapoints.append(datapoint) + + out = pd.DataFrame(datapoints) + + except Exception as e: + logger.error("Data extraction error: %s", e) + + return out + + + +def extraction(chatgpt_zip: str) -> list[props.PropsUIPromptConsentFormTable]: + tables_to_render = [] + + df = conversations_to_df(chatgpt_zip) + if not df.empty: + table_title = props.Translatable({ + "en": "Your conversations with ChatGPT", + "nl": "Uw gesprekken met ChatGPT" + }) + table_description = props.Translatable({ + "en": "In this table you find your conversations with ChatGPT sorted by time. Below, you find a wordcloud, where the size of the words represents how frequent these words have been used in the conversations.", + "nl": "In this table you find your conversations with ChatGPT sorted by time. 
Below, you find a wordcloud, where the size of the words represents how frequent these words have been used in the conversations.", + }) + wordcloud = { + "title": { + "en": "Your messages in a wordcloud", + "nl": "Your messages in a wordcloud" + }, + "type": "wordcloud", + "textColumn": "message", + "tokenize": True, + } + table = props.PropsUIPromptConsentFormTable("chatgpt_conversations", table_title, df, table_description, [wordcloud]) + tables_to_render.append(table) + + return tables_to_render + + + +# TEXTS +SUBMIT_FILE_HEADER = props.Translatable({ + "en": "Select your ChatGPT file", + "nl": "Selecteer uw ChatGPT bestand" +}) + +REVIEW_DATA_HEADER = props.Translatable({ + "en": "Your ChatGPT data", + "nl": "Uw ChatGPT gegevens" +}) + +RETRY_HEADER = props.Translatable({ + "en": "Try again", + "nl": "Probeer opnieuw" +}) + +REVIEW_DATA_DESCRIPTION = props.Translatable({ + "en": "Below you will find a currated selection of ChatGPT data. In this case only the conversations you had with ChatGPT are show on screen. The data represented in this way are much more insightfull because you can actually read back the conversations you had with ChatGPT", + "nl": "Below you will find a currated selection of ChatGPT data. In this case only the conversations you had with ChatGPT are show on screen. 
The data represented in this way are much more insightfull because you can actually read back the conversations you had with ChatGPT", +}) + + +def process(session_id: int): + platform_name = "ChatGPT" + + table_list = None + while True: + logger.info("Prompt for file for %s", platform_name) + + file_prompt = ph.generate_file_prompt("application/zip") + file_result = yield ph.render_page(SUBMIT_FILE_HEADER, file_prompt) + + if file_result.__type__ == "PayloadString": + validation = validate.validate_zip(DDP_CATEGORIES, file_result.value) + + # Happy flow: Valid DDP + if validation.get_status_code_id() == 0: + logger.info("Payload for %s", platform_name) + extraction_result = extraction(file_result.value) + table_list = extraction_result + break + + # Enter retry flow, reason: if DDP was not a ChatGPT DDP + if validation.get_status_code_id() != 0: + logger.info("Not a valid %s zip; No payload; prompt retry_confirmation", platform_name) + retry_prompt = ph.generate_retry_prompt(platform_name) + retry_result = yield ph.render_page(RETRY_HEADER, retry_prompt) + + if retry_result.__type__ == "PayloadTrue": + continue + else: + logger.info("Skipped during retry flow") + break + + else: + logger.info("Skipped at file selection ending flow") + break + + if table_list is not None: + logger.info("Prompt consent; %s", platform_name) + review_data_prompt = ph.generate_review_data_prompt(REVIEW_DATA_DESCRIPTION, table_list) + review_data_result = yield ph.render_page(REVIEW_DATA_HEADER, review_data_prompt) + if review_data_result.__type__ == "PayloadJSON": + yield ph.donate(f"{session_id}-questionnaire-donation", review_data_result.value) + + yield ph.exit(0, "Success") + yield ph.render_end_page() diff --git a/src/framework/processing/py/port/script.py b/src/framework/processing/py/port/script.py index fe809456..dd1fb087 100644 --- a/src/framework/processing/py/port/script.py +++ b/src/framework/processing/py/port/script.py @@ -1,17 +1,40 @@ -import port.api.props as props 
-from port.api.commands import (CommandSystemDonate, CommandUIRender, CommandSystemExit) +import zipfile import pandas as pd -import zipfile + +import port.api.props as props +import port.helpers.port_helpers as ph + + +SUBMIT_FILE_HEADER = props.Translatable({ + "en": "Select a random zipfile of choice", + "nl": "Selecteer een willekeurige zipfile", +}) + +REVIEW_DATA_HEADER = props.Translatable({ + "en": "Your random zip contents", + "nl": "De gegevens in uw zip" +}) + +REVIEW_DATA_DESCRIPTION = props.Translatable({ + "en": "Below you will find meta data about the contents of the zip file you submitted. Please review the data carefully and remove any information you do not wish to share. If you would like to share this data, click on the 'Yes, share for research' button at the bottom of this page. By sharing this data, you contribute to research .", + "nl": "Hieronder ziet u gegevens over de zip die u heeft ingediend. Bekijk de gegevens zorgvuldig, en verwijder de gegevens die u niet wilt delen. Als u deze gegevens wilt delen, klik dan op de knop 'Ja, deel voor onderzoek' onderaan deze pagina. Door deze gegevens te delen draagt u bij aan onderzoek over ." 
+}) + +RETRY_HEADER = props.Translatable({ + "en": "Try again", + "nl": "Probeer opnieuw" +}) + def process(session_id: str): - platform = "Platform of interest" + platform_name = "Platform of interest" # Start of the data donation flow while True: # Ask the participant to submit a file - file_prompt = generate_file_prompt(platform, "application/zip, text/plain") - file_prompt_result = yield render_page(platform, file_prompt) + file_prompt = ph.generate_file_prompt("application/zip, text/plain") + file_prompt_result = yield ph.render_page(SUBMIT_FILE_HEADER, file_prompt) # If the participant submitted a file: continue if file_prompt_result.__type__ == 'PayloadString': @@ -28,20 +51,20 @@ def process(session_id: str): # Show this data to the participant in a table on screen # The participant can now decide to donate extracted_data = extract_the_data_you_are_interested_in(file_prompt_result.value) - consent_prompt = generate_consent_prompt(extracted_data) - consent_prompt_result = yield render_page(platform, consent_prompt) + consent_prompt = ph.generate_review_data_prompt(REVIEW_DATA_DESCRIPTION, extracted_data) + consent_prompt_result = yield ph.render_page(REVIEW_DATA_HEADER, consent_prompt) # If the participant wants to donate the data gets donated if consent_prompt_result.__type__ == "PayloadJSON": - yield donate(f"{session_id}-{platform}", consent_prompt_result.value) + yield ph.donate(f"{session_id}-{platform_name}", consent_prompt_result.value) break # Sad flow: # The data was not valid, ask the participant to retry if is_data_valid == False: - retry_prompt = generate_retry_prompt(platform) - retry_prompt_result = yield render_page(platform, retry_prompt) + retry_prompt = ph.generate_retry_prompt(platform_name) + retry_prompt_result = yield ph.render_page(RETRY_HEADER, retry_prompt) # The participant wants to retry: start from the beginning if retry_prompt_result.__type__ == 'PayloadTrue': @@ -54,11 +77,12 @@ def process(session_id: str): else: break - yield 
exit_port(0, "Success") - yield render_end_page() + yield ph.exit(0, "Success") + yield ph.render_end_page() -def extract_the_data_you_are_interested_in(zip_file: str) -> pd.DataFrame: + +def extract_the_data_you_are_interested_in(zip_file: str) -> list[props.PropsUIPromptConsentFormTable]: """ This function extracts the data the researcher is interested in @@ -69,7 +93,7 @@ def extract_the_data_you_are_interested_in(zip_file: str) -> pd.DataFrame: You could extract anything here """ - out = pd.DataFrame() + tables = [] try: file = zipfile.ZipFile(zip_file) @@ -78,12 +102,34 @@ def extract_the_data_you_are_interested_in(zip_file: str) -> pd.DataFrame: info = file.getinfo(name) data.append((name, info.compress_size, info.file_size)) - out = pd.DataFrame(data, columns=["File name", "Compressed file size", "File size"]) + df = pd.DataFrame(data, columns=["File name", "Compressed file size", "File size"]) #pyright: ignore + table_title = props.Translatable({ + "en": f"The contents of your zipfile contents:", + "nl": "De inhoud van uw zip bestand" + }) + wordcloud = { + "title": { + "en": "You can also add visualizations", + "nl": "You can also add visualizations" + }, + "type": "wordcloud", + "textColumn": "File name", + "tokenize": True, + } + tables.append( + props.PropsUIPromptConsentFormTable( + id="zip_contents", + title=table_title, + data_frame=df, + visualizations=[wordcloud], + delete_option=True + ) + ) except Exception as e: print(f"Something went wrong: {e}") - return out + return tables def validate_the_participants_input(zip_file: str) -> bool: @@ -104,225 +150,3 @@ def validate_the_participants_input(zip_file: str) -> bool: return False -def render_end_page(): - """ - Renders a thank you page - """ - page = props.PropsUIPageEnd() - return CommandUIRender(page) - - -def render_page(platform: str, body): - """ - Renders the UI components - """ - header = props.PropsUIHeader(props.Translatable({"en": platform, "nl": platform })) - footer = 
props.PropsUIFooter() - page = props.PropsUIPageDonation(platform, header, body, footer) - return CommandUIRender(page) - - -def generate_retry_prompt(platform: str) -> props.PropsUIPromptConfirm: - text = props.Translatable({ - "en": f"Unfortunately, we cannot process your {platform} file. Continue, if you are sure that you selected the right file. Try again to select a different file.", - "nl": f"Helaas, kunnen we uw {platform} bestand niet verwerken. Weet u zeker dat u het juiste bestand heeft gekozen? Ga dan verder. Probeer opnieuw als u een ander bestand wilt kiezen." - }) - ok = props.Translatable({ - "en": "Try again", - "nl": "Probeer opnieuw" - }) - cancel = props.Translatable({ - "en": "Continue", - "nl": "Verder" - }) - return props.PropsUIPromptConfirm(text, ok, cancel) - - -def generate_file_prompt(platform, extensions) -> props.PropsUIPromptFileInput: - description = props.Translatable({ - "en": f"Please follow the download instructions and choose the file that you stored on your device. Click “Skip” at the right bottom, if you do not have a {platform} file. ", - "nl": f"Volg de download instructies en kies het bestand dat u opgeslagen heeft op uw apparaat. Als u geen {platform} bestand heeft klik dan op “Overslaan” rechts onder." - }) - return props.PropsUIPromptFileInput(description, extensions) - - -def generate_consent_prompt(*args: pd.DataFrame) -> props.PropsUIPromptConsentForm: - description = props.Translatable({ - "en": "Below you will find meta data about the contents of the zip file you submitted. Please review the data carefully and remove any information you do not wish to share. If you would like to share this data, click on the 'Yes, share for research' button at the bottom of this page. By sharing this data, you contribute to research .", - "nl": "Hieronder ziet u gegevens over de zip die u heeft ingediend. Bekijk de gegevens zorgvuldig, en verwijder de gegevens die u niet wilt delen. 
Als u deze gegevens wilt delen, klik dan op de knop 'Ja, deel voor onderzoek' onderaan deze pagina. Door deze gegevens te delen draagt u bij aan onderzoek over ." - }) - - donate_question = props.Translatable({ - "en": "Do you want to share this data for research?", - "nl": "Wilt u deze gegevens delen voor onderzoek?" - }) - - donate_button = props.Translatable({ - "en": "Yes, share for research", - "nl": "Ja, deel voor onderzoek" - }) - - tables = [] - for index, df in enumerate(args): - table_title = props.Translatable({ - "en": f"The contents of your zipfile contents (Table {index + 1}/{len(args)})", - "nl": "De inhoud van uw zip bestand" - }) - wordcloud = { - "title": { - "en": "You can also add visualizations", - "nl": "You can also add visualizations" - }, - "type": "wordcloud", - "textColumn": "File name", - "tokenize": True, - } - tables.append(props.PropsUIPromptConsentFormTable(f"zip_contents_{index}", table_title, df, visualizations=[wordcloud], delete_option=True)) - - return props.PropsUIPromptConsentForm( - tables, - [], - description = description, - donate_question = donate_question, - donate_button = donate_button - ) - - -def donate(key, json_string): - return CommandSystemDonate(key, json_string) - - -def exit_port(code, info): - return CommandSystemExit(code, info) - - -################################################################################## -# Exercise for the reader - -# Add an extra table to the output -# This table should calculate 2 aggegrate statistics about your the files in your zipfile - -# 1. it should give the total number of files in the zipfile -# 2. it should give the total number of bytes of all files in the zipfile -# 3. As a bonus: count the number of times the letter a occurs in all text files in the zipfile. 
By all means use AI to find out how to do this - -# Depending on your data the table could look like this: -# | Statistic | Value | -# ----------------------------- -# | Total number of files | 12 | -# | Total number of bytes | 762376 | -# | Total occurrences of 'a' in text files | 2378 | - - -################################################################################## -# Hints - -# Hint 1: Write a function that extracts the statistics and put them in a dataframe. -# In order to do that you can copy extract_the_data_you_are_interested_in() and then modify it so it extracts the total number of files and bytes - -# Hint 2: If you wrote that function, then -# Changes these lines: -# extracted_data = extract_the_data_you_are_interested_in(file_prompt_result.value) -# consent_prompt = generate_consent_prompt(extracted_data) - -# to: -# extracted_data = extract_the_data_you_are_interested_in(file_prompt_result.value) -# extracted_data_statistics = extract_statistics_you_are_interested_in(file_prompt_result.value) -# consent_prompt = generate_consent_prompt(extracted_data, extracted_data_statistics) - -################################################################################## -# Answer: - -# Uncomment all these lines to see the answer in action - -#def extract_statistics_you_are_interested_in(zip_file: str) -> pd.DataFrame: -# """ -# Function that extracts the desired statistics -# """ -# out = pd.DataFrame() -# count = 0 -# total_number_of_bytes = 0 -# total_a_count = 0 -# -# try: -# file = zipfile.ZipFile(zip_file) -# for name in file.namelist(): -# info = file.getinfo(name) -# count += 1 -# total_number_of_bytes += info.file_size -# -# # Check if the file is a text file -# # if so, open it and count the letter a -# if name.endswith('.txt'): -# with file.open(name) as txt_file: -# content = txt_file.read().decode('utf-8') -# total_a_count += content.count('a') -# -# data = [ -# ("Total number of files", count), -# ("Total number of bytes", 
total_number_of_bytes), -# ("Total occurrences of 'a' in text files", total_a_count), -# ] -# -# out = pd.DataFrame(data, columns=["Statistic", "Value"]) -# -# except Exception as e: -# print(f"Something went wrong: {e}") -# -# return out -# -# -#def process(session_id: str): -# platform = "Platform of interest" -# -# # Start of the data donation flow -# while True: -# # Ask the participant to submit a file -# file_prompt = generate_file_prompt(platform, "application/zip, text/plain") -# file_prompt_result = yield render_page(platform, file_prompt) -# -# # If the participant submitted a file: continue -# if file_prompt_result.__type__ == 'PayloadString': -# -# # Validate the file the participant submitted -# # In general this is wise to do -# is_data_valid = validate_the_participants_input(file_prompt_result.value) -# -# # Happy flow: -# # The file the participant submitted is valid -# if is_data_valid == True: -# -# # Extract the data you as a researcher are interested in, and put it in a pandas DataFrame -# # Show this data to the participant in a table on screen -# # The participant can now decide to donate -# extracted_data = extract_the_data_you_are_interested_in(file_prompt_result.value) -# extracted_data_statistics = extract_statistics_you_are_interested_in(file_prompt_result.value) -# consent_prompt = generate_consent_prompt(extracted_data, extracted_data_statistics) -# consent_prompt_result = yield render_page(platform, consent_prompt) -# -# # If the participant wants to donate the data gets donated -# if consent_prompt_result.__type__ == "PayloadJSON": -# yield donate(f"{session_id}-{platform}", consent_prompt_result.value) -# -# break -# -# # Sad flow: -# # The data was not valid, ask the participant to retry -# if is_data_valid == False: -# retry_prompt = generate_retry_prompt(platform) -# retry_prompt_result = yield render_page(platform, retry_prompt) -# -# # The participant wants to retry: start from the beginning -# if retry_prompt_result.__type__ == 
'PayloadTrue': -# continue -# # The participant does not want to retry or pressed skip -# else: -# break -# -# # The participant did not submit a file and pressed skip -# else: -# break -# -# yield exit_port(0, "Success") -# yield render_end_page() -#