From f196be2709c5681e4e3924901357bf0e6ed220df Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Mon, 19 Aug 2024 14:22:07 +0200 Subject: [PATCH 01/12] export processor --- common/lib/dataset.py | 16 +++- processors/conversion/export_datasets.py | 100 +++++++++++++++++++++++ webtool/views/api_tool.py | 6 +- 3 files changed, 116 insertions(+), 6 deletions(-) create mode 100644 processors/conversion/export_datasets.py diff --git a/common/lib/dataset.py b/common/lib/dataset.py index 8510a5adb..56ea7d463 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -15,7 +15,7 @@ import backend from common.config_manager import config from common.lib.job import Job, JobNotFoundException -from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int +from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int, get_software_version from common.lib.item_mapping import MappedItem, MissingMappedField, DatasetItem from common.lib.fourcat_module import FourcatModule from common.lib.exceptions import (ProcessorInterruptedException, DataSetException, DataSetNotFoundException, @@ -1542,6 +1542,20 @@ def get_media_type(self): # Default to text return self.parameters.get("media_type", "text") + def get_metadata(self): + """ + Get dataset metadata + + This consists of all the data stored in the database for this dataset, plus the current 4CAT version (appended + as 'current_4CAT_version'). This is useful for exporting datasets, as it can be used by another 4CAT instance to + update its database (and ensure compatibility with the exporting version of 4CAT). + """ + metadata = self.db.fetchone("SELECT * FROM datasets WHERE key = %s", (self.key,)) + + # get 4CAT version (presumably to ensure export is compatible with import) + metadata["current_4CAT_version"] = get_software_version() + return metadata + def get_result_url(self): """ Gets the 4CAT frontend URL of a dataset file. 
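The `get_metadata()` method added above bundles the dataset's database row with the exporting server's 4CAT version under `current_4CAT_version`. As a minimal sketch of how the importing side can use that field — mirroring the version check that `validate_query()` performs later in this series; the function name here is an assumption for illustration, not part of the patch:

```python
# Sketch: compatibility check against exported metadata (assumed helper name).
from common.lib.helpers import get_software_version


def check_export_compatibility(metadata):
    """
    Refuse to import metadata produced by a different 4CAT version

    :param dict metadata:  Output of DataSet.get_metadata() on the exporting server
    """
    local_version = get_software_version()
    exported_version = metadata.get("current_4CAT_version")
    if exported_version != local_version:
        raise ValueError(
            f"Export was made with 4CAT {exported_version}, "
            f"but this server runs {local_version}"
        )
```

The same comparison is what makes cross-server imports refuse mismatched versions, so a ZIP-based import can reuse it unchanged.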
diff --git a/processors/conversion/export_datasets.py b/processors/conversion/export_datasets.py new file mode 100644 index 000000000..f45cf92f7 --- /dev/null +++ b/processors/conversion/export_datasets.py @@ -0,0 +1,100 @@ +""" +Export a dataset and all its children to a ZIP file +""" +import shutil +import json + +from backend.lib.processor import BasicProcessor +from common.lib.dataset import DataSet +from common.lib.exceptions import ProcessorException, DataSetException + +__author__ = "Dale Wahl" +__credits__ = ["Dale Wahl"] +__maintainer__ = "Dale Wahl" +__email__ = "4cat@oilab.eu" + + + +class ExportDatasets(BasicProcessor): + """ + Export a dataset and all its children to a ZIP file + """ + type = "export-datasets"  # job type ID + category = "Conversion"  # category + title = "Export Dataset and All Analyses"  # title displayed in UI + description = "Creates a ZIP file containing the dataset and all analyses to be archived and uploaded to a 4CAT instance in the future"  # description displayed in UI + extension = "zip"  # extension of result file, used internally and in UI + + @classmethod + def is_compatible_with(cls, module=None, user=None): + """ + Determine if processor is compatible with dataset + + :param module: Module to determine compatibility with + """ + return module.is_top_dataset() and user.can_access_dataset(dataset=module, role="owner") + + def process(self): + """ + Export the dataset and all its finished analyses to a ZIP archive + """ + self.dataset.update_status("Collecting dataset and all analyses") + + results_path = self.dataset.get_staging_area() + + exported_datasets = [] + failed_exports = []  # keys that failed to export + keys = [self.dataset.top_parent().key]  # get the key of the top parent + while keys: + dataset_key = keys.pop(0) + self.dataset.log(f"Exporting dataset {dataset_key}.") + + try: + dataset = DataSet(key=dataset_key, db=self.db) + # TODO: these two should fail for the primary dataset, but should they fail for the children too? 
+ except DataSetException: + self.dataset.finish_with_error("Dataset not found.") + return + if not dataset.is_finished(): + self.dataset.finish_with_error("You cannot export unfinished datasets.") + return + + # get metadata + metadata = dataset.get_metadata() + if metadata["num_rows"] == 0: + self.dataset.update_status(f"Skipping empty dataset {dataset_key}") + failed_exports.append(dataset_key) + continue + + # get data + data_file = dataset.get_results_path() + if not data_file.exists(): + self.dataset.finish_with_error(f"Dataset {dataset_key} has no data; skipping.") + failed_exports.append(dataset_key) + continue + + # get log + log_file = dataset.get_results_path().with_suffix(".log") + + # All good, add to ZIP + with results_path.joinpath(f"{dataset_key}_metadata.json").open("w", encoding="utf-8") as outfile: + outfile.write(json.dumps(metadata)) + shutil.copy(data_file, results_path.joinpath(data_file.name)) + if log_file.exists(): + shutil.copy(log_file, results_path.joinpath(log_file.name)) + + # add children to queue + # Not using get_all_children() because we want to skip unfinished datasets and only need the keys + children = [d["key"] for d in self.db.fetchall("SELECT key FROM datasets WHERE key_parent = %s AND is_finished = TRUE", (dataset_key,))] + keys.extend(children) + + self.dataset.update_status(f"Exported dataset {dataset_key}.") + exported_datasets.append(dataset_key) + + # Add export log to ZIP + self.dataset.log(f"Exported datasets: {exported_datasets}") + self.dataset.log(f"Failed to export datasets: {failed_exports}") + shutil.copy(self.dataset.get_log_path(), results_path.joinpath("export.log")) + + # done! + self.write_archive_and_finish(results_path, len(exported_datasets)) \ No newline at end of file diff --git a/webtool/views/api_tool.py b/webtool/views/api_tool.py index e4645e6d5..64c331271 100644 --- a/webtool/views/api_tool.py +++ b/webtool/views/api_tool.py @@ -1237,11 +1237,7 @@ def export_packed_dataset(key=None, component=None): return error(403, error="You cannot export unfinished datasets.") if component == "metadata": - metadata = db.fetchone("SELECT * FROM datasets WHERE key = %s", (dataset.key,)) - - # get 4CAT version (presumably to ensure export is compatible with import) - metadata["current_4CAT_version"] = get_software_version() - return jsonify(metadata) + return jsonify(dataset.get_metadata()) elif component == "children": children = [d["key"] for d in db.fetchall("SELECT key FROM datasets WHERE key_parent = %s AND is_finished = TRUE", (dataset.key,))] From e23f9253f0f1193a837eaf08fb1e598fd42c2d7a Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 20 Aug 2024 09:07:52 +0200 Subject: [PATCH 02/12] start of importer --- datasources/fourcat_import/import_4cat.py | 400 +++++++++++++++------- 1 file changed, 274 insertions(+), 126 deletions(-) diff --git a/datasources/fourcat_import/import_4cat.py b/datasources/fourcat_import/import_4cat.py index cd231b445..069549446 100644 --- a/datasources/fourcat_import/import_4cat.py +++ b/datasources/fourcat_import/import_4cat.py @@ -4,6 +4,7 @@ import requests import json import time +import zipfile from backend.lib.processor import BasicProcessor from common.lib.exceptions import (QueryParametersException, FourcatException, ProcessorInterruptedException, @@ -19,8 +20,8 @@ class FourcatImportException(FourcatException): class SearchImportFromFourcat(BasicProcessor): type = "import_4cat-search" # job ID category = "Search" # category - title = "Import from 4CAT" # title displayed in UI - description = 
"Import a dataset from another 4CAT server" # description displayed in UI + title = "Import 4CAT dataset and analyses" # title displayed in UI + description = "Import a dataset from another 4CAT server or from a zip file (exported from a 4CAT server)" # description displayed in UI is_local = False # Whether this datasource is locally scraped is_static = False # Whether this datasource is still updated @@ -33,29 +34,242 @@ class SearchImportFromFourcat(BasicProcessor): "\n\nTo import a dataset across servers, both servers need to be running the same version of 4CAT. " "You can find the current version in the footer at the bottom of the interface." }, + "method": { + "type": UserInput.OPTION_CHOICE, + "help": "Import Type", + "options": { + "zip": "Zip File", + "url": "4CAT URL", + }, + "default": "url" + }, "url": { "type": UserInput.OPTION_TEXT, "help": "Dataset URL", - "tooltip": "URL to the dataset's page, for example https://4cat.example/results/28da332f8918e6dc5aacd1c3b0170f01b80bd95f8ff9964ac646cecd33bfee49/." + "tooltip": "URL to the dataset's page, for example https://4cat.example/results/28da332f8918e6dc5aacd1c3b0170f01b80bd95f8ff9964ac646cecd33bfee49/.", + "requires": "method^=url" }, "intro2": { "type": UserInput.OPTION_INFO, "help": "You can create an API key via the 'API Access' item in 4CAT's navigation menu. Note that you need " "an API key from **the server you are importing from**, not the one you are looking at right now. " - "Additionally, you need to have owner access to the dataset you want to import." + "Additionally, you need to have owner access to the dataset you want to import.", + "requires": "method^=url" }, "api-key": { "type": UserInput.OPTION_TEXT, "help": "4CAT API Key", "sensitive": True, "cache": True, - } + "requires": "method^=url" + }, + "data_upload": { + "type": UserInput.OPTION_FILE, + "help": "File", + "tooltip": "Upload a ZIP file containing a dataset exported from a 4CAT server.", + "requires": "method^=zip" + }, + } created_datasets = None base = None + remapped_keys = None def process(self): + """ + Import 4CAT dataset either from another 4CAT server or from the uploaded zip file + """ + if self.parameters.get("method") == "zip": + self.process_zip() + else: + self.process_urls() + + def after_create(query, dataset, request): + """ + Hook to execute after the dataset for this source has been created + + In this case, put the file in a temporary location so it can be + processed properly by the related Job later. 
+ + :param dict query: Sanitised query parameters + :param DataSet dataset: Dataset created for this query + :param request: Flask request submitted for its creation + """ + if query.get("method") == "zip": + file = request.files["option-data_upload"] + file.seek(0) + with dataset.get_results_path().with_suffix(".importing").open("wb") as outfile: + while True: + chunk = file.read(1024) + if len(chunk) == 0: + break + outfile.write(chunk) + else: + # nothing to do for URLs + pass + + + def process_zip(self): + """ + Import 4CAT dataset from a ZIP file + """ + self.dataset.update_status(f"Importing datasets and analyses from ZIP file.") + temp_file = self.dataset.get_results_path().with_suffix(".importing") + + processed_files = [] + missed_files = [] + with zipfile.ZipFile(temp_file, "r") as zip_ref: + zip_contents = zip_ref.namelist() + + # Get all metadata files and determine primary dataset + metadata_files = [file for file in zip_contents if file.endswith("_metadata.json")] + if not metadata_files: + self.dataset.finish_with_error("No metadata files found in ZIP file; is this a 4CAT export?") + return + + # Get the primary dataset + primary_dataset_keys = set() + datasets = [] + for file in metadata_files: + with zip_ref.open(file) as f: + metadata = json.load(f) + if metadata.get("key_parent") is None: + primary_dataset_keys.add(metadata.get("key")) + datasets.append(metadata) + else: + # Child datasets are skipped for now, as we may need to remap keys + pass + + # Primary dataset will overwrite this dataset; we could address this to support multiple primary datasets + if len(primary_dataset_keys) != 1: + self.dataset.finish_with_error("ZIP file contains multiple primary datasets; only one is allowed.") + return + + # Import datasets ( + # TODO: this is ordered due to potential issues with keys needing to be remapped, but there may be an issue with a child datasets having additional children and needing remapping... + while datasets: + metadata = datasets.pop(0) + dataset_key = metadata.get("key") + processed_metadata = self.process_metadata(metadata) + if dataset_key in primary_dataset_keys: + # Import primary dataset + self.dataset.update_status(f"Importing primary dataset {dataset_key}.") + + + + + + + + + # Check that all files were processed + if len(zip_contents) != len(processed_files): + for file in zip_contents: + if file not in processed_files: + missed_files.append(file) + + + @staticmethod + def process_metadata(metadata): + """ + Process metadata for import + """ + # get rid of some keys that are server-specific and don't need to + # be stored (or don't correspond to database columns) + metadata.pop("current_4CAT_version") + metadata.pop("id") + metadata.pop("job") + metadata.pop("is_private") + metadata.pop("is_finished") # we'll finish it ourselves, thank you!!! + + # extra params are stored as JSON... + metadata["parameters"] = json.loads(metadata["parameters"]) + if "copied_from" in metadata["parameters"]: + metadata["parameters"].pop("copied_from") + metadata["parameters"] = json.dumps(metadata["parameters"]) + + return metadata + + def create_dataset(self, metadata, original_key, primary=False): + """ + Create a new dataset + """ + if primary: + # if this is the first dataset we're importing, make it the + # processor's "own" dataset. 
the key has already been set to + # the imported dataset's key via ensure_key() (or a new unqiue + # key if it already existed on this server) + # by making it the "own" dataset, the user initiating the + # import will see the imported dataset as the "result" of their + # import query in the interface, similar to the workflow for + # other data sources + new_dataset = self.dataset + metadata.pop("key") # key already OK (see above) + self.db.update("datasets", where={"key": new_dataset.key}, data=metadata) + + else: + # supernumerary datasets - handle on their own + # these include any children of imported datasets + try: + key_exists = DataSet(key=metadata["key"], db=self.db) + + # if we *haven't* thrown a DatasetException now, then the + # key is already in use, so create a "dummy" dataset and + # overwrite it with the metadata we have (except for the + # key). this ensures that a new unique key will be + # generated. + new_dataset = DataSet(parameters={}, type=self.type, db=self.db) + metadata.pop("key") + self.db.update("datasets", where={"key": new_dataset.key}, data=metadata) + + except DataSetException: + # this is *good* since it means the key doesn't exist, so + # we can re-use the key of the imported dataset + self.db.insert("datasets", data=metadata) + new_dataset = DataSet(key=metadata["key"], db=self.db) + + # make sure the dataset path uses the new key and local dataset + # path settings. this also makes sure the log file is created in + # the right place (since it is derived from the results file path) + extension = metadata["result_file"].split(".")[-1] + new_dataset.reserve_result_file(parameters=new_dataset.parameters, extension=extension) + + new_dataset.update_status("Imported dataset created") + if new_dataset.key != original_key: + # could not use original key because it was already in use + # so update any references to use the new key + self.remapped_keys[original_key] = new_dataset.key + new_dataset.update_status(f"Cannot import with same key - already in use on this server. Using key " + f"{new_dataset.key} instead of key {original_key}!") + + # refresh object, make sure it's in sync with the database + self.created_datasets.add(new_dataset.key) + new_dataset = DataSet(key=new_dataset.key, db=self.db) + if new_dataset.key == self.dataset.key: + # this ensures that the first imported dataset becomes the + # processor's "own" dataset, and that the import logs go to + # that dataset's log file. For later imports, this evaluates to + # False. + self.dataset = new_dataset + + # if the key of the parent dataset was changed, change the + # reference to it that the child dataset has + if new_dataset.key_parent and new_dataset.key_parent in self.remapped_keys: + new_dataset.key_parent = self.remapped_keys[new_dataset.key_parent] + + # update some attributes that should come from the new server, not + # the old + new_dataset.creator = dataset_owner + new_dataset.original_timestamp = new_dataset.timestamp + new_dataset.imported = True + new_dataset.timestamp = int(time.time()) + new_dataset.db.commit() + + return new_dataset + + + def process_urls(self): """ Import 4CAT dataset from another 4CAT server @@ -70,7 +284,7 @@ def process(self): self.created_datasets = set() # keys of created datasets - may not be successful! 
imported = [] # successfully imported datasets failed_imports = [] # keys that failed to import - remapped_keys = {} # changed dataset keys + self.remapped_keys = {} # changed dataset keys num_rows = 0 # will be used later dataset_owner = self.dataset.get_owners()[0] # at this point it has 1 owner @@ -101,90 +315,10 @@ def process(self): failed_imports.append(dataset_key) continue - # get rid of some keys that are server-specific and don't need to - # be stored (or don't correspond to database columns) - metadata.pop("current_4CAT_version") - metadata.pop("id") - metadata.pop("job") - metadata.pop("is_private") - metadata.pop("is_finished") # we'll finish it ourselves, thank you!!! - - # extra params are stored as JSON... - metadata["parameters"] = json.loads(metadata["parameters"]) - if "copied_from" in metadata["parameters"]: - metadata["parameters"].pop("copied_from") - metadata["parameters"] = json.dumps(metadata["parameters"]) - - if not imported: - # if this is the first dataset we're importing, make it the - # processor's "own" dataset. the key has already been set to - # the imported dataset's key via ensure_key() (or a new unqiue - # key if it already existed on this server) - # by making it the "own" dataset, the user initiating the - # import will see the imported dataset as the "result" of their - # import query in the interface, similar to the workflow for - # other data sources - new_dataset = self.dataset - metadata.pop("key") # key already OK (see above) - self.db.update("datasets", where={"key": new_dataset.key}, data=metadata) + metadata = self.process_metadata(metadata) - else: - # supernumerary datasets - handle on their own - # these include any children of imported datasets - try: - key_exists = DataSet(key=metadata["key"], db=self.db) - - # if we *haven't* thrown a DatasetException now, then the - # key is already in use, so create a "dummy" dataset and - # overwrite it with the metadata we have (except for the - # key). this ensures that a new unique key will be - # generated. - new_dataset = DataSet(parameters={}, type=self.type, db=self.db) - metadata.pop("key") - self.db.update("datasets", where={"key": new_dataset.key}, data=metadata) - - except DataSetException: - # this is *good* since it means the key doesn't exist, so - # we can re-use the key of the imported dataset - self.db.insert("datasets", data=metadata) - new_dataset = DataSet(key=metadata["key"], db=self.db) - - # make sure the dataset path uses the new key and local dataset - # path settings. this also makes sure the log file is created in - # the right place (since it is derived from the results file path) - extension = metadata["result_file"].split(".")[-1] - new_dataset.reserve_result_file(parameters=new_dataset.parameters, extension=extension) - - new_dataset.update_status("Imported dataset created") - if new_dataset.key != dataset_key: - # could not use original key because it was already in use - # so update any references to use the new key - remapped_keys[dataset_key] = new_dataset.key - new_dataset.update_status(f"Cannot import with same key - already in use on this server. Using key " - f"{new_dataset.key} instead of key {dataset_key}!") - - # refresh object, make sure it's in sync with the database - self.created_datasets.add(new_dataset.key) - new_dataset = DataSet(key=new_dataset.key, db=self.db) - if new_dataset.key == self.dataset.key: - # this ensures that the first imported dataset becomes the - # processor's "own" dataset, and that the import logs go to - # that dataset's log file. 
For later imports, this evaluates to - # False. - self.dataset = new_dataset - - # if the key of the parent dataset was changed, change the - # reference to it that the child dataset has - if new_dataset.key_parent and new_dataset.key_parent in remapped_keys: - new_dataset.key_parent = remapped_keys[new_dataset.key_parent] - - # update some attributes that should come from the new server, not - # the old - new_dataset.creator = dataset_owner - new_dataset.original_timestamp = new_dataset.timestamp - new_dataset.imported = True - new_dataset.timestamp = int(time.time()) - new_dataset.db.commit() + # create the new dataset + new_dataset = self.create_dataset(metadata, dataset_key, primary=True if not imported else False) # then, the log self.halt_and_catch_fire() @@ -353,47 +487,61 @@ def validate_query(query, request, user): :param User user: User object of user who has submitted the query :return dict: Safe query parameters """ - urls = query.get("url") - if not urls: - return QueryParametersException("Provide at least one dataset URL.") - - urls = urls.split(",") - bases = set([url.split("/results/")[0].lower() for url in urls]) - keys = SearchImportFromFourcat.get_keys_from_urls(urls) + if query.get("method") == "zip": + file = request.files.get("data_upload") + if not file: + return QueryParametersException("No file uploaded.") + + if not file.filename.endswith(".zip"): + return QueryParametersException("Uploaded file must be a ZIP file.") + + return { + "data_upload": file + } + elif query.get("method") == "url": + urls = query.get("url") + if not urls: + return QueryParametersException("Provide at least one dataset URL.") + + urls = urls.split(",") + bases = set([url.split("/results/")[0].lower() for url in urls]) + keys = SearchImportFromFourcat.get_keys_from_urls(urls) + + if len(keys) != 1: + # todo: change this to < 1 if we allow multiple datasets + return QueryParametersException("You need to provide a single URL to a 4CAT dataset to import.") + + if len(bases) != 1: + return QueryParametersException("All URLs need to point to the same 4CAT server. You can only import from " + "one 4CAT server at a time.") + + base = urls[0].split("/results/")[0] + try: + # test if API key is valid and server is reachable + test = SearchImportFromFourcat.fetch_from_4cat(base, keys[0], query.get("api-key"), "metadata") + except FourcatImportException as e: + raise QueryParametersException(str(e)) - if len(keys) != 1: - # todo: change this to < 1 if we allow multiple datasets - return QueryParametersException("You need to provide a single URL to a 4CAT dataset to import.") + try: + # test if we get a response we can parse + metadata = test.json() + except ValueError: + raise QueryParametersException(f"Unexpected response when trying to fetch metadata for dataset {keys[0]}.") - if len(bases) != 1: - return QueryParametersException("All URLs need to point to the same 4CAT server. You can only import from " - "one 4CAT server at a time.") + version = get_software_version() - base = urls[0].split("/results/")[0] - try: - # test if API key is valid and server is reachable - test = SearchImportFromFourcat.fetch_from_4cat(base, keys[0], query.get("api-key"), "metadata") - except FourcatImportException as e: - raise QueryParametersException(str(e)) + if metadata.get("current_4CAT_version") != version: + raise QueryParametersException(f"This 4CAT server is running a different version of 4CAT ({version}) than " + f"the one you are trying to import from ({metadata.get('current_4CAT_version')}). 
Make " + "sure both are running the same version of 4CAT and try again.") - try: - # test if we get a response we can parse - metadata = test.json() - except ValueError: - raise QueryParametersException(f"Unexpected response when trying to fetch metadata for dataset {keys[0]}.") - - version = get_software_version() - - if metadata.get("current_4CAT_version") != version: - raise QueryParametersException(f"This 4CAT server is running a different version of 4CAT ({version}) than " - f"the one you are trying to import from ({metadata.get('current_4CAT_version')}). Make " - "sure both are running the same version of 4CAT and try again.") - - # OK, we can import at least one dataset - return { - "url": ",".join(urls), - "api-key": query.get("api-key") - } + # OK, we can import at least one dataset + return { + "url": ",".join(urls), + "api-key": query.get("api-key") + } + else: + raise QueryParametersException("Import method not yet implemented.") @staticmethod def get_keys_from_urls(urls): From 358aca2ea331859998db1eaf0683ad625407ed1d Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Fri, 6 Sep 2024 14:36:24 +0200 Subject: [PATCH 03/12] finish off importing ZIP 4CAT datasets --- datasources/fourcat_import/import_4cat.py | 146 +++++++++++++++++----- 1 file changed, 114 insertions(+), 32 deletions(-) diff --git a/datasources/fourcat_import/import_4cat.py b/datasources/fourcat_import/import_4cat.py index 069549446..a949599e7 100644 --- a/datasources/fourcat_import/import_4cat.py +++ b/datasources/fourcat_import/import_4cat.py @@ -75,11 +75,15 @@ class SearchImportFromFourcat(BasicProcessor): created_datasets = None base = None remapped_keys = None + dataset_owner = None def process(self): """ Import 4CAT dataset either from another 4CAT server or from the uploaded zip file """ + self.created_datasets = set() # keys of created datasets - may not be successful! 
+ self.remapped_keys = {} # changed dataset keys + self.dataset_owner = self.dataset.get_owners()[0] # at this point it has 1 owner if self.parameters.get("method") == "zip": self.process_zip() else: @@ -117,8 +121,9 @@ def process_zip(self): self.dataset.update_status(f"Importing datasets and analyses from ZIP file.") temp_file = self.dataset.get_results_path().with_suffix(".importing") - processed_files = [] - missed_files = [] + imported = [] + processed_files = 1 # take into account the export.log file + failed_imports = [] with zipfile.ZipFile(temp_file, "r") as zip_ref: zip_contents = zip_ref.namelist() @@ -131,44 +136,111 @@ def process_zip(self): # Get the primary dataset primary_dataset_keys = set() datasets = [] + parent_child_mapping = {} for file in metadata_files: with zip_ref.open(file) as f: metadata = json.load(f) - if metadata.get("key_parent") is None: + if not metadata.get("key_parent"): primary_dataset_keys.add(metadata.get("key")) datasets.append(metadata) else: - # Child datasets are skipped for now, as we may need to remap keys - pass + # Store the mapping of parent to child datasets + parent_key = metadata.get("key_parent") + if parent_key not in parent_child_mapping: + parent_child_mapping[parent_key] = [] + parent_child_mapping[parent_key].append(metadata) # Primary dataset will overwrite this dataset; we could address this to support multiple primary datasets if len(primary_dataset_keys) != 1: self.dataset.finish_with_error("ZIP file contains multiple primary datasets; only one is allowed.") return - # Import datasets ( - # TODO: this is ordered due to potential issues with keys needing to be remapped, but there may be an issue with a child datasets having additional children and needing remapping... + # Import datasets while datasets: + self.halt_and_catch_fire() + + # Create the datasets metadata = datasets.pop(0) dataset_key = metadata.get("key") processed_metadata = self.process_metadata(metadata) - if dataset_key in primary_dataset_keys: - # Import primary dataset - self.dataset.update_status(f"Importing primary dataset {dataset_key}.") - - - - - - + new_dataset = self.create_dataset(processed_metadata, dataset_key, dataset_key in primary_dataset_keys) + processed_files += 1 + + # TODO: I am now noticing that we do not update the results_file; it is even more unlikely to collide as it is both a random key and label combined... but... + # Copy the log file + self.halt_and_catch_fire() + log_filename = new_dataset.get_log_path().name + if log_filename in zip_contents: + self.dataset.update_status(f"Transferring log file for dataset {new_dataset.key}") + with zip_ref.open(log_filename) as f: + with new_dataset.get_log_path().open("wb") as outfile: + outfile.write(f.read()) + processed_files += 1 + else: + self.dataset.log(f"Log file not found for dataset {new_dataset.key} (original key {dataset_key}).") + + # Copy the results + self.halt_and_catch_fire() + results_filename = new_dataset.get_results_path().name + if results_filename in zip_contents: + self.dataset.update_status(f"Transferring data file for dataset {new_dataset.key}") + with zip_ref.open(results_filename) as f: + with new_dataset.get_results_path().open("wb") as outfile: + outfile.write(f.read()) + processed_files += 1 + + if not imported: + # first dataset - use num rows as 'overall' + num_rows = metadata["num_rows"] + else: + # TODO: should I just delete the new_dataset here? 
+ self.dataset.log(f"Results file not found for dataset {new_dataset.key} (original key {dataset_key}).") + new_dataset.finish_with_error(f"Results file not found for dataset {new_dataset.key} (original key {dataset_key}).") + failed_imports.append(dataset_key) + continue + + # finally, the kids + self.halt_and_catch_fire() + if dataset_key in parent_child_mapping: + datasets.extend(parent_child_mapping[dataset_key]) + self.dataset.log(f"Adding ({len(parent_child_mapping[dataset_key])}) child datasets to import queue") + + # done - remember that we've imported this one + imported.append(new_dataset) + new_dataset.update_status(metadata["status"]) + if new_dataset.key != self.dataset.key: + # only finish if this is not the 'main' dataset, or the user + # will think the whole import is done + new_dataset.finish(metadata["num_rows"]) # Check that all files were processed - if len(zip_contents) != len(processed_files): + missed_files = [] + if len(zip_contents) != processed_files: for file in zip_contents: if file not in processed_files: missed_files.append(file) + # todo: this part needs updating if/when we support importing multiple datasets! + if failed_imports: + self.dataset.update_status(f"Dataset import finished, but not all data was imported properly. " + f"{len(failed_imports)} dataset(s) were not successfully imported. Check the " + f"dataset log file for details.", is_final=True) + elif missed_files: + self.dataset.log(f"ZIP file contained {len(missed_files)} files that were not processed: {missed_files}") + self.dataset.update_status(f"Dataset import finished, but not all files were processed. " + f"{len(missed_files)} files were not successfully imported. Check the " + f"dataset log file for details.", is_final=True) + else: + self.dataset.update_status(f"{len(imported)} dataset(s) succesfully imported.", + is_final=True) + + if not self.dataset.is_finished(): + # now all related datasets are imported, we can finish the 'main' + # dataset, and the user will be alerted that the full import is + # complete + self.dataset.finish(num_rows) + @staticmethod def process_metadata(metadata): @@ -196,6 +268,7 @@ def create_dataset(self, metadata, original_key, primary=False): Create a new dataset """ if primary: + self.dataset.update_status(f"Importing primary dataset {original_key}.") # if this is the first dataset we're importing, make it the # processor's "own" dataset. the key has already been set to # the imported dataset's key via ensure_key() (or a new unqiue @@ -209,6 +282,7 @@ def create_dataset(self, metadata, original_key, primary=False): self.db.update("datasets", where={"key": new_dataset.key}, data=metadata) else: + self.dataset.update_status(f"Importing child dataset {original_key}.") # supernumerary datasets - handle on their own # these include any children of imported datasets try: @@ -260,7 +334,7 @@ def create_dataset(self, metadata, original_key, primary=False): # update some attributes that should come from the new server, not # the old - new_dataset.creator = dataset_owner + new_dataset.creator = self.dataset_owner new_dataset.original_timestamp = new_dataset.timestamp new_dataset.imported = True new_dataset.timestamp = int(time.time()) @@ -281,12 +355,9 @@ def process_urls(self): keys = SearchImportFromFourcat.get_keys_from_urls(urls) api_key = self.parameters.get("api-key") - self.created_datasets = set() # keys of created datasets - may not be successful! 
imported = [] # successfully imported datasets failed_imports = [] # keys that failed to import - self.remapped_keys = {} # changed dataset keys num_rows = 0 # will be used later - dataset_owner = self.dataset.get_owners()[0] # at this point it has 1 owner # we can add support for multiple datasets later by removing # this part! @@ -419,7 +490,7 @@ def halt_and_catch_fire(self): for deletable in deletables: DataSet(key=deletable, db=self.db).delete() - self.dataset.finish_with_error(f"Interrupted while importing datasets from {self.base}. Cannot resume - you " + self.dataset.finish_with_error(f"Interrupted while importing datasets{' from '+self.base if self.base else ''}. Cannot resume - you " f"will need to initiate the import again.") raise ProcessorInterruptedException() @@ -488,20 +559,31 @@ def validate_query(query, request, user): :return dict: Safe query parameters """ if query.get("method") == "zip": - file = request.files.get("data_upload") - if not file: - return QueryParametersException("No file uploaded.") - - if not file.filename.endswith(".zip"): - return QueryParametersException("Uploaded file must be a ZIP file.") + filename = "" + if "option-data_upload-entries" in request.form: + # First pass sends list of files in the zip + pass + elif "option-data_upload" in request.files: + # Second pass sends the actual file + file = request.files["option-data_upload"] + if not file: + raise QueryParametersException("No file uploaded.") + + if not file.filename.endswith(".zip"): + raise QueryParametersException("Uploaded file must be a ZIP file.") + + filename = file.filename + else: + raise QueryParametersException("No file was offered for upload.") return { - "data_upload": file + "method": "zip", + "filename": filename } elif query.get("method") == "url": urls = query.get("url") if not urls: - return QueryParametersException("Provide at least one dataset URL.") + raise QueryParametersException("Provide at least one dataset URL.") urls = urls.split(",") bases = set([url.split("/results/")[0].lower() for url in urls]) @@ -509,10 +591,10 @@ def validate_query(query, request, user): if len(keys) != 1: # todo: change this to < 1 if we allow multiple datasets - return QueryParametersException("You need to provide a single URL to a 4CAT dataset to import.") + raise QueryParametersException("You need to provide a single URL to a 4CAT dataset to import.") if len(bases) != 1: - return QueryParametersException("All URLs need to point to the same 4CAT server. You can only import from " + raise QueryParametersException("All URLs need to point to the same 4CAT server. You can only import from " "one 4CAT server at a time.") base = urls[0].split("/results/")[0] From 53f76dcba32d6ea1573ea146dc255a6a700a3035 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Fri, 6 Sep 2024 14:49:07 +0200 Subject: [PATCH 04/12] ensure cleanup on failure had some weird lost datasets when debugging this --- datasources/fourcat_import/import_4cat.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/datasources/fourcat_import/import_4cat.py b/datasources/fourcat_import/import_4cat.py index a949599e7..dc9868d34 100644 --- a/datasources/fourcat_import/import_4cat.py +++ b/datasources/fourcat_import/import_4cat.py @@ -84,10 +84,22 @@ def process(self): self.created_datasets = set() # keys of created datasets - may not be successful! 
self.remapped_keys = {} # changed dataset keys self.dataset_owner = self.dataset.get_owners()[0] # at this point it has 1 owner - if self.parameters.get("method") == "zip": - self.process_zip() - else: - self.process_urls() + try: + if self.parameters.get("method") == "zip": + self.process_zip() + else: + self.process_urls() + except Exception as e: + # Catch all exceptions and finish the job with an error + # Resuming is impossible because this dataset was overwritten with the importing dataset + # halt_and_catch_fire() will clean up and delete the datasets that were created + self.interrupted = True + try: + self.halt_and_catch_fire() + except ProcessorInterruptedException: + pass + # Reraise the original exception for logging + raise e def after_create(query, dataset, request): """ From f944c71e08497a3d06d68c5dac8b4667e915de3f Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Fri, 6 Sep 2024 15:59:44 +0200 Subject: [PATCH 05/12] auto-expire export zips --- processors/conversion/export_datasets.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/processors/conversion/export_datasets.py b/processors/conversion/export_datasets.py index f45cf92f7..bd7b81289 100644 --- a/processors/conversion/export_datasets.py +++ b/processors/conversion/export_datasets.py @@ -3,10 +3,11 @@ """ import shutil import json +import datetime from backend.lib.processor import BasicProcessor from common.lib.dataset import DataSet -from common.lib.exceptions import ProcessorException, DataSetException +from common.lib.exceptions import DataSetException __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -22,7 +23,7 @@ class ExportDatasets(BasicProcessor): type = "export-datasets" # job type ID category = "Conversion" # category title = "Export Dataset and All Analyses" # title displayed in UI - description = "Creates a ZIP file containing the dataset and all analyses to be archived and uploaded to a 4CAT instance in the future" # description displayed in UI + description = "Creates a ZIP file containing the dataset and all analyses to be archived and uploaded to a 4CAT instance in the future. Automatically expires after 1 day, after which you must run again." # description displayed in UI extension = "zip" # extension of result file, used internally and in UI @classmethod @@ -96,5 +97,10 @@ def process(self): self.dataset.log(f"Failed to export datasets: {failed_exports}") shutil.copy(self.dataset.get_log_path(), results_path.joinpath("export.log")) + # set expiration date + # these datasets can be very large and are just copies of the existing datasets, so we don't need to keep them around for long + # TODO: convince people to stop using hyphens in python variables and file names... + self.dataset.__setattr__("expires-after", (datetime.datetime.now() + datetime.timedelta(days=1)).timestamp()) + # done! 
self.write_archive_and_finish(results_path, len(exported_datasets)) \ No newline at end of file From abd9b112240b49c3c8fd660a33c83d5856a9ac52 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Fri, 6 Sep 2024 16:28:21 +0200 Subject: [PATCH 06/12] nltk again --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fd06a1e55..e62f292ba 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ "lxml~=4.9.0", "markdown==3.0.1", "markdown2==2.4.2", - "nltk==3.9", + "nltk==3.9.1", "networkx~=2.8.0", "numpy>=1.19.2", "opencv-python>=4.6.0.66", From eabb1f5acf8ca5ad1e4a97fe72748710b9a6a0f9 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 19 Sep 2024 14:04:56 +0200 Subject: [PATCH 07/12] Squashed commit of the following: commit 3f2a62a124926cfeb840796f104a702878ac10e5 Author: Carsten Schnober Date: Wed Sep 18 18:18:29 2024 +0200 Update Gensim to >=4.3.3, <4.4.0 (#450) * Update Gensim to >=4.3.3, <4.4.0 * update nltk as well --------- Co-authored-by: Dale Wahl Co-authored-by: Sal Hagen commit fee2c8c08617094f28496963da282d2e2dddeab7 Merge: 3d94b666 f8e93eda Author: sal-phd-desktop Date: Wed Sep 18 18:11:19 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 3d94b666cedd0de4e0bee953cbf1d787fdc38854 Author: sal-phd-desktop Date: Wed Sep 18 18:11:04 2024 +0200 FINALLY remove 'News' from the front page, replace with 4CAT BlueSky updates and potential information about the specific server (to be set on config page) commit f8e93edabe9013a2c1229caa4c454fab09620125 Author: Stijn Peeters Date: Wed Sep 18 15:11:21 2024 +0200 Simple extensions page in Control Panel commit b5be128c7b8682fb233d962326d9118a61053165 Author: Stijn Peeters Date: Wed Sep 18 14:08:13 2024 +0200 Remove 'docs' directory commit 1e2010af44817016c274c9ec9f7f9971deb57f66 Author: Stijn Peeters Date: Wed Sep 18 14:07:38 2024 +0200 Forgot TikTok and Douyin commit c757dd51884e7ec9cf62ca1726feacab4b2283b7 Author: Stijn Peeters Date: Wed Sep 18 14:01:31 2024 +0200 Say 'zeeschuimer' instead of 'extension' to avoid confusion with 4CAT extensions commit ee7f4345478f923541536c86a5b06246deae03f6 Author: Stijn Peeters Date: Wed Sep 18 14:00:40 2024 +0200 RIP Parler data source commit 11300f2430b51887823b280405de4ded4f15ede1 Author: Stijn Peeters Date: Wed Sep 18 11:21:37 2024 +0200 Tuplestring commit 547265240eba81ca0ad270cd3c536a2b1dcf512d Author: Stijn Peeters Date: Wed Sep 18 11:15:29 2024 +0200 Pass user obj instead of str to ConfigWrapper in Processor commit b21866d7900b5d20ed6ce61ee9aff50f3c0df910 Author: Stijn Peeters Date: Tue Sep 17 17:45:01 2024 +0200 Ensure request-aware config reader in user object when using config wrapper commit bbe79e4b0fe870ccc36cab7bfe7963b28d1948e3 Author: Sal Hagen Date: Tue Sep 17 15:12:46 2024 +0200 Fix extension path walk for Windows commit d6064beaf31a6a85b0e34ed4f8126eb4c4fc07e3 Author: Stijn Peeters Date: Mon Sep 16 14:50:45 2024 +0200 Allow tags that have no users Use case: tag-based frontend differentiation using X-4CAT-Config-Via-Proxy commit b542ded6f976809ec88445e7b04f2c81b900188e Author: Stijn Peeters Date: Mon Sep 16 14:13:14 2024 +0200 Trailing slash in query results list commit a4bddae575b22a009925206a1337bdd89349e567 Author: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> Date: Mon Sep 16 13:57:23 2024 +0200 4CAT Extension - easy(ier) adding of new datasources/processors that can be mainted seperately from 4CAT base code (#451) * domain only * fix reference * try and collect links with selenium * update 
column_filter to find multiple matches * fix up the normal url_scraper datasource * ensure all selenium links are strings for join * change output of url_scraper to ndjson with map_items * missed key/index change * update web archive to use json and map to 4CAT * fix no text found * and none on scraped_links * check key first * fix up web_archive error reporting * handle None type for error * record web archive "bad request" * add wait after redirect movement * increase waittime for redirects * add processor for trackers * dict to list for addition * allow both newline and comma seperated links * attempt to scrape iframes as seperate pages * Fixes for selenium scraper to work with config database * installation of packages, geckodriver, and firefox if selenium enabled * update install instructions * fix merge error * fix dropped function * have to be kidding me * add note; setup requires docker... need to think about IF this will ever be installed without Docker * seperate selenium class into wrapper and Search class so wrapper can be used in processors! * add screenshots; add firefox extension support * update selenium definitions * regex for extracting urls from strings * screenshots processor; extract urls from text and takes screenshots * Allow producing zip files from data sources * import time * pick better default * test screenshot datasource * validate all params * fix enable extension * haha break out of while loop * count my items * whoops, len() is important here * must be getting tired... * remove redundant logging * Eager loading for screenshots, viewport options, etc * Woops, wrong folder * Fix label shortening * Just 'queue' instead of 'search queue' * Yeah, make it headless * README -> DESCRIPTION * h1 -> h2 * Actually just have no header * Use proper filename for downloaded files * Configure whether to offer pseudonymisation etc * Tweak descriptions * fix log missing data * add columns to post_topic_matrix * fix breadcrumb bug * Add top topics column * Fix selenium config install parameter (Docker uses this/manual would need to run install_selenium, well, manually) * this processor is slow; i thought it was broken long before it updated! 
* refactor detect_trackers as conversion processor not filter * add geckodriver executable to docker install * Auto-configure webdrivers if available in PATH * update screenshots to act as image-downloader and benefit from processors * fix is_compatible_with * Delete helper-scripts/migrate/migrate-1.30-1.31.py * fix embeddings is_compatible_with * fix up UI options for hashing and private * abstract was moved to lib * various fixes to selenium based datasources * processors not compatible with image datasets * update firefox extension handling * screenshots datasource fix get_options * rename screenshots processor to be detected as image dataset * add monthly and weekly frequencies to wayback machine datasource * wayback ds: fix fail if all attempts do not realize results; addion frequency options to options; add daily * add scroll down page to allow lazy loading for entire page screenshots * screenshots: adjust pause time so it can be used to force a wait for images to load I have not successfully come up with or found a way to wait for all images to load; document.readyState == 'complete' does not function in this way on certain sites including the wayback machine * hash URLs to create filenames * remove log * add setting to toggle display advanced options * add progress bars * web archive fix query validation * count subpages in progress * remove overwritten function * move http response to own column * special filenames * add timestamps to all screenshots * restart selenium on failure * new build have selenium * process urls after start (keep original query parameters) * undo default firefox * quick max * rename SeleniumScraper to SeleniumSearch todo: build SeleniumProcessor! * max number screenshots configurable * method to get url with error handling * use get_with_error_handling * d'oh, screenshot processor needs to quit selenium * update log to contain URL * Update scrolling to use Page down key if necessary * improve logs * update image_category_wall as screenshot datasource does not have category column; this is not ideal and ought to be solved in another way. Also, could I get categories from the metadata? That's... ugh. * no category, no processor * str errors * screenshots: dismiss alerts when checking ready state is complete * set screenshot timeout to 30 seconds * update gensim package * screenshots: move processor interrupt into attempts loop * if alert disappears before we can dismiss it... * selenium specific logger * do not switch window when no alert found on dismiss * extract wait for page to load to selenium class * improve descriptions of screenshot options * remove unused line * treat timeouts differently from other errors these are more likely due to an issue with the website in question * debug if requested * increase pause time * restart browser w/ PID * increase max_workers for selenium this is by individual worker class not for all selenium classes... 
so you can really crank them out if desired * quick fix restart by pid * avoid bad urls * missing bracket & attempt to fix-missing dependencies in Docker install * Allow dynamic form options in processors * Allow 'requires' on data source options as well * Handle list values with requires * basic processor for apple store; setup checks for additional requirements * fix is_4cat_class * show preview when no map_item * add google store datasource * Docker setup.py use extensions * Wider support for file upload in processors * Log file uploads in DMI service manager * add map_item methods and record more data per item need additional item data as map_item is staticmethod * update from master; merge conflicts * fix docker build context (ignore data files) * fix option requirements * apple store fix: list still tries to get query * apple & google stores fix up item mapping * missed merge error * minor fix * remove unused import * fix datasources w/ files frontend error * fix error w/ datasources having file option * better way to name docker volumes * update two other docker compose files * fix docker-compose ymls * minor bug: fix and add warning; fix no results fail * update apple field names to better match interface * update google store fieldnames and order * sneak in jinja logger if needed * fix fourcat.js handling checkboxes for dynamic settings * add new endpoint for app details to apple store * apple_store map new beta app data * add default lang/country * not all apps have advisories * revert so button works * add chart positions to beta map items * basic scheduler To-do - fix up and add options to scheduler view (e.g. delete/change) - add scheduler view to navigator - tie jobs to datasets? (either in scheduler view or, perhaps, filter dataset view) - more testing... * update scheduler view, add functions to update job interval * revert .env * working scheduler! * basic scheduler view w/ datasets * fix postgres tag * update job status in scheduled_jobs table * fix timestamp; end_date needed for last run check; add dataset label * improve scheduler view * remove dataset from scheduled_jobs table on delete * scheduler view order by last creation * scheduler views: separate scheduler list from scheduled dataset list * additional update from master fixes * apple_store map_items fix missing locales * add back depth for pagination * correct route * modify pagination to accept args * pagination fun * pagination: i hate testing on live servers... * ok ok need the pagination route * pagination: add route_args * fix up scheduler header * improve app store descriptions * add azure store * fix azure links * azure_store: add category search * azure fix type of config update timestamp OPTION_DATE does not appear correctly in settings and causes it to be written incorrectly * basic aws store * check if selenium available; get correct app_id * aws: implement pagination * add logging; wait for elements to load after next page; attempts to rework filter option collection * apple_store: handle invalid param error * fix filter_options * aws: fix filter option collection! 
* more merge * move new datasources and processors to extensions and modify setup.py and module loader to use the new locations * migrate.py to run extension "fourcat_install.py" files * formatting * remove extensions; add gitignore * excise scheduler merge * some additional cleanup from app_studies branch * allow nested datasources folders; ignore files in extensions main folder * allow extension install scripts to run pip if migrate.py has not * Remove unused URL functions we could use ural for * Take care of git commit hash tracking for extension processors * Get rid of unused path.versionfile config setting * Add extensions README * Squashed commit of the following: commit cd356f7a69d15e8ecc8efffc6d63a16368e62962 Author: Stijn Peeters Date: Sat Sep 14 17:36:18 2024 +0200 UI setting for 4CAT install ad in login commit 0945d8c0a11803a6bb411f15099d50fea25f10ab Author: Stijn Peeters Date: Sat Sep 14 17:32:55 2024 +0200 UI setting for anonymisation controls Todo: make per-datasource commit 1a2562c2f9a368dbe0fc03264fb387e44313213b Author: Stijn Peeters Date: Sat Sep 14 15:53:27 2024 +0200 Debug panel for HTTP headers in control panel commit 203314ec83fb631d985926a0b5c5c440cfaba9aa Author: Stijn Peeters Date: Sat Sep 14 15:53:17 2024 +0200 Preview for HTML datasets commit 48c20c2ebac382bd41b92da4481ff7d832dc1538 Author: Desktop Sal Date: Wed Sep 11 13:54:23 2024 +0200 Remove spacy processors (linguistic extractor, get nouns, get entities) and remove dependencies commit 657ffd75a7f48ba4537449127e5fa39debf4fdf3 Author: Dale Wahl Date: Fri Sep 6 16:29:19 2024 +0200 fix nltk where it matters commit 2ef5c80f2d1a5b5f893c8977d8394740de6d796d Author: Stijn Peeters Date: Tue Sep 3 12:05:14 2024 +0200 Actually check progress in text annotator commit 693960f41b73e39eda0c2f23eb361c18bde632cd Author: Stijn Peeters Date: Mon Sep 2 18:03:18 2024 +0200 Add processor for stormtrooper DMI service commit 6ae964aad492527bc5d016a00f870145aab6e1af Author: Stijn Peeters Date: Fri Aug 30 17:31:37 2024 +0200 Fix reference to old stopwords list in neologisms preset * Fix Github links for extensions * Fix commit detection in extensions * Fix extension detection in module loader * Follow symlinks when loading extensions Probably not uncommon to have a checked out repo somewhere to then symlink into the extensions dir * Make queue message on create page more generic * Markdown in datasource option tooltips * Remove Spacy model from requirements * Add software_source to database SQL --------- Co-authored-by: Stijn Peeters Co-authored-by: Stijn Peeters <42036349+stijn-uva@users.noreply.github.com> commit cd356f7a69d15e8ecc8efffc6d63a16368e62962 Author: Stijn Peeters Date: Sat Sep 14 17:36:18 2024 +0200 UI setting for 4CAT install ad in login commit 0945d8c0a11803a6bb411f15099d50fea25f10ab Author: Stijn Peeters Date: Sat Sep 14 17:32:55 2024 +0200 UI setting for anonymisation controls Todo: make per-datasource commit 1a2562c2f9a368dbe0fc03264fb387e44313213b Author: Stijn Peeters Date: Sat Sep 14 15:53:27 2024 +0200 Debug panel for HTTP headers in control panel commit 203314ec83fb631d985926a0b5c5c440cfaba9aa Author: Stijn Peeters Date: Sat Sep 14 15:53:17 2024 +0200 Preview for HTML datasets commit 48c20c2ebac382bd41b92da4481ff7d832dc1538 Author: Desktop Sal Date: Wed Sep 11 13:54:23 2024 +0200 Remove spacy processors (linguistic extractor, get nouns, get entities) and remove dependencies commit 657ffd75a7f48ba4537449127e5fa39debf4fdf3 Author: Dale Wahl Date: Fri Sep 6 16:29:19 2024 +0200 fix nltk where it matters --- 
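One API change buried in the squashed diff below is worth calling out here: `write_archive_and_finish()` in `backend/lib/processor.py` gains a `finish` keyword, so a processor can write its ZIP without immediately closing the dataset. A hedged sketch of the intended call pattern follows; the processor class and its attributes are invented for illustration, and only the helper names taken from this patch series are real:

```python
# Illustrative subclass only - shows the new finish=False path.
from backend.lib.processor import BasicProcessor


class ExampleArchiver(BasicProcessor):
    type = "example-archiver"  # hypothetical job type
    extension = "zip"

    def process(self):
        staging_area = self.dataset.get_staging_area()

        # ... write files into staging_area here ...

        # finish=False writes the archive but leaves the dataset open, so the
        # processor can keep logging or adjusting metadata before closing it.
        self.write_archive_and_finish(staging_area, finish=False)

        self.dataset.update_status("Archive written, wrapping up")
        self.dataset.finish(1)
```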
.dockerignore | 1 + .env | 3 +- .zenodo.json | 2 +- VERSION | 2 +- backend/database.sql | 1 + backend/lib/processor.py | 11 +- backend/lib/search.py | 31 ++- backend/lib/worker.py | 11 + common/config_manager.py | 13 +- common/lib/config_definition.py | 50 +++-- common/lib/dataset.py | 23 +- common/lib/helpers.py | 181 +++++++++++++--- common/lib/logger.py | 4 +- common/lib/module_loader.py | 54 +++-- common/lib/user.py | 109 ++++++---- datasources/douyin/search_douyin.py | 2 +- datasources/gab/search_gab.py | 2 +- datasources/imgur/search_imgur.py | 2 +- datasources/instagram/search_instagram.py | 2 +- datasources/linkedin/search_linkedin.py | 2 +- datasources/ninegag/search_9gag.py | 2 +- datasources/parler/DESCRIPTION.md | 11 - datasources/parler/__init__.py | 12 -- datasources/parler/search_parler.py | 66 ------ datasources/tiktok/search_tiktok.py | 2 +- .../tiktok_comments/search_tiktok_comments.py | 2 +- datasources/truth/search_truth.py | 2 +- datasources/twitter-import/search_twitter.py | 2 +- docker-compose_build.yml | 8 - docker/Dockerfile | 1 + docs/conf.py | 62 ------ docs/datasource.rst | 73 ------- docs/index.rst | 20 -- docs/introduction.rst | 5 - docs/processor.rst | 63 ------ docs/requirements.txt | 1 - docs/worker.rst | 14 -- extensions/.gitignore | 5 + extensions/README.md | 39 ++++ helper-scripts/migrate.py | 47 ++++- helper-scripts/migrate/migrate-1.45-1.46.py | 33 +++ processors/filtering/column_filter.py | 6 +- processors/metrics/rank_attribute.py | 11 +- processors/networks/wikipedia_network.py | 7 +- processors/presets/neologisms.py | 21 +- processors/text-analysis/get_entities.py | 172 --------------- processors/text-analysis/get_nouns.py | 196 ------------------ .../text-analysis/linguistic_extractor.py | 168 --------------- processors/text-analysis/split_sentences.py | 5 +- processors/text-analysis/tokenise.py | 5 +- processors/visualisation/download_videos.py | 6 +- .../visualisation/image_category_wall.py | 10 +- processors/visualisation/word-trees.py | 17 +- setup.py | 36 ++-- webtool/__init__.py | 3 +- webtool/lib/helpers.py | 3 +- webtool/lib/template_filters.py | 19 +- webtool/static/js/fourcat.js | 10 +- webtool/templates/account/login.html | 2 + .../components/datasource-option.html | 2 +- webtool/templates/components/pagination.html | 6 +- .../templates/components/result-child.html | 2 +- .../templates/components/result-details.html | 4 +- .../components/result-result-row.html | 7 +- .../controlpanel/extensions-list.html | 55 +++++ webtool/templates/controlpanel/layout.html | 4 +- webtool/templates/controlpanel/logs.html | 7 + webtool/templates/create-dataset.html | 6 +- webtool/templates/data-overview.html | 2 +- webtool/templates/frontpage.html | 27 +-- webtool/templates/preview/csv.html | 2 +- webtool/templates/preview/html.html | 1 + webtool/views/api_tool.py | 16 +- webtool/views/views_admin.py | 3 +- webtool/views/views_dataset.py | 5 + webtool/views/views_extensions.py | 28 +++ webtool/views/views_misc.py | 4 +- 77 files changed, 731 insertions(+), 1123 deletions(-) delete mode 100644 datasources/parler/DESCRIPTION.md delete mode 100644 datasources/parler/__init__.py delete mode 100644 datasources/parler/search_parler.py delete mode 100644 docs/conf.py delete mode 100644 docs/datasource.rst delete mode 100644 docs/index.rst delete mode 100644 docs/introduction.rst delete mode 100644 docs/processor.rst delete mode 100644 docs/requirements.txt delete mode 100644 docs/worker.rst create mode 100644 extensions/.gitignore create mode 100644 
extensions/README.md create mode 100644 helper-scripts/migrate/migrate-1.45-1.46.py delete mode 100644 processors/text-analysis/get_entities.py delete mode 100644 processors/text-analysis/get_nouns.py delete mode 100644 processors/text-analysis/linguistic_extractor.py create mode 100644 webtool/templates/controlpanel/extensions-list.html create mode 100644 webtool/templates/preview/html.html create mode 100644 webtool/views/views_extensions.py diff --git a/.dockerignore b/.dockerignore index 5d1d149e0..558da504b 100644 --- a/.dockerignore +++ b/.dockerignore @@ -2,3 +2,4 @@ data/ .github/ .ipynb_checkpoints/ .gitignore +.idea/ diff --git a/.env b/.env index 69a217df0..d03f9c703 100644 --- a/.env +++ b/.env @@ -30,7 +30,7 @@ TELEGRAM_PORT=443 # Docker Volume Names DOCKER_DB_VOL=4cat_4cat_db DOCKER_DATA_VOL=4cat_4cat_data -DOCKER_CONFIG_VOL=4cat_4cat_share +DOCKER_CONFIG_VOL=4cat_4cat_config DOCKER_LOGS_VOL=4cat_4cat_logs # Gunicorn settings @@ -39,4 +39,3 @@ workers=4 threads=4 worker_class=gthread log_level=debug - diff --git a/.zenodo.json b/.zenodo.json index 3ab05ca45..fd261019f 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -3,7 +3,7 @@ "license": "MPL-2.0", "title": "4CAT Capture and Analysis Toolkit", "upload_type": "software", - "version": "v1.45", + "version": "v1.46", "keywords": [ "webmining", "scraping", diff --git a/VERSION b/VERSION index 6245ec1a2..fa2cb2583 100644 --- a/VERSION +++ b/VERSION @@ -1,4 +1,4 @@ -1.45 +1.46 This file should not be modified. It is used by 4CAT to determine whether it needs to run migration scripts to e.g. update the database structure to a more diff --git a/backend/database.sql b/backend/database.sql index 33f0ea393..1f372a697 100644 --- a/backend/database.sql +++ b/backend/database.sql @@ -56,6 +56,7 @@ CREATE TABLE IF NOT EXISTS datasets ( is_private boolean DEFAULT TRUE, software_version text, software_file text DEFAULT '', + software_source text DEFAULT '', annotation_fields text DEFAULT '' ); diff --git a/backend/lib/processor.py b/backend/lib/processor.py index c67fa7a9d..339b112b4 100644 --- a/backend/lib/processor.py +++ b/backend/lib/processor.py @@ -20,6 +20,7 @@ from common.lib.exceptions import (WorkerInterruptedException, ProcessorInterruptedException, ProcessorException, DataSetException, MapItemException) from common.config_manager import config, ConfigWrapper +from common.lib.user import User csv.field_size_limit(1024 * 1024 * 1024) @@ -112,7 +113,7 @@ def work(self): # creator. This ensures that if a value has been overriden for the owner, # the overridden value is used instead. 
config.with_db(self.db) - self.config = ConfigWrapper(config=config, user=self.owner) + self.config = ConfigWrapper(config=config, user=User.get_by_name(self.db, self.owner)) if self.dataset.data.get("key_parent", None): # search workers never have parents (for now), so we don't need to @@ -164,7 +165,7 @@ def work(self): # start log file self.dataset.update_status("Processing data") - self.dataset.update_version(get_software_commit()) + self.dataset.update_version(get_software_commit(self)) # get parameters # if possible, fill defaults where parameters are not provided @@ -628,7 +629,7 @@ def write_csv_items_and_finish(self, data): self.dataset.update_status("Finished") self.dataset.finish(len(data)) - def write_archive_and_finish(self, files, num_items=None, compression=zipfile.ZIP_STORED): + def write_archive_and_finish(self, files, num_items=None, compression=zipfile.ZIP_STORED, finish=True): """ Archive a bunch of files into a zip archive and finish processing @@ -639,6 +640,7 @@ def write_archive_and_finish(self, files, num_items=None, compression=zipfile.ZI files added to the archive will be used. :param int compression: Type of compression to use. By default, files are not compressed, to speed up unarchiving. + :param bool finish: Finish the dataset/job afterwards or not? """ is_folder = False if issubclass(type(files), PurePath): @@ -665,7 +667,8 @@ def write_archive_and_finish(self, files, num_items=None, compression=zipfile.ZI if num_items is None: num_items = done - self.dataset.finish(num_items) + if finish: + self.dataset.finish(num_items) def create_standalone(self): """ diff --git a/backend/lib/search.py b/backend/lib/search.py index cdcd08115..15b3982d6 100644 --- a/backend/lib/search.py +++ b/backend/lib/search.py @@ -1,16 +1,16 @@ import hashlib +import zipfile import secrets -import shutil import random import json import math import csv +import os from pathlib import Path from abc import ABC, abstractmethod from common.config_manager import config -from common.lib.dataset import DataSet from backend.lib.processor import BasicProcessor from common.lib.helpers import strip_tags, dict_search_and_update, remove_nuls, HashCache from common.lib.exceptions import WorkerInterruptedException, ProcessorInterruptedException, MapItemException @@ -71,7 +71,6 @@ def process(self): items = self.import_from_file(query_parameters.get("file")) else: items = self.search(query_parameters) - except WorkerInterruptedException: raise ProcessorInterruptedException("Interrupted while collecting data, trying again later.") @@ -79,10 +78,12 @@ def process(self): num_items = 0 if items: self.dataset.update_status("Writing collected data to dataset file") - if results_file.suffix == ".ndjson": - num_items = self.items_to_ndjson(items, results_file) - elif results_file.suffix == ".csv": + if self.extension == "csv": num_items = self.items_to_csv(items, results_file) + elif self.extension == "ndjson": + num_items = self.items_to_ndjson(items, results_file) + elif self.extension == "zip": + num_items = self.items_to_archive(items, results_file) else: raise NotImplementedError("Datasource query cannot be saved as %s file" % results_file.suffix) @@ -361,6 +362,22 @@ def items_to_ndjson(self, items, filepath): return processed + def items_to_archive(self, items, filepath): + """ + Save retrieved items as an archive + + Assumes that items is an iterable with one item, a Path object + referring to a folder containing files to be archived. The folder will + be removed afterwards. 
+ + :param items: + :param filepath: Where to store the archive + :return int: Number of items + """ + num_items = len(os.listdir(items)) + self.write_archive_and_finish(items, None, zipfile.ZIP_STORED, False) + return num_items + class SearchWithScope(Search, ABC): """ @@ -404,7 +421,7 @@ def search(self, query): # proportion of items matches # first, get amount of items for all threads in which matching # items occur and that are long enough - thread_ids = tuple([post["thread_id"] for post in items]) + thread_ids = tuple([item["thread_id"] for item in items]) self.dataset.update_status("Retrieving thread metadata for %i threads" % len(thread_ids)) try: min_length = int(query.get("scope_length", 30)) diff --git a/backend/lib/worker.py b/backend/lib/worker.py index 3fe19e067..a5695e673 100644 --- a/backend/lib/worker.py +++ b/backend/lib/worker.py @@ -133,6 +133,17 @@ def run(self): location = "->".join(frames) self.log.error("Worker %s raised exception %s and will abort: %s at %s" % (self.type, e.__class__.__name__, str(e), location)) + # Clean up after work successfully completed or terminates + self.clean_up() + + def clean_up(self): + """ + Clean up after a processor runs successfully or results in error. + Workers should override this method to implement any procedures + to run to clean up a worker; by default this does nothing. + """ + pass + def abort(self): """ Called when the application shuts down diff --git a/common/config_manager.py b/common/config_manager.py index 40bce67a6..eb6c846d0 100644 --- a/common/config_manager.py +++ b/common/config_manager.py @@ -44,9 +44,9 @@ def with_db(self, db=None): # Replace w/ db if provided else only initialise if not already self.db = db if db else Database(logger=None, dbname=self.get("DB_NAME"), user=self.get("DB_USER"), password=self.get("DB_PASSWORD"), host=self.get("DB_HOST"), - port=self.get("DB_PORT"), appname="config-reader") if not db else db + port=self.get("DB_PORT"), appname="config-reader") else: - # self.db already initialized + # self.db already initialized and no db provided pass def load_user_settings(self): @@ -170,11 +170,6 @@ def ensure_database(self): known_tags = [t["tag"] for t in self.db.fetchall("SELECT DISTINCT tag FROM settings")] tag_order = self.get("flask.tag_order") - for tag in tag_order: - # don't include tags not used by users in the tag order - if tag not in user_tags: - tag_order.remove(tag) - for tag in known_tags: # add tags used by a setting to tag order if tag and tag not in tag_order: @@ -442,6 +437,10 @@ def __init__(self, config, user=None, tags=None, request=None): self.tags = tags self.request = request + # this ensures the user object in turn reads from the wrapper + if self.user: + self.user.with_config(self) + def set(self, *args, **kwargs): """ diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index c9601f78c..a4fca2dcd 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -56,6 +56,13 @@ "software, and a 'powered by 4CAT' notice may also show up in the web interface regardless of the " "value entered here." }, + "4cat.about_this_server": { + "type": UserInput.OPTION_TEXT_LARGE, + "default": "", + "help": "Server information", + "tooltip": "Custom server information that is displayed on the 'About' page. Can for instance be used to show " + "information about who maintains the tool or what its intended purpose is." 
+ }, "4cat.crash_message": { "type": UserInput.OPTION_TEXT_LARGE, "default": "This processor has crashed; the crash has been logged. 4CAT will try again when it is restarted. " @@ -140,7 +147,7 @@ "type": UserInput.OPTION_TOGGLE, "default": False, "help": "Can restart/upgrade", - "tooltip": "Controls whether users can restart and upgrade 4CAT via the Control Panel" + "tooltip": "Controls whether users can restart, upgrade, and manage extensions for 4CAT via the Control Panel" }, "privileges.can_upgrade_to_dev": { # this is NOT an admin privilege, because all admins automatically @@ -165,20 +172,10 @@ "help": "Can view worker status", "tooltip": "Controls whether users can view worker status via the Control Panel" }, - # The following two options should be set to ensure that every analysis step can + # The following option should be set to ensure that every analysis step can # be traced to a specific version of 4CAT. This allows for reproducible - # research. You can however leave them empty with no ill effect. The version ID - # should be a commit hash, which will be combined with the Github URL to offer - # links to the exact version of 4CAT code that produced an analysis result. - # If no version file is available, the output of "git show" in PATH_ROOT will be used - # to determine the version, if possible. - "path.versionfile": { - "type": UserInput.OPTION_TEXT, - "default": ".git-checked-out", - "help": "Version file", - "tooltip": "Path to file containing GitHub commit hash. File containing a commit ID (everything after the first whitespace found is ignored)", - "global": True - }, + # research. The output of "git show" in PATH_ROOT will be used to determine + # the version of a processor file, if possible. "4cat.github_url": { "type": UserInput.OPTION_TEXT, "default": "https://github.com/digitalmethodsinitiative/4cat", @@ -479,6 +476,19 @@ "default": False, "tooltip": "Show main dataset preview directly on dataset pages, instead of behind a 'preview' button" }, + "ui.offer_anonymisation": { + "type": UserInput.OPTION_TOGGLE, + "help": "Offer anonymisation options", + "default": True, + "tooltip": "Offer users the option to anonymise their datasets at the time of creation. It is strongly " + "recommended to leave this enabled." + }, + "ui.advertise_install": { + "type": UserInput.OPTION_TOGGLE, + "help": "Advertise local 4CAT", + "default": True, + "tooltip": "In the login form, remind users of the possibility to install their own 4CAT server." + }, "ui.show_datasource": { "type": UserInput.OPTION_TOGGLE, "help": "Show data source", @@ -503,6 +513,18 @@ "tooltip": "If a dataset is a JSON file but it can be mapped to a CSV file, show the CSV in the preview instead" "of the underlying JSON." }, + "ui.offer_hashing": { + "type": UserInput.OPTION_TOGGLE, + "default": True, + "help": "Offer pseudonymisation", + "tooltip": "Add a checkbox to the 'create dataset' form to allow users to toggle pseudonymisation." + }, + "ui.offer_private": { + "type": UserInput.OPTION_TOGGLE, + "default": True, + "help": "Offer create as private", + "tooltip": "Add a checkbox to the 'create dataset' form to allow users to make a dataset private."
+ }, "ui.option_email": { "type": UserInput.OPTION_CHOICE, "options": { diff --git a/common/lib/dataset.py b/common/lib/dataset.py index 56ea7d463..5a23afb7b 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -114,6 +114,9 @@ def __init__(self, parameters=None, key=None, job=None, data=None, db=None, pare self.parameters = json.loads(self.data["parameters"]) self.is_new = False else: + self.data = {"type": type} # get_own_processor needs this + own_processor = self.get_own_processor() + version = get_software_commit(own_processor) self.data = { "key": self.key, "query": self.get_label(parameters, default=type), @@ -125,7 +128,8 @@ def __init__(self, parameters=None, key=None, job=None, data=None, db=None, pare "timestamp": int(time.time()), "is_finished": False, "is_private": is_private, - "software_version": get_software_commit(), + "software_version": version[0], + "software_source": version[1], "software_file": "", "num_rows": 0, "progress": 0.0, @@ -139,7 +143,6 @@ def __init__(self, parameters=None, key=None, job=None, data=None, db=None, pare # Find desired extension from processor if not explicitly set if extension is None: - own_processor = self.get_own_processor() if own_processor: extension = own_processor.get_extension(parent_dataset=DataSet(key=parent, db=db) if parent else None) # Still no extension, default to 'csv' @@ -865,10 +868,12 @@ def get_label(self, parameters=None, default="Query"): elif parameters.get("subject_match") and parameters["subject_match"] != "empty": return parameters["subject_match"] elif parameters.get("query"): - label = parameters["query"] if len(parameters["query"]) < 30 else parameters["query"][:25] + "..." + label = parameters["query"] # Some legacy datasets have lists as query data if isinstance(label, list): label = ", ".join(label) + + label = label if len(label) < 30 else label[:25] + "..." 
label = label.strip().replace("\n", ", ") return label elif parameters.get("country_flag") and parameters["country_flag"] != "all": @@ -1116,7 +1121,8 @@ def update_version(self, version): processor_path = "" updated = self.db.update("datasets", where={"key": self.data["key"]}, data={ - "software_version": version, + "software_version": version[0], + "software_source": version[1], "software_file": processor_path }) @@ -1151,10 +1157,15 @@ def get_version_url(self, file): :param file: File to link within the repository :return: URL, or an empty string """ - if not self.data["software_version"] or not config.get("4cat.github_url"): + if not self.data["software_source"]: return "" - return config.get("4cat.github_url") + "/blob/" + self.data["software_version"] + self.data.get("software_file", "") + filepath = self.data.get("software_file", "") + if filepath.startswith("/extensions/"): + # go to root of extension + filepath = "/" + "/".join(filepath.split("/")[3:]) + + return self.data["software_source"] + "/blob/" + self.data["software_version"] + filepath def top_parent(self): """ diff --git a/common/lib/helpers.py b/common/lib/helpers.py index f6767c929..2911044f5 100644 --- a/common/lib/helpers.py +++ b/common/lib/helpers.py @@ -1,6 +1,7 @@ """ Miscellaneous helper functions for the 4CAT backend """ +import hashlib import subprocess import requests import datetime @@ -16,9 +17,10 @@ import os import io +from pathlib import Path from collections.abc import MutableMapping from html.parser import HTMLParser -from pathlib import Path +from urllib.parse import urlparse, urlunparse from calendar import monthrange from packaging import version @@ -40,7 +42,6 @@ def init_datasource(database, logger, queue, name): """ pass - def strip_tags(html, convert_newlines=True): """ Strip HTML from a string @@ -120,12 +121,9 @@ def get_git_branch(): return "" -def get_software_commit(): +def get_software_commit(worker=None): """ - Get current 4CAT commit hash - - Reads a given version file and returns the first string found in there - (up until the first space). On failure, return an empty string. + Get current 4CAT git commit hash Use `get_software_version()` instead if you need the release version number rather than the precise commit hash. @@ -134,34 +132,58 @@ def get_software_commit(): repository in the 4CAT root folder, and if so, what commit is currently checked out in it. - :return str: 4CAT git commit hash - """ - versionpath = config.get('PATH_ROOT').joinpath(config.get('path.versionfile')) + For extensions, get the repository information for that extension, or if + the extension is not a git repository, return empty data. - if versionpath.exists() and not versionpath.is_file(): - return "" + :param BasicWorker processor: Worker to get commit for. If not given, get + version information for the main 4CAT installation. 
- if not versionpath.exists(): - # try git command line within the 4CAT root folder - # if it is a checked-out git repository, it will tell us the hash of - # the currently checked-out commit - try: - cwd = os.getcwd() - os.chdir(config.get('PATH_ROOT')) - show = subprocess.run(["git", "show"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) - os.chdir(cwd) - if show.returncode != 0: - raise ValueError() - return show.stdout.decode("utf-8").split("\n")[0].split(" ")[1] - except (subprocess.SubprocessError, IndexError, TypeError, ValueError, FileNotFoundError): - return "" + :return tuple: 4CAT git commit hash, repository name + """ + # try git command line within the 4CAT root folder + # if it is a checked-out git repository, it will tell us the hash of + # the currently checked-out commit + cwd = os.getcwd() + # path has no Path.relative()... + relative_filepath = Path(re.sub(r"^[/\\]+", "", worker.filepath)).parent try: - with open(versionpath, "r", encoding="utf-8", errors="ignore") as versionfile: - version = versionfile.readline().split(" ")[0] - return version - except OSError: - return "" + # if extension, go to the extension file's path + # we will run git here - if it is not its own repository, we have no + # useful version info (since the extension is by definition not in the + # main 4CAT repository) and will return an empty value + if worker and worker.is_extension: + extension_dir = config.get("PATH_ROOT").joinpath(relative_filepath) + os.chdir(extension_dir) + # check if we are in the extensions' own repo or 4CAT's + repo_level = subprocess.run(["git", "rev-parse", "--show-toplevel"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + if Path(repo_level.stdout.decode("utf-8")) == config.get("PATH_ROOT"): + # not its own repository + return ("", "") + + else: + os.chdir(config.get("PATH_ROOT")) + + show = subprocess.run(["git", "show"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + if show.returncode != 0: + raise ValueError() + commit = show.stdout.decode("utf-8").split("\n")[0].split(" ")[1] + + # now get the repository the commit belongs to, if we can + origin = subprocess.run(["git", "config", "--get", "remote.origin.url"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + if origin.returncode != 0 or not origin.stdout: + raise ValueError() + repository = origin.stdout.decode("utf-8").strip() + if repository.endswith(".git"): + repository = repository[:-4] + + except (subprocess.SubprocessError, IndexError, TypeError, ValueError, FileNotFoundError) as e: + return ("", "") + + finally: + os.chdir(cwd) + + return (commit, repository) def get_software_version(): """ @@ -174,7 +196,7 @@ def get_software_version(): :return str: Software version, for example `1.37`. """ - current_version_file = Path(config.get("PATH_ROOT"), "config/.current-version") + current_version_file = config.get("PATH_ROOT").joinpath("config/.current-version") if not current_version_file.exists(): return "" @@ -228,6 +250,70 @@ def get_ffmpeg_version(ffmpeg_path): return version.parse(ffmpeg_version) +def find_extensions(): + """ + Find 4CAT extensions and load their metadata + + Looks for subfolders of the extension folder, and loads additional metadata + where available. 
+ + :return tuple: A tuple with two items; the extensions, as an ID -> metadata + dictionary, and a list of (str) errors encountered while loading + """ + extension_path = config.get("PATH_ROOT").joinpath("extensions") + errors = [] + if not extension_path.exists() or not extension_path.is_dir(): + return [], None + + # each folder in the extensions folder is an extension + extensions = { + extension.name: { + "name": extension.name, + "version": "", + "url": "", + "git_url": "", + "is_git": False + } for extension in sorted(os.scandir(extension_path), key=lambda x: x.name) if extension.is_dir() + } + + # collect metadata for extensions + allowed_metadata_keys = ("name", "version", "url") + cwd = os.getcwd() + for extension in extensions: + extension_folder = extension_path.joinpath(extension) + metadata_file = extension_folder.joinpath("metadata.json") + if metadata_file.exists(): + with metadata_file.open() as infile: + try: + metadata = json.load(infile) + extensions[extension].update({k: metadata[k] for k in metadata if k in allowed_metadata_keys}) + except (TypeError, ValueError) as e: + errors.append(f"Error reading metadata file for extension '{extension}' ({e})") + continue + + extensions[extension]["is_git"] = extension_folder.joinpath(".git/HEAD").exists() + if extensions[extension]["is_git"]: + # try to get remote URL + try: + os.chdir(extension_folder) + origin = subprocess.run(["git", "config", "--get", "remote.origin.url"], stderr=subprocess.PIPE, + stdout=subprocess.PIPE) + if origin.returncode != 0 or not origin.stdout: + raise ValueError() + repository = origin.stdout.decode("utf-8").strip() + if repository.endswith(".git") and "github.com" in repository: + # use repo URL + repository = repository[:-4] + extensions[extension]["git_url"] = repository + except (subprocess.SubprocessError, IndexError, TypeError, ValueError, FileNotFoundError) as e: + print(e) + pass + finally: + os.chdir(cwd) + + return extensions, errors + + def convert_to_int(value, default=0): """ Convert a value to an integer, with a fallback @@ -887,6 +973,37 @@ def _sets_to_lists_gen(d): return dict(_sets_to_lists_gen(d)) + +def url_to_hash(url, remove_scheme=True, remove_www=True): + """ + Convert a URL to a filename; some URLs are too long to be used as filenames, this keeps the domain and hashes the + rest of the URL. 
+ """ + parsed_url = urlparse(url.lower()) + if parsed_url: + if remove_scheme: + parsed_url = parsed_url._replace(scheme="") + if remove_www: + netloc = re.sub(r"^www\.", "", parsed_url.netloc) + parsed_url = parsed_url._replace(netloc=netloc) + + url = re.sub(r"[^0-9a-z]+", "_", urlunparse(parsed_url).strip("/")) + else: + # Unable to parse URL; use regex + if remove_scheme: + url = re.sub(r"^https?://", "", url) + if remove_www: + if not remove_scheme: + scheme = re.match(r"^https?://", url).group() + temp_url = re.sub(r"^https?://", "", url) + url = scheme + re.sub(r"^www\.", "", temp_url) + else: + url = re.sub(r"^www\.", "", url) + + url = re.sub(r"[^0-9a-z]+", "_", url.lower().strip("/")) + + return hashlib.blake2b(url.encode("utf-8"), digest_size=24).hexdigest() + def folder_size(path='.'): """ Get the size of a folder using os.scandir for efficiency diff --git a/common/lib/logger.py b/common/lib/logger.py index c1a015ca6..bbd30c444 100644 --- a/common/lib/logger.py +++ b/common/lib/logger.py @@ -163,7 +163,7 @@ class Logger: } alert_level = "FATAL" - def __init__(self, output=False, filename='4cat.log', log_level="INFO"): + def __init__(self, logger_name='4cat-backend', output=False, filename='4cat.log', log_level="INFO"): """ Set up log handler @@ -181,7 +181,7 @@ def __init__(self, output=False, filename='4cat.log', log_level="INFO"): self.log_path = log_folder.joinpath(filename) self.previous_report = time.time() - self.logger = logging.getLogger("4cat-backend") + self.logger = logging.getLogger(logger_name) self.logger.setLevel(log_level) # this handler manages the text log files diff --git a/common/lib/module_loader.py b/common/lib/module_loader.py index 84e5d951e..b555801ec 100644 --- a/common/lib/module_loader.py +++ b/common/lib/module_loader.py @@ -7,6 +7,7 @@ import pickle import sys import re +import os from common.config_manager import config @@ -69,14 +70,11 @@ def is_4cat_class(object, only_processors=False): """ Determine if a module member is a worker class we can use """ - # it would be super cool to just use issubclass() here! - # but that requires importing the classes themselves, which leads to - # circular imports if inspect.isclass(object): if object.__name__ in("BasicProcessor", "BasicWorker") or inspect.isabstract(object): # ignore abstract and base classes return False - + if hasattr(object, "is_4cat_class"): if only_processors: if hasattr(object, "is_4cat_processor"): @@ -85,7 +83,7 @@ def is_4cat_class(object, only_processors=False): return False else: return object.is_4cat_class() - + return False def load_modules(self): @@ -99,14 +97,19 @@ def load_modules(self): """ # look for workers and processors in pre-defined folders and datasources - paths = [Path(config.get('PATH_ROOT'), "processors"), Path(config.get('PATH_ROOT'), "backend", "workers"), - *[self.datasources[datasource]["path"] for datasource in self.datasources]] + extension_path = Path(config.get('PATH_ROOT'), "extensions") + + paths = [Path(config.get('PATH_ROOT'), "processors"), + Path(config.get('PATH_ROOT'), "backend", "workers"), + extension_path, + *[self.datasources[datasource]["path"] for datasource in self.datasources]] # extension datasources will be here and the above line... 
root_match = re.compile(r"^%s" % re.escape(str(config.get('PATH_ROOT')))) root_path = Path(config.get('PATH_ROOT')) for folder in paths: # loop through folders, and files in those folders, recursively + is_extension = extension_path in folder.parents or folder == extension_path for file in folder.rglob("*.py"): # determine module name for file # reduce path to be relative to 4CAT root @@ -147,6 +150,7 @@ def load_modules(self): self.workers[component[1].type] = component[1] self.workers[component[1].type].filepath = relative_path + self.workers[component[1].type].is_extension = is_extension # we can't use issubclass() because for that we would need # to import BasicProcessor, which would lead to a circular @@ -169,8 +173,7 @@ def load_modules(self): for missing_module, processor_list in self.missing_modules.items(): warning += "\t%s (for %s)\n" % (missing_module, ", ".join(processor_list)) - self.log_buffer = warning - + self.log_buffer += warning self.processors = categorised_processors @@ -183,19 +186,21 @@ def load_datasources(self): `DATASOURCE` constant. The latter is taken as the ID for this datasource. """ - for subdirectory in Path(config.get('PATH_ROOT'), "datasources").iterdir(): - # folder name, also the name used in config.py - folder_name = subdirectory.parts[-1] - - # determine module name - module_name = "datasources." + folder_name + def _load_datasource(subdirectory): + """ + Load a single datasource + """ + # determine module name (path relative to 4CAT root w/ periods) + module_name = ".".join(subdirectory.relative_to(Path(config.get("PATH_ROOT"))).parts) try: datasource = importlib.import_module(module_name) except ImportError as e: - continue + self.log_buffer += "Could not import %s: %s\n" % (module_name, e) + return if not hasattr(datasource, "init_datasource") or not hasattr(datasource, "DATASOURCE"): - continue + self.log_buffer += "Could not load datasource %s: missing init_datasource or DATASOURCE\n" % subdirectory + return datasource_id = datasource.DATASOURCE @@ -208,6 +213,19 @@ def load_datasources(self): "config": {} if not hasattr(datasource, "config") else datasource.config } + # Load 4CAT core datasources + for subdirectory in Path(config.get('PATH_ROOT'), "datasources").iterdir(): + if subdirectory.is_dir(): + _load_datasource(subdirectory) + + # Load extension datasources + # os.walk is used to allow for the possibility of multiple extensions, with nested "datasources" folders + for root, dirs, files in os.walk(Path(config.get('PATH_ROOT'), "extensions"), followlinks=True): + if "datasources" in dirs: + for subdirectory in Path(root, "datasources").iterdir(): + if subdirectory.is_dir(): + _load_datasource(subdirectory) + sorted_datasources = {datasource_id: self.datasources[datasource_id] for datasource_id in sorted(self.datasources, key=lambda id: self.datasources[id]["name"])} self.datasources = sorted_datasources @@ -225,7 +243,7 @@ def expand_datasources(self): self.datasources[datasource_id]["has_worker"] = bool(worker) self.datasources[datasource_id]["has_options"] = self.datasources[datasource_id]["has_worker"] and \ bool(self.workers["%s-search" % datasource_id].get_options()) - self.datasources[datasource_id]["importable"] = worker and hasattr(worker, "is_from_extension") and worker.is_from_extension + self.datasources[datasource_id]["importable"] = worker and hasattr(worker, "is_from_zeeschuimer") and worker.is_from_zeeschuimer def load_worker_class(self, worker): """ diff --git a/common/lib/user.py b/common/lib/user.py index 
2c9788869..2722d7574 100644 --- a/common/lib/user.py +++ b/common/lib/user.py @@ -14,7 +14,7 @@ from email.mime.text import MIMEText from common.lib.helpers import send_email from common.lib.exceptions import DataSetException -from common.config_manager import config +from common.config_manager import config as global_config class User: @@ -28,12 +28,13 @@ class User: is_authenticated = False is_active = False is_anonymous = True + config = None db = None name = "anonymous" @staticmethod - def get_by_login(db, name, password): + def get_by_login(db, name, password, config=None): """ Get user object, if login is correct @@ -43,6 +44,8 @@ def get_by_login(db, name, password): :param db: Database connection object :param name: User name :param password: User password + :param config: Configuration manager. Can be used for request-aware user objects using ConfigWrapper. Empty to + use a global configuration manager. :return: User object, or `None` if login was invalid """ user = db.fetchone("SELECT * FROM users WHERE name = %s", (name,)) @@ -54,30 +57,34 @@ def get_by_login(db, name, password): return None else: # valid login! - return User(db, user, authenticated=True) + return User(db, user, authenticated=True, config=config) @staticmethod - def get_by_name(db, name): + def get_by_name(db, name, config=None): """ Get user object for given user name :param db: Database connection object :param str name: Username to get object for + :param config: Configuration manager. Can be used for request-aware user objects using ConfigWrapper. Empty to + use a global configuration manager. :return: User object, or `None` for invalid user name """ user = db.fetchone("SELECT * FROM users WHERE name = %s", (name,)) if not user: return None else: - return User(db, user) + return User(db, user, config=config) @staticmethod - def get_by_token(db, token): + def get_by_token(db, token, config=None): """ Get user object for given token, if token is valid :param db: Database connection object :param str token: Token to get object for + :param config: Configuration manager. Can be used for request-aware user objects using ConfigWrapper. Empty to + use a global configuration manager. :return: User object, or `None` for invalid token """ user = db.fetchone( @@ -86,36 +93,9 @@ def get_by_token(db, token): if not user: return None else: - return User(db, user) + return User(db, user, config=config) - def can_access_dataset(self, dataset, role=None): - """ - Check if this user should be able to access a given dataset. - - This depends mostly on the dataset's owner, which should match the - user if the dataset is private. If the dataset is not private, or - if the user is an admin or the dataset is private but assigned to - an anonymous user, the dataset can be accessed. 
- - :param dataset: The dataset to check access to - :return bool: - """ - if not dataset.is_private: - return True - - elif self.is_admin: - return True - - elif dataset.is_accessible_by(self, role=role): - return True - - elif dataset.get_owners == ("anonymous",): - return True - - else: - return False - - def __init__(self, db, data, authenticated=False): + def __init__(self, db, data, authenticated=False, config=None): """ Instantiate user object @@ -127,6 +107,9 @@ def __init__(self, db, data, authenticated=False): """ self.db = db self.data = data + + self.config = config if config else global_config + try: self.userdata = json.loads(self.data["userdata"]) except (TypeError, json.JSONDecodeError): @@ -170,7 +153,7 @@ def get_name(self): if self.data["name"] == "anonymous": return "Anonymous" elif self.data["name"] == "autologin": - return config.get("flask.autologin.name") + return self.config.get("flask.autologin.name") else: return self.data["name"] @@ -184,6 +167,21 @@ def get_token(self): """ return self.generate_token(regenerate=False) + def with_config(self, config): + """ + Connect user to configuration manager + + By default, the user object reads from the global configuration + manager. For frontend operations it may be desireable to use a + request-aware configuration manager, but this is only available after + the user has been instantiated. This method can thus be used to connect + the user to that config manager later when it is available. + + :param config: Configuration manager object + :return: + """ + self.config = config + def clear_token(self): """ Reset password rest token @@ -195,6 +193,33 @@ def clear_token(self): """ self.db.update("users", data={"register_token": "", "timestamp_token": 0}, where={"name": self.get_id()}) + def can_access_dataset(self, dataset, role=None): + """ + Check if this user should be able to access a given dataset. + + This depends mostly on the dataset's owner, which should match the + user if the dataset is private. If the dataset is not private, or + if the user is an admin or the dataset is private but assigned to + an anonymous user, the dataset can be accessed. + + :param dataset: The dataset to check access to + :return bool: + """ + if not dataset.is_private: + return True + + elif self.is_admin: + return True + + elif dataset.is_accessible_by(self, role=role): + return True + + elif dataset.get_owners == ("anonymous",): + return True + + else: + return False + @property def is_special(self): """ @@ -246,7 +271,7 @@ def email_token(self, new=False): account? :return str: Link for the user to set their password with """ - if not config.get('mail.server'): + if not self.config.get('mail.server'): raise RuntimeError("No e-mail server configured. 4CAT cannot send any e-mails.") if self.is_special: @@ -258,14 +283,14 @@ def email_token(self, new=False): register_token = self.generate_token(regenerate=True) # prepare welcome e-mail - sender = config.get('mail.noreply') + sender = self.config.get('mail.noreply') message = MIMEMultipart("alternative") message["From"] = sender message["To"] = username # the actual e-mail... 
- url_base = config.get("flask.server_name") - protocol = "https" if config.get("flask.https") else "http" + url_base = self.config.get("flask.server_name") + protocol = "https" if self.config.get("flask.https") else "http" url = "%s://%s/reset-password/?token=%s" % (protocol, url_base, register_token) # we use slightly different e-mails depending on whether this is the first time setting a password @@ -408,7 +433,7 @@ def get_notifications(self): :return list: Notifications, as a list of dictionaries """ - tag_recipients = ["!everyone", *[f"!{tag}" for tag in self.data["tags"]]] + tag_recipients = ["!everyone", *[f"!{tag}" for tag in self.config.get_active_tags(self)]] if self.is_admin: # for backwards compatibility - used to be called '!admins' even if the tag is 'admin' tag_recipients.append("!admins") @@ -457,7 +482,7 @@ def sort_user_tags(self): tags = self.data["tags"] sorted_tags = [] - for tag in config.get("flask.tag_order"): + for tag in self.config.get("flask.tag_order"): if tag in tags: sorted_tags.append(tag) diff --git a/datasources/douyin/search_douyin.py b/datasources/douyin/search_douyin.py index e66b177ff..4b5d5b814 100644 --- a/datasources/douyin/search_douyin.py +++ b/datasources/douyin/search_douyin.py @@ -18,7 +18,7 @@ class SearchDouyin(Search): title = "Import scraped Douyin data" # title displayed in UI description = "Import Douyin data collected with an external tool such as Zeeschuimer." # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - is_from_extension = True + is_from_zeeschuimer = True # not available as a processor for existing datasets accepts = [None] diff --git a/datasources/gab/search_gab.py b/datasources/gab/search_gab.py index 4b200b667..2ad7dfc34 100644 --- a/datasources/gab/search_gab.py +++ b/datasources/gab/search_gab.py @@ -16,7 +16,7 @@ class SearchGab(Search): title = "Import scraped Gab data" # title displayed in UI description = "Import Gab data collected with an external tool such as Zeeschuimer." # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - is_from_extension = True + is_from_zeeschuimer = True fake = "" # not available as a processor for existing datasets diff --git a/datasources/imgur/search_imgur.py b/datasources/imgur/search_imgur.py index d3e55c38d..b8c80ec5b 100644 --- a/datasources/imgur/search_imgur.py +++ b/datasources/imgur/search_imgur.py @@ -18,7 +18,7 @@ class SearchNineGag(Search): title = "Import scraped Imgur data" # title displayed in UI description = "Import Imgur data collected with an external tool such as Zeeschuimer." # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - is_from_extension = True + is_from_zeeschuimer = True # not available as a processor for existing datasets accepts = [None] diff --git a/datasources/instagram/search_instagram.py b/datasources/instagram/search_instagram.py index b82e4ca3e..3a3b76f4c 100644 --- a/datasources/instagram/search_instagram.py +++ b/datasources/instagram/search_instagram.py @@ -21,7 +21,7 @@ class SearchInstagram(Search): title = "Import scraped Instagram data" # title displayed in UI description = "Import Instagram data collected with an external tool such as Zeeschuimer." 
# description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - is_from_extension = True + is_from_zeeschuimer = True # not available as a processor for existing datasets accepts = [None] diff --git a/datasources/linkedin/search_linkedin.py b/datasources/linkedin/search_linkedin.py index f954782e0..f357341ed 100644 --- a/datasources/linkedin/search_linkedin.py +++ b/datasources/linkedin/search_linkedin.py @@ -21,7 +21,7 @@ class SearchLinkedIn(Search): title = "Import scraped LinkedIn data" # title displayed in UI description = "Import LinkedIn data collected with an external tool such as Zeeschuimer." # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - is_from_extension = True + is_from_zeeschuimer = True # not available as a processor for existing datasets accepts = [None] diff --git a/datasources/ninegag/search_9gag.py b/datasources/ninegag/search_9gag.py index 973de82ba..e5d6c267b 100644 --- a/datasources/ninegag/search_9gag.py +++ b/datasources/ninegag/search_9gag.py @@ -19,7 +19,7 @@ class SearchNineGag(Search): title = "Import scraped 9gag data" # title displayed in UI description = "Import 9gag data collected with an external tool such as Zeeschuimer." # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - is_from_extension = True + is_from_zeeschuimer = True # not available as a processor for existing datasets accepts = [None] diff --git a/datasources/parler/DESCRIPTION.md b/datasources/parler/DESCRIPTION.md deleted file mode 100644 index f2d745c68..000000000 --- a/datasources/parler/DESCRIPTION.md +++ /dev/null @@ -1,11 +0,0 @@ -The Parler data source can be used to manipulate data collected from parler.com with -[Zeeschuimer](https://github.com/digitalmethodsinitiative/zeeschuimer). Data is collected with the browser extension; -4CAT cannot collect data on its own. After collecting data with Zeeschuimer it can be uploaded to 4CAT for further -processing and analysis. See the Zeeschuimer documentation for more information on how to collect data with it. - -Data is collected as it is formatted internally by Parler's website. Posts are stored as (large) JSON objects; it -will usually be easier to make sense of the data by downloading it as a CSV file from 4CAT instead. - -### Data format -Most data attributes map to 4CAT's CSV export quite straightforwardly. Note that 'echoes' are Parler's term for what on -Twitter would be called a 'retweet', i.e. a post reposted by someone else. \ No newline at end of file diff --git a/datasources/parler/__init__.py b/datasources/parler/__init__.py deleted file mode 100644 index 44d34ac14..000000000 --- a/datasources/parler/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -""" -Initialize Parler data source -""" - -# An init_datasource function is expected to be available to initialize this -# data source. A default function that does this is available from the -# backend helpers library. 
-from common.lib.helpers import init_datasource - -# Internal identifier for this data source -DATASOURCE = "parler" -NAME = "Parler" \ No newline at end of file diff --git a/datasources/parler/search_parler.py b/datasources/parler/search_parler.py deleted file mode 100644 index 8ccc7ccd8..000000000 --- a/datasources/parler/search_parler.py +++ /dev/null @@ -1,66 +0,0 @@ -""" -Import scraped Parler data - -It's prohibitively difficult to scrape data from Parler within 4CAT itself -due to its aggressive rate limiting and login wall. Instead, import data -collected elsewhere. -""" -import datetime -import re - -from backend.lib.search import Search -from common.lib.item_mapping import MappedItem - - -class SearchParler(Search): - """ - Import scraped LinkedIn data - """ - type = "parler-search" # job ID - category = "Search" # category - title = "Import scraped Parler data" # title displayed in UI - description = "Import Parler data collected with an external tool such as Zeeschuimer." # description displayed in UI - extension = "ndjson" # extension of result file, used internally and in UI - is_from_extension = True - - # not available as a processor for existing datasets - accepts = [None] - - def get_items(self, query): - """ - Run custom search - - Not available for Parler - """ - raise NotImplementedError("Parler datasets can only be created by importing data from elsewhere") - - @staticmethod - def map_item(node): - """ - Parse Parler post - - :param node: Data as received from Parler - :return dict: Mapped item - """ - post = node["data"] - post_time = datetime.datetime.strptime(post["date_created"], "%Y-%m-%dT%H:%M:%S.000000Z") - - return MappedItem({ - "id": post["postuuid"], - "thread_id": post["postuuid"], - "body": post["body"], - "timestamp": post_time.strftime("%Y-%m-%d %H:%M:%S"), - "author": post["user"]["username"], - "author_name": post["user"]["name"], - "author_followers": post["user"]["follower_count"], - "detected_language": post["detected_language"], - "views": post["views"], - "echoes": post["echos"], - "comments": post["total_comments"], - "is_sensitive": "yes" if post["sensitive"] else "no", - "is_echo": "yes" if post["is_echo"] else "no", - "is_ad": "yes" if post["ad"] else "no", - "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", post["body"])), - "image_url": post["image"] if post["image"] else "", - "unix_timestamp": int(post_time.timestamp()) - }) diff --git a/datasources/tiktok/search_tiktok.py b/datasources/tiktok/search_tiktok.py index 90f443b49..2ee3c66bd 100644 --- a/datasources/tiktok/search_tiktok.py +++ b/datasources/tiktok/search_tiktok.py @@ -20,7 +20,7 @@ class SearchTikTok(Search): title = "Import scraped Tiktok data" # title displayed in UI description = "Import Tiktok data collected with an external tool such as Zeeschuimer." 
# description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - is_from_extension = True + is_from_zeeschuimer = True # not available as a processor for existing datasets accepts = [None] diff --git a/datasources/tiktok_comments/search_tiktok_comments.py b/datasources/tiktok_comments/search_tiktok_comments.py index d44581193..efaffc21d 100644 --- a/datasources/tiktok_comments/search_tiktok_comments.py +++ b/datasources/tiktok_comments/search_tiktok_comments.py @@ -20,7 +20,7 @@ class SearchTikTokComments(Search): title = "Import scraped Tiktok comment data" # title displayed in UI description = "Import Tiktok comment data collected with an external tool such as Zeeschuimer." # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - is_from_extension = True + is_from_zeeschuimer = True # not available as a processor for existing datasets accepts = [None] diff --git a/datasources/truth/search_truth.py b/datasources/truth/search_truth.py index 52057e0fa..c1743e12c 100644 --- a/datasources/truth/search_truth.py +++ b/datasources/truth/search_truth.py @@ -16,7 +16,7 @@ class SearchGab(Search): title = "Import scraped Truth Social data" # title displayed in UI description = "Import Truth Social data collected with an external tool such as Zeeschuimer." # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - is_from_extension = True + is_from_zeeschuimer = True fake = "" # not available as a processor for existing datasets diff --git a/datasources/twitter-import/search_twitter.py b/datasources/twitter-import/search_twitter.py index baa506923..9acb2b45c 100644 --- a/datasources/twitter-import/search_twitter.py +++ b/datasources/twitter-import/search_twitter.py @@ -20,7 +20,7 @@ class SearchTwitterViaZeeschuimer(Search): title = "Import scraped X/Twitter data" # title displayed in UI description = "Import X/Twitter data collected with an external tool such as Zeeschuimer." 
# description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - is_from_extension = True + is_from_zeeschuimer = True # not available as a processor for existing datasets accepts = [] diff --git a/docker-compose_build.yml b/docker-compose_build.yml index b1c1fa1af..7466e8ba8 100644 --- a/docker-compose_build.yml +++ b/docker-compose_build.yml @@ -9,7 +9,6 @@ services: - POSTGRES_HOST_AUTH_METHOD=${POSTGRES_HOST_AUTH_METHOD} volumes: - ./data/postgres/:/var/lib/postgresql/data/ -# - 4cat_db:/var/lib/postgresql/data/ healthcheck: test: [ "CMD-SHELL", "pg_isready -U $${POSTGRES_USER}" ] interval: 5s @@ -33,10 +32,6 @@ services: - ./data/datasets/:/usr/src/app/data/ - ./data/config/:/usr/src/app/config/ - ./data/logs/:/usr/src/app/logs/ -# - 4cat_data:/usr/src/app/data/ -# - 4cat_config:/usr/src/app/config/ -# - 4cat_logs:/usr/src/app/logs/ - entrypoint: docker/docker-entrypoint.sh frontend: @@ -54,9 +49,6 @@ services: - ./data/datasets/:/usr/src/app/data/ - ./data/config/:/usr/src/app/config/ - ./data/logs/:/usr/src/app/logs/ -# - 4cat_data:/usr/src/app/data/ -# - 4cat_config:/usr/src/app/config/ -# - 4cat_logs:/usr/src/app/logs/ command: ["docker/wait-for-backend.sh"] volumes: diff --git a/docker/Dockerfile b/docker/Dockerfile index 709d68893..046b39cba 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -24,6 +24,7 @@ ENV PYTHONUNBUFFERED=1 # Install dependencies RUN pip3 install --upgrade pip COPY ./requirements.txt /usr/src/app/requirements.txt +COPY ./extensions /usr/src/app/extensions COPY ./setup.py /usr/src/app/setup.py COPY ./VERSION /usr/src/app/VERSION COPY ./README.md /usr/src/app/README.md diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index a08dda6c9..000000000 --- a/docs/conf.py +++ /dev/null @@ -1,62 +0,0 @@ -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -import os -import sys -sys.path.insert(0, os.path.abspath('../..')) -print(os.path.abspath('../..')) - - -# -- Project information ----------------------------------------------------- -project = '4CAT Capture & Analysis Toolkit' -copyright = '2021, OILab & Digital Methods Initiative' -author = 'OILab & Digital Methods Initiative' - - -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - "sphinx.ext.napoleon", - 'm2r2', - 'sphinx.ext.intersphinx' -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. 
-exclude_patterns = [] - -source_suffix = [".rst", ".md"] - -autodoc_default_options = { - "member-order": "groupwise" -} - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = 'sphinx_rtd_theme' - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] \ No newline at end of file diff --git a/docs/datasource.rst b/docs/datasource.rst deleted file mode 100644 index 56ae2189c..000000000 --- a/docs/datasource.rst +++ /dev/null @@ -1,73 +0,0 @@ -================= -4CAT Data sources -================= - -4CAT is a modular tool. Its modules come in two varieties: data sources and processors. This article covers the former. - -Data sources are a collection of workers, processors and interface elements that extend 4CAT to allow scraping, -processing and/or retrieving data for a given platform (such as Instagram, Reddit or Telegram). 4CAT has APIs that can -do most of the scaffolding around this for you so data source can be quite lightweight and mostly focus on retrieving -the actual data while 4CAT's back-end takes care of the scheduling, determining where the output should go, et cetera. - -Data sources are defined as an arbitrarily-named folder in the datasources folder in the 4CAT root. It is recommended to -use the datasource ID (see below) as the data source folder name. However, since Python files included in the folder -will be included as modules by 4CAT, folder names should be allowed as module names. Concretely this means (among other -things) that data source folder names cannot start with a number (hence the fourchan data source). - -*WARNING:* Data sources in multiple ways can define arbitrary code that will be run by either the 4CAT server or -client-side browsers. Be careful when running a data source supplied by someone else. - -A data source will at least contain the following: - -* An __init__.py containing data source metadata and initialisation code -* A search worker, which can collect data according to provided parameters and format it as a CSV or NDJSON file that - 4CAT can work with. - -It may contain additional components: - -* Any processors that are specific to datasets created by this data source -* Views for the web app that allow more advanced behaviour of the web tool interface -* Database or Sphinx index definitions - -The instructions below describe how to format and create these components (work in progress!) - -------------------- -Initialisation code -------------------- - -The data source root should contain a file `__init__.py` which in turn defines the following: - -.. code-block:: python - - DATASOURCE = "datasource-identifier" - -This constant defines the data source ID. This is most importantly used in config.py to enable the data source. - -.. code-block:: python - - def init_datasource(database, logger, queue, name): - pass - -This function is called when 4CAT starts, if the data source is enabled, and should set up anything the data source -needs to function (e.g. queueing any recurring workers). A default implementation of this function can be used instead -(and when defining your own, it is advised to still call it as part of your own implementation): - -.. 
code-block:: python - - from backend.lib.helpers import init_datasource - ------------------- -The `Search` class ------------------- -.. autoclass:: backend.abstract.search.Search - :members: - :undoc-members: - :show-inheritance: - ---------------------------- -The `SearchWithScope` class ---------------------------- -.. autoclass:: backend.abstract.search.SearchWithScope - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index addb57b3f..000000000 --- a/docs/index.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. 4CAT Capture & Analysis Toolkit documentation master file, created by - sphinx-quickstart on Tue Oct 19 11:38:20 2021. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -Welcome to 4CAT Capture & Analysis Toolkit's documentation! -=========================================================== - -This documentation collects information about 4CAT's internals - -.. toctree:: - :maxdepth: 2 - :caption: Contents: - - introduction - processor - datasource - worker - -* :ref:`search` diff --git a/docs/introduction.rst b/docs/introduction.rst deleted file mode 100644 index b33e21e8d..000000000 --- a/docs/introduction.rst +++ /dev/null @@ -1,5 +0,0 @@ -============ -Introduction -============ - -.. mdinclude:: ../../README.md \ No newline at end of file diff --git a/docs/processor.rst b/docs/processor.rst deleted file mode 100644 index 1bc3c5191..000000000 --- a/docs/processor.rst +++ /dev/null @@ -1,63 +0,0 @@ -=============== -4CAT Processors -=============== - -4CAT is a modular tool. Its modules come in two varieties: data sources and processors. This article covers the latter. - -Processors are bits of code that produce a dataset. Typically, their input is another dataset. As such they can be used -to analyse data; for example, a processor can take a csv file containing posts as input, count how many posts occur per -month, and produce another csv file with the amount of posts per month (one month per row) as output. Processors always -produce the following things: - -* A set of metadata for the Dataset the processor will produce. This is stored in 4CAT's PostgreSQL database. The - record for the database is created when the processor's job is first queued, and updated by the processor. -* A result file, which may have an arbitrary format. This file contains whatever the processor produces, e.g. a list - of frequencies, an image wall or a zip archive containing word embedding models. -* A log file, with the same file name as the result file but with a '.log' extension. This documents any output from - the processor while it was producing the result file. - -4CAT has an API that can do most of the scaffolding around this for you so processors can be quite lightweight and -mostly focus on the analysis while 4CAT's back-end takes care of the scheduling, determining where the output should -go, et cetera. - -A minimal example of a processor could look like this: - -.. 
code-block:: python - - """ - A minimal example 4CAT processor - """ - from backend.abstract.processor import BasicProcessor - - class ExampleProcessor(BasicProcessor): - """ - Example Processor - """ - type = "example-processor" # job type ID - category = "Examples" # category - title = "A simple example" # title displayed in UI - description = "This doesn't do much" # description displayed in UI - extension = "csv" # extension of result file, used internally and in UI - - input = "csv:body" - output = "csv:value" - - def process(self): - """ - Saves a CSV file with one column ("value") and one row with a value ("Hello - world") and marks the dataset as finished. - """ - data = {"value": "Hello world!"} - self.write_csv_items_and_finish(data) - - -But there is more you can do. The full API looks like this: - --------------------------- -The `BasicProcessor` class --------------------------- - -.. autoclass:: backend.abstract.processor.BasicProcessor - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index ecd67a4ad..000000000 --- a/docs/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -m2r2 \ No newline at end of file diff --git a/docs/worker.rst b/docs/worker.rst deleted file mode 100644 index bc122f7e9..000000000 --- a/docs/worker.rst +++ /dev/null @@ -1,14 +0,0 @@ -=============== -4CAT Workers -=============== - -TBD - ------------------------ -The `BasicWorker` class ------------------------ - -.. autoclass:: backend.abstract.worker.BasicWorker - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/extensions/.gitignore b/extensions/.gitignore new file mode 100644 index 000000000..d7e401301 --- /dev/null +++ b/extensions/.gitignore @@ -0,0 +1,5 @@ +# Ignore everything in this directory +* +# Except these files +!.gitignore +!README.md diff --git a/extensions/README.md b/extensions/README.md new file mode 100644 index 000000000..f594bc152 --- /dev/null +++ b/extensions/README.md @@ -0,0 +1,39 @@ +This folder contains 4CAT extensions. + +Extensions are processors or data sources that are not part of the main 4CAT codebase, but are otherwise compatible +with it. For example, a processor that interfaces with a closed API would not be useful to most 4CAT users, but if you +have access to it, you could add such a processor to 4CAT as an extension. + + +## Installation +Extensions are simply folders within this 'extensions' folder that contain the Python files with the relevant code. It +is strongly recommended that you use git for version control of these folders. Simply commit the code to +a repository somewhere, then clone it into this folder like so: + +```shell +cd [4cat root] +cd extensions +git clone [repository URL] +``` + +This ensures that any dataset created with processors from your extension will be aware of the version of the code it +was created with, which helps with debugging and makes research reproducible and traceable. + +## Structure +Processors can simply be .py files in the extension folder. Data sources should be sub-folders in a "datasources" +folder. 
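For illustration only (not part of this patch), a minimal processor file in an extension could look something like the sketch below; the file name, class name and job type ID are hypothetical, and the attributes mirror those of 4CAT's built-in processors:

```python
"""
Hypothetical minimal extension processor, e.g. extensions/my_extension/my_processor.py
"""
from backend.lib.processor import BasicProcessor


class MyExtensionProcessor(BasicProcessor):
    """
    Example extension processor that writes a single placeholder row
    """
    type = "my-extension-processor"  # job type ID
    category = "Examples"  # category displayed in UI
    title = "My extension processor"  # title displayed in UI
    description = "Writes a single placeholder row"  # description displayed in UI
    extension = "csv"  # extension of result file, used internally and in UI

    def process(self):
        # write one row to the result file and mark the dataset as finished
        self.write_csv_items_and_finish([{"value": "Hello from an extension"}])
```
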
An extension containing both processors and a data source could look like this: + +``` +[4CAT root]/ +├─ extensions/ +│ ├─ my_extension/ +│ ├─ my_processor.py +│ ├─ my_other_processor.py +│ ├─ datasources/ +│ ├─ my_datasource/ +│ ├─ __init__.py +│ ├─ DESCRIPTION.md +│ ├─ search_my_datasource.py +``` + +In this scenario, `my_extension` would be a git repository within which all other files are contained. \ No newline at end of file diff --git a/helper-scripts/migrate.py b/helper-scripts/migrate.py index fb85772ae..25071afe4 100644 --- a/helper-scripts/migrate.py +++ b/helper-scripts/migrate.py @@ -79,8 +79,39 @@ def check_for_nltk(): nltk.download("omw-1.4", quiet=True) +def install_extensions(no_pip=True): + """ + Check for extensions and run any installation scripts found. -def finish(args, logger): + Note: requirements texts are handled by setup.py + """ + # Check for extension packages + if os.path.isdir("extensions"): + for root, dirs, files in os.walk("extensions"): + for file in files: + if file == "fourcat_install.py": + command = [interpreter, os.path.join(root, file)] + if args.component == "frontend": + command.append("--component=frontend") + elif args.component == "backend": + command.append("--component=backend") + elif args.component == "both": + command.append("--component=both") + + if no_pip: + command.append("--no-pip") + + print(f"Installing extension: {os.path.join(root, file)}") + result = subprocess.run(command, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + if result.returncode != 0: + print("Error while running extension installation script: " + os.path.join(root, file)) + + print(result.stdout.decode("utf-8")) if result.stdout else None + print(result.stderr.decode("utf-8")) if result.stderr else None + + +def finish(args, logger, no_pip=True): """ Finish migration @@ -89,6 +120,7 @@ def finish(args, logger): wrap up and exit. """ check_for_nltk() + install_extensions(no_pip=no_pip) logger.info("\nMigration finished. You can now safely restart 4CAT.\n") if args.restart: @@ -115,7 +147,7 @@ def finish(args, logger): cli.add_argument("--no-migrate", "-m", default=False, action="store_true", help="Do not run scripts to upgrade between minor versions. Use if you only want to use migrate to e.g. upgrade dependencies.") cli.add_argument("--current-version", "-v", default="config/.current-version", help="File path to .current-version file, relative to the 4CAT root") cli.add_argument("--output", "-o", default="", help="By default migrate.py will send output to stdout. If this argument is set, it will write to the given path instead.") -cli.add_argument("--component", "-c", default="both", help="Which component of 4CAT to migrate. Currently only skips check for if 4CAT is running when set to 'frontend'") +cli.add_argument("--component", "-c", default="both", help="Which component of 4CAT to migrate ('both', 'backend', 'frontend'). Skips check for if 4CAT is running when set to 'frontend'. Also used by extensions w/ fourcat_install.py") cli.add_argument("--branch", "-b", default=False, help="Which branch to check out from GitHub. 
By default, check out the latest release.") args = cli.parse_args() @@ -125,6 +157,9 @@ def finish(args, logger): print("This script needs to be run from the same folder as 4cat-daemon.py\n") exit(1) +# track pip +pip_ran = False + # set up logging logger = logging.getLogger("migrate") logger.setLevel(logging.INFO) @@ -145,6 +180,7 @@ def finish(args, logger): logger.info("Restart after migration: " + ("yes" if args.restart else "no")) logger.info("Repository URL: " + args.repository) logger.info(".current-version path: " + args.current_version) +logger.info(f"Current Datetime: {time.strftime('%Y-%m-%d %H:%M:%S')}") # --------------------------------------------- # Ensure existence of current version file @@ -221,7 +257,7 @@ def finish(args, logger): logger.info(" ...latest release available from GitHub (%s) is older than or equivalent to currently checked out version " "(%s)." % (tag_version, current_version_c)) logger.info(" ...upgrade not necessary, skipping.") - finish(args, logger) + finish(args, logger, no_pip=pip_ran) logger.info(" ...ensuring repository %s is a known remote" % args.repository) remote = subprocess.run(shlex.split("git remote add 4cat_migrate %s" % args.repository), stdout=subprocess.PIPE, @@ -297,7 +333,7 @@ def finish(args, logger): if current_version == target_version: logger.info(" ...already up to date.") - finish(args, logger) + finish(args, logger, no_pip=pip_ran) if current_version_c[0:3] != target_version_c[0:3]: logger.info(" ...cannot migrate between different major versions.") @@ -365,6 +401,7 @@ def log_pip_output(logger, output): pip = subprocess.run([interpreter, "-m", "pip", "install", "-r", "requirements.txt", "--upgrade", "--upgrade-strategy", "eager"], stderr=subprocess.STDOUT, stdout=subprocess.PIPE, check=True, cwd=cwd) log_pip_output(logger, pip.stdout) + pip_ran = True except subprocess.CalledProcessError as e: log_pip_output(logger, e.output) logger.info(f"\n Error running pip: {e}") @@ -410,4 +447,4 @@ def log_pip_output(logger, output): # --------------------------------------------- # Done! 
Wrap up and finish # --------------------------------------------- -finish(args, logger) +finish(args, logger, no_pip=pip_ran) diff --git a/helper-scripts/migrate/migrate-1.45-1.46.py b/helper-scripts/migrate/migrate-1.45-1.46.py new file mode 100644 index 000000000..8bf5d0683 --- /dev/null +++ b/helper-scripts/migrate/migrate-1.45-1.46.py @@ -0,0 +1,33 @@ +# Add a 'software_source' column to the datasets table +import json +import sys +import os + +from pathlib import Path + +sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), "../..")) +from common.lib.database import Database +from common.lib.logger import Logger + +log = Logger(output=True) + +import configparser + +ini = configparser.ConfigParser() +ini.read(Path(__file__).parent.parent.parent.resolve().joinpath("config/config.ini")) +db_config = ini["DATABASE"] + +db = Database(logger=log, dbname=db_config["db_name"], user=db_config["db_user"], password=db_config["db_password"], + host=db_config["db_host"], port=db_config["db_port"], appname="4cat-migrate") + +print(" Checking if datasets table has a column 'software_source'...") +has_column = db.fetchone( + "SELECT COUNT(*) AS num FROM information_schema.columns WHERE table_name = 'datasets' AND column_name = 'software_source'") +if has_column["num"] == 0: + print(" ...No, adding.") + current_source = db.fetchone("SELECT value FROM settings WHERE name = '4cat.github_url' AND tag = ''") + current_source = json.loads(current_source["value"]) if current_source is not None else "" + db.execute("ALTER TABLE datasets ADD COLUMN software_source TEXT DEFAULT %s", (current_source,)) + db.commit() +else: + print(" ...Yes, nothing to update.") \ No newline at end of file diff --git a/processors/filtering/column_filter.py b/processors/filtering/column_filter.py index 01c7fa88f..2dc73b63e 100644 --- a/processors/filtering/column_filter.py +++ b/processors/filtering/column_filter.py @@ -75,7 +75,7 @@ class ColumnFilter(BaseFilter): @classmethod def is_compatible_with(cls, module=None, user=None): """ - Allow processor on top datasets. + Allow processor on top datasets that are CSV or NDJSON. :param module: Module to determine compatibility with """ @@ -262,11 +262,11 @@ class ColumnProcessorFilter(ColumnFilter): @classmethod def is_compatible_with(cls, module=None, user=None): """ - Allow processor on top datasets. 
+ Allow on child datasets and do not create a standalone dataset :param module: Dataset or processor to determine compatibility with """ - return module.get_extension() in ("csv", "ndjson") and not module.is_top_dataset() + return not module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") @classmethod def is_filter(cls): diff --git a/processors/metrics/rank_attribute.py b/processors/metrics/rank_attribute.py index adffe824a..0e38757c6 100644 --- a/processors/metrics/rank_attribute.py +++ b/processors/metrics/rank_attribute.py @@ -110,11 +110,12 @@ class AttributeRanker(BasicProcessor): @classmethod def is_compatible_with(cls, module=None, user=None): """ - Allow processor on top image rankings + Allow processor to run on all csv and NDJSON datasets :param module: Module to determine compatibility with """ - return module.get_extension() in ["csv", "ndjson"] + + return module.get_extension() in ("csv", "ndjson") def process(self): """ @@ -134,7 +135,7 @@ def process(self): weighby = self.parameters.get("weigh") to_lowercase = self.parameters.get("to-lowercase", True) self.include_missing_data = self.parameters.get("count_missing") - + try: if self.parameters.get("filter"): filter = re.compile(".*" + self.parameters.get("filter") + ".*") @@ -203,7 +204,7 @@ def missing_value_placeholder(data, field_name): for value in values: if to_lowercase: value = value.lower() - + if rank_style == "overall" and value not in overall_top: continue @@ -340,4 +341,4 @@ def get_options(cls, parent_dataset=None, user=None): options["columns"]["options"] = {v: v for v in columns} options["columns"]["default"] = ["body"] - return options \ No newline at end of file + return options diff --git a/processors/networks/wikipedia_network.py b/processors/networks/wikipedia_network.py index 00e141fc7..0426c97d2 100644 --- a/processors/networks/wikipedia_network.py +++ b/processors/networks/wikipedia_network.py @@ -3,19 +3,20 @@ """ import re import requests - -from backend.lib.processor import BasicProcessor from lxml import etree from lxml.cssselect import CSSSelector as css from io import StringIO - import networkx as nx +from backend.lib.processor import BasicProcessor +from common.lib.exceptions import ProcessorInterruptedException + __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters", "Sal Hagen"] __maintainer__ = "Stijn Peeters" __email__ = "4cat@oilab.eu" + class WikiURLCoLinker(BasicProcessor): """ Generate URL co-link network diff --git a/processors/presets/neologisms.py b/processors/presets/neologisms.py index 26684e4d0..1cf258503 100644 --- a/processors/presets/neologisms.py +++ b/processors/presets/neologisms.py @@ -19,17 +19,6 @@ class NeologismExtractor(ProcessorPreset): references = ["Van Soest, Jeroen. 2019. 'Language Innovation Tracker: Detecting language innovation in online discussion fora.' (MA thesis), Beuls, K. (Promotor), Van Eecke, P. 
(Advisor).'"] - @staticmethod - def is_compatible_with(module=None, user=None): - """ - Determine compatibility - - This preset is compatible with any dataset that has columns - - :param Dataset module: Module ID to determine compatibility with - :return bool: - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") @classmethod def get_options(cls, parent_dataset=None, user=None): @@ -60,6 +49,16 @@ def get_options(cls, parent_dataset=None, user=None): return options + @classmethod + def is_compatible_with(cls, module=None, user=None): + """ + Allow processor to run on all csv and NDJSON datasets + + :param module: Dataset or processor to determine compatibility with + """ + + return module.get_extension() in ("csv", "ndjson") + def get_processor_pipeline(self): """ This queues a series of post-processors to extract neologisms from a diff --git a/processors/text-analysis/get_entities.py b/processors/text-analysis/get_entities.py deleted file mode 100644 index e639c7672..000000000 --- a/processors/text-analysis/get_entities.py +++ /dev/null @@ -1,172 +0,0 @@ -""" -Extract nouns from SpaCy NLP docs. - -""" -import pickle -import spacy - -from collections import Counter -from spacy.tokens import DocBin -from common.lib.helpers import UserInput -from backend.lib.processor import BasicProcessor -from common.lib.exceptions import ProcessorInterruptedException - -__author__ = "Sal Hagen" -__credits__ = ["Sal Hagen"] -__maintainer__ = "Sal Hagen" -__email__ = "4cat@oilab.eu" - - -class ExtractNouns(BasicProcessor): # TEMPORARILY DISABLED - """ - Rank vectors over time - """ - type = "get-entities" # job type ID - category = "Text analysis" # category - title = "Extract named entities" # title displayed in UI - description = "Retrieve named entities detected by SpaCy, ranked on frequency. Be sure to have selected " \ - "\"Named Entity Recognition\" in the previous module." # description displayed in UI - extension = "csv" # extension of result file, used internally and in UI - - followups = ["wordcloud"] - - options = { - "entities": { - "type": UserInput.OPTION_MULTI, - "default": [], - "options": { - "PERSON": "PERSON: People, including fictional.", - "NORP": "NORP: Nationalities or religious or political groups.", - "FAC": "FAC: Buildings, airports, highways, bridges, etc.", - "ORG": "ORG: Companies, agencies, institutions, etc.", - "GPE": "GPE: Countries, cities, states.", - "LOC": "LOC: Non-GPE locations, mountain ranges, bodies of water.", - "PRODUCT": "PRODUCT: Objects, vehicles, foods, etc. (Not services.)", - "EVENT": "EVENT: Named hurricanes, battles, wars, sports events, etc.", - "WORK_OF_ART": "WORK_OF_ART: Titles of books, songs, etc.", - "LAW": "LAW: Named documents made into laws.", - "LANGUAGE": "LANGUAGE: Any named language.", - "DATE": "DATE: Absolute or relative dates or periods.", - "TIME": "TIME: Times smaller than a day.", - "PERCENT": "PERCENT: Percentage, including ”%“.", - "MONEY": "MONEY: Monetary values, including unit.", - "QUANTITY": "QUANTITY: Measurements, as of weight or distance.", - "ORDINAL": "ORDINAL: “first”, “second”, etc.", - "CARDINAL": "CARDINAL: Numerals that do not fall under another type." - }, - "help": "What types of entities to extract (select at least one)", - "tooltip": "The above list is derived from the SpaCy documentation (see references)." 
- } - } - - references = [ - "[SpaCy named entities list](https://spacy.io/api/annotation#named-entities)" - ] - - @classmethod - def is_compatible_with(cls, module=None, user=None): - """ - Allow processor on linguistic feature data - - :param module: Module to determine compatibility with - """ - - return module.type == "linguistic-features" - - def process(self): - """ - Opens the SpaCy output and gets ze entities. - - """ - - # Validate whether the user enabled the right parameters. - if "ner" not in self.source_dataset.parameters["enable"]: - self.dataset.update_status("Enable \"Named entity recognition\" in previous module") - self.dataset.finish(0) - return - - else: - # Extract the SpaCy docs first - self.dataset.update_status("Unzipping SpaCy docs") - - # Store all the entities in this list - li_entities = [] - nlp = spacy.load("en_core_web_sm") # Load model - - for doc_file in self.iterate_archive_contents(self.source_file): - with doc_file.open("rb") as pickle_file: - # Load DocBin - file = pickle.load(pickle_file) - doc_bin = DocBin().from_bytes(file) - docs = list(doc_bin.get_docs(nlp.vocab)) - - for doc in docs: - post_entities = [] - - # stop processing if worker has been asked to stop - if self.interrupted: - raise ProcessorInterruptedException("Interrupted while processing documents") - - for ent in doc.ents: - if ent.label_ in self.parameters["entities"]: - post_entities.append((ent.text, ent.label_)) # Add a tuple - - li_entities.append(post_entities) - - results = [] - - if li_entities: - - # Also add the data to the original file, if indicated. - if self.parameters.get("overwrite"): - self.add_field_to_parent(field_name='named_entities', - # Format like "Apple:ORG, Gates:PERSON, ..." and add to the row - new_data=[", ".join([":".join(post_entities) for post_entities in entity]) for entity in li_entities], - which_parent=self.dataset.top_parent(), - update_existing=True) - - all_entities = [] - # Convert to lower and filter out one-letter words. Join the words with the entities so we can group easily. - for post_ents in li_entities: - for pair in post_ents: - if pair and len(pair[0]) > 1: - pair = pair[0].lower() + " |#| " + pair[1] - all_entities.append(pair) - - # Group and rank - count_nouns = Counter(all_entities).most_common() - # Unsplit and list the count. - results = [{"word": tpl[0].split(" |#| ")[0], "entity": tpl[0].split(" |#| ")[1], "count": tpl[1]} for - tpl in count_nouns] - - # done! - if results: - self.dataset.update_status("Finished") - self.write_csv_items_and_finish(results) - else: - self.dataset.update_status("Finished, but no entities were extracted.") - self.dataset.finish(0) - - @classmethod - def get_options(cls, parent_dataset=None, user=None): - """ - Get processor options - - The feature of this processor that overwrites the parent dataset can - only work properly on csv datasets so check the extension before - showing it. - - :param user: - :param parent_dataset: Dataset to get options for - :return dict: - """ - options = cls.options - if parent_dataset and parent_dataset.top_parent().get_results_path().suffix in [".csv", ".ndjson"]: - options["overwrite"] = { - "type": UserInput.OPTION_TOGGLE, - "default": False, - "help": "Add extracted nouns to source csv", - "tooltip": "Will add a column (\"nouns\", \"nouns_and_compounds\", or \"noun_chunks\"), and the found nouns in the post row." 
- } - - return options diff --git a/processors/text-analysis/get_nouns.py b/processors/text-analysis/get_nouns.py deleted file mode 100644 index cad8653eb..000000000 --- a/processors/text-analysis/get_nouns.py +++ /dev/null @@ -1,196 +0,0 @@ -""" -Extract nouns from SpaCy NLP docs. - -""" -import pickle -import spacy - -from collections import Counter -from spacy.tokens import DocBin -from common.lib.helpers import UserInput -from backend.lib.processor import BasicProcessor - -__author__ = "Sal Hagen" -__credits__ = ["Sal Hagen"] -__maintainer__ = "Sal Hagen" -__email__ = "4cat@oilab.eu" - - -class ExtractNouns(BasicProcessor): - """ - Rank vectors over time - """ - type = "extract-nouns" # job type ID - category = "Text analysis" # category - title = "Extract nouns" # title displayed in UI - description = "Retrieve nouns detected by SpaCy's part-of-speech tagging, and rank by frequency. " \ - "Make sure to have selected \"Part of Speech\" in the previous " \ - "module, as well as \"Dependency parsing\" if you want to extract compound nouns or noun chunks." # description displayed in UI - extension = "csv" # extension of result file, used internally and in UI - - references = ["[Information on noun chunks](https://spacy.io/usage/linguistic-features#noun-chunks)"] - - followups = ["wordcloud"] - - options = { - "type": { - "type": UserInput.OPTION_CHOICE, - "default": ["nouns"], - "options": { - "nouns": "Single-word nouns", - "nouns_and_compounds": "Nouns and compound nouns", - "noun_chunks": "Noun chunks" - }, - "help": "Whether to only get 1) separate words indicated as nouns, 2) nouns and compound nouns " \ - "(nouns with multiple words, e.g.\"United States\") using a custom parser, or 3) noun chunks: " \ - "nouns plus the words describing them, e.g. \"the old grandpa\". See the references for more info." - } - } - - @classmethod - def is_compatible_with(cls, module=None, user=None): - """ - Allow processor on linguistic feature data - - :param module: Module to determine compatibility with - """ - return module.type == "linguistic-features" - - def process(self): - """ - Opens the SpaCy output and gets ze nouns. - - """ - noun_type = self.parameters["type"] - - # Validate whether the user enabled the right parameters. 
- # Check part of speech tagging - if "tagger" not in self.source_dataset.parameters["enable"]: - self.dataset.update_status("Enable \"Part-of-speech tagging\" in previous module") - self.dataset.finish(0) - - # Check dependency parsing if nouns and compouns nouns is selected - elif (noun_type == "nouns_and_compounds" or noun_type == "noun_chunks") and "parser" not in \ - self.source_dataset.parameters["enable"]: - self.dataset.update_status( - "Enable \"Part-of-speech tagging\" and \"Dependency parsing\" for compound nouns/noun chunks in previous module") - self.dataset.finish(0) - - # Valid parameters - else: - - # Extract the SpaCy docs first - self.dataset.update_status("Unzipping SpaCy docs") - self.dataset.update_status("Extracting nouns") - - # Store all the nouns in this list - li_nouns = [] - nlp = spacy.load("en_core_web_sm") # Load model - spacy.load("en_core_web_sm") - - for doc_file in self.iterate_archive_contents(self.source_file): - with doc_file.open("rb") as pickle_file: - # Load DocBin - file = pickle.load(pickle_file) - doc_bin = DocBin().from_bytes(file) - docs = list(doc_bin.get_docs(nlp.vocab)) - - # Simply add each word if its POS is "NOUN" - if noun_type == "nouns": - for doc in docs: - post_nouns = [] - post_nouns += [token.text for token in doc if token.pos_ == "NOUN"] - li_nouns.append(post_nouns) - - # Use SpaCy's noun chunk detection - elif noun_type == "noun_chunks": - - for doc in docs: - - # Note: this is a workaround for now. - # Serialization of the SpaCy docs does not - # work well with dependency parsing after - # loading. Quick fix: parse again. - - new_doc = nlp(doc.text) - post_nouns = [] - for chunk in new_doc.noun_chunks: - post_nouns.append(chunk.text) - - li_nouns.append(post_nouns) - - # Use a custom script to get single nouns and compound nouns - elif noun_type == "nouns_and_compounds": - for doc in docs: - post_nouns = [] - noun = "" - - for i, token in enumerate(doc): - - # Check for common nouns (general, e.g. "people") - # and proper nouns (specific, e.g. "London") - if token.pos_ == "NOUN" or token.pos_ == "PROPN": - # Check if the token is part of a noun chunk - if token.dep_ == "compound": # Check for a compound relation - noun = token.text - else: - if noun: - noun += " " + token.text - post_nouns.append(noun) - noun = "" - else: - post_nouns.append(token.text) - li_nouns.append(post_nouns) - - results = [] - - if li_nouns: - - # Also add the data to the original file, if indicated. - if self.parameters.get("overwrite"): - self.add_field_to_parent(field_name=noun_type, - # Format like "apple, gates, ..." and add to the row - new_data=[", ".join([post_noun.lower() for post_noun in li_noun if len(post_noun) > 1]) for li_noun in li_nouns], - which_parent=self.dataset.top_parent()) - - # convert to lower and filter out one-letter words - all_nouns = [] - for post_n in li_nouns: - all_nouns += [str(cap_noun).lower() for cap_noun in post_n if len(cap_noun) > 1] - - # Group and rank - count_nouns = Counter(all_nouns).most_common() - results = [{"word": tpl[0], "count": tpl[1]} for tpl in count_nouns] - - # done! 
- if results: - self.dataset.update_status("Finished") - self.write_csv_items_and_finish(results) - else: - self.dataset.update_status("Finished, but no nouns were extracted.") - self.dataset.finish(0) - - @classmethod - def get_options(cls, parent_dataset=None, user=None): - """ - Get processor options - - The feature of this processor that overwrites the parent dataset can - only work properly on csv datasets so check the extension before - showing it. - - :param user: - :param parent_dataset: Dataset to get options for - :return dict: - """ - options = cls.options - if parent_dataset and parent_dataset.top_parent().get_results_path().suffix in [".csv", ".ndjson"]: - options["overwrite"] = { - "type": UserInput.OPTION_TOGGLE, - "default": False, - "help": "Add extracted nouns to source csv", - "tooltip": "Will add a column (\"nouns\", \"nouns_and_compounds\", or \"noun_chunks\"), and the found " - "nouns in the post row." - } - - return options diff --git a/processors/text-analysis/linguistic_extractor.py b/processors/text-analysis/linguistic_extractor.py deleted file mode 100644 index 92357853a..000000000 --- a/processors/text-analysis/linguistic_extractor.py +++ /dev/null @@ -1,168 +0,0 @@ -""" -Extract linguistic features from text using SpaCy. - -""" -import zipfile -import pickle -import re - -import spacy -from spacy.tokens import DocBin -from spacy.tokenizer import Tokenizer -from spacy.util import compile_prefix_regex, compile_suffix_regex - -from common.lib.helpers import UserInput -from common.lib.exceptions import ProcessorInterruptedException -from backend.lib.processor import BasicProcessor - -__author__ = "Sal Hagen" -__credits__ = ["Sal Hagen", "Stijn Peeters"] -__maintainer__ = "Sal Hagen" -__email__ = "4cat@oilab.eu" - - -class LinguisticFeatures(BasicProcessor): - """ - Rank vectors over time - """ - type = "linguistic-features" # job type ID - category = "Text analysis" # category - title = "Annotate text features with SpaCy" # title displayed in UI - description = "Annotate your text with a variety of linguistic features using the SpaCy library, " \ - "including part-of-speech tagging, depencency parsing, and named entity recognition. " \ - "Subsequent processors can extract the words labelled by SpaCy (e.g. as a noun or name). " \ - "Produces a Doc file using the en_core_web_sm model. Currently only available for datasets " \ - "with less than 100,000 items. " # description displayed in UI - extension = "zip" # extension of result file, used internally and in UI - - followups = ["get-entities", "extract-nouns"] - - references = [ - "[SpaCy Linguistic Features - Documentation](https://spacy.io/usage/linguistic-features/)" - ] - - options = { - "enable": { - "type": UserInput.OPTION_MULTI, - "default": [], - "options": { - "tagger": "Part-of-speech tagging: Tag the grammatical function of words, like nouns and verbs", - "parser": "Dependency parsing: Extract how words in a sentence relate to each other", - "ner": "Named entity recognition: Annotate what kind of objects appear in a sentence (e.g. Apple -> Organisation)" - }, - "help": "What linguistic features to extract. Without any of these selected, it simply saves the SpaCy docs (tokenised sentences) as a serialized file. See references for more information." 
- } - } - - @classmethod - def is_compatible_with(cls, module=None, user=None): - """ - Allow CSV and NDJSON datasets - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - - def process(self): - """ - Reads text and outputs entities per text body. - """ - - # prepare staging area - staging_area = self.dataset.get_staging_area() - - self.dataset.update_status("Preparing data") - - # go through all archived token sets and vectorise them - results = [] - - # Load the spacy goods - nlp = spacy.load("en_core_web_sm") - nlp.tokenizer = self.custom_tokenizer(nlp) # Keep words with a dash in between - - # Disable what has _not_ been selected - options = ["parser", "tagger", "ner"] - enable = self.parameters.get("enable", False) - - if not enable: - self.dataset.update_status("Select at least one of the options.") - self.dataset.finish(0) - return - - disable = [option for option in options if option not in enable] - - # Get all ze text first so we can process it in batches - posts = [] - for post in self.source_dataset.iterate_items(self): - if post.get("body", ""): - if len(post["body"]) > 1000000: - body = post["body"][:1000000] - else: - body = post["body"] - posts.append(body) - else: - self.dataset.log('Warning: Post %s has no body from which to extract entities' % post.get('id')) - posts.append("") - - # Process the text in batches - if len(posts) < 100000: - self.dataset.update_status("Extracting linguistic features") - else: - self.dataset.update_status( - "Extracting linguistic features is currently only available for datasets with less than 100,000 items.") - self.dataset.finish(0) - return - - # Make sure only the needed information is extracted. - attrs = [] - if "tagger" not in disable: - attrs.append("POS") - if "parser" not in disable: - attrs.append("DEP") - if "ner": - attrs.append("ENT_IOB") - attrs.append("ENT_TYPE") - attrs.append("ENT_ID") - attrs.append("ENT_KB_ID") - - # DocBin for quick saving - doc_bin = DocBin(attrs=attrs) - - # Start the processing! - try: - for i, doc in enumerate(nlp.pipe(posts, disable=disable)): - doc_bin.add(doc) - - # It's quite a heavy process, so make sure it can be interrupted - if self.interrupted: - raise ProcessorInterruptedException("Processor interrupted while iterating through CSV file") - - if i % 1000 == 0: - self.dataset.update_status("Done with post %s out of %s" % (i, len(posts))) - except MemoryError: - self.dataset.update_status("Out of memory. The dataset may be too large to process. Try again with a smaller dataset.", is_final=True) - return - - self.dataset.update_status("Serializing results - this will take a while") - - # Then serialize the NLP docs and the vocab - doc_bytes = doc_bin.to_bytes() - - # Dump ze data in a temporary folder - with staging_area.joinpath("spacy_docs.pb").open("wb") as outputfile: - pickle.dump(doc_bytes, outputfile) - - # create zip of archive and delete temporary files and folder - self.write_archive_and_finish(staging_area, compression=zipfile.ZIP_LZMA) - - def custom_tokenizer(self, nlp): - """ - Custom tokeniser that does not split on dashes. - Useful for names (e.g. Hennis-Plasschaert). 
- """ - infix_re = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'~]''') - prefix_re = compile_prefix_regex(nlp.Defaults.prefixes) - suffix_re = compile_suffix_regex(nlp.Defaults.suffixes) - - return Tokenizer(nlp.vocab, prefix_search=prefix_re.search, - suffix_search=suffix_re.search, - infix_finditer=infix_re.finditer, - token_match=None) diff --git a/processors/text-analysis/split_sentences.py b/processors/text-analysis/split_sentences.py index c5cce2477..dd2be7c2f 100644 --- a/processors/text-analysis/split_sentences.py +++ b/processors/text-analysis/split_sentences.py @@ -86,8 +86,11 @@ def get_options(cls, parent_dataset=None, user=None): @classmethod def is_compatible_with(cls, module=None, user=None): """ - Allow CSV and NDJSON datasets + Allow processor to run on all csv and NDJSON datasets + + :param module: Dataset or processor to determine compatibility with """ + return module.get_extension() in ("csv", "ndjson") def process(self): diff --git a/processors/text-analysis/tokenise.py b/processors/text-analysis/tokenise.py index fb1b89cbd..a104306f1 100644 --- a/processors/text-analysis/tokenise.py +++ b/processors/text-analysis/tokenise.py @@ -50,8 +50,11 @@ class Tokenise(BasicProcessor): @classmethod def is_compatible_with(cls, module=None, user=None): """ - Allow CSV and NDJSON datasets + Allow processor to run on all csv and NDJSON datasets + + :param module: Dataset or processor to determine compatibility with """ + return module.get_extension() in ("csv", "ndjson") @classmethod diff --git a/processors/visualisation/download_videos.py b/processors/visualisation/download_videos.py index aa24a724b..2b385ffe7 100644 --- a/processors/visualisation/download_videos.py +++ b/processors/visualisation/download_videos.py @@ -234,7 +234,7 @@ def is_compatible_with(cls, module=None, user=None): in principle, but any links to videos are likely to come from the top dataset anyway. 
- :param str module: Module ID to determine compatibility with + :param module: Module to determine compatibility with :return bool: """ return ((module.type.endswith("-search") or module.is_from_collector()) @@ -645,6 +645,9 @@ def collect_video_urls(self): if not value: continue + if not isinstance(value, str): + value = str(value) + video_links = self.identify_video_urls_in_string(value) if video_links: item_urls |= set(video_links) @@ -667,7 +670,6 @@ def identify_video_urls_in_string(self, text): :param str text: string that may contain URLs :return list: list containing validated URLs to videos """ - text = str(text) split_comma = self.parameters.get("split-comma", True) if split_comma: texts = text.split(",") diff --git a/processors/visualisation/image_category_wall.py b/processors/visualisation/image_category_wall.py index fee1fb7b0..d74d28e40 100644 --- a/processors/visualisation/image_category_wall.py +++ b/processors/visualisation/image_category_wall.py @@ -61,13 +61,14 @@ class ImageCategoryWallGenerator(BasicProcessor): def is_compatible_with(cls, module=None, user=None): """ Allow processor on CLIP dataset only - + :param module: Dataset or processor to determine compatibility with """ - return module.type.startswith("image-to-categories") or \ - module.type.startswith("image-downloader") or \ - module.type.startswith("video-hasher-1") or \ - module.type.startswith("video-hash-similarity-matrix") + return (module.type.startswith("image-to-categories") or + module.type.startswith("image-downloader") or + module.type.startswith("video-hasher-1") or + module.type.startswith("video-hash-similarity-matrix")) and \ + module.type not in ["image-downloader-screenshots-search"] @@ -170,7 +171,7 @@ def process(self): self.dataset.log(f"Found {image_dataset.type} w/ {image_dataset.num_rows} images and {category_dataset.type} w/ {category_dataset.num_rows} items") category_column = self.parameters.get("category") - if category_column is None: + if not category_column: self.dataset.finish_with_error("No category provided.") return @@ -427,6 +428,3 @@ def process(self): canvas.save(pretty=True) self.dataset.log("Saved to " + str(self.dataset.get_results_path())) return self.dataset.finish(len(category_widths)) - - - diff --git a/processors/visualisation/word-trees.py b/processors/visualisation/word-trees.py index 6446372e8..f7783bcc1 100644 --- a/processors/visualisation/word-trees.py +++ b/processors/visualisation/word-trees.py @@ -104,6 +104,16 @@ class MakeWordtree(BasicProcessor): } } + @classmethod + def is_compatible_with(cls, module=None, user=None): + """ + Allow processor to run on all csv and NDJSON datasets + + :param module: Dataset or processor to determine compatibility with + """ + + return module.get_extension() in ("csv", "ndjson") + # determines how close the nodes are displayed to each other (min. 
1) whitespace = 2 @@ -126,13 +136,6 @@ class MakeWordtree(BasicProcessor): # methods limit = 1 - @classmethod - def is_compatible_with(cls, module=None, user=None): - """ - Allow CSV and NDJSON datasets - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - def process(self): """ This takes a 4CAT results file as input, and outputs a plain text file diff --git a/setup.py b/setup.py index e62f292ba..0e4e536f1 100644 --- a/setup.py +++ b/setup.py @@ -8,10 +8,10 @@ version = versionfile.readline().strip() # Universal packages -packages = [ +packages = set([ "anytree~=2.8.0", "bcrypt~=3.2.0", - "beautifulsoup4~=4.11.0", + "beautifulsoup4",  # was ~=4.11.0 "clarifai-grpc~=9.0", "cryptography>=39.0.1", "cssselect~=1.1.0", @@ -22,7 +22,7 @@ "Flask~=2.2", "Flask_Limiter==1.0.1", "Flask_Login~=0.6", - "gensim>=4.1.0, <4.2", + "gensim>=4.3.3, <4.4.0", "google_api_python_client==2.0.2", "html2text==2020.*", "ImageHash>4.2.0", @@ -31,7 +31,7 @@ "lxml~=4.9.0", "markdown==3.0.1", "markdown2==2.4.2", - "nltk==3.9.1", + "nltk~=3.9.1", "networkx~=2.8.0", "numpy>=1.19.2", "opencv-python>=4.6.0.66", @@ -48,11 +48,11 @@ "razdel~=0.5", "requests~=2.27", "requests_futures", "scenedetect==0.6.0.3", "scikit-learn", "scipy==1.10.1", "shapely", - "spacy==3.7.2", "svgwrite~=1.4.0", "tailer", "Telethon~=1.36.0", @@ -64,17 +64,29 @@ "imagedominantcolor @ git+https://github.com/dale-wahl/imagedominantcolor.git@pillow10", "videohash @ git+https://github.com/dale-wahl/videohash@main", "vk_api", - "yt-dlp", - "en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz#egg=en_core_web_sm" -] + "yt-dlp" +]) + +# Check for extension packages +if os.path.isdir("extensions"): + extension_packages = set() + for root, dirs, files in os.walk("extensions"): + for file in files: + if file == "requirements.txt": + with open(os.path.join(root, file)) as extension_requirements: + for line in extension_requirements.readlines(): + line = line.strip() + # skip blank lines and comments + if line and not line.startswith("#"): + extension_packages.add(line) + if extension_packages: + print("Found extensions, installing additional packages: " + str(extension_packages)) + packages = packages.union(extension_packages) # Some packages don't run on Windows -unix_packages = [ +unix_packages = set([ "python-daemon==2.3.2" -] +]) if os.name != "nt": - packages = packages + unix_packages + packages = packages.union(unix_packages) setup( name='fourcat', @@ -87,5 +99,5 @@ url="https://oilab.eu", packages=['backend', 'webtool', 'datasources'], python_requires='>=3.7', - install_requires=packages, + install_requires=list(packages), ) diff --git a/webtool/__init__.py b/webtool/__init__.py index 7becd1239..6c1786ad5 100644 --- a/webtool/__init__.py +++ b/webtool/__init__.py @@ -105,12 +105,11 @@ # import all views import webtool.views.views_admin +import webtool.views.views_extensions import webtool.views.views_restart import webtool.views.views_user - import webtool.views.views_dataset import webtool.views.views_misc - import webtool.views.api_explorer import webtool.views.api_standalone import webtool.views.api_tool diff --git a/webtool/lib/helpers.py b/webtool/lib/helpers.py index d06f4435c..6cc91eba1 100644 --- a/webtool/lib/helpers.py +++ b/webtool/lib/helpers.py @@ -23,7 +23,7 @@ class Pagination(object): Provide pagination """ - def __init__(self, page, per_page, total_count, route="show_results"): + def __init__(self, page, per_page, total_count, route="show_results", route_args=None): """ Set up pagination object @@ 
-36,6 +36,7 @@ def __init__(self, page, per_page, total_count, route="show_results"): self.per_page = per_page self.total_count = total_count self.route = route + self.route_args = route_args if route_args else {} @property def pages(self): diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py index c50caca26..6ac9272ba 100644 --- a/webtool/lib/template_filters.py +++ b/webtool/lib/template_filters.py @@ -139,9 +139,12 @@ def _jinja2_filter_add_ahref(content): return content @app.template_filter('markdown') -def _jinja2_filter_markdown(text): +def _jinja2_filter_markdown(text, trim_container=False): val = markdown.markdown(text) + if trim_container: + val = re.sub(r"^<p>", "", val) + val = re.sub(r"</p>$", "", val) return val @app.template_filter('isbool') @@ -262,7 +265,7 @@ def _jinja2_filter_post_field(field, post): formatted_field = field field = str(field) - + for key in re.findall(r"\{\{(.*?)\}\}", field): original_key = key @@ -296,7 +299,7 @@ def _jinja2_filter_post_field(field, post): # We see 0 as a valid value - e.g. '0 retweets'. if not val and val != 0: return "" - + # Support some basic string slicing if string_slice: field = field.replace("[" + string_slice + "]", "") @@ -317,7 +320,7 @@ def _jinja2_filter_post_field(field, post): # Apply further filters, if present (e.g. lower) for extra_filter in extra_filters: - + extra_filter = extra_filter.strip() # We're going to parse possible parameters to pass to the filter @@ -328,7 +331,7 @@ def _jinja2_filter_post_field(field, post): extra_filter = extra_filter.split("(")[0] params = [p.strip() for p in params.split(",")] params = [post[param] for param in params] - + val = app.jinja_env.filters[extra_filter](val, *params) if string_slice: @@ -388,3 +391,7 @@ def uniqid(): "__version": version, "uniqid": uniqid } + +@app.template_filter('log') +def _jinja2_filter_log(text): + app.logger.info(text) \ No newline at end of file diff --git a/webtool/static/js/fourcat.js b/webtool/static/js/fourcat.js index e622505b2..950ba523e 100644 --- a/webtool/static/js/fourcat.js +++ b/webtool/static/js/fourcat.js @@ -497,7 +497,7 @@ const query = { applyProgress($('#query-status'), 100); let keyword = json.label; - $('#query-results').append('
  • ' + keyword + ' (' + json.rows + ' items)
  • '); + $('#query-results').append('
  • ' + keyword + ' (' + json.rows + ' items)
  • '); query.reset_form(); popup.alert('Query for \'' + keyword + '\' complete!', 'Success'); } else { @@ -630,17 +630,17 @@ const query = { for (let i = 0; i < json.length; i += 1) { search_queue_length += json[i]['count']; - search_queue_notice += " " + json[i]['jobtype'].replace('-search', '') + ' (' + json[i]['count'] + ')' + '' + search_queue_notice += " " + json[i]['processor_name'] + ' (' + json[i]['count'] + ')' + '' } if (search_queue_length == 0) { search_queue_box.html('Search queue is empty.'); search_queue_list.html(''); } else if (search_queue_length == 1) { - search_queue_box.html('Currently processing 1 search query: '); + search_queue_box.html('Currently collecting 1 dataset: '); search_queue_list.html(search_queue_notice); } else { - search_queue_box.html('Currently processing ' + search_queue_length + ' search queries: '); + search_queue_box.html('Currently collecting ' + search_queue_length + ' datasets: '); search_queue_list.html(search_queue_notice); } }, @@ -1993,4 +1993,4 @@ function find_parent(element, selector) { } return null; -} \ No newline at end of file +} diff --git a/webtool/templates/account/login.html b/webtool/templates/account/login.html index d95d6d9ef..de11e90b5 100644 --- a/webtool/templates/account/login.html +++ b/webtool/templates/account/login.html @@ -8,7 +8,9 @@

    + +
    +

    HTTP request headers

    +
    +{{ headers }}
    +            
    +
    {% endblock %} diff --git a/webtool/templates/create-dataset.html b/webtool/templates/create-dataset.html index 91194aa45..751fcdd5b 100644 --- a/webtool/templates/create-dataset.html +++ b/webtool/templates/create-dataset.html @@ -36,6 +36,7 @@

    Create new dataset

    + {% if __user_config("ui.offer_hashing") %}

    4CAT can remove information it identifies as relating to an item's author, or replace it with a hashed value. Other personal information may persist; it is your responsibility to further anonymise data where @@ -50,16 +51,17 @@

    Create new dataset

    + {% endif %} + {% if __user_config("ui.offer_private") %}
    - -
    + {% endif %} {% if __user_config("ui.option_email") in ["both", "datasources_only"] and __user_config("mail.server") %}
    diff --git a/webtool/templates/data-overview.html b/webtool/templates/data-overview.html index 7b371178f..f31f2178d 100644 --- a/webtool/templates/data-overview.html +++ b/webtool/templates/data-overview.html @@ -45,7 +45,7 @@

    Metadata

  • The data for this data source are gathered and stored by this 4CAT instance.
  • {% elif label == "static" %}
  • The data for this datasource are not updated anymore and show a static snapshot.
  • - {% elif label == "extension" %} + {% elif label == "zeeschuimer" %}
  • The data for this datasource are collected with Zeeschuimer.
  • {% elif label == "external" %}
  • The data for this datasource is collected externally (API or custom upload).
  • diff --git a/webtool/templates/frontpage.html b/webtool/templates/frontpage.html index 178f112c0..78e0eaf1b 100644 --- a/webtool/templates/frontpage.html +++ b/webtool/templates/frontpage.html @@ -12,19 +12,20 @@

    What is {{ __user_config("4cat.name") }}?

    4CAT is developed by OILab and the Digital Methods Initiative at the University of Amsterdam. For more information, take a look at the 4CAT website.

    -

    News and updates

    -
      - {% if news %} - {% for item in news %} -
    1. - - {{ item.text|markdown|safe }} -
    2. - {% endfor %} - {% else %} -
    3. You can add news for your 4CAT instance in news.json in the 4CAT root folder.
    4. - {% endif %} -
    + {% if __user_config("4cat.about_this_server") %} +

    About this server

    +

    {{ __user_config("4cat.about_this_server") }}

    + {% endif %} +

    4CAT updates

    +

    + + + +