diff --git a/.dockerignore b/.dockerignore index 5d1d149e0..558da504b 100644 --- a/.dockerignore +++ b/.dockerignore @@ -2,3 +2,4 @@ data/ .github/ .ipynb_checkpoints/ .gitignore +.idea/ diff --git a/.env b/.env index 69a217df0..d03f9c703 100644 --- a/.env +++ b/.env @@ -30,7 +30,7 @@ TELEGRAM_PORT=443 # Docker Volume Names DOCKER_DB_VOL=4cat_4cat_db DOCKER_DATA_VOL=4cat_4cat_data -DOCKER_CONFIG_VOL=4cat_4cat_share +DOCKER_CONFIG_VOL=4cat_4cat_config DOCKER_LOGS_VOL=4cat_4cat_logs # Gunicorn settings @@ -39,4 +39,3 @@ workers=4 threads=4 worker_class=gthread log_level=debug - diff --git a/.zenodo.json b/.zenodo.json index 3ab05ca45..fd261019f 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -3,7 +3,7 @@ "license": "MPL-2.0", "title": "4CAT Capture and Analysis Toolkit", "upload_type": "software", - "version": "v1.45", + "version": "v1.46", "keywords": [ "webmining", "scraping", diff --git a/VERSION b/VERSION index 6245ec1a2..fa2cb2583 100644 --- a/VERSION +++ b/VERSION @@ -1,4 +1,4 @@ -1.45 +1.46 This file should not be modified. It is used by 4CAT to determine whether it needs to run migration scripts to e.g. update the database structure to a more diff --git a/backend/database.sql b/backend/database.sql index 33f0ea393..1f372a697 100644 --- a/backend/database.sql +++ b/backend/database.sql @@ -56,6 +56,7 @@ CREATE TABLE IF NOT EXISTS datasets ( is_private boolean DEFAULT TRUE, software_version text, software_file text DEFAULT '', + software_source text DEFAULT '', annotation_fields text DEFAULT '' ); diff --git a/backend/lib/processor.py b/backend/lib/processor.py index c67fa7a9d..0ed4cb6a3 100644 --- a/backend/lib/processor.py +++ b/backend/lib/processor.py @@ -164,7 +164,7 @@ def work(self): # start log file self.dataset.update_status("Processing data") - self.dataset.update_version(get_software_commit()) + self.dataset.update_version(get_software_commit(self)) # get parameters # if possible, fill defaults where parameters are not provided @@ -628,7 +628,7 @@ def write_csv_items_and_finish(self, data): self.dataset.update_status("Finished") self.dataset.finish(len(data)) - def write_archive_and_finish(self, files, num_items=None, compression=zipfile.ZIP_STORED): + def write_archive_and_finish(self, files, num_items=None, compression=zipfile.ZIP_STORED, finish=True): """ Archive a bunch of files into a zip archive and finish processing @@ -639,6 +639,7 @@ def write_archive_and_finish(self, files, num_items=None, compression=zipfile.ZI files added to the archive will be used. :param int compression: Type of compression to use. By default, files are not compressed, to speed up unarchiving. + :param bool finish: Finish the dataset/job afterwards or not? 
""" is_folder = False if issubclass(type(files), PurePath): @@ -665,7 +666,8 @@ def write_archive_and_finish(self, files, num_items=None, compression=zipfile.ZI if num_items is None: num_items = done - self.dataset.finish(num_items) + if finish: + self.dataset.finish(num_items) def create_standalone(self): """ diff --git a/backend/lib/search.py b/backend/lib/search.py index cdcd08115..15b3982d6 100644 --- a/backend/lib/search.py +++ b/backend/lib/search.py @@ -1,16 +1,16 @@ import hashlib +import zipfile import secrets -import shutil import random import json import math import csv +import os from pathlib import Path from abc import ABC, abstractmethod from common.config_manager import config -from common.lib.dataset import DataSet from backend.lib.processor import BasicProcessor from common.lib.helpers import strip_tags, dict_search_and_update, remove_nuls, HashCache from common.lib.exceptions import WorkerInterruptedException, ProcessorInterruptedException, MapItemException @@ -71,7 +71,6 @@ def process(self): items = self.import_from_file(query_parameters.get("file")) else: items = self.search(query_parameters) - except WorkerInterruptedException: raise ProcessorInterruptedException("Interrupted while collecting data, trying again later.") @@ -79,10 +78,12 @@ def process(self): num_items = 0 if items: self.dataset.update_status("Writing collected data to dataset file") - if results_file.suffix == ".ndjson": - num_items = self.items_to_ndjson(items, results_file) - elif results_file.suffix == ".csv": + if self.extension == "csv": num_items = self.items_to_csv(items, results_file) + elif self.extension == "ndjson": + num_items = self.items_to_ndjson(items, results_file) + elif self.extension == "zip": + num_items = self.items_to_archive(items, results_file) else: raise NotImplementedError("Datasource query cannot be saved as %s file" % results_file.suffix) @@ -361,6 +362,22 @@ def items_to_ndjson(self, items, filepath): return processed + def items_to_archive(self, items, filepath): + """ + Save retrieved items as an archive + + Assumes that items is an iterable with one item, a Path object + referring to a folder containing files to be archived. The folder will + be removed afterwards. + + :param items: + :param filepath: Where to store the archive + :return int: Number of items + """ + num_items = len(os.listdir(items)) + self.write_archive_and_finish(items, None, zipfile.ZIP_STORED, False) + return num_items + class SearchWithScope(Search, ABC): """ @@ -404,7 +421,7 @@ def search(self, query): # proportion of items matches # first, get amount of items for all threads in which matching # items occur and that are long enough - thread_ids = tuple([post["thread_id"] for post in items]) + thread_ids = tuple([item["thread_id"] for item in items]) self.dataset.update_status("Retrieving thread metadata for %i threads" % len(thread_ids)) try: min_length = int(query.get("scope_length", 30)) diff --git a/backend/lib/worker.py b/backend/lib/worker.py index 3fe19e067..a5695e673 100644 --- a/backend/lib/worker.py +++ b/backend/lib/worker.py @@ -133,6 +133,17 @@ def run(self): location = "->".join(frames) self.log.error("Worker %s raised exception %s and will abort: %s at %s" % (self.type, e.__class__.__name__, str(e), location)) + # Clean up after work successfully completed or terminates + self.clean_up() + + def clean_up(self): + """ + Clean up after a processor runs successfully or results in error. 
+ Workers should override this method to implement any procedures + to run to clean up a worker; by default this does nothing. + """ + pass + def abort(self): """ Called when the application shuts down diff --git a/common/config_manager.py b/common/config_manager.py index 40bce67a6..86faf9060 100644 --- a/common/config_manager.py +++ b/common/config_manager.py @@ -44,9 +44,9 @@ def with_db(self, db=None): # Replace w/ db if provided else only initialise if not already self.db = db if db else Database(logger=None, dbname=self.get("DB_NAME"), user=self.get("DB_USER"), password=self.get("DB_PASSWORD"), host=self.get("DB_HOST"), - port=self.get("DB_PORT"), appname="config-reader") if not db else db + port=self.get("DB_PORT"), appname="config-reader") else: - # self.db already initialized + # self.db already initialized and no db provided pass def load_user_settings(self): diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index 0507d0993..d1af7b95d 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -165,20 +165,10 @@ "help": "Can view worker status", "tooltip": "Controls whether users can view worker status via the Control Panel" }, - # The following two options should be set to ensure that every analysis step can + # The following option should be set to ensure that every analysis step can # be traced to a specific version of 4CAT. This allows for reproducible - # research. You can however leave them empty with no ill effect. The version ID - # should be a commit hash, which will be combined with the Github URL to offer - # links to the exact version of 4CAT code that produced an analysis result. - # If no version file is available, the output of "git show" in PATH_ROOT will be used - # to determine the version, if possible. - "path.versionfile": { - "type": UserInput.OPTION_TEXT, - "default": ".git-checked-out", - "help": "Version file", - "tooltip": "Path to file containing GitHub commit hash. File containing a commit ID (everything after the first whitespace found is ignored)", - "global": True - }, + # research. The output of "git show" in PATH_ROOT will be used to determine + # the version of a processor file, if possible. "4cat.github_url": { "type": UserInput.OPTION_TEXT, "default": "https://github.com/digitalmethodsinitiative/4cat", @@ -516,6 +506,18 @@ "tooltip": "If a dataset is a JSON file but it can be mapped to a CSV file, show the CSV in the preview instead" "of the underlying JSON." }, + "ui.offer_hashing": { + "type": UserInput.OPTION_TOGGLE, + "default": True, + "help": "Offer pseudonymisation", + "tooltip": "Add a checkbox to the 'create dataset' forum to allow users to toggle pseudonymisation." + }, + "ui.offer_private": { + "type": UserInput.OPTION_TOGGLE, + "default": True, + "help": "Offer create as private", + "tooltip": "Add a checkbox to the 'create dataset' forum to allow users to make a dataset private." 
+ }, "ui.option_email": { "type": UserInput.OPTION_CHOICE, "options": { diff --git a/common/lib/dataset.py b/common/lib/dataset.py index 8510a5adb..2e75912a1 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -114,6 +114,9 @@ def __init__(self, parameters=None, key=None, job=None, data=None, db=None, pare self.parameters = json.loads(self.data["parameters"]) self.is_new = False else: + self.data = {"type": type} # get_own_processor needs this + own_processor = self.get_own_processor() + version = get_software_commit(own_processor) self.data = { "key": self.key, "query": self.get_label(parameters, default=type), @@ -125,7 +128,8 @@ def __init__(self, parameters=None, key=None, job=None, data=None, db=None, pare "timestamp": int(time.time()), "is_finished": False, "is_private": is_private, - "software_version": get_software_commit(), + "software_version": version[0], + "software_source": version[1], "software_file": "", "num_rows": 0, "progress": 0.0, @@ -139,7 +143,6 @@ def __init__(self, parameters=None, key=None, job=None, data=None, db=None, pare # Find desired extension from processor if not explicitly set if extension is None: - own_processor = self.get_own_processor() if own_processor: extension = own_processor.get_extension(parent_dataset=DataSet(key=parent, db=db) if parent else None) # Still no extension, default to 'csv' @@ -865,10 +868,12 @@ def get_label(self, parameters=None, default="Query"): elif parameters.get("subject_match") and parameters["subject_match"] != "empty": return parameters["subject_match"] elif parameters.get("query"): - label = parameters["query"] if len(parameters["query"]) < 30 else parameters["query"][:25] + "..." + label = parameters["query"] # Some legacy datasets have lists as query data if isinstance(label, list): label = ", ".join(label) + + label = label if len(label) < 30 else label[:25] + "..." 
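The dataset bookkeeping above now stores two pieces of provenance: `get_software_commit()` returns a `(commit hash, repository URL)` tuple, which fills the existing `software_version` column and the new `software_source` column. A minimal sketch of the call as a processor would make it, mirroring `backend/lib/processor.py`; the hash and URL in the comments are illustrative values, not output from a real run:

```python
from common.lib.helpers import get_software_commit

# inside a processor's work() method, as in backend/lib/processor.py
commit, source = get_software_commit(self)
# e.g. commit -> "1a2b3c4d..." (currently checked-out git commit)
#      source -> "https://github.com/digitalmethodsinitiative/4cat", or an
#      extension's own remote URL; both are "" if no repository is found
self.dataset.update_version((commit, source))
```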
label = label.strip().replace("\n", ", ") return label elif parameters.get("country_flag") and parameters["country_flag"] != "all": @@ -1116,7 +1121,8 @@ def update_version(self, version): processor_path = "" updated = self.db.update("datasets", where={"key": self.data["key"]}, data={ - "software_version": version, + "software_version": version[0], + "software_source": version[1], "software_file": processor_path }) @@ -1151,10 +1157,15 @@ def get_version_url(self, file): :param file: File to link within the repository :return: URL, or an empty string """ - if not self.data["software_version"] or not config.get("4cat.github_url"): + if not self.data["software_source"]: return "" - return config.get("4cat.github_url") + "/blob/" + self.data["software_version"] + self.data.get("software_file", "") + filepath = self.data.get("software_file", "") + if filepath.startswith("/extensions/"): + # go to root of extension + filepath = "/" + "/".join(filepath.split("/")[3:]) + + return self.data["software_source"] + "/blob/" + self.data["software_version"] + filepath def top_parent(self): """ diff --git a/common/lib/helpers.py b/common/lib/helpers.py index f6767c929..d98fc8ed6 100644 --- a/common/lib/helpers.py +++ b/common/lib/helpers.py @@ -1,6 +1,7 @@ """ Miscellaneous helper functions for the 4CAT backend """ +import hashlib import subprocess import requests import datetime @@ -16,9 +17,10 @@ import os import io +from pathlib import Path from collections.abc import MutableMapping from html.parser import HTMLParser -from pathlib import Path +from urllib.parse import urlparse, urlunparse from calendar import monthrange from packaging import version @@ -40,7 +42,6 @@ def init_datasource(database, logger, queue, name): """ pass - def strip_tags(html, convert_newlines=True): """ Strip HTML from a string @@ -120,12 +121,9 @@ def get_git_branch(): return "" -def get_software_commit(): +def get_software_commit(worker=None): """ - Get current 4CAT commit hash - - Reads a given version file and returns the first string found in there - (up until the first space). On failure, return an empty string. + Get current 4CAT git commit hash Use `get_software_version()` instead if you need the release version number rather than the precise commit hash. @@ -134,34 +132,58 @@ def get_software_commit(): repository in the 4CAT root folder, and if so, what commit is currently checked out in it. - :return str: 4CAT git commit hash - """ - versionpath = config.get('PATH_ROOT').joinpath(config.get('path.versionfile')) + For extensions, get the repository information for that extension, or if + the extension is not a git repository, return empty data. - if versionpath.exists() and not versionpath.is_file(): - return "" + :param BasicWorker processor: Worker to get commit for. If not given, get + version information for the main 4CAT installation. 
- if not versionpath.exists(): - # try git command line within the 4CAT root folder - # if it is a checked-out git repository, it will tell us the hash of - # the currently checked-out commit - try: - cwd = os.getcwd() - os.chdir(config.get('PATH_ROOT')) - show = subprocess.run(["git", "show"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) - os.chdir(cwd) - if show.returncode != 0: - raise ValueError() - return show.stdout.decode("utf-8").split("\n")[0].split(" ")[1] - except (subprocess.SubprocessError, IndexError, TypeError, ValueError, FileNotFoundError): - return "" + :return tuple: 4CAT git commit hash, repository name + """ + # try git command line within the 4CAT root folder + # if it is a checked-out git repository, it will tell us the hash of + # the currently checked-out commit + cwd = os.getcwd() + # path has no Path.relative()... + relative_filepath = Path(re.sub(r"^[/\\]+", "", worker.filepath)).parent try: - with open(versionpath, "r", encoding="utf-8", errors="ignore") as versionfile: - version = versionfile.readline().split(" ")[0] - return version - except OSError: - return "" + # if extension, go to the extension file's path + # we will run git here - if it is not its own repository, we have no + # useful version info (since the extension is by definition not in the + # main 4CAT repository) and will return an empty value + if worker and worker.is_extension: + extension_dir = config.get("PATH_ROOT").joinpath(relative_filepath) + os.chdir(extension_dir) + # check if we are in the extensions' own repo or 4CAT's + repo_level = subprocess.run(["git", "rev-parse", "--show-toplevel"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + if Path(repo_level.stdout.decode("utf-8")) == config.get("PATH_ROOT"): + # not its own repository + return ("", "") + + else: + os.chdir(config.get("PATH_ROOT")) + + show = subprocess.run(["git", "show"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + if show.returncode != 0: + raise ValueError() + commit = show.stdout.decode("utf-8").split("\n")[0].split(" ")[1] + + # now get the repository the commit belongs to, if we can + origin = subprocess.run(["git", "config", "--get", "remote.origin.url"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + if origin.returncode != 0 or not origin.stdout: + raise ValueError() + repository = origin.stdout.decode("utf-8").strip() + if repository.endswith(".git"): + repository = repository[:-4] + + except (subprocess.SubprocessError, IndexError, TypeError, ValueError, FileNotFoundError) as e: + return ("", "") + + finally: + os.chdir(cwd) + + return (commit, repository) def get_software_version(): """ @@ -174,7 +196,7 @@ def get_software_version(): :return str: Software version, for example `1.37`. """ - current_version_file = Path(config.get("PATH_ROOT"), "config/.current-version") + current_version_file = config.get("PATH_ROOT").joinpath("config/.current-version") if not current_version_file.exists(): return "" @@ -887,6 +909,37 @@ def _sets_to_lists_gen(d): return dict(_sets_to_lists_gen(d)) + +def url_to_hash(url, remove_scheme=True, remove_www=True): + """ + Convert a URL to a filename; some URLs are too long to be used as filenames, this keeps the domain and hashes the + rest of the URL. 
+ """ + parsed_url = urlparse(url.lower()) + if parsed_url: + if remove_scheme: + parsed_url = parsed_url._replace(scheme="") + if remove_www: + netloc = re.sub(r"^www\.", "", parsed_url.netloc) + parsed_url = parsed_url._replace(netloc=netloc) + + url = re.sub(r"[^0-9a-z]+", "_", urlunparse(parsed_url).strip("/")) + else: + # Unable to parse URL; use regex + if remove_scheme: + url = re.sub(r"^https?://", "", url) + if remove_www: + if not remove_scheme: + scheme = re.match(r"^https?://", url).group() + temp_url = re.sub(r"^https?://", "", url) + url = scheme + re.sub(r"^www\.", "", temp_url) + else: + url = re.sub(r"^www\.", "", url) + + url = re.sub(r"[^0-9a-z]+", "_", url.lower().strip("/")) + + return hashlib.blake2b(url.encode("utf-8"), digest_size=24).hexdigest() + def folder_size(path='.'): """ Get the size of a folder using os.scandir for efficiency diff --git a/common/lib/logger.py b/common/lib/logger.py index c1a015ca6..bbd30c444 100644 --- a/common/lib/logger.py +++ b/common/lib/logger.py @@ -163,7 +163,7 @@ class Logger: } alert_level = "FATAL" - def __init__(self, output=False, filename='4cat.log', log_level="INFO"): + def __init__(self, logger_name='4cat-backend', output=False, filename='4cat.log', log_level="INFO"): """ Set up log handler @@ -181,7 +181,7 @@ def __init__(self, output=False, filename='4cat.log', log_level="INFO"): self.log_path = log_folder.joinpath(filename) self.previous_report = time.time() - self.logger = logging.getLogger("4cat-backend") + self.logger = logging.getLogger(logger_name) self.logger.setLevel(log_level) # this handler manages the text log files diff --git a/common/lib/module_loader.py b/common/lib/module_loader.py index 84e5d951e..6d169d912 100644 --- a/common/lib/module_loader.py +++ b/common/lib/module_loader.py @@ -7,6 +7,7 @@ import pickle import sys import re +import os from common.config_manager import config @@ -69,14 +70,11 @@ def is_4cat_class(object, only_processors=False): """ Determine if a module member is a worker class we can use """ - # it would be super cool to just use issubclass() here! - # but that requires importing the classes themselves, which leads to - # circular imports if inspect.isclass(object): if object.__name__ in("BasicProcessor", "BasicWorker") or inspect.isabstract(object): # ignore abstract and base classes return False - + if hasattr(object, "is_4cat_class"): if only_processors: if hasattr(object, "is_4cat_processor"): @@ -85,7 +83,7 @@ def is_4cat_class(object, only_processors=False): return False else: return object.is_4cat_class() - + return False def load_modules(self): @@ -99,14 +97,17 @@ def load_modules(self): """ # look for workers and processors in pre-defined folders and datasources - paths = [Path(config.get('PATH_ROOT'), "processors"), Path(config.get('PATH_ROOT'), "backend", "workers"), - *[self.datasources[datasource]["path"] for datasource in self.datasources]] + paths = [Path(config.get('PATH_ROOT'), "processors"), + Path(config.get('PATH_ROOT'), "backend", "workers"), + Path(config.get('PATH_ROOT'), "extensions"), + *[self.datasources[datasource]["path"] for datasource in self.datasources]] # extension datasources will be here and the above line... 
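The new `url_to_hash()` helper above deserves a quick usage sketch. This assumes only what the diff itself adds (the helper in `common.lib.helpers`); the example URLs are made up. Because the scheme and a leading `www.` are stripped by default, the two calls below should yield the same 48-character blake2b digest:

```python
from common.lib.helpers import url_to_hash

# Illustrative URLs; the point is that scheme and "www." are ignored by default,
# so both inputs normalise to the same string before hashing.
a = url_to_hash("https://www.example.com/some/very/long/path?page=1")
b = url_to_hash("http://example.com/some/very/long/path?page=1")
assert a == b and len(a) == 48  # blake2b with digest_size=24 -> 48 hex chars
```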
root_match = re.compile(r"^%s" % re.escape(str(config.get('PATH_ROOT')))) root_path = Path(config.get('PATH_ROOT')) for folder in paths: # loop through folders, and files in those folders, recursively + is_extension = folder.is_relative_to(Path(config.get("PATH_ROOT"), "extensions")) for file in folder.rglob("*.py"): # determine module name for file # reduce path to be relative to 4CAT root @@ -147,6 +148,7 @@ def load_modules(self): self.workers[component[1].type] = component[1] self.workers[component[1].type].filepath = relative_path + self.workers[component[1].type].is_extension = is_extension # we can't use issubclass() because for that we would need # to import BasicProcessor, which would lead to a circular @@ -169,8 +171,7 @@ def load_modules(self): for missing_module, processor_list in self.missing_modules.items(): warning += "\t%s (for %s)\n" % (missing_module, ", ".join(processor_list)) - self.log_buffer = warning - + self.log_buffer += warning self.processors = categorised_processors @@ -183,19 +184,21 @@ def load_datasources(self): `DATASOURCE` constant. The latter is taken as the ID for this datasource. """ - for subdirectory in Path(config.get('PATH_ROOT'), "datasources").iterdir(): - # folder name, also the name used in config.py - folder_name = subdirectory.parts[-1] - - # determine module name - module_name = "datasources." + folder_name + def _load_datasource(subdirectory): + """ + Load a single datasource + """ + # determine module name (path relative to 4CAT root w/ periods) + module_name = ".".join(subdirectory.relative_to(Path(config.get("PATH_ROOT"))).parts) try: datasource = importlib.import_module(module_name) except ImportError as e: - continue + self.log_buffer += "Could not import %s: %s\n" % (module_name, e) + return if not hasattr(datasource, "init_datasource") or not hasattr(datasource, "DATASOURCE"): - continue + self.log_buffer += "Could not load datasource %s: missing init_datasource or DATASOURCE\n" % subdirectory + return datasource_id = datasource.DATASOURCE @@ -208,6 +211,19 @@ def load_datasources(self): "config": {} if not hasattr(datasource, "config") else datasource.config } + # Load 4CAT core datasources + for subdirectory in Path(config.get('PATH_ROOT'), "datasources").iterdir(): + if subdirectory.is_dir(): + _load_datasource(subdirectory) + + # Load extension datasources + # os.walk is used to allow for the possibility of multiple extensions, with nested "datasources" folders + for root, dirs, files in os.walk(Path(config.get('PATH_ROOT'), "extensions"), followlinks=True): + if "datasources" in dirs: + for subdirectory in Path(root, "datasources").iterdir(): + if subdirectory.is_dir(): + _load_datasource(subdirectory) + sorted_datasources = {datasource_id: self.datasources[datasource_id] for datasource_id in sorted(self.datasources, key=lambda id: self.datasources[id]["name"])} self.datasources = sorted_datasources diff --git a/docker-compose_build.yml b/docker-compose_build.yml index b1c1fa1af..7466e8ba8 100644 --- a/docker-compose_build.yml +++ b/docker-compose_build.yml @@ -9,7 +9,6 @@ services: - POSTGRES_HOST_AUTH_METHOD=${POSTGRES_HOST_AUTH_METHOD} volumes: - ./data/postgres/:/var/lib/postgresql/data/ -# - 4cat_db:/var/lib/postgresql/data/ healthcheck: test: [ "CMD-SHELL", "pg_isready -U $${POSTGRES_USER}" ] interval: 5s @@ -33,10 +32,6 @@ services: - ./data/datasets/:/usr/src/app/data/ - ./data/config/:/usr/src/app/config/ - ./data/logs/:/usr/src/app/logs/ -# - 4cat_data:/usr/src/app/data/ -# - 4cat_config:/usr/src/app/config/ -# - 
4cat_logs:/usr/src/app/logs/ - entrypoint: docker/docker-entrypoint.sh frontend: @@ -54,9 +49,6 @@ services: - ./data/datasets/:/usr/src/app/data/ - ./data/config/:/usr/src/app/config/ - ./data/logs/:/usr/src/app/logs/ -# - 4cat_data:/usr/src/app/data/ -# - 4cat_config:/usr/src/app/config/ -# - 4cat_logs:/usr/src/app/logs/ command: ["docker/wait-for-backend.sh"] volumes: diff --git a/docker/Dockerfile b/docker/Dockerfile index 709d68893..046b39cba 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -24,6 +24,7 @@ ENV PYTHONUNBUFFERED=1 # Install dependencies RUN pip3 install --upgrade pip COPY ./requirements.txt /usr/src/app/requirements.txt +COPY ./extensions /usr/src/app/extensions COPY ./setup.py /usr/src/app/setup.py COPY ./VERSION /usr/src/app/VERSION COPY ./README.md /usr/src/app/README.md diff --git a/docs/datasource.rst b/docs/datasource.rst index 56ae2189c..c4731a3e1 100644 --- a/docs/datasource.rst +++ b/docs/datasource.rst @@ -59,7 +59,7 @@ needs to function (e.g. queueing any recurring workers). A default implementatio ------------------ The `Search` class ------------------ -.. autoclass:: backend.abstract.search.Search +.. autoclass:: backend.lib.search.Search :members: :undoc-members: :show-inheritance: @@ -67,7 +67,7 @@ The `Search` class --------------------------- The `SearchWithScope` class --------------------------- -.. autoclass:: backend.abstract.search.SearchWithScope +.. autoclass:: backend.lib.search.SearchWithScope :members: :undoc-members: :show-inheritance: \ No newline at end of file diff --git a/docs/processor.rst b/docs/processor.rst index 1bc3c5191..3073b3a09 100644 --- a/docs/processor.rst +++ b/docs/processor.rst @@ -27,7 +27,7 @@ A minimal example of a processor could look like this: """ A minimal example 4CAT processor """ - from backend.abstract.processor import BasicProcessor + from backend.lib.processor import BasicProcessor class ExampleProcessor(BasicProcessor): """ @@ -57,7 +57,7 @@ But there is more you can do. The full API looks like this: The `BasicProcessor` class -------------------------- -.. autoclass:: backend.abstract.processor.BasicProcessor +.. autoclass:: backend.lib.processor.BasicProcessor :members: :undoc-members: :show-inheritance: \ No newline at end of file diff --git a/docs/worker.rst b/docs/worker.rst index bc122f7e9..6eafd5f5a 100644 --- a/docs/worker.rst +++ b/docs/worker.rst @@ -8,7 +8,7 @@ TBD The `BasicWorker` class ----------------------- -.. autoclass:: backend.abstract.worker.BasicWorker +.. autoclass:: backend.lib.worker.BasicWorker :members: :undoc-members: :show-inheritance: \ No newline at end of file diff --git a/extensions/.gitignore b/extensions/.gitignore new file mode 100644 index 000000000..d7e401301 --- /dev/null +++ b/extensions/.gitignore @@ -0,0 +1,5 @@ +# Ignore everything in this directory +* +# Except these files +!.gitignore +!README.md diff --git a/extensions/README.md b/extensions/README.md new file mode 100644 index 000000000..f594bc152 --- /dev/null +++ b/extensions/README.md @@ -0,0 +1,39 @@ +This folder contains 4CAT extensions. + +Extensions are processor or data sources that are not part of the main 4CAT codebase, but are otherwise compatible +with it. For example, a processor that interfaces with a closed API would not be useful to most 4CAT users, but if you +have access to it, you could add such a processor to 4CAT as an extension. + + +## Installation +Extensions are simply folders within this 'extensions' folder in which Python files containing the relevant code is +contained. 
It is strongly recommended that you use git for version control of these folders. Simply commit the code to +a repository somewhere, then clone it into this folder like so: + +```shell +cd [4cat root] +cd extensions +git clone [repository URL] +``` + +This ensures that any dataset created with processors in your extension will be aware of the version of the code they +were created with. This helps debugging and doing reproducible and traceable research. + +## Structure +Processors can simply be .py files in the extension folder. Data sources should be sub-folders in a "datasources" +folder. An extension containing both processors and a data source could look like this: + +``` +[4CAT root]/ +├─ extensions/ +│ ├─ my_extension/ +│ ├─ my_processor.py +│ ├─ my_other_processor.py +│ ├─ datasources/ +│ ├─ my_datasource/ +│ ├─ __init__.py +│ ├─ DESCRIPTION.md +│ ├─ search_my_datasource.py +``` + +In this scenario, `my_extension` would be a git repository within which all other files are contained. \ No newline at end of file diff --git a/helper-scripts/migrate.py b/helper-scripts/migrate.py index fb85772ae..25071afe4 100644 --- a/helper-scripts/migrate.py +++ b/helper-scripts/migrate.py @@ -79,8 +79,39 @@ def check_for_nltk(): nltk.download("omw-1.4", quiet=True) +def install_extensions(no_pip=True): + """ + Check for extensions and run any installation scripts found. -def finish(args, logger): + Note: requirements texts are handled by setup.py + """ + # Check for extension packages + if os.path.isdir("extensions"): + for root, dirs, files in os.walk("extensions"): + for file in files: + if file == "fourcat_install.py": + command = [interpreter, os.path.join(root, file)] + if args.component == "frontend": + command.append("--component=frontend") + elif args.component == "backend": + command.append("--component=backend") + elif args.component == "both": + command.append("--component=both") + + if no_pip: + command.append("--no-pip") + + print(f"Installing extension: {os.path.join(root, file)}") + result = subprocess.run(command, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + if result.returncode != 0: + print("Error while running extension installation script: " + os.path.join(root, file)) + + print(result.stdout.decode("utf-8")) if result.stdout else None + print(result.stderr.decode("utf-8")) if result.stderr else None + + +def finish(args, logger, no_pip=True): """ Finish migration @@ -89,6 +120,7 @@ def finish(args, logger): wrap up and exit. """ check_for_nltk() + install_extensions(no_pip=no_pip) logger.info("\nMigration finished. You can now safely restart 4CAT.\n") if args.restart: @@ -115,7 +147,7 @@ def finish(args, logger): cli.add_argument("--no-migrate", "-m", default=False, action="store_true", help="Do not run scripts to upgrade between minor versions. Use if you only want to use migrate to e.g. upgrade dependencies.") cli.add_argument("--current-version", "-v", default="config/.current-version", help="File path to .current-version file, relative to the 4CAT root") cli.add_argument("--output", "-o", default="", help="By default migrate.py will send output to stdout. If this argument is set, it will write to the given path instead.") -cli.add_argument("--component", "-c", default="both", help="Which component of 4CAT to migrate. Currently only skips check for if 4CAT is running when set to 'frontend'") +cli.add_argument("--component", "-c", default="both", help="Which component of 4CAT to migrate ('both', 'backend', 'frontend'). 
Skips check for if 4CAT is running when set to 'frontend'. Also used by extensions w/ fourcat_install.py") cli.add_argument("--branch", "-b", default=False, help="Which branch to check out from GitHub. By default, check out the latest release.") args = cli.parse_args() @@ -125,6 +157,9 @@ def finish(args, logger): print("This script needs to be run from the same folder as 4cat-daemon.py\n") exit(1) +# track pip +pip_ran = False + # set up logging logger = logging.getLogger("migrate") logger.setLevel(logging.INFO) @@ -145,6 +180,7 @@ def finish(args, logger): logger.info("Restart after migration: " + ("yes" if args.restart else "no")) logger.info("Repository URL: " + args.repository) logger.info(".current-version path: " + args.current_version) +logger.info(f"Current Datetime: {time.strftime('%Y-%m-%d %H:%M:%S')}") # --------------------------------------------- # Ensure existence of current version file @@ -221,7 +257,7 @@ def finish(args, logger): logger.info(" ...latest release available from GitHub (%s) is older than or equivalent to currently checked out version " "(%s)." % (tag_version, current_version_c)) logger.info(" ...upgrade not necessary, skipping.") - finish(args, logger) + finish(args, logger, no_pip=pip_ran) logger.info(" ...ensuring repository %s is a known remote" % args.repository) remote = subprocess.run(shlex.split("git remote add 4cat_migrate %s" % args.repository), stdout=subprocess.PIPE, @@ -297,7 +333,7 @@ def finish(args, logger): if current_version == target_version: logger.info(" ...already up to date.") - finish(args, logger) + finish(args, logger, no_pip=pip_ran) if current_version_c[0:3] != target_version_c[0:3]: logger.info(" ...cannot migrate between different major versions.") @@ -365,6 +401,7 @@ def log_pip_output(logger, output): pip = subprocess.run([interpreter, "-m", "pip", "install", "-r", "requirements.txt", "--upgrade", "--upgrade-strategy", "eager"], stderr=subprocess.STDOUT, stdout=subprocess.PIPE, check=True, cwd=cwd) log_pip_output(logger, pip.stdout) + pip_ran = True except subprocess.CalledProcessError as e: log_pip_output(logger, e.output) logger.info(f"\n Error running pip: {e}") @@ -410,4 +447,4 @@ def log_pip_output(logger, output): # --------------------------------------------- # Done! 
Wrap up and finish # --------------------------------------------- -finish(args, logger) +finish(args, logger, no_pip=pip_ran) diff --git a/helper-scripts/migrate/migrate-1.45-1.46.py b/helper-scripts/migrate/migrate-1.45-1.46.py new file mode 100644 index 000000000..8bf5d0683 --- /dev/null +++ b/helper-scripts/migrate/migrate-1.45-1.46.py @@ -0,0 +1,33 @@ +# Ensure unique metrics index exists +import json +import sys +import os + +from pathlib import Path + +sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), "../..")) +from common.lib.database import Database +from common.lib.logger import Logger + +log = Logger(output=True) + +import configparser + +ini = configparser.ConfigParser() +ini.read(Path(__file__).parent.parent.parent.resolve().joinpath("config/config.ini")) +db_config = ini["DATABASE"] + +db = Database(logger=log, dbname=db_config["db_name"], user=db_config["db_user"], password=db_config["db_password"], + host=db_config["db_host"], port=db_config["db_port"], appname="4cat-migrate") + +print(" Checking if datasets table has a column 'software_source'...") +has_column = db.fetchone( + "SELECT COUNT(*) AS num FROM information_schema.columns WHERE table_name = 'datasets' AND column_name = 'software_source'") +if has_column["num"] == 0: + print(" ...No, adding.") + current_source = db.fetchone("SELECT value FROM settings WHERE name = '4cat.github_url' AND tag = ''") + current_source = json.loads(current_source["value"]) if current_source is not None else "" + db.execute("ALTER TABLE datasets ADD COLUMN software_source TEXT DEFAULT %s", (current_source,)) + db.commit() +else: + print(" ...Yes, nothing to update.") \ No newline at end of file diff --git a/processors/filtering/column_filter.py b/processors/filtering/column_filter.py index 01c7fa88f..2dc73b63e 100644 --- a/processors/filtering/column_filter.py +++ b/processors/filtering/column_filter.py @@ -75,7 +75,7 @@ class ColumnFilter(BaseFilter): @classmethod def is_compatible_with(cls, module=None, user=None): """ - Allow processor on top datasets. + Allow processor on top datasets that are CSV or NDJSON. :param module: Module to determine compatibility with """ @@ -262,11 +262,11 @@ class ColumnProcessorFilter(ColumnFilter): @classmethod def is_compatible_with(cls, module=None, user=None): """ - Allow processor on top datasets. 
+ Allow on child datasets and do not create a standalone dataset :param module: Dataset or processor to determine compatibility with """ - return module.get_extension() in ("csv", "ndjson") and not module.is_top_dataset() + return not module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") @classmethod def is_filter(cls): diff --git a/processors/metrics/rank_attribute.py b/processors/metrics/rank_attribute.py index adffe824a..0e38757c6 100644 --- a/processors/metrics/rank_attribute.py +++ b/processors/metrics/rank_attribute.py @@ -110,11 +110,12 @@ class AttributeRanker(BasicProcessor): @classmethod def is_compatible_with(cls, module=None, user=None): """ - Allow processor on top image rankings + Allow processor to run on all csv and NDJSON datasets :param module: Module to determine compatibility with """ - return module.get_extension() in ["csv", "ndjson"] + + return module.get_extension() in ("csv", "ndjson") def process(self): """ @@ -134,7 +135,7 @@ def process(self): weighby = self.parameters.get("weigh") to_lowercase = self.parameters.get("to-lowercase", True) self.include_missing_data = self.parameters.get("count_missing") - + try: if self.parameters.get("filter"): filter = re.compile(".*" + self.parameters.get("filter") + ".*") @@ -203,7 +204,7 @@ def missing_value_placeholder(data, field_name): for value in values: if to_lowercase: value = value.lower() - + if rank_style == "overall" and value not in overall_top: continue @@ -340,4 +341,4 @@ def get_options(cls, parent_dataset=None, user=None): options["columns"]["options"] = {v: v for v in columns} options["columns"]["default"] = ["body"] - return options \ No newline at end of file + return options diff --git a/processors/networks/wikipedia_network.py b/processors/networks/wikipedia_network.py index 00e141fc7..0426c97d2 100644 --- a/processors/networks/wikipedia_network.py +++ b/processors/networks/wikipedia_network.py @@ -3,19 +3,20 @@ """ import re import requests - -from backend.lib.processor import BasicProcessor from lxml import etree from lxml.cssselect import CSSSelector as css from io import StringIO - import networkx as nx +from backend.lib.processor import BasicProcessor +from common.lib.exceptions import ProcessorInterruptedException + __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters", "Sal Hagen"] __maintainer__ = "Stijn Peeters" __email__ = "4cat@oilab.eu" + class WikiURLCoLinker(BasicProcessor): """ Generate URL co-link network diff --git a/processors/presets/neologisms.py b/processors/presets/neologisms.py index 26684e4d0..1cf258503 100644 --- a/processors/presets/neologisms.py +++ b/processors/presets/neologisms.py @@ -19,17 +19,6 @@ class NeologismExtractor(ProcessorPreset): references = ["Van Soest, Jeroen. 2019. 'Language Innovation Tracker: Detecting language innovation in online discussion fora.' (MA thesis), Beuls, K. (Promotor), Van Eecke, P. 
(Advisor).'"] - @staticmethod - def is_compatible_with(module=None, user=None): - """ - Determine compatibility - - This preset is compatible with any dataset that has columns - - :param Dataset module: Module ID to determine compatibility with - :return bool: - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") @classmethod def get_options(cls, parent_dataset=None, user=None): @@ -60,6 +49,16 @@ def get_options(cls, parent_dataset=None, user=None): return options + @classmethod + def is_compatible_with(cls, module=None, user=None): + """ + Allow processor to run on all csv and NDJSON datasets + + :param module: Dataset or processor to determine compatibility with + """ + + return module.get_extension() in ("csv", "ndjson") + def get_processor_pipeline(self): """ This queues a series of post-processors to extract neologisms from a diff --git a/processors/text-analysis/split_sentences.py b/processors/text-analysis/split_sentences.py index c5cce2477..dd2be7c2f 100644 --- a/processors/text-analysis/split_sentences.py +++ b/processors/text-analysis/split_sentences.py @@ -86,8 +86,11 @@ def get_options(cls, parent_dataset=None, user=None): @classmethod def is_compatible_with(cls, module=None, user=None): """ - Allow CSV and NDJSON datasets + Allow processor to run on all csv and NDJSON datasets + + :param module: Dataset or processor to determine compatibility with """ + return module.get_extension() in ("csv", "ndjson") def process(self): diff --git a/processors/text-analysis/tokenise.py b/processors/text-analysis/tokenise.py index fb1b89cbd..a104306f1 100644 --- a/processors/text-analysis/tokenise.py +++ b/processors/text-analysis/tokenise.py @@ -50,8 +50,11 @@ class Tokenise(BasicProcessor): @classmethod def is_compatible_with(cls, module=None, user=None): """ - Allow CSV and NDJSON datasets + Allow processor to run on all csv and NDJSON datasets + + :param module: Dataset or processor to determine compatibility with """ + return module.get_extension() in ("csv", "ndjson") @classmethod diff --git a/processors/visualisation/download_videos.py b/processors/visualisation/download_videos.py index aa24a724b..2b385ffe7 100644 --- a/processors/visualisation/download_videos.py +++ b/processors/visualisation/download_videos.py @@ -234,7 +234,7 @@ def is_compatible_with(cls, module=None, user=None): in principle, but any links to videos are likely to come from the top dataset anyway. 
- :param str module: Module ID to determine compatibility with + :param module: Module to determine compatibility with :return bool: """ return ((module.type.endswith("-search") or module.is_from_collector()) @@ -645,6 +645,9 @@ def collect_video_urls(self): if not value: continue + if value is not str: + value = str(value) + video_links = self.identify_video_urls_in_string(value) if video_links: item_urls |= set(video_links) @@ -667,7 +670,6 @@ def identify_video_urls_in_string(self, text): :param str text: string that may contain URLs :return list: list containing validated URLs to videos """ - text = str(text) split_comma = self.parameters.get("split-comma", True) if split_comma: texts = text.split(",") diff --git a/processors/visualisation/image_category_wall.py b/processors/visualisation/image_category_wall.py index fee1fb7b0..d74d28e40 100644 --- a/processors/visualisation/image_category_wall.py +++ b/processors/visualisation/image_category_wall.py @@ -61,13 +61,14 @@ class ImageCategoryWallGenerator(BasicProcessor): def is_compatible_with(cls, module=None, user=None): """ Allow processor on CLIP dataset only - + :param module: Dataset or processor to determine compatibility with """ return module.type.startswith("image-to-categories") or \ module.type.startswith("image-downloader") or \ module.type.startswith("video-hasher-1") or \ - module.type.startswith("video-hash-similarity-matrix") + module.type.startswith("video-hash-similarity-matrix") and \ + not module.type not in ["image-downloader-screenshots-search"] @classmethod def get_options(cls, parent_dataset=None, user=None): @@ -170,7 +171,7 @@ def process(self): self.dataset.log(f"Found {image_dataset.type} w/ {image_dataset.num_rows} images and {category_dataset.type} w/ {category_dataset.num_rows} items") category_column = self.parameters.get("category") - if category_column is None: + if not category_column: self.dataset.finish_with_error("No category provided.") return @@ -427,6 +428,3 @@ def process(self): canvas.save(pretty=True) self.dataset.log("Saved to " + str(self.dataset.get_results_path())) return self.dataset.finish(len(category_widths)) - - - diff --git a/processors/visualisation/word-trees.py b/processors/visualisation/word-trees.py index 6446372e8..f7783bcc1 100644 --- a/processors/visualisation/word-trees.py +++ b/processors/visualisation/word-trees.py @@ -104,6 +104,16 @@ class MakeWordtree(BasicProcessor): } } + @classmethod + def is_compatible_with(cls, module=None, user=None): + """ + Allow processor to run on all csv and NDJSON datasets + + :param module: Dataset or processor to determine compatibility with + """ + + return module.get_extension() in ("csv", "ndjson") + # determines how close the nodes are displayed to each other (min. 
1) whitespace = 2 @@ -126,13 +136,6 @@ class MakeWordtree(BasicProcessor): # methods limit = 1 - @classmethod - def is_compatible_with(cls, module=None, user=None): - """ - Allow CSV and NDJSON datasets - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - def process(self): """ This takes a 4CAT results file as input, and outputs a plain text file diff --git a/setup.py b/setup.py index 17079a887..56f5acd16 100644 --- a/setup.py +++ b/setup.py @@ -8,10 +8,10 @@ version = versionfile.readline().strip() # Universal packages -packages = [ +packages = set([ "anytree~=2.8.0", "bcrypt~=3.2.0", - "beautifulsoup4~=4.11.0", + "beautifulsoup4",#~=4.11.0", "clarifai-grpc~=9.0", "cryptography>=39.0.1", "cssselect~=1.1.0", @@ -22,7 +22,7 @@ "Flask~=2.2", "Flask_Limiter==1.0.1", "Flask_Login~=0.6", - "gensim>=4.1.0, <4.2", + "gensim>=4.1.0", "google_api_python_client==2.0.2", "html2text==2020.*", "ImageHash>4.2.0", @@ -31,7 +31,7 @@ "lxml~=4.9.0", "markdown==3.0.1", "markdown2==2.4.2", - "nltk==3.9.1", + "nltk~=3.9.1", "networkx~=2.8.0", "numpy>=1.19.2", "opencv-python>=4.6.0.66", @@ -48,6 +48,7 @@ "razdel~=0.5", "requests~=2.27", "requests_futures", + "scikit_learn", "scenedetect==0.6.0.3", "scikit-learn", "scipy==1.10.1", @@ -64,15 +65,28 @@ "videohash @ git+https://github.com/dale-wahl/videohash@main", "vk_api", "yt-dlp" -] +]) + +# Check for extension packages +if os.path.isdir("extensions"): + extension_packages = set() + for root, dirs, files in os.walk("extensions"): + for file in files: + if file == "requirements.txt": + with open(os.path.join(root, file)) as extension_requirements: + for line in extension_requirements.readlines(): + extension_packages.add(line.strip()) + if extension_packages: + print("Found extensions, installing additional packages: " + str(extension_packages)) + packages = packages.union(extension_packages) # Some packages don't run on Windows -unix_packages = [ +unix_packages = set([ "python-daemon==2.3.2" -] +]) if os.name != "nt": - packages = packages + unix_packages + packages = packages.union(unix_packages) setup( name='fourcat', @@ -85,5 +99,5 @@ url="https://oilab.eu", packages=['backend', 'webtool', 'datasources'], python_requires='>=3.7', - install_requires=packages, + install_requires=list(packages), ) diff --git a/webtool/__init__.py b/webtool/__init__.py index 7becd1239..0fd3ecf5d 100644 --- a/webtool/__init__.py +++ b/webtool/__init__.py @@ -107,10 +107,8 @@ import webtool.views.views_admin import webtool.views.views_restart import webtool.views.views_user - import webtool.views.views_dataset import webtool.views.views_misc - import webtool.views.api_explorer import webtool.views.api_standalone import webtool.views.api_tool diff --git a/webtool/lib/helpers.py b/webtool/lib/helpers.py index d06f4435c..6cc91eba1 100644 --- a/webtool/lib/helpers.py +++ b/webtool/lib/helpers.py @@ -23,7 +23,7 @@ class Pagination(object): Provide pagination """ - def __init__(self, page, per_page, total_count, route="show_results"): + def __init__(self, page, per_page, total_count, route="show_results", route_args=None): """ Set up pagination object @@ -36,6 +36,7 @@ def __init__(self, page, per_page, total_count, route="show_results"): self.per_page = per_page self.total_count = total_count self.route = route + self.route_args = route_args if route_args else {} @property def pages(self): diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py index c50caca26..6ac9272ba 100644 --- a/webtool/lib/template_filters.py +++ 
b/webtool/lib/template_filters.py @@ -139,9 +139,12 @@ def _jinja2_filter_add_ahref(content): return content -@app.template_filter('markdown') -def _jinja2_filter_markdown(text): +@app.template_filter('markdown',) +def _jinja2_filter_markdown(text, trim_container=False): val = markdown.markdown(text) + if trim_container: + val = re.sub(r"^<p>", "", val) + val = re.sub(r"</p>
$", "", val) return val @app.template_filter('isbool') @@ -262,7 +265,7 @@ def _jinja2_filter_post_field(field, post): formatted_field = field field = str(field) - + for key in re.findall(r"\{\{(.*?)\}\}", field): original_key = key @@ -296,7 +299,7 @@ def _jinja2_filter_post_field(field, post): # We see 0 as a valid value - e.g. '0 retweets'. if not val and val != 0: return "" - + # Support some basic string slicing if string_slice: field = field.replace("[" + string_slice + "]", "") @@ -317,7 +320,7 @@ def _jinja2_filter_post_field(field, post): # Apply further filters, if present (e.g. lower) for extra_filter in extra_filters: - + extra_filter = extra_filter.strip() # We're going to parse possible parameters to pass to the filter @@ -328,7 +331,7 @@ def _jinja2_filter_post_field(field, post): extra_filter = extra_filter.split("(")[0] params = [p.strip() for p in params.split(",")] params = [post[param] for param in params] - + val = app.jinja_env.filters[extra_filter](val, *params) if string_slice: @@ -388,3 +391,7 @@ def uniqid(): "__version": version, "uniqid": uniqid } + +@app.template_filter('log') +def _jinja2_filter_log(text): + app.logger.info(text) \ No newline at end of file diff --git a/webtool/static/js/fourcat.js b/webtool/static/js/fourcat.js index e622505b2..7e0058fc3 100644 --- a/webtool/static/js/fourcat.js +++ b/webtool/static/js/fourcat.js @@ -630,17 +630,17 @@ const query = { for (let i = 0; i < json.length; i += 1) { search_queue_length += json[i]['count']; - search_queue_notice += " " + json[i]['jobtype'].replace('-search', '') + ' (' + json[i]['count'] + ')' + '' + search_queue_notice += " " + json[i]['processor_name'] + ' (' + json[i]['count'] + ')' + '' } if (search_queue_length == 0) { search_queue_box.html('Search queue is empty.'); search_queue_list.html(''); } else if (search_queue_length == 1) { - search_queue_box.html('Currently processing 1 search query: '); + search_queue_box.html('Currently collecting 1 dataset: '); search_queue_list.html(search_queue_notice); } else { - search_queue_box.html('Currently processing ' + search_queue_length + ' search queries: '); + search_queue_box.html('Currently collecting ' + search_queue_length + ' datasets: '); search_queue_list.html(search_queue_notice); } }, @@ -1993,4 +1993,4 @@ function find_parent(element, selector) { } return null; -} \ No newline at end of file +} diff --git a/webtool/templates/components/datasource-option.html b/webtool/templates/components/datasource-option.html index 5eff77e00..4ee4ba16e 100644 --- a/webtool/templates/components/datasource-option.html +++ b/webtool/templates/components/datasource-option.html @@ -9,7 +9,7 @@{{ settings.tooltip }}
{% endif %} + {% if "tooltip" in settings %}{{ settings.tooltip|markdown(True)|safe }}
{% endif %} {% elif settings.type == "file" %} {% if "tooltip" in settings %} diff --git a/webtool/templates/components/pagination.html b/webtool/templates/components/pagination.html index 607844157..91ea859c8 100644 --- a/webtool/templates/components/pagination.html +++ b/webtool/templates/components/pagination.html @@ -2,12 +2,12 @@ \ No newline at end of file diff --git a/webtool/templates/components/result-details.html b/webtool/templates/components/result-details.html index 9cf51a2a6..ebe8f64ec 100644 --- a/webtool/templates/components/result-details.html +++ b/webtool/templates/components/result-details.html @@ -163,7 +163,7 @@{{ __user_config("4cat.name") }} can remove information it identifies as relating to an item's author, or + {% if __user_config("ui.offer_hashing") %} +
4CAT can remove information it identifies as relating to an item's author, or replace it with a hashed value. Other personal information may persist; it is your responsibility to further anonymise data where appropriate.
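These template blocks are driven by the two new settings defined earlier in `common/lib/config_definition.py`. A minimal sketch of how they might be read from Python; it assumes the usual `config.get()` accessor with a default value, and the variable names are illustrative:

```python
from common.config_manager import config

# Both settings default to True, so existing deployments keep offering the
# pseudonymisation and "make dataset private" checkboxes unless an admin
# switches them off.
offer_hashing = config.get("ui.offer_hashing", True)
offer_private = config.get("ui.offer_private", True)
```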
@@ -51,17 +51,17 @@This will only hide your dataset from other users. It will NOT encrypt your data and server administrators will still be able to view it. If you are working with sensitive data, you should consider running your own 4CAT instance.
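A closing sketch of the `clean_up()` hook added to `backend/lib/worker.py`: `run()` now calls it whether `work()` completed or raised, so a worker can release resources in one place. The worker name and the temporary staging folder below are assumptions made for illustration, not code from this diff:

```python
import shutil
import tempfile

from backend.lib.worker import BasicWorker


class ExampleCleanupWorker(BasicWorker):
    """Illustrative worker; only the clean_up() mechanism comes from the diff."""
    type = "example-cleanup-worker"
    max_workers = 1

    def work(self):
        # scratch space that should disappear even if work() raises
        self.staging = tempfile.mkdtemp()
        # ... actual work would happen here ...

    def clean_up(self):
        # called by run() after work() finishes or aborts with an error
        if getattr(self, "staging", None):
            shutil.rmtree(self.staging, ignore_errors=True)
```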
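The same extension-discovery pattern now appears in two places: `setup.py` merges any `extensions/*/requirements.txt` into the install set, and `migrate.py` runs every `fourcat_install.py` it finds. A condensed sketch of that walk, with made-up folder contents; it only restates what the two hunks above already do:

```python
import os

extra_requirements = set()
install_scripts = []
for root, dirs, files in os.walk("extensions"):
    for file in files:
        if file == "requirements.txt":
            # collected by setup.py and appended to the regular package list
            with open(os.path.join(root, file)) as handle:
                extra_requirements |= {line.strip() for line in handle if line.strip()}
        elif file == "fourcat_install.py":
            # executed by migrate.py with --component and --no-pip flags
            install_scripts.append(os.path.join(root, file))
```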