From f196be2709c5681e4e3924901357bf0e6ed220df Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Mon, 19 Aug 2024 14:22:07 +0200 Subject: [PATCH 01/12] export processor --- common/lib/dataset.py | 16 +++- processors/conversion/export_datasets.py | 100 +++++++++++++++++++++++ webtool/views/api_tool.py | 6 +- 3 files changed, 116 insertions(+), 6 deletions(-) create mode 100644 processors/conversion/export_datasets.py diff --git a/common/lib/dataset.py b/common/lib/dataset.py index 8510a5adb..56ea7d463 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -15,7 +15,7 @@ import backend from common.config_manager import config from common.lib.job import Job, JobNotFoundException -from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int +from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int, get_software_version from common.lib.item_mapping import MappedItem, MissingMappedField, DatasetItem from common.lib.fourcat_module import FourcatModule from common.lib.exceptions import (ProcessorInterruptedException, DataSetException, DataSetNotFoundException, @@ -1542,6 +1542,20 @@ def get_media_type(self): # Default to text return self.parameters.get("media_type", "text") + def get_metadata(self): + """ + Get dataset metadata + + This consists of all the data stored in the database for this dataset, plus the current 4CAT version (appended + as 'current_4CAT_version'). This is useful for exporting datasets, as it can be used by another 4CAT instance to + update its database (and ensure compatibility with the exporting version of 4CAT). + """ + metadata = self.db.fetchone("SELECT * FROM datasets WHERE key = %s", (self.key,)) + + # get 4CAT version (presumably to ensure export is compatible with import) + metadata["current_4CAT_version"] = get_software_version() + return metadata + def get_result_url(self): """ Gets the 4CAT frontend URL of a dataset file. 
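The `get_metadata()` method added above bundles the dataset's database row with the exporting server's 4CAT version under `current_4CAT_version`. As a minimal sketch of how the importing side can use that field — mirroring the version check that `validate_query()` performs later in this series; the function name here is an assumption for illustration, not part of the patch:

```python
# Sketch: compatibility check against exported metadata (assumed helper name).
from common.lib.helpers import get_software_version


def check_export_compatibility(metadata):
    """
    Refuse to import metadata produced by a different 4CAT version

    :param dict metadata:  Output of DataSet.get_metadata() on the exporting server
    """
    local_version = get_software_version()
    exported_version = metadata.get("current_4CAT_version")
    if exported_version != local_version:
        raise ValueError(
            f"Export was made with 4CAT {exported_version}, "
            f"but this server runs {local_version}"
        )
```

The same comparison is what makes cross-server imports refuse mismatched versions, so a ZIP-based import can reuse it unchanged.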
diff --git a/processors/conversion/export_datasets.py b/processors/conversion/export_datasets.py new file mode 100644 index 000000000..f45cf92f7 --- /dev/null +++ b/processors/conversion/export_datasets.py @@ -0,0 +1,100 @@ +""" +Export a dataset and all its children to a ZIP file +""" +import shutil +import json + +from backend.lib.processor import BasicProcessor +from common.lib.dataset import DataSet +from common.lib.exceptions import ProcessorException, DataSetException + +__author__ = "Dale Wahl" +__credits__ = ["Dale Wahl"] +__maintainer__ = "Dale Wahl" +__email__ = "4cat@oilab.eu" + + + +class ExportDatasets(BasicProcessor): + """ + Export a dataset and all its children to a ZIP file + """ + type = "export-datasets"  # job type ID + category = "Conversion"  # category + title = "Export Dataset and All Analyses"  # title displayed in UI + description = "Creates a ZIP file containing the dataset and all analyses to be archived and uploaded to a 4CAT instance in the future"  # description displayed in UI + extension = "zip"  # extension of result file, used internally and in UI + + @classmethod + def is_compatible_with(cls, module=None, user=None): + """ + Determine if processor is compatible with dataset + + :param module: Module to determine compatibility with + """ + return module.is_top_dataset() and user.can_access_dataset(dataset=module, role="owner") + + def process(self): + """ + Export the dataset and all its finished analyses to a ZIP archive + """ + self.dataset.update_status("Collecting dataset and all analyses") + + results_path = self.dataset.get_staging_area() + + exported_datasets = [] + failed_exports = []  # keys that failed to export + keys = [self.dataset.top_parent().key]  # get the key of the top parent + while keys: + dataset_key = keys.pop(0) + self.dataset.log(f"Exporting dataset {dataset_key}.") + + try: + dataset = DataSet(key=dataset_key, db=self.db) + # TODO: these two should fail for the primary dataset, but should they fail for the children too? 
+ except DataSetException: + self.dataset.finish_with_error("Dataset not found.") + return + if not dataset.is_finished(): + self.dataset.finish_with_error("You cannot export unfinished datasets.") + return + + # get metadata + metadata = dataset.get_metadata() + if metadata["num_rows"] == 0: + self.dataset.update_status(f"Skipping empty dataset {dataset_key}") + failed_exports.append(dataset_key) + continue + + # get data + data_file = dataset.get_results_path() + if not data_file.exists(): + self.dataset.finish_with_error(f"Dataset {dataset_key} has no data; skipping.") + failed_exports.append(dataset_key) + continue + + # get log + log_file = dataset.get_results_path().with_suffix(".log") + + # All good, add to ZIP + with results_path.joinpath(f"{dataset_key}_metadata.json").open("w", encoding="utf-8") as outfile: + outfile.write(json.dumps(metadata)) + shutil.copy(data_file, results_path.joinpath(data_file.name)) + if log_file.exists(): + shutil.copy(log_file, results_path.joinpath(log_file.name)) + + # add children to queue + # Not using get_all_children() because we want to skip unfinished datasets and only need the keys + children = [d["key"] for d in self.db.fetchall("SELECT key FROM datasets WHERE key_parent = %s AND is_finished = TRUE", (dataset_key,))] + keys.extend(children) + + self.dataset.update_status(f"Exported dataset {dataset_key}.") + exported_datasets.append(dataset_key) + + # Add export log to ZIP + self.dataset.log(f"Exported datasets: {exported_datasets}") + self.dataset.log(f"Failed to export datasets: {failed_exports}") + shutil.copy(self.dataset.get_log_path(), results_path.joinpath("export.log")) + + # done! + self.write_archive_and_finish(results_path, len(exported_datasets)) \ No newline at end of file diff --git a/webtool/views/api_tool.py b/webtool/views/api_tool.py index e4645e6d5..64c331271 100644 --- a/webtool/views/api_tool.py +++ b/webtool/views/api_tool.py @@ -1237,11 +1237,7 @@ def export_packed_dataset(key=None, component=None): return error(403, error="You cannot export unfinished datasets.") if component == "metadata": - metadata = db.fetchone("SELECT * FROM datasets WHERE key = %s", (dataset.key,)) - - # get 4CAT version (presumably to ensure export is compatible with import) - metadata["current_4CAT_version"] = get_software_version() - return jsonify(metadata) + return jsonify(dataset.get_metadata()) elif component == "children": children = [d["key"] for d in db.fetchall("SELECT key FROM datasets WHERE key_parent = %s AND is_finished = TRUE", (dataset.key,))] From e23f9253f0f1193a837eaf08fb1e598fd42c2d7a Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 20 Aug 2024 09:07:52 +0200 Subject: [PATCH 02/12] start of importer --- datasources/fourcat_import/import_4cat.py | 400 +++++++++++++++------- 1 file changed, 274 insertions(+), 126 deletions(-) diff --git a/datasources/fourcat_import/import_4cat.py b/datasources/fourcat_import/import_4cat.py index cd231b445..069549446 100644 --- a/datasources/fourcat_import/import_4cat.py +++ b/datasources/fourcat_import/import_4cat.py @@ -4,6 +4,7 @@ import requests import json import time +import zipfile from backend.lib.processor import BasicProcessor from common.lib.exceptions import (QueryParametersException, FourcatException, ProcessorInterruptedException, @@ -19,8 +20,8 @@ class FourcatImportException(FourcatException): class SearchImportFromFourcat(BasicProcessor): type = "import_4cat-search" # job ID category = "Search" # category - title = "Import from 4CAT" # title displayed in UI - description = 
"Import a dataset from another 4CAT server" # description displayed in UI + title = "Import 4CAT dataset and analyses" # title displayed in UI + description = "Import a dataset from another 4CAT server or from a zip file (exported from a 4CAT server)" # description displayed in UI is_local = False # Whether this datasource is locally scraped is_static = False # Whether this datasource is still updated @@ -33,29 +34,242 @@ class SearchImportFromFourcat(BasicProcessor): "\n\nTo import a dataset across servers, both servers need to be running the same version of 4CAT. " "You can find the current version in the footer at the bottom of the interface." }, + "method": { + "type": UserInput.OPTION_CHOICE, + "help": "Import Type", + "options": { + "zip": "Zip File", + "url": "4CAT URL", + }, + "default": "url" + }, "url": { "type": UserInput.OPTION_TEXT, "help": "Dataset URL", - "tooltip": "URL to the dataset's page, for example https://4cat.example/results/28da332f8918e6dc5aacd1c3b0170f01b80bd95f8ff9964ac646cecd33bfee49/." + "tooltip": "URL to the dataset's page, for example https://4cat.example/results/28da332f8918e6dc5aacd1c3b0170f01b80bd95f8ff9964ac646cecd33bfee49/.", + "requires": "method^=url" }, "intro2": { "type": UserInput.OPTION_INFO, "help": "You can create an API key via the 'API Access' item in 4CAT's navigation menu. Note that you need " "an API key from **the server you are importing from**, not the one you are looking at right now. " - "Additionally, you need to have owner access to the dataset you want to import." + "Additionally, you need to have owner access to the dataset you want to import.", + "requires": "method^=url" }, "api-key": { "type": UserInput.OPTION_TEXT, "help": "4CAT API Key", "sensitive": True, "cache": True, - } + "requires": "method^=url" + }, + "data_upload": { + "type": UserInput.OPTION_FILE, + "help": "File", + "tooltip": "Upload a ZIP file containing a dataset exported from a 4CAT server.", + "requires": "method^=zip" + }, + } created_datasets = None base = None + remapped_keys = None def process(self): + """ + Import 4CAT dataset either from another 4CAT server or from the uploaded zip file + """ + if self.parameters.get("method") == "zip": + self.process_zip() + else: + self.process_urls() + + def after_create(query, dataset, request): + """ + Hook to execute after the dataset for this source has been created + + In this case, put the file in a temporary location so it can be + processed properly by the related Job later. 
+ + :param dict query: Sanitised query parameters + :param DataSet dataset: Dataset created for this query + :param request: Flask request submitted for its creation + """ + if query.get("method") == "zip": + file = request.files["option-data_upload"] + file.seek(0) + with dataset.get_results_path().with_suffix(".importing").open("wb") as outfile: + while True: + chunk = file.read(1024) + if len(chunk) == 0: + break + outfile.write(chunk) + else: + # nothing to do for URLs + pass + + + def process_zip(self): + """ + Import 4CAT dataset from a ZIP file + """ + self.dataset.update_status(f"Importing datasets and analyses from ZIP file.") + temp_file = self.dataset.get_results_path().with_suffix(".importing") + + processed_files = [] + missed_files = [] + with zipfile.ZipFile(temp_file, "r") as zip_ref: + zip_contents = zip_ref.namelist() + + # Get all metadata files and determine primary dataset + metadata_files = [file for file in zip_contents if file.endswith("_metadata.json")] + if not metadata_files: + self.dataset.finish_with_error("No metadata files found in ZIP file; is this a 4CAT export?") + return + + # Get the primary dataset + primary_dataset_keys = set() + datasets = [] + for file in metadata_files: + with zip_ref.open(file) as f: + metadata = json.load(f) + if metadata.get("key_parent") is None: + primary_dataset_keys.add(metadata.get("key")) + datasets.append(metadata) + else: + # Child datasets are skipped for now, as we may need to remap keys + pass + + # Primary dataset will overwrite this dataset; we could address this to support multiple primary datasets + if len(primary_dataset_keys) != 1: + self.dataset.finish_with_error("ZIP file contains multiple primary datasets; only one is allowed.") + return + + # Import datasets ( + # TODO: this is ordered due to potential issues with keys needing to be remapped, but there may be an issue with a child datasets having additional children and needing remapping... + while datasets: + metadata = datasets.pop(0) + dataset_key = metadata.get("key") + processed_metadata = self.process_metadata(metadata) + if dataset_key in primary_dataset_keys: + # Import primary dataset + self.dataset.update_status(f"Importing primary dataset {dataset_key}.") + + + + + + + + + # Check that all files were processed + if len(zip_contents) != len(processed_files): + for file in zip_contents: + if file not in processed_files: + missed_files.append(file) + + + @staticmethod + def process_metadata(metadata): + """ + Process metadata for import + """ + # get rid of some keys that are server-specific and don't need to + # be stored (or don't correspond to database columns) + metadata.pop("current_4CAT_version") + metadata.pop("id") + metadata.pop("job") + metadata.pop("is_private") + metadata.pop("is_finished") # we'll finish it ourselves, thank you!!! + + # extra params are stored as JSON... + metadata["parameters"] = json.loads(metadata["parameters"]) + if "copied_from" in metadata["parameters"]: + metadata["parameters"].pop("copied_from") + metadata["parameters"] = json.dumps(metadata["parameters"]) + + return metadata + + def create_dataset(self, metadata, original_key, primary=False): + """ + Create a new dataset + """ + if primary: + # if this is the first dataset we're importing, make it the + # processor's "own" dataset. 
the key has already been set to + # the imported dataset's key via ensure_key() (or a new unqiue + # key if it already existed on this server) + # by making it the "own" dataset, the user initiating the + # import will see the imported dataset as the "result" of their + # import query in the interface, similar to the workflow for + # other data sources + new_dataset = self.dataset + metadata.pop("key") # key already OK (see above) + self.db.update("datasets", where={"key": new_dataset.key}, data=metadata) + + else: + # supernumerary datasets - handle on their own + # these include any children of imported datasets + try: + key_exists = DataSet(key=metadata["key"], db=self.db) + + # if we *haven't* thrown a DatasetException now, then the + # key is already in use, so create a "dummy" dataset and + # overwrite it with the metadata we have (except for the + # key). this ensures that a new unique key will be + # generated. + new_dataset = DataSet(parameters={}, type=self.type, db=self.db) + metadata.pop("key") + self.db.update("datasets", where={"key": new_dataset.key}, data=metadata) + + except DataSetException: + # this is *good* since it means the key doesn't exist, so + # we can re-use the key of the imported dataset + self.db.insert("datasets", data=metadata) + new_dataset = DataSet(key=metadata["key"], db=self.db) + + # make sure the dataset path uses the new key and local dataset + # path settings. this also makes sure the log file is created in + # the right place (since it is derived from the results file path) + extension = metadata["result_file"].split(".")[-1] + new_dataset.reserve_result_file(parameters=new_dataset.parameters, extension=extension) + + new_dataset.update_status("Imported dataset created") + if new_dataset.key != original_key: + # could not use original key because it was already in use + # so update any references to use the new key + self.remapped_keys[original_key] = new_dataset.key + new_dataset.update_status(f"Cannot import with same key - already in use on this server. Using key " + f"{new_dataset.key} instead of key {original_key}!") + + # refresh object, make sure it's in sync with the database + self.created_datasets.add(new_dataset.key) + new_dataset = DataSet(key=new_dataset.key, db=self.db) + if new_dataset.key == self.dataset.key: + # this ensures that the first imported dataset becomes the + # processor's "own" dataset, and that the import logs go to + # that dataset's log file. For later imports, this evaluates to + # False. + self.dataset = new_dataset + + # if the key of the parent dataset was changed, change the + # reference to it that the child dataset has + if new_dataset.key_parent and new_dataset.key_parent in self.remapped_keys: + new_dataset.key_parent = self.remapped_keys[new_dataset.key_parent] + + # update some attributes that should come from the new server, not + # the old + new_dataset.creator = dataset_owner + new_dataset.original_timestamp = new_dataset.timestamp + new_dataset.imported = True + new_dataset.timestamp = int(time.time()) + new_dataset.db.commit() + + return new_dataset + + + def process_urls(self): """ Import 4CAT dataset from another 4CAT server @@ -70,7 +284,7 @@ def process(self): self.created_datasets = set() # keys of created datasets - may not be successful! 
imported = [] # successfully imported datasets failed_imports = [] # keys that failed to import - remapped_keys = {} # changed dataset keys + self.remapped_keys = {} # changed dataset keys num_rows = 0 # will be used later dataset_owner = self.dataset.get_owners()[0] # at this point it has 1 owner @@ -101,90 +315,10 @@ def process(self): failed_imports.append(dataset_key) continue - # get rid of some keys that are server-specific and don't need to - # be stored (or don't correspond to database columns) - metadata.pop("current_4CAT_version") - metadata.pop("id") - metadata.pop("job") - metadata.pop("is_private") - metadata.pop("is_finished") # we'll finish it ourselves, thank you!!! - - # extra params are stored as JSON... - metadata["parameters"] = json.loads(metadata["parameters"]) - if "copied_from" in metadata["parameters"]: - metadata["parameters"].pop("copied_from") - metadata["parameters"] = json.dumps(metadata["parameters"]) - - if not imported: - # if this is the first dataset we're importing, make it the - # processor's "own" dataset. the key has already been set to - # the imported dataset's key via ensure_key() (or a new unqiue - # key if it already existed on this server) - # by making it the "own" dataset, the user initiating the - # import will see the imported dataset as the "result" of their - # import query in the interface, similar to the workflow for - # other data sources - new_dataset = self.dataset - metadata.pop("key") # key already OK (see above) - self.db.update("datasets", where={"key": new_dataset.key}, data=metadata) + metadata = self.process_metadata(metadata) - else: - # supernumerary datasets - handle on their own - # these include any children of imported datasets - try: - key_exists = DataSet(key=metadata["key"], db=self.db) - - # if we *haven't* thrown a DatasetException now, then the - # key is already in use, so create a "dummy" dataset and - # overwrite it with the metadata we have (except for the - # key). this ensures that a new unique key will be - # generated. - new_dataset = DataSet(parameters={}, type=self.type, db=self.db) - metadata.pop("key") - self.db.update("datasets", where={"key": new_dataset.key}, data=metadata) - - except DataSetException: - # this is *good* since it means the key doesn't exist, so - # we can re-use the key of the imported dataset - self.db.insert("datasets", data=metadata) - new_dataset = DataSet(key=metadata["key"], db=self.db) - - # make sure the dataset path uses the new key and local dataset - # path settings. this also makes sure the log file is created in - # the right place (since it is derived from the results file path) - extension = metadata["result_file"].split(".")[-1] - new_dataset.reserve_result_file(parameters=new_dataset.parameters, extension=extension) - - new_dataset.update_status("Imported dataset created") - if new_dataset.key != dataset_key: - # could not use original key because it was already in use - # so update any references to use the new key - remapped_keys[dataset_key] = new_dataset.key - new_dataset.update_status(f"Cannot import with same key - already in use on this server. Using key " - f"{new_dataset.key} instead of key {dataset_key}!") - - # refresh object, make sure it's in sync with the database - self.created_datasets.add(new_dataset.key) - new_dataset = DataSet(key=new_dataset.key, db=self.db) - if new_dataset.key == self.dataset.key: - # this ensures that the first imported dataset becomes the - # processor's "own" dataset, and that the import logs go to - # that dataset's log file. 
For later imports, this evaluates to - # False. - self.dataset = new_dataset - - # if the key of the parent dataset was changed, change the - # reference to it that the child dataset has - if new_dataset.key_parent and new_dataset.key_parent in remapped_keys: - new_dataset.key_parent = remapped_keys[new_dataset.key_parent] - - # update some attributes that should come from the new server, not - # the old - new_dataset.creator = dataset_owner - new_dataset.original_timestamp = new_dataset.timestamp - new_dataset.imported = True - new_dataset.timestamp = int(time.time()) - new_dataset.db.commit() + # create the new dataset + new_dataset = self.create_dataset(metadata, dataset_key, primary=True if not imported else False) # then, the log self.halt_and_catch_fire() @@ -353,47 +487,61 @@ def validate_query(query, request, user): :param User user: User object of user who has submitted the query :return dict: Safe query parameters """ - urls = query.get("url") - if not urls: - return QueryParametersException("Provide at least one dataset URL.") - - urls = urls.split(",") - bases = set([url.split("/results/")[0].lower() for url in urls]) - keys = SearchImportFromFourcat.get_keys_from_urls(urls) + if query.get("method") == "zip": + file = request.files.get("data_upload") + if not file: + return QueryParametersException("No file uploaded.") + + if not file.filename.endswith(".zip"): + return QueryParametersException("Uploaded file must be a ZIP file.") + + return { + "data_upload": file + } + elif query.get("method") == "url": + urls = query.get("url") + if not urls: + return QueryParametersException("Provide at least one dataset URL.") + + urls = urls.split(",") + bases = set([url.split("/results/")[0].lower() for url in urls]) + keys = SearchImportFromFourcat.get_keys_from_urls(urls) + + if len(keys) != 1: + # todo: change this to < 1 if we allow multiple datasets + return QueryParametersException("You need to provide a single URL to a 4CAT dataset to import.") + + if len(bases) != 1: + return QueryParametersException("All URLs need to point to the same 4CAT server. You can only import from " + "one 4CAT server at a time.") + + base = urls[0].split("/results/")[0] + try: + # test if API key is valid and server is reachable + test = SearchImportFromFourcat.fetch_from_4cat(base, keys[0], query.get("api-key"), "metadata") + except FourcatImportException as e: + raise QueryParametersException(str(e)) - if len(keys) != 1: - # todo: change this to < 1 if we allow multiple datasets - return QueryParametersException("You need to provide a single URL to a 4CAT dataset to import.") + try: + # test if we get a response we can parse + metadata = test.json() + except ValueError: + raise QueryParametersException(f"Unexpected response when trying to fetch metadata for dataset {keys[0]}.") - if len(bases) != 1: - return QueryParametersException("All URLs need to point to the same 4CAT server. You can only import from " - "one 4CAT server at a time.") + version = get_software_version() - base = urls[0].split("/results/")[0] - try: - # test if API key is valid and server is reachable - test = SearchImportFromFourcat.fetch_from_4cat(base, keys[0], query.get("api-key"), "metadata") - except FourcatImportException as e: - raise QueryParametersException(str(e)) + if metadata.get("current_4CAT_version") != version: + raise QueryParametersException(f"This 4CAT server is running a different version of 4CAT ({version}) than " + f"the one you are trying to import from ({metadata.get('current_4CAT_version')}). 
Make " + "sure both are running the same version of 4CAT and try again.") - try: - # test if we get a response we can parse - metadata = test.json() - except ValueError: - raise QueryParametersException(f"Unexpected response when trying to fetch metadata for dataset {keys[0]}.") - - version = get_software_version() - - if metadata.get("current_4CAT_version") != version: - raise QueryParametersException(f"This 4CAT server is running a different version of 4CAT ({version}) than " - f"the one you are trying to import from ({metadata.get('current_4CAT_version')}). Make " - "sure both are running the same version of 4CAT and try again.") - - # OK, we can import at least one dataset - return { - "url": ",".join(urls), - "api-key": query.get("api-key") - } + # OK, we can import at least one dataset + return { + "url": ",".join(urls), + "api-key": query.get("api-key") + } + else: + raise QueryParametersException("Import method not yet implemented.") @staticmethod def get_keys_from_urls(urls): From 358aca2ea331859998db1eaf0683ad625407ed1d Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Fri, 6 Sep 2024 14:36:24 +0200 Subject: [PATCH 03/12] finish off importing ZIP 4CAT datasets --- datasources/fourcat_import/import_4cat.py | 146 +++++++++++++++++----- 1 file changed, 114 insertions(+), 32 deletions(-) diff --git a/datasources/fourcat_import/import_4cat.py b/datasources/fourcat_import/import_4cat.py index 069549446..a949599e7 100644 --- a/datasources/fourcat_import/import_4cat.py +++ b/datasources/fourcat_import/import_4cat.py @@ -75,11 +75,15 @@ class SearchImportFromFourcat(BasicProcessor): created_datasets = None base = None remapped_keys = None + dataset_owner = None def process(self): """ Import 4CAT dataset either from another 4CAT server or from the uploaded zip file """ + self.created_datasets = set() # keys of created datasets - may not be successful! 
+ self.remapped_keys = {} # changed dataset keys + self.dataset_owner = self.dataset.get_owners()[0] # at this point it has 1 owner if self.parameters.get("method") == "zip": self.process_zip() else: @@ -117,8 +121,9 @@ def process_zip(self): self.dataset.update_status(f"Importing datasets and analyses from ZIP file.") temp_file = self.dataset.get_results_path().with_suffix(".importing") - processed_files = [] - missed_files = [] + imported = [] + processed_files = 1 # take into account the export.log file + failed_imports = [] with zipfile.ZipFile(temp_file, "r") as zip_ref: zip_contents = zip_ref.namelist() @@ -131,44 +136,111 @@ def process_zip(self): # Get the primary dataset primary_dataset_keys = set() datasets = [] + parent_child_mapping = {} for file in metadata_files: with zip_ref.open(file) as f: metadata = json.load(f) - if metadata.get("key_parent") is None: + if not metadata.get("key_parent"): primary_dataset_keys.add(metadata.get("key")) datasets.append(metadata) else: - # Child datasets are skipped for now, as we may need to remap keys - pass + # Store the mapping of parent to child datasets + parent_key = metadata.get("key_parent") + if parent_key not in parent_child_mapping: + parent_child_mapping[parent_key] = [] + parent_child_mapping[parent_key].append(metadata) # Primary dataset will overwrite this dataset; we could address this to support multiple primary datasets if len(primary_dataset_keys) != 1: self.dataset.finish_with_error("ZIP file contains multiple primary datasets; only one is allowed.") return - # Import datasets ( - # TODO: this is ordered due to potential issues with keys needing to be remapped, but there may be an issue with a child datasets having additional children and needing remapping... + # Import datasets while datasets: + self.halt_and_catch_fire() + + # Create the datasets metadata = datasets.pop(0) dataset_key = metadata.get("key") processed_metadata = self.process_metadata(metadata) - if dataset_key in primary_dataset_keys: - # Import primary dataset - self.dataset.update_status(f"Importing primary dataset {dataset_key}.") - - - - - - + new_dataset = self.create_dataset(processed_metadata, dataset_key, dataset_key in primary_dataset_keys) + processed_files += 1 + + # TODO: I am now noticing that we do not update the results_file; it is even more unlikely to collide as it is both a random key and label combined... but... + # Copy the log file + self.halt_and_catch_fire() + log_filename = new_dataset.get_log_path().name + if log_filename in zip_contents: + self.dataset.update_status(f"Transferring log file for dataset {new_dataset.key}") + with zip_ref.open(log_filename) as f: + with new_dataset.get_log_path().open("wb") as outfile: + outfile.write(f.read()) + processed_files += 1 + else: + self.dataset.log(f"Log file not found for dataset {new_dataset.key} (original key {dataset_key}).") + + # Copy the results + self.halt_and_catch_fire() + results_filename = new_dataset.get_results_path().name + if results_filename in zip_contents: + self.dataset.update_status(f"Transferring data file for dataset {new_dataset.key}") + with zip_ref.open(results_filename) as f: + with new_dataset.get_results_path().open("wb") as outfile: + outfile.write(f.read()) + processed_files += 1 + + if not imported: + # first dataset - use num rows as 'overall' + num_rows = metadata["num_rows"] + else: + # TODO: should I just delete the new_dataset here? 
+ self.dataset.log(f"Results file not found for dataset {new_dataset.key} (original key {dataset_key}).") + new_dataset.finish_with_error(f"Results file not found for dataset {new_dataset.key} (original key {dataset_key}).") + failed_imports.append(dataset_key) + continue + + # finally, the kids + self.halt_and_catch_fire() + if dataset_key in parent_child_mapping: + datasets.extend(parent_child_mapping[dataset_key]) + self.dataset.log(f"Adding ({len(parent_child_mapping[dataset_key])}) child datasets to import queue") + + # done - remember that we've imported this one + imported.append(new_dataset) + new_dataset.update_status(metadata["status"]) + if new_dataset.key != self.dataset.key: + # only finish if this is not the 'main' dataset, or the user + # will think the whole import is done + new_dataset.finish(metadata["num_rows"]) # Check that all files were processed - if len(zip_contents) != len(processed_files): + missed_files = [] + if len(zip_contents) != processed_files: for file in zip_contents: if file not in processed_files: missed_files.append(file) + # todo: this part needs updating if/when we support importing multiple datasets! + if failed_imports: + self.dataset.update_status(f"Dataset import finished, but not all data was imported properly. " + f"{len(failed_imports)} dataset(s) were not successfully imported. Check the " + f"dataset log file for details.", is_final=True) + elif missed_files: + self.dataset.log(f"ZIP file contained {len(missed_files)} files that were not processed: {missed_files}") + self.dataset.update_status(f"Dataset import finished, but not all files were processed. " + f"{len(missed_files)} files were not successfully imported. Check the " + f"dataset log file for details.", is_final=True) + else: + self.dataset.update_status(f"{len(imported)} dataset(s) succesfully imported.", + is_final=True) + + if not self.dataset.is_finished(): + # now all related datasets are imported, we can finish the 'main' + # dataset, and the user will be alerted that the full import is + # complete + self.dataset.finish(num_rows) + @staticmethod def process_metadata(metadata): @@ -196,6 +268,7 @@ def create_dataset(self, metadata, original_key, primary=False): Create a new dataset """ if primary: + self.dataset.update_status(f"Importing primary dataset {original_key}.") # if this is the first dataset we're importing, make it the # processor's "own" dataset. the key has already been set to # the imported dataset's key via ensure_key() (or a new unqiue @@ -209,6 +282,7 @@ def create_dataset(self, metadata, original_key, primary=False): self.db.update("datasets", where={"key": new_dataset.key}, data=metadata) else: + self.dataset.update_status(f"Importing child dataset {original_key}.") # supernumerary datasets - handle on their own # these include any children of imported datasets try: @@ -260,7 +334,7 @@ def create_dataset(self, metadata, original_key, primary=False): # update some attributes that should come from the new server, not # the old - new_dataset.creator = dataset_owner + new_dataset.creator = self.dataset_owner new_dataset.original_timestamp = new_dataset.timestamp new_dataset.imported = True new_dataset.timestamp = int(time.time()) @@ -281,12 +355,9 @@ def process_urls(self): keys = SearchImportFromFourcat.get_keys_from_urls(urls) api_key = self.parameters.get("api-key") - self.created_datasets = set() # keys of created datasets - may not be successful! 
imported = [] # successfully imported datasets failed_imports = [] # keys that failed to import - self.remapped_keys = {} # changed dataset keys num_rows = 0 # will be used later - dataset_owner = self.dataset.get_owners()[0] # at this point it has 1 owner # we can add support for multiple datasets later by removing # this part! @@ -419,7 +490,7 @@ def halt_and_catch_fire(self): for deletable in deletables: DataSet(key=deletable, db=self.db).delete() - self.dataset.finish_with_error(f"Interrupted while importing datasets from {self.base}. Cannot resume - you " + self.dataset.finish_with_error(f"Interrupted while importing datasets{' from '+self.base if self.base else ''}. Cannot resume - you " f"will need to initiate the import again.") raise ProcessorInterruptedException() @@ -488,20 +559,31 @@ def validate_query(query, request, user): :return dict: Safe query parameters """ if query.get("method") == "zip": - file = request.files.get("data_upload") - if not file: - return QueryParametersException("No file uploaded.") - - if not file.filename.endswith(".zip"): - return QueryParametersException("Uploaded file must be a ZIP file.") + filename = "" + if "option-data_upload-entries" in request.form: + # First pass sends list of files in the zip + pass + elif "option-data_upload" in request.files: + # Second pass sends the actual file + file = request.files["option-data_upload"] + if not file: + raise QueryParametersException("No file uploaded.") + + if not file.filename.endswith(".zip"): + raise QueryParametersException("Uploaded file must be a ZIP file.") + + filename = file.filename + else: + raise QueryParametersException("No file was offered for upload.") return { - "data_upload": file + "method": "zip", + "filename": filename } elif query.get("method") == "url": urls = query.get("url") if not urls: - return QueryParametersException("Provide at least one dataset URL.") + raise QueryParametersException("Provide at least one dataset URL.") urls = urls.split(",") bases = set([url.split("/results/")[0].lower() for url in urls]) @@ -509,10 +591,10 @@ def validate_query(query, request, user): if len(keys) != 1: # todo: change this to < 1 if we allow multiple datasets - return QueryParametersException("You need to provide a single URL to a 4CAT dataset to import.") + raise QueryParametersException("You need to provide a single URL to a 4CAT dataset to import.") if len(bases) != 1: - return QueryParametersException("All URLs need to point to the same 4CAT server. You can only import from " + raise QueryParametersException("All URLs need to point to the same 4CAT server. You can only import from " "one 4CAT server at a time.") base = urls[0].split("/results/")[0] From 53f76dcba32d6ea1573ea146dc255a6a700a3035 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Fri, 6 Sep 2024 14:49:07 +0200 Subject: [PATCH 04/12] ensure cleanup on failure had some weird lost datasets when debugging this --- datasources/fourcat_import/import_4cat.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/datasources/fourcat_import/import_4cat.py b/datasources/fourcat_import/import_4cat.py index a949599e7..dc9868d34 100644 --- a/datasources/fourcat_import/import_4cat.py +++ b/datasources/fourcat_import/import_4cat.py @@ -84,10 +84,22 @@ def process(self): self.created_datasets = set() # keys of created datasets - may not be successful! 
self.remapped_keys = {} # changed dataset keys self.dataset_owner = self.dataset.get_owners()[0] # at this point it has 1 owner - if self.parameters.get("method") == "zip": - self.process_zip() - else: - self.process_urls() + try: + if self.parameters.get("method") == "zip": + self.process_zip() + else: + self.process_urls() + except Exception as e: + # Catch all exceptions and finish the job with an error + # Resuming is impossible because this dataset was overwritten with the importing dataset + # halt_and_catch_fire() will clean up and delete the datasets that were created + self.interrupted = True + try: + self.halt_and_catch_fire() + except ProcessorInterruptedException: + pass + # Reraise the original exception for logging + raise e def after_create(query, dataset, request): """ From f944c71e08497a3d06d68c5dac8b4667e915de3f Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Fri, 6 Sep 2024 15:59:44 +0200 Subject: [PATCH 05/12] auto-expire export zips --- processors/conversion/export_datasets.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/processors/conversion/export_datasets.py b/processors/conversion/export_datasets.py index f45cf92f7..bd7b81289 100644 --- a/processors/conversion/export_datasets.py +++ b/processors/conversion/export_datasets.py @@ -3,10 +3,11 @@ """ import shutil import json +import datetime from backend.lib.processor import BasicProcessor from common.lib.dataset import DataSet -from common.lib.exceptions import ProcessorException, DataSetException +from common.lib.exceptions import DataSetException __author__ = "Dale Wahl" __credits__ = ["Dale Wahl"] @@ -22,7 +23,7 @@ class ExportDatasets(BasicProcessor): type = "export-datasets" # job type ID category = "Conversion" # category title = "Export Dataset and All Analyses" # title displayed in UI - description = "Creates a ZIP file containing the dataset and all analyses to be archived and uploaded to a 4CAT instance in the future" # description displayed in UI + description = "Creates a ZIP file containing the dataset and all analyses to be archived and uploaded to a 4CAT instance in the future. Automatically expires after 1 day, after which you must run again." # description displayed in UI extension = "zip" # extension of result file, used internally and in UI @classmethod @@ -96,5 +97,10 @@ def process(self): self.dataset.log(f"Failed to export datasets: {failed_exports}") shutil.copy(self.dataset.get_log_path(), results_path.joinpath("export.log")) + # set expiration date + # these datasets can be very large and are just copies of the existing datasets, so we don't need to keep them around for long + # TODO: convince people to stop using hyphens in python variables and file names... + self.dataset.__setattr__("expires-after", (datetime.datetime.now() + datetime.timedelta(days=1)).timestamp()) + # done! 
self.write_archive_and_finish(results_path, len(exported_datasets)) \ No newline at end of file From abd9b112240b49c3c8fd660a33c83d5856a9ac52 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Fri, 6 Sep 2024 16:28:21 +0200 Subject: [PATCH 06/12] nltk again --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fd06a1e55..e62f292ba 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ "lxml~=4.9.0", "markdown==3.0.1", "markdown2==2.4.2", - "nltk==3.9", + "nltk==3.9.1", "networkx~=2.8.0", "numpy>=1.19.2", "opencv-python>=4.6.0.66", From eabb1f5acf8ca5ad1e4a97fe72748710b9a6a0f9 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 19 Sep 2024 14:04:56 +0200 Subject: [PATCH 07/12] Squashed commit of the following: commit 3f2a62a124926cfeb840796f104a702878ac10e5 Author: Carsten Schnober Date: Wed Sep 18 18:18:29 2024 +0200 Update Gensim to >=4.3.3, <4.4.0 (#450) * Update Gensim to >=4.3.3, <4.4.0 * update nltk as well --------- Co-authored-by: Dale Wahl Co-authored-by: Sal Hagen commit fee2c8c08617094f28496963da282d2e2dddeab7 Merge: 3d94b666 f8e93eda Author: sal-phd-desktop Date: Wed Sep 18 18:11:19 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 3d94b666cedd0de4e0bee953cbf1d787fdc38854 Author: sal-phd-desktop Date: Wed Sep 18 18:11:04 2024 +0200 FINALLY remove 'News' from the front page, replace with 4CAT BlueSky updates and potential information about the specific server (to be set on config page) commit f8e93edabe9013a2c1229caa4c454fab09620125 Author: Stijn Peeters Date: Wed Sep 18 15:11:21 2024 +0200 Simple extensions page in Control Panel commit b5be128c7b8682fb233d962326d9118a61053165 Author: Stijn Peeters Date: Wed Sep 18 14:08:13 2024 +0200 Remove 'docs' directory commit 1e2010af44817016c274c9ec9f7f9971deb57f66 Author: Stijn Peeters Date: Wed Sep 18 14:07:38 2024 +0200 Forgot TikTok and Douyin commit c757dd51884e7ec9cf62ca1726feacab4b2283b7 Author: Stijn Peeters Date: Wed Sep 18 14:01:31 2024 +0200 Say 'zeeschuimer' instead of 'extension' to avoid confusion with 4CAT extensions commit ee7f4345478f923541536c86a5b06246deae03f6 Author: Stijn Peeters Date: Wed Sep 18 14:00:40 2024 +0200 RIP Parler data source commit 11300f2430b51887823b280405de4ded4f15ede1 Author: Stijn Peeters Date: Wed Sep 18 11:21:37 2024 +0200 Tuplestring commit 547265240eba81ca0ad270cd3c536a2b1dcf512d Author: Stijn Peeters Date: Wed Sep 18 11:15:29 2024 +0200 Pass user obj instead of str to ConfigWrapper in Processor commit b21866d7900b5d20ed6ce61ee9aff50f3c0df910 Author: Stijn Peeters Date: Tue Sep 17 17:45:01 2024 +0200 Ensure request-aware config reader in user object when using config wrapper commit bbe79e4b0fe870ccc36cab7bfe7963b28d1948e3 Author: Sal Hagen Date: Tue Sep 17 15:12:46 2024 +0200 Fix extension path walk for Windows commit d6064beaf31a6a85b0e34ed4f8126eb4c4fc07e3 Author: Stijn Peeters Date: Mon Sep 16 14:50:45 2024 +0200 Allow tags that have no users Use case: tag-based frontend differentiation using X-4CAT-Config-Via-Proxy commit b542ded6f976809ec88445e7b04f2c81b900188e Author: Stijn Peeters Date: Mon Sep 16 14:13:14 2024 +0200 Trailing slash in query results list commit a4bddae575b22a009925206a1337bdd89349e567 Author: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> Date: Mon Sep 16 13:57:23 2024 +0200 4CAT Extension - easy(ier) adding of new datasources/processors that can be mainted seperately from 4CAT base code (#451) * domain only * fix reference * try and collect links with selenium * update 
column_filter to find multiple matches * fix up the normal url_scraper datasource * ensure all selenium links are strings for join * change output of url_scraper to ndjson with map_items * missed key/index change * update web archive to use json and map to 4CAT * fix no text found * and none on scraped_links * check key first * fix up web_archive error reporting * handle None type for error * record web archive "bad request" * add wait after redirect movement * increase waittime for redirects * add processor for trackers * dict to list for addition * allow both newline and comma seperated links * attempt to scrape iframes as seperate pages * Fixes for selenium scraper to work with config database * installation of packages, geckodriver, and firefox if selenium enabled * update install instructions * fix merge error * fix dropped function * have to be kidding me * add note; setup requires docker... need to think about IF this will ever be installed without Docker * seperate selenium class into wrapper and Search class so wrapper can be used in processors! * add screenshots; add firefox extension support * update selenium definitions * regex for extracting urls from strings * screenshots processor; extract urls from text and takes screenshots * Allow producing zip files from data sources * import time * pick better default * test screenshot datasource * validate all params * fix enable extension * haha break out of while loop * count my items * whoops, len() is important here * must be getting tired... * remove redundant logging * Eager loading for screenshots, viewport options, etc * Woops, wrong folder * Fix label shortening * Just 'queue' instead of 'search queue' * Yeah, make it headless * README -> DESCRIPTION * h1 -> h2 * Actually just have no header * Use proper filename for downloaded files * Configure whether to offer pseudonymisation etc * Tweak descriptions * fix log missing data * add columns to post_topic_matrix * fix breadcrumb bug * Add top topics column * Fix selenium config install parameter (Docker uses this/manual would need to run install_selenium, well, manually) * this processor is slow; i thought it was broken long before it updated! 
* refactor detect_trackers as conversion processor not filter * add geckodriver executable to docker install * Auto-configure webdrivers if available in PATH * update screenshots to act as image-downloader and benefit from processors * fix is_compatible_with * Delete helper-scripts/migrate/migrate-1.30-1.31.py * fix embeddings is_compatible_with * fix up UI options for hashing and private * abstract was moved to lib * various fixes to selenium based datasources * processors not compatible with image datasets * update firefox extension handling * screenshots datasource fix get_options * rename screenshots processor to be detected as image dataset * add monthly and weekly frequencies to wayback machine datasource * wayback ds: fix fail if all attempts do not realize results; addion frequency options to options; add daily * add scroll down page to allow lazy loading for entire page screenshots * screenshots: adjust pause time so it can be used to force a wait for images to load I have not successfully come up with or found a way to wait for all images to load; document.readyState == 'complete' does not function in this way on certain sites including the wayback machine * hash URLs to create filenames * remove log * add setting to toggle display advanced options * add progress bars * web archive fix query validation * count subpages in progress * remove overwritten function * move http response to own column * special filenames * add timestamps to all screenshots * restart selenium on failure * new build have selenium * process urls after start (keep original query parameters) * undo default firefox * quick max * rename SeleniumScraper to SeleniumSearch todo: build SeleniumProcessor! * max number screenshots configurable * method to get url with error handling * use get_with_error_handling * d'oh, screenshot processor needs to quit selenium * update log to contain URL * Update scrolling to use Page down key if necessary * improve logs * update image_category_wall as screenshot datasource does not have category column; this is not ideal and ought to be solved in another way. Also, could I get categories from the metadata? That's... ugh. * no category, no processor * str errors * screenshots: dismiss alerts when checking ready state is complete * set screenshot timeout to 30 seconds * update gensim package * screenshots: move processor interrupt into attempts loop * if alert disappears before we can dismiss it... * selenium specific logger * do not switch window when no alert found on dismiss * extract wait for page to load to selenium class * improve descriptions of screenshot options * remove unused line * treat timeouts differently from other errors these are more likely due to an issue with the website in question * debug if requested * increase pause time * restart browser w/ PID * increase max_workers for selenium this is by individual worker class not for all selenium classes... 
so you can really crank them out if desired * quick fix restart by pid * avoid bad urls * missing bracket & attempt to fix-missing dependencies in Docker install * Allow dynamic form options in processors * Allow 'requires' on data source options as well * Handle list values with requires * basic processor for apple store; setup checks for additional requirements * fix is_4cat_class * show preview when no map_item * add google store datasource * Docker setup.py use extensions * Wider support for file upload in processors * Log file uploads in DMI service manager * add map_item methods and record more data per item need additional item data as map_item is staticmethod * update from master; merge conflicts * fix docker build context (ignore data files) * fix option requirements * apple store fix: list still tries to get query * apple & google stores fix up item mapping * missed merge error * minor fix * remove unused import * fix datasources w/ files frontend error * fix error w/ datasources having file option * better way to name docker volumes * update two other docker compose files * fix docker-compose ymls * minor bug: fix and add warning; fix no results fail * update apple field names to better match interface * update google store fieldnames and order * sneak in jinja logger if needed * fix fourcat.js handling checkboxes for dynamic settings * add new endpoint for app details to apple store * apple_store map new beta app data * add default lang/country * not all apps have advisories * revert so button works * add chart positions to beta map items * basic scheduler To-do - fix up and add options to scheduler view (e.g. delete/change) - add scheduler view to navigator - tie jobs to datasets? (either in scheduler view or, perhaps, filter dataset view) - more testing... * update scheduler view, add functions to update job interval * revert .env * working scheduler! * basic scheduler view w/ datasets * fix postgres tag * update job status in scheduled_jobs table * fix timestamp; end_date needed for last run check; add dataset label * improve scheduler view * remove dataset from scheduled_jobs table on delete * scheduler view order by last creation * scheduler views: separate scheduler list from scheduled dataset list * additional update from master fixes * apple_store map_items fix missing locales * add back depth for pagination * correct route * modify pagination to accept args * pagination fun * pagination: i hate testing on live servers... * ok ok need the pagination route * pagination: add route_args * fix up scheduler header * improve app store descriptions * add azure store * fix azure links * azure_store: add category search * azure fix type of config update timestamp OPTION_DATE does not appear correctly in settings and causes it to be written incorrectly * basic aws store * check if selenium available; get correct app_id * aws: implement pagination * add logging; wait for elements to load after next page; attempts to rework filter option collection * apple_store: handle invalid param error * fix filter_options * aws: fix filter option collection! 
* more merge * move new datasources and processors to extensions and modify setup.py and module loader to use the new locations * migrate.py to run extension "fourcat_install.py" files * formatting * remove extensions; add gitignore * excise scheduler merge * some additional cleanup from app_studies branch * allow nested datasources folders; ignore files in extensions main folder * allow extension install scripts to run pip if migrate.py has not * Remove unused URL functions we could use ural for * Take care of git commit hash tracking for extension processors * Get rid of unused path.versionfile config setting * Add extensions README * Squashed commit of the following: commit cd356f7a69d15e8ecc8efffc6d63a16368e62962 Author: Stijn Peeters Date: Sat Sep 14 17:36:18 2024 +0200 UI setting for 4CAT install ad in login commit 0945d8c0a11803a6bb411f15099d50fea25f10ab Author: Stijn Peeters Date: Sat Sep 14 17:32:55 2024 +0200 UI setting for anonymisation controls Todo: make per-datasource commit 1a2562c2f9a368dbe0fc03264fb387e44313213b Author: Stijn Peeters Date: Sat Sep 14 15:53:27 2024 +0200 Debug panel for HTTP headers in control panel commit 203314ec83fb631d985926a0b5c5c440cfaba9aa Author: Stijn Peeters Date: Sat Sep 14 15:53:17 2024 +0200 Preview for HTML datasets commit 48c20c2ebac382bd41b92da4481ff7d832dc1538 Author: Desktop Sal Date: Wed Sep 11 13:54:23 2024 +0200 Remove spacy processors (linguistic extractor, get nouns, get entities) and remove dependencies commit 657ffd75a7f48ba4537449127e5fa39debf4fdf3 Author: Dale Wahl Date: Fri Sep 6 16:29:19 2024 +0200 fix nltk where it matters commit 2ef5c80f2d1a5b5f893c8977d8394740de6d796d Author: Stijn Peeters Date: Tue Sep 3 12:05:14 2024 +0200 Actually check progress in text annotator commit 693960f41b73e39eda0c2f23eb361c18bde632cd Author: Stijn Peeters Date: Mon Sep 2 18:03:18 2024 +0200 Add processor for stormtrooper DMI service commit 6ae964aad492527bc5d016a00f870145aab6e1af Author: Stijn Peeters Date: Fri Aug 30 17:31:37 2024 +0200 Fix reference to old stopwords list in neologisms preset * Fix Github links for extensions * Fix commit detection in extensions * Fix extension detection in module loader * Follow symlinks when loading extensions Probably not uncommon to have a checked out repo somewhere to then symlink into the extensions dir * Make queue message on create page more generic * Markdown in datasource option tooltips * Remove Spacy model from requirements * Add software_source to database SQL --------- Co-authored-by: Stijn Peeters Co-authored-by: Stijn Peeters <42036349+stijn-uva@users.noreply.github.com> commit cd356f7a69d15e8ecc8efffc6d63a16368e62962 Author: Stijn Peeters Date: Sat Sep 14 17:36:18 2024 +0200 UI setting for 4CAT install ad in login commit 0945d8c0a11803a6bb411f15099d50fea25f10ab Author: Stijn Peeters Date: Sat Sep 14 17:32:55 2024 +0200 UI setting for anonymisation controls Todo: make per-datasource commit 1a2562c2f9a368dbe0fc03264fb387e44313213b Author: Stijn Peeters Date: Sat Sep 14 15:53:27 2024 +0200 Debug panel for HTTP headers in control panel commit 203314ec83fb631d985926a0b5c5c440cfaba9aa Author: Stijn Peeters Date: Sat Sep 14 15:53:17 2024 +0200 Preview for HTML datasets commit 48c20c2ebac382bd41b92da4481ff7d832dc1538 Author: Desktop Sal Date: Wed Sep 11 13:54:23 2024 +0200 Remove spacy processors (linguistic extractor, get nouns, get entities) and remove dependencies commit 657ffd75a7f48ba4537449127e5fa39debf4fdf3 Author: Dale Wahl Date: Fri Sep 6 16:29:19 2024 +0200 fix nltk where it matters --- 
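One API change buried in the squashed diff below is worth calling out here: `write_archive_and_finish()` in `backend/lib/processor.py` gains a `finish` keyword, so a processor can write its ZIP without immediately closing the dataset. A hedged sketch of the intended call pattern follows; the processor class and its attributes are invented for illustration, and only the helper names taken from this patch series are real:

```python
# Illustrative subclass only - shows the new finish=False path.
from backend.lib.processor import BasicProcessor


class ExampleArchiver(BasicProcessor):
    type = "example-archiver"  # hypothetical job type
    extension = "zip"

    def process(self):
        staging_area = self.dataset.get_staging_area()

        # ... write files into staging_area here ...

        # finish=False writes the archive but leaves the dataset open, so the
        # processor can keep logging or adjusting metadata before closing it.
        self.write_archive_and_finish(staging_area, finish=False)

        self.dataset.update_status("Archive written, wrapping up")
        self.dataset.finish(1)
```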
.dockerignore | 1 + .env | 3 +- .zenodo.json | 2 +- VERSION | 2 +- backend/database.sql | 1 + backend/lib/processor.py | 11 +- backend/lib/search.py | 31 ++- backend/lib/worker.py | 11 + common/config_manager.py | 13 +- common/lib/config_definition.py | 50 +++-- common/lib/dataset.py | 23 +- common/lib/helpers.py | 181 +++++++++++++--- common/lib/logger.py | 4 +- common/lib/module_loader.py | 54 +++-- common/lib/user.py | 109 ++++++---- datasources/douyin/search_douyin.py | 2 +- datasources/gab/search_gab.py | 2 +- datasources/imgur/search_imgur.py | 2 +- datasources/instagram/search_instagram.py | 2 +- datasources/linkedin/search_linkedin.py | 2 +- datasources/ninegag/search_9gag.py | 2 +- datasources/parler/DESCRIPTION.md | 11 - datasources/parler/__init__.py | 12 -- datasources/parler/search_parler.py | 66 ------ datasources/tiktok/search_tiktok.py | 2 +- .../tiktok_comments/search_tiktok_comments.py | 2 +- datasources/truth/search_truth.py | 2 +- datasources/twitter-import/search_twitter.py | 2 +- docker-compose_build.yml | 8 - docker/Dockerfile | 1 + docs/conf.py | 62 ------ docs/datasource.rst | 73 ------- docs/index.rst | 20 -- docs/introduction.rst | 5 - docs/processor.rst | 63 ------ docs/requirements.txt | 1 - docs/worker.rst | 14 -- extensions/.gitignore | 5 + extensions/README.md | 39 ++++ helper-scripts/migrate.py | 47 ++++- helper-scripts/migrate/migrate-1.45-1.46.py | 33 +++ processors/filtering/column_filter.py | 6 +- processors/metrics/rank_attribute.py | 11 +- processors/networks/wikipedia_network.py | 7 +- processors/presets/neologisms.py | 21 +- processors/text-analysis/get_entities.py | 172 --------------- processors/text-analysis/get_nouns.py | 196 ------------------ .../text-analysis/linguistic_extractor.py | 168 --------------- processors/text-analysis/split_sentences.py | 5 +- processors/text-analysis/tokenise.py | 5 +- processors/visualisation/download_videos.py | 6 +- .../visualisation/image_category_wall.py | 10 +- processors/visualisation/word-trees.py | 17 +- setup.py | 36 ++-- webtool/__init__.py | 3 +- webtool/lib/helpers.py | 3 +- webtool/lib/template_filters.py | 19 +- webtool/static/js/fourcat.js | 10 +- webtool/templates/account/login.html | 2 + .../components/datasource-option.html | 2 +- webtool/templates/components/pagination.html | 6 +- .../templates/components/result-child.html | 2 +- .../templates/components/result-details.html | 4 +- .../components/result-result-row.html | 7 +- .../controlpanel/extensions-list.html | 55 +++++ webtool/templates/controlpanel/layout.html | 4 +- webtool/templates/controlpanel/logs.html | 7 + webtool/templates/create-dataset.html | 6 +- webtool/templates/data-overview.html | 2 +- webtool/templates/frontpage.html | 27 +-- webtool/templates/preview/csv.html | 2 +- webtool/templates/preview/html.html | 1 + webtool/views/api_tool.py | 16 +- webtool/views/views_admin.py | 3 +- webtool/views/views_dataset.py | 5 + webtool/views/views_extensions.py | 28 +++ webtool/views/views_misc.py | 4 +- 77 files changed, 731 insertions(+), 1123 deletions(-) delete mode 100644 datasources/parler/DESCRIPTION.md delete mode 100644 datasources/parler/__init__.py delete mode 100644 datasources/parler/search_parler.py delete mode 100644 docs/conf.py delete mode 100644 docs/datasource.rst delete mode 100644 docs/index.rst delete mode 100644 docs/introduction.rst delete mode 100644 docs/processor.rst delete mode 100644 docs/requirements.txt delete mode 100644 docs/worker.rst create mode 100644 extensions/.gitignore create mode 100644 
extensions/README.md create mode 100644 helper-scripts/migrate/migrate-1.45-1.46.py delete mode 100644 processors/text-analysis/get_entities.py delete mode 100644 processors/text-analysis/get_nouns.py delete mode 100644 processors/text-analysis/linguistic_extractor.py create mode 100644 webtool/templates/controlpanel/extensions-list.html create mode 100644 webtool/templates/preview/html.html create mode 100644 webtool/views/views_extensions.py diff --git a/.dockerignore b/.dockerignore index 5d1d149e0..558da504b 100644 --- a/.dockerignore +++ b/.dockerignore @@ -2,3 +2,4 @@ data/ .github/ .ipynb_checkpoints/ .gitignore +.idea/ diff --git a/.env b/.env index 69a217df0..d03f9c703 100644 --- a/.env +++ b/.env @@ -30,7 +30,7 @@ TELEGRAM_PORT=443 # Docker Volume Names DOCKER_DB_VOL=4cat_4cat_db DOCKER_DATA_VOL=4cat_4cat_data -DOCKER_CONFIG_VOL=4cat_4cat_share +DOCKER_CONFIG_VOL=4cat_4cat_config DOCKER_LOGS_VOL=4cat_4cat_logs # Gunicorn settings @@ -39,4 +39,3 @@ workers=4 threads=4 worker_class=gthread log_level=debug - diff --git a/.zenodo.json b/.zenodo.json index 3ab05ca45..fd261019f 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -3,7 +3,7 @@ "license": "MPL-2.0", "title": "4CAT Capture and Analysis Toolkit", "upload_type": "software", - "version": "v1.45", + "version": "v1.46", "keywords": [ "webmining", "scraping", diff --git a/VERSION b/VERSION index 6245ec1a2..fa2cb2583 100644 --- a/VERSION +++ b/VERSION @@ -1,4 +1,4 @@ -1.45 +1.46 This file should not be modified. It is used by 4CAT to determine whether it needs to run migration scripts to e.g. update the database structure to a more diff --git a/backend/database.sql b/backend/database.sql index 33f0ea393..1f372a697 100644 --- a/backend/database.sql +++ b/backend/database.sql @@ -56,6 +56,7 @@ CREATE TABLE IF NOT EXISTS datasets ( is_private boolean DEFAULT TRUE, software_version text, software_file text DEFAULT '', + software_source text DEFAULT '', annotation_fields text DEFAULT '' ); diff --git a/backend/lib/processor.py b/backend/lib/processor.py index c67fa7a9d..339b112b4 100644 --- a/backend/lib/processor.py +++ b/backend/lib/processor.py @@ -20,6 +20,7 @@ from common.lib.exceptions import (WorkerInterruptedException, ProcessorInterruptedException, ProcessorException, DataSetException, MapItemException) from common.config_manager import config, ConfigWrapper +from common.lib.user import User csv.field_size_limit(1024 * 1024 * 1024) @@ -112,7 +113,7 @@ def work(self): # creator. This ensures that if a value has been overriden for the owner, # the overridden value is used instead. 
config.with_db(self.db) - self.config = ConfigWrapper(config=config, user=self.owner) + self.config = ConfigWrapper(config=config, user=User.get_by_name(self.db, self.owner)) if self.dataset.data.get("key_parent", None): # search workers never have parents (for now), so we don't need to @@ -164,7 +165,7 @@ def work(self): # start log file self.dataset.update_status("Processing data") - self.dataset.update_version(get_software_commit()) + self.dataset.update_version(get_software_commit(self)) # get parameters # if possible, fill defaults where parameters are not provided @@ -628,7 +629,7 @@ def write_csv_items_and_finish(self, data): self.dataset.update_status("Finished") self.dataset.finish(len(data)) - def write_archive_and_finish(self, files, num_items=None, compression=zipfile.ZIP_STORED): + def write_archive_and_finish(self, files, num_items=None, compression=zipfile.ZIP_STORED, finish=True): """ Archive a bunch of files into a zip archive and finish processing @@ -639,6 +640,7 @@ def write_archive_and_finish(self, files, num_items=None, compression=zipfile.ZI files added to the archive will be used. :param int compression: Type of compression to use. By default, files are not compressed, to speed up unarchiving. + :param bool finish: Finish the dataset/job afterwards or not? """ is_folder = False if issubclass(type(files), PurePath): @@ -665,7 +667,8 @@ def write_archive_and_finish(self, files, num_items=None, compression=zipfile.ZI if num_items is None: num_items = done - self.dataset.finish(num_items) + if finish: + self.dataset.finish(num_items) def create_standalone(self): """ diff --git a/backend/lib/search.py b/backend/lib/search.py index cdcd08115..15b3982d6 100644 --- a/backend/lib/search.py +++ b/backend/lib/search.py @@ -1,16 +1,16 @@ import hashlib +import zipfile import secrets -import shutil import random import json import math import csv +import os from pathlib import Path from abc import ABC, abstractmethod from common.config_manager import config -from common.lib.dataset import DataSet from backend.lib.processor import BasicProcessor from common.lib.helpers import strip_tags, dict_search_and_update, remove_nuls, HashCache from common.lib.exceptions import WorkerInterruptedException, ProcessorInterruptedException, MapItemException @@ -71,7 +71,6 @@ def process(self): items = self.import_from_file(query_parameters.get("file")) else: items = self.search(query_parameters) - except WorkerInterruptedException: raise ProcessorInterruptedException("Interrupted while collecting data, trying again later.") @@ -79,10 +78,12 @@ def process(self): num_items = 0 if items: self.dataset.update_status("Writing collected data to dataset file") - if results_file.suffix == ".ndjson": - num_items = self.items_to_ndjson(items, results_file) - elif results_file.suffix == ".csv": + if self.extension == "csv": num_items = self.items_to_csv(items, results_file) + elif self.extension == "ndjson": + num_items = self.items_to_ndjson(items, results_file) + elif self.extension == "zip": + num_items = self.items_to_archive(items, results_file) else: raise NotImplementedError("Datasource query cannot be saved as %s file" % results_file.suffix) @@ -361,6 +362,22 @@ def items_to_ndjson(self, items, filepath): return processed + def items_to_archive(self, items, filepath): + """ + Save retrieved items as an archive + + Assumes that items is an iterable with one item, a Path object + referring to a folder containing files to be archived. The folder will + be removed afterwards. 
+ + :param items: + :param filepath: Where to store the archive + :return int: Number of items + """ + num_items = len(os.listdir(items)) + self.write_archive_and_finish(items, None, zipfile.ZIP_STORED, False) + return num_items + class SearchWithScope(Search, ABC): """ @@ -404,7 +421,7 @@ def search(self, query): # proportion of items matches # first, get amount of items for all threads in which matching # items occur and that are long enough - thread_ids = tuple([post["thread_id"] for post in items]) + thread_ids = tuple([item["thread_id"] for item in items]) self.dataset.update_status("Retrieving thread metadata for %i threads" % len(thread_ids)) try: min_length = int(query.get("scope_length", 30)) diff --git a/backend/lib/worker.py b/backend/lib/worker.py index 3fe19e067..a5695e673 100644 --- a/backend/lib/worker.py +++ b/backend/lib/worker.py @@ -133,6 +133,17 @@ def run(self): location = "->".join(frames) self.log.error("Worker %s raised exception %s and will abort: %s at %s" % (self.type, e.__class__.__name__, str(e), location)) + # Clean up after work successfully completed or terminates + self.clean_up() + + def clean_up(self): + """ + Clean up after a processor runs successfully or results in error. + Workers should override this method to implement any procedures + to run to clean up a worker; by default this does nothing. + """ + pass + def abort(self): """ Called when the application shuts down diff --git a/common/config_manager.py b/common/config_manager.py index 40bce67a6..eb6c846d0 100644 --- a/common/config_manager.py +++ b/common/config_manager.py @@ -44,9 +44,9 @@ def with_db(self, db=None): # Replace w/ db if provided else only initialise if not already self.db = db if db else Database(logger=None, dbname=self.get("DB_NAME"), user=self.get("DB_USER"), password=self.get("DB_PASSWORD"), host=self.get("DB_HOST"), - port=self.get("DB_PORT"), appname="config-reader") if not db else db + port=self.get("DB_PORT"), appname="config-reader") else: - # self.db already initialized + # self.db already initialized and no db provided pass def load_user_settings(self): @@ -170,11 +170,6 @@ def ensure_database(self): known_tags = [t["tag"] for t in self.db.fetchall("SELECT DISTINCT tag FROM settings")] tag_order = self.get("flask.tag_order") - for tag in tag_order: - # don't include tags not used by users in the tag order - if tag not in user_tags: - tag_order.remove(tag) - for tag in known_tags: # add tags used by a setting to tag order if tag and tag not in tag_order: @@ -442,6 +437,10 @@ def __init__(self, config, user=None, tags=None, request=None): self.tags = tags self.request = request + # this ensures the user object in turn reads from the wrapper + if self.user: + self.user.with_config(self) + def set(self, *args, **kwargs): """ diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index c9601f78c..a4fca2dcd 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -56,6 +56,13 @@ "software, and a 'powered by 4CAT' notice may also show up in the web interface regardless of the " "value entered here." }, + "4cat.about_this_server": { + "type": UserInput.OPTION_TEXT_LARGE, + "default": "", + "help": "Server information", + "tooltip": "Custom server information that is displayed on the 'About' page. Can for instance be used to show " + "information about who maintains the tool or what its intended purpose is." 
+ }, "4cat.crash_message": { "type": UserInput.OPTION_TEXT_LARGE, "default": "This processor has crashed; the crash has been logged. 4CAT will try again when it is restarted. " @@ -140,7 +147,7 @@ "type": UserInput.OPTION_TOGGLE, "default": False, "help": "Can restart/upgrade", - "tooltip": "Controls whether users can restart and upgrade 4CAT via the Control Panel" + "tooltip": "Controls whether users can restart, upgrade, and manage extensions for 4CAT via the Control Panel" }, "privileges.can_upgrade_to_dev": { # this is NOT an admin privilege, because all admins automatically @@ -165,20 +172,10 @@ "help": "Can view worker status", "tooltip": "Controls whether users can view worker status via the Control Panel" }, - # The following two options should be set to ensure that every analysis step can + # The following option should be set to ensure that every analysis step can # be traced to a specific version of 4CAT. This allows for reproducible - # research. You can however leave them empty with no ill effect. The version ID - # should be a commit hash, which will be combined with the Github URL to offer - # links to the exact version of 4CAT code that produced an analysis result. - # If no version file is available, the output of "git show" in PATH_ROOT will be used - # to determine the version, if possible. - "path.versionfile": { - "type": UserInput.OPTION_TEXT, - "default": ".git-checked-out", - "help": "Version file", - "tooltip": "Path to file containing GitHub commit hash. File containing a commit ID (everything after the first whitespace found is ignored)", - "global": True - }, + # research. The output of "git show" in PATH_ROOT will be used to determine + # the version of a processor file, if possible. "4cat.github_url": { "type": UserInput.OPTION_TEXT, "default": "https://github.com/digitalmethodsinitiative/4cat", @@ -479,6 +476,19 @@ "default": False, "tooltip": "Show main dataset preview directly on dataset pages, instead of behind a 'preview' button" }, + "ui.offer_anonymisation": { + "type": UserInput.OPTION_TOGGLE, + "help": "Offer anonymisation options", + "default": True, + "tooltip": "Offer users the option to anonymise their datasets at the time of creation. It is strongly " + "recommended to leave this enabled." + }, + "ui.advertise_install": { + "type": UserInput.OPTION_TOGGLE, + "help": "Advertise local 4CAT", + "default": True, + "tooltip": "In the login form, remind users of the possibility to install their own 4CAT server." + }, "ui.show_datasource": { "type": UserInput.OPTION_TOGGLE, "help": "Show data source", @@ -503,6 +513,18 @@ "tooltip": "If a dataset is a JSON file but it can be mapped to a CSV file, show the CSV in the preview instead" "of the underlying JSON." }, + "ui.offer_hashing": { + "type": UserInput.OPTION_TOGGLE, + "default": True, + "help": "Offer pseudonymisation", + "tooltip": "Add a checkbox to the 'create dataset' form to allow users to toggle pseudonymisation." + }, + "ui.offer_private": { + "type": UserInput.OPTION_TOGGLE, + "default": True, + "help": "Offer create as private", + "tooltip": "Add a checkbox to the 'create dataset' form to allow users to make a dataset private."
+ }, "ui.option_email": { "type": UserInput.OPTION_CHOICE, "options": { diff --git a/common/lib/dataset.py b/common/lib/dataset.py index 56ea7d463..5a23afb7b 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -114,6 +114,9 @@ def __init__(self, parameters=None, key=None, job=None, data=None, db=None, pare self.parameters = json.loads(self.data["parameters"]) self.is_new = False else: + self.data = {"type": type} # get_own_processor needs this + own_processor = self.get_own_processor() + version = get_software_commit(own_processor) self.data = { "key": self.key, "query": self.get_label(parameters, default=type), @@ -125,7 +128,8 @@ def __init__(self, parameters=None, key=None, job=None, data=None, db=None, pare "timestamp": int(time.time()), "is_finished": False, "is_private": is_private, - "software_version": get_software_commit(), + "software_version": version[0], + "software_source": version[1], "software_file": "", "num_rows": 0, "progress": 0.0, @@ -139,7 +143,6 @@ def __init__(self, parameters=None, key=None, job=None, data=None, db=None, pare # Find desired extension from processor if not explicitly set if extension is None: - own_processor = self.get_own_processor() if own_processor: extension = own_processor.get_extension(parent_dataset=DataSet(key=parent, db=db) if parent else None) # Still no extension, default to 'csv' @@ -865,10 +868,12 @@ def get_label(self, parameters=None, default="Query"): elif parameters.get("subject_match") and parameters["subject_match"] != "empty": return parameters["subject_match"] elif parameters.get("query"): - label = parameters["query"] if len(parameters["query"]) < 30 else parameters["query"][:25] + "..." + label = parameters["query"] # Some legacy datasets have lists as query data if isinstance(label, list): label = ", ".join(label) + + label = label if len(label) < 30 else label[:25] + "..." 
label = label.strip().replace("\n", ", ") return label elif parameters.get("country_flag") and parameters["country_flag"] != "all": @@ -1116,7 +1121,8 @@ def update_version(self, version): processor_path = "" updated = self.db.update("datasets", where={"key": self.data["key"]}, data={ - "software_version": version, + "software_version": version[0], + "software_source": version[1], "software_file": processor_path }) @@ -1151,10 +1157,15 @@ def get_version_url(self, file): :param file: File to link within the repository :return: URL, or an empty string """ - if not self.data["software_version"] or not config.get("4cat.github_url"): + if not self.data["software_source"]: return "" - return config.get("4cat.github_url") + "/blob/" + self.data["software_version"] + self.data.get("software_file", "") + filepath = self.data.get("software_file", "") + if filepath.startswith("/extensions/"): + # go to root of extension + filepath = "/" + "/".join(filepath.split("/")[3:]) + + return self.data["software_source"] + "/blob/" + self.data["software_version"] + filepath def top_parent(self): """ diff --git a/common/lib/helpers.py b/common/lib/helpers.py index f6767c929..2911044f5 100644 --- a/common/lib/helpers.py +++ b/common/lib/helpers.py @@ -1,6 +1,7 @@ """ Miscellaneous helper functions for the 4CAT backend """ +import hashlib import subprocess import requests import datetime @@ -16,9 +17,10 @@ import os import io +from pathlib import Path from collections.abc import MutableMapping from html.parser import HTMLParser -from pathlib import Path +from urllib.parse import urlparse, urlunparse from calendar import monthrange from packaging import version @@ -40,7 +42,6 @@ def init_datasource(database, logger, queue, name): """ pass - def strip_tags(html, convert_newlines=True): """ Strip HTML from a string @@ -120,12 +121,9 @@ def get_git_branch(): return "" -def get_software_commit(): +def get_software_commit(worker=None): """ - Get current 4CAT commit hash - - Reads a given version file and returns the first string found in there - (up until the first space). On failure, return an empty string. + Get current 4CAT git commit hash Use `get_software_version()` instead if you need the release version number rather than the precise commit hash. @@ -134,34 +132,58 @@ def get_software_commit(): repository in the 4CAT root folder, and if so, what commit is currently checked out in it. - :return str: 4CAT git commit hash - """ - versionpath = config.get('PATH_ROOT').joinpath(config.get('path.versionfile')) + For extensions, get the repository information for that extension, or if + the extension is not a git repository, return empty data. - if versionpath.exists() and not versionpath.is_file(): - return "" + :param BasicWorker processor: Worker to get commit for. If not given, get + version information for the main 4CAT installation. 
- if not versionpath.exists(): - # try git command line within the 4CAT root folder - # if it is a checked-out git repository, it will tell us the hash of - # the currently checked-out commit - try: - cwd = os.getcwd() - os.chdir(config.get('PATH_ROOT')) - show = subprocess.run(["git", "show"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) - os.chdir(cwd) - if show.returncode != 0: - raise ValueError() - return show.stdout.decode("utf-8").split("\n")[0].split(" ")[1] - except (subprocess.SubprocessError, IndexError, TypeError, ValueError, FileNotFoundError): - return "" + :return tuple: 4CAT git commit hash, repository name + """ + # try git command line within the 4CAT root folder + # if it is a checked-out git repository, it will tell us the hash of + # the currently checked-out commit + cwd = os.getcwd() + # path has no Path.relative()... + relative_filepath = Path(re.sub(r"^[/\\]+", "", worker.filepath)).parent try: - with open(versionpath, "r", encoding="utf-8", errors="ignore") as versionfile: - version = versionfile.readline().split(" ")[0] - return version - except OSError: - return "" + # if extension, go to the extension file's path + # we will run git here - if it is not its own repository, we have no + # useful version info (since the extension is by definition not in the + # main 4CAT repository) and will return an empty value + if worker and worker.is_extension: + extension_dir = config.get("PATH_ROOT").joinpath(relative_filepath) + os.chdir(extension_dir) + # check if we are in the extensions' own repo or 4CAT's + repo_level = subprocess.run(["git", "rev-parse", "--show-toplevel"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + if Path(repo_level.stdout.decode("utf-8")) == config.get("PATH_ROOT"): + # not its own repository + return ("", "") + + else: + os.chdir(config.get("PATH_ROOT")) + + show = subprocess.run(["git", "show"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + if show.returncode != 0: + raise ValueError() + commit = show.stdout.decode("utf-8").split("\n")[0].split(" ")[1] + + # now get the repository the commit belongs to, if we can + origin = subprocess.run(["git", "config", "--get", "remote.origin.url"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + if origin.returncode != 0 or not origin.stdout: + raise ValueError() + repository = origin.stdout.decode("utf-8").strip() + if repository.endswith(".git"): + repository = repository[:-4] + + except (subprocess.SubprocessError, IndexError, TypeError, ValueError, FileNotFoundError) as e: + return ("", "") + + finally: + os.chdir(cwd) + + return (commit, repository) def get_software_version(): """ @@ -174,7 +196,7 @@ def get_software_version(): :return str: Software version, for example `1.37`. """ - current_version_file = Path(config.get("PATH_ROOT"), "config/.current-version") + current_version_file = config.get("PATH_ROOT").joinpath("config/.current-version") if not current_version_file.exists(): return "" @@ -228,6 +250,70 @@ def get_ffmpeg_version(ffmpeg_path): return version.parse(ffmpeg_version) +def find_extensions(): + """ + Find 4CAT extensions and load their metadata + + Looks for subfolders of the extension folder, and loads additional metadata + where available. 
+ + :return tuple: A tuple with two items; the extensions, as an ID -> metadata + dictionary, and a list of (str) errors encountered while loading + """ + extension_path = config.get("PATH_ROOT").joinpath("extensions") + errors = [] + if not extension_path.exists() or not extension_path.is_dir(): + return [], None + + # each folder in the extensions folder is an extension + extensions = { + extension.name: { + "name": extension.name, + "version": "", + "url": "", + "git_url": "", + "is_git": False + } for extension in sorted(os.scandir(extension_path), key=lambda x: x.name) if extension.is_dir() + } + + # collect metadata for extensions + allowed_metadata_keys = ("name", "version", "url") + cwd = os.getcwd() + for extension in extensions: + extension_folder = extension_path.joinpath(extension) + metadata_file = extension_folder.joinpath("metadata.json") + if metadata_file.exists(): + with metadata_file.open() as infile: + try: + metadata = json.load(infile) + extensions[extension].update({k: metadata[k] for k in metadata if k in allowed_metadata_keys}) + except (TypeError, ValueError) as e: + errors.append(f"Error reading metadata file for extension '{extension}' ({e})") + continue + + extensions[extension]["is_git"] = extension_folder.joinpath(".git/HEAD").exists() + if extensions[extension]["is_git"]: + # try to get remote URL + try: + os.chdir(extension_folder) + origin = subprocess.run(["git", "config", "--get", "remote.origin.url"], stderr=subprocess.PIPE, + stdout=subprocess.PIPE) + if origin.returncode != 0 or not origin.stdout: + raise ValueError() + repository = origin.stdout.decode("utf-8").strip() + if repository.endswith(".git") and "github.com" in repository: + # use repo URL + repository = repository[:-4] + extensions[extension]["git_url"] = repository + except (subprocess.SubprocessError, IndexError, TypeError, ValueError, FileNotFoundError) as e: + print(e) + pass + finally: + os.chdir(cwd) + + return extensions, errors + + def convert_to_int(value, default=0): """ Convert a value to an integer, with a fallback @@ -887,6 +973,37 @@ def _sets_to_lists_gen(d): return dict(_sets_to_lists_gen(d)) + +def url_to_hash(url, remove_scheme=True, remove_www=True): + """ + Convert a URL to a filename; some URLs are too long to be used as filenames, this keeps the domain and hashes the + rest of the URL. 
+ """ + parsed_url = urlparse(url.lower()) + if parsed_url: + if remove_scheme: + parsed_url = parsed_url._replace(scheme="") + if remove_www: + netloc = re.sub(r"^www\.", "", parsed_url.netloc) + parsed_url = parsed_url._replace(netloc=netloc) + + url = re.sub(r"[^0-9a-z]+", "_", urlunparse(parsed_url).strip("/")) + else: + # Unable to parse URL; use regex + if remove_scheme: + url = re.sub(r"^https?://", "", url) + if remove_www: + if not remove_scheme: + scheme = re.match(r"^https?://", url).group() + temp_url = re.sub(r"^https?://", "", url) + url = scheme + re.sub(r"^www\.", "", temp_url) + else: + url = re.sub(r"^www\.", "", url) + + url = re.sub(r"[^0-9a-z]+", "_", url.lower().strip("/")) + + return hashlib.blake2b(url.encode("utf-8"), digest_size=24).hexdigest() + def folder_size(path='.'): """ Get the size of a folder using os.scandir for efficiency diff --git a/common/lib/logger.py b/common/lib/logger.py index c1a015ca6..bbd30c444 100644 --- a/common/lib/logger.py +++ b/common/lib/logger.py @@ -163,7 +163,7 @@ class Logger: } alert_level = "FATAL" - def __init__(self, output=False, filename='4cat.log', log_level="INFO"): + def __init__(self, logger_name='4cat-backend', output=False, filename='4cat.log', log_level="INFO"): """ Set up log handler @@ -181,7 +181,7 @@ def __init__(self, output=False, filename='4cat.log', log_level="INFO"): self.log_path = log_folder.joinpath(filename) self.previous_report = time.time() - self.logger = logging.getLogger("4cat-backend") + self.logger = logging.getLogger(logger_name) self.logger.setLevel(log_level) # this handler manages the text log files diff --git a/common/lib/module_loader.py b/common/lib/module_loader.py index 84e5d951e..b555801ec 100644 --- a/common/lib/module_loader.py +++ b/common/lib/module_loader.py @@ -7,6 +7,7 @@ import pickle import sys import re +import os from common.config_manager import config @@ -69,14 +70,11 @@ def is_4cat_class(object, only_processors=False): """ Determine if a module member is a worker class we can use """ - # it would be super cool to just use issubclass() here! - # but that requires importing the classes themselves, which leads to - # circular imports if inspect.isclass(object): if object.__name__ in("BasicProcessor", "BasicWorker") or inspect.isabstract(object): # ignore abstract and base classes return False - + if hasattr(object, "is_4cat_class"): if only_processors: if hasattr(object, "is_4cat_processor"): @@ -85,7 +83,7 @@ def is_4cat_class(object, only_processors=False): return False else: return object.is_4cat_class() - + return False def load_modules(self): @@ -99,14 +97,19 @@ def load_modules(self): """ # look for workers and processors in pre-defined folders and datasources - paths = [Path(config.get('PATH_ROOT'), "processors"), Path(config.get('PATH_ROOT'), "backend", "workers"), - *[self.datasources[datasource]["path"] for datasource in self.datasources]] + extension_path = Path(config.get('PATH_ROOT'), "extensions") + + paths = [Path(config.get('PATH_ROOT'), "processors"), + Path(config.get('PATH_ROOT'), "backend", "workers"), + extension_path, + *[self.datasources[datasource]["path"] for datasource in self.datasources]] # extension datasources will be here and the above line... 
root_match = re.compile(r"^%s" % re.escape(str(config.get('PATH_ROOT')))) root_path = Path(config.get('PATH_ROOT')) for folder in paths: # loop through folders, and files in those folders, recursively + is_extension = extension_path in folder.parents or folder == extension_path for file in folder.rglob("*.py"): # determine module name for file # reduce path to be relative to 4CAT root @@ -147,6 +150,7 @@ def load_modules(self): self.workers[component[1].type] = component[1] self.workers[component[1].type].filepath = relative_path + self.workers[component[1].type].is_extension = is_extension # we can't use issubclass() because for that we would need # to import BasicProcessor, which would lead to a circular @@ -169,8 +173,7 @@ def load_modules(self): for missing_module, processor_list in self.missing_modules.items(): warning += "\t%s (for %s)\n" % (missing_module, ", ".join(processor_list)) - self.log_buffer = warning - + self.log_buffer += warning self.processors = categorised_processors @@ -183,19 +186,21 @@ def load_datasources(self): `DATASOURCE` constant. The latter is taken as the ID for this datasource. """ - for subdirectory in Path(config.get('PATH_ROOT'), "datasources").iterdir(): - # folder name, also the name used in config.py - folder_name = subdirectory.parts[-1] - - # determine module name - module_name = "datasources." + folder_name + def _load_datasource(subdirectory): + """ + Load a single datasource + """ + # determine module name (path relative to 4CAT root w/ periods) + module_name = ".".join(subdirectory.relative_to(Path(config.get("PATH_ROOT"))).parts) try: datasource = importlib.import_module(module_name) except ImportError as e: - continue + self.log_buffer += "Could not import %s: %s\n" % (module_name, e) + return if not hasattr(datasource, "init_datasource") or not hasattr(datasource, "DATASOURCE"): - continue + self.log_buffer += "Could not load datasource %s: missing init_datasource or DATASOURCE\n" % subdirectory + return datasource_id = datasource.DATASOURCE @@ -208,6 +213,19 @@ def load_datasources(self): "config": {} if not hasattr(datasource, "config") else datasource.config } + # Load 4CAT core datasources + for subdirectory in Path(config.get('PATH_ROOT'), "datasources").iterdir(): + if subdirectory.is_dir(): + _load_datasource(subdirectory) + + # Load extension datasources + # os.walk is used to allow for the possibility of multiple extensions, with nested "datasources" folders + for root, dirs, files in os.walk(Path(config.get('PATH_ROOT'), "extensions"), followlinks=True): + if "datasources" in dirs: + for subdirectory in Path(root, "datasources").iterdir(): + if subdirectory.is_dir(): + _load_datasource(subdirectory) + sorted_datasources = {datasource_id: self.datasources[datasource_id] for datasource_id in sorted(self.datasources, key=lambda id: self.datasources[id]["name"])} self.datasources = sorted_datasources @@ -225,7 +243,7 @@ def expand_datasources(self): self.datasources[datasource_id]["has_worker"] = bool(worker) self.datasources[datasource_id]["has_options"] = self.datasources[datasource_id]["has_worker"] and \ bool(self.workers["%s-search" % datasource_id].get_options()) - self.datasources[datasource_id]["importable"] = worker and hasattr(worker, "is_from_extension") and worker.is_from_extension + self.datasources[datasource_id]["importable"] = worker and hasattr(worker, "is_from_zeeschuimer") and worker.is_from_zeeschuimer def load_worker_class(self, worker): """ diff --git a/common/lib/user.py b/common/lib/user.py index 
2c9788869..2722d7574 100644 --- a/common/lib/user.py +++ b/common/lib/user.py @@ -14,7 +14,7 @@ from email.mime.text import MIMEText from common.lib.helpers import send_email from common.lib.exceptions import DataSetException -from common.config_manager import config +from common.config_manager import config as global_config class User: @@ -28,12 +28,13 @@ class User: is_authenticated = False is_active = False is_anonymous = True + config = None db = None name = "anonymous" @staticmethod - def get_by_login(db, name, password): + def get_by_login(db, name, password, config=None): """ Get user object, if login is correct @@ -43,6 +44,8 @@ def get_by_login(db, name, password): :param db: Database connection object :param name: User name :param password: User password + :param config: Configuration manager. Can be used for request-aware user objects using ConfigWrapper. Empty to + use a global configuration manager. :return: User object, or `None` if login was invalid """ user = db.fetchone("SELECT * FROM users WHERE name = %s", (name,)) @@ -54,30 +57,34 @@ def get_by_login(db, name, password): return None else: # valid login! - return User(db, user, authenticated=True) + return User(db, user, authenticated=True, config=config) @staticmethod - def get_by_name(db, name): + def get_by_name(db, name, config=None): """ Get user object for given user name :param db: Database connection object :param str name: Username to get object for + :param config: Configuration manager. Can be used for request-aware user objects using ConfigWrapper. Empty to + use a global configuration manager. :return: User object, or `None` for invalid user name """ user = db.fetchone("SELECT * FROM users WHERE name = %s", (name,)) if not user: return None else: - return User(db, user) + return User(db, user, config=config) @staticmethod - def get_by_token(db, token): + def get_by_token(db, token, config=None): """ Get user object for given token, if token is valid :param db: Database connection object :param str token: Token to get object for + :param config: Configuration manager. Can be used for request-aware user objects using ConfigWrapper. Empty to + use a global configuration manager. :return: User object, or `None` for invalid token """ user = db.fetchone( @@ -86,36 +93,9 @@ def get_by_token(db, token): if not user: return None else: - return User(db, user) + return User(db, user, config=config) - def can_access_dataset(self, dataset, role=None): - """ - Check if this user should be able to access a given dataset. - - This depends mostly on the dataset's owner, which should match the - user if the dataset is private. If the dataset is not private, or - if the user is an admin or the dataset is private but assigned to - an anonymous user, the dataset can be accessed. 
- - :param dataset: The dataset to check access to - :return bool: - """ - if not dataset.is_private: - return True - - elif self.is_admin: - return True - - elif dataset.is_accessible_by(self, role=role): - return True - - elif dataset.get_owners == ("anonymous",): - return True - - else: - return False - - def __init__(self, db, data, authenticated=False): + def __init__(self, db, data, authenticated=False, config=None): """ Instantiate user object @@ -127,6 +107,9 @@ def __init__(self, db, data, authenticated=False): """ self.db = db self.data = data + + self.config = config if config else global_config + try: self.userdata = json.loads(self.data["userdata"]) except (TypeError, json.JSONDecodeError): @@ -170,7 +153,7 @@ def get_name(self): if self.data["name"] == "anonymous": return "Anonymous" elif self.data["name"] == "autologin": - return config.get("flask.autologin.name") + return self.config.get("flask.autologin.name") else: return self.data["name"] @@ -184,6 +167,21 @@ def get_token(self): """ return self.generate_token(regenerate=False) + def with_config(self, config): + """ + Connect user to configuration manager + + By default, the user object reads from the global configuration + manager. For frontend operations it may be desireable to use a + request-aware configuration manager, but this is only available after + the user has been instantiated. This method can thus be used to connect + the user to that config manager later when it is available. + + :param config: Configuration manager object + :return: + """ + self.config = config + def clear_token(self): """ Reset password rest token @@ -195,6 +193,33 @@ def clear_token(self): """ self.db.update("users", data={"register_token": "", "timestamp_token": 0}, where={"name": self.get_id()}) + def can_access_dataset(self, dataset, role=None): + """ + Check if this user should be able to access a given dataset. + + This depends mostly on the dataset's owner, which should match the + user if the dataset is private. If the dataset is not private, or + if the user is an admin or the dataset is private but assigned to + an anonymous user, the dataset can be accessed. + + :param dataset: The dataset to check access to + :return bool: + """ + if not dataset.is_private: + return True + + elif self.is_admin: + return True + + elif dataset.is_accessible_by(self, role=role): + return True + + elif dataset.get_owners == ("anonymous",): + return True + + else: + return False + @property def is_special(self): """ @@ -246,7 +271,7 @@ def email_token(self, new=False): account? :return str: Link for the user to set their password with """ - if not config.get('mail.server'): + if not self.config.get('mail.server'): raise RuntimeError("No e-mail server configured. 4CAT cannot send any e-mails.") if self.is_special: @@ -258,14 +283,14 @@ def email_token(self, new=False): register_token = self.generate_token(regenerate=True) # prepare welcome e-mail - sender = config.get('mail.noreply') + sender = self.config.get('mail.noreply') message = MIMEMultipart("alternative") message["From"] = sender message["To"] = username # the actual e-mail... 
- url_base = config.get("flask.server_name") - protocol = "https" if config.get("flask.https") else "http" + url_base = self.config.get("flask.server_name") + protocol = "https" if self.config.get("flask.https") else "http" url = "%s://%s/reset-password/?token=%s" % (protocol, url_base, register_token) # we use slightly different e-mails depending on whether this is the first time setting a password @@ -408,7 +433,7 @@ def get_notifications(self): :return list: Notifications, as a list of dictionaries """ - tag_recipients = ["!everyone", *[f"!{tag}" for tag in self.data["tags"]]] + tag_recipients = ["!everyone", *[f"!{tag}" for tag in self.config.get_active_tags(self)]] if self.is_admin: # for backwards compatibility - used to be called '!admins' even if the tag is 'admin' tag_recipients.append("!admins") @@ -457,7 +482,7 @@ def sort_user_tags(self): tags = self.data["tags"] sorted_tags = [] - for tag in config.get("flask.tag_order"): + for tag in self.config.get("flask.tag_order"): if tag in tags: sorted_tags.append(tag) diff --git a/datasources/douyin/search_douyin.py b/datasources/douyin/search_douyin.py index e66b177ff..4b5d5b814 100644 --- a/datasources/douyin/search_douyin.py +++ b/datasources/douyin/search_douyin.py @@ -18,7 +18,7 @@ class SearchDouyin(Search): title = "Import scraped Douyin data" # title displayed in UI description = "Import Douyin data collected with an external tool such as Zeeschuimer." # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - is_from_extension = True + is_from_zeeschuimer = True # not available as a processor for existing datasets accepts = [None] diff --git a/datasources/gab/search_gab.py b/datasources/gab/search_gab.py index 4b200b667..2ad7dfc34 100644 --- a/datasources/gab/search_gab.py +++ b/datasources/gab/search_gab.py @@ -16,7 +16,7 @@ class SearchGab(Search): title = "Import scraped Gab data" # title displayed in UI description = "Import Gab data collected with an external tool such as Zeeschuimer." # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - is_from_extension = True + is_from_zeeschuimer = True fake = "" # not available as a processor for existing datasets diff --git a/datasources/imgur/search_imgur.py b/datasources/imgur/search_imgur.py index d3e55c38d..b8c80ec5b 100644 --- a/datasources/imgur/search_imgur.py +++ b/datasources/imgur/search_imgur.py @@ -18,7 +18,7 @@ class SearchNineGag(Search): title = "Import scraped Imgur data" # title displayed in UI description = "Import Imgur data collected with an external tool such as Zeeschuimer." # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - is_from_extension = True + is_from_zeeschuimer = True # not available as a processor for existing datasets accepts = [None] diff --git a/datasources/instagram/search_instagram.py b/datasources/instagram/search_instagram.py index b82e4ca3e..3a3b76f4c 100644 --- a/datasources/instagram/search_instagram.py +++ b/datasources/instagram/search_instagram.py @@ -21,7 +21,7 @@ class SearchInstagram(Search): title = "Import scraped Instagram data" # title displayed in UI description = "Import Instagram data collected with an external tool such as Zeeschuimer." 
# description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - is_from_extension = True + is_from_zeeschuimer = True # not available as a processor for existing datasets accepts = [None] diff --git a/datasources/linkedin/search_linkedin.py b/datasources/linkedin/search_linkedin.py index f954782e0..f357341ed 100644 --- a/datasources/linkedin/search_linkedin.py +++ b/datasources/linkedin/search_linkedin.py @@ -21,7 +21,7 @@ class SearchLinkedIn(Search): title = "Import scraped LinkedIn data" # title displayed in UI description = "Import LinkedIn data collected with an external tool such as Zeeschuimer." # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - is_from_extension = True + is_from_zeeschuimer = True # not available as a processor for existing datasets accepts = [None] diff --git a/datasources/ninegag/search_9gag.py b/datasources/ninegag/search_9gag.py index 973de82ba..e5d6c267b 100644 --- a/datasources/ninegag/search_9gag.py +++ b/datasources/ninegag/search_9gag.py @@ -19,7 +19,7 @@ class SearchNineGag(Search): title = "Import scraped 9gag data" # title displayed in UI description = "Import 9gag data collected with an external tool such as Zeeschuimer." # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - is_from_extension = True + is_from_zeeschuimer = True # not available as a processor for existing datasets accepts = [None] diff --git a/datasources/parler/DESCRIPTION.md b/datasources/parler/DESCRIPTION.md deleted file mode 100644 index f2d745c68..000000000 --- a/datasources/parler/DESCRIPTION.md +++ /dev/null @@ -1,11 +0,0 @@ -The Parler data source can be used to manipulate data collected from parler.com with -[Zeeschuimer](https://github.com/digitalmethodsinitiative/zeeschuimer). Data is collected with the browser extension; -4CAT cannot collect data on its own. After collecting data with Zeeschuimer it can be uploaded to 4CAT for further -processing and analysis. See the Zeeschuimer documentation for more information on how to collect data with it. - -Data is collected as it is formatted internally by Parler's website. Posts are stored as (large) JSON objects; it -will usually be easier to make sense of the data by downloading it as a CSV file from 4CAT instead. - -### Data format -Most data attributes map to 4CAT's CSV export quite straightforwardly. Note that 'echoes' are Parler's term for what on -Twitter would be called a 'retweet', i.e. a post reposted by someone else. \ No newline at end of file diff --git a/datasources/parler/__init__.py b/datasources/parler/__init__.py deleted file mode 100644 index 44d34ac14..000000000 --- a/datasources/parler/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -""" -Initialize Parler data source -""" - -# An init_datasource function is expected to be available to initialize this -# data source. A default function that does this is available from the -# backend helpers library. 
-from common.lib.helpers import init_datasource - -# Internal identifier for this data source -DATASOURCE = "parler" -NAME = "Parler" \ No newline at end of file diff --git a/datasources/parler/search_parler.py b/datasources/parler/search_parler.py deleted file mode 100644 index 8ccc7ccd8..000000000 --- a/datasources/parler/search_parler.py +++ /dev/null @@ -1,66 +0,0 @@ -""" -Import scraped Parler data - -It's prohibitively difficult to scrape data from Parler within 4CAT itself -due to its aggressive rate limiting and login wall. Instead, import data -collected elsewhere. -""" -import datetime -import re - -from backend.lib.search import Search -from common.lib.item_mapping import MappedItem - - -class SearchParler(Search): - """ - Import scraped LinkedIn data - """ - type = "parler-search" # job ID - category = "Search" # category - title = "Import scraped Parler data" # title displayed in UI - description = "Import Parler data collected with an external tool such as Zeeschuimer." # description displayed in UI - extension = "ndjson" # extension of result file, used internally and in UI - is_from_extension = True - - # not available as a processor for existing datasets - accepts = [None] - - def get_items(self, query): - """ - Run custom search - - Not available for Parler - """ - raise NotImplementedError("Parler datasets can only be created by importing data from elsewhere") - - @staticmethod - def map_item(node): - """ - Parse Parler post - - :param node: Data as received from Parler - :return dict: Mapped item - """ - post = node["data"] - post_time = datetime.datetime.strptime(post["date_created"], "%Y-%m-%dT%H:%M:%S.000000Z") - - return MappedItem({ - "id": post["postuuid"], - "thread_id": post["postuuid"], - "body": post["body"], - "timestamp": post_time.strftime("%Y-%m-%d %H:%M:%S"), - "author": post["user"]["username"], - "author_name": post["user"]["name"], - "author_followers": post["user"]["follower_count"], - "detected_language": post["detected_language"], - "views": post["views"], - "echoes": post["echos"], - "comments": post["total_comments"], - "is_sensitive": "yes" if post["sensitive"] else "no", - "is_echo": "yes" if post["is_echo"] else "no", - "is_ad": "yes" if post["ad"] else "no", - "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", post["body"])), - "image_url": post["image"] if post["image"] else "", - "unix_timestamp": int(post_time.timestamp()) - }) diff --git a/datasources/tiktok/search_tiktok.py b/datasources/tiktok/search_tiktok.py index 90f443b49..2ee3c66bd 100644 --- a/datasources/tiktok/search_tiktok.py +++ b/datasources/tiktok/search_tiktok.py @@ -20,7 +20,7 @@ class SearchTikTok(Search): title = "Import scraped Tiktok data" # title displayed in UI description = "Import Tiktok data collected with an external tool such as Zeeschuimer." 
# description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - is_from_extension = True + is_from_zeeschuimer = True # not available as a processor for existing datasets accepts = [None] diff --git a/datasources/tiktok_comments/search_tiktok_comments.py b/datasources/tiktok_comments/search_tiktok_comments.py index d44581193..efaffc21d 100644 --- a/datasources/tiktok_comments/search_tiktok_comments.py +++ b/datasources/tiktok_comments/search_tiktok_comments.py @@ -20,7 +20,7 @@ class SearchTikTokComments(Search): title = "Import scraped Tiktok comment data" # title displayed in UI description = "Import Tiktok comment data collected with an external tool such as Zeeschuimer." # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - is_from_extension = True + is_from_zeeschuimer = True # not available as a processor for existing datasets accepts = [None] diff --git a/datasources/truth/search_truth.py b/datasources/truth/search_truth.py index 52057e0fa..c1743e12c 100644 --- a/datasources/truth/search_truth.py +++ b/datasources/truth/search_truth.py @@ -16,7 +16,7 @@ class SearchGab(Search): title = "Import scraped Truth Social data" # title displayed in UI description = "Import Truth Social data collected with an external tool such as Zeeschuimer." # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - is_from_extension = True + is_from_zeeschuimer = True fake = "" # not available as a processor for existing datasets diff --git a/datasources/twitter-import/search_twitter.py b/datasources/twitter-import/search_twitter.py index baa506923..9acb2b45c 100644 --- a/datasources/twitter-import/search_twitter.py +++ b/datasources/twitter-import/search_twitter.py @@ -20,7 +20,7 @@ class SearchTwitterViaZeeschuimer(Search): title = "Import scraped X/Twitter data" # title displayed in UI description = "Import X/Twitter data collected with an external tool such as Zeeschuimer." 
# description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - is_from_extension = True + is_from_zeeschuimer = True # not available as a processor for existing datasets accepts = [] diff --git a/docker-compose_build.yml b/docker-compose_build.yml index b1c1fa1af..7466e8ba8 100644 --- a/docker-compose_build.yml +++ b/docker-compose_build.yml @@ -9,7 +9,6 @@ services: - POSTGRES_HOST_AUTH_METHOD=${POSTGRES_HOST_AUTH_METHOD} volumes: - ./data/postgres/:/var/lib/postgresql/data/ -# - 4cat_db:/var/lib/postgresql/data/ healthcheck: test: [ "CMD-SHELL", "pg_isready -U $${POSTGRES_USER}" ] interval: 5s @@ -33,10 +32,6 @@ services: - ./data/datasets/:/usr/src/app/data/ - ./data/config/:/usr/src/app/config/ - ./data/logs/:/usr/src/app/logs/ -# - 4cat_data:/usr/src/app/data/ -# - 4cat_config:/usr/src/app/config/ -# - 4cat_logs:/usr/src/app/logs/ - entrypoint: docker/docker-entrypoint.sh frontend: @@ -54,9 +49,6 @@ services: - ./data/datasets/:/usr/src/app/data/ - ./data/config/:/usr/src/app/config/ - ./data/logs/:/usr/src/app/logs/ -# - 4cat_data:/usr/src/app/data/ -# - 4cat_config:/usr/src/app/config/ -# - 4cat_logs:/usr/src/app/logs/ command: ["docker/wait-for-backend.sh"] volumes: diff --git a/docker/Dockerfile b/docker/Dockerfile index 709d68893..046b39cba 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -24,6 +24,7 @@ ENV PYTHONUNBUFFERED=1 # Install dependencies RUN pip3 install --upgrade pip COPY ./requirements.txt /usr/src/app/requirements.txt +COPY ./extensions /usr/src/app/extensions COPY ./setup.py /usr/src/app/setup.py COPY ./VERSION /usr/src/app/VERSION COPY ./README.md /usr/src/app/README.md diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index a08dda6c9..000000000 --- a/docs/conf.py +++ /dev/null @@ -1,62 +0,0 @@ -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -import os -import sys -sys.path.insert(0, os.path.abspath('../..')) -print(os.path.abspath('../..')) - - -# -- Project information ----------------------------------------------------- -project = '4CAT Capture & Analysis Toolkit' -copyright = '2021, OILab & Digital Methods Initiative' -author = 'OILab & Digital Methods Initiative' - - -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - "sphinx.ext.napoleon", - 'm2r2', - 'sphinx.ext.intersphinx' -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. 
-exclude_patterns = [] - -source_suffix = [".rst", ".md"] - -autodoc_default_options = { - "member-order": "groupwise" -} - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = 'sphinx_rtd_theme' - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] \ No newline at end of file diff --git a/docs/datasource.rst b/docs/datasource.rst deleted file mode 100644 index 56ae2189c..000000000 --- a/docs/datasource.rst +++ /dev/null @@ -1,73 +0,0 @@ -================= -4CAT Data sources -================= - -4CAT is a modular tool. Its modules come in two varieties: data sources and processors. This article covers the former. - -Data sources are a collection of workers, processors and interface elements that extend 4CAT to allow scraping, -processing and/or retrieving data for a given platform (such as Instagram, Reddit or Telegram). 4CAT has APIs that can -do most of the scaffolding around this for you so data source can be quite lightweight and mostly focus on retrieving -the actual data while 4CAT's back-end takes care of the scheduling, determining where the output should go, et cetera. - -Data sources are defined as an arbitrarily-named folder in the datasources folder in the 4CAT root. It is recommended to -use the datasource ID (see below) as the data source folder name. However, since Python files included in the folder -will be included as modules by 4CAT, folder names should be allowed as module names. Concretely this means (among other -things) that data source folder names cannot start with a number (hence the fourchan data source). - -*WARNING:* Data sources in multiple ways can define arbitrary code that will be run by either the 4CAT server or -client-side browsers. Be careful when running a data source supplied by someone else. - -A data source will at least contain the following: - -* An __init__.py containing data source metadata and initialisation code -* A search worker, which can collect data according to provided parameters and format it as a CSV or NDJSON file that - 4CAT can work with. - -It may contain additional components: - -* Any processors that are specific to datasets created by this data source -* Views for the web app that allow more advanced behaviour of the web tool interface -* Database or Sphinx index definitions - -The instructions below describe how to format and create these components (work in progress!) - -------------------- -Initialisation code -------------------- - -The data source root should contain a file `__init__.py` which in turn defines the following: - -.. code-block:: python - - DATASOURCE = "datasource-identifier" - -This constant defines the data source ID. This is most importantly used in config.py to enable the data source. - -.. code-block:: python - - def init_datasource(database, logger, queue, name): - pass - -This function is called when 4CAT starts, if the data source is enabled, and should set up anything the data source -needs to function (e.g. queueing any recurring workers). A default implementation of this function can be used instead -(and when defining your own, it is advised to still call it as part of your own implementation): - -.. 
code-block:: python - - from backend.lib.helpers import init_datasource - ------------------- -The `Search` class ------------------- -.. autoclass:: backend.abstract.search.Search - :members: - :undoc-members: - :show-inheritance: - ---------------------------- -The `SearchWithScope` class ---------------------------- -.. autoclass:: backend.abstract.search.SearchWithScope - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index addb57b3f..000000000 --- a/docs/index.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. 4CAT Capture & Analysis Toolkit documentation master file, created by - sphinx-quickstart on Tue Oct 19 11:38:20 2021. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -Welcome to 4CAT Capture & Analysis Toolkit's documentation! -=========================================================== - -This documentation collects information about 4CAT's internals - -.. toctree:: - :maxdepth: 2 - :caption: Contents: - - introduction - processor - datasource - worker - -* :ref:`search` diff --git a/docs/introduction.rst b/docs/introduction.rst deleted file mode 100644 index b33e21e8d..000000000 --- a/docs/introduction.rst +++ /dev/null @@ -1,5 +0,0 @@ -============ -Introduction -============ - -.. mdinclude:: ../../README.md \ No newline at end of file diff --git a/docs/processor.rst b/docs/processor.rst deleted file mode 100644 index 1bc3c5191..000000000 --- a/docs/processor.rst +++ /dev/null @@ -1,63 +0,0 @@ -=============== -4CAT Processors -=============== - -4CAT is a modular tool. Its modules come in two varieties: data sources and processors. This article covers the latter. - -Processors are bits of code that produce a dataset. Typically, their input is another dataset. As such they can be used -to analyse data; for example, a processor can take a csv file containing posts as input, count how many posts occur per -month, and produce another csv file with the amount of posts per month (one month per row) as output. Processors always -produce the following things: - -* A set of metadata for the Dataset the processor will produce. This is stored in 4CAT's PostgreSQL database. The - record for the database is created when the processor's job is first queued, and updated by the processor. -* A result file, which may have an arbitrary format. This file contains whatever the processor produces, e.g. a list - of frequencies, an image wall or a zip archive containing word embedding models. -* A log file, with the same file name as the result file but with a '.log' extension. This documents any output from - the processor while it was producing the result file. - -4CAT has an API that can do most of the scaffolding around this for you so processors can be quite lightweight and -mostly focus on the analysis while 4CAT's back-end takes care of the scheduling, determining where the output should -go, et cetera. - -A minimal example of a processor could look like this: - -.. 
code-block:: python - - """ - A minimal example 4CAT processor - """ - from backend.abstract.processor import BasicProcessor - - class ExampleProcessor(BasicProcessor): - """ - Example Processor - """ - type = "example-processor" # job type ID - category = "Examples" # category - title = "A simple example" # title displayed in UI - description = "This doesn't do much" # description displayed in UI - extension = "csv" # extension of result file, used internally and in UI - - input = "csv:body" - output = "csv:value" - - def process(self): - """ - Saves a CSV file with one column ("value") and one row with a value ("Hello - world") and marks the dataset as finished. - """ - data = {"value": "Hello world!"} - self.write_csv_items_and_finish(data) - - -But there is more you can do. The full API looks like this: - --------------------------- -The `BasicProcessor` class --------------------------- - -.. autoclass:: backend.abstract.processor.BasicProcessor - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index ecd67a4ad..000000000 --- a/docs/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -m2r2 \ No newline at end of file diff --git a/docs/worker.rst b/docs/worker.rst deleted file mode 100644 index bc122f7e9..000000000 --- a/docs/worker.rst +++ /dev/null @@ -1,14 +0,0 @@ -=============== -4CAT Workers -=============== - -TBD - ------------------------ -The `BasicWorker` class ------------------------ - -.. autoclass:: backend.abstract.worker.BasicWorker - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/extensions/.gitignore b/extensions/.gitignore new file mode 100644 index 000000000..d7e401301 --- /dev/null +++ b/extensions/.gitignore @@ -0,0 +1,5 @@ +# Ignore everything in this directory +* +# Except these files +!.gitignore +!README.md diff --git a/extensions/README.md b/extensions/README.md new file mode 100644 index 000000000..f594bc152 --- /dev/null +++ b/extensions/README.md @@ -0,0 +1,39 @@ +This folder contains 4CAT extensions. + +Extensions are processors or data sources that are not part of the main 4CAT codebase, but are otherwise compatible +with it. For example, a processor that interfaces with a closed API would not be useful to most 4CAT users, but if you +have access to it, you could add such a processor to 4CAT as an extension. + + +## Installation +Extensions are simply folders within this 'extensions' folder that contain the Python files with the relevant code. It +is strongly recommended that you use git for version control of these folders. Simply commit the code to +a repository somewhere, then clone it into this folder like so: + +```shell +cd [4cat root] +cd extensions +git clone [repository URL] +``` + +This ensures that any dataset created with processors from your extension will be aware of the version of the code it +was created with, which helps with debugging and makes research reproducible and traceable. + +## Structure +Processors can simply be .py files in the extension folder. Data sources should be sub-folders in a "datasources" +folder. 
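For illustration only (not part of this patch), a minimal processor file in an extension could look something like the sketch below; the file name, class name and job type ID are hypothetical, and the attributes mirror those of 4CAT's built-in processors:

```python
"""
Hypothetical minimal extension processor, e.g. extensions/my_extension/my_processor.py
"""
from backend.lib.processor import BasicProcessor


class MyExtensionProcessor(BasicProcessor):
    """
    Example extension processor that writes a single placeholder row
    """
    type = "my-extension-processor"  # job type ID
    category = "Examples"  # category displayed in UI
    title = "My extension processor"  # title displayed in UI
    description = "Writes a single placeholder row"  # description displayed in UI
    extension = "csv"  # extension of result file, used internally and in UI

    def process(self):
        # write one row to the result file and mark the dataset as finished
        self.write_csv_items_and_finish([{"value": "Hello from an extension"}])
```
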
An extension containing both processors and a data source could look like this: + +``` +[4CAT root]/ +├─ extensions/ +│ ├─ my_extension/ +│ ├─ my_processor.py +│ ├─ my_other_processor.py +│ ├─ datasources/ +│ ├─ my_datasource/ +│ ├─ __init__.py +│ ├─ DESCRIPTION.md +│ ├─ search_my_datasource.py +``` + +In this scenario, `my_extension` would be a git repository within which all other files are contained. \ No newline at end of file diff --git a/helper-scripts/migrate.py b/helper-scripts/migrate.py index fb85772ae..25071afe4 100644 --- a/helper-scripts/migrate.py +++ b/helper-scripts/migrate.py @@ -79,8 +79,39 @@ def check_for_nltk(): nltk.download("omw-1.4", quiet=True) +def install_extensions(no_pip=True): + """ + Check for extensions and run any installation scripts found. -def finish(args, logger): + Note: requirements texts are handled by setup.py + """ + # Check for extension packages + if os.path.isdir("extensions"): + for root, dirs, files in os.walk("extensions"): + for file in files: + if file == "fourcat_install.py": + command = [interpreter, os.path.join(root, file)] + if args.component == "frontend": + command.append("--component=frontend") + elif args.component == "backend": + command.append("--component=backend") + elif args.component == "both": + command.append("--component=both") + + if no_pip: + command.append("--no-pip") + + print(f"Installing extension: {os.path.join(root, file)}") + result = subprocess.run(command, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + if result.returncode != 0: + print("Error while running extension installation script: " + os.path.join(root, file)) + + print(result.stdout.decode("utf-8")) if result.stdout else None + print(result.stderr.decode("utf-8")) if result.stderr else None + + +def finish(args, logger, no_pip=True): """ Finish migration @@ -89,6 +120,7 @@ def finish(args, logger): wrap up and exit. """ check_for_nltk() + install_extensions(no_pip=no_pip) logger.info("\nMigration finished. You can now safely restart 4CAT.\n") if args.restart: @@ -115,7 +147,7 @@ def finish(args, logger): cli.add_argument("--no-migrate", "-m", default=False, action="store_true", help="Do not run scripts to upgrade between minor versions. Use if you only want to use migrate to e.g. upgrade dependencies.") cli.add_argument("--current-version", "-v", default="config/.current-version", help="File path to .current-version file, relative to the 4CAT root") cli.add_argument("--output", "-o", default="", help="By default migrate.py will send output to stdout. If this argument is set, it will write to the given path instead.") -cli.add_argument("--component", "-c", default="both", help="Which component of 4CAT to migrate. Currently only skips check for if 4CAT is running when set to 'frontend'") +cli.add_argument("--component", "-c", default="both", help="Which component of 4CAT to migrate ('both', 'backend', 'frontend'). Skips check for if 4CAT is running when set to 'frontend'. Also used by extensions w/ fourcat_install.py") cli.add_argument("--branch", "-b", default=False, help="Which branch to check out from GitHub. 
By default, check out the latest release.") args = cli.parse_args() @@ -125,6 +157,9 @@ def finish(args, logger): print("This script needs to be run from the same folder as 4cat-daemon.py\n") exit(1) +# track pip +pip_ran = False + # set up logging logger = logging.getLogger("migrate") logger.setLevel(logging.INFO) @@ -145,6 +180,7 @@ def finish(args, logger): logger.info("Restart after migration: " + ("yes" if args.restart else "no")) logger.info("Repository URL: " + args.repository) logger.info(".current-version path: " + args.current_version) +logger.info(f"Current Datetime: {time.strftime('%Y-%m-%d %H:%M:%S')}") # --------------------------------------------- # Ensure existence of current version file @@ -221,7 +257,7 @@ def finish(args, logger): logger.info(" ...latest release available from GitHub (%s) is older than or equivalent to currently checked out version " "(%s)." % (tag_version, current_version_c)) logger.info(" ...upgrade not necessary, skipping.") - finish(args, logger) + finish(args, logger, no_pip=pip_ran) logger.info(" ...ensuring repository %s is a known remote" % args.repository) remote = subprocess.run(shlex.split("git remote add 4cat_migrate %s" % args.repository), stdout=subprocess.PIPE, @@ -297,7 +333,7 @@ def finish(args, logger): if current_version == target_version: logger.info(" ...already up to date.") - finish(args, logger) + finish(args, logger, no_pip=pip_ran) if current_version_c[0:3] != target_version_c[0:3]: logger.info(" ...cannot migrate between different major versions.") @@ -365,6 +401,7 @@ def log_pip_output(logger, output): pip = subprocess.run([interpreter, "-m", "pip", "install", "-r", "requirements.txt", "--upgrade", "--upgrade-strategy", "eager"], stderr=subprocess.STDOUT, stdout=subprocess.PIPE, check=True, cwd=cwd) log_pip_output(logger, pip.stdout) + pip_ran = True except subprocess.CalledProcessError as e: log_pip_output(logger, e.output) logger.info(f"\n Error running pip: {e}") @@ -410,4 +447,4 @@ def log_pip_output(logger, output): # --------------------------------------------- # Done! 
Wrap up and finish # --------------------------------------------- -finish(args, logger) +finish(args, logger, no_pip=pip_ran) diff --git a/helper-scripts/migrate/migrate-1.45-1.46.py b/helper-scripts/migrate/migrate-1.45-1.46.py new file mode 100644 index 000000000..8bf5d0683 --- /dev/null +++ b/helper-scripts/migrate/migrate-1.45-1.46.py @@ -0,0 +1,33 @@ +# Add a 'software_source' column to the datasets table +import json +import sys +import os + +from pathlib import Path + +sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), "../..")) +from common.lib.database import Database +from common.lib.logger import Logger + +log = Logger(output=True) + +import configparser + +ini = configparser.ConfigParser() +ini.read(Path(__file__).parent.parent.parent.resolve().joinpath("config/config.ini")) +db_config = ini["DATABASE"] + +db = Database(logger=log, dbname=db_config["db_name"], user=db_config["db_user"], password=db_config["db_password"], + host=db_config["db_host"], port=db_config["db_port"], appname="4cat-migrate") + +print(" Checking if datasets table has a column 'software_source'...") +has_column = db.fetchone( + "SELECT COUNT(*) AS num FROM information_schema.columns WHERE table_name = 'datasets' AND column_name = 'software_source'") +if has_column["num"] == 0: + print(" ...No, adding.") + current_source = db.fetchone("SELECT value FROM settings WHERE name = '4cat.github_url' AND tag = ''") + current_source = json.loads(current_source["value"]) if current_source is not None else "" + db.execute("ALTER TABLE datasets ADD COLUMN software_source TEXT DEFAULT %s", (current_source,)) + db.commit() +else: + print(" ...Yes, nothing to update.") \ No newline at end of file diff --git a/processors/filtering/column_filter.py b/processors/filtering/column_filter.py index 01c7fa88f..2dc73b63e 100644 --- a/processors/filtering/column_filter.py +++ b/processors/filtering/column_filter.py @@ -75,7 +75,7 @@ class ColumnFilter(BaseFilter): @classmethod def is_compatible_with(cls, module=None, user=None): """ - Allow processor on top datasets. + Allow processor on top datasets that are CSV or NDJSON. :param module: Module to determine compatibility with """ @@ -262,11 +262,11 @@ class ColumnProcessorFilter(ColumnFilter): @classmethod def is_compatible_with(cls, module=None, user=None): """ - Allow processor on top datasets. 
+ Allow on child datasets and do not create a standalone dataset :param module: Dataset or processor to determine compatibility with """ - return module.get_extension() in ("csv", "ndjson") and not module.is_top_dataset() + return not module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") @classmethod def is_filter(cls): diff --git a/processors/metrics/rank_attribute.py b/processors/metrics/rank_attribute.py index adffe824a..0e38757c6 100644 --- a/processors/metrics/rank_attribute.py +++ b/processors/metrics/rank_attribute.py @@ -110,11 +110,12 @@ class AttributeRanker(BasicProcessor): @classmethod def is_compatible_with(cls, module=None, user=None): """ - Allow processor on top image rankings + Allow processor to run on all csv and NDJSON datasets :param module: Module to determine compatibility with """ - return module.get_extension() in ["csv", "ndjson"] + + return module.get_extension() in ("csv", "ndjson") def process(self): """ @@ -134,7 +135,7 @@ def process(self): weighby = self.parameters.get("weigh") to_lowercase = self.parameters.get("to-lowercase", True) self.include_missing_data = self.parameters.get("count_missing") - + try: if self.parameters.get("filter"): filter = re.compile(".*" + self.parameters.get("filter") + ".*") @@ -203,7 +204,7 @@ def missing_value_placeholder(data, field_name): for value in values: if to_lowercase: value = value.lower() - + if rank_style == "overall" and value not in overall_top: continue @@ -340,4 +341,4 @@ def get_options(cls, parent_dataset=None, user=None): options["columns"]["options"] = {v: v for v in columns} options["columns"]["default"] = ["body"] - return options \ No newline at end of file + return options diff --git a/processors/networks/wikipedia_network.py b/processors/networks/wikipedia_network.py index 00e141fc7..0426c97d2 100644 --- a/processors/networks/wikipedia_network.py +++ b/processors/networks/wikipedia_network.py @@ -3,19 +3,20 @@ """ import re import requests - -from backend.lib.processor import BasicProcessor from lxml import etree from lxml.cssselect import CSSSelector as css from io import StringIO - import networkx as nx +from backend.lib.processor import BasicProcessor +from common.lib.exceptions import ProcessorInterruptedException + __author__ = "Stijn Peeters" __credits__ = ["Stijn Peeters", "Sal Hagen"] __maintainer__ = "Stijn Peeters" __email__ = "4cat@oilab.eu" + class WikiURLCoLinker(BasicProcessor): """ Generate URL co-link network diff --git a/processors/presets/neologisms.py b/processors/presets/neologisms.py index 26684e4d0..1cf258503 100644 --- a/processors/presets/neologisms.py +++ b/processors/presets/neologisms.py @@ -19,17 +19,6 @@ class NeologismExtractor(ProcessorPreset): references = ["Van Soest, Jeroen. 2019. 'Language Innovation Tracker: Detecting language innovation in online discussion fora.' (MA thesis), Beuls, K. (Promotor), Van Eecke, P. 
(Advisor).'"] - @staticmethod - def is_compatible_with(module=None, user=None): - """ - Determine compatibility - - This preset is compatible with any dataset that has columns - - :param Dataset module: Module ID to determine compatibility with - :return bool: - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") @classmethod def get_options(cls, parent_dataset=None, user=None): @@ -60,6 +49,16 @@ def get_options(cls, parent_dataset=None, user=None): return options + @classmethod + def is_compatible_with(cls, module=None, user=None): + """ + Allow processor to run on all csv and NDJSON datasets + + :param module: Dataset or processor to determine compatibility with + """ + + return module.get_extension() in ("csv", "ndjson") + def get_processor_pipeline(self): """ This queues a series of post-processors to extract neologisms from a diff --git a/processors/text-analysis/get_entities.py b/processors/text-analysis/get_entities.py deleted file mode 100644 index e639c7672..000000000 --- a/processors/text-analysis/get_entities.py +++ /dev/null @@ -1,172 +0,0 @@ -""" -Extract nouns from SpaCy NLP docs. - -""" -import pickle -import spacy - -from collections import Counter -from spacy.tokens import DocBin -from common.lib.helpers import UserInput -from backend.lib.processor import BasicProcessor -from common.lib.exceptions import ProcessorInterruptedException - -__author__ = "Sal Hagen" -__credits__ = ["Sal Hagen"] -__maintainer__ = "Sal Hagen" -__email__ = "4cat@oilab.eu" - - -class ExtractNouns(BasicProcessor): # TEMPORARILY DISABLED - """ - Rank vectors over time - """ - type = "get-entities" # job type ID - category = "Text analysis" # category - title = "Extract named entities" # title displayed in UI - description = "Retrieve named entities detected by SpaCy, ranked on frequency. Be sure to have selected " \ - "\"Named Entity Recognition\" in the previous module." # description displayed in UI - extension = "csv" # extension of result file, used internally and in UI - - followups = ["wordcloud"] - - options = { - "entities": { - "type": UserInput.OPTION_MULTI, - "default": [], - "options": { - "PERSON": "PERSON: People, including fictional.", - "NORP": "NORP: Nationalities or religious or political groups.", - "FAC": "FAC: Buildings, airports, highways, bridges, etc.", - "ORG": "ORG: Companies, agencies, institutions, etc.", - "GPE": "GPE: Countries, cities, states.", - "LOC": "LOC: Non-GPE locations, mountain ranges, bodies of water.", - "PRODUCT": "PRODUCT: Objects, vehicles, foods, etc. (Not services.)", - "EVENT": "EVENT: Named hurricanes, battles, wars, sports events, etc.", - "WORK_OF_ART": "WORK_OF_ART: Titles of books, songs, etc.", - "LAW": "LAW: Named documents made into laws.", - "LANGUAGE": "LANGUAGE: Any named language.", - "DATE": "DATE: Absolute or relative dates or periods.", - "TIME": "TIME: Times smaller than a day.", - "PERCENT": "PERCENT: Percentage, including ”%“.", - "MONEY": "MONEY: Monetary values, including unit.", - "QUANTITY": "QUANTITY: Measurements, as of weight or distance.", - "ORDINAL": "ORDINAL: “first”, “second”, etc.", - "CARDINAL": "CARDINAL: Numerals that do not fall under another type." - }, - "help": "What types of entities to extract (select at least one)", - "tooltip": "The above list is derived from the SpaCy documentation (see references)." 
- } - } - - references = [ - "[SpaCy named entities list](https://spacy.io/api/annotation#named-entities)" - ] - - @classmethod - def is_compatible_with(cls, module=None, user=None): - """ - Allow processor on linguistic feature data - - :param module: Module to determine compatibility with - """ - - return module.type == "linguistic-features" - - def process(self): - """ - Opens the SpaCy output and gets ze entities. - - """ - - # Validate whether the user enabled the right parameters. - if "ner" not in self.source_dataset.parameters["enable"]: - self.dataset.update_status("Enable \"Named entity recognition\" in previous module") - self.dataset.finish(0) - return - - else: - # Extract the SpaCy docs first - self.dataset.update_status("Unzipping SpaCy docs") - - # Store all the entities in this list - li_entities = [] - nlp = spacy.load("en_core_web_sm") # Load model - - for doc_file in self.iterate_archive_contents(self.source_file): - with doc_file.open("rb") as pickle_file: - # Load DocBin - file = pickle.load(pickle_file) - doc_bin = DocBin().from_bytes(file) - docs = list(doc_bin.get_docs(nlp.vocab)) - - for doc in docs: - post_entities = [] - - # stop processing if worker has been asked to stop - if self.interrupted: - raise ProcessorInterruptedException("Interrupted while processing documents") - - for ent in doc.ents: - if ent.label_ in self.parameters["entities"]: - post_entities.append((ent.text, ent.label_)) # Add a tuple - - li_entities.append(post_entities) - - results = [] - - if li_entities: - - # Also add the data to the original file, if indicated. - if self.parameters.get("overwrite"): - self.add_field_to_parent(field_name='named_entities', - # Format like "Apple:ORG, Gates:PERSON, ..." and add to the row - new_data=[", ".join([":".join(post_entities) for post_entities in entity]) for entity in li_entities], - which_parent=self.dataset.top_parent(), - update_existing=True) - - all_entities = [] - # Convert to lower and filter out one-letter words. Join the words with the entities so we can group easily. - for post_ents in li_entities: - for pair in post_ents: - if pair and len(pair[0]) > 1: - pair = pair[0].lower() + " |#| " + pair[1] - all_entities.append(pair) - - # Group and rank - count_nouns = Counter(all_entities).most_common() - # Unsplit and list the count. - results = [{"word": tpl[0].split(" |#| ")[0], "entity": tpl[0].split(" |#| ")[1], "count": tpl[1]} for - tpl in count_nouns] - - # done! - if results: - self.dataset.update_status("Finished") - self.write_csv_items_and_finish(results) - else: - self.dataset.update_status("Finished, but no entities were extracted.") - self.dataset.finish(0) - - @classmethod - def get_options(cls, parent_dataset=None, user=None): - """ - Get processor options - - The feature of this processor that overwrites the parent dataset can - only work properly on csv datasets so check the extension before - showing it. - - :param user: - :param parent_dataset: Dataset to get options for - :return dict: - """ - options = cls.options - if parent_dataset and parent_dataset.top_parent().get_results_path().suffix in [".csv", ".ndjson"]: - options["overwrite"] = { - "type": UserInput.OPTION_TOGGLE, - "default": False, - "help": "Add extracted nouns to source csv", - "tooltip": "Will add a column (\"nouns\", \"nouns_and_compounds\", or \"noun_chunks\"), and the found nouns in the post row." 
- } - - return options diff --git a/processors/text-analysis/get_nouns.py b/processors/text-analysis/get_nouns.py deleted file mode 100644 index cad8653eb..000000000 --- a/processors/text-analysis/get_nouns.py +++ /dev/null @@ -1,196 +0,0 @@ -""" -Extract nouns from SpaCy NLP docs. - -""" -import pickle -import spacy - -from collections import Counter -from spacy.tokens import DocBin -from common.lib.helpers import UserInput -from backend.lib.processor import BasicProcessor - -__author__ = "Sal Hagen" -__credits__ = ["Sal Hagen"] -__maintainer__ = "Sal Hagen" -__email__ = "4cat@oilab.eu" - - -class ExtractNouns(BasicProcessor): - """ - Rank vectors over time - """ - type = "extract-nouns" # job type ID - category = "Text analysis" # category - title = "Extract nouns" # title displayed in UI - description = "Retrieve nouns detected by SpaCy's part-of-speech tagging, and rank by frequency. " \ - "Make sure to have selected \"Part of Speech\" in the previous " \ - "module, as well as \"Dependency parsing\" if you want to extract compound nouns or noun chunks." # description displayed in UI - extension = "csv" # extension of result file, used internally and in UI - - references = ["[Information on noun chunks](https://spacy.io/usage/linguistic-features#noun-chunks)"] - - followups = ["wordcloud"] - - options = { - "type": { - "type": UserInput.OPTION_CHOICE, - "default": ["nouns"], - "options": { - "nouns": "Single-word nouns", - "nouns_and_compounds": "Nouns and compound nouns", - "noun_chunks": "Noun chunks" - }, - "help": "Whether to only get 1) separate words indicated as nouns, 2) nouns and compound nouns " \ - "(nouns with multiple words, e.g.\"United States\") using a custom parser, or 3) noun chunks: " \ - "nouns plus the words describing them, e.g. \"the old grandpa\". See the references for more info." - } - } - - @classmethod - def is_compatible_with(cls, module=None, user=None): - """ - Allow processor on linguistic feature data - - :param module: Module to determine compatibility with - """ - return module.type == "linguistic-features" - - def process(self): - """ - Opens the SpaCy output and gets ze nouns. - - """ - noun_type = self.parameters["type"] - - # Validate whether the user enabled the right parameters. 
- # Check part of speech tagging - if "tagger" not in self.source_dataset.parameters["enable"]: - self.dataset.update_status("Enable \"Part-of-speech tagging\" in previous module") - self.dataset.finish(0) - - # Check dependency parsing if nouns and compouns nouns is selected - elif (noun_type == "nouns_and_compounds" or noun_type == "noun_chunks") and "parser" not in \ - self.source_dataset.parameters["enable"]: - self.dataset.update_status( - "Enable \"Part-of-speech tagging\" and \"Dependency parsing\" for compound nouns/noun chunks in previous module") - self.dataset.finish(0) - - # Valid parameters - else: - - # Extract the SpaCy docs first - self.dataset.update_status("Unzipping SpaCy docs") - self.dataset.update_status("Extracting nouns") - - # Store all the nouns in this list - li_nouns = [] - nlp = spacy.load("en_core_web_sm") # Load model - spacy.load("en_core_web_sm") - - for doc_file in self.iterate_archive_contents(self.source_file): - with doc_file.open("rb") as pickle_file: - # Load DocBin - file = pickle.load(pickle_file) - doc_bin = DocBin().from_bytes(file) - docs = list(doc_bin.get_docs(nlp.vocab)) - - # Simply add each word if its POS is "NOUN" - if noun_type == "nouns": - for doc in docs: - post_nouns = [] - post_nouns += [token.text for token in doc if token.pos_ == "NOUN"] - li_nouns.append(post_nouns) - - # Use SpaCy's noun chunk detection - elif noun_type == "noun_chunks": - - for doc in docs: - - # Note: this is a workaround for now. - # Serialization of the SpaCy docs does not - # work well with dependency parsing after - # loading. Quick fix: parse again. - - new_doc = nlp(doc.text) - post_nouns = [] - for chunk in new_doc.noun_chunks: - post_nouns.append(chunk.text) - - li_nouns.append(post_nouns) - - # Use a custom script to get single nouns and compound nouns - elif noun_type == "nouns_and_compounds": - for doc in docs: - post_nouns = [] - noun = "" - - for i, token in enumerate(doc): - - # Check for common nouns (general, e.g. "people") - # and proper nouns (specific, e.g. "London") - if token.pos_ == "NOUN" or token.pos_ == "PROPN": - # Check if the token is part of a noun chunk - if token.dep_ == "compound": # Check for a compound relation - noun = token.text - else: - if noun: - noun += " " + token.text - post_nouns.append(noun) - noun = "" - else: - post_nouns.append(token.text) - li_nouns.append(post_nouns) - - results = [] - - if li_nouns: - - # Also add the data to the original file, if indicated. - if self.parameters.get("overwrite"): - self.add_field_to_parent(field_name=noun_type, - # Format like "apple, gates, ..." and add to the row - new_data=[", ".join([post_noun.lower() for post_noun in li_noun if len(post_noun) > 1]) for li_noun in li_nouns], - which_parent=self.dataset.top_parent()) - - # convert to lower and filter out one-letter words - all_nouns = [] - for post_n in li_nouns: - all_nouns += [str(cap_noun).lower() for cap_noun in post_n if len(cap_noun) > 1] - - # Group and rank - count_nouns = Counter(all_nouns).most_common() - results = [{"word": tpl[0], "count": tpl[1]} for tpl in count_nouns] - - # done! 
- if results: - self.dataset.update_status("Finished") - self.write_csv_items_and_finish(results) - else: - self.dataset.update_status("Finished, but no nouns were extracted.") - self.dataset.finish(0) - - @classmethod - def get_options(cls, parent_dataset=None, user=None): - """ - Get processor options - - The feature of this processor that overwrites the parent dataset can - only work properly on csv datasets so check the extension before - showing it. - - :param user: - :param parent_dataset: Dataset to get options for - :return dict: - """ - options = cls.options - if parent_dataset and parent_dataset.top_parent().get_results_path().suffix in [".csv", ".ndjson"]: - options["overwrite"] = { - "type": UserInput.OPTION_TOGGLE, - "default": False, - "help": "Add extracted nouns to source csv", - "tooltip": "Will add a column (\"nouns\", \"nouns_and_compounds\", or \"noun_chunks\"), and the found " - "nouns in the post row." - } - - return options diff --git a/processors/text-analysis/linguistic_extractor.py b/processors/text-analysis/linguistic_extractor.py deleted file mode 100644 index 92357853a..000000000 --- a/processors/text-analysis/linguistic_extractor.py +++ /dev/null @@ -1,168 +0,0 @@ -""" -Extract linguistic features from text using SpaCy. - -""" -import zipfile -import pickle -import re - -import spacy -from spacy.tokens import DocBin -from spacy.tokenizer import Tokenizer -from spacy.util import compile_prefix_regex, compile_suffix_regex - -from common.lib.helpers import UserInput -from common.lib.exceptions import ProcessorInterruptedException -from backend.lib.processor import BasicProcessor - -__author__ = "Sal Hagen" -__credits__ = ["Sal Hagen", "Stijn Peeters"] -__maintainer__ = "Sal Hagen" -__email__ = "4cat@oilab.eu" - - -class LinguisticFeatures(BasicProcessor): - """ - Rank vectors over time - """ - type = "linguistic-features" # job type ID - category = "Text analysis" # category - title = "Annotate text features with SpaCy" # title displayed in UI - description = "Annotate your text with a variety of linguistic features using the SpaCy library, " \ - "including part-of-speech tagging, depencency parsing, and named entity recognition. " \ - "Subsequent processors can extract the words labelled by SpaCy (e.g. as a noun or name). " \ - "Produces a Doc file using the en_core_web_sm model. Currently only available for datasets " \ - "with less than 100,000 items. " # description displayed in UI - extension = "zip" # extension of result file, used internally and in UI - - followups = ["get-entities", "extract-nouns"] - - references = [ - "[SpaCy Linguistic Features - Documentation](https://spacy.io/usage/linguistic-features/)" - ] - - options = { - "enable": { - "type": UserInput.OPTION_MULTI, - "default": [], - "options": { - "tagger": "Part-of-speech tagging: Tag the grammatical function of words, like nouns and verbs", - "parser": "Dependency parsing: Extract how words in a sentence relate to each other", - "ner": "Named entity recognition: Annotate what kind of objects appear in a sentence (e.g. Apple -> Organisation)" - }, - "help": "What linguistic features to extract. Without any of these selected, it simply saves the SpaCy docs (tokenised sentences) as a serialized file. See references for more information." 
- } - } - - @classmethod - def is_compatible_with(cls, module=None, user=None): - """ - Allow CSV and NDJSON datasets - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - - def process(self): - """ - Reads text and outputs entities per text body. - """ - - # prepare staging area - staging_area = self.dataset.get_staging_area() - - self.dataset.update_status("Preparing data") - - # go through all archived token sets and vectorise them - results = [] - - # Load the spacy goods - nlp = spacy.load("en_core_web_sm") - nlp.tokenizer = self.custom_tokenizer(nlp) # Keep words with a dash in between - - # Disable what has _not_ been selected - options = ["parser", "tagger", "ner"] - enable = self.parameters.get("enable", False) - - if not enable: - self.dataset.update_status("Select at least one of the options.") - self.dataset.finish(0) - return - - disable = [option for option in options if option not in enable] - - # Get all ze text first so we can process it in batches - posts = [] - for post in self.source_dataset.iterate_items(self): - if post.get("body", ""): - if len(post["body"]) > 1000000: - body = post["body"][:1000000] - else: - body = post["body"] - posts.append(body) - else: - self.dataset.log('Warning: Post %s has no body from which to extract entities' % post.get('id')) - posts.append("") - - # Process the text in batches - if len(posts) < 100000: - self.dataset.update_status("Extracting linguistic features") - else: - self.dataset.update_status( - "Extracting linguistic features is currently only available for datasets with less than 100,000 items.") - self.dataset.finish(0) - return - - # Make sure only the needed information is extracted. - attrs = [] - if "tagger" not in disable: - attrs.append("POS") - if "parser" not in disable: - attrs.append("DEP") - if "ner": - attrs.append("ENT_IOB") - attrs.append("ENT_TYPE") - attrs.append("ENT_ID") - attrs.append("ENT_KB_ID") - - # DocBin for quick saving - doc_bin = DocBin(attrs=attrs) - - # Start the processing! - try: - for i, doc in enumerate(nlp.pipe(posts, disable=disable)): - doc_bin.add(doc) - - # It's quite a heavy process, so make sure it can be interrupted - if self.interrupted: - raise ProcessorInterruptedException("Processor interrupted while iterating through CSV file") - - if i % 1000 == 0: - self.dataset.update_status("Done with post %s out of %s" % (i, len(posts))) - except MemoryError: - self.dataset.update_status("Out of memory. The dataset may be too large to process. Try again with a smaller dataset.", is_final=True) - return - - self.dataset.update_status("Serializing results - this will take a while") - - # Then serialize the NLP docs and the vocab - doc_bytes = doc_bin.to_bytes() - - # Dump ze data in a temporary folder - with staging_area.joinpath("spacy_docs.pb").open("wb") as outputfile: - pickle.dump(doc_bytes, outputfile) - - # create zip of archive and delete temporary files and folder - self.write_archive_and_finish(staging_area, compression=zipfile.ZIP_LZMA) - - def custom_tokenizer(self, nlp): - """ - Custom tokeniser that does not split on dashes. - Useful for names (e.g. Hennis-Plasschaert). 
- """ - infix_re = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'~]''') - prefix_re = compile_prefix_regex(nlp.Defaults.prefixes) - suffix_re = compile_suffix_regex(nlp.Defaults.suffixes) - - return Tokenizer(nlp.vocab, prefix_search=prefix_re.search, - suffix_search=suffix_re.search, - infix_finditer=infix_re.finditer, - token_match=None) diff --git a/processors/text-analysis/split_sentences.py b/processors/text-analysis/split_sentences.py index c5cce2477..dd2be7c2f 100644 --- a/processors/text-analysis/split_sentences.py +++ b/processors/text-analysis/split_sentences.py @@ -86,8 +86,11 @@ def get_options(cls, parent_dataset=None, user=None): @classmethod def is_compatible_with(cls, module=None, user=None): """ - Allow CSV and NDJSON datasets + Allow processor to run on all csv and NDJSON datasets + + :param module: Dataset or processor to determine compatibility with """ + return module.get_extension() in ("csv", "ndjson") def process(self): diff --git a/processors/text-analysis/tokenise.py b/processors/text-analysis/tokenise.py index fb1b89cbd..a104306f1 100644 --- a/processors/text-analysis/tokenise.py +++ b/processors/text-analysis/tokenise.py @@ -50,8 +50,11 @@ class Tokenise(BasicProcessor): @classmethod def is_compatible_with(cls, module=None, user=None): """ - Allow CSV and NDJSON datasets + Allow processor to run on all csv and NDJSON datasets + + :param module: Dataset or processor to determine compatibility with """ + return module.get_extension() in ("csv", "ndjson") @classmethod diff --git a/processors/visualisation/download_videos.py b/processors/visualisation/download_videos.py index aa24a724b..2b385ffe7 100644 --- a/processors/visualisation/download_videos.py +++ b/processors/visualisation/download_videos.py @@ -234,7 +234,7 @@ def is_compatible_with(cls, module=None, user=None): in principle, but any links to videos are likely to come from the top dataset anyway. 
- :param str module: Module ID to determine compatibility with + :param module: Module to determine compatibility with :return bool: """ return ((module.type.endswith("-search") or module.is_from_collector()) @@ -645,6 +645,9 @@ def collect_video_urls(self): if not value: continue + if not isinstance(value, str): + value = str(value) + video_links = self.identify_video_urls_in_string(value) if video_links: item_urls |= set(video_links) @@ -667,7 +670,6 @@ def identify_video_urls_in_string(self, text): :param str text: string that may contain URLs :return list: list containing validated URLs to videos """ - text = str(text) split_comma = self.parameters.get("split-comma", True) if split_comma: texts = text.split(",") diff --git a/processors/visualisation/image_category_wall.py b/processors/visualisation/image_category_wall.py index fee1fb7b0..d74d28e40 100644 --- a/processors/visualisation/image_category_wall.py +++ b/processors/visualisation/image_category_wall.py @@ -61,13 +61,14 @@ class ImageCategoryWallGenerator(BasicProcessor): def is_compatible_with(cls, module=None, user=None): """ Allow processor on CLIP dataset only - + :param module: Dataset or processor to determine compatibility with """ - return module.type.startswith("image-to-categories") or \ - module.type.startswith("image-downloader") or \ - module.type.startswith("video-hasher-1") or \ - module.type.startswith("video-hash-similarity-matrix") + return (module.type.startswith("image-to-categories") or + module.type.startswith("image-downloader") or + module.type.startswith("video-hasher-1") or + module.type.startswith("video-hash-similarity-matrix")) and \ + module.type not in ["image-downloader-screenshots-search"] @@ -170,7 +171,7 @@ def process(self): self.dataset.log(f"Found {image_dataset.type} w/ {image_dataset.num_rows} images and {category_dataset.type} w/ {category_dataset.num_rows} items") category_column = self.parameters.get("category") - if category_column is None: + if not category_column: self.dataset.finish_with_error("No category provided.") return @@ -427,6 +428,3 @@ def process(self): canvas.save(pretty=True) self.dataset.log("Saved to " + str(self.dataset.get_results_path())) return self.dataset.finish(len(category_widths)) - - - diff --git a/processors/visualisation/word-trees.py b/processors/visualisation/word-trees.py index 6446372e8..f7783bcc1 100644 --- a/processors/visualisation/word-trees.py +++ b/processors/visualisation/word-trees.py @@ -104,6 +104,16 @@ class MakeWordtree(BasicProcessor): } } + @classmethod + def is_compatible_with(cls, module=None, user=None): + """ + Allow processor to run on all csv and NDJSON datasets + + :param module: Dataset or processor to determine compatibility with + """ + + return module.get_extension() in ("csv", "ndjson") + # determines how close the nodes are displayed to each other (min. 
1) whitespace = 2 @@ -126,13 +136,6 @@ class MakeWordtree(BasicProcessor): # methods limit = 1 - @classmethod - def is_compatible_with(cls, module=None, user=None): - """ - Allow CSV and NDJSON datasets - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - def process(self): """ This takes a 4CAT results file as input, and outputs a plain text file diff --git a/setup.py b/setup.py index e62f292ba..0e4e536f1 100644 --- a/setup.py +++ b/setup.py @@ -8,10 +8,10 @@ version = versionfile.readline().strip() # Universal packages -packages = [ +packages = set([ "anytree~=2.8.0", "bcrypt~=3.2.0", - "beautifulsoup4~=4.11.0", + "beautifulsoup4",  # was ~=4.11.0 "clarifai-grpc~=9.0", "cryptography>=39.0.1", "cssselect~=1.1.0", @@ -22,7 +22,7 @@ "Flask~=2.2", "Flask_Limiter==1.0.1", "Flask_Login~=0.6", - "gensim>=4.1.0, <4.2", + "gensim>=4.3.3, <4.4.0", "google_api_python_client==2.0.2", "html2text==2020.*", "ImageHash>4.2.0", @@ -31,7 +31,7 @@ "lxml~=4.9.0", "markdown==3.0.1", "markdown2==2.4.2", - "nltk==3.9.1", + "nltk~=3.9.1", "networkx~=2.8.0", "numpy>=1.19.2", "opencv-python>=4.6.0.66", @@ -48,11 +48,11 @@ "razdel~=0.5", "requests~=2.27", "requests_futures", "scenedetect==0.6.0.3", "scikit-learn", "scipy==1.10.1", "shapely", - "spacy==3.7.2", "svgwrite~=1.4.0", "tailer", "Telethon~=1.36.0", @@ -64,17 +64,29 @@ "imagedominantcolor @ git+https://github.com/dale-wahl/imagedominantcolor.git@pillow10", "videohash @ git+https://github.com/dale-wahl/videohash@main", "vk_api", - "yt-dlp", - "en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz#egg=en_core_web_sm" -] + "yt-dlp" +]) + +# Check for extension packages +if os.path.isdir("extensions"): + extension_packages = set() + for root, dirs, files in os.walk("extensions"): + for file in files: + if file == "requirements.txt": + with open(os.path.join(root, file)) as extension_requirements: + for line in extension_requirements.readlines(): + line = line.strip() + # skip blank lines and comments + if line and not line.startswith("#"): + extension_packages.add(line) + if extension_packages: + print("Found extensions, installing additional packages: " + str(extension_packages)) + packages = packages.union(extension_packages) # Some packages don't run on Windows -unix_packages = [ +unix_packages = set([ "python-daemon==2.3.2" -] +]) if os.name != "nt": - packages = packages + unix_packages + packages = packages.union(unix_packages) setup( name='fourcat', @@ -87,5 +99,5 @@ url="https://oilab.eu", packages=['backend', 'webtool', 'datasources'], python_requires='>=3.7', - install_requires=packages, + install_requires=list(packages), ) diff --git a/webtool/__init__.py b/webtool/__init__.py index 7becd1239..6c1786ad5 100644 --- a/webtool/__init__.py +++ b/webtool/__init__.py @@ -105,12 +105,11 @@ # import all views import webtool.views.views_admin +import webtool.views.views_extensions import webtool.views.views_restart import webtool.views.views_user - import webtool.views.views_dataset import webtool.views.views_misc - import webtool.views.api_explorer import webtool.views.api_standalone import webtool.views.api_tool diff --git a/webtool/lib/helpers.py b/webtool/lib/helpers.py index d06f4435c..6cc91eba1 100644 --- a/webtool/lib/helpers.py +++ b/webtool/lib/helpers.py @@ -23,7 +23,7 @@ class Pagination(object): Provide pagination """ - def __init__(self, page, per_page, total_count, route="show_results"): + def __init__(self, page, per_page, total_count, route="show_results", route_args=None): """ Set up pagination object @@ 
-36,6 +36,7 @@ def __init__(self, page, per_page, total_count, route="show_results"): self.per_page = per_page self.total_count = total_count self.route = route + self.route_args = route_args if route_args else {} @property def pages(self): diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py index c50caca26..6ac9272ba 100644 --- a/webtool/lib/template_filters.py +++ b/webtool/lib/template_filters.py @@ -139,9 +139,12 @@ def _jinja2_filter_add_ahref(content): return content @app.template_filter('markdown') -def _jinja2_filter_markdown(text): +def _jinja2_filter_markdown(text, trim_container=False): val = markdown.markdown(text) + if trim_container: + val = re.sub(r"^<p>", "", val) + val = re.sub(r"</p>$", "", val) return val @app.template_filter('isbool') @@ -262,7 +265,7 @@ def _jinja2_filter_post_field(field, post): formatted_field = field field = str(field) - + for key in re.findall(r"\{\{(.*?)\}\}", field): original_key = key @@ -296,7 +299,7 @@ def _jinja2_filter_post_field(field, post): # We see 0 as a valid value - e.g. '0 retweets'. if not val and val != 0: return "" - + # Support some basic string slicing if string_slice: field = field.replace("[" + string_slice + "]", "") @@ -317,7 +320,7 @@ def _jinja2_filter_post_field(field, post): # Apply further filters, if present (e.g. lower) for extra_filter in extra_filters: - + extra_filter = extra_filter.strip() # We're going to parse possible parameters to pass to the filter @@ -328,7 +331,7 @@ def _jinja2_filter_post_field(field, post): extra_filter = extra_filter.split("(")[0] params = [p.strip() for p in params.split(",")] params = [post[param] for param in params] - + val = app.jinja_env.filters[extra_filter](val, *params) if string_slice: @@ -388,3 +391,7 @@ def uniqid(): "__version": version, "uniqid": uniqid } + +@app.template_filter('log') +def _jinja2_filter_log(text): + app.logger.info(text) \ No newline at end of file diff --git a/webtool/static/js/fourcat.js b/webtool/static/js/fourcat.js index e622505b2..950ba523e 100644 --- a/webtool/static/js/fourcat.js +++ b/webtool/static/js/fourcat.js @@ -497,7 +497,7 @@ const query = { applyProgress($('#query-status'), 100); let keyword = json.label; - $('#query-results').append('
  • ' + keyword + ' (' + json.rows + ' items)
  • '); + $('#query-results').append('
  • ' + keyword + ' (' + json.rows + ' items)
  • '); query.reset_form(); popup.alert('Query for \'' + keyword + '\' complete!', 'Success'); } else { @@ -630,17 +630,17 @@ const query = { for (let i = 0; i < json.length; i += 1) { search_queue_length += json[i]['count']; - search_queue_notice += " " + json[i]['jobtype'].replace('-search', '') + ' (' + json[i]['count'] + ')' + '' + search_queue_notice += " " + json[i]['processor_name'] + ' (' + json[i]['count'] + ')' + '' } if (search_queue_length == 0) { search_queue_box.html('Search queue is empty.'); search_queue_list.html(''); } else if (search_queue_length == 1) { - search_queue_box.html('Currently processing 1 search query: '); + search_queue_box.html('Currently collecting 1 dataset: '); search_queue_list.html(search_queue_notice); } else { - search_queue_box.html('Currently processing ' + search_queue_length + ' search queries: '); + search_queue_box.html('Currently collecting ' + search_queue_length + ' datasets: '); search_queue_list.html(search_queue_notice); } }, @@ -1993,4 +1993,4 @@ function find_parent(element, selector) { } return null; -} \ No newline at end of file +} diff --git a/webtool/templates/account/login.html b/webtool/templates/account/login.html index d95d6d9ef..de11e90b5 100644 --- a/webtool/templates/account/login.html +++ b/webtool/templates/account/login.html @@ -8,7 +8,9 @@

    + +
    +

    HTTP request headers

    +
    +{{ headers }}
    +            
    +
    {% endblock %} diff --git a/webtool/templates/create-dataset.html b/webtool/templates/create-dataset.html index 91194aa45..751fcdd5b 100644 --- a/webtool/templates/create-dataset.html +++ b/webtool/templates/create-dataset.html @@ -36,6 +36,7 @@

    Create new dataset

    + {% if __user_config("ui.offer_hashing") %}

    4CAT can remove information it identifies as relating to an item's author, or replace it with a hashed value. Other personal information may persist; it is your responsibility to further anonymise data where @@ -50,16 +51,17 @@

    Create new dataset

    + {% endif %} + {% if __user_config("ui.offer_private") %}
    - -
    + {% endif %} {% if __user_config("ui.option_email") in ["both", "datasources_only"] and __user_config("mail.server") %}
    diff --git a/webtool/templates/data-overview.html b/webtool/templates/data-overview.html index 7b371178f..f31f2178d 100644 --- a/webtool/templates/data-overview.html +++ b/webtool/templates/data-overview.html @@ -45,7 +45,7 @@

    Metadata

  • The data for this data source are gathered and stored by this 4CAT instance.
  • {% elif label == "static" %}
  • The data for this datasource are not updated anymore and show a static snapshot.
  • - {% elif label == "extension" %} + {% elif label == "zeeschuimer" %}
  • The data for this datasource are collected with Zeeschuimer.
  • {% elif label == "external" %}
  • The data for this datasource is collected externally (API or custom upload).
  • diff --git a/webtool/templates/frontpage.html b/webtool/templates/frontpage.html index 178f112c0..78e0eaf1b 100644 --- a/webtool/templates/frontpage.html +++ b/webtool/templates/frontpage.html @@ -12,19 +12,20 @@

    What is {{ __user_config("4cat.name") }}?

    4CAT is developed by OILab and the Digital Methods Initiative at the University of Amsterdam. For more information, take a look at the 4CAT website.

    -

    News and updates

    -
      - {% if news %} - {% for item in news %} -
    1. - - {{ item.text|markdown|safe }} -
    2. - {% endfor %} - {% else %} -
    3. You can add news for your 4CAT instance in news.json in the 4CAT root folder.
    4. - {% endif %} -
    + {% if __user_config("4cat.about_this_server") %} +

    About this server

    +

    {{ __user_config("4cat.about_this_server") }}

    + {% endif %} +

    4CAT updates

    +

    + + + +