diff --git a/common/lib/dataset.py b/common/lib/dataset.py index b092d2a4e..b494acbd3 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -15,7 +15,7 @@ from common.config_manager import config from common.lib.job import Job, JobNotFoundException from common.lib.module_loader import ModuleCollector -from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int +from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int, get_software_version from common.lib.item_mapping import MappedItem, MissingMappedField, DatasetItem from common.lib.fourcat_module import FourcatModule from common.lib.exceptions import (ProcessorInterruptedException, DataSetException, DataSetNotFoundException, @@ -1586,6 +1586,20 @@ def get_media_type(self): # Default to text return self.parameters.get("media_type", "text") + def get_metadata(self): + """ + Get dataset metadata + + This consists of all the data stored in the database for this dataset, plus the current 4CAT version (appended + as 'current_4CAT_version'). This is useful for exporting datasets, as it can be used by another 4CAT instance to + update its database (and ensure compatibility with the exporting version of 4CAT). + """ + metadata = self.db.fetchone("SELECT * FROM datasets WHERE key = %s", (self.key,)) + + # get 4CAT version (presumably to ensure export is compatible with import) + metadata["current_4CAT_version"] = get_software_version() + return metadata + def get_result_url(self): """ Gets the 4CAT frontend URL of a dataset file. diff --git a/datasources/fourcat_import/import_4cat.py b/datasources/fourcat_import/import_4cat.py index cd231b445..dc5d079fc 100644 --- a/datasources/fourcat_import/import_4cat.py +++ b/datasources/fourcat_import/import_4cat.py @@ -4,6 +4,7 @@ import requests import json import time +import zipfile from backend.lib.processor import BasicProcessor from common.lib.exceptions import (QueryParametersException, FourcatException, ProcessorInterruptedException, @@ -19,8 +20,8 @@ class FourcatImportException(FourcatException): class SearchImportFromFourcat(BasicProcessor): type = "import_4cat-search" # job ID category = "Search" # category - title = "Import from 4CAT" # title displayed in UI - description = "Import a dataset from another 4CAT server" # description displayed in UI + title = "Import 4CAT dataset and analyses" # title displayed in UI + description = "Import a dataset from another 4CAT server or from a zip file (exported from a 4CAT server)" # description displayed in UI is_local = False # Whether this datasource is locally scraped is_static = False # Whether this datasource is still updated @@ -33,29 +34,328 @@ class SearchImportFromFourcat(BasicProcessor): "\n\nTo import a dataset across servers, both servers need to be running the same version of 4CAT. " "You can find the current version in the footer at the bottom of the interface." }, + "method": { + "type": UserInput.OPTION_CHOICE, + "help": "Import Type", + "options": { + "zip": "Zip File", + "url": "4CAT URL", + }, + "default": "url" + }, "url": { "type": UserInput.OPTION_TEXT, "help": "Dataset URL", - "tooltip": "URL to the dataset's page, for example https://4cat.example/results/28da332f8918e6dc5aacd1c3b0170f01b80bd95f8ff9964ac646cecd33bfee49/." 
+ "tooltip": "URL to the dataset's page, for example https://4cat.example/results/28da332f8918e6dc5aacd1c3b0170f01b80bd95f8ff9964ac646cecd33bfee49/.", + "requires": "method^=url" }, "intro2": { "type": UserInput.OPTION_INFO, "help": "You can create an API key via the 'API Access' item in 4CAT's navigation menu. Note that you need " "an API key from **the server you are importing from**, not the one you are looking at right now. " - "Additionally, you need to have owner access to the dataset you want to import." + "Additionally, you need to have owner access to the dataset you want to import.", + "requires": "method^=url" }, "api-key": { "type": UserInput.OPTION_TEXT, "help": "4CAT API Key", "sensitive": True, "cache": True, - } + "requires": "method^=url" + }, + "data_upload": { + "type": UserInput.OPTION_FILE, + "help": "File", + "tooltip": "Upload a ZIP file containing a dataset exported from a 4CAT server.", + "requires": "method^=zip" + }, + } created_datasets = None base = None + remapped_keys = None + dataset_owner = None def process(self): + """ + Import 4CAT dataset either from another 4CAT server or from the uploaded zip file + """ + self.created_datasets = set() # keys of created datasets - may not be successful! + self.remapped_keys = {} # changed dataset keys + self.dataset_owner = self.dataset.get_owners()[0] # at this point it has 1 owner + try: + if self.parameters.get("method") == "zip": + self.process_zip() + else: + self.process_urls() + except Exception as e: + # Catch all exceptions and finish the job with an error + # Resuming is impossible because this dataset was overwritten with the importing dataset + # halt_and_catch_fire() will clean up and delete the datasets that were created + self.interrupted = True + try: + self.halt_and_catch_fire() + except ProcessorInterruptedException: + pass + # Reraise the original exception for logging + raise e + + def after_create(query, dataset, request): + """ + Hook to execute after the dataset for this source has been created + + In this case, put the file in a temporary location so it can be + processed properly by the related Job later. 
+ + :param dict query: Sanitised query parameters + :param DataSet dataset: Dataset created for this query + :param request: Flask request submitted for its creation + """ + if query.get("method") == "zip": + file = request.files["option-data_upload"] + file.seek(0) + with dataset.get_results_path().with_suffix(".importing").open("wb") as outfile: + while True: + chunk = file.read(1024) + if len(chunk) == 0: + break + outfile.write(chunk) + else: + # nothing to do for URLs + pass + + + def process_zip(self): + """ + Import 4CAT dataset from a ZIP file + """ + self.dataset.update_status(f"Importing datasets and analyses from ZIP file.") + temp_file = self.dataset.get_results_path().with_suffix(".importing") + + imported = [] + processed_files = 1 # take into account the export.log file + failed_imports = [] + with zipfile.ZipFile(temp_file, "r") as zip_ref: + zip_contents = zip_ref.namelist() + + # Get all metadata files and determine primary dataset + metadata_files = [file for file in zip_contents if file.endswith("_metadata.json")] + if not metadata_files: + self.dataset.finish_with_error("No metadata files found in ZIP file; is this a 4CAT export?") + return + + # Get the primary dataset + primary_dataset_keys = set() + datasets = [] + parent_child_mapping = {} + for file in metadata_files: + with zip_ref.open(file) as f: + metadata = json.load(f) + if not metadata.get("key_parent"): + primary_dataset_keys.add(metadata.get("key")) + datasets.append(metadata) + else: + # Store the mapping of parent to child datasets + parent_key = metadata.get("key_parent") + if parent_key not in parent_child_mapping: + parent_child_mapping[parent_key] = [] + parent_child_mapping[parent_key].append(metadata) + + # Primary dataset will overwrite this dataset; we could address this to support multiple primary datasets + if len(primary_dataset_keys) != 1: + self.dataset.finish_with_error("ZIP file contains multiple primary datasets; only one is allowed.") + return + + # Import datasets + while datasets: + self.halt_and_catch_fire() + + # Create the datasets + metadata = datasets.pop(0) + dataset_key = metadata.get("key") + processed_metadata = self.process_metadata(metadata) + new_dataset = self.create_dataset(processed_metadata, dataset_key, dataset_key in primary_dataset_keys) + processed_files += 1 + + # TODO: I am now noticing that we do not update the results_file; it is even more unlikely to collide as it is both a random key and label combined... but... + # Copy the log file + self.halt_and_catch_fire() + log_filename = new_dataset.get_log_path().name + if log_filename in zip_contents: + self.dataset.update_status(f"Transferring log file for dataset {new_dataset.key}") + with zip_ref.open(log_filename) as f: + with new_dataset.get_log_path().open("wb") as outfile: + outfile.write(f.read()) + processed_files += 1 + else: + self.dataset.log(f"Log file not found for dataset {new_dataset.key} (original key {dataset_key}).") + + # Copy the results + self.halt_and_catch_fire() + results_filename = new_dataset.get_results_path().name + if results_filename in zip_contents: + self.dataset.update_status(f"Transferring data file for dataset {new_dataset.key}") + with zip_ref.open(results_filename) as f: + with new_dataset.get_results_path().open("wb") as outfile: + outfile.write(f.read()) + processed_files += 1 + + if not imported: + # first dataset - use num rows as 'overall' + num_rows = metadata["num_rows"] + else: + # TODO: should I just delete the new_dataset here? 
+                    self.dataset.log(f"Results file not found for dataset {new_dataset.key} (original key {dataset_key}).")
+                    new_dataset.finish_with_error(f"Results file not found for dataset {new_dataset.key} (original key {dataset_key}).")
+                    failed_imports.append(dataset_key)
+                    continue
+
+                # finally, the kids
+                self.halt_and_catch_fire()
+                if dataset_key in parent_child_mapping:
+                    datasets.extend(parent_child_mapping[dataset_key])
+                    self.dataset.log(f"Adding ({len(parent_child_mapping[dataset_key])}) child datasets to import queue")
+
+                # done - remember that we've imported this one
+                imported.append(new_dataset)
+                new_dataset.update_status(metadata["status"])
+
+                if new_dataset.key != self.dataset.key:
+                    # only finish if this is not the 'main' dataset, or the user
+                    # will think the whole import is done
+                    new_dataset.finish(metadata["num_rows"])
+
+            # Check that all files in the ZIP were accounted for; only a count is
+            # tracked, so report how many files were not processed
+            missed_files = len(zip_contents) - processed_files
+
+        # todo: this part needs updating if/when we support importing multiple datasets!
+        if failed_imports:
+            self.dataset.update_status(f"Dataset import finished, but not all data was imported properly. "
+                                       f"{len(failed_imports)} dataset(s) were not successfully imported. Check the "
+                                       f"dataset log file for details.", is_final=True)
+        elif missed_files:
+            self.dataset.log(f"ZIP file contained {missed_files} file(s) that were not processed.")
+            self.dataset.update_status(f"Dataset import finished, but not all files were processed. "
+                                       f"{missed_files} file(s) were not successfully imported. Check the "
+                                       f"dataset log file for details.", is_final=True)
+        else:
+            self.dataset.update_status(f"{len(imported)} dataset(s) successfully imported.",
+                                       is_final=True)
+
+        if not self.dataset.is_finished():
+            # now all related datasets are imported, we can finish the 'main'
+            # dataset, and the user will be alerted that the full import is
+            # complete
+            self.dataset.finish(num_rows)
+
+
+    @staticmethod
+    def process_metadata(metadata):
+        """
+        Process metadata for import
+        """
+        # get rid of some keys that are server-specific and don't need to
+        # be stored (or don't correspond to database columns)
+        metadata.pop("current_4CAT_version")
+        metadata.pop("id")
+        metadata.pop("job")
+        metadata.pop("is_private")
+        metadata.pop("is_finished")  # we'll finish it ourselves, thank you!!!
+
+        # extra params are stored as JSON...
+        metadata["parameters"] = json.loads(metadata["parameters"])
+        if "copied_from" in metadata["parameters"]:
+            metadata["parameters"].pop("copied_from")
+        metadata["parameters"] = json.dumps(metadata["parameters"])
+
+        return metadata
+
+    def create_dataset(self, metadata, original_key, primary=False):
+        """
+        Create a new dataset
+        """
+        if primary:
+            self.dataset.update_status(f"Importing primary dataset {original_key}.")
+            # if this is the first dataset we're importing, make it the
+            # processor's "own" dataset. the key has already been set to
+            # the imported dataset's key via ensure_key() (or a new unique
+            # key if it already existed on this server)
+            # by making it the "own" dataset, the user initiating the
+            # import will see the imported dataset as the "result" of their
+            # import query in the interface, similar to the workflow for
+            # other data sources
+            new_dataset = self.dataset
+            metadata.pop("key")  # key already OK (see above)
+            self.db.update("datasets", where={"key": new_dataset.key}, data=metadata)
+
+        else:
+            self.dataset.update_status(f"Importing child dataset {original_key}.")
+            # supernumerary datasets - handle on their own
+            # these include any children of imported datasets
+            try:
+                key_exists = DataSet(key=metadata["key"], db=self.db, modules=self.modules)
+
+                # if we *haven't* thrown a DatasetException now, then the
+                # key is already in use, so create a "dummy" dataset and
+                # overwrite it with the metadata we have (except for the
+                # key). this ensures that a new unique key will be
+                # generated.
+                new_dataset = DataSet(parameters={}, type=self.type, db=self.db, modules=self.modules)
+                metadata.pop("key")
+                self.db.update("datasets", where={"key": new_dataset.key}, data=metadata)
+
+            except DataSetException:
+                # this is *good* since it means the key doesn't exist, so
+                # we can re-use the key of the imported dataset
+                self.db.insert("datasets", data=metadata)
+                new_dataset = DataSet(key=metadata["key"], db=self.db, modules=self.modules)
+
+        # make sure the dataset path uses the new key and local dataset
+        # path settings. this also makes sure the log file is created in
+        # the right place (since it is derived from the results file path)
+        extension = metadata["result_file"].split(".")[-1]
+        new_dataset.reserve_result_file(parameters=new_dataset.parameters, extension=extension)
+
+        new_dataset.update_status("Imported dataset created")
+        if new_dataset.key != original_key:
+            # could not use original key because it was already in use
+            # so update any references to use the new key
+            self.remapped_keys[original_key] = new_dataset.key
+            new_dataset.update_status(f"Cannot import with same key - already in use on this server. Using key "
+                                      f"{new_dataset.key} instead of key {original_key}!")
+
+        # refresh object, make sure it's in sync with the database
+        self.created_datasets.add(new_dataset.key)
+        new_dataset = DataSet(key=new_dataset.key, db=self.db, modules=self.modules)
+        if new_dataset.key == self.dataset.key:
+            # this ensures that the first imported dataset becomes the
+            # processor's "own" dataset, and that the import logs go to
+            # that dataset's log file. For later imports, this evaluates to
+            # False.
+ self.dataset = new_dataset + + # if the key of the parent dataset was changed, change the + # reference to it that the child dataset has + if new_dataset.key_parent and new_dataset.key_parent in self.remapped_keys: + new_dataset.key_parent = self.remapped_keys[new_dataset.key_parent] + + # update some attributes that should come from the new server, not + # the old + new_dataset.creator = self.dataset_owner + new_dataset.original_timestamp = new_dataset.timestamp + new_dataset.imported = True + new_dataset.timestamp = int(time.time()) + new_dataset.db.commit() + + return new_dataset + + + def process_urls(self): """ Import 4CAT dataset from another 4CAT server @@ -67,12 +367,9 @@ def process(self): keys = SearchImportFromFourcat.get_keys_from_urls(urls) api_key = self.parameters.get("api-key") - self.created_datasets = set() # keys of created datasets - may not be successful! imported = [] # successfully imported datasets failed_imports = [] # keys that failed to import - remapped_keys = {} # changed dataset keys num_rows = 0 # will be used later - dataset_owner = self.dataset.get_owners()[0] # at this point it has 1 owner # we can add support for multiple datasets later by removing # this part! @@ -101,90 +398,10 @@ def process(self): failed_imports.append(dataset_key) continue - # get rid of some keys that are server-specific and don't need to - # be stored (or don't correspond to database columns) - metadata.pop("current_4CAT_version") - metadata.pop("id") - metadata.pop("job") - metadata.pop("is_private") - metadata.pop("is_finished") # we'll finish it ourselves, thank you!!! - - # extra params are stored as JSON... - metadata["parameters"] = json.loads(metadata["parameters"]) - if "copied_from" in metadata["parameters"]: - metadata["parameters"].pop("copied_from") - metadata["parameters"] = json.dumps(metadata["parameters"]) - - if not imported: - # if this is the first dataset we're importing, make it the - # processor's "own" dataset. the key has already been set to - # the imported dataset's key via ensure_key() (or a new unqiue - # key if it already existed on this server) - # by making it the "own" dataset, the user initiating the - # import will see the imported dataset as the "result" of their - # import query in the interface, similar to the workflow for - # other data sources - new_dataset = self.dataset - metadata.pop("key") # key already OK (see above) - self.db.update("datasets", where={"key": new_dataset.key}, data=metadata) + metadata = self.process_metadata(metadata) - else: - # supernumerary datasets - handle on their own - # these include any children of imported datasets - try: - key_exists = DataSet(key=metadata["key"], db=self.db) - - # if we *haven't* thrown a DatasetException now, then the - # key is already in use, so create a "dummy" dataset and - # overwrite it with the metadata we have (except for the - # key). this ensures that a new unique key will be - # generated. - new_dataset = DataSet(parameters={}, type=self.type, db=self.db) - metadata.pop("key") - self.db.update("datasets", where={"key": new_dataset.key}, data=metadata) - - except DataSetException: - # this is *good* since it means the key doesn't exist, so - # we can re-use the key of the imported dataset - self.db.insert("datasets", data=metadata) - new_dataset = DataSet(key=metadata["key"], db=self.db) - - # make sure the dataset path uses the new key and local dataset - # path settings. 
this also makes sure the log file is created in - # the right place (since it is derived from the results file path) - extension = metadata["result_file"].split(".")[-1] - new_dataset.reserve_result_file(parameters=new_dataset.parameters, extension=extension) - - new_dataset.update_status("Imported dataset created") - if new_dataset.key != dataset_key: - # could not use original key because it was already in use - # so update any references to use the new key - remapped_keys[dataset_key] = new_dataset.key - new_dataset.update_status(f"Cannot import with same key - already in use on this server. Using key " - f"{new_dataset.key} instead of key {dataset_key}!") - - # refresh object, make sure it's in sync with the database - self.created_datasets.add(new_dataset.key) - new_dataset = DataSet(key=new_dataset.key, db=self.db) - if new_dataset.key == self.dataset.key: - # this ensures that the first imported dataset becomes the - # processor's "own" dataset, and that the import logs go to - # that dataset's log file. For later imports, this evaluates to - # False. - self.dataset = new_dataset - - # if the key of the parent dataset was changed, change the - # reference to it that the child dataset has - if new_dataset.key_parent and new_dataset.key_parent in remapped_keys: - new_dataset.key_parent = remapped_keys[new_dataset.key_parent] - - # update some attributes that should come from the new server, not - # the old - new_dataset.creator = dataset_owner - new_dataset.original_timestamp = new_dataset.timestamp - new_dataset.imported = True - new_dataset.timestamp = int(time.time()) - new_dataset.db.commit() + # create the new dataset + new_dataset = self.create_dataset(metadata, dataset_key, primary=True if not imported else False) # then, the log self.halt_and_catch_fire() @@ -283,9 +500,9 @@ def halt_and_catch_fire(self): # overwritten by this point deletables = [k for k in self.created_datasets if k != self.dataset.key] for deletable in deletables: - DataSet(key=deletable, db=self.db).delete() + DataSet(key=deletable, db=self.db, modules=self.modules).delete() - self.dataset.finish_with_error(f"Interrupted while importing datasets from {self.base}. Cannot resume - you " + self.dataset.finish_with_error(f"Interrupted while importing datasets{' from '+self.base if self.base else ''}. 
Cannot resume - you " f"will need to initiate the import again.") raise ProcessorInterruptedException() @@ -353,47 +570,72 @@ def validate_query(query, request, user): :param User user: User object of user who has submitted the query :return dict: Safe query parameters """ - urls = query.get("url") - if not urls: - return QueryParametersException("Provide at least one dataset URL.") - - urls = urls.split(",") - bases = set([url.split("/results/")[0].lower() for url in urls]) - keys = SearchImportFromFourcat.get_keys_from_urls(urls) + if query.get("method") == "zip": + filename = "" + if "option-data_upload-entries" in request.form: + # First pass sends list of files in the zip + pass + elif "option-data_upload" in request.files: + # Second pass sends the actual file + file = request.files["option-data_upload"] + if not file: + raise QueryParametersException("No file uploaded.") + + if not file.filename.endswith(".zip"): + raise QueryParametersException("Uploaded file must be a ZIP file.") + + filename = file.filename + else: + raise QueryParametersException("No file was offered for upload.") + + return { + "method": "zip", + "filename": filename + } + elif query.get("method") == "url": + urls = query.get("url") + if not urls: + raise QueryParametersException("Provide at least one dataset URL.") + + urls = urls.split(",") + bases = set([url.split("/results/")[0].lower() for url in urls]) + keys = SearchImportFromFourcat.get_keys_from_urls(urls) + + if len(keys) != 1: + # todo: change this to < 1 if we allow multiple datasets + raise QueryParametersException("You need to provide a single URL to a 4CAT dataset to import.") + + if len(bases) != 1: + raise QueryParametersException("All URLs need to point to the same 4CAT server. You can only import from " + "one 4CAT server at a time.") + + base = urls[0].split("/results/")[0] + try: + # test if API key is valid and server is reachable + test = SearchImportFromFourcat.fetch_from_4cat(base, keys[0], query.get("api-key"), "metadata") + except FourcatImportException as e: + raise QueryParametersException(str(e)) - if len(keys) != 1: - # todo: change this to < 1 if we allow multiple datasets - return QueryParametersException("You need to provide a single URL to a 4CAT dataset to import.") + try: + # test if we get a response we can parse + metadata = test.json() + except ValueError: + raise QueryParametersException(f"Unexpected response when trying to fetch metadata for dataset {keys[0]}.") - if len(bases) != 1: - return QueryParametersException("All URLs need to point to the same 4CAT server. You can only import from " - "one 4CAT server at a time.") + version = get_software_version() - base = urls[0].split("/results/")[0] - try: - # test if API key is valid and server is reachable - test = SearchImportFromFourcat.fetch_from_4cat(base, keys[0], query.get("api-key"), "metadata") - except FourcatImportException as e: - raise QueryParametersException(str(e)) + if metadata.get("current_4CAT_version") != version: + raise QueryParametersException(f"This 4CAT server is running a different version of 4CAT ({version}) than " + f"the one you are trying to import from ({metadata.get('current_4CAT_version')}). 
Make " + "sure both are running the same version of 4CAT and try again.") - try: - # test if we get a response we can parse - metadata = test.json() - except ValueError: - raise QueryParametersException(f"Unexpected response when trying to fetch metadata for dataset {keys[0]}.") - - version = get_software_version() - - if metadata.get("current_4CAT_version") != version: - raise QueryParametersException(f"This 4CAT server is running a different version of 4CAT ({version}) than " - f"the one you are trying to import from ({metadata.get('current_4CAT_version')}). Make " - "sure both are running the same version of 4CAT and try again.") - - # OK, we can import at least one dataset - return { - "url": ",".join(urls), - "api-key": query.get("api-key") - } + # OK, we can import at least one dataset + return { + "url": ",".join(urls), + "api-key": query.get("api-key") + } + else: + raise QueryParametersException("Import method not yet implemented.") @staticmethod def get_keys_from_urls(urls): diff --git a/docker-compose_build.yml b/docker-compose_build.yml index 7466e8ba8..b81a9fb94 100644 --- a/docker-compose_build.yml +++ b/docker-compose_build.yml @@ -32,6 +32,9 @@ services: - ./data/datasets/:/usr/src/app/data/ - ./data/config/:/usr/src/app/config/ - ./data/logs/:/usr/src/app/logs/ +# - 4cat_data:/usr/src/app/data/ +# - 4cat_config:/usr/src/app/config/ +# - 4cat_logs:/usr/src/app/logs/ entrypoint: docker/docker-entrypoint.sh frontend: @@ -49,6 +52,9 @@ services: - ./data/datasets/:/usr/src/app/data/ - ./data/config/:/usr/src/app/config/ - ./data/logs/:/usr/src/app/logs/ +# - 4cat_data:/usr/src/app/data/ +# - 4cat_config:/usr/src/app/config/ +# - 4cat_logs:/usr/src/app/logs/ command: ["docker/wait-for-backend.sh"] volumes: diff --git a/processors/conversion/export_datasets.py b/processors/conversion/export_datasets.py new file mode 100644 index 000000000..bd7b81289 --- /dev/null +++ b/processors/conversion/export_datasets.py @@ -0,0 +1,106 @@ +""" +Export a dataset and all its children to a ZIP file +""" +import shutil +import json +import datetime + +from backend.lib.processor import BasicProcessor +from common.lib.dataset import DataSet +from common.lib.exceptions import DataSetException + +__author__ = "Dale Wahl" +__credits__ = ["Dale Wahl"] +__maintainer__ = "Dale Wahl" +__email__ = "4cat@oilab.eu" + + + +class ExportDatasets(BasicProcessor): + """ + Export a dataset and all its children to a ZIP file + """ + type = "export-datasets" # job type ID + category = "Conversion" # category + title = "Export Dataset and All Analyses" # title displayed in UI + description = "Creates a ZIP file containing the dataset and all analyses to be archived and uploaded to a 4CAT instance in the future. Automatically expires after 1 day, after which you must run again." 
# description displayed in UI
+    extension = "zip"  # extension of result file, used internally and in UI
+
+    @classmethod
+    def is_compatible_with(cls, module=None, user=None):
+        """
+        Determine if processor is compatible with dataset
+
+        :param module: Module to determine compatibility with
+        """
+        return module.is_top_dataset() and user.can_access_dataset(dataset=module, role="owner")
+
+    def process(self):
+        """
+        Export the dataset and all its finished analyses to a ZIP archive
+        """
+        self.dataset.update_status("Collecting dataset and all analyses")
+
+        results_path = self.dataset.get_staging_area()
+
+        exported_datasets = []
+        failed_exports = []  # keys that failed to export
+        keys = [self.dataset.top_parent().key]  # get the key of the top parent
+        while keys:
+            dataset_key = keys.pop(0)
+            self.dataset.log(f"Exporting dataset {dataset_key}.")
+
+            try:
+                dataset = DataSet(key=dataset_key, db=self.db)
+                # TODO: these two should fail for the primary dataset, but should they fail for the children too?
+            except DataSetException:
+                self.dataset.finish_with_error("Dataset not found.")
+                return
+            if not dataset.is_finished():
+                self.dataset.finish_with_error("You cannot export unfinished datasets.")
+                return
+
+            # get metadata
+            metadata = dataset.get_metadata()
+            if metadata["num_rows"] == 0:
+                self.dataset.update_status(f"Skipping empty dataset {dataset_key}")
+                failed_exports.append(dataset_key)
+                continue
+
+            # get data
+            data_file = dataset.get_results_path()
+            if not data_file.exists():
+                self.dataset.update_status(f"Dataset {dataset_key} has no data file; skipping.")
+                failed_exports.append(dataset_key)
+                continue
+
+            # get log
+            log_file = dataset.get_results_path().with_suffix(".log")
+
+            # All good, add to ZIP
+            with results_path.joinpath(f"{dataset_key}_metadata.json").open("w", encoding="utf-8") as outfile:
+                outfile.write(json.dumps(metadata))
+            shutil.copy(data_file, results_path.joinpath(data_file.name))
+            if log_file.exists():
+                shutil.copy(log_file, results_path.joinpath(log_file.name))
+
+            # add children to queue
+            # Not using get_all_children() because we want to skip unfinished datasets and only need the keys
+            children = [d["key"] for d in self.db.fetchall("SELECT key FROM datasets WHERE key_parent = %s AND is_finished = TRUE", (dataset_key,))]
+            keys.extend(children)
+
+            self.dataset.update_status(f"Exported dataset {dataset_key}.")
+            exported_datasets.append(dataset_key)
+
+        # Add export log to ZIP
+        self.dataset.log(f"Exported datasets: {exported_datasets}")
+        self.dataset.log(f"Failed to export datasets: {failed_exports}")
+        shutil.copy(self.dataset.get_log_path(), results_path.joinpath("export.log"))
+
+        # set expiration date
+        # these datasets can be very large and are just copies of the existing datasets, so we don't need to keep them around for long
+        # TODO: convince people to stop using hyphens in python variables and file names...
+        self.dataset.__setattr__("expires-after", (datetime.datetime.now() + datetime.timedelta(days=1)).timestamp())
+
+        # done!
+ self.write_archive_and_finish(results_path, len(exported_datasets)) \ No newline at end of file diff --git a/webtool/views/api_tool.py b/webtool/views/api_tool.py index 5b47c030d..f7f66ad6e 100644 --- a/webtool/views/api_tool.py +++ b/webtool/views/api_tool.py @@ -1246,11 +1246,7 @@ def export_packed_dataset(key=None, component=None): return error(403, error="You cannot export unfinished datasets.") if component == "metadata": - metadata = db.fetchone("SELECT * FROM datasets WHERE key = %s", (dataset.key,)) - - # get 4CAT version (presumably to ensure export is compatible with import) - metadata["current_4CAT_version"] = get_software_version() - return jsonify(metadata) + return jsonify(dataset.get_metadata()) elif component == "children": children = [d["key"] for d in db.fetchall("SELECT key FROM datasets WHERE key_parent = %s AND is_finished = TRUE", (dataset.key,))]
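As an illustration of the export format the changes above produce (not part of the changeset itself): an export ZIP holds one <key>_metadata.json per dataset, the matching data and .log files, and an export.log. Below is a minimal sketch for inspecting such an archive before importing it, assuming a locally available export file; the path is a placeholder.

import json
import zipfile

export_path = "dataset_export.zip"  # placeholder: a ZIP produced by the export-datasets processor

with zipfile.ZipFile(export_path, "r") as zip_ref:
    # every exported dataset is described by a "<key>_metadata.json" file
    metadata_files = [name for name in zip_ref.namelist() if name.endswith("_metadata.json")]
    for name in metadata_files:
        with zip_ref.open(name) as handle:
            metadata = json.load(handle)
        # datasets without a key_parent are primary; all others are analyses of another dataset
        role = "primary" if not metadata.get("key_parent") else f"child of {metadata['key_parent']}"
        print(f"{metadata.get('key')}: {role}, {metadata.get('num_rows')} rows, "
              f"exported from 4CAT {metadata.get('current_4CAT_version')}")

The key_parent check mirrors how process_zip() identifies the primary dataset, and current_4CAT_version is the field validate_query() compares against get_software_version() for URL-based imports.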