diff --git a/.isort.cfg b/.isort.cfg index 31ed4f4a8..3404c5f80 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -4,4 +4,4 @@ include_trailing_comma = true force_grid_wrap = 0 use_parentheses = true line_length = 100 -known_third_party =alembic,dateutil,flask,flask_cachebuster,flask_cors,freezegun,geoalchemy2,geopy,halo,iterfzf,loguru,pg8000,pint,prompt_toolkit,psycopg2,pyfiglet,pygments,pytest,setuptools,shapely,sqlalchemy,sqlalchemy_utils,tabulate,testing,tqdm,twisted,waitress \ No newline at end of file +known_third_party =alembic,dateutil,flask,flask_cachebuster,flask_cors,freezegun,geoalchemy2,geopy,halo,iterfzf,loguru,pexpect,pg8000,pint,prompt_toolkit,psycopg2,pyfiglet,pygments,pytest,setuptools,shapely,sqlalchemy,sqlalchemy_utils,tabulate,testing,tqdm,twisted,waitress \ No newline at end of file diff --git a/HISTORY.rst b/HISTORY.rst index 2f7da2098..b685aa162 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,6 +2,12 @@ History ======= +0.0.29 (2021-09-29) +------------------- + +* Record extractions to database for future data audit +* Performance improvement in generating table summaries +* Performance improvement in opening some tables in Maintenance GUI 0.0.28 (2021-09-24) ------------------- @@ -9,7 +15,6 @@ History * Minor fixes for timeline (-100%, include_in_timeline, default_interval) * Fix error for viewing Platform Entry in GUI * Improve speed of generating table summaries (affects import performance) -* 0.0.27 (2021-09-16) ------------------- diff --git a/alembic.ini b/alembic.ini index fbabcdcbb..b48e4d364 100644 --- a/alembic.ini +++ b/alembic.ini @@ -91,4 +91,4 @@ format = %(levelname)-5.5s [%(name)s] %(message)s datefmt = %H:%M:%S [alembic:exclude] -tables = alembic_version,ElementaryGeometries,spatial_ref_sys,spatial_ref_sys_aux,spatialite_history,sql_statements_log,sqlite_sequence,SpatialIndex \ No newline at end of file +tables = alembic_version,ElementaryGeometries,spatial_ref_sys,spatial_ref_sys_aux,spatialite_history,sql_statements_log,sqlite_sequence,SpatialIndex,KNN,data_licenses \ No newline at end of file diff --git a/docs/importer_dev_guide.rst b/docs/importer_dev_guide.rst index 1620630a1..d45c77a99 100644 --- a/docs/importer_dev_guide.rst +++ b/docs/importer_dev_guide.rst @@ -159,12 +159,14 @@ files showing which parts of the file have been used to extract each individual part field in the created measurements, and tracks the extraction in the database to help understand data provenance. -But this token highlighting does come with a performance cost. For high volume -file types that are tightly structured, with little room for misinterpretation, -the overhead may not be justified. In these circumstances, a call to -:code:`self.disable_recording()` in the :code:`__init__` method will turn off -the extraction highlighting for this importer, and significantly speed up the processing -of large files. +But this token highlighting and database recording does come with +a performance cost. For high volume file types that are tightly +structured, with little room for misinterpretation, the overhead +may not be justified. You can configure the level of extraction that will +take place by calling :code:`self.set_highlighting_level()` in the :code:`__init__` method. +Three different values can be passed to this function: :code:`HighlightLevel.NONE` will turn off all extraction +and highlighting, :code:`HighlightLevel.HTML` will record extractions to HTML but not to the database, and +:code:`HighlightLevel.DATABASE` will record to both an HTML file and the database.
The default is :code:`HighlightLevel.HTML`. Similarly, it may be justified to capture extraction data in the early stages of developing/maintaining the parser for a new file format, with level of @@ -220,6 +222,13 @@ one extraction can be recorded from disparate data in the file. For example: combine_tokens(long_degrees_token, lat_degrees_token).record( self.name, "location", state.location, "decimal degrees") +Once token extractions have been recorded using the :code:`record` method, the recorded information +must be linked to the appropriate measurement object (State/Contact/Comment etc) and prepared for saving +to the database. This can be done using the :code:`Datafile.flush_extracted_tokens` method, which should be called once +all the data has been loaded for a specific measurement object. Usually this will be at the end of the +:meth:`~pepys_import.file.importer._load_this_line` method, or at the end of a loop inside the +:meth:`~pepys_import.file.importer._load_this_file` method - but for complex importers it may be elsewhere. + Creating measurement objects ############################ diff --git a/importers/aircraft_csv_format_importer.py b/importers/aircraft_csv_format_importer.py index 16e5a39af..370b07db1 100644 --- a/importers/aircraft_csv_format_importer.py +++ b/importers/aircraft_csv_format_importer.py @@ -149,6 +149,8 @@ def _load_this_line(self, data_store, line_number, line, datafile, change_id): state.course = course course_token.record(self.name, "course", course) + datafile.flush_extracted_tokens() + @staticmethod def parse_timestamp(date, time): format_str = "%d/%m/%Y " diff --git a/importers/e_trac_importer.py b/importers/e_trac_importer.py index ffb3bc9e1..91b7b944c 100644 --- a/importers/e_trac_importer.py +++ b/importers/e_trac_importer.py @@ -138,6 +138,8 @@ def _load_this_line(self, data_store, line_number, line, datafile, change_id): state.speed = speed speed_token.record(self.name, "speed", speed) + datafile.flush_extracted_tokens() + @staticmethod def name_for(token): # split into two diff --git a/importers/eag_importer.py b/importers/eag_importer.py index 9be104346..23c4fb306 100644 --- a/importers/eag_importer.py +++ b/importers/eag_importer.py @@ -175,6 +175,8 @@ def _load_this_file(self, data_store, path, file_object, datafile, change_id): state.heading = heading heading_token.record(self.name, "heading", heading) + datafile.flush_extracted_tokens() + def get_previous_sunday(self, date_of_recording_str): format_str = "%Y%m%d" date_of_recording = datetime.datetime.strptime(date_of_recording_str, format_str) diff --git a/importers/gpx_importer.py b/importers/gpx_importer.py index b4a90376d..34acab5c3 100644 --- a/importers/gpx_importer.py +++ b/importers/gpx_importer.py @@ -148,6 +148,8 @@ def _load_this_file(self, data_store, path, file_object, datafile, change_id): if elevation_valid: state.elevation = elevation + datafile.flush_extracted_tokens() + def get_child_and_text_if_exists(self, element, search_string): child = element.find(search_string) if child is not None: diff --git a/importers/nisida_importer.py b/importers/nisida_importer.py index e0c3c3fd7..cad62ab34 100644 --- a/importers/nisida_importer.py +++ b/importers/nisida_importer.py @@ -7,6 +7,7 @@ from pepys_import.core.formats import unit_registry from pepys_import.core.formats.location import Location from pepys_import.core.validators import constants +from pepys_import.file.highlighter.level import HighlightLevel from pepys_import.file.highlighter.support.combine import combine_tokens
from pepys_import.file.importer import CANCEL_IMPORT, Importer from pepys_import.utils.sqlalchemy_utils import get_lowest_privacy @@ -48,6 +49,8 @@ def __init__(self): self.year = None self.platform = None + self.set_highlighting_level(HighlightLevel.DATABASE) + def can_load_this_type(self, suffix): return suffix.upper() == ".TXT" @@ -129,6 +132,7 @@ def _load_this_line(self, data_store, line_number, line, datafile, change_id): else: self.last_entry_with_text.remarks = self.last_entry_with_text.remarks + text_to_add line.record(self.name, "comment text", text_to_add) + datafile.flush_extracted_tokens() elif len(line.text) > 7 and line.text[7] == "/" and line.text[0:5].isdigit(): # Check whether line starts with something like "311206Z/" (a timestamp and a slash) # Checking like this is faster than using regular expressions on each line @@ -183,6 +187,7 @@ def _load_this_line(self, data_store, line_number, line, datafile, change_id): } ) return + datafile.flush_extracted_tokens() else: # Not a line we recognise, so just skip to next one return diff --git a/importers/nmea_importer.py b/importers/nmea_importer.py index 2300df72a..be25735ad 100644 --- a/importers/nmea_importer.py +++ b/importers/nmea_importer.py @@ -192,6 +192,8 @@ def _load_this_line(self, data_store, line_number, line, datafile, change_id): self.location = None self.depth = None + datafile.flush_extracted_tokens() + @staticmethod def parse_timestamp(date, time): if len(date) == 6: diff --git a/importers/replay_comment_importer.py b/importers/replay_comment_importer.py index a347aa210..358c944ca 100644 --- a/importers/replay_comment_importer.py +++ b/importers/replay_comment_importer.py @@ -96,3 +96,5 @@ def _load_this_line(self, data_store, line_number, line, datafile, change_id): comment_type=comment_type, parser_name=self.short_name, ) + + datafile.flush_extracted_tokens() diff --git a/importers/replay_contact_importer.py b/importers/replay_contact_importer.py index ae30e20c9..5f4bb9811 100644 --- a/importers/replay_contact_importer.py +++ b/importers/replay_contact_importer.py @@ -260,3 +260,5 @@ def _load_this_line(self, data_store, line_number, line, datafile, change_id): if ambig_bearing_valid: ambig_bearing_token.record(self.name, "ambig bearing", ambig_bearing) contact.ambig_bearing = ambig_bearing + + datafile.flush_extracted_tokens() diff --git a/importers/replay_importer.py b/importers/replay_importer.py index 222eef6c4..41511bc61 100644 --- a/importers/replay_importer.py +++ b/importers/replay_importer.py @@ -15,8 +15,11 @@ def __init__(self): self.depth = 0.0 # Example: Uncomment this line to turn off recording of extractions - # for this importer - # self.disable_recording() + # for this importer: + # self.set_highlighting_level(HighlightLevel.NONE) + # or to turn on database recording: + # self.set_highlighting_level(HighlightLevel.DATABASE) + # (default is HTML recording) def can_load_this_type(self, suffix): return suffix.upper() == ".REP" or suffix.upper() == ".DSF" @@ -66,6 +69,8 @@ def _load_this_line(self, data_store, line_number, line, datafile, change_id): state.speed = rep_line.speed state.location = rep_line.get_location() + datafile.flush_extracted_tokens() + @staticmethod def degrees_for(degs, mins, secs, hemi: str): if hemi.upper() == "S" or hemi.upper() == "W": diff --git a/migrations/env.py b/migrations/env.py index 1f83909f2..d2967723d 100644 --- a/migrations/env.py +++ b/migrations/env.py @@ -194,8 +194,6 @@ def process_revision_directives(context_, revision, directives): 
context.run_migrations() else: # Turn off the enforcement of foreign key constraints before running the migration - connection.execute(text("PRAGMA foreign_keys=OFF;")) - connection.commit() context.configure( connection=connection, target_metadata=target_metadata, @@ -205,11 +203,9 @@ def process_revision_directives(context_, revision, directives): compare_type=special_compare_type, ) with context.begin_transaction(): + connection.execute(text("PRAGMA foreign_keys=OFF;")) context.run_migrations() - - # Turn on the enforcement of foreign key constraints after the migration is done - connection.execute(text("PRAGMA foreign_keys=ON;")) - connection.commit() + connection.execute(text("PRAGMA foreign_keys=ON;")) if context.is_offline_mode(): diff --git a/migrations/latest_revisions.json b/migrations/latest_revisions.json index 44e222608..de8fcd31b 100644 --- a/migrations/latest_revisions.json +++ b/migrations/latest_revisions.json @@ -1,4 +1,4 @@ { - "LATEST_POSTGRES_VERSION": "c16bbfed85dc", - "LATEST_SQLITE_VERSION": "a7f75ead6204" + "LATEST_SQLITE_VERSION": "feb548c7c6c0", + "LATEST_POSTGRES_VERSION": "4899e94653f1" } \ No newline at end of file diff --git a/migrations/postgres_versions/2021-09-24_4899e94653f1_alter_extractions_table.py b/migrations/postgres_versions/2021-09-24_4899e94653f1_alter_extractions_table.py new file mode 100644 index 000000000..7f7e40ad0 --- /dev/null +++ b/migrations/postgres_versions/2021-09-24_4899e94653f1_alter_extractions_table.py @@ -0,0 +1,89 @@ +"""Alter Extractions table + +Revision ID: 4899e94653f1 +Revises: bfb29dfcef94 +Create Date: 2021-09-24 12:40:23.320197+00:00 + +""" +import sqlalchemy as sa +from alembic import op +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "4899e94653f1" +down_revision = "c16bbfed85dc" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column( + "Extractions", + sa.Column("destination_table", sa.String(length=150), nullable=True), + schema="pepys", + ) + op.add_column( + "Extractions", + sa.Column("entry_id", postgresql.UUID(as_uuid=True), nullable=True), + schema="pepys", + ) + op.add_column( + "Extractions", + sa.Column("datafile_id", postgresql.UUID(as_uuid=True), nullable=False), + schema="pepys", + ) + op.add_column("Extractions", sa.Column("text", sa.Text(), nullable=False), schema="pepys") + op.add_column( + "Extractions", + sa.Column("text_location", sa.String(length=200), nullable=False), + schema="pepys", + ) + op.add_column( + "Extractions", sa.Column("importer", sa.String(length=150), nullable=False), schema="pepys" + ) + op.add_column( + "Extractions", sa.Column("interpreted_value", sa.Text(), nullable=False), schema="pepys" + ) + op.create_foreign_key( + op.f("fk_Extractions_datafile_id_Datafiles"), + "Extractions", + "Datafiles", + ["datafile_id"], + ["datafile_id"], + source_schema="pepys", + referent_schema="pepys", + onupdate="cascade", + ondelete="cascade", + ) + op.drop_column("Extractions", "chars", schema="pepys") + op.drop_column("Extractions", "table", schema="pepys") + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.add_column( + "Extractions", + sa.Column("table", sa.VARCHAR(length=150), autoincrement=False, nullable=False), + schema="pepys", + ) + op.add_column( + "Extractions", + sa.Column("chars", sa.VARCHAR(length=150), autoincrement=False, nullable=False), + schema="pepys", + ) + op.drop_constraint( + op.f("fk_Extractions_datafile_id_Datafiles"), + "Extractions", + schema="pepys", + type_="foreignkey", + ) + op.drop_column("Extractions", "interpreted_value", schema="pepys") + op.drop_column("Extractions", "importer", schema="pepys") + op.drop_column("Extractions", "text_location", schema="pepys") + op.drop_column("Extractions", "text", schema="pepys") + op.drop_column("Extractions", "datafile_id", schema="pepys") + op.drop_column("Extractions", "entry_id", schema="pepys") + op.drop_column("Extractions", "destination_table", schema="pepys") + # ### end Alembic commands ### diff --git a/migrations/sqlite_versions/2021-09-20_feb548c7c6c0_alter_extractions_table.py b/migrations/sqlite_versions/2021-09-20_feb548c7c6c0_alter_extractions_table.py new file mode 100644 index 000000000..0c6af13bd --- /dev/null +++ b/migrations/sqlite_versions/2021-09-20_feb548c7c6c0_alter_extractions_table.py @@ -0,0 +1,70 @@ +"""Alter Extractions table + +Revision ID: feb548c7c6c0 +Revises: a7f75ead6204 +Create Date: 2021-09-20 12:38:20.179908+00:00 + +""" +import sqlalchemy as sa +from alembic import op + +import pepys_import + +# revision identifiers, used by Alembic. +revision = "feb548c7c6c0" +down_revision = "a7f75ead6204" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("Extractions", schema=None) as batch_op: + batch_op.add_column(sa.Column("destination_table", sa.String(length=150), nullable=True)) + batch_op.add_column( + sa.Column( + "entry_id", pepys_import.utils.sqlalchemy_utils.UUIDType(length=16), nullable=True + ) + ) + batch_op.add_column( + sa.Column( + "datafile_id", + pepys_import.utils.sqlalchemy_utils.UUIDType(length=16), + nullable=False, + ) + ) + batch_op.add_column(sa.Column("text", sa.Text(), nullable=False)) + batch_op.add_column(sa.Column("text_location", sa.String(length=200), nullable=False)) + batch_op.add_column(sa.Column("importer", sa.String(length=150), nullable=False)) + batch_op.add_column(sa.Column("interpreted_value", sa.Text(), nullable=False)) + batch_op.create_foreign_key( + batch_op.f("fk_Extractions_datafile_id_Datafiles"), + "Datafiles", + ["datafile_id"], + ["datafile_id"], + onupdate="cascade", + ondelete="cascade", + ) + batch_op.drop_column("table") + batch_op.drop_column("chars") + + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("Extractions", schema=None) as batch_op: + batch_op.add_column(sa.Column("chars", sa.VARCHAR(length=150), nullable=False)) + batch_op.add_column(sa.Column("table", sa.VARCHAR(length=150), nullable=False)) + batch_op.drop_constraint( + batch_op.f("fk_Extractions_datafile_id_Datafiles"), type_="foreignkey" + ) + batch_op.drop_column("interpreted_value") + batch_op.drop_column("importer") + batch_op.drop_column("text_location") + batch_op.drop_column("text") + batch_op.drop_column("datafile_id") + batch_op.drop_column("entry_id") + batch_op.drop_column("destination_table") + + # ### end Alembic commands ### diff --git a/pepys_admin/maintenance/column_data.py b/pepys_admin/maintenance/column_data.py index 0a4253328..9a049e885 100644 --- a/pepys_admin/maintenance/column_data.py +++ b/pepys_admin/maintenance/column_data.py @@ -180,6 +180,7 @@ def create_assoc_proxy_data(ap_name, ap_obj, data_store, table_object): # For all other columns, no special processing is needed all_records = data_store.session.query(ap_obj.target_class).all() values = [str_if_not_none(getattr(record, ap_obj.value_attr)) for record in all_records] + sorted_values = sorted(set(values)) details["values"] = sorted_values @@ -213,18 +214,25 @@ def create_normal_column_data(col, data_store, table_object): details["required"] = not col.prop.columns[0].nullable - if details["type"] == "id" and col.key != get_primary_key_for_table(table_object): + if ( + details["type"] == "id" + and col.key != get_primary_key_for_table(table_object) + and col.key != "entry_id" + ): # Skip all ID columns except the primary key + # Make a special exception for the entry_id field in the Extractions table + # where there is no relationship to use to navigate between tables, as entry_id could be + # a primary key in any measurement table return None, None # Skip getting values for the remarks column, as we don't need a dropdown for that if details["type"] == "string" and details["system_name"] != "remarks": # Get values - - all_records = data_store.session.query(table_object).all() - values = [ - str_if_not_none(getattr(record, details["system_name"])) for record in all_records - ] + # Here we query for just the specific column name (details['system_name']) so that + # the generated SQL is just selecting that column, rather than selecting all the columns + # and doing all the joins to get the denormalised data + all_records = data_store.session.query(getattr(table_object, details["system_name"])).all() + values = [str_if_not_none(record[0]) for record in all_records] details["values"] = sorted(remove_duplicates_and_nones(values)) return get_display_name(sys_name), details diff --git a/pepys_admin/maintenance/gui.py b/pepys_admin/maintenance/gui.py index e412dbb37..103147a6f 100644 --- a/pepys_admin/maintenance/gui.py +++ b/pepys_admin/maintenance/gui.py @@ -155,6 +155,7 @@ def init_ui_components(self): "WargameParticipants", "SerialParticipants", "ConfigOptions", + "Extractions", ] measurement_tables = sorted( [mc.__tablename__ for mc in self.data_store.meta_classes[TableTypes.MEASUREMENT]] diff --git a/pepys_admin/merge.py b/pepys_admin/merge.py index 3c4339c5f..e5f0c6635 100644 --- a/pepys_admin/merge.py +++ b/pepys_admin/merge.py @@ -169,6 +169,7 @@ def merge_all_metadata_tables(self): metadata_table_names.remove("Log") metadata_table_names.remove("Change") metadata_table_names.remove("Synonym") + metadata_table_names.remove("Extraction") added_names = {} @@ -664,6 +665,34 @@ def merge_logs_and_changes(self): # Add 
the log entries self.add_logs(logs_to_add) + def merge_extractions(self, added_datafile_ids): + to_add = [] + + with self.slave_store.session_scope(): + with self.master_store.session_scope(): + print("Merging Extractions") + for datafile_ids_chunk in tqdm(self.split_list(added_datafile_ids)): + # Search for all slave Extraction entries with IDs in this list + results = ( + self.slave_store.session.query(self.slave_store.db_classes.Extraction) + .filter( + self.slave_store.db_classes.Extraction.datafile_id.in_( + datafile_ids_chunk + ) + ) + .options(undefer("*")) + .all() + ) + + # Convert the rows to a list of dicts + dict_results = self.rows_to_list_of_dicts(results) + + to_add.extend(dict_results) + + self.master_store.session.bulk_insert_mappings( + self.master_store.db_classes.Extraction, to_add + ) + def merge_all_tables(self): """ Does a full merge, taking all data from the slave_store database and merging it into the master_store @@ -748,6 +777,9 @@ def merge_all_tables(self): # Merge the Logs and Changes table, only merging ones which still match something in the new db self.merge_logs_and_changes() + # Merge the Extractions table, only merging those that match a Datafile that has been added + self.merge_extractions([d["id"] for d in df_ids["added"]]) + print("Statistics:\n") print("Reference tables:") print( diff --git a/pepys_import/__init__.py b/pepys_import/__init__.py index 04938d7e2..722907142 100644 --- a/pepys_import/__init__.py +++ b/pepys_import/__init__.py @@ -4,5 +4,5 @@ __author__ = "Ian Mayo" __email__ = "ian@planetmayo.com" -__version__ = "0.0.28" +__version__ = "0.0.29" __build_timestamp__ = None diff --git a/pepys_import/core/store/common_db.py b/pepys_import/core/store/common_db.py index 493e62f2a..7a8d1e70e 100644 --- a/pepys_import/core/store/common_db.py +++ b/pepys_import/core/store/common_db.py @@ -6,7 +6,7 @@ from sqlalchemy.orm import backref, declared_attr, relationship from tqdm import tqdm -from config import LOCAL_BASIC_TESTS, LOCAL_ENHANCED_TESTS +import config from pepys_import.core.formats import unit_registry from pepys_import.core.formats.location import Location from pepys_import.core.store import constants @@ -18,8 +18,20 @@ from pepys_import.utils.sqlalchemy_utils import get_primary_key_for_table from pepys_import.utils.text_formatting_utils import format_error_menu -LOCAL_BASIC_VALIDATORS = import_validators(LOCAL_BASIC_TESTS) -LOCAL_ENHANCED_VALIDATORS = import_validators(LOCAL_ENHANCED_TESTS) +LOCAL_BASIC_VALIDATORS = [] +LOCAL_ENHANCED_VALIDATORS = [] + + +def reload_local_validators(): + global LOCAL_BASIC_VALIDATORS, LOCAL_ENHANCED_VALIDATORS + LOCAL_BASIC_VALIDATORS = import_validators(config.LOCAL_BASIC_TESTS) + LOCAL_ENHANCED_VALIDATORS = import_validators(config.LOCAL_ENHANCED_TESTS) + + +# On initial load of this file, load the local validators (saves us doing it once +# for each file we process). This function can be called in a test to reload them, +# after we've played with the config values as part of a test +reload_local_validators() class HostedByMixin: @@ -624,6 +636,33 @@ def datafile_type(self): def datafile_type_name(self): return association_proxy("datafile_type", "name") + def flush_extracted_tokens(self): + """Flush the current list of extracted tokens out to the dict linking measurement + objects to tokens, ready for writing to the database at the end of the import. + + This should be called when all the extractions have been done for a _single_ measurement + object (State/Contact etc). 
Often this will be at the end of the `_load_this_line()` method, + but in more complex importers it may be needed elsewhere.""" + if len(self.pending_extracted_tokens) == 0: + return + + # If there aren't any tokens recorded for this measurement object already + # then put the list into the dict. If there are already tokens recorded, then append the list + # to the list that's already in the dict + if ( + self.measurement_object_to_tokens_list.get(self.current_measurement_object, None) + is None + ): + self.measurement_object_to_tokens_list[ + self.current_measurement_object + ] = self.pending_extracted_tokens + else: + self.measurement_object_to_tokens_list[ + self.current_measurement_object + ] += self.pending_extracted_tokens + + self.pending_extracted_tokens = [] + def create_state(self, data_store, platform, sensor, timestamp, parser_name): """Creates a new State object to record information on the state of a particular platform at a specific time. @@ -654,6 +693,8 @@ def create_state(self, data_store, platform, sensor, timestamp, parser_name): platform=platform, ) self.add_measurement_to_dict(state, parser_name) + + self.current_measurement_object = state return state def create_contact(self, data_store, platform, sensor, timestamp, parser_name): @@ -686,6 +727,7 @@ def create_contact(self, data_store, platform, sensor, timestamp, parser_name): platform=platform, ) self.add_measurement_to_dict(contact, parser_name) + self.current_measurement_object = contact return contact def create_comment( @@ -727,6 +769,7 @@ def create_comment( platform=platform, ) self.add_measurement_to_dict(comment, parser_name) + self.current_measurement_object = comment return comment def create_geometry(self, data_store, geom, geom_type_id, geom_sub_type_id, parser_name): @@ -737,6 +780,7 @@ def create_geometry(self, data_store, geom, geom_type_id, geom_sub_type_id, pars geo_sub_type_id=geom_sub_type_id, ) self.add_measurement_to_dict(geometry, parser_name) + self.current_measurement_object = geometry return geometry def create_activation(self, data_store, sensor, start, end, parser_name): @@ -747,6 +791,7 @@ def create_activation(self, data_store, sensor, start, end, parser_name): source_id=self.datafile_id, ) self.add_measurement_to_dict(activation, parser_name) + self.current_measurement_object = activation return activation def add_measurement_to_dict(self, measurement, parser_name): @@ -801,6 +846,7 @@ def validate( parser="Default", skip_validation=False, ): + # If there is no parsing error, it will return None. If that's the case, # create a new list for validation errors. 
if errors is None: @@ -882,7 +928,6 @@ def validate( del errors[-1] prev_object_dict[curr_object.platform_name] = curr_object - if not errors: return (True, failed_validators) return (False, failed_validators) @@ -922,6 +967,30 @@ def commit(self, data_store, change_id): ) extraction_log.append(f"{total_objects} measurements extracted by {parser}.") + + # Loop through the dict linking measurement objects to lists of extraction tokens + # and fill in more details on the extraction tokens, then join all the lists together + # ready for insert into the database + extraction_data = [] + for measurement_obj, tokens_data in self.measurement_object_to_tokens_list.items(): + if measurement_obj is None: + continue + for entry in tokens_data: + entry_id = getattr(measurement_obj, get_primary_key_for_table(measurement_obj)) + entry["entry_id"] = entry_id + entry["destination_table"] = measurement_obj.__table__.name + entry["datafile_id"] = self.datafile_id + extraction_data += tokens_data + + print("Submitting extraction data") + for chunk_extraction_data in tqdm(chunked_list(extraction_data, size=1000)): + data_store.session.bulk_insert_mappings( + data_store.db_classes.Extraction, chunk_extraction_data + ) + + self.measurement_object_to_tokens_list = {} + self.pending_extracted_tokens = [] + return extraction_log @@ -2015,3 +2084,8 @@ def parent__name(self): class NationalityMixin: _default_preview_fields = ["name", "priority"] _default_dropdown_fields = ["name"] + + +class ExtractionMixin: + _default_preview_fields = ["field", "text", "interpreted_value"] + _default_dropdown_fields = ["field", "text", "interpreted_value"] diff --git a/pepys_import/core/store/postgres_db.py b/pepys_import/core/store/postgres_db.py index 9dfe08afc..27532aed5 100644 --- a/pepys_import/core/store/postgres_db.py +++ b/pepys_import/core/store/postgres_db.py @@ -18,6 +18,7 @@ ContactMixin, DatafileMixin, ElevationPropertyMixin, + ExtractionMixin, GeometryMixin, GeometrySubTypeMixin, HostedByMixin, @@ -291,6 +292,10 @@ class Datafile(BasePostGIS, DatafileMixin): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.measurements = dict() + self.highlighted_file = None + self.pending_extracted_tokens = [] + self.measurement_object_to_tokens_list = {} + self.current_measurement_object = None __tablename__ = constants.DATAFILE table_type = TableTypes.METADATA @@ -374,15 +379,24 @@ class Log(BasePostGIS, LogMixin): created_date = Column(DateTime, default=datetime.utcnow) -class Extraction(BasePostGIS): +class Extraction(BasePostGIS, ExtractionMixin): __tablename__ = constants.EXTRACTION table_type = TableTypes.METADATA table_type_id = 10 extraction_id = Column(UUID(as_uuid=True), primary_key=True, default=uuid4) - table = Column(String(150), nullable=False) + destination_table = Column(String(150)) + entry_id = Column(UUID(as_uuid=True)) field = Column(String(150), nullable=False) - chars = Column(String(150), nullable=False) + datafile_id = Column( + UUID(as_uuid=True), + ForeignKey("Datafiles.datafile_id", onupdate="cascade", ondelete="cascade"), + nullable=False, + ) + text = Column(Text(), nullable=False) + text_location = Column(String(200), nullable=False) + importer = Column(String(150), nullable=False) + interpreted_value = Column(Text(), nullable=False) created_date = Column(DateTime, default=datetime.utcnow) diff --git a/pepys_import/core/store/sqlite_db.py b/pepys_import/core/store/sqlite_db.py index c9145e7ec..9f2c2a54c 100644 --- a/pepys_import/core/store/sqlite_db.py +++ 
b/pepys_import/core/store/sqlite_db.py @@ -18,6 +18,7 @@ ContactMixin, DatafileMixin, ElevationPropertyMixin, + ExtractionMixin, GeometryMixin, GeometrySubTypeMixin, HostedByMixin, @@ -292,6 +293,10 @@ class Datafile(BaseSpatiaLite, DatafileMixin): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.measurements = dict() + self.highlighted_file = None + self.pending_extracted_tokens = [] + self.measurement_object_to_tokens_list = {} + self.current_measurement_object = None __tablename__ = constants.DATAFILE table_type = TableTypes.METADATA @@ -375,15 +380,24 @@ class Log(BaseSpatiaLite, LogMixin): created_date = Column(DateTime, default=datetime.utcnow) -class Extraction(BaseSpatiaLite): +class Extraction(BaseSpatiaLite, ExtractionMixin): __tablename__ = constants.EXTRACTION table_type = TableTypes.METADATA table_type_id = 10 extraction_id = Column(UUIDType, primary_key=True, default=uuid4) - table = Column(String(150), nullable=False) + destination_table = Column(String(150)) + entry_id = Column(UUIDType) field = Column(String(150), nullable=False) - chars = Column(String(150), nullable=False) + datafile_id = Column( + UUIDType, + ForeignKey("Datafiles.datafile_id", onupdate="cascade", ondelete="cascade"), + nullable=False, + ) + text = Column(Text(), nullable=False) + text_location = Column(String(200), nullable=False) + importer = Column(String(150), nullable=False) + interpreted_value = Column(Text(), nullable=False) created_date = Column(DateTime, default=datetime.utcnow) diff --git a/pepys_import/core/store/table_summary.py b/pepys_import/core/store/table_summary.py index cf28ffb97..0df7844d5 100644 --- a/pepys_import/core/store/table_summary.py +++ b/pepys_import/core/store/table_summary.py @@ -1,4 +1,3 @@ -from sqlalchemy.orm import undefer from tabulate import tabulate from pepys_import.core.store import constants @@ -27,10 +26,7 @@ def __init__(self, session, table): def table_summary(self): number_of_rows = self.session.query(self.table).count() last_row = ( - self.session.query(self.table) - .options( - undefer("*") - ) # Fetch all attributes to enforce to failing if there is any mismatch + self.session.query(self.table.created_date) .order_by(self.table.created_date.desc()) .first() ) diff --git a/pepys_import/file/file_processor.py b/pepys_import/file/file_processor.py index 44fe3e7a9..3886c4fca 100644 --- a/pepys_import/file/file_processor.py +++ b/pepys_import/file/file_processor.py @@ -332,6 +332,9 @@ def process_file( privacy=privacy, ) + highlighted_file.datafile = datafile + datafile.highlighted_file = highlighted_file + # Update change object change.datafile_id = datafile.datafile_id data_store.session.flush() diff --git a/pepys_import/file/highlighter/highlighter.py b/pepys_import/file/highlighter/highlighter.py index e3eea2ff9..0f0db7f58 100644 --- a/pepys_import/file/highlighter/highlighter.py +++ b/pepys_import/file/highlighter/highlighter.py @@ -14,7 +14,7 @@ class that can load/tokenize a datafile, record changes to the file, then export a highlighted version of the file that indicates extraction """ - def __init__(self, filename: str, number_of_lines=None): + def __init__(self, filename: str, number_of_lines=None, datafile=None): """ Constructor for this object Args: @@ -26,9 +26,9 @@ def __init__(self, filename: str, number_of_lines=None): self.filename = filename self.dict_color = {} self.number_of_lines = number_of_lines + self.datafile = datafile - # List of importers to ignore record calls from - self.ignored_importers = [] + 
self.importer_highlighting_levels = {} def chars_debug(self): """ diff --git a/pepys_import/file/highlighter/level.py b/pepys_import/file/highlighter/level.py new file mode 100644 index 000000000..3af38c71b --- /dev/null +++ b/pepys_import/file/highlighter/level.py @@ -0,0 +1,12 @@ +from enum import Enum + + +class HighlightLevel(Enum): + """The level of recording highlighted extractions""" + + NONE = 1 + """No highlighting or recording of extractions""" + HTML = 2 + """Produce a highlighted HTML file showing the extractions""" + DATABASE = 3 + """Produce a highlighted HTML file and record extractions to the database""" diff --git a/pepys_import/file/highlighter/support/line.py b/pepys_import/file/highlighter/support/line.py index 82049214e..2ebc1b9c1 100644 --- a/pepys_import/file/highlighter/support/line.py +++ b/pepys_import/file/highlighter/support/line.py @@ -1,5 +1,8 @@ from re import finditer, search +from pepys_import.file.highlighter.level import HighlightLevel +from pepys_import.file.highlighter.support.utils import merge_adjacent_text_locations + from .token import SubToken, Token from .usages import SingleUsage @@ -101,6 +104,7 @@ def tokens(self, reg_exp=WHITESPACE_TOKENISER, strip_char="", quoted_name=QUOTED char_index = token_str.find(strip_char) if char_index == 0: token_str = token_str[1:] + token_start += 1 # and ditch any new whitespace token_str = token_str.strip() @@ -130,9 +134,8 @@ def record(self, tool: str, field: str, value: str, units: str = None): Adds a SingleUsage object to each of the relevant characters in the char array referenced by each SubToken child. """ - # Don't record anything if the importer that called record - # has called 'disable_recording()` - if tool in self.highlighted_file.ignored_importers: + recording_level = self.highlighted_file.importer_highlighting_levels.get(tool, None) + if recording_level == HighlightLevel.NONE: return self.highlighted_file.fill_char_array_if_needed() @@ -143,7 +146,28 @@ def record(self, tool: str, field: str, value: str, units: str = None): else: message = "Value:" + str(value) + text_locations = [] + for child in self.children: - for i in range(int(child.start()), int(child.end())): + start = child.start() + end = child.end() + + text_locations.append((start, end)) + + for i in range(start, end): usage = SingleUsage(tool_field, message) child.chars[i].usages.append(usage) + + if recording_level == HighlightLevel.DATABASE: + merged_text_locations = merge_adjacent_text_locations(text_locations) + text_location_str = ",".join([f"{low}-{high}" for low, high in merged_text_locations]) + + self.highlighted_file.datafile.pending_extracted_tokens.append( + { + "text": self.text, + "interpreted_value": str(value), + "text_location": text_location_str, + "importer": tool, + "field": field, + } + ) diff --git a/pepys_import/file/highlighter/support/test_utils.py b/pepys_import/file/highlighter/support/test_utils.py index 2dedd493d..e5691d023 100644 --- a/pepys_import/file/highlighter/support/test_utils.py +++ b/pepys_import/file/highlighter/support/test_utils.py @@ -26,3 +26,15 @@ def create_test_line_object(line_str): new_line = Line([sub_token], test_hf) return new_line + + +class FakeDatafile: + def __init__(self): + self.pending_extracted_tokens = [] + + +def delete_entries(d, keys_to_delete): + for key in keys_to_delete: + del d[key] + + return d diff --git a/pepys_import/file/highlighter/support/token.py b/pepys_import/file/highlighter/support/token.py index 1c59076b9..e8947069e 100644 --- 
a/pepys_import/file/highlighter/support/token.py +++ b/pepys_import/file/highlighter/support/token.py @@ -1,3 +1,6 @@ +from pepys_import.file.highlighter.level import HighlightLevel +from pepys_import.file.highlighter.support.utils import merge_adjacent_text_locations + from .usages import SingleUsage @@ -78,6 +81,15 @@ def text(self): res += child.text return res + @property + def text_space_separated(self): + """Returns the entire text of the Line, with spaces separating the different subtokens + + :return: Entire text content of the Line + :rtype: String + """ + return " ".join([child.text for child in self.children]) + def record(self, tool: str, field: str, value: str, units: str = None): """ Record the usage of this token for a specific purpose @@ -95,9 +107,8 @@ def record(self, tool: str, field: str, value: str, units: str = None): This adds SingleUsage objects to each of the relevant characters in the character array stored by the SubToken objects that are children of this object. """ - # Don't record anything if the importer that called record - # has called 'disable_recording()` - if tool in self.highlighted_file.ignored_importers: + recording_level = self.highlighted_file.importer_highlighting_levels.get(tool, None) + if recording_level == HighlightLevel.NONE: return self.highlighted_file.fill_char_array_if_needed() @@ -110,11 +121,15 @@ def record(self, tool: str, field: str, value: str, units: str = None): usage = SingleUsage(tool_field, message) + text_locations = [] + # This loop gives us each SubToken that is a child of this Token for subtoken in self.children: start = subtoken.start() end = subtoken.end() + text_locations.append((start, end)) + for i in range(start, end): # Note: subtoken.chars is a reference to a single char array # that was originally created by the HighlightedFile class @@ -122,3 +137,17 @@ def record(self, tool: str, field: str, value: str, units: str = None): # char array, even though it is accessed via different SubToken # objects subtoken.chars[i].usages.append(usage) + + if recording_level == HighlightLevel.DATABASE: + merged_text_locations = merge_adjacent_text_locations(text_locations) + text_location_str = ",".join([f"{low}-{high}" for low, high in merged_text_locations]) + + self.highlighted_file.datafile.pending_extracted_tokens.append( + { + "text": self.text_space_separated, + "interpreted_value": str(value), + "text_location": text_location_str, + "importer": tool, + "field": field, + } + ) diff --git a/pepys_import/file/highlighter/support/utils.py b/pepys_import/file/highlighter/support/utils.py new file mode 100644 index 000000000..f9623451d --- /dev/null +++ b/pepys_import/file/highlighter/support/utils.py @@ -0,0 +1,18 @@ +def merge_adjacent_text_locations(text_locations): + if len(text_locations) == 0: + return [] + + current_low = text_locations[0][0] + current_high = text_locations[0][1] + output = [] + + for low, high in text_locations[1:]: + if low <= current_high + 1: + current_high = high + else: + output.append((current_low, current_high)) + current_low = low + current_high = high + output.append((current_low, current_high)) + + return output diff --git a/pepys_import/file/highlighter/xml_parser.py b/pepys_import/file/highlighter/xml_parser.py index 0075fd4a5..b2754e6c7 100644 --- a/pepys_import/file/highlighter/xml_parser.py +++ b/pepys_import/file/highlighter/xml_parser.py @@ -128,6 +128,19 @@ def record(self, tool: str, field: str, value: str, units: str = None, xml_part= 
self.highlighted_file.file_byte_contents[start:end].decode() ) + text_location_str = f"{start_in_chars}-{end_in_chars}" + text = self.highlighted_file.file_byte_contents[start:end].decode() + + self.highlighted_file.datafile.pending_extracted_tokens.append( + { + "text": text, + "interpreted_value": str(value), + "text_location": text_location_str, + "importer": tool, + "field": field, + } + ) + # This return returns the start and end index, mainly for use for testing return self.highlighted_file.set_usages_for_slice(start_in_chars, end_in_chars, usage) diff --git a/pepys_import/file/importer.py b/pepys_import/file/importer.py index 5107981c8..c54fdbe75 100644 --- a/pepys_import/file/importer.py +++ b/pepys_import/file/importer.py @@ -3,6 +3,7 @@ from tqdm import tqdm +from pepys_import.file.highlighter.level import HighlightLevel from pepys_import.utils.text_formatting_utils import ( custom_print_formatted_text, format_error_message, @@ -22,13 +23,20 @@ def __init__(self, name, validation_level, short_name, datafile_type, default_pr self.errors = None self.error_type = None - self.do_recording = True + # By default all importers will record extractions to a highlighted + # HTML file, but not record them to the database + self.highlighting_level = HighlightLevel.HTML def __str__(self): return self.name - def disable_recording(self): - self.do_recording = False + def set_highlighting_level(self, level): + """Sets the HighlightLevel of recording highlighted extractions. Can be one of: + + - NONE: No highlighting or recording of extractions + - HTML: Produce a highlighted html file showing the extractions + - DATABASE: Produce a highlighted html file and record extractions to the database""" + self.highlighting_level = level @abstractmethod def can_load_this_type(self, suffix) -> bool: @@ -97,6 +105,9 @@ def load_this_file(self, data_store, path, file_object, datafile, change_id): self.error_type = f"{self.short_name} - Parsing error on {basename}" datafile.measurements[self.short_name] = dict() + datafile.current_measurement_object = None + datafile.pending_extracted_tokens = [] + # Initialise the platform->sensor mapping here # so that we get a separate mapping for each file that we process self.platform_sensor_mapping = {} @@ -105,11 +116,7 @@ def load_this_file(self, data_store, path, file_object, datafile, change_id): # so we get a separate cache for each file we process self.platform_cache = {} - # If we've turned off recording of extractions for this importer - # then add this to the list of ignored importers for this HighlightedFile - # object - if not self.do_recording: - file_object.ignored_importers.append(self.name) + file_object.importer_highlighting_levels[self.name] = self.highlighting_level # perform load self._load_this_file(data_store, path, file_object, datafile, change_id) diff --git a/requirements_dev.txt b/requirements_dev.txt index 419922aca..46b25b952 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -16,4 +16,5 @@ git+https://github.com/tk0miya/testing.postgresql.git#egg=testing.postgresql isort pytest-benchmark pyte -freezegun==1.* \ No newline at end of file +freezegun==1.* +pexpect \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 26345e99a..b4efec92a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.0.28 +current_version = 0.0.29 commit = True tag = True diff --git a/setup.py b/setup.py index 6eeaa12c2..f54cc56c8 100644 --- a/setup.py +++ b/setup.py @@ -43,6 +43,6 @@ test_suite="tests", 
tests_require=test_requirements, url="https://github.com/debrief/pepys-import", - version="0.0.28", + version="0.0.29", zip_safe=False, ) diff --git a/tests/benchmarks/test_highlighter_benchmark.py b/tests/benchmarks/test_highlighter_benchmark.py index 4a1fef36f..8aab1d2d4 100644 --- a/tests/benchmarks/test_highlighter_benchmark.py +++ b/tests/benchmarks/test_highlighter_benchmark.py @@ -134,7 +134,7 @@ def run_highlighter_on_whole_file(): def test_highlighter_on_whole_file_benchmark(benchmark): benchmark(run_highlighter_on_whole_file) - TIME_THRESHOLD = 11 + TIME_THRESHOLD = 11.5 if running_on_ci(): if benchmark.stats.stats.mean > TIME_THRESHOLD: diff --git a/tests/config_file_tests/test_config_file.py b/tests/config_file_tests/test_config_file.py index 7395c7434..abe672445 100644 --- a/tests/config_file_tests/test_config_file.py +++ b/tests/config_file_tests/test_config_file.py @@ -9,7 +9,6 @@ import config from importers.replay_importer import ReplayImporter -from pepys_import.core.store import common_db from pepys_import.core.store.data_store import DataStore from pepys_import.file.file_processor import FileProcessor from tests.utils import side_effect @@ -160,18 +159,5 @@ def test_no_archive_path_given(self): os.rmdir(os.path.join(processing_path, "output")) -class CommonDBVariablesTestCase(unittest.TestCase): - @patch("config.LOCAL_BASIC_TESTS", BASIC_PARSERS_PATH) - @patch("config.LOCAL_ENHANCED_TESTS", ENHANCED_PARSERS_PATH) - def test_local_parser_tests(self): - assert not common_db.LOCAL_BASIC_VALIDATORS - assert not common_db.LOCAL_ENHANCED_VALIDATORS - - # reload common_db module - reload(common_db) - assert len(common_db.LOCAL_BASIC_VALIDATORS) == 1 - assert len(common_db.LOCAL_ENHANCED_VALIDATORS) == 1 - - if __name__ == "__main__": unittest.main() diff --git a/tests/config_file_tests/test_local_and_core_validators.py b/tests/config_file_tests/test_local_and_core_validators.py index 0fa494407..8f137f2be 100644 --- a/tests/config_file_tests/test_local_and_core_validators.py +++ b/tests/config_file_tests/test_local_and_core_validators.py @@ -1,11 +1,10 @@ import os import unittest -from importlib import reload from unittest.mock import patch from importers.e_trac_importer import ETracImporter from importers.replay_importer import ReplayImporter -from pepys_import.core.store import common_db +from pepys_import.core.store.common_db import reload_local_validators from pepys_import.core.store.data_store import DataStore from pepys_import.file.file_processor import FileProcessor @@ -39,92 +38,96 @@ def setUp(self): def tearDown(self): pass - @patch("config.LOCAL_BASIC_TESTS", BASIC_PARSERS_PATH) - @patch("config.LOCAL_ENHANCED_TESTS", ENHANCED_PARSERS_PATH) def test_local_basic_tests(self): - reload(common_db) - - # check states empty - with self.store.session_scope(): - # there must be no states at the beginning - states = self.store.session.query(self.store.db_classes.State).all() - self.assertEqual(len(states), 0) - - # there must be no platforms at the beginning - platforms = self.store.session.query(self.store.db_classes.Platform).all() - self.assertEqual(len(platforms), 0) - - # there must be no datafiles at the beginning - datafiles = self.store.session.query(self.store.db_classes.Datafile).all() - self.assertEqual(len(datafiles), 0) - - processor = FileProcessor(archive=False) - processor.register_importer(ETracImporter()) - - # parse the folder - processor.process(OTHER_DATA_PATH, self.store, False) - - # check data got created - with self.store.session_scope(): - # there 
must be states after the import - states = self.store.session.query(self.store.db_classes.State).all() - self.assertEqual(len(states), 44) - - # there must be platforms after the import - platforms = self.store.session.query(self.store.db_classes.Platform).all() - self.assertEqual(len(platforms), 18) - - # there must be one datafile afterwards - datafiles = self.store.session.query(self.store.db_classes.Datafile).all() - self.assertEqual(len(datafiles), 1) - - # Check that there is an elevation of 147 reported (test file was manually edited - # to contain an elevation of 147m) - results = ( - self.store.session.query(self.store.db_classes.State) - .filter(self.store.db_classes.State.elevation == 147) - .all() - ) - assert len(results) == 1 - - @patch("config.LOCAL_BASIC_TESTS", BASIC_PARSERS_PATH) - @patch("config.LOCAL_ENHANCED_TESTS", ENHANCED_PARSERS_PATH) + with patch("config.LOCAL_BASIC_TESTS", BASIC_PARSERS_PATH): + with patch("config.LOCAL_ENHANCED_TESTS", ENHANCED_PARSERS_PATH): + reload_local_validators() + + # check states empty + with self.store.session_scope(): + # there must be no states at the beginning + states = self.store.session.query(self.store.db_classes.State).all() + self.assertEqual(len(states), 0) + + # there must be no platforms at the beginning + platforms = self.store.session.query(self.store.db_classes.Platform).all() + self.assertEqual(len(platforms), 0) + + # there must be no datafiles at the beginning + datafiles = self.store.session.query(self.store.db_classes.Datafile).all() + self.assertEqual(len(datafiles), 0) + + processor = FileProcessor(archive=False) + processor.register_importer(ETracImporter()) + + # parse the folder + processor.process(OTHER_DATA_PATH, self.store, False) + + # check data got created + with self.store.session_scope(): + # there must be states after the import + states = self.store.session.query(self.store.db_classes.State).all() + self.assertEqual(len(states), 44) + + # there must be platforms after the import + platforms = self.store.session.query(self.store.db_classes.Platform).all() + self.assertEqual(len(platforms), 18) + + # there must be one datafile afterwards + datafiles = self.store.session.query(self.store.db_classes.Datafile).all() + self.assertEqual(len(datafiles), 1) + + # Check that there is an elevation of 147 reported (test file was manually edited + # to contain an elevation of 147m) + results = ( + self.store.session.query(self.store.db_classes.State) + .filter(self.store.db_classes.State.elevation == 147) + .all() + ) + assert len(results) == 1 + + reload_local_validators() + def test_local_basic_and_enhanced_tests(self): - reload(common_db) + with patch("config.LOCAL_BASIC_TESTS", BASIC_PARSERS_PATH): + with patch("config.LOCAL_ENHANCED_TESTS", ENHANCED_PARSERS_PATH): + reload_local_validators() + + processor = FileProcessor(archive=False) + processor.register_importer(ReplayImporter()) - processor = FileProcessor(archive=False) - processor.register_importer(ReplayImporter()) + # check states empty + with self.store.session_scope(): + # there must be no states at the beginning + states = self.store.session.query(self.store.db_classes.State).all() + self.assertEqual(len(states), 0) - # check states empty - with self.store.session_scope(): - # there must be no states at the beginning - states = self.store.session.query(self.store.db_classes.State).all() - self.assertEqual(len(states), 0) + # there must be no platforms at the beginning + platforms = self.store.session.query(self.store.db_classes.Platform).all() + 
self.assertEqual(len(platforms), 0) - # there must be no platforms at the beginning - platforms = self.store.session.query(self.store.db_classes.Platform).all() - self.assertEqual(len(platforms), 0) + # there must be no datafiles at the beginning + datafiles = self.store.session.query(self.store.db_classes.Datafile).all() + self.assertEqual(len(datafiles), 0) - # there must be no datafiles at the beginning - datafiles = self.store.session.query(self.store.db_classes.Datafile).all() - self.assertEqual(len(datafiles), 0) + # parse the folder + processor.process(REP_DATA_PATH, self.store, False) - # parse the folder - processor.process(REP_DATA_PATH, self.store, False) + # check data got created + with self.store.session_scope(): + # there must be states after the import + states = self.store.session.query(self.store.db_classes.State).all() + self.assertEqual(len(states), 8) - # check data got created - with self.store.session_scope(): - # there must be states after the import - states = self.store.session.query(self.store.db_classes.State).all() - self.assertEqual(len(states), 8) + # there must be platforms after the import + platforms = self.store.session.query(self.store.db_classes.Platform).all() + self.assertEqual(len(platforms), 2) - # there must be platforms after the import - platforms = self.store.session.query(self.store.db_classes.Platform).all() - self.assertEqual(len(platforms), 2) + # there must be one datafile afterwards + datafiles = self.store.session.query(self.store.db_classes.Datafile).all() + self.assertEqual(len(datafiles), 1) - # there must be one datafile afterwards - datafiles = self.store.session.query(self.store.db_classes.Datafile).all() - self.assertEqual(len(datafiles), 1) + reload_local_validators() class TestLocalTestsFails(unittest.TestCase): @@ -135,84 +138,88 @@ def setUp(self): def tearDown(self): pass - @patch("config.LOCAL_BASIC_TESTS", BASIC_PARSERS_FAILS_PATH) - @patch("config.LOCAL_ENHANCED_TESTS", ENHANCED_PARSERS_FAILS_PATH) def test_local_basic_tests(self): - reload(common_db) + with patch("config.LOCAL_BASIC_TESTS", BASIC_PARSERS_FAILS_PATH): + with patch("config.LOCAL_ENHANCED_TESTS", ENHANCED_PARSERS_FAILS_PATH): + reload_local_validators() + + # check states empty + with self.store.session_scope(): + # there must be no states at the beginning + states = self.store.session.query(self.store.db_classes.State).all() + self.assertEqual(len(states), 0) - # check states empty - with self.store.session_scope(): - # there must be no states at the beginning - states = self.store.session.query(self.store.db_classes.State).all() - self.assertEqual(len(states), 0) + # there must be no platforms at the beginning + platforms = self.store.session.query(self.store.db_classes.Platform).all() + self.assertEqual(len(platforms), 0) - # there must be no platforms at the beginning - platforms = self.store.session.query(self.store.db_classes.Platform).all() - self.assertEqual(len(platforms), 0) + # there must be no datafiles at the beginning + datafiles = self.store.session.query(self.store.db_classes.Datafile).all() + self.assertEqual(len(datafiles), 0) - # there must be no datafiles at the beginning - datafiles = self.store.session.query(self.store.db_classes.Datafile).all() - self.assertEqual(len(datafiles), 0) + processor = FileProcessor(archive=False) + processor.register_importer(ETracImporter()) - processor = FileProcessor(archive=False) - processor.register_importer(ETracImporter()) + # parse the folder + processor.process(OTHER_DATA_PATH, self.store, False) - 
# parse the folder - processor.process(OTHER_DATA_PATH, self.store, False) + # check data got created + with self.store.session_scope(): + # there must be no states after the import + states = self.store.session.query(self.store.db_classes.State).all() + self.assertEqual(len(states), 0) - # check data got created - with self.store.session_scope(): - # there must be no states after the import - states = self.store.session.query(self.store.db_classes.State).all() - self.assertEqual(len(states), 0) + # there must be platforms after the import + platforms = self.store.session.query(self.store.db_classes.Platform).all() + self.assertEqual(len(platforms), 18) - # there must be platforms after the import - platforms = self.store.session.query(self.store.db_classes.Platform).all() - self.assertEqual(len(platforms), 18) + # there must be no datafiles afterwards - as all files gave errors + datafiles = self.store.session.query(self.store.db_classes.Datafile).all() + self.assertEqual(len(datafiles), 0) - # there must be no datafiles afterwards - as all files gave errors - datafiles = self.store.session.query(self.store.db_classes.Datafile).all() - self.assertEqual(len(datafiles), 0) + reload_local_validators() - @patch("config.LOCAL_BASIC_TESTS", BASIC_PARSERS_FAILS_PATH) - @patch("config.LOCAL_ENHANCED_TESTS", ENHANCED_PARSERS_FAILS_PATH) def test_local_basic_and_enhanced_tests(self): - reload(common_db) - - processor = FileProcessor(archive=False) - processor.register_importer(ReplayImporter()) - - # check states empty - with self.store.session_scope(): - # there must be no states at the beginning - states = self.store.session.query(self.store.db_classes.State).all() - self.assertEqual(len(states), 0) - - # there must be no platforms at the beginning - platforms = self.store.session.query(self.store.db_classes.Platform).all() - self.assertEqual(len(platforms), 0) - - # there must be no datafiles at the beginning - datafiles = self.store.session.query(self.store.db_classes.Datafile).all() - self.assertEqual(len(datafiles), 0) - - # parse the folder - with patch("pepys_import.core.store.common_db.prompt", return_value="2"): - processor.process(REP_DATA_PATH, self.store, False) - - # check data got created - with self.store.session_scope(): - # there must be no states after the import - states = self.store.session.query(self.store.db_classes.State).all() - self.assertEqual(len(states), 0) - - # there must be platforms after the import - platforms = self.store.session.query(self.store.db_classes.Platform).all() - self.assertEqual(len(platforms), 2) - - # there must be no datafiles afterwards - as all files gave errors - datafiles = self.store.session.query(self.store.db_classes.Datafile).all() - self.assertEqual(len(datafiles), 0) + with patch("config.LOCAL_BASIC_TESTS", BASIC_PARSERS_FAILS_PATH): + with patch("config.LOCAL_ENHANCED_TESTS", ENHANCED_PARSERS_FAILS_PATH): + reload_local_validators() + + processor = FileProcessor(archive=False) + processor.register_importer(ReplayImporter()) + + # check states empty + with self.store.session_scope(): + # there must be no states at the beginning + states = self.store.session.query(self.store.db_classes.State).all() + self.assertEqual(len(states), 0) + + # there must be no platforms at the beginning + platforms = self.store.session.query(self.store.db_classes.Platform).all() + self.assertEqual(len(platforms), 0) + + # there must be no datafiles at the beginning + datafiles = self.store.session.query(self.store.db_classes.Datafile).all() + 
self.assertEqual(len(datafiles), 0) + + # parse the folder + with patch("pepys_import.core.store.common_db.prompt", return_value="2"): + processor.process(REP_DATA_PATH, self.store, False) + + # check data got created + with self.store.session_scope(): + # there must be no states after the import + states = self.store.session.query(self.store.db_classes.State).all() + self.assertEqual(len(states), 0) + + # there must be platforms after the import + platforms = self.store.session.query(self.store.db_classes.Platform).all() + self.assertEqual(len(platforms), 2) + + # there must be no datafiles afterwards - as all files gave errors + datafiles = self.store.session.query(self.store.db_classes.Datafile).all() + self.assertEqual(len(datafiles), 0) + + reload_local_validators() if __name__ == "__main__": diff --git a/tests/highlighter_tests/test_db_recording.py b/tests/highlighter_tests/test_db_recording.py new file mode 100644 index 000000000..eae782fd4 --- /dev/null +++ b/tests/highlighter_tests/test_db_recording.py @@ -0,0 +1,362 @@ +import os +from unittest.mock import patch + +from sqlalchemy import func + +from importers.gpx_importer import GPXImporter +from importers.replay_comment_importer import ReplayCommentImporter +from importers.replay_contact_importer import ReplayContactImporter +from importers.replay_importer import ReplayImporter +from pepys_admin.utils import sqlalchemy_obj_to_dict +from pepys_import.core.store.data_store import DataStore +from pepys_import.file.file_processor import FileProcessor +from pepys_import.file.highlighter.highlighter import HighlightedFile +from pepys_import.file.highlighter.level import HighlightLevel +from pepys_import.file.highlighter.support.test_utils import FakeDatafile + +path = os.path.abspath(__file__) +dir_path = os.path.dirname(path) +DATA_FILE = os.path.join(dir_path, "sample_files/file.txt") +DATA_PATH = os.path.join(dir_path, "..", "sample_data") +REP_TEST1_PATH = os.path.join(DATA_PATH, "track_files", "rep_data", "rep_test1.rep") +UK_TRACK_PATH = os.path.join(DATA_PATH, "track_files", "rep_data", "uk_track.rep") +GPX_PATH = os.path.join(DATA_PATH, "track_files", "gpx", "gpx_1_0.gpx") +REP_FOLDER_PATH = os.path.join(DATA_PATH, "track_files", "rep_data") + + +def test_pending_extractions_generation(): + hf = HighlightedFile(DATA_FILE) + hf.datafile = FakeDatafile() + + hf.importer_highlighting_levels["Test Importer"] = HighlightLevel.DATABASE + + lines = hf.lines() + lines[0].record("Test Importer", "Test Field", "Test Value", "Test Units") + lines[1].record("Test Importer", "Test Field 2", "Test Value") + + assert len(hf.datafile.pending_extracted_tokens) == 2 + + assert hf.datafile.pending_extracted_tokens[0] == { + "field": "Test Field", + "importer": "Test Importer", + "interpreted_value": "Test Value", + "text": "951212 050000.000 MONDEO_44 @C 269.7 10.0 10", + "text_location": "0-55", + } + + assert hf.datafile.pending_extracted_tokens[1] == { + "field": "Test Field 2", + "importer": "Test Importer", + "interpreted_value": "Test Value", + "text": "// EVENT 951212 050300.000 BRAVO", + "text_location": "56-88", + } + + +def test_extraction_into_measurement_object_tokens_dict(): + ds = DataStore("", "", "", 0, ":memory:", db_type="sqlite") + ds.initialise() + + hf = HighlightedFile(DATA_FILE) + hf.datafile = ds.db_classes.Datafile() + + hf.importer_highlighting_levels["Test Importer"] = HighlightLevel.DATABASE + + lines = hf.lines() + lines[0].record("Test Importer", "Test Field", "Test Value", "Test Units") + lines[1].record("Test 
Importer", "Test Field 2", "Test Value") + + hf.datafile.current_measurement_object = "TEST" + + hf.datafile.flush_extracted_tokens() + + assert len(hf.datafile.measurement_object_to_tokens_list) == 1 + assert hf.datafile.measurement_object_to_tokens_list["TEST"] == [ + { + "field": "Test Field", + "importer": "Test Importer", + "interpreted_value": "Test Value", + "text": "951212 050000.000 MONDEO_44 @C 269.7 10.0 10", + "text_location": "0-55", + }, + { + "field": "Test Field 2", + "importer": "Test Importer", + "interpreted_value": "Test Value", + "text": "// EVENT 951212 050300.000 BRAVO", + "text_location": "56-88", + }, + ] + + +def test_recording_to_database_single_file(): + ds = DataStore("", "", "", 0, ":memory:", db_type="sqlite") + ds.initialise() + + processor = FileProcessor(archive=False) + + rep_importer = ReplayImporter() + rep_importer.set_highlighting_level(HighlightLevel.DATABASE) + + processor.register_importer(rep_importer) + + processor.process(REP_TEST1_PATH, ds, True) + + with ds.session_scope(): + all_results = ds.session.query(ds.db_classes.Extraction).all() + + assert len(all_results) == 56 + + state_entry = ds.session.query(ds.db_classes.State).first() + + extractions_for_state = ( + ds.session.query(ds.db_classes.Extraction) + .filter(ds.db_classes.Extraction.entry_id == state_entry.state_id) + .all() + ) + + assert len(extractions_for_state) == 7 + + extractions = [ + sqlalchemy_obj_to_dict(item, remove_id=True) for item in extractions_for_state + ] + + entry_id = state_entry.state_id + datafile_id = state_entry.source_id + + assert extractions == [ + { + "datafile_id": datafile_id, + "destination_table": "States", + "entry_id": entry_id, + "field": "timestamp", + "importer": "Replay File Format Importer", + "interpreted_value": "2010-01-12 11:58:00", + "text": "100112 115800", + "text_location": "240-253", + }, + { + "datafile_id": datafile_id, + "destination_table": "States", + "entry_id": entry_id, + "field": "vessel name", + "importer": "Replay File Format Importer", + "interpreted_value": "SUBJECT", + "text": "SUBJECT", + "text_location": "254-261", + }, + { + "datafile_id": datafile_id, + "destination_table": "States", + "entry_id": entry_id, + "field": "latitude", + "importer": "Replay File Format Importer", + "interpreted_value": "None, 60.395", + "text": "60 23 40.25 N", + "text_location": "265-278", + }, + { + "datafile_id": datafile_id, + "destination_table": "States", + "entry_id": entry_id, + "field": "longitude", + "importer": "Replay File Format Importer", + "interpreted_value": "0.024, 60.395", + "text": "000 01 25.86 E", + "text_location": "279-293", + }, + { + "datafile_id": datafile_id, + "destination_table": "States", + "entry_id": entry_id, + "field": "heading", + "importer": "Replay File Format Importer", + "interpreted_value": "109.08 degree", + "text": "109.08", + "text_location": "294-300", + }, + { + "datafile_id": datafile_id, + "destination_table": "States", + "entry_id": entry_id, + "field": "speed", + "importer": "Replay File Format Importer", + "interpreted_value": "6.0 knot", + "text": "6.00", + "text_location": "302-306", + }, + { + "datafile_id": datafile_id, + "destination_table": "States", + "entry_id": entry_id, + "field": "depth", + "importer": "Replay File Format Importer", + "interpreted_value": "0.0 meter", + "text": "0.00", + "text_location": "308-312", + }, + ] + + +@patch("pepys_import.core.store.common_db.prompt", return_value="2") +def test_recording_to_database_multiple_files_and_importers(mock): + ds = 
DataStore("", "", "", 0, ":memory:", db_type="sqlite") + ds.initialise() + + processor = FileProcessor(archive=False) + + rep_importer = ReplayImporter() + rep_importer.set_highlighting_level(HighlightLevel.DATABASE) + + rep_com_importer = ReplayCommentImporter() + rep_com_importer.set_highlighting_level(HighlightLevel.DATABASE) + + rep_contact_importer = ReplayContactImporter() + rep_contact_importer.set_highlighting_level(HighlightLevel.DATABASE) + + processor.register_importer(rep_importer) + processor.register_importer(rep_com_importer) + processor.register_importer(rep_contact_importer) + + processor.process(REP_FOLDER_PATH, ds, False) + + with ds.session_scope(): + all_results = ds.session.query(ds.db_classes.Extraction).all() + + assert len(all_results) == 5784 + + grouped_by_datafile = ( + ds.session.query( + ds.db_classes.Datafile.reference, + func.count(ds.db_classes.Extraction.datafile_id), + ) + .group_by(ds.db_classes.Extraction.datafile_id) + .join(ds.db_classes.Datafile) + .all() + ) + + assert set(grouped_by_datafile) == set( + [ + ("sen_tracks.rep", 2296), + ("rep_test1.rep", 119), + ("sen_ssk_freq.dsf", 32), + ("uk_track.rep", 2814), + ("sen_frig_sensor.dsf", 523), + ] + ) + + grouped_by_importer = ( + ds.session.query( + ds.db_classes.Extraction.importer, + func.count(ds.db_classes.Extraction.importer), + ) + .group_by(ds.db_classes.Extraction.importer) + .all() + ) + + assert set(grouped_by_importer) == set( + [ + ("Replay Comment Importer", 22), + ("Replay Contact Importer", 596), + ("Replay File Format Importer", 5166), + ] + ) + + +def test_recording_to_database_single_xml_file(): + ds = DataStore("", "", "", 0, ":memory:", db_type="sqlite") + ds.initialise() + + processor = FileProcessor(archive=False) + + gpx_importer = GPXImporter() + gpx_importer.set_highlighting_level(HighlightLevel.DATABASE) + + processor.register_importer(gpx_importer) + + processor.process(GPX_PATH, ds, True) + + with ds.session_scope(): + all_results = ds.session.query(ds.db_classes.Extraction).all() + + assert len(all_results) == 26 + + state_entry = ds.session.query(ds.db_classes.State).first() + + extractions_for_state = ( + ds.session.query(ds.db_classes.Extraction) + .filter(ds.db_classes.Extraction.entry_id == state_entry.state_id) + .all() + ) + + assert len(extractions_for_state) == 6 + + extractions = [ + sqlalchemy_obj_to_dict(item, remove_id=True) for item in extractions_for_state + ] + + entry_id = state_entry.state_id + datafile_id = state_entry.source_id + + assert extractions == [ + { + "datafile_id": datafile_id, + "destination_table": "States", + "entry_id": entry_id, + "field": "name", + "importer": "GPX Format Importer", + "interpreted_value": "NELSON", + "text": "NELSON", + "text_location": "450-456", + }, + { + "datafile_id": datafile_id, + "destination_table": "States", + "entry_id": entry_id, + "field": "timestamp", + "importer": "GPX Format Importer", + "interpreted_value": "2012-04-27 15:29:38", + "text": "2012-04-27T16:29:38+01:00", + "text_location": "564-589", + }, + { + "datafile_id": datafile_id, + "destination_table": "States", + "entry_id": entry_id, + "field": "location", + "importer": "GPX Format Importer", + "interpreted_value": "-21.698, 22.186", + "text": '\n' "\t\t\t\t", + "text_location": "482-531", + }, + { + "datafile_id": datafile_id, + "destination_table": "States", + "entry_id": entry_id, + "field": "course", + "importer": "GPX Format Importer", + "interpreted_value": "268.7 degree", + "text": "268.7", + "text_location": "613-618", + }, + { + 
"datafile_id": datafile_id, + "destination_table": "States", + "entry_id": entry_id, + "field": "speed", + "importer": "GPX Format Importer", + "interpreted_value": "4.5 meter / second", + "text": "4.5", + "text_location": "643-646", + }, + { + "datafile_id": datafile_id, + "destination_table": "States", + "entry_id": entry_id, + "field": "elevation", + "importer": "GPX Format Importer", + "interpreted_value": "0.0 meter", + "text": "0.000", + "text_location": "538-543", + }, + ] diff --git a/tests/highlighter_tests/test_recording.py b/tests/highlighter_tests/test_recording.py index 33c953601..8656c2317 100644 --- a/tests/highlighter_tests/test_recording.py +++ b/tests/highlighter_tests/test_recording.py @@ -5,7 +5,10 @@ import pytest from pepys_import.file.highlighter.highlighter import HighlightedFile +from pepys_import.file.highlighter.level import HighlightLevel from pepys_import.file.highlighter.support.combine import combine_tokens +from pepys_import.file.highlighter.support.test_utils import FakeDatafile +from pepys_import.file.highlighter.support.utils import merge_adjacent_text_locations PATH = os.path.abspath(__file__) DIR_PATH = os.path.dirname(PATH) @@ -148,10 +151,10 @@ def test_multi_lines(self): data_file.export(os.path.join(OUTPUT_FOLDER, "track_lines.html"), True) - def test_ignored_importers(self): + def test_setting_no_highlighting(self): data_file = HighlightedFile(DATA_FILE) - data_file.ignored_importers.append("Test Importer") + data_file.importer_highlighting_levels["Test Importer"] = HighlightLevel.NONE lines = data_file.lines() @@ -165,6 +168,74 @@ def test_ignored_importers(self): # and therefore the record calls did nothing assert len(data_file.chars) == 0 + def test_setting_no_db_highlighting(self): + hf = HighlightedFile(DATA_FILE) + hf.datafile = FakeDatafile() + + hf.importer_highlighting_levels["Test Importer"] = HighlightLevel.HTML + + lines = hf.lines() + + lines[0].record("Test Importer", "Test", "Test") + + tokens = lines[1].tokens() + + tokens[0].record("Test Importer", "Test", "Test") + + # Assert that no initialisation of the chars array took place + # and therefore the record calls did nothing + assert len(hf.datafile.pending_extracted_tokens) == 0 + + def test_setting_with_db_highlighting(self): + hf = HighlightedFile(DATA_FILE) + hf.datafile = FakeDatafile() + + hf.importer_highlighting_levels["Test Importer"] = HighlightLevel.DATABASE + + lines = hf.lines() + + lines[0].record("Test Importer", "Test", "Test") + + tokens = lines[1].tokens() + + tokens[0].record("Test Importer", "Test", "Test") + + # Assert that no initialisation of the chars array took place + # and therefore the record calls did nothing + assert len(hf.datafile.pending_extracted_tokens) == 2 + + +def test_merge_adjacent_text_locations(): + text_locations = [(5, 10), (15, 20), (21, 36), (40, 50), (51, 72), (73, 100)] + + result = merge_adjacent_text_locations(text_locations) + + assert result == [(5, 10), (15, 36), (40, 100)] + + +def test_merge_adjacent_text_locations_only_one(): + text_locations = [(5, 37)] + + result = merge_adjacent_text_locations(text_locations) + + assert result == [(5, 37)] + + +def test_merge_adjacent_text_locations_all_merged(): + text_locations = [(5, 10), (11, 20), (21, 36), (37, 50), (51, 72), (73, 100)] + + result = merge_adjacent_text_locations(text_locations) + + assert result == [(5, 100)] + + +def test_merge_adjacent_text_locations_no_entries(): + text_locations = [] + + result = merge_adjacent_text_locations(text_locations) + + assert result == [] 
+ if __name__ == "__main__": unittest.main() diff --git a/tests/migration_tests/test_migration.py b/tests/migration_tests/test_migration.py index 824ae080c..81cb1ad3c 100644 --- a/tests/migration_tests/test_migration.py +++ b/tests/migration_tests/test_migration.py @@ -248,14 +248,14 @@ def setUp(self) -> None: # The following dictionaries are going to be used to import datafiles. When the version of # the database is sufficient (if the version is the same with a key in dictionary) self.sqlite_version_datafile_dict = { - "a7f75ead6204": [ + "feb548c7c6c0": [ "nisida_example.txt", "nisida_invalid_header_line.txt", "nisida_split_narrative.txt", ] } self.postgres_version_datafile_dict = { - "4d047dd311ef": [ + "4899e94653f": [ "nisida_example.txt", "nisida_invalid_header_line.txt", "nisida_split_narrative.txt", diff --git a/tests/pexpect_tests/test_pexpect_gpx_import.py b/tests/pexpect_tests/test_pexpect_gpx_import.py new file mode 100644 index 000000000..2dce4b645 --- /dev/null +++ b/tests/pexpect_tests/test_pexpect_gpx_import.py @@ -0,0 +1,109 @@ +# This test uses the pexpect module (based on the expect Unix tool) to run through +# a full import of a GPX file, providing input in response to various prompts, +# and then checking that the right number of states are imported. +# It is a very simple example of a pexpect test, and more could easily be written to test +# other parts of the system +# The benefit of this over the existing end-to-end tests is that this is entirely separate +# to Pepys itself, and does not import any of its code or mock/patch any of its functions +# It deals with Pepys as an independent program which it controls +import os +import re +import sys + +import pexpect +import pytest + +if sys.platform.startswith("win"): + pytest.skip("Skipping pexpect tests on Windows", allow_module_level=True) + + +def test_gpx_import_end_to_end(): + if os.path.exists("pexpect_test.db"): + os.remove("pexpect_test.db") + + child = pexpect.spawn( + "python -m pepys_import.cli --path tests/sample_data/track_files/gpx/gpx_1_0.gpx --db pexpect_test.db", + encoding="utf-8", + ) + + child.logfile = open("pexpect.log", "w") + + # Classification for datafile: Public + child.expect_exact("> ") + child.sendline("2") + + # Create datafile: Yes + child.expect_exact("> ") + child.sendline("1") + + # Select platform: Add new platform + child.expect_exact("> ") + child.sendline("1") + + # Enter a name: (accept default) + child.expect_exact("NELSON") + child.sendline("") + + # Enter pennant or tail number: 123 + child.expect_exact("pennant or tail number") + child.sendline("123") + + # Enter trigraph: (accept default) + child.expect_exact("trigraph (optional)") + child.sendline("") + + # Enter quadgraph: (accept default) + child.expect_exact("quadgraph") + child.sendline("") + + # Select nationality: UK + child.expect_exact("> ") + child.sendline("2") + + # Select platform type: Naval - frigate + child.expect_exact("> ") + child.sendline("3") + + # Select classification: Public + child.expect_exact("> ") + child.sendline("2") + + # Create platform: Yes + child.expect_exact("> ") + child.sendline("1") + + # Sensor not found: Create + child.expect_exact("Sensor 'GPS' on platform 'NELSON' not found.") + child.sendline("1") + + # Enter name: (accept default) + child.expect_exact("Please enter a name") + child.sendline("") + + # Select classification: Public + child.expect_exact("> ") + child.sendline("2") + + # Create sensor: Yes + child.expect_exact("> ") + child.sendline("1") + + # What to import: Metadata 
and measurements + child.expect_exact("Import metadata and measurements") + import_output = child.before + child.sendline("2") + + # Check number of files processed + child.expect_exact("Files got processed: 1 times") + + # Expect end of output + child.expect(pexpect.EOF) + + child.logfile.close() + + child.close() + + # Run a regex on the status output printed out after the import, to check that + # we imported the correct number of States + match = re.search(r"States \ +\|\ +(\d+) ", import_output) + assert int(match.group(1)) == 5 diff --git a/tests/test_admin_cli.py b/tests/test_admin_cli.py index 93b7d2c55..3feb01517 100644 --- a/tests/test_admin_cli.py +++ b/tests/test_admin_cli.py @@ -807,8 +807,7 @@ def setUp(self): port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") try: self.store = DataStore( db_name="test", @@ -820,7 +819,7 @@ def setUp(self): ) self.store.initialise() except OperationalError: - print("Database schema and data population failed! Test is skipping.") + raise Exception("Creating database schema in testing Postgres database failed") def tearDown(self): try: diff --git a/tests/test_constraints.py b/tests/test_constraints.py index 3056132b2..1b0d992a0 100644 --- a/tests/test_constraints.py +++ b/tests/test_constraints.py @@ -284,8 +284,7 @@ def setUp(self): port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") self.store = DataStore( db_name="test", diff --git a/tests/test_data_store_api_postgis.py b/tests/test_data_store_api_postgis.py index 3f8b17ab2..f22af01f1 100644 --- a/tests/test_data_store_api_postgis.py +++ b/tests/test_data_store_api_postgis.py @@ -37,8 +37,7 @@ def setUp(self) -> None: port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") try: self.store = DataStore( db_name="test", @@ -53,8 +52,8 @@ def setUp(self) -> None: self.change_id = self.store.add_to_changes( "TEST", datetime.utcnow(), "TEST" ).change_id - except OperationalError: - print("Database schema and data population failed! Test is skipping.") + except Exception: + raise Exception("Testing postgres server could not be started/accessed") def tearDown(self) -> None: try: @@ -258,8 +257,7 @@ def setUp(self) -> None: port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") try: self.store = DataStore( db_name="test", @@ -275,7 +273,7 @@ def setUp(self) -> None: "TEST", datetime.utcnow(), "TEST" ).change_id except OperationalError: - print("Database schema and data population failed! Test is skipping.") + raise Exception("Creating database schema in testing Postgres database failed") def tearDown(self) -> None: try: @@ -406,8 +404,7 @@ def setUp(self) -> None: port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! 
Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") try: self.store = DataStore( db_name="test", @@ -430,7 +427,7 @@ def setUp(self) -> None: ).name self.privacy = self.store.add_to_privacies("test_privacy", 0, self.change_id).name except OperationalError: - print("Database schema and data population failed! Test is skipping.") + raise Exception("Creating database schema in testing Postgres database failed") def tearDown(self) -> None: try: @@ -637,8 +634,7 @@ def setUp(self) -> None: port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") try: self.store = DataStore( db_name="test", @@ -653,7 +649,7 @@ def setUp(self) -> None: self.store.populate_reference(TEST_DATA_PATH) self.store.populate_metadata(TEST_DATA_PATH) except OperationalError: - print("Database schema and data population failed! Test is skipping.") + raise Exception("Creating database schema in testing Postgres database failed") def tearDown(self) -> None: try: @@ -718,8 +714,7 @@ def setUp(self) -> None: port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") try: self.store = DataStore( db_name="test", @@ -754,7 +749,7 @@ def setUp(self) -> None: ) self.store.session.expunge(self.platform) except OperationalError: - print("Database schema and data population failed! Test is skipping.") + raise Exception("Creating database schema in testing Postgres database failed") def tearDown(self) -> None: try: @@ -831,8 +826,7 @@ def setUp(self) -> None: port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") try: self.store = DataStore( db_name="test", @@ -879,7 +873,7 @@ def setUp(self) -> None: self.store.session.expunge(self.file) self.store.session.expunge(self.comment_type) except OperationalError: - print("Database schema and data population failed! Test is skipping.") + raise Exception("Creating database schema in testing Postgres database failed") class TestParser(Importer): def __init__( @@ -1018,8 +1012,7 @@ def setUp(self) -> None: port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") def tearDown(self) -> None: try: diff --git a/tests/test_data_store_clear_db.py b/tests/test_data_store_clear_db.py index f5c7ee6d8..6effd175b 100644 --- a/tests/test_data_store_clear_db.py +++ b/tests/test_data_store_clear_db.py @@ -8,6 +8,7 @@ from pepys_import.core.store.data_store import DataStore +@pytest.mark.postgres class DataStoreClearContentsPostGISDBTestCase(TestCase): def setUp(self): self.store = None @@ -20,7 +21,7 @@ def setUp(self): port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") + raise Exception("Creating database schema in testing Postgres database failed") def tearDown(self): try: @@ -108,7 +109,7 @@ def setUp(self): port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! 
Test is skipping.") + raise Exception("Creating database schema in testing Postgres database failed") def tearDown(self): try: diff --git a/tests/test_data_store_export_datafile.py b/tests/test_data_store_export_datafile.py index b378b4a38..113282646 100644 --- a/tests/test_data_store_export_datafile.py +++ b/tests/test_data_store_export_datafile.py @@ -27,7 +27,7 @@ def setUp(self): port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") + raise Exception("Creating database schema in testing Postgres database failed") def tearDown(self) -> None: if os.path.exists(self.path): diff --git a/tests/test_data_store_initialise.py b/tests/test_data_store_initialise.py index 57c579507..7838e32c6 100644 --- a/tests/test_data_store_initialise.py +++ b/tests/test_data_store_initialise.py @@ -22,7 +22,7 @@ def setUp(self): port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") + raise Exception("Creating database schema in testing Postgres database failed") def tearDown(self): try: diff --git a/tests/test_data_store_populate.py b/tests/test_data_store_populate.py index 37b981a19..0f91edd75 100644 --- a/tests/test_data_store_populate.py +++ b/tests/test_data_store_populate.py @@ -156,8 +156,7 @@ def setUp(self) -> None: port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") try: self.store = DataStore( db_name="test", @@ -169,7 +168,7 @@ def setUp(self) -> None: ) self.store.initialise() except OperationalError: - print("Database schema and data population failed! Test is skipping.") + raise Exception("Creating database schema in testing Postgres database failed") def tearDown(self) -> None: try: diff --git a/tests/test_datastore_utils.py b/tests/test_datastore_utils.py index fff3d1207..d39384761 100644 --- a/tests/test_datastore_utils.py +++ b/tests/test_datastore_utils.py @@ -249,8 +249,7 @@ def setUp(self): port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") try: self.store = DataStore( db_name="test", @@ -262,7 +261,7 @@ def setUp(self): ) self.store.initialise() except Exception: - print("Database schema and data population failed! Test is skipping.") + raise Exception("Creating database schema in testing Postgres database failed") def tearDown(self): try: @@ -354,8 +353,7 @@ def setUp(self): port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") try: self.store = DataStore( db_name="test", @@ -367,7 +365,7 @@ def setUp(self): ) self.store.initialise() except Exception: - print("Database schema and data population failed! Test is skipping.") + raise Exception("Creating database schema in testing Postgres database failed") def tearDown(self): try: diff --git a/tests/test_import_cli.py b/tests/test_import_cli.py index 6449ecff8..40049a889 100644 --- a/tests/test_import_cli.py +++ b/tests/test_import_cli.py @@ -80,8 +80,7 @@ def setUp(self): port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! 
Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") try: self.store = DataStore( db_name="test", @@ -93,7 +92,7 @@ def setUp(self): ) self.store.initialise() except OperationalError: - print("Database schema and data population failed! Test is skipping.") + raise Exception("Creating database schema in testing Postgres database failed") def tearDown(self): try: @@ -142,8 +141,7 @@ def setUp(self): port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") try: self.store = DataStore( db_name="test", @@ -155,7 +153,7 @@ def setUp(self): ) self.store.initialise() except OperationalError: - print("Database schema and data population failed! Test is skipping.") + raise Exception("Creating database schema in testing Postgres database failed") def tearDown(self): try: @@ -163,34 +161,6 @@ def tearDown(self): except AttributeError: return - @patch("pepys_import.utils.error_handling.custom_print_formatted_text", side_effect=side_effect) - def test_import_with_wrong_type_db_field(self, patched_print): - conn = pg8000.connect(user="postgres", password="postgres", database="test", port=55527) - cursor = conn.cursor() - # Alter table to change heading column to be a timestamp - cursor.execute( - 'ALTER TABLE pepys."States" ALTER COLUMN heading SET DATA TYPE character varying(150);' - ) - - conn.commit() - conn.close() - - temp_output = StringIO() - with redirect_stdout(temp_output): - db_config = { - "name": "test", - "host": "localhost", - "username": "postgres", - "password": "postgres", - "port": 55527, - "type": "postgres", - } - - process(path=DATA_PATH, archive=False, db=db_config, resolver="default") - output = temp_output.getvalue() - - assert "ERROR: SQL error when communicating with database" in output - @patch("pepys_import.cli.DefaultResolver") def test_process_resolver_specification_default(patched_default_resolver): @@ -452,8 +422,7 @@ def setUp(self): port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") try: self.store = DataStore( db_name="test", @@ -465,7 +434,7 @@ def setUp(self): ) self.store.initialise() except OperationalError: - print("Database schema and data population failed! Test is skipping.") + raise Exception("Creating database schema in testing Postgres database failed") def tearDown(self): try: diff --git a/tests/test_import_cli_end_to_end.py b/tests/test_import_cli_end_to_end.py index 98a24dbaf..aee340cbc 100644 --- a/tests/test_import_cli_end_to_end.py +++ b/tests/test_import_cli_end_to_end.py @@ -236,8 +236,7 @@ def setup_method(self, test_method): port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") try: self.store = DataStore( db_name="test", @@ -250,7 +249,7 @@ def setup_method(self, test_method): ) self.store.initialise() except OperationalError: - print("Database schema and data population failed! 
Test is skipping.") + raise Exception("Creating database schema in testing Postgres database failed") with self.store.session_scope(): self.store.populate_reference() diff --git a/tests/test_importers.py b/tests/test_importers.py index 7c48e2732..0bf6fa02d 100644 --- a/tests/test_importers.py +++ b/tests/test_importers.py @@ -17,6 +17,7 @@ from pepys_import.core.store.data_store import DataStore from pepys_import.core.validators import constants as validation_constants from pepys_import.file.file_processor import FileProcessor +from pepys_import.file.highlighter.level import HighlightLevel from pepys_import.file.importer import Importer from pepys_import.resolvers.command_line_resolver import CommandLineResolver from pepys_import.resolvers.default_resolver import DefaultResolver @@ -522,7 +523,7 @@ def __init__(self): short_name="Test Importer", datafile_type="Importer", ) - self.disable_recording() + self.set_highlighting_level(HighlightLevel.NONE) def can_load_this_header(self, header) -> bool: return True @@ -537,7 +538,78 @@ def can_load_this_file(self, file_contents): return True def _load_this_file(self, data_store, path, file_object, datafile, change_id): - assert file_object.ignored_importers == ["Test Importer"] + assert "Test Importer" in file_object.importer_highlighting_levels + assert ( + file_object.importer_highlighting_levels["Test Importer"] == HighlightLevel.NONE + ) + + processor = FileProcessor() + + processor.register_importer(TestImporter()) + processor.process(DATA_PATH, None, False) + + def test_record_to_database(self): + class TestImporter(Importer): + def __init__(self): + super().__init__( + name="Test Importer", + validation_level=validation_constants.BASIC_LEVEL, + short_name="Test Importer", + datafile_type="Importer", + ) + self.set_highlighting_level(HighlightLevel.DATABASE) + + def can_load_this_header(self, header) -> bool: + return True + + def can_load_this_filename(self, filename): + return True + + def can_load_this_type(self, suffix): + return True + + def can_load_this_file(self, file_contents): + return True + + def _load_this_file(self, data_store, path, file_object, datafile, change_id): + assert "Test Importer" in file_object.importer_highlighting_levels + assert ( + file_object.importer_highlighting_levels["Test Importer"] + == HighlightLevel.DATABASE + ) + + processor = FileProcessor() + + processor.register_importer(TestImporter()) + processor.process(DATA_PATH, None, False) + + def test_default_recording_level(self): + class TestImporter(Importer): + def __init__(self): + super().__init__( + name="Test Importer", + validation_level=validation_constants.BASIC_LEVEL, + short_name="Test Importer", + datafile_type="Importer", + ) + + def can_load_this_header(self, header) -> bool: + return True + + def can_load_this_filename(self, filename): + return True + + def can_load_this_type(self, suffix): + return True + + def can_load_this_file(self, file_contents): + return True + + def _load_this_file(self, data_store, path, file_object, datafile, change_id): + assert "Test Importer" in file_object.importer_highlighting_levels + assert ( + file_object.importer_highlighting_levels["Test Importer"] == HighlightLevel.HTML + ) processor = FileProcessor() diff --git a/tests/test_importers_postgis.py b/tests/test_importers_postgis.py index 0f5fcf251..4408892d5 100644 --- a/tests/test_importers_postgis.py +++ b/tests/test_importers_postgis.py @@ -30,8 +30,7 @@ def setUp(self) -> None: port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't 
be created! Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") try: self.store = DataStore( db_name="test", @@ -43,7 +42,7 @@ def setUp(self) -> None: ) self.store.initialise() except OperationalError: - print("Database schema and data population failed! Test is skipping.") + raise Exception("Creating database schema in testing Postgres database failed") def tearDown(self) -> None: try: diff --git a/tests/test_merging.py b/tests/test_merging.py index 5a88edfba..63e4dff42 100644 --- a/tests/test_merging.py +++ b/tests/test_merging.py @@ -999,6 +999,10 @@ def setUp(self): sqlalchemy_obj_to_dict(item, remove_id=True) for item in results ] + self.master_extractions_count = self.master_store.session.query( + self.master_store.db_classes.Extraction.extraction_id + ).count() + # Import two files into slave processor = FileProcessor(archive=False) processor.load_importers_dynamically() @@ -1089,6 +1093,25 @@ def test_merge_state_from_import(self): ) assert len(results) == len(self.master_gpx_states) + # Check that the extractions copied over properly and didn't duplicate + slave_new_extractions_count = ( + self.slave_store.session.query( + self.slave_store.db_classes.Extraction.extraction_id + ) + .join(self.slave_store.db_classes.Datafile) + .filter(self.slave_store.db_classes.Datafile.reference == "uk_track.rep") + .count() + ) + + after_merge_extractions_count = self.master_store.session.query( + self.master_store.db_classes.Extraction.extraction_id + ).count() + + assert ( + after_merge_extractions_count + == self.master_extractions_count + slave_new_extractions_count + ) + @pytest.mark.postgres class TestMergeStateFromImport_Postgres(unittest.TestCase): @@ -1106,8 +1129,7 @@ def setUp(self): port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") self.master_store = DataStore( db_name="test", @@ -2377,8 +2399,7 @@ def setUp(self, patched_input, patched_iterfzf): port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") self.master_store = DataStore( db_name="test", diff --git a/tests/test_spatial_data.py b/tests/test_spatial_data.py index 698e97ff4..541ede784 100644 --- a/tests/test_spatial_data.py +++ b/tests/test_spatial_data.py @@ -97,8 +97,7 @@ def setUp(self): port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") try: self.store = DataStore( db_name="test", @@ -131,7 +130,7 @@ def setUp(self): state.location = loc self.store.session.add(state) except OperationalError: - print("Database schema and data population failed! Test is skipping.") + raise Exception("Creating database schema in testing Postgres database failed") def tearDown(self): try: diff --git a/tests/test_tasks_and_participants.py b/tests/test_tasks_and_participants.py index 2172e2232..193cb6d03 100644 --- a/tests/test_tasks_and_participants.py +++ b/tests/test_tasks_and_participants.py @@ -333,8 +333,7 @@ def setUp(self): port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! 
Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") self.store = DataStore( db_name="test", diff --git a/tests/test_timezones.py b/tests/test_timezones.py index 0df40d423..df4789dde 100644 --- a/tests/test_timezones.py +++ b/tests/test_timezones.py @@ -45,8 +45,7 @@ def setUp(self): port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") self.postgres_store = DataStore( db_name="test", @@ -95,8 +94,7 @@ def setUp(self): port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") self.postgres_store = DataStore( db_name="test", diff --git a/tests/test_view_data_cli.py b/tests/test_view_data_cli.py index 0f2171e4e..cd44eebeb 100644 --- a/tests/test_view_data_cli.py +++ b/tests/test_view_data_cli.py @@ -194,8 +194,7 @@ def setUp(self) -> None: port=55527, ) except RuntimeError: - print("PostgreSQL database couldn't be created! Test is skipping.") - return + raise Exception("Testing Postgres server could not be started/accessed") try: self.store = DataStore( db_name="test", @@ -207,7 +206,7 @@ def setUp(self) -> None: ) self.store.initialise() except OperationalError: - print("Database schema and data population failed! Test is skipping.") + raise Exception("Creating database schema in testing Postgres database failed") # Parse the REP files processor = FileProcessor(archive=False) diff --git a/tests/test_xml_parser.py b/tests/test_xml_parser.py index ee56bf41c..c1ece2c84 100644 --- a/tests/test_xml_parser.py +++ b/tests/test_xml_parser.py @@ -89,6 +89,11 @@ def _check_element_record(el, file_contents): _check_element_record(child_el, file_contents) +class FakeDatafile: + def __init__(self): + self.pending_extracted_tokens = [] + + # To test with unicode we need to actually call the record method # and then get the locations it's used out from there def test_parser_record_works_correctly_unicode(): @@ -96,6 +101,7 @@ def test_parser_record_works_correctly_unicode(): file_contents = f.read() hf = HighlightedFile(UNICODE_GPX_PATH) + hf.datafile = FakeDatafile() doc = parse(UNICODE_GPX_PATH, hf) @@ -107,6 +113,7 @@ def test_parser_record_works_correctly_ascii(): file_contents = f.read() hf = HighlightedFile(ASCII_GPX_PATH) + hf.datafile = FakeDatafile() doc = parse(ASCII_GPX_PATH, hf)