Merge pull request #1045 from debrief/develop
New release
IanMayo authored Sep 29, 2021
2 parents 630df0d + 4ee66a0 commit 7b294aa
Showing 62 changed files with 1,363 additions and 326 deletions.
2 changes: 1 addition & 1 deletion .isort.cfg
@@ -4,4 +4,4 @@ include_trailing_comma = true
force_grid_wrap = 0
use_parentheses = true
line_length = 100
known_third_party =alembic,dateutil,flask,flask_cachebuster,flask_cors,freezegun,geoalchemy2,geopy,halo,iterfzf,loguru,pg8000,pint,prompt_toolkit,psycopg2,pyfiglet,pygments,pytest,setuptools,shapely,sqlalchemy,sqlalchemy_utils,tabulate,testing,tqdm,twisted,waitress
known_third_party =alembic,dateutil,flask,flask_cachebuster,flask_cors,freezegun,geoalchemy2,geopy,halo,iterfzf,loguru,pexpect,pg8000,pint,prompt_toolkit,psycopg2,pyfiglet,pygments,pytest,setuptools,shapely,sqlalchemy,sqlalchemy_utils,tabulate,testing,tqdm,twisted,waitress
7 changes: 6 additions & 1 deletion HISTORY.rst
@@ -2,14 +2,19 @@
History
=======

0.0.29 (2021-09-29)
-------------------

* Record extractions to database for future data audit
* Performance improvement in generating table summaries
* Performance improvement in opening some tables in Maintenance GUI

0.0.28 (2021-09-24)
-------------------

* Minor fixes for timeline (-100%, include_in_timeline, default_interval)
* Fix error for viewing Platform Entry in GUI
* Improve speed of generating table summaries (affects import performance)

0.0.27 (2021-09-16)
-------------------
2 changes: 1 addition & 1 deletion alembic.ini
@@ -91,4 +91,4 @@ format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S

[alembic:exclude]
tables = alembic_version,ElementaryGeometries,spatial_ref_sys,spatial_ref_sys_aux,spatialite_history,sql_statements_log,sqlite_sequence,SpatialIndex
tables = alembic_version,ElementaryGeometries,spatial_ref_sys,spatial_ref_sys_aux,spatialite_history,sql_statements_log,sqlite_sequence,SpatialIndex,KNN,data_licenses
21 changes: 15 additions & 6 deletions docs/importer_dev_guide.rst
@@ -159,12 +159,14 @@ files showing which parts of the file have been used to extract each individual
part field in the created measurements, and tracks the extraction in the
database to help understand data provenance.

But this token highlighting does come with a performance cost. For high volume
file types that are tightly structured, with little room for misinterpretation,
the overhead may not be justified. In these circumstances, a call to
:code:`self.disable_recording()` in the :code:`__init__` method will turn off
the extraction highlighting for this importer, and significantly speed up the processing
of large files.
But this token highlighting and database recording does come with
a performance cost. For high-volume file types that are tightly
structured, with little room for misinterpretation, the overhead
may not be justified. You can configure the level of extraction that will
take place by calling :code:`self.set_highlighting_level()` in the :code:`__init__` method.
Three different values can be passed to this function: :code:`HighlightLevel.NONE` turns off all extraction
and highlighting, :code:`HighlightLevel.HTML` records extractions to an HTML file but not to the database, and
:code:`HighlightLevel.DATABASE` records to both an HTML file and the database. The default is :code:`HighlightLevel.HTML`.

Similarly, it may be justified to capture extraction data in the early stages of
developing/maintaining the parser for a new file format, with level of
@@ -220,6 +222,13 @@ one extraction can be recorded from disparate data in the file. For example:
combine_tokens(long_degrees_token, lat_degrees_token).record(
self.name, "location", state.location, "decimal degrees")
Once token extractions have been recorded using the :code:`record` method, the recorded information
must be linked to the appropriate measurement object (State/Contact/Comment, etc.) and prepared for saving
to the database. This is done with the :code:`Datafile.flush_extracted_tokens` method, which should be called once
all the data has been loaded for a specific measurement object. Usually this will be at the end of the
:meth:`~pepys_import.file.importer._load_this_line` method, or at the end of a loop inside the
:meth:`~pepys_import.file.importer._load_this_file` method, but for complex importers it may be elsewhere.
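
As a concrete sketch (abridged from a typical :code:`_load_this_line`
implementation, with illustrative token and field names):

    # ... parse the line and record each extracted token ...
    state.course = course
    course_token.record(self.name, "course", course)

    # All data for this State has now been loaded, so link the recorded
    # extractions to it and queue them for saving to the database
    datafile.flush_extracted_tokens()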


Creating measurement objects
############################
2 changes: 2 additions & 0 deletions importers/aircraft_csv_format_importer.py
@@ -149,6 +149,8 @@ def _load_this_line(self, data_store, line_number, line, datafile, change_id):
state.course = course
course_token.record(self.name, "course", course)

datafile.flush_extracted_tokens()

@staticmethod
def parse_timestamp(date, time):
format_str = "%d/%m/%Y "
2 changes: 2 additions & 0 deletions importers/e_trac_importer.py
@@ -138,6 +138,8 @@ def _load_this_line(self, data_store, line_number, line, datafile, change_id):
state.speed = speed
speed_token.record(self.name, "speed", speed)

datafile.flush_extracted_tokens()

@staticmethod
def name_for(token):
# split into two
2 changes: 2 additions & 0 deletions importers/eag_importer.py
@@ -175,6 +175,8 @@ def _load_this_file(self, data_store, path, file_object, datafile, change_id):
state.heading = heading
heading_token.record(self.name, "heading", heading)

datafile.flush_extracted_tokens()

def get_previous_sunday(self, date_of_recording_str):
format_str = "%Y%m%d"
date_of_recording = datetime.datetime.strptime(date_of_recording_str, format_str)
2 changes: 2 additions & 0 deletions importers/gpx_importer.py
@@ -148,6 +148,8 @@ def _load_this_file(self, data_store, path, file_object, datafile, change_id):
if elevation_valid:
state.elevation = elevation

datafile.flush_extracted_tokens()

def get_child_and_text_if_exists(self, element, search_string):
child = element.find(search_string)
if child is not None:
5 changes: 5 additions & 0 deletions importers/nisida_importer.py
@@ -7,6 +7,7 @@
from pepys_import.core.formats import unit_registry
from pepys_import.core.formats.location import Location
from pepys_import.core.validators import constants
from pepys_import.file.highlighter.level import HighlightLevel
from pepys_import.file.highlighter.support.combine import combine_tokens
from pepys_import.file.importer import CANCEL_IMPORT, Importer
from pepys_import.utils.sqlalchemy_utils import get_lowest_privacy
@@ -48,6 +49,8 @@ def __init__(self):
self.year = None
self.platform = None

self.set_highlighting_level(HighlightLevel.DATABASE)

def can_load_this_type(self, suffix):
return suffix.upper() == ".TXT"

@@ -129,6 +132,7 @@ def _load_this_line(self, data_store, line_number, line, datafile, change_id):
else:
self.last_entry_with_text.remarks = self.last_entry_with_text.remarks + text_to_add
line.record(self.name, "comment text", text_to_add)
datafile.flush_extracted_tokens()
elif len(line.text) > 7 and line.text[7] == "/" and line.text[0:5].isdigit():
# Check whether line starts with something like "311206Z/" (a timestamp and a slash)
# Checking like this is faster than using regular expressions on each line
@@ -183,6 +187,7 @@ def _load_this_line(self, data_store, line_number, line, datafile, change_id):
}
)
return
datafile.flush_extracted_tokens()
else:
# Not a line we recognise, so just skip to next one
return
2 changes: 2 additions & 0 deletions importers/nmea_importer.py
@@ -192,6 +192,8 @@ def _load_this_line(self, data_store, line_number, line, datafile, change_id):
self.location = None
self.depth = None

datafile.flush_extracted_tokens()

@staticmethod
def parse_timestamp(date, time):
if len(date) == 6:
2 changes: 2 additions & 0 deletions importers/replay_comment_importer.py
@@ -96,3 +96,5 @@ def _load_this_line(self, data_store, line_number, line, datafile, change_id):
comment_type=comment_type,
parser_name=self.short_name,
)

datafile.flush_extracted_tokens()
2 changes: 2 additions & 0 deletions importers/replay_contact_importer.py
@@ -260,3 +260,5 @@ def _load_this_line(self, data_store, line_number, line, datafile, change_id):
if ambig_bearing_valid:
ambig_bearing_token.record(self.name, "ambig bearing", ambig_bearing)
contact.ambig_bearing = ambig_bearing

datafile.flush_extracted_tokens()
9 changes: 7 additions & 2 deletions importers/replay_importer.py
@@ -15,8 +15,11 @@ def __init__(self):
self.depth = 0.0

# Example: Uncomment this line to turn off recording of extractions
# for this importer
# self.disable_recording()
# for this importer:
# self.set_highlighting_level(HighlightLevel.NONE)
# or to turn on database recording:
# self.set_highlighting_level(HighlightLevel.DATABASE)
# (default is HTML recording)

def can_load_this_type(self, suffix):
return suffix.upper() == ".REP" or suffix.upper() == ".DSF"
@@ -66,6 +69,8 @@ def _load_this_line(self, data_store, line_number, line, datafile, change_id):
state.speed = rep_line.speed
state.location = rep_line.get_location()

datafile.flush_extracted_tokens()

@staticmethod
def degrees_for(degs, mins, secs, hemi: str):
if hemi.upper() == "S" or hemi.upper() == "W":
8 changes: 2 additions & 6 deletions migrations/env.py
@@ -194,8 +194,6 @@ def process_revision_directives(context_, revision, directives):
context.run_migrations()
else:
# Turn off the enforcement of foreign key constraints before running the migration
connection.execute(text("PRAGMA foreign_keys=OFF;"))
connection.commit()
context.configure(
connection=connection,
target_metadata=target_metadata,
@@ -205,11 +203,9 @@ def process_revision_directives(context_, revision, directives):
compare_type=special_compare_type,
)
with context.begin_transaction():
connection.execute(text("PRAGMA foreign_keys=OFF;"))
context.run_migrations()

# Turn on the enforcement of foreign key constraints after the migration is done
connection.execute(text("PRAGMA foreign_keys=ON;"))
connection.commit()
connection.execute(text("PRAGMA foreign_keys=ON;"))


if context.is_offline_mode():
4 changes: 2 additions & 2 deletions migrations/latest_revisions.json
@@ -1,4 +1,4 @@
{
"LATEST_POSTGRES_VERSION": "c16bbfed85dc",
"LATEST_SQLITE_VERSION": "a7f75ead6204"
"LATEST_SQLITE_VERSION": "feb548c7c6c0",
"LATEST_POSTGRES_VERSION": "4899e94653f1"
}
@@ -0,0 +1,89 @@
"""Alter Extractions table
Revision ID: 4899e94653f1
Revises: bfb29dfcef94
Create Date: 2021-09-24 12:40:23.320197+00:00
"""
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "4899e94653f1"
down_revision = "c16bbfed85dc"
branch_labels = None
depends_on = None


def upgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.add_column(
"Extractions",
sa.Column("destination_table", sa.String(length=150), nullable=True),
schema="pepys",
)
op.add_column(
"Extractions",
sa.Column("entry_id", postgresql.UUID(as_uuid=True), nullable=True),
schema="pepys",
)
op.add_column(
"Extractions",
sa.Column("datafile_id", postgresql.UUID(as_uuid=True), nullable=False),
schema="pepys",
)
op.add_column("Extractions", sa.Column("text", sa.Text(), nullable=False), schema="pepys")
op.add_column(
"Extractions",
sa.Column("text_location", sa.String(length=200), nullable=False),
schema="pepys",
)
op.add_column(
"Extractions", sa.Column("importer", sa.String(length=150), nullable=False), schema="pepys"
)
op.add_column(
"Extractions", sa.Column("interpreted_value", sa.Text(), nullable=False), schema="pepys"
)
op.create_foreign_key(
op.f("fk_Extractions_datafile_id_Datafiles"),
"Extractions",
"Datafiles",
["datafile_id"],
["datafile_id"],
source_schema="pepys",
referent_schema="pepys",
onupdate="cascade",
ondelete="cascade",
)
op.drop_column("Extractions", "chars", schema="pepys")
op.drop_column("Extractions", "table", schema="pepys")
# ### end Alembic commands ###


def downgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.add_column(
"Extractions",
sa.Column("table", sa.VARCHAR(length=150), autoincrement=False, nullable=False),
schema="pepys",
)
op.add_column(
"Extractions",
sa.Column("chars", sa.VARCHAR(length=150), autoincrement=False, nullable=False),
schema="pepys",
)
op.drop_constraint(
op.f("fk_Extractions_datafile_id_Datafiles"),
"Extractions",
schema="pepys",
type_="foreignkey",
)
op.drop_column("Extractions", "interpreted_value", schema="pepys")
op.drop_column("Extractions", "importer", schema="pepys")
op.drop_column("Extractions", "text_location", schema="pepys")
op.drop_column("Extractions", "text", schema="pepys")
op.drop_column("Extractions", "datafile_id", schema="pepys")
op.drop_column("Extractions", "entry_id", schema="pepys")
op.drop_column("Extractions", "destination_table", schema="pepys")
# ### end Alembic commands ###
@@ -0,0 +1,70 @@
"""Alter Extractions table
Revision ID: feb548c7c6c0
Revises: a7f75ead6204
Create Date: 2021-09-20 12:38:20.179908+00:00
"""
import sqlalchemy as sa
from alembic import op

import pepys_import

# revision identifiers, used by Alembic.
revision = "feb548c7c6c0"
down_revision = "a7f75ead6204"
branch_labels = None
depends_on = None


def upgrade():
# ### commands auto generated by Alembic - please adjust! ###
with op.batch_alter_table("Extractions", schema=None) as batch_op:
batch_op.add_column(sa.Column("destination_table", sa.String(length=150), nullable=True))
batch_op.add_column(
sa.Column(
"entry_id", pepys_import.utils.sqlalchemy_utils.UUIDType(length=16), nullable=True
)
)
batch_op.add_column(
sa.Column(
"datafile_id",
pepys_import.utils.sqlalchemy_utils.UUIDType(length=16),
nullable=False,
)
)
batch_op.add_column(sa.Column("text", sa.Text(), nullable=False))
batch_op.add_column(sa.Column("text_location", sa.String(length=200), nullable=False))
batch_op.add_column(sa.Column("importer", sa.String(length=150), nullable=False))
batch_op.add_column(sa.Column("interpreted_value", sa.Text(), nullable=False))
batch_op.create_foreign_key(
batch_op.f("fk_Extractions_datafile_id_Datafiles"),
"Datafiles",
["datafile_id"],
["datafile_id"],
onupdate="cascade",
ondelete="cascade",
)
batch_op.drop_column("table")
batch_op.drop_column("chars")

# ### end Alembic commands ###


def downgrade():
# ### commands auto generated by Alembic - please adjust! ###
with op.batch_alter_table("Extractions", schema=None) as batch_op:
batch_op.add_column(sa.Column("chars", sa.VARCHAR(length=150), nullable=False))
batch_op.add_column(sa.Column("table", sa.VARCHAR(length=150), nullable=False))
batch_op.drop_constraint(
batch_op.f("fk_Extractions_datafile_id_Datafiles"), type_="foreignkey"
)
batch_op.drop_column("interpreted_value")
batch_op.drop_column("importer")
batch_op.drop_column("text_location")
batch_op.drop_column("text")
batch_op.drop_column("datafile_id")
batch_op.drop_column("entry_id")
batch_op.drop_column("destination_table")

# ### end Alembic commands ###