From b7bef1ad35a6a25fa9ca40fc6932ee8c91cd5805 Mon Sep 17 00:00:00 2001 From: Simen Fivelstad Smaaberg <66635118+simensma-fresh@users.noreply.github.com> Date: Wed, 23 Oct 2024 19:58:13 +0000 Subject: [PATCH] MDS-6103 Reverted permit changes --- .../mine/Permit/PermitConditions.tsx | 1 - .../azure_document_intelligence_converter.py | 117 +----------------- .../filter_conditions_paragraphs.py | 17 +-- 3 files changed, 3 insertions(+), 132 deletions(-) diff --git a/services/core-web/src/components/mine/Permit/PermitConditions.tsx b/services/core-web/src/components/mine/Permit/PermitConditions.tsx index 6f0a3f291e..17e92a0e00 100644 --- a/services/core-web/src/components/mine/Permit/PermitConditions.tsx +++ b/services/core-web/src/components/mine/Permit/PermitConditions.tsx @@ -90,7 +90,6 @@ const PermitConditions: FC = ({ return Promise.resolve(); }; - console.log(permitExtraction) if (isLoading) { return ; } diff --git a/services/permits/app/permit_conditions/converters/azure_document_intelligence_converter.py b/services/permits/app/permit_conditions/converters/azure_document_intelligence_converter.py index 021737b6d7..d6b046619a 100644 --- a/services/permits/app/permit_conditions/converters/azure_document_intelligence_converter.py +++ b/services/permits/app/permit_conditions/converters/azure_document_intelligence_converter.py @@ -66,25 +66,16 @@ def run( result = self.run_document_intelligence(file_path) if DEBUG_MODE: + self.write_to_cache(cache_key, result) docs = [] - paragraphs = result.paragraphs - - # for table in result.tables: - # paragraphs = self.replace_paragraphs_with_table(paragraphs, table) - - result.paragraphs = paragraphs - - for idx, p in enumerate(paragraphs): + for idx, p in enumerate(result.paragraphs): doc = self.add_metadata_to_document(idx, p) docs.append(doc) - with open("debug/azure_document_intelligence_repl.json", "w") as f: - dp = [d.to_dict() for d in result.paragraphs] - json.dump(dp, f, indent=4) permit_condition_csv = _create_csv_representation(docs) return { @@ -149,15 +140,6 @@ def write_to_cache(self, cache_key, result): dp = [d.to_dict() for d in result.paragraphs] json.dump(dp, f, indent=4) - with open("debug/azure_document_intelligence_result.txt", "w") as f: - dp = [d.content for d in result.paragraphs] - f.write("\n".join(dp)) - - with open("debug/azure_document_intelligence_result_tables.json", "w") as f: - dp = [d.to_dict() for d in result.tables] - json.dump(dp, f, indent=4) - - def retrieve_cached_result(self, cache_key): try: with open(f"app/cache/{cache_key}.pickle", "rb") as f: @@ -169,101 +151,6 @@ def retrieve_cached_result(self, cache_key): return result - def replace_paragraphs_with_table(self, paragraphs, table): - # A paragraph has the following structure (accessed as class properties): - # { - # "content": "abc 123", - # "bounding_regions": [ - # { - # "page_number": 48, - # "polygon": [ - # { - # "x": 1.6239, - # "y": 8.0825 - # }, - # { - # "x": 7.2495, - # "y": 8.0924 - # }, - # { - # "x": 7.2485, - # "y": 8.6889 - # }, - # { - # "x": 1.6229, - # "y": 8.679 - # } - # ] - # } - # ], - # } - - # A table has the following structure (accessed as class properties): - # {"cells": [ - # { - # "kind": "content", - or columnHeader - # "row_index": 0, - # "column_index": 0, - # "row_span": 1, - # "column_span": 1, - # "content": "6.", - # "bounding_regions": [ - # { - # "page_number": 11, - # "polygon": [ - # { - # "x": 1.1908, - # "y": 1.1164 - # }, - # { - # "x": 1.6108, - # "y": 1.1164 - # }, - # { - # "x": 1.6045, - # "y": 1.3546 - # }, - # { - # "x": 1.1908, - # "y": 1.3546 - # } - # ] - # } - # ], - #]} - table_paragraphs = [] - for cell in table.cells: - for paragraph in paragraphs: - if cell.bounding_regions[0].polygon == paragraph.bounding_regions[0].polygon: - table_paragraphs.append(paragraph) - - logger.info(f"Found {len(table_paragraphs)} paragraphs that are part of a table") - - if table_paragraphs: - table_data = [["" for _ in range(table.column_count)] for _ in range(table.row_count)] - for cell in table.cells: - row_index = cell.row_index - column_index = cell.column_index - if row_index < table.row_count and column_index < table.column_count: - table_data[row_index][column_index] = cell.content - - table_df = pd.DataFrame(table_data) - table_text = table_df.to_csv(index=False, header=False, sep='\t') - - new_idx = paragraphs.index(table_paragraphs[0]) - - # Create a new paragraph with the table text - new_paragraph = table_paragraphs[0] - new_paragraph.content = table_text - new_paragraph.bounding_regions = table.bounding_regions - - # paragraphs[new_idx] = new_paragraph - # Replace the original paragraphs with the new paragraph - paragraphs = [p for idx, p in enumerate(paragraphs) if p not in table_paragraphs or idx == new_idx] - - return paragraphs - - def _create_csv_representation(docs): content = json.dumps([json.loads(doc.content) for doc in docs]) jsn = pd.read_json(io.StringIO(content)) diff --git a/services/permits/app/permit_conditions/converters/filter_conditions_paragraphs.py b/services/permits/app/permit_conditions/converters/filter_conditions_paragraphs.py index 0f10950411..849741a96f 100644 --- a/services/permits/app/permit_conditions/converters/filter_conditions_paragraphs.py +++ b/services/permits/app/permit_conditions/converters/filter_conditions_paragraphs.py @@ -65,34 +65,19 @@ def run( def filter_paragraphs(paragraphs): - - with open("debug/filter_conditions_before.json", "w") as f: - cnt = [{"meta": d.meta, "content": d.content} for d in paragraphs] - f.write(json.dumps(cnt, indent=4)) - # Filter out paragraphs that are part of the page header paragraphs, max_page_header_y = _exclude_paragraphs_overlapping_page_header( paragraphs ) - with open("debug/filter_conditions_after_header.json", "w") as f: - cnt = [{"meta": d.meta, "content": d.content} for d in paragraphs] - f.write(json.dumps(cnt, indent=4)) # Filter out paragraphs that are not part of the conditions section paragraphs = _exclude_paragraphs_not_in_conditions_section(paragraphs) - with open("debug/filter_conditions_after_conditions_section.json", "w") as f: - cnt = [{"meta": d.meta, "content": d.content} for d in paragraphs] - f.write(json.dumps(cnt, indent=4)) # Filter out paragraphs that are not paragraphs paragraphs = _exclude_paragraphs_with_non_paragraph_roles( paragraphs, max_page_header_y ) - with open("debug/filter_conditions_after_non_paragraph_roles.json", "w") as f: - cnt = [{"meta": d.meta, "content": d.content} for d in paragraphs] - f.write(json.dumps(cnt, indent=4)) - logger.info( f"Found {len(paragraphs)} paragraphs after filtering, {max_page_header_y}" ) @@ -208,4 +193,4 @@ def _identify_bottom_of_first_page_header(paragraphs): and is_like_page_header ): return paragraphs[page_header_end_idx].meta["bounding_box"]["bottom"] - return None + return None \ No newline at end of file