From 9436b070a39a4e749d2ed98ffcacdc7b2333da1b Mon Sep 17 00:00:00 2001 From: luis guilherme Date: Fri, 16 Feb 2024 17:52:24 -0300 Subject: [PATCH] adding filter by department on DOU Search --- src/dou_dag_generator.py | 3 +++ src/dou_hook.py | 2 ++ src/parsers.py | 3 +++ src/searchers.py | 22 ++++++++++++++++++---- 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/src/dou_dag_generator.py b/src/dou_dag_generator.py index 0597d49..39fd5a3 100755 --- a/src/dou_dag_generator.py +++ b/src/dou_dag_generator.py @@ -171,6 +171,7 @@ def create_dag(self, specs: DAGConfig, config_file: str) -> DAG: 'is_exact_search': specs.is_exact_search, 'ignore_signature_match': specs.ignore_signature_match, 'force_rematch': specs.force_rematch, + 'department': specs.department, 'result_as_email': result_as_html(specs), }, ) @@ -216,6 +217,7 @@ def perform_searches( ignore_signature_match: bool, force_rematch: bool, result_as_email: bool, + department: str, **context) -> dict: """Performs the search in each source and merge the results """ @@ -232,6 +234,7 @@ def perform_searches( is_exact_search, ignore_signature_match, force_rematch, + department, get_trigger_date(context, local_time = True)) if 'QD' in sources: diff --git a/src/dou_hook.py b/src/dou_hook.py index 92c3ed3..de7056d 100644 --- a/src/dou_hook.py +++ b/src/dou_hook.py @@ -197,6 +197,7 @@ def search_text( if search_results: for content in search_results: + #if "Ministério da Gestão e da Inovação em Serviços Públicos" in content["hierarchyList"]: item = {} item["section"] = content["pubName"].lower() item["title"] = content["title"] @@ -205,6 +206,7 @@ def search_text( item["date"] = content["pubDate"] item["id"] = content["classPK"] item["display_date_sortable"] = content["displayDateSortable"] + item["hierarchyList"] = content["hierarchyList"] all_results.append(item) diff --git a/src/parsers.py b/src/parsers.py index d17dde3..35eec55 100644 --- a/src/parsers.py +++ b/src/parsers.py @@ -35,6 +35,7 @@ class DAGConfig: doc_md: str dag_tags: Set[str] owner: str + department: str class FileParser(ABC): """Abstract class to build file parsers with DAG configuration. @@ -111,6 +112,7 @@ def _parse_yaml(self) -> DAGConfig: is_exact_search = search.get('is_exact_search', True) ignore_signature_match = search.get('ignore_signature_match', False) force_rematch = search.get('force_rematch', None) + department = search.get('department', None) schedule = self._get_safe_schedule(dag, self.DEFAULT_SCHEDULE) doc_md = dag.get('doc_md', None) if doc_md: @@ -148,6 +150,7 @@ def _parse_yaml(self) -> DAGConfig: doc_md=doc_md, dag_tags=set(dag_tags), owner=owner, + department=department, ) def _get_terms_params(self, search) -> Tuple[List[str], str, str]: diff --git a/src/searchers.py b/src/searchers.py index 54b0742..567235b 100644 --- a/src/searchers.py +++ b/src/searchers.py @@ -116,7 +116,8 @@ def exec_search(self, is_exact_search: bool, ignore_signature_match: bool, force_rematch: bool, - reference_date: datetime): + reference_date: datetime, + department: str): search_results = self._search_all_terms( self._cast_term_list(term_list), dou_sections, @@ -125,7 +126,8 @@ def exec_search(self, field, is_exact_search, ignore_signature_match, - force_rematch) + force_rematch, + department) return self._group_results(search_results, term_list) @@ -137,7 +139,8 @@ def _search_all_terms(self, field, is_exact_search, ignore_signature_match, - force_rematch) -> dict: + force_rematch, + department) -> dict: search_results = {} for search_term in term_list: logging.info('Starting search for term: %s', search_term) @@ -157,7 +160,8 @@ def _search_all_terms(self, results = [r for r in results if self._really_matched(search_term, r.get('abstract'))] - + self._department_matched(results, department) + self._render_section_descriptions(results) self._add_standard_highlight_formatting(results) @@ -240,6 +244,16 @@ def _is_signature(self, search_term: str, abstract: str) -> bool: # ' JOSÉ `ANTONIO DE OLIVEIRA` MATOS' norm_abstract_without_start_name.startswith(norm_term)) ) + + def _department_matched(self, results: list, department: str) -> list: + """Verifica se o termo encontrado pela API realmente é igual ao + órgão de busca. Esta função é útil para filtrar resultados + retornardos pela API, mas que são específicas do órgão. + """ + + for result in results[:]: + if not department in result["hierarchyList"]: + results.remove(result) def _get_prior_and_matched_name(self, raw_html: str) -> Tuple[str, str]: groups = self.SPLIT_MATCH_RE.match(raw_html).groups()