adding filter by department on DOU Search

gestaogovbr · Feb 16, 2024 · 9436b07 · 9436b07
1 parent 70a45b3
commit 9436b07
Show file tree

Hide file tree

Showing 4 changed files with 26 additions and 4 deletions.
diff --git a/src/dou_dag_generator.py b/src/dou_dag_generator.py
@@ -171,6 +171,7 @@ def create_dag(self, specs: DAGConfig, config_file: str) -> DAG:
                     'is_exact_search': specs.is_exact_search,
                     'ignore_signature_match': specs.ignore_signature_match,
                     'force_rematch': specs.force_rematch,
+                    'department': specs.department,
                     'result_as_email': result_as_html(specs),
                     },
             )
@@ -216,6 +217,7 @@ def perform_searches(
         ignore_signature_match: bool,
         force_rematch: bool,
         result_as_email: bool,
+        department: str,
         **context) -> dict:
         """Performs the search in each source and merge the results
         """
@@ -232,6 +234,7 @@ def perform_searches(
                 is_exact_search,
                 ignore_signature_match,
                 force_rematch,
+                department,
                 get_trigger_date(context, local_time = True))
 
         if 'QD' in sources:

diff --git a/src/dou_hook.py b/src/dou_hook.py
@@ -197,6 +197,7 @@ def search_text(
 
             if search_results:
                 for content in search_results:
+                    #if "Ministério da Gestão e da Inovação em Serviços Públicos" in content["hierarchyList"]:
                     item = {}
                     item["section"] = content["pubName"].lower()
                     item["title"] = content["title"]
@@ -205,6 +206,7 @@ def search_text(
                     item["date"] = content["pubDate"]
                     item["id"] = content["classPK"]
                     item["display_date_sortable"] = content["displayDateSortable"]
+                    item["hierarchyList"] = content["hierarchyList"]
 
                     all_results.append(item)
 

diff --git a/src/parsers.py b/src/parsers.py
@@ -35,6 +35,7 @@ class DAGConfig:
     doc_md: str
     dag_tags: Set[str]
     owner: str
+    department: str
 
 class FileParser(ABC):
     """Abstract class to build file parsers with DAG configuration.
@@ -111,6 +112,7 @@ def _parse_yaml(self) -> DAGConfig:
         is_exact_search = search.get('is_exact_search', True)
         ignore_signature_match = search.get('ignore_signature_match', False)
         force_rematch = search.get('force_rematch', None)
+        department = search.get('department', None)
         schedule = self._get_safe_schedule(dag, self.DEFAULT_SCHEDULE)
         doc_md = dag.get('doc_md', None)
         if doc_md:
@@ -148,6 +150,7 @@ def _parse_yaml(self) -> DAGConfig:
             doc_md=doc_md,
             dag_tags=set(dag_tags),
             owner=owner,
+            department=department,
             )
 
     def _get_terms_params(self, search) -> Tuple[List[str], str, str]:

diff --git a/src/searchers.py b/src/searchers.py
@@ -116,7 +116,8 @@ def exec_search(self,
                     is_exact_search: bool,
                     ignore_signature_match: bool,
                     force_rematch: bool,
-                    reference_date: datetime):
+                    reference_date: datetime,
+                    department: str):
         search_results = self._search_all_terms(
             self._cast_term_list(term_list),
             dou_sections,
@@ -125,7 +126,8 @@ def exec_search(self,
             field,
             is_exact_search,
             ignore_signature_match,
-            force_rematch)
+            force_rematch,
+            department)
 
         return self._group_results(search_results, term_list)
 
@@ -137,7 +139,8 @@ def _search_all_terms(self,
                           field,
                           is_exact_search,
                           ignore_signature_match,
-                          force_rematch) -> dict:
+                          force_rematch,
+                          department) -> dict:
         search_results = {}
         for search_term in term_list:
             logging.info('Starting search for term: %s', search_term)
@@ -157,7 +160,8 @@ def _search_all_terms(self,
                 results = [r for r in results
                            if self._really_matched(search_term,
                                                    r.get('abstract'))]
-
+            self._department_matched(results, department)
+
             self._render_section_descriptions(results)
 
             self._add_standard_highlight_formatting(results)
@@ -240,6 +244,16 @@ def _is_signature(self, search_term: str, abstract: str) -> bool:
                 # ' JOSÉ `ANTONIO DE OLIVEIRA` MATOS'
                 norm_abstract_without_start_name.startswith(norm_term))
         )
+
+    def _department_matched(self, results: list, department: str) -> list:
+        """Keep in `results` (filtered in place and returned) only the items
+        whose "hierarchyList" contains `department`. An empty/None
+        `department` disables the filter; items lacking the key never match.
+        """
+        if department:  # unset in the YAML config means "do not filter"
+            results[:] = [item for item in results
+                          if department in (item.get("hierarchyList") or ())]
+        return results
 
     def _get_prior_and_matched_name(self, raw_html: str) -> Tuple[str, str]:
         groups = self.SPLIT_MATCH_RE.match(raw_html).groups()