
Commit

big bang
vitorbellini committed Mar 26, 2024
1 parent 1683e14 commit b58990b
Showing 13 changed files with 994 additions and 105 deletions.
19 changes: 19 additions & 0 deletions dag_confs/examples_and_tests/inlabs_example.yaml
@@ -0,0 +1,19 @@
+dag:
+  id: inlabs_example
+  description: DAG de teste
+  tags:
+    - inlabs
+  schedule: 0 8 * * MON-FRI
+  owner:
+    - cdata
+  search:
+    sources:
+      - INLABS
+    terms:
+      - tecnologia
+      - informação
+  report:
+    emails:
+      - [email protected]
+    attach_csv: True
+    subject: "Teste do Ro-dou"
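
This YAML file is the entire user-facing interface: source, terms, schedule, and report destination. As a rough sketch of how such a file can be read (assuming only PyYAML here; the project itself routes it through the YAMLParser imported in dou_dag_generator.py below):

import yaml

# Sketch: load the example config above and pull out the search definition.
# Assumes PyYAML; the real pipeline uses the project's YAMLParser instead.
with open("dag_confs/examples_and_tests/inlabs_example.yaml", encoding="utf-8") as f:
    conf = yaml.safe_load(f)["dag"]

print(conf["id"])                 # inlabs_example
print(conf["search"]["sources"])  # ['INLABS']
print(conf["search"]["terms"])    # ['tecnologia', 'informação']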
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,2 +1,3 @@
 pandas==1.5.2,<2
-unidecode==1.2.0
+unidecode==1.2.0
+html2text==2024.2.26
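
The new html2text pin presumably flattens the HTML returned by the INLABS base into plain text for matching and for the e-mail digest; that usage is an assumption, since only the dependency is visible here. Its basic API, for reference:

import html2text

# Hypothetical INLABS-style fragment; html2text converts HTML into
# Markdown-like plain text.
sample = "<h2>Portaria nº 123</h2><p>Dispõe sobre <b>tecnologia</b> da informação.</p>"

converter = html2text.HTML2Text()
converter.ignore_links = True  # keep link text, drop the URLs
converter.body_width = 0       # do not hard-wrap output lines

print(converter.handle(sample))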
14 changes: 9 additions & 5 deletions schemas/ro-dou.json
@@ -42,7 +42,7 @@
         "description": "description",
         "items": {
           "type": "string",
-          "enum": ["QD", "DOU"]
+          "enum": ["QD", "DOU", "INLABS"]
         }
       },
       "territory_id": {
@@ -89,9 +89,9 @@
         "description": "departamento para filtro na busca",
         "items": {
           "type": "string",
-          "description": "nome do departamento"
+          "description": "nome do departamento"
         }
-      },
+      },
       "field": {
         "type": "string",
         "description": "description",
@@ -164,7 +164,9 @@
           "description": "description",
           "format": "uri-reference"
         }
-      }
+      },
+      "required": ["webhook"],
+      "additionalProperties": false
     },
     "discord": {
       "type": "object",
@@ -175,7 +177,9 @@
           "description": "description",
           "format": "uri-reference"
         }
-      }
+      },
+      "required": ["webhook"],
+      "additionalProperties": false
     },
     "emails": {
       "type": "array",
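
Both notification blocks are tightened in the same way: webhook becomes mandatory and unknown keys are rejected, so a typo such as webook now fails validation instead of being silently ignored. A quick check with the jsonschema package (a sketch mirroring only the slack portion of the schema):

from jsonschema import Draft7Validator

# Reduced fragment of the slack block after this change.
slack_schema = {
    "type": "object",
    "properties": {
        "webhook": {"type": "string", "format": "uri-reference"},
    },
    "required": ["webhook"],
    "additionalProperties": False,
}

validator = Draft7Validator(slack_schema)
ok = {"webhook": "https://hooks.slack.com/services/T000/B000/XXXX"}
bad = {"webook": "https://hooks.slack.com/services/T000/B000/XXXX"}

print([e.message for e in validator.iter_errors(ok)])   # []
print([e.message for e in validator.iter_errors(bad)])  # missing 'webhook' + unexpected 'webook'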
21 changes: 17 additions & 4 deletions src/dou_dag_generator.py
@@ -31,7 +31,7 @@
     get_trigger_date, template_ano_mes_dia_trigger_local_time)
 from notification.notifier import Notifier
 from parsers import DAGConfig, YAMLParser
-from searchers import BaseSearcher, DOUSearcher, QDSearcher
+from searchers import BaseSearcher, DOUSearcher, QDSearcher, INLABSSearcher

 class DouDigestDagGenerator():
     """
@@ -53,6 +53,7 @@ def __init__(self, on_retry_callback=None, on_failure_callback=None):
         self.searchers = {
             'DOU': DOUSearcher(),
             'QD': QDSearcher(),
+            'INLABS': INLABSSearcher(),
         }
         self.on_retry_callback = on_retry_callback
         self.on_failure_callback = on_failure_callback
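
One line in this registry is all the generator needs for a new source: each key matches a sources entry from the YAML and maps to a searcher exposing the same exec_search interface. A minimal sketch of the pattern, with a hypothetical stand-in class:

# Sketch of the registry pattern used above; FakeSearcher is a hypothetical
# stand-in, the real classes live in src/searchers.py.
class FakeSearcher:
    def exec_search(self, term_list, *args, **kwargs):
        return {term: [] for term in term_list}

searchers = {"DOU": FakeSearcher(), "QD": FakeSearcher(), "INLABS": FakeSearcher()}

for source in ["INLABS"]:  # sources list taken from the YAML config
    print(source, searchers[source].exec_search(["tecnologia", "informação"]))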
@@ -221,9 +222,8 @@ def perform_searches(
             **context) -> dict:
         """Performs the search in each source and merges the results
         """
-        logging.info('Searching for: %s', ', '.join(term_list))
-        logging.info(
-            f'Trigger date: {get_trigger_date(context, local_time=True)}')
+        logging.info('Searching for: %s', term_list)
+        logging.info('Trigger date: %s', get_trigger_date(context, local_time=True))

         if 'DOU' in sources:
             dou_result = self.searchers['DOU'].exec_search(
@@ -236,6 +236,15 @@
                 force_rematch,
                 department,
                 get_trigger_date(context, local_time = True))
+        elif 'INLABS' in sources:
+            inlabs_result = self.searchers['INLABS'].exec_search(
+                term_list,
+                dou_sections,
+                search_date,
+                department,
+                ignore_signature_match,
+                get_trigger_date(context, local_time = True)
+            )

         if 'QD' in sources:
             qd_result = self.searchers['QD'].exec_search(
@@ -252,8 +261,12 @@

         if 'DOU' in sources and 'QD' in sources:
             return merge_results(qd_result, dou_result)
+        elif 'INLABS' in sources and 'QD' in sources:
+            return merge_results(qd_result, inlabs_result)
         elif 'DOU' in sources:
             return dou_result
+        elif 'INLABS' in sources:
+            return inlabs_result
         else:
             return qd_result

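
Worth noting in the dispatch above: INLABS sits in an elif behind DOU, so a config listing both of those sources only runs the DOU search, and the final merge pairs QD with exactly one of the other two. The merge itself is the project's merge_results helper; below is a hypothetical stand-in with the behavior the call sites imply (dicts of matches combined key by key):

# Hypothetical stand-in for merge_results, assuming results are dicts
# mapping a search term to a list of matches; the real helper may differ.
def merge_results(a: dict, b: dict) -> dict:
    merged = dict(a)
    for term, matches in b.items():
        merged[term] = merged.get(term, []) + matches
    return merged

print(merge_results({"tecnologia": [{"title": "A"}]},
                    {"tecnologia": [{"title": "B"}], "informação": []}))
# {'tecnologia': [{'title': 'A'}, {'title': 'B'}], 'informação': []}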
85 changes: 15 additions & 70 deletions src/dou_hook.py → src/hooks/dou_hook.py
@@ -1,10 +1,11 @@
 """
 Hook para realizar operações de consultas à API do Diário Oficial da União.
 """
+import sys
+import os
 import logging
-from datetime import datetime, timedelta
+from datetime import datetime
 import time
-from enum import Enum
 import json
 from typing import List
 import requests
@@ -13,42 +14,8 @@

 from bs4 import BeautifulSoup

-
-class Section(Enum):
-    """Define the section options to be used as parameter in the search"""
-
-    SECAO_1 = "do1"
-    SECAO_2 = "do2"
-    SECAO_3 = "do3"
-    EDICAO_EXTRA = "doe"
-    EDICAO_EXTRA_1A = "do1_extra_a"
-    EDICAO_EXTRA_1B = "do1_extra_b"
-    EDICAO_EXTRA_1D = "do1_extra_d"
-    EDICAO_EXTRA_2A = "do2_extra_a"
-    EDICAO_EXTRA_2B = "do2_extra_b"
-    EDICAO_EXTRA_2D = "do2_extra_d"
-    EDICAO_EXTRA_3A = "do3_extra_a"
-    EDICAO_EXTRA_3B = "do3_extra_b"
-    EDICAO_EXTRA_3D = "do3_extra_d"
-    EDICAO_SUPLEMENTAR = "do1a"
-    TODOS = "todos"
-
-
-class SearchDate(Enum):
-    """Define the search date options to be used as parameter in the search"""
-
-    DIA = "dia"
-    SEMANA = "semana"
-    MES = "mes"
-    ANO = "ano"
-
-
-class Field(Enum):
-    """Define the search field options to be used as parameter in the search"""
-
-    TUDO = "tudo"
-    TITULO = "title_pt_BR"
-    CONTEUDO = "ddm__text__21040__texto_pt_BR"
+sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
+from utils.search_domains import SearchDate, Field, Section, calculate_from_datetime


 class DOUHook(BaseHook):
@@ -88,36 +55,14 @@ def _get_query_str(self, term, field, is_exact_search):
         else:
             return f"{field.value}-{term}"

-    def calculate_from_datetime(
-        self, publish_to_date: datetime, search_date: SearchDate
-    ):
-        """
-        Calculate parameter `publishFrom` to be passed to the API based
-        on publishTo parameter and `search_date`. Perform especial
-        calculation to the MES (month) parameter option
-        """
-        if search_date == SearchDate.DIA:
-            return publish_to_date
-
-        elif search_date == SearchDate.SEMANA:
-            return publish_to_date - timedelta(days=6)
-
-        elif search_date == SearchDate.MES:
-            end_prev_month = publish_to_date.replace(day=1) - timedelta(days=1)
-            publish_from_date = end_prev_month.replace(day=publish_to_date.day)
-            return publish_from_date - timedelta(days=1)
-
-        elif search_date == SearchDate.ANO:
-            return publish_to_date - timedelta(days=364)
-
     def _request_page(self, with_retry: bool, payload: dict):
         try:
             return requests.get(self.IN_API_BASE_URL, params=payload, timeout=10)
         except requests.exceptions.ConnectionError:
             if with_retry:
                 logging.info("Sleep for 30 seconds before retry requests.get().")
                 time.sleep(30)
-                return requests.get(self.IN_API_BASE_URL, params=payload, timeout=10)
+                return requests.get(self.IN_API_BASE_URL, params=payload, timeout=10)


     def search_text(
@@ -141,25 +86,25 @@ def search_text(
         - A list of dicts of structured results.
         """

-        publish_from = self.calculate_from_datetime(reference_date, search_date)
+        publish_from = calculate_from_datetime(reference_date, search_date)

         payload = {
             "q": self._get_query_str(search_term, field, is_exact_search),
             "exactDate": "personalizado",
             "publishFrom": publish_from.strftime("%d-%m-%Y"),
             "publishTo": reference_date.strftime("%d-%m-%Y"),
-            "sortType": "0",
-            "s": [section.value for section in sections]
+            "sortType": "0",
+            "s": [section.value for section in sections],
         }
         page = self._request_page(payload=payload, with_retry=with_retry)

         soup = BeautifulSoup(page.content, "html.parser")

         # Checks if there is more than one page of results
         pagination_tag = soup.find(
             'button', id='lastPage'
         )

         if pagination_tag is not None:
             # Get the number of pages in the pagination bar
             number_pages = int(pagination_tag.text.strip())
@@ -174,11 +119,11 @@
         # Loop for each page of result
         for page_num in range(number_pages):
             logging.info("Searching in page %s", str(page_num + 1))

             # If there is more than one page add extra payload params and reload the page
             if page_num > 0:
                 # The id is needed for pagination to work because it requires
-                # passing the last id from the previous item page in request URL
+                # passing the last id from the previous item page in request URL
                 # Delta is the number of records per page. For now it is restricted to 20.
                 payload.update({
                     "id": item["id"],
@@ -188,11 +133,11 @@
                     "currentPage": page_num,
                 })
                 page = self._request_page(payload=payload, with_retry=with_retry)
-                soup = BeautifulSoup(page.content, "html.parser")
+                soup = BeautifulSoup(page.content, "html.parser")

             script_tag = soup.find(
                 "script", id="_br_com_seatecnologia_in_buscadou_BuscaDouPortlet_params"
-            )
+            )
             search_results = json.loads(script_tag.contents[0])["jsonArray"]

             if search_results:
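
The enums and calculate_from_datetime deleted above are not gone; the new import pulls them from a shared utils.search_domains module so that other searchers can reuse them. Reassembled from the removed lines (lightly condensed), that module would look roughly like this; the actual file is among the changed files not rendered on this page:

# utils/search_domains.py, reconstructed from the code removed above.
from datetime import datetime, timedelta
from enum import Enum


class Section(Enum):
    """Define the section options to be used as parameter in the search"""
    SECAO_1 = "do1"
    SECAO_2 = "do2"
    SECAO_3 = "do3"
    EDICAO_EXTRA = "doe"
    EDICAO_EXTRA_1A = "do1_extra_a"
    EDICAO_EXTRA_1B = "do1_extra_b"
    EDICAO_EXTRA_1D = "do1_extra_d"
    EDICAO_EXTRA_2A = "do2_extra_a"
    EDICAO_EXTRA_2B = "do2_extra_b"
    EDICAO_EXTRA_2D = "do2_extra_d"
    EDICAO_EXTRA_3A = "do3_extra_a"
    EDICAO_EXTRA_3B = "do3_extra_b"
    EDICAO_EXTRA_3D = "do3_extra_d"
    EDICAO_SUPLEMENTAR = "do1a"
    TODOS = "todos"


class SearchDate(Enum):
    """Define the search date options to be used as parameter in the search"""
    DIA = "dia"
    SEMANA = "semana"
    MES = "mes"
    ANO = "ano"


class Field(Enum):
    """Define the search field options to be used as parameter in the search"""
    TUDO = "tudo"
    TITULO = "title_pt_BR"
    CONTEUDO = "ddm__text__21040__texto_pt_BR"


def calculate_from_datetime(publish_to_date: datetime, search_date: SearchDate) -> datetime:
    """Calculate the publishFrom parameter from publishTo and search_date,
    now as a module-level function rather than a DOUHook method."""
    if search_date == SearchDate.DIA:
        return publish_to_date
    if search_date == SearchDate.SEMANA:
        return publish_to_date - timedelta(days=6)
    if search_date == SearchDate.MES:
        end_prev_month = publish_to_date.replace(day=1) - timedelta(days=1)
        return end_prev_month.replace(day=publish_to_date.day) - timedelta(days=1)
    # SearchDate.ANO
    return publish_to_date - timedelta(days=364)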