
Commit

big bang
vitorbellini committed Mar 26, 2024
1 parent 1683e14 commit b58990b
Showing 13 changed files with 994 additions and 105 deletions.
19 changes: 19 additions & 0 deletions dag_confs/examples_and_tests/inlabs_example.yaml
@@ -0,0 +1,19 @@
+dag:
+  id: inlabs_example
+  description: DAG de teste
+  tags:
+    - inlabs
+  schedule: 0 8 * * MON-FRI
+  owner:
+    - cdata
+  search:
+    sources:
+      - INLABS
+    terms:
+      - tecnologia
+      - informação
+  report:
+    emails:
+      - [email protected]
+    attach_csv: True
+    subject: "Teste do Ro-dou"
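
This YAML file is the entire user-facing interface: source, terms, schedule, and report destination. As a rough sketch of how such a file can be read (assuming only PyYAML here; the project itself routes it through the YAMLParser imported in dou_dag_generator.py below):

import yaml

# Sketch: load the example config above and pull out the search definition.
# Assumes PyYAML; the real pipeline uses the project's YAMLParser instead.
with open("dag_confs/examples_and_tests/inlabs_example.yaml", encoding="utf-8") as f:
    conf = yaml.safe_load(f)["dag"]

print(conf["id"])                 # inlabs_example
print(conf["search"]["sources"])  # ['INLABS']
print(conf["search"]["terms"])    # ['tecnologia', 'informação']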
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,2 +1,3 @@
 pandas==1.5.2,<2
-unidecode==1.2.0
+unidecode==1.2.0
+html2text==2024.2.26
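
The new html2text pin presumably flattens the HTML returned by the INLABS base into plain text for matching and for the e-mail digest; that usage is an assumption, since only the dependency is visible here. Its basic API, for reference:

import html2text

# Hypothetical INLABS-style fragment; html2text converts HTML into
# Markdown-like plain text.
sample = "<h2>Portaria nº 123</h2><p>Dispõe sobre <b>tecnologia</b> da informação.</p>"

converter = html2text.HTML2Text()
converter.ignore_links = True  # keep link text, drop the URLs
converter.body_width = 0       # do not hard-wrap output lines

print(converter.handle(sample))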
14 changes: 9 additions & 5 deletions schemas/ro-dou.json
@@ -42,7 +42,7 @@
         "description": "description",
         "items": {
           "type": "string",
-          "enum": ["QD", "DOU"]
+          "enum": ["QD", "DOU", "INLABS"]
         }
       },
       "territory_id": {
@@ -89,9 +89,9 @@
         "description": "departamento para filtro na busca",
         "items": {
           "type": "string",
-          "description": "nome do departamento"
+          "description": "nome do departamento"
         }
-      },
+      },
       "field": {
         "type": "string",
         "description": "description",
@@ -164,7 +164,9 @@
           "description": "description",
           "format": "uri-reference"
         }
-      }
+      },
+      "required": ["webhook"],
+      "additionalProperties": false
     },
     "discord": {
       "type": "object",
@@ -175,7 +177,9 @@
           "description": "description",
           "format": "uri-reference"
         }
-      }
+      },
+      "required": ["webhook"],
+      "additionalProperties": false
     },
     "emails": {
       "type": "array",
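
Both notification blocks are tightened in the same way: webhook becomes mandatory and unknown keys are rejected, so a typo such as webook now fails validation instead of being silently ignored. A quick check with the jsonschema package (a sketch mirroring only the slack portion of the schema):

from jsonschema import Draft7Validator

# Reduced fragment of the slack block after this change.
slack_schema = {
    "type": "object",
    "properties": {
        "webhook": {"type": "string", "format": "uri-reference"},
    },
    "required": ["webhook"],
    "additionalProperties": False,
}

validator = Draft7Validator(slack_schema)
ok = {"webhook": "https://hooks.slack.com/services/T000/B000/XXXX"}
bad = {"webook": "https://hooks.slack.com/services/T000/B000/XXXX"}

print([e.message for e in validator.iter_errors(ok)])   # []
print([e.message for e in validator.iter_errors(bad)])  # missing 'webhook' + unexpected 'webook'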
21 changes: 17 additions & 4 deletions src/dou_dag_generator.py
@@ -31,7 +31,7 @@
     get_trigger_date, template_ano_mes_dia_trigger_local_time)
 from notification.notifier import Notifier
 from parsers import DAGConfig, YAMLParser
-from searchers import BaseSearcher, DOUSearcher, QDSearcher
+from searchers import BaseSearcher, DOUSearcher, QDSearcher, INLABSSearcher

 class DouDigestDagGenerator():
     """
@@ -53,6 +53,7 @@ def __init__(self, on_retry_callback=None, on_failure_callback=None):
         self.searchers = {
             'DOU': DOUSearcher(),
             'QD': QDSearcher(),
+            'INLABS': INLABSSearcher(),
         }
         self.on_retry_callback = on_retry_callback
         self.on_failure_callback = on_failure_callback
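
One line in this registry is all the generator needs for a new source: each key matches a sources entry from the YAML and maps to a searcher exposing the same exec_search interface. A minimal sketch of the pattern, with a hypothetical stand-in class:

# Sketch of the registry pattern used above; FakeSearcher is a hypothetical
# stand-in, the real classes live in src/searchers.py.
class FakeSearcher:
    def exec_search(self, term_list, *args, **kwargs):
        return {term: [] for term in term_list}

searchers = {"DOU": FakeSearcher(), "QD": FakeSearcher(), "INLABS": FakeSearcher()}

for source in ["INLABS"]:  # sources list taken from the YAML config
    print(source, searchers[source].exec_search(["tecnologia", "informação"]))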
@@ -221,9 +222,8 @@ def perform_searches(
             **context) -> dict:
         """Performs the search in each source and merges the results
         """
-        logging.info('Searching for: %s', ', '.join(term_list))
-        logging.info(
-            f'Trigger date: {get_trigger_date(context, local_time=True)}')
+        logging.info('Searching for: %s', term_list)
+        logging.info('Trigger date: %s', get_trigger_date(context, local_time=True))

         if 'DOU' in sources:
             dou_result = self.searchers['DOU'].exec_search(
@@ -236,6 +236,15 @@
                 force_rematch,
                 department,
                 get_trigger_date(context, local_time = True))
+        elif 'INLABS' in sources:
+            inlabs_result = self.searchers['INLABS'].exec_search(
+                term_list,
+                dou_sections,
+                search_date,
+                department,
+                ignore_signature_match,
+                get_trigger_date(context, local_time = True)
+            )

         if 'QD' in sources:
             qd_result = self.searchers['QD'].exec_search(
@@ -252,8 +261,12 @@

         if 'DOU' in sources and 'QD' in sources:
             return merge_results(qd_result, dou_result)
+        elif 'INLABS' in sources and 'QD' in sources:
+            return merge_results(qd_result, inlabs_result)
         elif 'DOU' in sources:
             return dou_result
+        elif 'INLABS' in sources:
+            return inlabs_result
         else:
             return qd_result

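
Worth noting in the dispatch above: INLABS sits in an elif behind DOU, so a config listing both of those sources only runs the DOU search, and the final merge pairs QD with exactly one of the other two. The merge itself is the project's merge_results helper; below is a hypothetical stand-in with the behavior the call sites imply (dicts of matches combined key by key):

# Hypothetical stand-in for merge_results, assuming results are dicts
# mapping a search term to a list of matches; the real helper may differ.
def merge_results(a: dict, b: dict) -> dict:
    merged = dict(a)
    for term, matches in b.items():
        merged[term] = merged.get(term, []) + matches
    return merged

print(merge_results({"tecnologia": [{"title": "A"}]},
                    {"tecnologia": [{"title": "B"}], "informação": []}))
# {'tecnologia': [{'title': 'A'}, {'title': 'B'}], 'informação': []}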
85 changes: 15 additions & 70 deletions src/dou_hook.py → src/hooks/dou_hook.py
@@ -1,10 +1,11 @@
 """
 Hook para realizar operações de consultas à API do Diário Oficial da União.
 """
+import sys
+import os
 import logging
-from datetime import datetime, timedelta
+from datetime import datetime
 import time
-from enum import Enum
 import json
 from typing import List
 import requests
@@ -13,42 +14,8 @@

 from bs4 import BeautifulSoup

-
-class Section(Enum):
-    """Define the section options to be used as parameter in the search"""
-
-    SECAO_1 = "do1"
-    SECAO_2 = "do2"
-    SECAO_3 = "do3"
-    EDICAO_EXTRA = "doe"
-    EDICAO_EXTRA_1A = "do1_extra_a"
-    EDICAO_EXTRA_1B = "do1_extra_b"
-    EDICAO_EXTRA_1D = "do1_extra_d"
-    EDICAO_EXTRA_2A = "do2_extra_a"
-    EDICAO_EXTRA_2B = "do2_extra_b"
-    EDICAO_EXTRA_2D = "do2_extra_d"
-    EDICAO_EXTRA_3A = "do3_extra_a"
-    EDICAO_EXTRA_3B = "do3_extra_b"
-    EDICAO_EXTRA_3D = "do3_extra_d"
-    EDICAO_SUPLEMENTAR = "do1a"
-    TODOS = "todos"
-
-
-class SearchDate(Enum):
-    """Define the search date options to be used as parameter in the search"""
-
-    DIA = "dia"
-    SEMANA = "semana"
-    MES = "mes"
-    ANO = "ano"
-
-
-class Field(Enum):
-    """Define the search field options to be used as parameter in the search"""
-
-    TUDO = "tudo"
-    TITULO = "title_pt_BR"
-    CONTEUDO = "ddm__text__21040__texto_pt_BR"
+sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
+from utils.search_domains import SearchDate, Field, Section, calculate_from_datetime


 class DOUHook(BaseHook):
@@ -88,36 +55,14 @@ def _get_query_str(self, term, field, is_exact_search):
         else:
             return f"{field.value}-{term}"

-    def calculate_from_datetime(
-        self, publish_to_date: datetime, search_date: SearchDate
-    ):
-        """
-        Calculate parameter `publishFrom` to be passed to the API based
-        on publishTo parameter and `search_date`. Perform especial
-        calculation to the MES (month) parameter option
-        """
-        if search_date == SearchDate.DIA:
-            return publish_to_date
-
-        elif search_date == SearchDate.SEMANA:
-            return publish_to_date - timedelta(days=6)
-
-        elif search_date == SearchDate.MES:
-            end_prev_month = publish_to_date.replace(day=1) - timedelta(days=1)
-            publish_from_date = end_prev_month.replace(day=publish_to_date.day)
-            return publish_from_date - timedelta(days=1)
-
-        elif search_date == SearchDate.ANO:
-            return publish_to_date - timedelta(days=364)
-
     def _request_page(self, with_retry: bool, payload: dict):
         try:
             return requests.get(self.IN_API_BASE_URL, params=payload, timeout=10)
         except requests.exceptions.ConnectionError:
             if with_retry:
                 logging.info("Sleep for 30 seconds before retry requests.get().")
                 time.sleep(30)
-                return requests.get(self.IN_API_BASE_URL, params=payload, timeout=10)
+                return requests.get(self.IN_API_BASE_URL, params=payload, timeout=10)


     def search_text(
@@ -141,25 +86,25 @@ def search_text(
         - A list of dicts of structured results.
         """

-        publish_from = self.calculate_from_datetime(reference_date, search_date)
+        publish_from = calculate_from_datetime(reference_date, search_date)

         payload = {
             "q": self._get_query_str(search_term, field, is_exact_search),
             "exactDate": "personalizado",
             "publishFrom": publish_from.strftime("%d-%m-%Y"),
             "publishTo": reference_date.strftime("%d-%m-%Y"),
-            "sortType": "0",
-            "s": [section.value for section in sections]
+            "sortType": "0",
+            "s": [section.value for section in sections],
         }
         page = self._request_page(payload=payload, with_retry=with_retry)

         soup = BeautifulSoup(page.content, "html.parser")

         # Checks if there is more than one page of results
         pagination_tag = soup.find(
             'button', id='lastPage'
         )

         if pagination_tag is not None:
             # Get the number of pages in the pagination bar
             number_pages = int(pagination_tag.text.strip())
@@ -174,11 +119,11 @@
         # Loop for each page of result
         for page_num in range(number_pages):
             logging.info("Searching in page %s", str(page_num + 1))

             # If there is more than one page add extra payload params and reload the page
             if page_num > 0:
                 # The id is needed for pagination to work because it requires
-                # passing the last id from the previous item page in request URL
+                # passing the last id from the previous item page in request URL
                 # Delta is the number of records per page. For now it is restricted to 20.
                 payload.update({
                     "id": item["id"],
@@ -188,11 +133,11 @@
                     "currentPage": page_num,
                 })
                 page = self._request_page(payload=payload, with_retry=with_retry)
-                soup = BeautifulSoup(page.content, "html.parser")
+                soup = BeautifulSoup(page.content, "html.parser")

             script_tag = soup.find(
                 "script", id="_br_com_seatecnologia_in_buscadou_BuscaDouPortlet_params"
-            )
+            )
             search_results = json.loads(script_tag.contents[0])["jsonArray"]

             if search_results:
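
The enums and calculate_from_datetime deleted above are not gone; the new import pulls them from a shared utils.search_domains module so that other searchers can reuse them. Reassembled from the removed lines (lightly condensed), that module would look roughly like this; the actual file is among the changed files not rendered on this page:

# utils/search_domains.py, reconstructed from the code removed above.
from datetime import datetime, timedelta
from enum import Enum


class Section(Enum):
    """Define the section options to be used as parameter in the search"""
    SECAO_1 = "do1"
    SECAO_2 = "do2"
    SECAO_3 = "do3"
    EDICAO_EXTRA = "doe"
    EDICAO_EXTRA_1A = "do1_extra_a"
    EDICAO_EXTRA_1B = "do1_extra_b"
    EDICAO_EXTRA_1D = "do1_extra_d"
    EDICAO_EXTRA_2A = "do2_extra_a"
    EDICAO_EXTRA_2B = "do2_extra_b"
    EDICAO_EXTRA_2D = "do2_extra_d"
    EDICAO_EXTRA_3A = "do3_extra_a"
    EDICAO_EXTRA_3B = "do3_extra_b"
    EDICAO_EXTRA_3D = "do3_extra_d"
    EDICAO_SUPLEMENTAR = "do1a"
    TODOS = "todos"


class SearchDate(Enum):
    """Define the search date options to be used as parameter in the search"""
    DIA = "dia"
    SEMANA = "semana"
    MES = "mes"
    ANO = "ano"


class Field(Enum):
    """Define the search field options to be used as parameter in the search"""
    TUDO = "tudo"
    TITULO = "title_pt_BR"
    CONTEUDO = "ddm__text__21040__texto_pt_BR"


def calculate_from_datetime(publish_to_date: datetime, search_date: SearchDate) -> datetime:
    """Calculate the publishFrom parameter from publishTo and search_date,
    now as a module-level function rather than a DOUHook method."""
    if search_date == SearchDate.DIA:
        return publish_to_date
    if search_date == SearchDate.SEMANA:
        return publish_to_date - timedelta(days=6)
    if search_date == SearchDate.MES:
        end_prev_month = publish_to_date.replace(day=1) - timedelta(days=1)
        return end_prev_month.replace(day=publish_to_date.day) - timedelta(days=1)
    # SearchDate.ANO
    return publish_to_date - timedelta(days=364)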