-
Notifications
You must be signed in to change notification settings - Fork 10
/
verify_links.py
136 lines (119 loc) · 4.33 KB
/
verify_links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
"""Common code for link verification scripts in data validation.
"""
import logging
import random
import re
from typing import Tuple
import requests
import pandas as pd
from bs4 import BeautifulSoup
from unidecode import unidecode
from frictionless import Package
from settings import USER_AGENT, DEFAULT_TIMEOUT as TIMEOUT
WEBSITE_RESOURCE_NAME = 'brazilian-municipality-and-state-websites'
def healthy_link(link: str) -> requests.Response:
"""Check whether or not the link is healthy.
Args:
link (str): The url of the link to be verified.
Returns:
requests.Response: The Response object in case the link
is healthy, None otherwise.
"""
try:
response = requests.get(
link,
headers={'user-agent': USER_AGENT},
timeout=TIMEOUT
)
except (
requests.exceptions.ConnectionError,
requests.exceptions.InvalidURL,
requests.exceptions.TooManyRedirects,
requests.exceptions.ReadTimeout
):
return None
if response and response.status_code == 200:
return response
return None
def get_title_and_type(
response: requests.Response,
candidates: pd.DataFrame) -> Tuple[str, str]:
"""Try to infer the type of site this is.
Args:
response (requests.Response): The Response object obtained when
crawling the page.
candidates (pd.DataFrame): The Pandas dataframe slice containing
the candidate links.
Returns:
Tuple[str, str]: The page title and link type category.
"""
soup = BeautifulSoup(response.text, 'html.parser')
title_tag = soup.find('title')
if title_tag is None:
return None, None
title = unidecode(title_tag.text.lower())
link_types = candidates.link_type
if 'prefeitura' in link_types:
link_type = 'prefeitura'
elif 'camara' in link_types:
link_type = 'camara'
elif 'hino' in title:
link_type = None
elif 'brasao' in title:
link_type = None
elif 'prefeitura' in title:
link_type = 'prefeitura'
elif 'municipio' in title:
link_type = 'prefeitura'
elif re.match(r'c.{0,3}mara', title, re.IGNORECASE):
link_type = 'camara'
elif 'poder executivo' in title:
link_type = 'prefeitura'
elif 'governo municipal' in title:
link_type = 'prefeitura'
elif 'pref.' in title:
link_type = 'prefeitura'
else:
logging.warning(
'Unable to determine site type from title: “%s”.', title)
link_type = None
return title_tag.text, link_type
def get_candidate_links(file_path: str, max_quantity: int) -> pd.DataFrame:
"""Reads the csv table containing the candidate links.
Args:
file_path (str): The path to the csv file.
max_quantity (int): The maximum number of entries to read. If
`None`, returns all the data. If less than the number of
entries in the file, selects a sample of this site.
Returns:
pd.DataFrame: The Pandas dataframe containing the read table.
"""
candidates = pd.read_csv(file_path)
logging.info('Found %d websites in %s.', len(candidates), file_path)
codes = candidates.code.unique()
random.shuffle(codes) # randomize sequence
if max_quantity:
codes = codes[:max_quantity] # take a subsample for quicker processing
return candidates[candidates.code.isin(codes)]
def get_output_to_be_merged(data_package_path: str) -> pd.DataFrame:
"""Gets the dataframe for merging the output with.
Args:
data_package_path (str): The path to the data package.
Returns:
pd.DataFrame: The dataframe with the data.
"""
package = Package(data_package_path)
resource = package.get_resource(WEBSITE_RESOURCE_NAME)
return resource.to_pandas()
def store_csv(table: pd.DataFrame, data_package_path: str):
"""Stores the csv file in the output folder.
Args:
table (pd.DataFrame): The dataframe containing the data.
data_package_path (str): Path to the data package.
"""
package = Package(data_package_path)
resource = package.get_resource(WEBSITE_RESOURCE_NAME)
output = resource.fullpath # filename of csv to write
logging.info('Recording %s...', output)
# store the file
table.to_csv(output, index=False, date_format='%Y-%m-%dT%H:%M:%SZ')