Skip to content

Commit

Permalink
scripts: switch to API for 165 data
Browse files Browse the repository at this point in the history
  • Loading branch information
danny0838 committed Mar 26, 2024
1 parent 1ab4ea7 commit 4c7d087
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 32 deletions.
67 changes: 39 additions & 28 deletions scripts/build.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
#!/usr/bin/env python3
"""Check and publish blocklists for Content Farm Terminator."""
import argparse
import csv
import glob
import inspect
import io
import ipaddress
import logging
import os
Expand Down Expand Up @@ -998,6 +996,31 @@ def fetch(self, url):

return r

def fetch_data_gov_tw(self, base_url, chunk_size=3000, timeout=60):
    """Fetch all records from an API at https://od.moi.gov.tw/api/v1/rest/datastore/

    Pages through the datastore API *chunk_size* records at a time until
    the server reports failure or returns an empty page.

    Args:
        base_url: the datastore resource URL, without a query string.
        chunk_size: number of records to request per page.
        timeout: per-request timeout in seconds, so a stalled server
            cannot hang the build forever.

    Returns:
        A list of all record dicts, concatenated across pages.

    Raises:
        RuntimeError: on a network error or an HTTP error status.
    """
    result = []
    offset = 0
    while True:
        # The first request deliberately omits the offset parameter.
        url = f'{base_url}?limit={chunk_size}' + (f'&offset={offset}' if offset else '')
        log.debug('Fetching: %s', url)
        try:
            r = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as exc:
            raise RuntimeError(f'Failed to fetch "{url}": {exc}') from exc

        if not r.ok:
            raise RuntimeError(f'Failed to fetch "{url}": {r.status_code}')

        data = r.json()

        # The API signals the end of data by setting success to false.
        if not data['success']:
            break

        records = data['result']['records']

        # Guard against an endless loop in case the server keeps
        # reporting success with an empty page past the last record.
        if not records:
            break

        result.extend(records)
        offset += chunk_size

    return result

def get_rules(self, type, url):
    """Dispatch *url* to the parser method named get_rules_<type>."""
    handler = getattr(self, f'get_rules_{type}')
    return handler(url)
Expand Down Expand Up @@ -1126,20 +1149,14 @@ def get_rules_json_twnicscams(self, url):
rules.append(rule)
return rules

def get_rules_csv_165jtz(self, url):
"""Special CSV for 假投資 sites from 165."""
response = self.fetch(url)
def get_rules_json_165jtz(self, url):
"""假投資 sites from 165."""
rules = []
fh = io.StringIO(response.text)
reader = csv.reader(fh)
i = 0
for row in reader:
# skip first 2 rows, which are field definitions
if i < 2:
i += 1
continue

u = urlsplit(('' if row[1].startswith('https:') else 'http://') + row[1])
records = self.fetch_data_gov_tw(url)
records.pop(0) # first record is fields
for record in records:
weburl = record['WEBURL']
u = urlsplit(('' if weburl.startswith('https:') else 'http://') + weburl)

domain = u.hostname
if not domain.strip():
Expand All @@ -1155,24 +1172,18 @@ def get_rules_csv_165jtz(self, url):
rules.append(rule)
return rules

def get_rules_json_165line(self, url):
    """Parse fake Line IDs from the 165 anti-fraud open-data API.

    Args:
        url: datastore API URL, passed through to fetch_data_gov_tw().

    Returns:
        A list of Rule objects, one per Line Page ID.
    """
    rules = []
    records = self.fetch_data_gov_tw(url)
    for record in records:
        id_ = record['帳號']

        # support Line Pages only, since the invite link for a line user ID is encoded
        if not (id_ and id_.startswith('@')):
            continue

        rule = Rule(f'line-page:{id_[1:]}', path=url)
        rules.append(rule)
    return rules

Expand Down
8 changes: 4 additions & 4 deletions src/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -241,14 +241,14 @@ aggregate:
type: json_twnicscams
- name: 165jtz
homepage: https://data.gov.tw/dataset/160055
source: https://data.moi.gov.tw/MoiOD/System/DownloadFile.aspx?DATA=3BB8E3CE-8223-43AF-B1AB-5824FA889883
source: https://od.moi.gov.tw/api/v1/rest/datastore/A01010000C-002150-013
dest: src/aggregations/scam-sites/165jtz.txt
type: csv_165jtz
type: json_165jtz
- name: 165line
homepage: https://data.gov.tw/dataset/78432
source: https://data.moi.gov.tw/MoiOD/System/DownloadFile.aspx?DATA=7F6BE616-8CE6-449E-8620-5F627C22AA0D
source: https://od.moi.gov.tw/api/v1/rest/datastore/A01010000C-001277-053
dest: src/aggregations/scam-sites/165line.txt
type: csv_165line
type: json_165line

build:
- source:
Expand Down

0 comments on commit 4c7d087

Please sign in to comment.