Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

First steps to lambda deploy #13

Merged
merged 7 commits into from
Aug 12, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ data
.DS_Store
.pytest_cache
*.sqlite
.aws-sam


# Byte-compiled / optimized / DLL files
Expand Down
16 changes: 16 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Build helpers for this Lambda function's deployment artifacts.
# `make` with no target prints the auto-generated help text.
.DEFAULT_GOAL := help

.PHONY: all
all: clean requirements.txt

.PHONY: clean
# -f (not -rf): requirements.txt is a plain file, and -f keeps the target
# from failing when the file is already absent.
clean: ## Delete requirements.txt
	rm -f requirements.txt

# NOTE(review): `pipenv lock -r` is deprecated in newer pipenv releases in
# favour of `pipenv requirements` — confirm the pipenv version pinned here.
requirements.txt: Pipfile Pipfile.lock ## Update the requirements.txt file used to build this Lambda function's DependenciesLayer
	pipenv lock -r > requirements.txt

.PHONY: help
# gratuitously adapted from https://marmelab.com/blog/2016/02/29/auto-documented-makefile.html
help: ## Display this help text
	@grep -E '^[-a-zA-Z0-9_/.]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%s\033[0m\n\t%s\n", $$1, $$2}'
2 changes: 2 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,11 @@ python-dateutil = "*"
retry = "*"
black = "*"
rich = "*"
boto3 = "*"

[dev-packages]
pytest-mypy-plugins = "*"
aws-sam-cli = "*"

[requires]
python_version = "3.8"
Expand Down
882 changes: 668 additions & 214 deletions Pipfile.lock

Large diffs are not rendered by default.

Empty file added lgsf/aws_lambda/__init__.py
Empty file.
20 changes: 20 additions & 0 deletions lgsf/aws_lambda/fixtures/sqs-message-der.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"Records": [
{
"messageId": "19dd0b57-b21e-4ac1-bd88-01bbb068cb78",
"receiptHandle": "MessageReceiptHandle",
"body": "{\"scraper_type\": \"councillors\",\"council\": \"DER\"}",
"attributes": {
"ApproximateReceiveCount": "1",
"SentTimestamp": "1523232000000",
"SenderId": "123456789012",
"ApproximateFirstReceiveTimestamp": "1523232000001"
},
"messageAttributes": {},
"md5OfBody": "7b270e59b47ff90a553787216d55d91d",
"eventSource": "aws:sqs",
"eventSourceARN": "arn:aws:sqs:us-east-1:123456789012:MyQueue",
"awsRegion": "eu-west-2"
}
]
}
20 changes: 20 additions & 0 deletions lgsf/aws_lambda/fixtures/sqs-message-sfk.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"Records": [
{
"messageId": "19dd0b57-b21e-4ac1-bd88-01bbb068cb78",
"receiptHandle": "MessageReceiptHandle",
"body": "{\"scraper_type\": \"councillors\",\"council\": \"SFK\"}",
"attributes": {
"ApproximateReceiveCount": "1",
"SentTimestamp": "1523232000000",
"SenderId": "123456789012",
"ApproximateFirstReceiveTimestamp": "1523232000001"
},
"messageAttributes": {},
"md5OfBody": "7b270e59b47ff90a553787216d55d91d",
"eventSource": "aws:sqs",
"eventSourceARN": "arn:aws:sqs:us-east-1:123456789012:MyQueue",
"awsRegion": "eu-west-2"
}
]
}
20 changes: 20 additions & 0 deletions lgsf/aws_lambda/fixtures/sqs-message-wlv.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"Records": [
{
"messageId": "19dd0b57-b21e-4ac1-bd88-01bbb068cb78",
"receiptHandle": "MessageReceiptHandle",
"body": "{\"scraper_type\": \"councillors\",\"council\": \"WLV\"}",
"attributes": {
"ApproximateReceiveCount": "1",
"SentTimestamp": "1523232000000",
"SenderId": "123456789012",
"ApproximateFirstReceiveTimestamp": "1523232000001"
},
"messageAttributes": {},
"md5OfBody": "7b270e59b47ff90a553787216d55d91d",
"eventSource": "aws:sqs",
"eventSourceARN": "arn:aws:sqs:us-east-1:123456789012:MyQueue",
"awsRegion": "eu-west-2"
}
]
}
44 changes: 44 additions & 0 deletions lgsf/aws_lambda/handlers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import json
import sys
from datetime import datetime

import boto3
from rich.console import Console

from lgsf.councillors.commands import Command
from lgsf.path_utils import load_scraper


def scraper_worker_handler(event, context):
    """Run one council scraper in response to a single SQS message.

    Only the first record of ``event`` is processed; its body is JSON of
    the form {"scraper_type": ..., "council": ...} (see the fixtures in
    lgsf/aws_lambda/fixtures/). ``context`` is the standard Lambda context
    and is unused.
    """
    message = json.loads(event["Records"][0]["body"])
    council = message["council"]
    command_name = message["scraper_type"]
    scraper_cls = load_scraper(council, command_name)
    # aws_lambda=True switches the scraper into CodeCommit-staging mode.
    options = {"council": council, "verbose": True, "aws_lambda": True}
    console = Console(file=sys.stdout)
    console.log(f"Begin attempting to scrape: {council}")
    scraper = scraper_cls(options, console)
    try:
        scraper.run()
    except Exception as e:
        # Best-effort: log the failure and drop the scraper's working
        # branch rather than letting the invocation error out.
        # NOTE(review): consider narrowing this catch (or re-raising) so
        # genuine bugs surface in Lambda error metrics.
        scraper.console.log(e)
        scraper.delete_branch()
    console.log(f"Finished attempting to scrape: {council}")


def queue_builder_handler(event, context):
    """Enqueue one SQS scrape-request message per runnable council.

    Reuses the councillors CLI command's selection logic to enumerate
    every council that has a scraper, then posts one message per council
    to the ScraperQueue for scraper_worker_handler to consume.
    """
    command = Command(argv=["", "--all-councils"], stdout=sys.stdout)
    command.options = {"all_councils": True, "exclude_missing": True}

    queue = boto3.resource("sqs").get_queue_by_name(QueueName="ScraperQueue")

    # TODO Define this somewhere else so scraper_worker_handler can share it.
    for council in command.councils_to_run:
        message = {"scraper_type": "councillors", "council": council}
        queue.send_message(MessageBody=json.dumps(message))
        print(message)
8 changes: 8 additions & 0 deletions lgsf/commands/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@ def create_parser(self):
action="store_true",
help="Run this command for all councils",
)
self.parser.add_argument(
"--exclude-missing",
action="store_true",
help="Don't run councils missing a scraper matching command name",
)
self.parser.add_argument(
"-t",
"--tags",
Expand Down Expand Up @@ -195,6 +200,9 @@ def councils_to_run(self):
for council in self.options["council"].split(","):
council = council.strip().split("-")[0].upper()
councils.append(council)

if self.options["exclude_missing"]:
councils = list(set(councils) - set(c["code"] for c in self.missing()))
return councils

def run_councils(self):
Expand Down
62 changes: 55 additions & 7 deletions lgsf/councillors/scrapers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
from bs4 import BeautifulSoup
from dateutil.parser import parse

from lgsf.scrapers import ScraperBase
from lgsf.councillors import CouncillorBase
from lgsf.scrapers import ScraperBase, CodeCommitMixin
from lgsf.councillors import CouncillorBase, json


class BaseCouncillorScraper(ScraperBase):
class BaseCouncillorScraper(CodeCommitMixin, ScraperBase):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This feels a bit brittle, as the order matters here. But I guess that's just multiple inheritance.

tags = []
class_tags = []
ext = "html"
Expand All @@ -16,6 +16,7 @@ class BaseCouncillorScraper(ScraperBase):
def __init__(self, options, console):
    # Option/console wiring is handled by the CodeCommitMixin/ScraperBase chain.
    super().__init__(options, console)
    # CodeCommit repository that staged councillor data is committed to.
    self.repository = "CouncillorsRepo"
    # Every councillor saved in this run; report() counts this set.
    self.councillors = set()

@abc.abstractmethod
def get_councillors(self):
Expand All @@ -36,11 +37,47 @@ def get_tags(self):

def run(self):
    """Scrape every councillor, saving locally or staging to CodeCommit.

    NOTE(review): indentation reconstructed from a flattened diff — the
    stale pre-change ``save_councillor`` call shown alongside
    ``process_councillor`` was a diff artifact and is dropped here.
    ``aws_tidy_up()`` is assumed safe to call unconditionally (presumably
    a no-op outside Lambda) — confirm against CodeCommitMixin.
    """
    if self.options.get("aws_lambda"):
        # Lambda runs rebuild the data in CodeCommit; clear stale data first.
        self.delete_data_if_exists()

    for councillor_html in self.get_councillors():
        councillor = self.get_single_councillor(councillor_html)
        self.process_councillor(councillor, councillor_html)

    self.aws_tidy_up()

    self.report()

def process_councillor(self, councillor, councillor_raw_str):
    """Persist one scraped councillor: to disk locally, or staged for CodeCommit on Lambda."""
    if not self.options.get("aws_lambda"):
        # Local run: write straight to the filesystem.
        self.save_councillor(councillor_raw_str, councillor)
        return

    # Lambda run: queue the files for the next CodeCommit commit...
    self.stage_councillor(councillor_raw_str, councillor)

    # ...and flush a batch commit once enough files have accumulated.
    if len(self.put_files) > 90:
        self.process_batch()

def stage_councillor(self, councillor_html, councillor):
    """Queue a councillor's JSON and raw HTML for the next CodeCommit batch.

    Appends two put-file entries to ``self.put_files`` under
    ``<council>/json/`` and ``<council>/raw/`` respectively.
    """
    council = self.options["council"]
    file_name = councillor.as_file_name()

    json_entry = {
        "filePath": f"{council}/json/{file_name}.json",
        "fileContent": json.dumps(councillor.as_dict(), indent=4).encode("utf-8"),
    }
    raw_entry = {
        "filePath": f"{council}/raw/{file_name}.html",
        "fileContent": councillor_html.prettify().encode("utf-8"),
    }
    self.put_files.append(json_entry)
    self.put_files.append(raw_entry)

def save_councillor(self, raw_content, councillor_obj):
assert (
type(councillor_obj) == CouncillorBase
Expand All @@ -55,7 +92,14 @@ def report(self):
raise ValueError(
"Not many councillors found ({})".format(len(self.councillors))
)
self.console.log("Found {} councillors".format(len(self.councillors)))
if self.new_data:
self.console.log(
f"Found {len(self.councillors)} councillors with some new data"
)
else:
self.console.log(
f"Found {len(self.councillors)} councillors but no new data"
)


class HTMLCouncillorScraper(BaseCouncillorScraper):
Expand Down Expand Up @@ -110,12 +154,16 @@ class ModGovCouncillorScraper(BaseCouncillorScraper):
ext = "xml"

def run(self):
    """Fetch the ModGov ward XML and process every councillor in it.

    NOTE(review): the flattened diff showed ``get_councillors()`` assigned
    twice (old and new diff positions); a single fetch after the
    aws_lambda cleanup is the intended final code, kept here.
    """
    if self.options.get("aws_lambda"):
        # Lambda runs rebuild the data in CodeCommit; clear stale data first.
        self.delete_data_if_exists()

    wards = self.get_councillors()
    for ward in wards:
        for councillor_xml in ward.find_all("councillor"):
            councillor = self.get_single_councillor(ward, councillor_xml)
            self.process_councillor(councillor, councillor_xml)

    self.aws_tidy_up()
    self.report()

def format_councillor_api_url(self):
Expand Down
Loading