Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

First steps to lambda deploy #13

Merged
merged 7 commits into from
Aug 12, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ data
.DS_Store
.pytest_cache
*.sqlite
.aws-sam


# Byte-compiled / optimized / DLL files
Expand Down
16 changes: 16 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Build helpers for this Lambda function's deployment artifacts.
# `make` with no target prints the auto-generated help text.
.DEFAULT_GOAL := help

.PHONY: all
all: clean requirements.txt

.PHONY: clean
# -f (not -rf): requirements.txt is a plain file, and -f keeps the target
# from failing when the file is already absent.
clean: ## Delete requirements.txt
	rm -f requirements.txt

# NOTE(review): `pipenv lock -r` is deprecated in newer pipenv releases in
# favour of `pipenv requirements` — confirm the pipenv version pinned here.
requirements.txt: Pipfile Pipfile.lock ## Update the requirements.txt file used to build this Lambda function's DependenciesLayer
	pipenv lock -r > requirements.txt

.PHONY: help
# gratuitously adapted from https://marmelab.com/blog/2016/02/29/auto-documented-makefile.html
help: ## Display this help text
	@grep -E '^[-a-zA-Z0-9_/.]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%s\033[0m\n\t%s\n", $$1, $$2}'
2 changes: 2 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,11 @@ python-dateutil = "*"
retry = "*"
black = "*"
rich = "*"
boto3 = "*"

[dev-packages]
pytest-mypy-plugins = "*"
aws-sam-cli = "*"

[requires]
python_version = "3.8"
Expand Down
882 changes: 668 additions & 214 deletions Pipfile.lock

Large diffs are not rendered by default.

Empty file added lgsf/aws_lambda/__init__.py
Empty file.
20 changes: 20 additions & 0 deletions lgsf/aws_lambda/fixtures/sqs-message-der.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"Records": [
{
"messageId": "19dd0b57-b21e-4ac1-bd88-01bbb068cb78",
"receiptHandle": "MessageReceiptHandle",
"body": "{\"scraper_type\": \"councillors\",\"council\": \"DER\"}",
"attributes": {
"ApproximateReceiveCount": "1",
"SentTimestamp": "1523232000000",
"SenderId": "123456789012",
"ApproximateFirstReceiveTimestamp": "1523232000001"
},
"messageAttributes": {},
"md5OfBody": "7b270e59b47ff90a553787216d55d91d",
"eventSource": "aws:sqs",
"eventSourceARN": "arn:aws:sqs:us-east-1:123456789012:MyQueue",
"awsRegion": "eu-west-2"
}
]
}
20 changes: 20 additions & 0 deletions lgsf/aws_lambda/fixtures/sqs-message-sfk.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"Records": [
{
"messageId": "19dd0b57-b21e-4ac1-bd88-01bbb068cb78",
"receiptHandle": "MessageReceiptHandle",
"body": "{\"scraper_type\": \"councillors\",\"council\": \"SFK\"}",
"attributes": {
"ApproximateReceiveCount": "1",
"SentTimestamp": "1523232000000",
"SenderId": "123456789012",
"ApproximateFirstReceiveTimestamp": "1523232000001"
},
"messageAttributes": {},
"md5OfBody": "7b270e59b47ff90a553787216d55d91d",
"eventSource": "aws:sqs",
"eventSourceARN": "arn:aws:sqs:us-east-1:123456789012:MyQueue",
"awsRegion": "eu-west-2"
}
]
}
20 changes: 20 additions & 0 deletions lgsf/aws_lambda/fixtures/sqs-message-wlv.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"Records": [
{
"messageId": "19dd0b57-b21e-4ac1-bd88-01bbb068cb78",
"receiptHandle": "MessageReceiptHandle",
"body": "{\"scraper_type\": \"councillors\",\"council\": \"WLV\"}",
"attributes": {
"ApproximateReceiveCount": "1",
"SentTimestamp": "1523232000000",
"SenderId": "123456789012",
"ApproximateFirstReceiveTimestamp": "1523232000001"
},
"messageAttributes": {},
"md5OfBody": "7b270e59b47ff90a553787216d55d91d",
"eventSource": "aws:sqs",
"eventSourceARN": "arn:aws:sqs:us-east-1:123456789012:MyQueue",
"awsRegion": "eu-west-2"
}
]
}
44 changes: 44 additions & 0 deletions lgsf/aws_lambda/handlers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import json
import sys
from datetime import datetime

import boto3
from rich.console import Console

from lgsf.councillors.commands import Command
from lgsf.path_utils import load_scraper


def scraper_worker_handler(event, context):
    """Run one council scraper in response to a single SQS message.

    Only the first record of ``event`` is processed; its body is JSON of
    the form {"scraper_type": ..., "council": ...} (see the fixtures in
    lgsf/aws_lambda/fixtures/). ``context`` is the standard Lambda context
    and is unused.
    """
    message = json.loads(event["Records"][0]["body"])
    council = message["council"]
    command_name = message["scraper_type"]
    scraper_cls = load_scraper(council, command_name)
    # aws_lambda=True switches the scraper into CodeCommit-staging mode.
    options = {"council": council, "verbose": True, "aws_lambda": True}
    console = Console(file=sys.stdout)
    console.log(f"Begin attempting to scrape: {council}")
    scraper = scraper_cls(options, console)
    try:
        scraper.run()
    except Exception as e:
        # Best-effort: log the failure and drop the scraper's working
        # branch rather than letting the invocation error out.
        # NOTE(review): consider narrowing this catch (or re-raising) so
        # genuine bugs surface in Lambda error metrics.
        scraper.console.log(e)
        scraper.delete_branch()
    console.log(f"Finished attempting to scrape: {council}")


def queue_builder_handler(event, context):
    """Enqueue one SQS scrape-request message per runnable council.

    Reuses the councillors CLI command's selection logic to enumerate
    every council that has a scraper, then posts one message per council
    to the ScraperQueue for scraper_worker_handler to consume.
    """
    command = Command(argv=["", "--all-councils"], stdout=sys.stdout)
    command.options = {"all_councils": True, "exclude_missing": True}

    queue = boto3.resource("sqs").get_queue_by_name(QueueName="ScraperQueue")

    # TODO Define this somewhere else so scraper_worker_handler can share it.
    for council in command.councils_to_run:
        message = {"scraper_type": "councillors", "council": council}
        queue.send_message(MessageBody=json.dumps(message))
        print(message)
8 changes: 8 additions & 0 deletions lgsf/commands/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@ def create_parser(self):
action="store_true",
help="Run this command for all councils",
)
self.parser.add_argument(
"--exclude-missing",
action="store_true",
help="Don't run councils missing a scraper matching command name",
)
self.parser.add_argument(
"-t",
"--tags",
Expand Down Expand Up @@ -195,6 +200,9 @@ def councils_to_run(self):
for council in self.options["council"].split(","):
council = council.strip().split("-")[0].upper()
councils.append(council)

if self.options["exclude_missing"]:
councils = list(set(councils) - set(c["code"] for c in self.missing()))
return councils

def run_councils(self):
Expand Down
62 changes: 55 additions & 7 deletions lgsf/councillors/scrapers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
from bs4 import BeautifulSoup
from dateutil.parser import parse

from lgsf.scrapers import ScraperBase
from lgsf.councillors import CouncillorBase
from lgsf.scrapers import ScraperBase, CodeCommitMixin
from lgsf.councillors import CouncillorBase, json


class BaseCouncillorScraper(ScraperBase):
class BaseCouncillorScraper(CodeCommitMixin, ScraperBase):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This feels a bit brittle, as the order matters here. But I guess that's just multiple inheritance.

tags = []
class_tags = []
ext = "html"
Expand All @@ -16,6 +16,7 @@ class BaseCouncillorScraper(ScraperBase):
def __init__(self, options, console):
    # Option/console wiring is handled by the CodeCommitMixin/ScraperBase chain.
    super().__init__(options, console)
    # CodeCommit repository that staged councillor data is committed to.
    self.repository = "CouncillorsRepo"
    # Every councillor saved in this run; report() counts this set.
    self.councillors = set()

@abc.abstractmethod
def get_councillors(self):
Expand All @@ -36,11 +37,47 @@ def get_tags(self):

def run(self):
    """Scrape every councillor, saving locally or staging to CodeCommit.

    NOTE(review): indentation reconstructed from a flattened diff — the
    stale pre-change ``save_councillor`` call shown alongside
    ``process_councillor`` was a diff artifact and is dropped here.
    ``aws_tidy_up()`` is assumed safe to call unconditionally (presumably
    a no-op outside Lambda) — confirm against CodeCommitMixin.
    """
    if self.options.get("aws_lambda"):
        # Lambda runs rebuild the data in CodeCommit; clear stale data first.
        self.delete_data_if_exists()

    for councillor_html in self.get_councillors():
        councillor = self.get_single_councillor(councillor_html)
        self.process_councillor(councillor, councillor_html)

    self.aws_tidy_up()

    self.report()

def process_councillor(self, councillor, councillor_raw_str):
    """Persist one scraped councillor: to disk locally, or staged for CodeCommit on Lambda."""
    if not self.options.get("aws_lambda"):
        # Local run: write straight to the filesystem.
        self.save_councillor(councillor_raw_str, councillor)
        return

    # Lambda run: queue the files for the next CodeCommit commit...
    self.stage_councillor(councillor_raw_str, councillor)

    # ...and flush a batch commit once enough files have accumulated.
    if len(self.put_files) > 90:
        self.process_batch()

def stage_councillor(self, councillor_html, councillor):
    """Queue a councillor's JSON and raw HTML for the next CodeCommit batch.

    Appends two put-file entries to ``self.put_files`` under
    ``<council>/json/`` and ``<council>/raw/`` respectively.
    """
    council = self.options["council"]
    file_name = councillor.as_file_name()

    json_entry = {
        "filePath": f"{council}/json/{file_name}.json",
        "fileContent": json.dumps(councillor.as_dict(), indent=4).encode("utf-8"),
    }
    raw_entry = {
        "filePath": f"{council}/raw/{file_name}.html",
        "fileContent": councillor_html.prettify().encode("utf-8"),
    }
    self.put_files.append(json_entry)
    self.put_files.append(raw_entry)

def save_councillor(self, raw_content, councillor_obj):
assert (
type(councillor_obj) == CouncillorBase
Expand All @@ -55,7 +92,14 @@ def report(self):
raise ValueError(
"Not many councillors found ({})".format(len(self.councillors))
)
self.console.log("Found {} councillors".format(len(self.councillors)))
if self.new_data:
self.console.log(
f"Found {len(self.councillors)} councillors with some new data"
)
else:
self.console.log(
f"Found {len(self.councillors)} councillors but no new data"
)


class HTMLCouncillorScraper(BaseCouncillorScraper):
Expand Down Expand Up @@ -110,12 +154,16 @@ class ModGovCouncillorScraper(BaseCouncillorScraper):
ext = "xml"

def run(self):
    """Fetch the ModGov ward XML and process every councillor in it.

    NOTE(review): the flattened diff showed ``get_councillors()`` assigned
    twice (old and new diff positions); a single fetch after the
    aws_lambda cleanup is the intended final code, kept here.
    """
    if self.options.get("aws_lambda"):
        # Lambda runs rebuild the data in CodeCommit; clear stale data first.
        self.delete_data_if_exists()

    wards = self.get_councillors()
    for ward in wards:
        for councillor_xml in ward.find_all("councillor"):
            councillor = self.get_single_councillor(ward, councillor_xml)
            self.process_councillor(councillor, councillor_xml)

    self.aws_tidy_up()
    self.report()

def format_councillor_api_url(self):
Expand Down
Loading