Committee notifier #54
Open
geoffkilpin wants to merge 2 commits into mysociety:master from geoffkilpin:committee-notifier
Changes from all commits (2 commits)
za_hansard/atc_scraper.py (86 additions, 0 deletions)

@@ -0,0 +1,86 @@
# -*- coding: utf-8 -*-
import distutils.spawn
import hashlib
import os
import sys
import re
import requests
import subprocess
import tempfile
import warnings
import datetime
import lxml.etree

import parslepy

from django.core.exceptions import ImproperlyConfigured
from django.conf import settings

from za_hansard.models import ATCDocument

# Largely based on question_scraper.py

# from https://github.com/scraperwiki/scraperwiki-python/blob/a96582f6c20cc1897f410d522e2a5bf37d301220/scraperwiki/utils.py#L38-L54
# Copied rather than included as the scraperwiki __init__.py was having trouble
# loading the sqlite code, which is something we don't actually need.

def ensure_executable_found(name):
    if not distutils.spawn.find_executable(name):
        raise ImproperlyConfigured("Can't find executable '{0}' which is needed by this code".format(name))

ensure_executable_found("pdftohtml")


def pdftoxml(pdfdata):
    """Converts PDF data to pdftohtml's XML representation."""
    pdffout = tempfile.NamedTemporaryFile(suffix='.pdf')
    pdffout.write(pdfdata)
    pdffout.flush()

    xmlin = tempfile.NamedTemporaryFile(mode='r', suffix='.xml')
    tmpxml = xmlin.name  # "temph.xml"
    cmd = 'pdftohtml -xml -nodrm -zoom 1.5 -enc UTF-8 -noframes "%s" "%s"' % (pdffout.name, os.path.splitext(tmpxml)[0])
    cmd = cmd + " >/dev/null 2>&1"  # can't turn off output, so throw away even stderr yeuch
    os.system(cmd)

    pdffout.close()
    xmldata = xmlin.read()
    xmlin.close()

    # pdftohtml version 0.18.4 occasionally produces bad markup of the form <b>...<i>...</b> </i>
    # Since we don't actually need <i> tags, we may as well get rid of them all now, which will fix this.
    # Note that we're working with a byte string version of utf-8 encoded data here.
    xmldata = re.sub(r'</?i>', '', xmldata)

    return xmldata

class ATCDocumentParser(object):
    def __init__(self, **kwargs):
        self.kwargs = kwargs

    @classmethod
    def check_committee_membership_announcement(cls, document_path):
        if os.path.exists(document_path):
            with open(document_path, 'rb') as f:
                contents = f.read()

            if not contents:
                return

            xmldata = pdftoxml(contents)

            if not xmldata:
                sys.stdout.write(' SKIPPING - Got no XML data\n')
                return

            text = lxml.etree.fromstring(xmldata)

            for el in text.iterfind('.//text'):
                if 'Membership of Committees' in re.match(ur'(?s)<text.*?>(.*?)</text>', lxml.etree.tostring(el, encoding='unicode')).group(1):
                    return True

        # committee announcement not found
        return False
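
Two side notes on this file, reusing the imports already at its top. First, the matching loop serialises each <text> element and then regexes the markup back apart; lxml can hand back an element's text content directly (including text inside nested tags such as <b>). A rough equivalent as a sketch, with the hypothetical name contains_committee_announcement:

def contains_committee_announcement(xml_root):
    # sketch: same check as above, using lxml's itertext() instead of
    # re-parsing each serialised <text> element with a regex
    for el in xml_root.iterfind('.//text'):
        if 'Membership of Committees' in ''.join(el.itertext()):
            return True
    return False

Second, the os.system() call in pdftoxml() builds a quoted shell string and throws all output away. The same invocation could be expressed with subprocess, which avoids shell quoting and surfaces a non-zero exit status; a minimal sketch under the same assumptions as pdftoxml() above (the name pdftoxml_subprocess is hypothetical):

def pdftoxml_subprocess(pdfdata):
    # sketch of pdftoxml() shelling out via subprocess rather than os.system
    pdffout = tempfile.NamedTemporaryFile(suffix='.pdf')
    pdffout.write(pdfdata)
    pdffout.flush()

    xmlin = tempfile.NamedTemporaryFile(mode='r', suffix='.xml')
    # pdftohtml appends ".xml" itself, so pass the name without its extension
    output_base = os.path.splitext(xmlin.name)[0]
    with open(os.devnull, 'w') as devnull:
        subprocess.check_call(
            ['pdftohtml', '-xml', '-nodrm', '-zoom', '1.5', '-enc', 'UTF-8',
             '-noframes', pdffout.name, output_base],
            stdout=devnull, stderr=devnull)

    pdffout.close()
    xmldata = xmlin.read()
    xmlin.close()

    # strip <i> tags, for the same pdftohtml 0.18.4 workaround as above
    return re.sub(r'</?i>', '', xmldata)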
za_hansard/management/commands/za_hansard_atc_scraper.py (232 additions, 0 deletions)

@@ -0,0 +1,232 @@
import pprint
import httplib2
import re
import datetime
import time
import sys

from optparse import make_option
from bs4 import BeautifulSoup

from django.conf import settings

from django.core.management.base import BaseCommand, CommandError
from django.core.mail import send_mail

# SourceUrlCouldNotBeRetrieved is raised by the model's file() method, caught below
from za_hansard.models import ATCDocument, SourceUrlCouldNotBeRetrieved
from ... import atc_scraper

# Largely based on za_hansard_check_for_new_sources.py and za_hansard_run_parsing.py


class FailedToRetrieveSourceException(Exception):
    pass


class Command(BaseCommand):
    help = 'Scrapes ZA Parliament Announcements, Tablings and Committee Reports documents. Currently only monitors for mentions of changes to committee membership.'
    option_list = BaseCommand.option_list + (
        make_option('--check-all',
            default=False,
            action='store_true',
            help="Don't stop when reaching a previously seen item (applies to checking contents and for new papers)",
        ),
        make_option('--check-all-papers',
            default=False,
            action='store_true',
            help="Don't stop when reaching a previously seen item (applies only to new papers)",
        ),
        make_option('--retry',
            default=False,
            action='store_true',
            help='Retry attempted (but not completed) parses and previously 404\'d documents',
        ),
        make_option('--check-for-papers',
            default=False,
            action='store_true',
            help='Check for new ATC papers',
        ),
        make_option('--check-committees',
            default=False,
            action='store_true',
            help='Check for changes to committee membership',
        ),
        make_option('--historical-limit',
            default='2014-05-07',
            type='string',
            help='Limit earliest historical entry to check (in yyyy-mm-dd format, default 2014-05-07)',
        ),
        make_option('--limit',
            default=0,
            type='int',
            help='Limit number of entries to check (applies to checking contents and for new papers)',
        ),
        make_option('--run-all-steps',
            default=False,
            action='store_true',
            help='Check for new papers and for new committee memberships',
        ),
    )

    def handle(self, *args, **options):
        self.historical_limit = datetime.datetime.strptime(options['historical_limit'], '%Y-%m-%d').date()
        self.limit = options['limit']
        self.check_all = options['check_all']
        self.check_all_papers = options['check_all_papers']
        self.retry = options['retry']

        if options['check_for_papers']:
            self.check_for_papers(options)
        elif options['check_committees']:
            self.check_committees(options)
        elif options['run_all_steps']:
            self.check_for_papers(options)
            self.check_committees(options)

    def check_committees(self, options):
        sources = ATCDocument.objects.all()
        if not self.check_all:
            sources = sources.filter(last_processing_success=None)
        if (not self.retry) and (not self.check_all):
            sources = sources.filter(is404=False).filter(last_processing_attempt=None)

        if not sources:
            print 'No documents to check.'

        for s in (sources[:self.limit] if self.limit else sources):
            if s.date < self.historical_limit:
                print "Reached historical limit. Stopping.\n"
                return

            s.last_processing_attempt = datetime.datetime.now()
            s.save()

            try:
                try:
                    filename = s.file()
                    if s.is404:
                        s.is404 = False
                        s.save()
                except SourceUrlCouldNotBeRetrieved as e:
                    s.is404 = True
                    s.save()
                    raise e

                if atc_scraper.ATCDocumentParser.check_committee_membership_announcement(filename):
                    self.stdout.write("Committee announcement found %s (%d)\n" % (s.document_name, s.document_number))
                    s.contains_committee_announcement = True

                    message = '''A committee announcement was found in the following ATC document:

Document: %s
Date: %s
House: %s
Language: %s
URL: %s
''' % (s.document_name, s.date, s.house, s.language, 'http://www.parliament.gov.za/live/' + s.url)

                    send_mail(
                        'New committee announcement found - People\'s Assembly',
                        message,
                        settings.FROM_EMAIL,
                        settings.ZA_COMMITTEE_NOTIFICATION_EMAIL,
                        fail_silently=False)

                s.last_processing_success = datetime.datetime.now()
                s.save()
                self.stdout.write("Processed %s (%d)\n" % (s.document_name, s.document_number))
            except Exception as e:
                self.stderr.write("WARN: Failed to run parsing: %s" % str(e))
    def check_for_papers(self, options):
        sources = self.retrieve_sources(0, options)
        sources.reverse()
        sources_db = [ATCDocument.objects.get_or_create(**source) for source in sources]
        sources_count = len(sources)
        created_count = sum([1 for (_, created) in sources_db if created])
        self.stdout.write('ATC documents found: %d\nATC documents created: %d\n' % (
            sources_count, created_count))

    def retrieve_sources(self, start, options):
        try:
            url = 'http://www.parliament.gov.za/live/content.php?Category_ID=227&DocumentStart=%d' % (start or 0)
            self.stdout.write("Retrieving %s\n" % url)
            h = httplib2.Http(settings.HTTPLIB2_CACHE_DIR)
            response, content = h.request(url)
            assert response.status == 200
            self.stdout.write("OK\n")

            # parse content
            soup = BeautifulSoup(
                content,
                'xml',
            )

            rx = re.compile(r'Displaying (\d+) (\d+) of the most recent (\d+)')

            pager = soup.find('td', text=rx)
            match = rx.search(pager.text)
            (pstart, pend, ptotal) = [int(p) for p in match.groups()]

            self.stdout.write("Processing %d to %d\n" % (pstart, pend))

            nodes = soup.findAll('a', text="View Document")

            def scrape(node):
                url = node['href']
                table = node.find_parent('table')
                rx = re.compile(r'>([^:<]*) : ([^<]*)<')

                data = {}
                for match in re.finditer(rx, str(table)):
                    groups = match.groups()
                    data[groups[0]] = groups[1]

                try:
                    data['Title'] = table.find('b').text
                except AttributeError:
                    data['Title'] = data.get('Document Summary', '(unknown)')

                try:
                    document_date = datetime.datetime.strptime(data['Date Published'], '%d %B %Y').date()
                except Exception as e:
                    raise CommandError("Date could not be parsed\n%s" % str(e))

                return {
                    'document_name': data['Document Name'],
                    'document_number': data['Document Number'],
                    'defaults': {
                        'url': url,
                        'title': data['Title'],
                        'language': data.get('Language', 'English'),
                        'house': data.get('House', '(unknown)'),
                        'date': document_date,
                    }
                }

            scraped = []
            for node in nodes:
                s = scrape(node)
                if ATCDocument.objects.filter(
                        document_name=s['document_name'],
                        document_number=s['document_number']).exists():
                    if (not self.check_all) and (not self.check_all_papers):
                        print "Reached seen document. Stopping.\n"
                        return scraped
                if s['defaults']['date'] < self.historical_limit:
                    print "Reached historical limit. Stopping.\n"
                    return scraped

                # otherwise
                scraped.append(s)

            if pend < (self.limit or ptotal):
                # NB following isn't phrased as a tail call, could rewrite if
                # that becomes important
                scraped = scraped + self.retrieve_sources(pend, options)
            return scraped

        except Exception as e:
            print >> sys.stderr, "ERROR: %s" % str(e)
            return []
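
Two usage notes on this command. It is driven entirely by the optparse flags above; from Python (a test, say, or a cron wrapper) the same entry points can be exercised with Django's call_command, with keyword names matching the option dests. A sketch:

from django.core.management import call_command

# check for newly published ATC papers only
call_command('za_hansard_atc_scraper', check_for_papers=True)

# full run: fetch new papers, then scan them for committee membership changes
call_command('za_hansard_atc_scraper', run_all_steps=True)

The command also assumes a few settings; the names come from the code above, but the values here are purely illustrative:

# settings.py (illustrative values only)
FROM_EMAIL = 'alerts@example.org'
# send_mail()'s recipient_list argument expects a sequence of addresses
ZA_COMMITTEE_NOTIFICATION_EMAIL = ['committees@example.org']
HTTPLIB2_CACHE_DIR = '/tmp/httplib2-cache'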
Review comment:

I think it would be worth factoring out retrieve_sources into a mixin, since it's almost exactly the same in za_hansard_check_for_new_sources; it'd just need the following additional parameters, I think:

- url
- the model (ATCDocument or Source)

Also, I think it would be worth replacing the recursion of that method with a loop - it doesn't seem worth adding to the stack for each new document.
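
For what it's worth, a sketch of that refactoring under the reviewer's suggestion: the mixin name RetrieveSourcesMixin, the hook attributes source_list_url and source_model, and the scrape_page helper are all hypothetical, and the per-page scraping is elided down to the pagination skeleton.

class RetrieveSourcesMixin(object):
    # subclasses supply the listing URL pattern and the model to store into
    source_list_url = None  # e.g. the content.php URL with a %d DocumentStart slot
    source_model = None     # ATCDocument or Source

    def scrape_page(self, url):
        # hypothetical helper: fetch one listing page and return
        # (pstart, pend, ptotal, items), as retrieve_sources does above
        raise NotImplementedError

    def retrieve_sources(self, start, options):
        scraped = []
        while True:
            pstart, pend, ptotal, items = self.scrape_page(self.source_list_url % start)
            for s in items:
                seen = self.source_model.objects.filter(
                    document_name=s['document_name'],
                    document_number=s['document_number']).exists()
                if seen and not (self.check_all or self.check_all_papers):
                    return scraped  # reached a previously seen document
                if s['defaults']['date'] < self.historical_limit:
                    return scraped  # reached the historical limit
                scraped.append(s)
            if pend >= (self.limit or ptotal):
                return scraped  # no more pages wanted
            start = pend  # iterate to the next page instead of recursing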