Committee notifier #54

Open · wants to merge 2 commits into master
86 changes: 86 additions & 0 deletions za_hansard/atc_scraper.py
@@ -0,0 +1,86 @@
# -*- coding: utf-8 -*-
import distutils.spawn
import hashlib
import os
import sys
import re
import requests
import subprocess
import tempfile
import warnings
import datetime
import lxml.etree

import parslepy

from django.core.exceptions import ImproperlyConfigured
from django.conf import settings

from za_hansard.models import ATCDocument

#Largely based on question_scraper.py

# from https://github.com/scraperwiki/scraperwiki-python/blob/a96582f6c20cc1897f410d522e2a5bf37d301220/scraperwiki/utils.py#L38-L54
# Copied rather than included as the scraperwiki __init__.py was having trouble
# loading the sqlite code, which is something we don't actually need.

def ensure_executable_found(name):
if not distutils.spawn.find_executable(name):
raise ImproperlyConfigured("Can't find executable '{0}' which is needed by this code".format(name))

ensure_executable_found("pdftohtml")
def pdftoxml(pdfdata):
"""converts pdf file to xml file"""
pdffout = tempfile.NamedTemporaryFile(suffix='.pdf')
pdffout.write(pdfdata)
pdffout.flush()

xmlin = tempfile.NamedTemporaryFile(mode='r', suffix='.xml')
tmpxml = xmlin.name # "temph.xml"
cmd = 'pdftohtml -xml -nodrm -zoom 1.5 -enc UTF-8 -noframes "%s" "%s"' % (pdffout.name, os.path.splitext(tmpxml)[0])
cmd = cmd + " >/dev/null 2>&1" # can't turn off output, so throw away even stderr yeuch
os.system(cmd)

pdffout.close()
#xmlfin = open(tmpxml)
xmldata = xmlin.read()
xmlin.close()

# pdftohtml version 0.18.4 occasionally produces bad markup of the form <b>...<i>...</b> </i>
# Since we don't actually need <i> tags, we may as well get rid of them all now, which will fix this.
# Note that we're working with a byte string version of utf-8 encoded data here.

xmldata = re.sub(r'</?i>', '', xmldata)

return xmldata

class ATCDocumentParser(object):
def __init__(self, **kwargs):
self.kwargs = kwargs

@classmethod
def check_committee_membership_announcement(cls, document_path):

if os.path.exists(document_path):
with open(document_path) as f:
contents = f.read()

if not contents:
return

xmldata = pdftoxml(contents)

if not xmldata:
sys.stdout.write(' SKIPPING - Got no XML data\n')
return

text = lxml.etree.fromstring(xmldata)

for el in text.iterfind('.//text'):
if 'Membership of Committees' in re.match(ur'(?s)<text.*?>(.*?)</text>', lxml.etree.tostring(el, encoding='unicode')).group(1):
return True

#committee announcement not found
return False


232 changes: 232 additions & 0 deletions za_hansard/management/commands/za_hansard_atc_scraper.py
@@ -0,0 +1,232 @@
import pprint
import httplib2
import re
import datetime
import time
import sys

from optparse import make_option
from bs4 import BeautifulSoup

from django.conf import settings


from django.core.management.base import BaseCommand, CommandError
from django.core.mail import send_mail

from za_hansard.models import ATCDocument, SourceUrlCouldNotBeRetrieved
from ... import atc_scraper

#Largely based on za_hansard_check_for_new_sources.py and za_hansard_run_parsing.py

class FailedToRetrieveSourceException(Exception):
pass

class Command(BaseCommand):
help = 'Scrapes ZA Parliament Announcements, Tablings and Committee Reports documents. Currently only monitors for mentions of changes to committee membership.'
option_list = BaseCommand.option_list + (
make_option('--check-all',
default=False,
action='store_true',
help="Don't stop when when reaching a previously seen item (applies to checking contents and for new papers)",
),
make_option('--check-all-papers',
default=False,
action='store_true',
help="Don't stop when when reaching a previously seen item (applies only to new papers)",
),
make_option('--retry',
default=False,
action='store_true',
help='Retry attempted (but not completed) parses and previously 404\'d documents',
),
make_option('--check-for-papers',
default=False,
action='store_true',
help='Check for new ATC papers',
),
make_option('--check-committees',
default=False,
action='store_true',
help='Check for changes to committee membership',
),
make_option('--historical-limit',
default='2014-05-07',
type='str',
help='Limit earliest historical entry to check (in yyyy-mm-dd format, default 2014-05-07)',
),
make_option('--limit',
default=0,
type='int',
help='Limit number of entries to check (applies to checking contents and for new papers)',
),
make_option('--run-all-steps',
default=False,
action='store_true',
help='Check for new papers and for new committee memberships',
),
)
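
# Illustrative invocations only (assuming the project's usual manage.py entry
# point; the command name comes from this module's filename and the flags are
# the options defined above):
#
#   ./manage.py za_hansard_atc_scraper --check-for-papers
#   ./manage.py za_hansard_atc_scraper --check-committees --limit 10
#   ./manage.py za_hansard_atc_scraper --run-all-steps --historical-limit 2014-05-07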

def handle(self, *args, **options):

self.historical_limit = datetime.datetime.strptime(options['historical_limit'], '%Y-%m-%d').date()
self.limit = options['limit']
self.check_all = options['check_all']
self.check_all_papers = options['check_all_papers']
self.retry = options['retry']

if options['check_for_papers']:
self.check_for_papers(options)
elif options['check_committees']:
self.check_committees(options)
elif options['run_all_steps']:
self.check_for_papers(options)
self.check_committees(options)

def check_committees(self, options):
sources = ATCDocument.objects.all()
if not self.check_all:
sources = sources.filter( last_processing_success=None )
if (not self.retry) and (not self.check_all):
sources = sources.filter(is404 = False).filter( last_processing_attempt=None )

if not sources:
print 'No documents to check.'

for s in (sources[:self.limit] if self.limit else sources):

if s.date < self.historical_limit:
print "Reached historical limit. Stopping.\n"
return

s.last_processing_attempt = datetime.datetime.now()
s.save()

try:
try:
filename = s.file()
if s.is404:
s.is404 = False
s.save()
except SourceUrlCouldNotBeRetrieved as e:
s.is404 = True
s.save()
raise e

if atc_scraper.ATCDocumentParser.check_committee_membership_announcement(filename):
self.stdout.write( "Committee announcement found %s (%d)\n" % (s.document_name, s.document_number) )
s.contains_committee_announcement = True

message = '''A committee announcement was found in the following ATC document:\n
\n
Document: %s\n
Date: %s\n
House: %s\n
Language: %s\n
URL: %s
''' % (s.document_name, s.date, s.house, s.language, 'http://www.parliament.gov.za/live/' + s.url)

send_mail('New committee announcement found - People\'s Assembly', message, settings.FROM_EMAIL, settings.ZA_COMMITTEE_NOTIFICATION_EMAIL, fail_silently=False)

s.last_processing_success = datetime.datetime.now()

s.save()
self.stdout.write( "Processed %s (%d)\n" % (s.document_name, s.document_number) )
except Exception as e:
# raise CommandError("Failed to run parsing: %s" % str(e))
self.stderr.write("WARN: Failed to run parsing: %s" % str(e))

def check_for_papers(self, options):
sources = self.retrieve_sources(0, options)
sources.reverse()
sources_db = [ATCDocument.objects.get_or_create(**source) for source in sources]
sources_count = len(sources)
created_count = sum([1 for (_,created) in sources_db if created])
self.stdout.write('ATC documents found: %d\nATC documents created: %d\n' % (
sources_count, created_count))

def retrieve_sources(self, start, options):
Review comment (Contributor):

I think it would be worth factoring out retrieve_sources into a mixin, since it's almost exactly the same in za_hansard_check_for_new_sources; it'd just need the following additional parameters, I think:

  • the category ID for the URL
  • the source model class (either ATCDocument or Source)
  • the limit

Also, I think it would be worth replacing the recursion of that method with a loop - it doesn't seem worth adding to the stack for each new document.
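
As a concrete illustration of that suggestion, a mixin along these lines might work. This is a rough sketch only, not code in this PR: the SourceRetrievalMixin name and the scrape_page hook are invented for the example, and the three class attributes are the parameters listed above.

class SourceRetrievalMixin(object):
    # Parameters the comment above suggests passing in; the values shown are
    # illustrative assumptions, not existing settings.
    category_id = None    # e.g. 227 for ATC documents
    source_model = None   # e.g. ATCDocument or Source
    limit = 0

    def scrape_page(self, start):
        """Fetch one listing page starting at `start` and return a tuple
        (pend, ptotal, sources): the last item shown on the page, the total
        number available, and a list of scraped source dicts.  The body would
        be the existing per-page logic from retrieve_sources."""
        raise NotImplementedError

    def retrieve_sources(self, start, options):
        scraped = []
        while True:
            pend, ptotal, page_sources = self.scrape_page(start)
            scraped.extend(page_sources)
            # Same stopping condition as the recursive version, but the next
            # page is requested by looping rather than by a recursive call.
            if pend >= (self.limit or ptotal):
                return scraped
            start = pend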


try:
url = 'http://www.parliament.gov.za/live/content.php?Category_ID=227&DocumentStart=%d' % (start or 0)
self.stdout.write("Retrieving %s\n" % url)
h = httplib2.Http( settings.HTTPLIB2_CACHE_DIR )
response, content = h.request(url)
assert response.status == 200
self.stdout.write("OK\n")
# content = open('test.html').read()

# parse content
soup = BeautifulSoup(
content,
'xml',
)

rx = re.compile(r'Displaying (\d+) (\d+) of the most recent (\d+)')

pager = soup.find('td', text=rx)
match = rx.search(pager.text)
(pstart, pend, ptotal) = [int(p) for p in match.groups()]

self.stdout.write( "Processing %d to %d\n" % (pstart, pend) )

nodes = soup.findAll( 'a', text="View Document" )
def scrape(node):
url = node['href']
table = node.find_parent('table')
rx = re.compile(r'>([^:<]*) : ([^<]*)<')

data = {}
for match in re.finditer(rx, str(table)):
groups = match.groups()
data[groups[0]] = groups[1]

try:
data['Title'] = table.find('b').text
except AttributeError:
data['Title'] = data.get('Document Summary', '(unknown)')

try:
document_date = datetime.datetime.strptime(data['Date Published'], '%d %B %Y').date()
except Exception as e:
raise CommandError( "Date could not be parsed\n%s" % str(e) )
# document_date = datetime.date.today()

#(obj, created) = Source.objects.get_or_create(
return {
'document_name': data['Document Name'],
'document_number': data['Document Number'],
'defaults': {
'url': url,
'title': data['Title'],
'language': data.get('Language', 'English'),
'house': data.get('House', '(unknown)'),
'date': document_date,
}
}
scraped = []
for node in nodes:
s = scrape(node)
if ATCDocument.objects.filter(
document_name = s['document_name'],
document_number = s['document_number']).exists():
if (not self.check_all) and (not self.check_all_papers):
print "Reached seen document. Stopping.\n"
return scraped
if s['defaults']['date'] < self.historical_limit:
print "Reached historical limit. Stopping.\n"
return scraped

# otherwise
scraped.append(s)

if pend < (self.limit or ptotal):
# NB following isn't phrased as a tail call, could rewrite if
# that becomes important
scraped = scraped + self.retrieve_sources(pend, options)
return scraped

except Exception as e:
print >> sys.stderr, "ERROR: %s" % str(e)
return []