From 73bad5b91e823f27fc2aad8fc4be3d280a26f0b3 Mon Sep 17 00:00:00 2001 From: Geoffrey Kilpin Date: Sun, 6 Jul 2014 22:48:24 +0200 Subject: [PATCH 1/2] Create ATCDocument model Creates the ATCDocument model, which is largely identical to the Source model. --- .../migrations/0028_auto__add_atcdocument.py | 203 ++++++++++++++++++ za_hansard/models.py | 118 ++++++++++ 2 files changed, 321 insertions(+) create mode 100644 za_hansard/migrations/0028_auto__add_atcdocument.py diff --git a/za_hansard/migrations/0028_auto__add_atcdocument.py b/za_hansard/migrations/0028_auto__add_atcdocument.py new file mode 100644 index 0000000..9b63780 --- /dev/null +++ b/za_hansard/migrations/0028_auto__add_atcdocument.py @@ -0,0 +1,203 @@ +# -*- coding: utf-8 -*- +from south.utils import datetime_utils as datetime +from south.db import db +from south.v2 import SchemaMigration +from django.db import models + + +class Migration(SchemaMigration): + + def forwards(self, orm): + # Adding model 'ATCDocument' + db.create_table(u'za_hansard_atcdocument', ( + (u'id', self.gf('django.db.models.fields.AutoField')(primary_key=True)), + ('title', self.gf('django.db.models.fields.CharField')(max_length=200)), + ('document_name', self.gf('django.db.models.fields.CharField')(max_length=200)), + ('document_number', self.gf('django.db.models.fields.IntegerField')(unique=True)), + ('date', self.gf('django.db.models.fields.DateField')()), + ('url', self.gf('django.db.models.fields.URLField')(max_length=1000)), + ('is404', self.gf('django.db.models.fields.BooleanField')(default=False)), + ('house', self.gf('django.db.models.fields.CharField')(max_length=200)), + ('language', self.gf('django.db.models.fields.CharField')(max_length=200)), + ('last_processing_attempt', self.gf('django.db.models.fields.DateTimeField')(null=True, blank=True)), + ('last_processing_success', self.gf('django.db.models.fields.DateTimeField')(null=True, blank=True)), + ('contains_committee_announcement', 
self.gf('django.db.models.fields.BooleanField')(default=False)), + )) + db.send_create_signal(u'za_hansard', ['ATCDocument']) + + + def backwards(self, orm): + # Deleting model 'ATCDocument' + db.delete_table(u'za_hansard_atcdocument') + + + models = { + u'auth.group': { + 'Meta': {'object_name': 'Group'}, + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'name': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '80'}), + 'permissions': ('django.db.models.fields.related.ManyToManyField', [], {'to': u"orm['auth.Permission']", 'symmetrical': 'False', 'blank': 'True'}) + }, + u'auth.permission': { + 'Meta': {'ordering': "(u'content_type__app_label', u'content_type__model', u'codename')", 'unique_together': "((u'content_type', u'codename'),)", 'object_name': 'Permission'}, + 'codename': ('django.db.models.fields.CharField', [], {'max_length': '100'}), + 'content_type': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['contenttypes.ContentType']"}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'name': ('django.db.models.fields.CharField', [], {'max_length': '50'}) + }, + u'auth.user': { + 'Meta': {'object_name': 'User'}, + 'date_joined': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}), + 'email': ('django.db.models.fields.EmailField', [], {'max_length': '75', 'blank': 'True'}), + 'first_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}), + 'groups': ('django.db.models.fields.related.ManyToManyField', [], {'symmetrical': 'False', 'related_name': "u'user_set'", 'blank': 'True', 'to': u"orm['auth.Group']"}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'is_active': ('django.db.models.fields.BooleanField', [], {'default': 'True'}), + 'is_staff': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), + 'is_superuser': 
('django.db.models.fields.BooleanField', [], {'default': 'False'}), + 'last_login': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}), + 'last_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}), + 'password': ('django.db.models.fields.CharField', [], {'max_length': '128'}), + 'user_permissions': ('django.db.models.fields.related.ManyToManyField', [], {'symmetrical': 'False', 'related_name': "u'user_set'", 'blank': 'True', 'to': u"orm['auth.Permission']"}), + 'username': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '30'}) + }, + u'contenttypes.contenttype': { + 'Meta': {'ordering': "('name',)", 'unique_together': "(('app_label', 'model'),)", 'object_name': 'ContentType', 'db_table': "'django_content_type'"}, + 'app_label': ('django.db.models.fields.CharField', [], {'max_length': '100'}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'model': ('django.db.models.fields.CharField', [], {'max_length': '100'}), + 'name': ('django.db.models.fields.CharField', [], {'max_length': '100'}) + }, + u'instances.instance': { + 'Meta': {'object_name': 'Instance'}, + 'created_by': ('django.db.models.fields.related.ForeignKey', [], {'blank': 'True', 'related_name': "'created_instances'", 'null': 'True', 'to': u"orm['auth.User']"}), + 'description': ('django.db.models.fields.TextField', [], {'blank': 'True'}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'label': ('instances.fields.DNSLabelField', [], {'unique': 'True', 'max_length': '63', 'db_index': 'True'}), + 'title': ('django.db.models.fields.CharField', [], {'max_length': '100'}), + 'users': ('django.db.models.fields.related.ManyToManyField', [], {'symmetrical': 'False', 'related_name': "'instances'", 'blank': 'True', 'to': u"orm['auth.User']"}) + }, + u'speeches.section': { + 'Meta': {'ordering': "('id',)", 'unique_together': "(('parent', 'slug', 'instance'),)", 
'object_name': 'Section'}, + 'created': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'instance': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['instances.Instance']"}), + 'modified': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'blank': 'True'}), + 'parent': ('django.db.models.fields.related.ForeignKey', [], {'blank': 'True', 'related_name': "'children'", 'null': 'True', 'to': u"orm['speeches.Section']"}), + 'slug': ('sluggable.fields.SluggableField', [], {'unique_with': "('parent', 'instance')", 'max_length': '50', 'populate_from': "'title'"}), + 'title': ('django.db.models.fields.TextField', [], {'blank': 'True'}) + }, + u'za_hansard.answer': { + 'Meta': {'unique_together': "(('oral_number', 'house', 'year'), ('written_number', 'house', 'year'), ('president_number', 'house', 'year'), ('dp_number', 'house', 'year'))", 'object_name': 'Answer'}, + 'date': ('django.db.models.fields.DateField', [], {}), + 'date_published': ('django.db.models.fields.DateField', [], {}), + 'document_name': ('django.db.models.fields.TextField', [], {}), + 'dp_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}), + 'house': ('django.db.models.fields.CharField', [], {'max_length': '1', 'db_index': 'True'}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'language': ('django.db.models.fields.TextField', [], {}), + 'name': ('django.db.models.fields.TextField', [], {}), + 'oral_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}), + 'president_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}), + 'processed_code': ('django.db.models.fields.IntegerField', [], {'default': '0', 'db_index': 'True'}), + 'text': ('django.db.models.fields.TextField', [], {}), + 'type': 
('django.db.models.fields.TextField', [], {}), + 'url': ('django.db.models.fields.TextField', [], {'db_index': 'True'}), + 'written_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}), + 'year': ('django.db.models.fields.IntegerField', [], {'db_index': 'True'}) + }, + u'za_hansard.atcdocument': { + 'Meta': {'ordering': "['-date', 'document_name']", 'object_name': 'ATCDocument'}, + 'contains_committee_announcement': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), + 'date': ('django.db.models.fields.DateField', [], {}), + 'document_name': ('django.db.models.fields.CharField', [], {'max_length': '200'}), + 'document_number': ('django.db.models.fields.IntegerField', [], {'unique': 'True'}), + 'house': ('django.db.models.fields.CharField', [], {'max_length': '200'}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'is404': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), + 'language': ('django.db.models.fields.CharField', [], {'max_length': '200'}), + 'last_processing_attempt': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}), + 'last_processing_success': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}), + 'title': ('django.db.models.fields.CharField', [], {'max_length': '200'}), + 'url': ('django.db.models.fields.URLField', [], {'max_length': '1000'}) + }, + u'za_hansard.pmgcommitteeappearance': { + 'Meta': {'object_name': 'PMGCommitteeAppearance'}, + 'committee': ('django.db.models.fields.TextField', [], {}), + 'committee_url': ('django.db.models.fields.TextField', [], {}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'meeting': ('django.db.models.fields.TextField', [], {}), + 'meeting_date': ('django.db.models.fields.DateField', [], {}), + 'meeting_url': ('django.db.models.fields.TextField', [], {}), + 'party': ('django.db.models.fields.TextField', [], {}), + 
'person': ('django.db.models.fields.TextField', [], {}), + 'report': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'appearances'", 'null': 'True', 'to': u"orm['za_hansard.PMGCommitteeReport']"}), + 'text': ('django.db.models.fields.TextField', [], {}) + }, + u'za_hansard.pmgcommitteereport': { + 'Meta': {'object_name': 'PMGCommitteeReport'}, + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'last_sayit_import': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}), + 'meeting_url': ('django.db.models.fields.TextField', [], {}), + 'premium': ('django.db.models.fields.BooleanField', [], {}), + 'processed': ('django.db.models.fields.BooleanField', [], {}), + 'sayit_section': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['speeches.Section']", 'null': 'True', 'on_delete': 'models.PROTECT', 'blank': 'True'}) + }, + u'za_hansard.question': { + 'Meta': {'unique_together': "(('written_number', 'house', 'year'), ('oral_number', 'house', 'year'), ('president_number', 'house', 'year'), ('dp_number', 'house', 'year'), ('id_number', 'house', 'year'))", 'object_name': 'Question'}, + 'answer': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'question'", 'null': 'True', 'to': u"orm['za_hansard.Answer']"}), + 'answer_type': ('django.db.models.fields.CharField', [], {'max_length': '1'}), + 'askedby': ('django.db.models.fields.TextField', [], {}), + 'date': ('django.db.models.fields.DateField', [], {}), + 'date_transferred': ('django.db.models.fields.DateField', [], {'null': 'True'}), + 'dp_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}), + 'house': ('django.db.models.fields.CharField', [], {'max_length': '1', 'db_index': 'True'}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'id_number': ('django.db.models.fields.IntegerField', [], {'db_index': 'True'}), + 'identifier': 
('django.db.models.fields.CharField', [], {'max_length': '10', 'db_index': 'True'}), + 'intro': ('django.db.models.fields.TextField', [], {}), + 'last_sayit_import': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}), + 'oral_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}), + 'paper': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['za_hansard.QuestionPaper']", 'null': 'True', 'on_delete': 'models.SET_NULL'}), + 'president_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}), + 'question': ('django.db.models.fields.TextField', [], {}), + 'questionto': ('django.db.models.fields.TextField', [], {}), + 'sayit_section': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['speeches.Section']", 'null': 'True', 'on_delete': 'models.PROTECT', 'blank': 'True'}), + 'translated': ('django.db.models.fields.BooleanField', [], {}), + 'written_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}), + 'year': ('django.db.models.fields.IntegerField', [], {'db_index': 'True'}) + }, + u'za_hansard.questionpaper': { + 'Meta': {'unique_together': "(('year', 'issue_number', 'house', 'parliament_number'),)", 'object_name': 'QuestionPaper'}, + 'date_published': ('django.db.models.fields.DateField', [], {}), + 'document_name': ('django.db.models.fields.TextField', [], {'max_length': '32'}), + 'document_number': ('django.db.models.fields.IntegerField', [], {}), + 'house': ('django.db.models.fields.CharField', [], {'max_length': '64'}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'issue_number': ('django.db.models.fields.IntegerField', [], {}), + 'language': ('django.db.models.fields.CharField', [], {'max_length': '16'}), + 'parliament_number': ('django.db.models.fields.IntegerField', [], {}), + 'session_number': ('django.db.models.fields.IntegerField', [], {}), + 'source_url': 
('django.db.models.fields.URLField', [], {'max_length': '1000'}), + 'text': ('django.db.models.fields.TextField', [], {}), + 'year': ('django.db.models.fields.IntegerField', [], {}) + }, + u'za_hansard.source': { + 'Meta': {'ordering': "['-date', 'document_name']", 'object_name': 'Source'}, + 'date': ('django.db.models.fields.DateField', [], {}), + 'document_name': ('django.db.models.fields.CharField', [], {'max_length': '200'}), + 'document_number': ('django.db.models.fields.IntegerField', [], {'unique': 'True'}), + 'house': ('django.db.models.fields.CharField', [], {'max_length': '200'}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'is404': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), + 'language': ('django.db.models.fields.CharField', [], {'max_length': '200'}), + 'last_processing_attempt': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}), + 'last_processing_success': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}), + 'last_sayit_import': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}), + 'sayit_section': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['speeches.Section']", 'null': 'True', 'on_delete': 'models.PROTECT', 'blank': 'True'}), + 'title': ('django.db.models.fields.CharField', [], {'max_length': '200'}), + 'url': ('django.db.models.fields.URLField', [], {'max_length': '1000'}) + } + } + + complete_apps = ['za_hansard'] \ No newline at end of file diff --git a/za_hansard/models.py b/za_hansard/models.py index efac72b..43a8562 100644 --- a/za_hansard/models.py +++ b/za_hansard/models.py @@ -436,3 +436,121 @@ class Meta: # 1) At least one of written_number and oral_number must be non-null. #CREATE TABLE completed_documents (`url` string); + +class ATCDocument(models.Model): + """ + ZA Parliament Announcement, Tablings and Committee Reports. 
+    """
+
+    title = models.CharField(max_length=200)
+    document_name = models.CharField(max_length=200)
+    document_number = models.IntegerField(unique=True)
+    date = models.DateField()
+    url = models.URLField(max_length=1000)
+    is404 = models.BooleanField(default=False)
+    house = models.CharField(max_length=200)
+    language = models.CharField(max_length=200)
+
+    last_processing_attempt = models.DateTimeField(blank=True, null=True)
+    last_processing_success = models.DateTimeField(blank=True, null=True)
+
+    contains_committee_announcement = models.BooleanField(default=False)
+
+    class Meta:
+        ordering = [ '-date', 'document_name' ]
+
+
+    def __unicode__(self):
+        return self.document_name
+
+
+    def delete(self, *args, **kwargs):
+        """After deleting from db, delete the cached file too (arguments are passed on to Model.delete)"""
+        cache_file_path = self.cache_file_path()
+        super( ATCDocument, self ).delete(*args, **kwargs)
+
+        if os.path.exists( cache_file_path ):
+            os.remove( cache_file_path )
+
+
+    def file(self, debug=False):
+        """
+        Return the path of the locally cached copy of the resource that the url points to.
+
+        Should check the local cache first, and fetch and store if it is not
+        found there.
+
+        Raises a SourceUrlCouldNotBeRetrieved exception if URL could not be
+        retrieved.
+        """
+        cache_file_path = self.cache_file_path()
+
+        found = os.path.isfile(cache_file_path)
+
+        if debug:
+            print >> sys.stderr, "%s (%s)" % (cache_file_path, found)
+
+        # If the cached file already exists, just return its path
+        if found:
+            return cache_file_path
+
+        # If not, fetch the file, save it to the cache and then return the cache path
+        h = httplib2.Http()
+        url = 'http://www.parliament.gov.za/live/' + self.url
+
+        def request_url(url):
+            if debug:
+                print >> sys.stderr, 'Requesting %s' % url
+            (response, content) = h.request(url)
+            if response.status != 200:
+                raise SourceUrlCouldNotBeRetrieved("status code: %s, url: %s" % (response.status, self.url) )
+            self.is404 = False
+            self.save()
+            return (response, content)
+
+        try:
+            (response, content) = request_url(url)
+        except SourceUrlCouldNotBeRetrieved as e:
+            try:
+                if not url[-4:] == '.doc':
+                    (response, content) = request_url(url + '.doc')
+                    self.url = self.url + '.doc'
+                    self.save()
+                else:
+                    raise e
+            except Exception:
+                raise e
+
+        if not content:
+            raise SourceUrlCouldNotBeRetrieved("WTF?")
+        with open(cache_file_path, "wb") as new_cache_file:
+            new_cache_file.write(content)
+
+        return cache_file_path
+
+    def cache_file_path(self):
+        """Absolute path to the cache file for this source"""
+
+        id_str= "%05u" % self.id
+
+        # do some simple partitioning
+        # FIXME - put in something to prevent the test suite overwriting non-test files.
+ aaa = id_str[-1] + bbb = id_str[-2] + cache_dir = os.path.join(settings.HANSARD_CACHE, aaa, bbb) + + # check that the dir exists + if not os.path.exists( cache_dir ): + os.makedirs( cache_dir ) + + d = self.date.strftime('%Y-%m-%d') + + # create the path to the file + cache_file_path = os.path.join(cache_dir, '-'.join([d, id_str, self.document_name])) + return cache_file_path + + def xml_file_path(self): + xml_file_path = '%s.xml' % self.cache_file_path() + if os.path.isfile(xml_file_path): + return xml_file_path + return None From abb623a7105a3c37049fd1087af35461e44079de Mon Sep 17 00:00:00 2001 From: Geoffrey Kilpin Date: Mon, 7 Jul 2014 20:44:19 +0200 Subject: [PATCH 2/2] ATC scraper and committee change notifier Scrapes ATC documents and detects announcements of changes to committee membership. --- za_hansard/atc_scraper.py | 86 +++++++ .../commands/za_hansard_atc_scraper.py | 232 ++++++++++++++++++ 2 files changed, 318 insertions(+) create mode 100644 za_hansard/atc_scraper.py create mode 100644 za_hansard/management/commands/za_hansard_atc_scraper.py diff --git a/za_hansard/atc_scraper.py b/za_hansard/atc_scraper.py new file mode 100644 index 0000000..6c10bf4 --- /dev/null +++ b/za_hansard/atc_scraper.py @@ -0,0 +1,86 @@ +# -*- coding: utf-8 -*- +import distutils.spawn +import hashlib +import os +import sys +import re +import requests +import subprocess +import tempfile +import warnings +import datetime +import lxml.etree + +import parslepy + +from django.core.exceptions import ImproperlyConfigured +from django.conf import settings + +from za_hansard.models import ATCDocument + +#Largely based on question_scraper.py + +# from https://github.com/scraperwiki/scraperwiki-python/blob/a96582f6c20cc1897f410d522e2a5bf37d301220/scraperwiki/utils.py#L38-L54 +# Copied rather than included as the scraperwiki __init__.py was having trouble +# loading the sqlite code, which is something we don't actually need. 
+
+def ensure_executable_found(name):
+    if not distutils.spawn.find_executable(name):
+        raise ImproperlyConfigured("Can't find executable '{0}' which is needed by this code".format(name))
+
+ensure_executable_found("pdftohtml")
+def pdftoxml(pdfdata):
+    """Convert PDF data to XML by running pdftohtml on a temporary file and return the XML data"""
+    pdffout = tempfile.NamedTemporaryFile(suffix='.pdf')
+    pdffout.write(pdfdata)
+    pdffout.flush()
+
+    xmlin = tempfile.NamedTemporaryFile(mode='r', suffix='.xml')
+    tmpxml = xmlin.name # "temph.xml"
+    cmd = 'pdftohtml -xml -nodrm -zoom 1.5 -enc UTF-8 -noframes "%s" "%s"' % (pdffout.name, os.path.splitext(tmpxml)[0])
+    cmd = cmd + " >/dev/null 2>&1" # can't turn off output, so throw away even stderr yeuch
+    os.system(cmd)
+
+    pdffout.close()
+    #xmlfin = open(tmpxml)
+    xmldata = xmlin.read()
+    xmlin.close()
+
+    # pdftohtml version 0.18.4 occasionally produces bad markup of the form <b>...<i>...</b> </i>
+    # Since we don't actually need <i> tags, we may as well get rid of them all now, which will fix this.
+    # Note that we're working with a byte string version of utf-8 encoded data here.
+
+    xmldata = re.sub(r'</?i>', '', xmldata)
+
+    return xmldata
+
+class ATCDocumentParser(object):
+    def __init__(self, **kwargs):
+        self.kwargs = kwargs
+
+    @classmethod
+    def check_committee_membership_announcement(cls, document_path):
+        contents = None
+        if os.path.exists(document_path):
+            with open(document_path, 'rb') as f:
+                contents = f.read()
+
+        if not contents:
+            return
+
+        xmldata = pdftoxml(contents)
+
+        if not xmldata:
+            sys.stdout.write(' SKIPPING - Got no XML data\n')
+            return
+
+        text = lxml.etree.fromstring(xmldata)
+
+        for el in text.iterfind('.//text'):
+            if 'Membership of Committees' in re.match(ur'(?s)<text.*?>(.*?)</text>', lxml.etree.tostring(el, encoding='unicode')).group(1):
+                return True
+
+        #committee announcement not found
+        return False
+
+
diff --git a/za_hansard/management/commands/za_hansard_atc_scraper.py b/za_hansard/management/commands/za_hansard_atc_scraper.py
new file mode 100644
index 0000000..8f41c48
--- /dev/null
+++ b/za_hansard/management/commands/za_hansard_atc_scraper.py
@@ -0,0 +1,232 @@
+import pprint
+import httplib2
+import re
+import datetime
+import time
+import sys
+
+from optparse import make_option
+from bs4 import BeautifulSoup
+
+from django.conf import settings
+
+
+from django.core.management.base import BaseCommand, CommandError
+from django.core.mail import send_mail
+
+from za_hansard.models import ATCDocument, SourceUrlCouldNotBeRetrieved
+from ... import atc_scraper
+
+#Largely based on za_hansard_check_for_new_sources.py and za_hansard_run_parsing.py
+
+class FailedToRetrieveSourceException (Exception):
+    pass
+
+class Command(BaseCommand):
+    help = 'Scrapes ZA Parliament Announcements, Tablings and Committee Reports documents. Currently only monitors for mentions of changes to committee membership.'
+    option_list = BaseCommand.option_list + (
+        make_option('--check-all',
+            default=False,
+            action='store_true',
+            help="Don't stop when when reaching a previously seen item (applies to checking contents and for new papers)",
+        ),
+        make_option('--check-all-papers',
+            default=False,
+            action='store_true',
+            help="Don't stop when when reaching a previously seen item (applies only to new papers)",
+        ),
+        make_option('--retry',
+            default=False,
+            action='store_true',
+            help='Retry attempted (but not completed) parses and previously 404\'d documents',
+        ),
+        make_option('--check-for-papers',
+            default=False,
+            action='store_true',
+            help='Check for new ATC papers',
+        ),
+        make_option('--check-committees',
+            default=False,
+            action='store_true',
+            help='Checks for changes to committee membership ',
+        ),
+        make_option('--historical-limit',
+            default='2014-05-07',
+            type='str',
+            help='Limit earliest historical entry to check (in yyyy-mm-dd format, default 2014-05-07)',
+        ),
+        make_option('--limit',
+            default=0,
+            type='int',
+            help='Limit number of entries to check (applies to checking contents and for new papers)',
+        ),
+        make_option('--run-all-steps',
+            default=False,
+            action='store_true',
+            help='Check for new papers and for new committee memberships',
+        ),
+    )
+
+    def handle(self, *args, **options):
+
+        self.historical_limit = datetime.datetime.strptime(options['historical_limit'], '%Y-%m-%d').date()
+        self.limit = options['limit']
+        self.check_all = options['check_all']
+        self.check_all_papers = options['check_all_papers']
+        self.retry = options['retry']
+
+        if options['check_for_papers']:
+            self.check_for_papers(options)
+        elif options['check_committees']:
+            self.check_committees(options)
+        elif options['run_all_steps']:
+            self.check_for_papers(options)
+            self.check_committees(options)
+
+    def check_committees(self, options):
+        sources = ATCDocument.objects.all()
+        if not self.check_all:
+            sources = sources.filter( last_processing_success=None )
+        if (not
self.retry) and (not self.check_all): + sources = sources.filter(is404 = False).filter( last_processing_attempt=None ) + + if not sources: + print 'No documents to check.' + + for s in (sources[:self.limit] if self.limit else sources): + + if s.date < self.historical_limit: + print "Reached historical limit. Stopping.\n" + return + + s.last_processing_attempt = datetime.datetime.now() + s.save() + + try: + try: + filename = s.file() + if s.is404: + s.is404 = False + s.save() + except SourceUrlCouldNotBeRetrieved as e: + s.is404 = True + s.save() + raise e + + if atc_scraper.ATCDocumentParser.check_committee_membership_announcement(filename): + self.stdout.write( "Committee announcement found %s (%d)\n" % (s.document_name, s.document_number) ) + s.contains_committee_announcement = True + + message = '''A committee announcement was found in the following ATC document:\n + \n + Document: %s\n + Date: %s\n + House: %s\n + Language: %s\n + URL: %s + ''' % (s.document_name, s.date, s.house, s.language, 'http://www.parliament.gov.za/live/' + s.url) + + send_mail('New committee announcement found - People\'s Assembly', message, settings.FROM_EMAIL, settings.ZA_COMMITTEE_NOTIFICATION_EMAIL, fail_silently=False) + + s.last_processing_success = datetime.datetime.now() + + s.save() + self.stdout.write( "Processed %s (%d)\n" % (s.document_name, s.document_number) ) + except Exception as e: + # raise CommandError("Failed to run parsing: %s" % str(e)) + self.stderr.write("WARN: Failed to run parsing: %s" % str(e)) + + def check_for_papers(self, options): + sources = self.retrieve_sources(0, options) + sources.reverse() + sources_db = [ATCDocument.objects.get_or_create(**source) for source in sources] + sources_count = len(sources) + created_count = sum([1 for (_,created) in sources_db if created]) + self.stdout.write('ATC documents found: %d\nATC documents created: %d\n' % ( + sources_count, created_count)) + + def retrieve_sources(self, start, options): + + try: + url = 
'http://www.parliament.gov.za/live/content.php?Category_ID=227&DocumentStart=%d' % (start or 0) + self.stdout.write("Retrieving %s\n" % url) + h = httplib2.Http( settings.HTTPLIB2_CACHE_DIR ) + response, content = h.request(url) + assert response.status == 200 + self.stdout.write("OK\n") + # content = open('test.html').read() + + # parse content + soup = BeautifulSoup( + content, + 'xml', + ) + + rx = re.compile(r'Displaying (\d+) (\d+) of the most recent (\d+)') + + pager = soup.find('td', text=rx) + match = rx.search(pager.text) + (pstart, pend, ptotal) = [int(p) for p in match.groups()] + + self.stdout.write( "Processing %d to %d\n" % (pstart, pend) ) + + nodes = soup.findAll( 'a', text="View Document" ) + def scrape(node): + url = node['href'] + table = node.find_parent('table') + rx = re.compile(r'>([^:<]*) : ([^<]*)<') + + data = {} + for match in re.finditer(rx, str(table)): + groups = match.groups() + data[groups[0]] = groups[1] + + title = '' + try: + data['Title'] = table.find('b').text + except: + data['Title'] = data.get('Document Summary', '(unknown)') + + try: + document_date = datetime.datetime.strptime(data['Date Published'], '%d %B %Y').date() + except Exception as e: + raise CommandError( "Date could not be parsed\n%s" % str(e) ) + # document_date = datetime.date.today() + + #(obj, created) = Source.objects.get_or_create( + return { + 'document_name': data['Document Name'], + 'document_number': data['Document Number'], + 'defaults': { + 'url': url, + 'title': data['Title'], + 'language': data.get('Language', 'English'), + 'house': data.get('House', '(unknown)'), + 'date': document_date, + } + } + scraped = [] + for node in nodes: + s = scrape(node) + if ATCDocument.objects.filter( + document_name = s['document_name'], + document_number = s['document_number']).exists(): + if (not self.check_all) and (not self.check_all_papers): + print "Reached seen document. 
Stopping.\n" + return scraped + if s['defaults']['date'] < self.historical_limit: + print "Reached historical limit. Stopping.\n" + return scraped + + # otherwise + scraped.append(s) + + if pend < (self.limit or ptotal): + # NB following isn't phrased as a tail call, could rewrite if + # that becomes important + scraped = scraped + self.retrieve_sources(pend, options) + return scraped + + except Exception as e: + print >> sys.stderr, "ERROR: %s" % str(e) + return []