diff --git a/za_hansard/atc_scraper.py b/za_hansard/atc_scraper.py
new file mode 100644
index 0000000..6c10bf4
--- /dev/null
+++ b/za_hansard/atc_scraper.py
@@ -0,0 +1,86 @@
+# -*- coding: utf-8 -*-
+import distutils.spawn
+import hashlib
+import os
+import sys
+import re
+import requests
+import subprocess
+import tempfile
+import warnings
+import datetime
+import lxml.etree
+
+import parslepy
+
+from django.core.exceptions import ImproperlyConfigured
+from django.conf import settings
+
+from za_hansard.models import ATCDocument
+
+#Largely based on question_scraper.py
+
+# from https://github.com/scraperwiki/scraperwiki-python/blob/a96582f6c20cc1897f410d522e2a5bf37d301220/scraperwiki/utils.py#L38-L54
+# Copied rather than included as the scraperwiki __init__.py was having trouble
+# loading the sqlite code, which is something we don't actually need.
+
+def ensure_executable_found(name):
+    """Raise ImproperlyConfigured unless distutils can locate the executable `name`."""
+    if not distutils.spawn.find_executable(name):
+        raise ImproperlyConfigured("Can't find executable '{0}' which is needed by this code".format(name))
+ensure_executable_found("pdftohtml")  # checked at import time; pdftoxml() below shells out to it
+def pdftoxml(pdfdata):
+    """Run pdftohtml over the PDF byte string `pdfdata`; return its XML with <i> tags removed."""
+    pdffout = tempfile.NamedTemporaryFile(suffix='.pdf')
+    xmlin = tempfile.NamedTemporaryFile(mode='r', suffix='.xml')
+    try:
+        pdffout.write(pdfdata)
+        pdffout.flush()
+
+        # The output argument is the temp name minus ".xml"; presumably pdftohtml re-adds it.
+        cmd = ['pdftohtml', '-xml', '-nodrm', '-zoom', '1.5', '-enc', 'UTF-8', '-noframes',
+               pdffout.name, os.path.splitext(xmlin.name)[0]]
+        with open(os.devnull, 'w') as devnull:
+            # can't turn off output, so throw away even stderr
+            subprocess.call(cmd, stdout=devnull, stderr=devnull)
+        xmldata = xmlin.read()
+    finally:
+        # Closing a NamedTemporaryFile also deletes it, even if the conversion blew up.
+        pdffout.close()
+        xmlin.close()
+
+    # pdftohtml version 0.18.4 occasionally produces bad markup of the form <b>...<i>...</b> </i>
+    # Since we don't actually need <i> tags, we may as well get rid of them all now, which will fix this.
+    # Note that we're working with a byte string version of utf-8 encoded data here.
+    return re.sub(r'</?i>', '', xmldata)
+
+class ATCDocumentParser(object):
+    """Checks downloaded ATC documents (PDFs) for committee membership announcements."""
+    def __init__(self, **kwargs):
+        self.kwargs = kwargs
+
+    @classmethod
+    def check_committee_membership_announcement(cls, document_path):
+        """Tell whether the PDF at `document_path` mentions 'Membership of Committees'.
+
+        Returns True or False once the document has been converted and searched, and
+        None when there is nothing to search (missing or empty file, no XML produced).
+        """
+        if not os.path.exists(document_path):
+            return
+        with open(document_path, 'rb') as f:  # a PDF is binary data
+            contents = f.read()
+        if not contents:
+            return
+
+        xmldata = pdftoxml(contents)
+        if not xmldata:
+            sys.stdout.write(' SKIPPING - Got no XML data\n')
+            return
+
+        # itertext() gives each <text> element's content without any nested markup, so no
+        # regular expression over the serialised element is needed to get at the words.
+        root = lxml.etree.fromstring(xmldata)
+        return any('Membership of Committees' in u''.join(el.itertext()) for el in root.iterfind('.//text'))
+
+
diff --git a/za_hansard/management/commands/za_hansard_atc_scraper.py b/za_hansard/management/commands/za_hansard_atc_scraper.py
new file mode 100644
index 0000000..8f41c48
--- /dev/null
+++ b/za_hansard/management/commands/za_hansard_atc_scraper.py
@@ -0,0 +1,232 @@
+import pprint
+import httplib2
+import re
+import datetime
+import time
+import sys
+
+from optparse import make_option
+from bs4 import BeautifulSoup
+
+from django.conf import settings
+
+
+from django.core.management.base import BaseCommand, CommandError
+from django.core.mail import send_mail
+
+from za_hansard.models import ATCDocument
+from ... import atc_scraper
+
+#Largely based on za_hansard_check_for_new_sources.py and za_hansard_run_parsing.py
+
+class FailedToRetrieveSourceException(Exception):
+    """Error for a source listing or document that could not be fetched."""
+
+class Command(BaseCommand):
+    help = 'Scrapes ZA Parliament Announcements, Tablings and Committee Reports documents. Currently only monitors for mentions of changes to committee membership.'
+    option_list = BaseCommand.option_list + (
+        make_option('--check-all',
+            default=False,
+            action='store_true',
+            help="Don't stop when when reaching a previously seen item (applies to checking contents and for new papers)",
+        ),
+        make_option('--check-all-papers',
+            default=False,
+            action='store_true',
+            help="Don't stop when when reaching a previously seen item (applies only to new papers)",
+        ),
+        make_option('--retry',
+            default=False,
+            action='store_true',
+            help='Retry attempted (but not completed) parses and previously 404\'d documents',
+        ),
+        make_option('--check-for-papers',
+            default=False,
+            action='store_true',
+            help='Check for new ATC papers',
+        ),
+        make_option('--check-committees',
+            default=False,
+            action='store_true',
+            help='Checks for changes to committee membership ',
+        ),
+        make_option('--historical-limit',
+            default='2014-05-07',
+            type='str',
+            help='Limit earliest historical entry to check (in yyyy-mm-dd format, default 2014-05-07)',
+        ),
+        make_option('--limit',
+            default=0,
+            type='int',  # must be a number: it is used as a slice bound and compared with page offsets
+            help='Limit number of entries to check (applies to checking contents and for new papers)',
+        ),
+        make_option('--run-all-steps',
+            default=False,
+            action='store_true',
+            help='Check for new papers and for new committee memberships',
+        ),
+    )
+
+    def handle(self, *args, **options):
+        """Copy the shared options onto self, then run the first step selected on the command line."""
+        self.historical_limit = datetime.datetime.strptime(options['historical_limit'], '%Y-%m-%d').date()
+        self.limit = options['limit']
+        self.check_all = options['check_all']
+        self.check_all_papers = options['check_all_papers']
+        self.retry = options['retry']
+
+        if options['check_for_papers']:
+            self.check_for_papers(options)
+        elif options['check_committees']:
+            self.check_committees(options)
+        elif options['run_all_steps']:
+            self.check_for_papers(options)
+            self.check_committees(options)
+
+    def check_committees(self, options):
+        """Parse pending ATC documents and e-mail a notice for each committee announcement found."""
+        # Imported here because the module-level imports do not provide it; without it the
+        # except clause below raises NameError and is404 is never recorded.
+        from za_hansard.models import SourceUrlCouldNotBeRetrieved
+
+        sources = ATCDocument.objects.all()
+        if not self.check_all:
+            sources = sources.filter(last_processing_success=None)
+        if (not self.retry) and (not self.check_all):
+            sources = sources.filter(is404=False).filter(last_processing_attempt=None)
+
+        if not sources:
+            print 'No documents to check.'
+
+        for s in (sources[:self.limit] if self.limit else sources):
+            if s.date < self.historical_limit:
+                print "Reached historical limit. Stopping.\n"
+                return
+
+            s.last_processing_attempt = datetime.datetime.now()
+            s.save()
+            try:
+                try:
+                    filename = s.file()
+                    if s.is404:
+                        s.is404 = False
+                        s.save()
+                except SourceUrlCouldNotBeRetrieved:
+                    s.is404 = True
+                    s.save()
+                    raise  # bare raise keeps the traceback; the outer handler reports it
+
+                if atc_scraper.ATCDocumentParser.check_committee_membership_announcement(filename):
+                    self.stdout.write("Committee announcement found %s (%d)\n" % (s.document_name, s.document_number))
+                    s.contains_committee_announcement = True
+                    message = '''A committee announcement was found in the following ATC document:\n
+ \n
+ Document: %s\n
+ Date: %s\n
+ House: %s\n
+ Language: %s\n
+ URL: %s
+ ''' % (s.document_name, s.date, s.house, s.language, 'http://www.parliament.gov.za/live/' + s.url)
+                    send_mail('New committee announcement found - People\'s Assembly', message, settings.FROM_EMAIL, settings.ZA_COMMITTEE_NOTIFICATION_EMAIL, fail_silently=False)
+
+                s.last_processing_success = datetime.datetime.now()
+                s.save()
+                self.stdout.write("Processed %s (%d)\n" % (s.document_name, s.document_number))
+            except Exception as e:
+                # One bad document must not abort the run: report it and move on to the next.
+                self.stderr.write("WARN: Failed to run parsing: %s" % str(e))
+
+    def check_for_papers(self, options):
+        """Scrape the listing pages and store any ATC documents not yet in the database."""
+        sources = self.retrieve_sources(0, options)
+        sources.reverse()  # rows are created in the opposite order to the scrape
+        sources_db = [ATCDocument.objects.get_or_create(**source) for source in sources]
+        sources_count = len(sources)
+        created_count = sum(1 for (_, created) in sources_db if created)
+        self.stdout.write('ATC documents found: %d\nATC documents created: %d\n' % (sources_count, created_count))
+
+    def retrieve_sources(self, start, options):
+        """Scrape ATC listing pages from offset `start` onwards.
+
+        Returns a list of keyword dicts ready for ATCDocument.objects.get_or_create().
+        Scraping stops at the first already-stored document (unless --check-all or
+        --check-all-papers was given) or at the first one older than the historical
+        limit. Any exception is reported on stderr and yields [] for that page.
+        """
+        # Compiled once per page rather than once per listing entry.
+        field_rx = re.compile(r'>([^:<]*) : ([^<]*)<')
+
+        def scrape(node):
+            """Turn one "View Document" link into the kwargs describing its listing entry."""
+            table = node.find_parent('table')
+            data = dict(match.groups() for match in field_rx.finditer(str(table)))
+
+            try:
+                data['Title'] = table.find('b').text
+            except AttributeError:
+                # table.find('b') found nothing (None has no .text): fall back to the summary field.
+                data['Title'] = data.get('Document Summary', '(unknown)')
+
+            try:
+                document_date = datetime.datetime.strptime(data['Date Published'], '%d %B %Y').date()
+            except (KeyError, ValueError) as e:
+                raise CommandError("Date could not be parsed\n%s" % str(e))
+
+            return {
+                'document_name': data['Document Name'],
+                'document_number': data['Document Number'],
+                'defaults': {
+                    'url': node['href'],
+                    'title': data['Title'],
+                    'language': data.get('Language', 'English'),
+                    'house': data.get('House', '(unknown)'),
+                    'date': document_date,
+                }
+            }
+
+        try:
+            url = 'http://www.parliament.gov.za/live/content.php?Category_ID=227&DocumentStart=%d' % (start or 0)
+            self.stdout.write("Retrieving %s\n" % url)
+            h = httplib2.Http(settings.HTTPLIB2_CACHE_DIR)
+            response, content = h.request(url)
+            if response.status != 200:
+                # An explicit raise rather than an assert, which python -O would strip.
+                raise FailedToRetrieveSourceException("status code: %s, url: %s" % (response.status, url))
+            self.stdout.write("OK\n")
+
+            soup = BeautifulSoup(content, 'xml')
+
+            # The pager cell holds three numbers: first and last entry shown and the total on offer.
+            rx = re.compile(r'Displaying (\d+) (\d+) of the most recent (\d+)')
+            pager = soup.find('td', text=rx)
+            match = rx.search(pager.text)
+            (pstart, pend, ptotal) = [int(p) for p in match.groups()]
+
+            self.stdout.write("Processing %d to %d\n" % (pstart, pend))
+
+            scraped = []
+            for node in soup.findAll('a', text="View Document"):
+                s = scrape(node)
+                # A document already in the database ends the scrape unless a --check-all* flag is set.
+                if ATCDocument.objects.filter(
+                        document_name=s['document_name'],
+                        document_number=s['document_number']).exists():
+                    if (not self.check_all) and (not self.check_all_papers):
+                        print "Reached seen document. Stopping.\n"
+                        return scraped
+                if s['defaults']['date'] < self.historical_limit:
+                    print "Reached historical limit. Stopping.\n"
+                    return scraped
+
+                # otherwise
+                scraped.append(s)
+
+            if pend < (self.limit or ptotal):
+                # NB following isn't phrased as a tail call, could rewrite if
+                # that becomes important
+                scraped = scraped + self.retrieve_sources(pend, options)
+            return scraped
+
+        except Exception as e:
+            print >> sys.stderr, "ERROR: %s" % str(e)
+            return []
diff --git a/za_hansard/migrations/0028_auto__add_atcdocument.py b/za_hansard/migrations/0028_auto__add_atcdocument.py
new file mode 100644
index 0000000..9b63780
--- /dev/null
+++ b/za_hansard/migrations/0028_auto__add_atcdocument.py
@@ -0,0 +1,203 @@
+# -*- coding: utf-8 -*-
+from south.utils import datetime_utils as datetime
+from south.db import db
+from south.v2 import SchemaMigration
+from django.db import models
+
+
+class Migration(SchemaMigration):
+
+ def forwards(self, orm):
+ # Apply the migration: create the za_hansard_atcdocument table for the new ATCDocument model.
+ db.create_table(u'za_hansard_atcdocument', (
+ (u'id', self.gf('django.db.models.fields.AutoField')(primary_key=True)),
+ ('title', self.gf('django.db.models.fields.CharField')(max_length=200)),
+ ('document_name', self.gf('django.db.models.fields.CharField')(max_length=200)),
+ ('document_number', self.gf('django.db.models.fields.IntegerField')(unique=True)),
+ ('date', self.gf('django.db.models.fields.DateField')()),
+ ('url', self.gf('django.db.models.fields.URLField')(max_length=1000)),
+ ('is404', self.gf('django.db.models.fields.BooleanField')(default=False)),
+ ('house', self.gf('django.db.models.fields.CharField')(max_length=200)),
+ ('language', self.gf('django.db.models.fields.CharField')(max_length=200)),
+ ('last_processing_attempt', self.gf('django.db.models.fields.DateTimeField')(null=True, blank=True)),
+ ('last_processing_success', self.gf('django.db.models.fields.DateTimeField')(null=True, blank=True)),
+ ('contains_committee_announcement', self.gf('django.db.models.fields.BooleanField')(default=False)),
+ ))
+ db.send_create_signal(u'za_hansard', ['ATCDocument'])
+
+
+ def backwards(self, orm):
+ # Revert the migration: drop the za_hansard_atcdocument table that forwards() creates.
+ db.delete_table(u'za_hansard_atcdocument')
+
+
+ models = {
+ u'auth.group': {
+ 'Meta': {'object_name': 'Group'},
+ u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+ 'name': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '80'}),
+ 'permissions': ('django.db.models.fields.related.ManyToManyField', [], {'to': u"orm['auth.Permission']", 'symmetrical': 'False', 'blank': 'True'})
+ },
+ u'auth.permission': {
+ 'Meta': {'ordering': "(u'content_type__app_label', u'content_type__model', u'codename')", 'unique_together': "((u'content_type', u'codename'),)", 'object_name': 'Permission'},
+ 'codename': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
+ 'content_type': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['contenttypes.ContentType']"}),
+ u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+ 'name': ('django.db.models.fields.CharField', [], {'max_length': '50'})
+ },
+ u'auth.user': {
+ 'Meta': {'object_name': 'User'},
+ 'date_joined': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}),
+ 'email': ('django.db.models.fields.EmailField', [], {'max_length': '75', 'blank': 'True'}),
+ 'first_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}),
+ 'groups': ('django.db.models.fields.related.ManyToManyField', [], {'symmetrical': 'False', 'related_name': "u'user_set'", 'blank': 'True', 'to': u"orm['auth.Group']"}),
+ u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+ 'is_active': ('django.db.models.fields.BooleanField', [], {'default': 'True'}),
+ 'is_staff': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
+ 'is_superuser': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
+ 'last_login': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}),
+ 'last_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}),
+ 'password': ('django.db.models.fields.CharField', [], {'max_length': '128'}),
+ 'user_permissions': ('django.db.models.fields.related.ManyToManyField', [], {'symmetrical': 'False', 'related_name': "u'user_set'", 'blank': 'True', 'to': u"orm['auth.Permission']"}),
+ 'username': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '30'})
+ },
+ u'contenttypes.contenttype': {
+ 'Meta': {'ordering': "('name',)", 'unique_together': "(('app_label', 'model'),)", 'object_name': 'ContentType', 'db_table': "'django_content_type'"},
+ 'app_label': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
+ u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+ 'model': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
+ 'name': ('django.db.models.fields.CharField', [], {'max_length': '100'})
+ },
+ u'instances.instance': {
+ 'Meta': {'object_name': 'Instance'},
+ 'created_by': ('django.db.models.fields.related.ForeignKey', [], {'blank': 'True', 'related_name': "'created_instances'", 'null': 'True', 'to': u"orm['auth.User']"}),
+ 'description': ('django.db.models.fields.TextField', [], {'blank': 'True'}),
+ u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+ 'label': ('instances.fields.DNSLabelField', [], {'unique': 'True', 'max_length': '63', 'db_index': 'True'}),
+ 'title': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
+ 'users': ('django.db.models.fields.related.ManyToManyField', [], {'symmetrical': 'False', 'related_name': "'instances'", 'blank': 'True', 'to': u"orm['auth.User']"})
+ },
+ u'speeches.section': {
+ 'Meta': {'ordering': "('id',)", 'unique_together': "(('parent', 'slug', 'instance'),)", 'object_name': 'Section'},
+ 'created': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}),
+ u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+ 'instance': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['instances.Instance']"}),
+ 'modified': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'blank': 'True'}),
+ 'parent': ('django.db.models.fields.related.ForeignKey', [], {'blank': 'True', 'related_name': "'children'", 'null': 'True', 'to': u"orm['speeches.Section']"}),
+ 'slug': ('sluggable.fields.SluggableField', [], {'unique_with': "('parent', 'instance')", 'max_length': '50', 'populate_from': "'title'"}),
+ 'title': ('django.db.models.fields.TextField', [], {'blank': 'True'})
+ },
+ u'za_hansard.answer': {
+ 'Meta': {'unique_together': "(('oral_number', 'house', 'year'), ('written_number', 'house', 'year'), ('president_number', 'house', 'year'), ('dp_number', 'house', 'year'))", 'object_name': 'Answer'},
+ 'date': ('django.db.models.fields.DateField', [], {}),
+ 'date_published': ('django.db.models.fields.DateField', [], {}),
+ 'document_name': ('django.db.models.fields.TextField', [], {}),
+ 'dp_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}),
+ 'house': ('django.db.models.fields.CharField', [], {'max_length': '1', 'db_index': 'True'}),
+ u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+ 'language': ('django.db.models.fields.TextField', [], {}),
+ 'name': ('django.db.models.fields.TextField', [], {}),
+ 'oral_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}),
+ 'president_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}),
+ 'processed_code': ('django.db.models.fields.IntegerField', [], {'default': '0', 'db_index': 'True'}),
+ 'text': ('django.db.models.fields.TextField', [], {}),
+ 'type': ('django.db.models.fields.TextField', [], {}),
+ 'url': ('django.db.models.fields.TextField', [], {'db_index': 'True'}),
+ 'written_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}),
+ 'year': ('django.db.models.fields.IntegerField', [], {'db_index': 'True'})
+ },
+ u'za_hansard.atcdocument': {
+ 'Meta': {'ordering': "['-date', 'document_name']", 'object_name': 'ATCDocument'},
+ 'contains_committee_announcement': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
+ 'date': ('django.db.models.fields.DateField', [], {}),
+ 'document_name': ('django.db.models.fields.CharField', [], {'max_length': '200'}),
+ 'document_number': ('django.db.models.fields.IntegerField', [], {'unique': 'True'}),
+ 'house': ('django.db.models.fields.CharField', [], {'max_length': '200'}),
+ u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+ 'is404': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
+ 'language': ('django.db.models.fields.CharField', [], {'max_length': '200'}),
+ 'last_processing_attempt': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}),
+ 'last_processing_success': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}),
+ 'title': ('django.db.models.fields.CharField', [], {'max_length': '200'}),
+ 'url': ('django.db.models.fields.URLField', [], {'max_length': '1000'})
+ },
+ u'za_hansard.pmgcommitteeappearance': {
+ 'Meta': {'object_name': 'PMGCommitteeAppearance'},
+ 'committee': ('django.db.models.fields.TextField', [], {}),
+ 'committee_url': ('django.db.models.fields.TextField', [], {}),
+ u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+ 'meeting': ('django.db.models.fields.TextField', [], {}),
+ 'meeting_date': ('django.db.models.fields.DateField', [], {}),
+ 'meeting_url': ('django.db.models.fields.TextField', [], {}),
+ 'party': ('django.db.models.fields.TextField', [], {}),
+ 'person': ('django.db.models.fields.TextField', [], {}),
+ 'report': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'appearances'", 'null': 'True', 'to': u"orm['za_hansard.PMGCommitteeReport']"}),
+ 'text': ('django.db.models.fields.TextField', [], {})
+ },
+ u'za_hansard.pmgcommitteereport': {
+ 'Meta': {'object_name': 'PMGCommitteeReport'},
+ u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+ 'last_sayit_import': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}),
+ 'meeting_url': ('django.db.models.fields.TextField', [], {}),
+ 'premium': ('django.db.models.fields.BooleanField', [], {}),
+ 'processed': ('django.db.models.fields.BooleanField', [], {}),
+ 'sayit_section': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['speeches.Section']", 'null': 'True', 'on_delete': 'models.PROTECT', 'blank': 'True'})
+ },
+ u'za_hansard.question': {
+ 'Meta': {'unique_together': "(('written_number', 'house', 'year'), ('oral_number', 'house', 'year'), ('president_number', 'house', 'year'), ('dp_number', 'house', 'year'), ('id_number', 'house', 'year'))", 'object_name': 'Question'},
+ 'answer': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'question'", 'null': 'True', 'to': u"orm['za_hansard.Answer']"}),
+ 'answer_type': ('django.db.models.fields.CharField', [], {'max_length': '1'}),
+ 'askedby': ('django.db.models.fields.TextField', [], {}),
+ 'date': ('django.db.models.fields.DateField', [], {}),
+ 'date_transferred': ('django.db.models.fields.DateField', [], {'null': 'True'}),
+ 'dp_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}),
+ 'house': ('django.db.models.fields.CharField', [], {'max_length': '1', 'db_index': 'True'}),
+ u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+ 'id_number': ('django.db.models.fields.IntegerField', [], {'db_index': 'True'}),
+ 'identifier': ('django.db.models.fields.CharField', [], {'max_length': '10', 'db_index': 'True'}),
+ 'intro': ('django.db.models.fields.TextField', [], {}),
+ 'last_sayit_import': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}),
+ 'oral_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}),
+ 'paper': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['za_hansard.QuestionPaper']", 'null': 'True', 'on_delete': 'models.SET_NULL'}),
+ 'president_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}),
+ 'question': ('django.db.models.fields.TextField', [], {}),
+ 'questionto': ('django.db.models.fields.TextField', [], {}),
+ 'sayit_section': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['speeches.Section']", 'null': 'True', 'on_delete': 'models.PROTECT', 'blank': 'True'}),
+ 'translated': ('django.db.models.fields.BooleanField', [], {}),
+ 'written_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}),
+ 'year': ('django.db.models.fields.IntegerField', [], {'db_index': 'True'})
+ },
+ u'za_hansard.questionpaper': {
+ 'Meta': {'unique_together': "(('year', 'issue_number', 'house', 'parliament_number'),)", 'object_name': 'QuestionPaper'},
+ 'date_published': ('django.db.models.fields.DateField', [], {}),
+ 'document_name': ('django.db.models.fields.TextField', [], {'max_length': '32'}),
+ 'document_number': ('django.db.models.fields.IntegerField', [], {}),
+ 'house': ('django.db.models.fields.CharField', [], {'max_length': '64'}),
+ u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+ 'issue_number': ('django.db.models.fields.IntegerField', [], {}),
+ 'language': ('django.db.models.fields.CharField', [], {'max_length': '16'}),
+ 'parliament_number': ('django.db.models.fields.IntegerField', [], {}),
+ 'session_number': ('django.db.models.fields.IntegerField', [], {}),
+ 'source_url': ('django.db.models.fields.URLField', [], {'max_length': '1000'}),
+ 'text': ('django.db.models.fields.TextField', [], {}),
+ 'year': ('django.db.models.fields.IntegerField', [], {})
+ },
+ u'za_hansard.source': {
+ 'Meta': {'ordering': "['-date', 'document_name']", 'object_name': 'Source'},
+ 'date': ('django.db.models.fields.DateField', [], {}),
+ 'document_name': ('django.db.models.fields.CharField', [], {'max_length': '200'}),
+ 'document_number': ('django.db.models.fields.IntegerField', [], {'unique': 'True'}),
+ 'house': ('django.db.models.fields.CharField', [], {'max_length': '200'}),
+ u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+ 'is404': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
+ 'language': ('django.db.models.fields.CharField', [], {'max_length': '200'}),
+ 'last_processing_attempt': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}),
+ 'last_processing_success': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}),
+ 'last_sayit_import': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}),
+ 'sayit_section': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['speeches.Section']", 'null': 'True', 'on_delete': 'models.PROTECT', 'blank': 'True'}),
+ 'title': ('django.db.models.fields.CharField', [], {'max_length': '200'}),
+ 'url': ('django.db.models.fields.URLField', [], {'max_length': '1000'})
+ }
+ }
+
+ complete_apps = ['za_hansard']
\ No newline at end of file
diff --git a/za_hansard/models.py b/za_hansard/models.py
index efac72b..43a8562 100644
--- a/za_hansard/models.py
+++ b/za_hansard/models.py
@@ -436,3 +436,121 @@ class Meta:
# 1) At least one of written_number and oral_number must be non-null.
#CREATE TABLE completed_documents (`url` string);
+
+class ATCDocument(models.Model):
+    """
+    A ZA Parliament Announcements, Tablings and Committee Reports (ATC) document.
+    """
+
+    title = models.CharField(max_length=200)
+    document_name = models.CharField(max_length=200)
+    document_number = models.IntegerField(unique=True)
+    date = models.DateField()
+    url = models.URLField(max_length=1000)  # file() prefixes this with the parliament site address
+    is404 = models.BooleanField(default=False)
+    house = models.CharField(max_length=200)
+    language = models.CharField(max_length=200)
+
+    last_processing_attempt = models.DateTimeField(blank=True, null=True)
+    last_processing_success = models.DateTimeField(blank=True, null=True)
+
+    contains_committee_announcement = models.BooleanField(default=False)
+
+    class Meta:
+        ordering = ['-date', 'document_name']
+
+
+    def __unicode__(self):
+        return self.document_name
+
+
+    def delete(self, *args, **kwargs):
+        """After deleting from db, delete the cached file too"""
+        cache_file_path = self.cache_file_path()  # resolved first: the path is built from self.id
+        super(ATCDocument, self).delete(*args, **kwargs)
+
+        if os.path.exists(cache_file_path):
+            os.remove(cache_file_path)
+
+
+    def file(self, debug=False):
+        """
+        Return the path of the locally cached copy of the document behind self.url.
+
+        The cache is checked first; on a miss the document is fetched and stored,
+        retrying once with a '.doc' suffix if the first request fails.
+
+        Raises a SourceUrlCouldNotBeRetrieved exception if URL could not be
+        retrieved.
+        """
+        cache_file_path = self.cache_file_path()
+
+        found = os.path.isfile(cache_file_path)
+
+        if debug:
+            print >> sys.stderr, "%s (%s)" % (cache_file_path, found)
+
+        # Already cached: nothing to download
+        if found:
+            return cache_file_path
+
+        # If not fetch the file, save to cache and then return its path
+        h = httplib2.Http()
+        url = 'http://www.parliament.gov.za/live/' + self.url
+
+        def request_url(url):
+            if debug:
+                print >> sys.stderr, 'Requesting %s' % url
+            (response, content) = h.request(url)
+            if response.status != 200:
+                raise SourceUrlCouldNotBeRetrieved("status code: %s, url: %s" % (response.status, self.url))
+            self.is404 = False
+            self.save()
+            return (response, content)
+
+        try:
+            (response, content) = request_url(url)
+        except SourceUrlCouldNotBeRetrieved as e:
+            if url.endswith('.doc'):
+                raise
+            try:
+                # Retry once with a ".doc" suffix before giving up.
+                (response, content) = request_url(url + '.doc')
+            except Exception:
+                # Report the failure of the original URL, not of the retry.
+                raise e
+            self.url = self.url + '.doc'
+            self.save()
+
+        if not content:
+            raise SourceUrlCouldNotBeRetrieved("WTF?")
+        with open(cache_file_path, "wb") as new_cache_file:  # binary: the body is a raw document
+            new_cache_file.write(content)
+
+        return cache_file_path
+
+    def cache_file_path(self):
+        """Path of the cache file for this document; creates its directory when missing."""
+
+        id_str = "%05u" % self.id
+
+        # Simple partitioning: the last and second-to-last digits of the id name two directory levels.
+        # FIXME - put in something to prevent the test suite overwriting non-test files.
+        aaa = id_str[-1]
+        bbb = id_str[-2]
+        cache_dir = os.path.join(settings.HANSARD_CACHE, aaa, bbb)
+
+        # check that the dir exists
+        if not os.path.exists(cache_dir):
+            os.makedirs(cache_dir)
+
+        d = self.date.strftime('%Y-%m-%d')
+
+        # create the path to the file
+        cache_file_path = os.path.join(cache_dir, '-'.join([d, id_str, self.document_name]))
+        return cache_file_path
+
+    def xml_file_path(self):
+        """Path of the '.xml' file stored beside the cache file, or None if there is no such file."""
+        xml_file_path = '%s.xml' % self.cache_file_path()
+        return xml_file_path if os.path.isfile(xml_file_path) else None