From 73bad5b91e823f27fc2aad8fc4be3d280a26f0b3 Mon Sep 17 00:00:00 2001
From: Geoffrey Kilpin <geoffreykilpin@gmail.com>
Date: Sun, 6 Jul 2014 22:48:24 +0200
Subject: [PATCH 1/2] Create ATCDocument model

Creates the ATCDocument model, which is largely identical to the Source
model.
---
 .../migrations/0028_auto__add_atcdocument.py  | 203 ++++++++++++++++++
 za_hansard/models.py                          | 118 ++++++++++
 2 files changed, 321 insertions(+)
 create mode 100644 za_hansard/migrations/0028_auto__add_atcdocument.py

diff --git a/za_hansard/migrations/0028_auto__add_atcdocument.py b/za_hansard/migrations/0028_auto__add_atcdocument.py
new file mode 100644
index 0000000..9b63780
--- /dev/null
+++ b/za_hansard/migrations/0028_auto__add_atcdocument.py
@@ -0,0 +1,203 @@
+# -*- coding: utf-8 -*-
+from south.utils import datetime_utils as datetime
+from south.db import db
+from south.v2 import SchemaMigration
+from django.db import models
+
+
+class Migration(SchemaMigration):
+
+    def forwards(self, orm):
+        # Adding model 'ATCDocument'
+        db.create_table(u'za_hansard_atcdocument', (
+            (u'id', self.gf('django.db.models.fields.AutoField')(primary_key=True)),
+            ('title', self.gf('django.db.models.fields.CharField')(max_length=200)),
+            ('document_name', self.gf('django.db.models.fields.CharField')(max_length=200)),
+            ('document_number', self.gf('django.db.models.fields.IntegerField')(unique=True)),
+            ('date', self.gf('django.db.models.fields.DateField')()),
+            ('url', self.gf('django.db.models.fields.URLField')(max_length=1000)),
+            ('is404', self.gf('django.db.models.fields.BooleanField')(default=False)),
+            ('house', self.gf('django.db.models.fields.CharField')(max_length=200)),
+            ('language', self.gf('django.db.models.fields.CharField')(max_length=200)),
+            ('last_processing_attempt', self.gf('django.db.models.fields.DateTimeField')(null=True, blank=True)),
+            ('last_processing_success', self.gf('django.db.models.fields.DateTimeField')(null=True, blank=True)),
+            ('contains_committee_announcement', self.gf('django.db.models.fields.BooleanField')(default=False)),
+        ))
+        db.send_create_signal(u'za_hansard', ['ATCDocument'])
+
+
+    def backwards(self, orm):
+        # Deleting model 'ATCDocument'
+        db.delete_table(u'za_hansard_atcdocument')
+
+
+    models = {
+        u'auth.group': {
+            'Meta': {'object_name': 'Group'},
+            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'name': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '80'}),
+            'permissions': ('django.db.models.fields.related.ManyToManyField', [], {'to': u"orm['auth.Permission']", 'symmetrical': 'False', 'blank': 'True'})
+        },
+        u'auth.permission': {
+            'Meta': {'ordering': "(u'content_type__app_label', u'content_type__model', u'codename')", 'unique_together': "((u'content_type', u'codename'),)", 'object_name': 'Permission'},
+            'codename': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
+            'content_type': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['contenttypes.ContentType']"}),
+            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'name': ('django.db.models.fields.CharField', [], {'max_length': '50'})
+        },
+        u'auth.user': {
+            'Meta': {'object_name': 'User'},
+            'date_joined': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}),
+            'email': ('django.db.models.fields.EmailField', [], {'max_length': '75', 'blank': 'True'}),
+            'first_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}),
+            'groups': ('django.db.models.fields.related.ManyToManyField', [], {'symmetrical': 'False', 'related_name': "u'user_set'", 'blank': 'True', 'to': u"orm['auth.Group']"}),
+            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'is_active': ('django.db.models.fields.BooleanField', [], {'default': 'True'}),
+            'is_staff': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
+            'is_superuser': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
+            'last_login': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}),
+            'last_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}),
+            'password': ('django.db.models.fields.CharField', [], {'max_length': '128'}),
+            'user_permissions': ('django.db.models.fields.related.ManyToManyField', [], {'symmetrical': 'False', 'related_name': "u'user_set'", 'blank': 'True', 'to': u"orm['auth.Permission']"}),
+            'username': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '30'})
+        },
+        u'contenttypes.contenttype': {
+            'Meta': {'ordering': "('name',)", 'unique_together': "(('app_label', 'model'),)", 'object_name': 'ContentType', 'db_table': "'django_content_type'"},
+            'app_label': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
+            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'model': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
+            'name': ('django.db.models.fields.CharField', [], {'max_length': '100'})
+        },
+        u'instances.instance': {
+            'Meta': {'object_name': 'Instance'},
+            'created_by': ('django.db.models.fields.related.ForeignKey', [], {'blank': 'True', 'related_name': "'created_instances'", 'null': 'True', 'to': u"orm['auth.User']"}),
+            'description': ('django.db.models.fields.TextField', [], {'blank': 'True'}),
+            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'label': ('instances.fields.DNSLabelField', [], {'unique': 'True', 'max_length': '63', 'db_index': 'True'}),
+            'title': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
+            'users': ('django.db.models.fields.related.ManyToManyField', [], {'symmetrical': 'False', 'related_name': "'instances'", 'blank': 'True', 'to': u"orm['auth.User']"})
+        },
+        u'speeches.section': {
+            'Meta': {'ordering': "('id',)", 'unique_together': "(('parent', 'slug', 'instance'),)", 'object_name': 'Section'},
+            'created': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}),
+            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'instance': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['instances.Instance']"}),
+            'modified': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'blank': 'True'}),
+            'parent': ('django.db.models.fields.related.ForeignKey', [], {'blank': 'True', 'related_name': "'children'", 'null': 'True', 'to': u"orm['speeches.Section']"}),
+            'slug': ('sluggable.fields.SluggableField', [], {'unique_with': "('parent', 'instance')", 'max_length': '50', 'populate_from': "'title'"}),
+            'title': ('django.db.models.fields.TextField', [], {'blank': 'True'})
+        },
+        u'za_hansard.answer': {
+            'Meta': {'unique_together': "(('oral_number', 'house', 'year'), ('written_number', 'house', 'year'), ('president_number', 'house', 'year'), ('dp_number', 'house', 'year'))", 'object_name': 'Answer'},
+            'date': ('django.db.models.fields.DateField', [], {}),
+            'date_published': ('django.db.models.fields.DateField', [], {}),
+            'document_name': ('django.db.models.fields.TextField', [], {}),
+            'dp_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}),
+            'house': ('django.db.models.fields.CharField', [], {'max_length': '1', 'db_index': 'True'}),
+            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'language': ('django.db.models.fields.TextField', [], {}),
+            'name': ('django.db.models.fields.TextField', [], {}),
+            'oral_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}),
+            'president_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}),
+            'processed_code': ('django.db.models.fields.IntegerField', [], {'default': '0', 'db_index': 'True'}),
+            'text': ('django.db.models.fields.TextField', [], {}),
+            'type': ('django.db.models.fields.TextField', [], {}),
+            'url': ('django.db.models.fields.TextField', [], {'db_index': 'True'}),
+            'written_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}),
+            'year': ('django.db.models.fields.IntegerField', [], {'db_index': 'True'})
+        },
+        u'za_hansard.atcdocument': {
+            'Meta': {'ordering': "['-date', 'document_name']", 'object_name': 'ATCDocument'},
+            'contains_committee_announcement': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
+            'date': ('django.db.models.fields.DateField', [], {}),
+            'document_name': ('django.db.models.fields.CharField', [], {'max_length': '200'}),
+            'document_number': ('django.db.models.fields.IntegerField', [], {'unique': 'True'}),
+            'house': ('django.db.models.fields.CharField', [], {'max_length': '200'}),
+            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'is404': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
+            'language': ('django.db.models.fields.CharField', [], {'max_length': '200'}),
+            'last_processing_attempt': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}),
+            'last_processing_success': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}),
+            'title': ('django.db.models.fields.CharField', [], {'max_length': '200'}),
+            'url': ('django.db.models.fields.URLField', [], {'max_length': '1000'})
+        },
+        u'za_hansard.pmgcommitteeappearance': {
+            'Meta': {'object_name': 'PMGCommitteeAppearance'},
+            'committee': ('django.db.models.fields.TextField', [], {}),
+            'committee_url': ('django.db.models.fields.TextField', [], {}),
+            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'meeting': ('django.db.models.fields.TextField', [], {}),
+            'meeting_date': ('django.db.models.fields.DateField', [], {}),
+            'meeting_url': ('django.db.models.fields.TextField', [], {}),
+            'party': ('django.db.models.fields.TextField', [], {}),
+            'person': ('django.db.models.fields.TextField', [], {}),
+            'report': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'appearances'", 'null': 'True', 'to': u"orm['za_hansard.PMGCommitteeReport']"}),
+            'text': ('django.db.models.fields.TextField', [], {})
+        },
+        u'za_hansard.pmgcommitteereport': {
+            'Meta': {'object_name': 'PMGCommitteeReport'},
+            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'last_sayit_import': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}),
+            'meeting_url': ('django.db.models.fields.TextField', [], {}),
+            'premium': ('django.db.models.fields.BooleanField', [], {}),
+            'processed': ('django.db.models.fields.BooleanField', [], {}),
+            'sayit_section': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['speeches.Section']", 'null': 'True', 'on_delete': 'models.PROTECT', 'blank': 'True'})
+        },
+        u'za_hansard.question': {
+            'Meta': {'unique_together': "(('written_number', 'house', 'year'), ('oral_number', 'house', 'year'), ('president_number', 'house', 'year'), ('dp_number', 'house', 'year'), ('id_number', 'house', 'year'))", 'object_name': 'Question'},
+            'answer': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'question'", 'null': 'True', 'to': u"orm['za_hansard.Answer']"}),
+            'answer_type': ('django.db.models.fields.CharField', [], {'max_length': '1'}),
+            'askedby': ('django.db.models.fields.TextField', [], {}),
+            'date': ('django.db.models.fields.DateField', [], {}),
+            'date_transferred': ('django.db.models.fields.DateField', [], {'null': 'True'}),
+            'dp_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}),
+            'house': ('django.db.models.fields.CharField', [], {'max_length': '1', 'db_index': 'True'}),
+            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'id_number': ('django.db.models.fields.IntegerField', [], {'db_index': 'True'}),
+            'identifier': ('django.db.models.fields.CharField', [], {'max_length': '10', 'db_index': 'True'}),
+            'intro': ('django.db.models.fields.TextField', [], {}),
+            'last_sayit_import': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}),
+            'oral_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}),
+            'paper': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['za_hansard.QuestionPaper']", 'null': 'True', 'on_delete': 'models.SET_NULL'}),
+            'president_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}),
+            'question': ('django.db.models.fields.TextField', [], {}),
+            'questionto': ('django.db.models.fields.TextField', [], {}),
+            'sayit_section': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['speeches.Section']", 'null': 'True', 'on_delete': 'models.PROTECT', 'blank': 'True'}),
+            'translated': ('django.db.models.fields.BooleanField', [], {}),
+            'written_number': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'db_index': 'True'}),
+            'year': ('django.db.models.fields.IntegerField', [], {'db_index': 'True'})
+        },
+        u'za_hansard.questionpaper': {
+            'Meta': {'unique_together': "(('year', 'issue_number', 'house', 'parliament_number'),)", 'object_name': 'QuestionPaper'},
+            'date_published': ('django.db.models.fields.DateField', [], {}),
+            'document_name': ('django.db.models.fields.TextField', [], {'max_length': '32'}),
+            'document_number': ('django.db.models.fields.IntegerField', [], {}),
+            'house': ('django.db.models.fields.CharField', [], {'max_length': '64'}),
+            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'issue_number': ('django.db.models.fields.IntegerField', [], {}),
+            'language': ('django.db.models.fields.CharField', [], {'max_length': '16'}),
+            'parliament_number': ('django.db.models.fields.IntegerField', [], {}),
+            'session_number': ('django.db.models.fields.IntegerField', [], {}),
+            'source_url': ('django.db.models.fields.URLField', [], {'max_length': '1000'}),
+            'text': ('django.db.models.fields.TextField', [], {}),
+            'year': ('django.db.models.fields.IntegerField', [], {})
+        },
+        u'za_hansard.source': {
+            'Meta': {'ordering': "['-date', 'document_name']", 'object_name': 'Source'},
+            'date': ('django.db.models.fields.DateField', [], {}),
+            'document_name': ('django.db.models.fields.CharField', [], {'max_length': '200'}),
+            'document_number': ('django.db.models.fields.IntegerField', [], {'unique': 'True'}),
+            'house': ('django.db.models.fields.CharField', [], {'max_length': '200'}),
+            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'is404': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
+            'language': ('django.db.models.fields.CharField', [], {'max_length': '200'}),
+            'last_processing_attempt': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}),
+            'last_processing_success': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}),
+            'last_sayit_import': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}),
+            'sayit_section': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['speeches.Section']", 'null': 'True', 'on_delete': 'models.PROTECT', 'blank': 'True'}),
+            'title': ('django.db.models.fields.CharField', [], {'max_length': '200'}),
+            'url': ('django.db.models.fields.URLField', [], {'max_length': '1000'})
+        }
+    }
+
+    complete_apps = ['za_hansard']
\ No newline at end of file
diff --git a/za_hansard/models.py b/za_hansard/models.py
index efac72b..43a8562 100644
--- a/za_hansard/models.py
+++ b/za_hansard/models.py
@@ -436,3 +436,121 @@ class Meta:
         # 1) At least one of written_number and oral_number must be non-null.
 
 #CREATE TABLE completed_documents (`url` string);
+
+class ATCDocument(models.Model):
+    """
+    ZA Parliament Announcement, Tablings and Committee Reports.
+    """
+
+    title           = models.CharField(max_length=200)
+    document_name   = models.CharField(max_length=200)
+    document_number = models.IntegerField(unique=True)
+    date            = models.DateField()
+    url             = models.URLField(max_length=1000)
+    is404           = models.BooleanField(default=False)
+    house           = models.CharField(max_length=200)
+    language        = models.CharField(max_length=200)
+
+    last_processing_attempt = models.DateTimeField(blank=True, null=True)
+    last_processing_success = models.DateTimeField(blank=True, null=True)
+
+    contains_committee_announcement = models.BooleanField(default=False)
+
+    class Meta:
+        ordering = [ '-date', 'document_name' ]
+
+
+    def __unicode__(self):
+        return self.document_name
+
+
+    def delete(self):
+        """Delete this row from the database, then remove its cached file (if any)."""
+        # Compute the path before the row is deleted; it is derived from self.id.
+        cache_file_path = self.cache_file_path()
+        super(ATCDocument, self).delete()
+        if os.path.exists(cache_file_path):
+            os.remove(cache_file_path)
+
+
+    def file(self, debug=False):
+        """
+        Return the path of a local file holding the resource the url points to.
+
+        Checks the local cache first; on a miss the document is downloaded
+        and stored in the cache before its path is returned.
+
+        Raises a SourceUrlCouldNotBeRetrieved exception if URL could not be
+        retrieved.
+        """
+        cache_file_path = self.cache_file_path()
+
+        found = os.path.isfile(cache_file_path)
+
+        if debug:
+            print >> sys.stderr, "%s (%s)" % (cache_file_path, found)
+
+        # Cache hit: nothing to download
+        if found:
+            return cache_file_path
+
+        # Cache miss: fetch the document, save it to the cache, return its path
+        h = httplib2.Http()
+        url = 'http://www.parliament.gov.za/live/' + self.url
+
+        def request_url(url):
+            if debug:
+                print >> sys.stderr, 'Requesting %s' % url
+            (response, content) = h.request(url)
+            if response.status != 200:
+                raise SourceUrlCouldNotBeRetrieved("status code: %s, url: %s" % (response.status, self.url) )
+            self.is404 = False
+            self.save()
+            return (response, content)
+
+        try:
+            (response, content) = request_url(url)
+        except SourceUrlCouldNotBeRetrieved as e:  # retry once with a '.doc' suffix
+            try:
+                if not url[-4:] == '.doc':
+                    (response, content) = request_url(url + '.doc')
+                    self.url = self.url + '.doc'
+                    self.save()
+                else:
+                    raise e
+            except Exception:  # report the first failure, not the retry's
+                raise e
+
+        if not content:
+            raise SourceUrlCouldNotBeRetrieved("WTF?")
+        with open(cache_file_path, "wb") as new_cache_file:  # body is raw bytes
+            new_cache_file.write(content)
+
+        return cache_file_path
+
+    def cache_file_path(self):
+        """Absolute path to the cache file for this source"""
+
+        id_str= "%05u" % self.id
+
+        # do some simple partitioning
+        # FIXME - put in something to prevent the test suite overwriting non-test files.
+        aaa = id_str[-1]
+        bbb = id_str[-2]
+        cache_dir = os.path.join(settings.HANSARD_CACHE, aaa, bbb)
+
+        # check that the dir exists
+        if not os.path.exists( cache_dir ):
+            os.makedirs( cache_dir )
+
+        d = self.date.strftime('%Y-%m-%d')
+
+        # create the path to the file
+        cache_file_path = os.path.join(cache_dir, '-'.join([d, id_str, self.document_name]))
+        return cache_file_path
+
+    def xml_file_path(self):
+        xml_file_path = '%s.xml' % self.cache_file_path()
+        if os.path.isfile(xml_file_path):
+            return xml_file_path
+        return None

From abb623a7105a3c37049fd1087af35461e44079de Mon Sep 17 00:00:00 2001
From: Geoffrey Kilpin <geoffreykilpin@gmail.com>
Date: Mon, 7 Jul 2014 20:44:19 +0200
Subject: [PATCH 2/2] ATC scraper and committee change notifier

Scrapes ATC documents and detects announcements of changes
to committee membership.
---
 za_hansard/atc_scraper.py                     |  86 +++++++
 .../commands/za_hansard_atc_scraper.py        | 232 ++++++++++++++++++
 2 files changed, 318 insertions(+)
 create mode 100644 za_hansard/atc_scraper.py
 create mode 100644 za_hansard/management/commands/za_hansard_atc_scraper.py

diff --git a/za_hansard/atc_scraper.py b/za_hansard/atc_scraper.py
new file mode 100644
index 0000000..6c10bf4
--- /dev/null
+++ b/za_hansard/atc_scraper.py
@@ -0,0 +1,86 @@
+# -*- coding: utf-8 -*-
+import distutils.spawn
+import hashlib
+import os
+import sys
+import re
+import requests
+import subprocess
+import tempfile
+import warnings
+import datetime
+import lxml.etree
+
+import parslepy
+
+from django.core.exceptions import ImproperlyConfigured
+from django.conf import settings
+
+from za_hansard.models import ATCDocument
+
+#Largely based on question_scraper.py
+
+# from https://github.com/scraperwiki/scraperwiki-python/blob/a96582f6c20cc1897f410d522e2a5bf37d301220/scraperwiki/utils.py#L38-L54
+# Copied rather than included as the scraperwiki __init__.py was having trouble
+# loading the sqlite code, which is something we don't actually need.
+
+def ensure_executable_found(name):
+    if not distutils.spawn.find_executable(name):
+        raise ImproperlyConfigured("Can't find executable '{0}' which is needed by this code".format(name))
+
+ensure_executable_found("pdftohtml")
+def pdftoxml(pdfdata):
+    """Convert PDF data to XML by running pdftohtml; returns the XML text with <i> tags removed."""
+    pdffout = tempfile.NamedTemporaryFile(suffix='.pdf')
+    pdffout.write(pdfdata)
+    pdffout.flush()
+
+    xmlin = tempfile.NamedTemporaryFile(mode='r', suffix='.xml')
+    tmpxml = xmlin.name # "temph.xml"
+    cmd = 'pdftohtml -xml -nodrm -zoom 1.5 -enc UTF-8 -noframes "%s" "%s"' % (pdffout.name, os.path.splitext(tmpxml)[0])
+    cmd = cmd + " >/dev/null 2>&1" # can't turn off output, so throw away even stderr yeuch
+    os.system(cmd)
+
+    pdffout.close()
+    # NOTE(review): presumably pdftohtml appends ".xml" to the base name, so its output is xmlin's file - confirm
+    xmldata = xmlin.read()
+    xmlin.close()
+
+    # pdftohtml version 0.18.4 occasionally produces bad markup of the form <b>...<i>...</b> </i>
+    # Since we don't actually need <i> tags, we may as well get rid of them all now, which will fix this.
+    # Note that we're working with a byte string version of utf-8 encoded data here.
+
+    xmldata = re.sub(r'</?i>', '', xmldata)
+
+    return xmldata
+
+class ATCDocumentParser(object):
+    def __init__(self, **kwargs):
+        self.kwargs = kwargs
+
+    @classmethod
+    def check_committee_membership_announcement(cls, document_path):
+        """Return True if the PDF at document_path mentions 'Membership of
+        Committees', False if it does not, None if there was nothing to parse."""
+        contents = None  # stays None when the file does not exist
+        if os.path.exists(document_path):
+            with open(document_path, 'rb') as f:  # PDF data is binary
+                contents = f.read()
+        if not contents:
+            return
+
+        xmldata = pdftoxml(contents)
+        if not xmldata:
+            sys.stdout.write(' SKIPPING - Got no XML data\n')
+            return
+
+        text = lxml.etree.fromstring(xmldata)
+        for el in text.iterfind('.//text'):
+            # A <text> element serialised without a closing tag gives no match; skip it.
+            match = re.match(ur'(?s)<text.*?>(.*?)</text>', lxml.etree.tostring(el, encoding='unicode'))
+            if match and 'Membership of Committees' in match.group(1):
+                return True
+
+        return False  # committee announcement not found
+
+
diff --git a/za_hansard/management/commands/za_hansard_atc_scraper.py b/za_hansard/management/commands/za_hansard_atc_scraper.py
new file mode 100644
index 0000000..8f41c48
--- /dev/null
+++ b/za_hansard/management/commands/za_hansard_atc_scraper.py
@@ -0,0 +1,232 @@
+import pprint
+import httplib2
+import re
+import datetime
+import time
+import sys
+
+from optparse import make_option
+from bs4 import BeautifulSoup
+
+from django.conf import settings
+
+
+from django.core.management.base import BaseCommand, CommandError
+from django.core.mail import send_mail
+
+from za_hansard.models import ATCDocument, SourceUrlCouldNotBeRetrieved
+from ... import atc_scraper
+
+#Largely based on za_hansard_check_for_new_sources.py and za_hansard_run_parsing.py
+
+class FailedToRetrieveSourceException (Exception):
+    pass
+
+class Command(BaseCommand):
+    help = 'Scrapes ZA Parliament Announcements, Tablings and Committee Reports documents. Currently only monitors for mentions of changes to committee membership.'
+    option_list = BaseCommand.option_list + (
+        make_option('--check-all',
+            default=False,
+            action='store_true',
+            help="Don't stop when when reaching a previously seen item (applies to checking contents and for new papers)",
+        ),
+        make_option('--check-all-papers',
+            default=False,
+            action='store_true',
+            help="Don't stop when when reaching a previously seen item (applies only to new papers)",
+        ),
+        make_option('--retry',
+            default=False,
+            action='store_true',
+            help='Retry attempted (but not completed) parses and previously 404\'d documents',
+        ),
+        make_option('--check-for-papers',
+            default=False,
+            action='store_true',
+            help='Check for new ATC papers',
+        ),
+        make_option('--check-committees',
+            default=False,
+            action='store_true',
+            help='Checks for changes to committee membership ',
+        ),
+        make_option('--historical-limit',
+            default='2014-05-07',
+            type='str',
+            help='Limit earliest historical entry to check (in yyyy-mm-dd format, default 2014-05-07)',
+        ),
+        make_option('--limit',
+            default=0,
+            type='int',  # used as a slice bound and compared with page counters, so must be an int
+            help='Limit number of entries to check (applies to checking contents and for new papers)',
+        ),
+        make_option('--run-all-steps',
+            default=False,
+            action='store_true',
+            help='Check for new papers and for new committee memberships',
+        ),
+    )
+
+    def handle(self, *args, **options):
+
+        self.historical_limit = datetime.datetime.strptime(options['historical_limit'], '%Y-%m-%d').date()
+        self.limit = options['limit']
+        self.check_all = options['check_all']
+        self.check_all_papers = options['check_all_papers']
+        self.retry = options['retry']
+
+        if options['check_for_papers']:
+            self.check_for_papers(options)
+        elif options['check_committees']:
+            self.check_committees(options)
+        elif options['run_all_steps']:
+            self.check_for_papers(options)
+            self.check_committees(options)
+
+    def check_committees(self, options):
+        sources = ATCDocument.objects.all()
+        if not self.check_all:
+            sources = sources.filter( last_processing_success=None )
+        if (not self.retry) and (not self.check_all):
+            sources = sources.filter(is404 = False).filter( last_processing_attempt=None )
+
+        if not sources:
+            print 'No documents to check.'
+
+        for s in (sources[:self.limit] if self.limit else sources):
+
+            if s.date < self.historical_limit:
+                print "Reached historical limit. Stopping.\n"
+                return
+
+            s.last_processing_attempt = datetime.datetime.now()
+            s.save()
+
+            try:
+                try:
+                    filename = s.file()
+                    if s.is404:
+                        s.is404 = False
+                        s.save()
+                except SourceUrlCouldNotBeRetrieved as e:
+                    s.is404 = True
+                    s.save()
+                    raise e
+
+                if atc_scraper.ATCDocumentParser.check_committee_membership_announcement(filename):
+                    self.stdout.write( "Committee announcement found %s (%d)\n" % (s.document_name, s.document_number) )
+                    s.contains_committee_announcement = True
+
+                    message = '''A committee announcement was found in the following ATC document:\n
+                    \n
+                    Document: %s\n
+                    Date: %s\n
+                    House: %s\n
+                    Language: %s\n
+                    URL: %s
+                    ''' % (s.document_name, s.date, s.house, s.language, 'http://www.parliament.gov.za/live/' + s.url)
+
+                    send_mail('New committee announcement found - People\'s Assembly', message, settings.FROM_EMAIL, settings.ZA_COMMITTEE_NOTIFICATION_EMAIL, fail_silently=False)
+
+                s.last_processing_success = datetime.datetime.now()
+
+                s.save()
+                self.stdout.write( "Processed %s (%d)\n" % (s.document_name, s.document_number) )
+            except Exception as e:
+                # raise CommandError("Failed to run parsing: %s" % str(e))
+                self.stderr.write("WARN: Failed to run parsing: %s" % str(e))
+
+    def check_for_papers(self, options):
+        sources = self.retrieve_sources(0, options)
+        sources.reverse()
+        sources_db = [ATCDocument.objects.get_or_create(**source) for source in sources]
+        sources_count = len(sources)
+        created_count = sum([1 for (_,created) in sources_db if created])
+        self.stdout.write('ATC documents found: %d\nATC documents created: %d\n' % (
+            sources_count, created_count))
+
+    def retrieve_sources(self, start, options):
+
+        try:
+            url = 'http://www.parliament.gov.za/live/content.php?Category_ID=227&DocumentStart=%d' % (start or 0)
+            self.stdout.write("Retrieving %s\n" % url)
+            h = httplib2.Http( settings.HTTPLIB2_CACHE_DIR )
+            response, content = h.request(url)
+            assert response.status == 200
+            self.stdout.write("OK\n")
+            # content = open('test.html').read()
+
+            # parse content
+            soup = BeautifulSoup(
+                content,
+                'xml',
+            )
+
+            rx = re.compile(r'Displaying (\d+)  (\d+) of the most recent (\d+)')
+
+            pager = soup.find('td', text=rx)
+            match = rx.search(pager.text)
+            (pstart, pend, ptotal) = [int(p) for p in match.groups()]
+
+            self.stdout.write( "Processing %d to %d\n" % (pstart, pend) )
+
+            nodes = soup.findAll( 'a', text="View Document" )
+            def scrape(node):
+                url = node['href']
+                table = node.find_parent('table')
+                rx = re.compile(r'>([^:<]*) : ([^<]*)<')
+
+                data = {}
+                for match in re.finditer(rx, str(table)):
+                    groups = match.groups()
+                    data[groups[0]] = groups[1]
+
+                title = ''
+                try:
+                    data['Title'] = table.find('b').text
+                except:
+                    data['Title'] = data.get('Document Summary', '(unknown)')
+
+                try:
+                    document_date = datetime.datetime.strptime(data['Date Published'], '%d %B %Y').date()
+                except Exception as e:
+                    raise CommandError( "Date could not be parsed\n%s" % str(e) )
+                    # document_date = datetime.date.today()
+
+                #(obj, created) = Source.objects.get_or_create(
+                return {
+                    'document_name':   data['Document Name'],
+                    'document_number': data['Document Number'],
+                    'defaults': {
+                        'url':      url,
+                        'title':    data['Title'],
+                        'language': data.get('Language', 'English'),
+                        'house':    data.get('House', '(unknown)'),
+                        'date':     document_date,
+                    }
+                }
+            scraped = []
+            for node in nodes:
+                s = scrape(node)
+                if ATCDocument.objects.filter(
+                    document_name   = s['document_name'],
+                    document_number = s['document_number']).exists():
+                    if (not self.check_all) and (not self.check_all_papers):
+                        print "Reached seen document. Stopping.\n"
+                        return scraped
+                if s['defaults']['date'] < self.historical_limit:
+                    print "Reached historical limit. Stopping.\n"
+                    return scraped
+
+                # otherwise
+                scraped.append(s)
+
+            if pend < (self.limit or ptotal):
+                # NB following isn't phrased as a tail call, could rewrite if
+                # that becomes important
+                scraped = scraped + self.retrieve_sources(pend, options)
+            return scraped
+
+        except Exception as e:
+            print >> sys.stderr, "ERROR: %s" % str(e)
+            return []