From 33756fc9ab9ab3511d410b18b27791cd8e922c17 Mon Sep 17 00:00:00 2001
From: Dave Voutila
Date: Thu, 2 Feb 2017 12:21:51 -0500
Subject: [PATCH] Py3 upgrade and Pacer Refactoring (#171)

* lots of changes to bring into line for Python 3.6 using six and other tricks. added tox for testing. still an issue with the title case function due to how python handles unicode strings now.
* found a possible fix for the unicode issue in py3. bit of a hack...but tries to see if a string starts with unicode or not.
* added python 3.5 and 3.6 to travis file.
* turning off DEBUG in the title case test.
* refixing the requirements to be exact versions for now. put a py2/3 compatibility wrapper function around calls to the requests response objects.
* set requests to new version that works locally. fixed an issue with the mock not closing a connection. removed my stupid broken non-fix for test_pacer.py
* refactored cookie creation to be a bit more explicit in setting a cookie jar instance. refactored out posts to PACER as it turns out you need some black magick voodoo to form the post body into something it will enjoy.
* bumped requests version back down to same version as CL for now. added mock dependency for unit tests (to tox.ini and requirements-dev.txt). started refactoring some of the Pacer stuff into a PacerSession class that extends requests.Session to handle PACER nuances. tests passing locally with tox using free login.
* cleaned up setup.py and moved some test requirements out of base requirements.txt file. still need to update README.rst about changes. refactored the BadLoginException into the juriscraper.pacer.http module as it fits better next to the place that raises it. added default timeout value of 300 to pacer sessions since it seemed commonly set elsewhere.
* relaxing error condition for logins.
* attempt to refactor PACER login to use central auth service while still supporting the legacy test site, which the central service does not seem to support at the moment.
* slimming down the tests to focus on key functionality vs. breadth of courts.
* changes to README.rst, minor tweaks related to code review.
* segregated python2 and python3 specific regex due to issues with unicode raw string literals. minor tweaks per code review.
* added new exception class to distinguish bad pacer credentials, changed login to test site based on "psc" court_id instead of username of tr1234 --- .gitignore | 2 + .travis.yml | 2 + README.rst | 42 ++-- juriscraper/AbstractSite.py | 19 +- juriscraper/OpinionSite.py | 2 +- juriscraper/OralArgumentSite.py | 2 +- juriscraper/lib/date_utils.py | 8 +- juriscraper/lib/html_utils.py | 12 +- juriscraper/lib/importer.py | 6 +- juriscraper/lib/log_tools.py | 18 +- juriscraper/lib/string_utils.py | 116 +++++---- juriscraper/lib/string_utils_py2.py | 28 +++ juriscraper/lib/test_utils.py | 11 +- .../federal_district/ed_louisiana.py | 3 +- .../opinions/united_states/state/vt_u.py | 2 +- .../federal_appellate/ca4.py | 2 +- .../federal_appellate/ca5.py | 2 +- .../state/ind_2005.py | 2 +- .../united_states/federal_appellate/ca6.py | 2 +- juriscraper/pacer/__init__.py | 4 +- juriscraper/pacer/auth.py | 55 ---- juriscraper/pacer/docket_report.py | 55 ++-- juriscraper/pacer/exceptions.py | 5 - juriscraper/pacer/free_documents.py | 63 +++-- juriscraper/pacer/http.py | 177 +++++++++++++ juriscraper/pacer/utils.py | 1 - requirements-dev.txt | 3 + requirements.txt | 7 +- setup.py | 13 +- test_pacer.py/__init__.py | 0 tests/__init__.py | 8 + tests/test_everything.py | 43 ++-- tests/test_pacer.py | 234 ++++++++++++++---- tox.ini | 6 + 34 files changed, 642 insertions(+), 313 deletions(-) create mode 100644 juriscraper/lib/string_utils_py2.py delete mode 100644 juriscraper/pacer/auth.py delete mode 100644 juriscraper/pacer/exceptions.py create mode 100644 juriscraper/pacer/http.py delete mode 100644 test_pacer.py/__init__.py create mode 100644 tox.ini diff --git a/.gitignore b/.gitignore index 2b6198b2d..39bf6f681 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,5 @@ juriscraper.egg-info/ # Private PACER stuff and test fixtures juriscraper/pacer/private_settings.py tests/fixtures/cassettes/ + +.tox diff --git a/.travis.yml b/.travis.yml index fe37e78ec..bacbad21e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,8 @@ sudo: false language: python python: - '2.7' +- '3.5' +- '3.6' script: python setup.py test install: pip install -U setuptools ; pip install . cache: pip diff --git a/README.rst b/README.rst index c1d82b2b2..14e4e6c8d 100644 --- a/README.rst +++ b/README.rst @@ -44,15 +44,21 @@ First step: Install Python 2.7.x, then: :: - # install the dependencies - sudo apt-get install libxml2-dev libxslt-dev libyaml-dev - - # Install PhantomJS - wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-1.9.7-linux-x86_64.tar.bz2 - tar -x -f phantomjs-1.9.7-linux-x86_64.tar.bz2 - sudo mkdir -p /usr/local/phantomjs - sudo mv phantomjs-1.9.7-linux-x86_64/bin/phantomjs /usr/local/phantomjs - rm -r phantomjs-1.9.7* # Cleanup + # -- Install the dependencies + # On Ubuntu/Debian Linux: + sudo apt-get install libxml2-dev libxslt-dev libyaml-dev + # On macOS with Homebrew: + brew install libyaml + + # -- Install PhantomJS + # On Ubuntu/Debian Linux: + wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-1.9.7-linux-x86_64.tar.bz2 + tar -x -f phantomjs-1.9.7-linux-x86_64.tar.bz2 + sudo mkdir -p /usr/local/phantomjs + sudo mv phantomjs-1.9.7-linux-x86_64/bin/phantomjs /usr/local/phantomjs + rm -r phantomjs-1.9.7* # Cleanup + # On macOS with Homebrew: + brew install phantomjs # Finally, install the code. pip install juriscraper @@ -74,15 +80,15 @@ We also generally use Intellij with PyCharm installed.
These are useful because For scrapers to be merged: -- ``python setup.py test`` must pass, listing the results for any new - scrapers. This will be run automatically by +- Running tests via ``tox`` must pass, listing the results for any new + scrapers. The test suite will be run automatically by `Travis-CI `__. If changes are being made to the pacer code, the pacer tests must also pass when run. These tests are skipped by default. To run them, set environment variables for PACER_USERNAME and PACER_PASSWORD. -- a \*\_example\* file must be included in the ``tests/examples`` +- A \*\_example\* file must be included in the ``tests/examples`` directory (this is needed for the tests to run your code). -- your code should be +- Your code should be `PEP8 `__ compliant with no major Pylint problems or Intellij inspection issues. -- your code should efficiently parse a page, returning no exceptions or +- Your code should efficiently parse a page, returning no exceptions or speed warnings during tests on a modern machine. When you're ready to develop a scraper, get in touch, and we'll find you @@ -117,8 +123,8 @@ Instead of installing Juriscraper via pip, do the following: :: git clone https://github.com/freelawproject/juriscraper.git . - python setup.py install - + pip install -r requirements.txt + python setup.py test Usage ===== @@ -188,8 +194,8 @@ Tests ===== We got that! You can (and should) run the tests with -``python setup.py test``. This will iterate over all of the -``*_example*`` files and run the scrapers against them. +``tox``. This will run ``python setup.py test`` for all supported Python runtimes, +iterating over all of the ``*_example*`` files and running the scrapers against them. In addition, we use `Travis-CI `__ to automatically run the tests whenever code is committed to the repository diff --git a/juriscraper/AbstractSite.py b/juriscraper/AbstractSite.py index 414e91adb..a3f9387a5 100644 --- a/juriscraper/AbstractSite.py +++ b/juriscraper/AbstractSite.py @@ -1,9 +1,8 @@ -import re import json import certifi import hashlib import requests - +import six from datetime import date, datetime from requests.adapters import HTTPAdapter @@ -139,7 +138,7 @@ def _clean_attributes(self): if attr == 'download_urls': sub_item = sub_item.strip() else: - if isinstance(sub_item, basestring): + if isinstance(sub_item, six.string_types): sub_item = clean_string(sub_item) elif isinstance(sub_item, datetime): sub_item = sub_item.date() @@ -178,7 +177,7 @@ def _check_sanity(self): for attr in self._all_attrs: if self.__getattribute__(attr) is not None: lengths[attr] = len(self.__getattribute__(attr)) - values = lengths.values() + values = list(lengths.values()) if values.count(values[0]) != len(values): # Are all elements equal? raise InsanityException("%s: Scraped meta data fields have differing" @@ -236,10 +235,10 @@ def _date_sort(self): obj_list_attrs = [self.__getattribute__(attr) for attr in self._all_attrs if isinstance(self.__getattribute__(attr), list)] - zipped = zip(*obj_list_attrs) + zipped = list(zip(*obj_list_attrs)) zipped.sort(reverse=True) i = 0 - obj_list_attrs = zip(*zipped) + obj_list_attrs = list(zip(*zipped)) for attr in self._all_attrs: if isinstance(self.__getattribute__(attr), list): self.__setattr__(attr, obj_list_attrs[i][:]) @@ -249,7 +248,7 @@ def _make_hash(self): """Make a unique ID.
ETag and Last-Modified from courts cannot be trusted """ - self.hash = hashlib.sha1(str(self.case_names)).hexdigest() + self.hash = hashlib.sha1(str(self.case_names).encode()).hexdigest() def _get_adapter_instance(self): """Hook for returning a custom HTTPAdapter @@ -339,7 +338,11 @@ def _return_request_text_object(self): if 'json' in self.request['request'].headers.get('content-type', ''): return self.request['request'].json() else: - text = self._clean_text(self.request['request'].text) + payload = self.request['request'].content + if six.PY2: + payload = self.request['request'].text + + text = self._clean_text(payload) html_tree = self._make_html_tree(text) html_tree.rewrite_links(fix_links_in_lxml_tree, base_href=self.request['url']) diff --git a/juriscraper/OpinionSite.py b/juriscraper/OpinionSite.py index 794598952..1259b42bd 100644 --- a/juriscraper/OpinionSite.py +++ b/juriscraper/OpinionSite.py @@ -1,4 +1,4 @@ -from AbstractSite import AbstractSite +from juriscraper.AbstractSite import AbstractSite class OpinionSite(AbstractSite): diff --git a/juriscraper/OralArgumentSite.py b/juriscraper/OralArgumentSite.py index 592df567f..ad75f611b 100644 --- a/juriscraper/OralArgumentSite.py +++ b/juriscraper/OralArgumentSite.py @@ -1,4 +1,4 @@ -from AbstractSite import AbstractSite +from juriscraper.AbstractSite import AbstractSite class OralArgumentSite(AbstractSite): diff --git a/juriscraper/lib/date_utils.py b/juriscraper/lib/date_utils.py index dac7b50bc..9624f6336 100644 --- a/juriscraper/lib/date_utils.py +++ b/juriscraper/lib/date_utils.py @@ -108,11 +108,11 @@ def parse_dates(s, debug=False, sane_start=datetime.datetime(1750, 1, 1), # Ditch unicode (_timelex() flips out on unicode if the system has # cStringIO installed -- the default) - if isinstance(s, unicode): - s = s.encode('ascii', 'ignore') + #if isinstance(s, six.text_type): + # s = s.encode('ascii', 'ignore') # Fix misspellings - for i, j in MISSPELLINGS.iteritems(): + for i, j in MISSPELLINGS.items(): s = s.replace(i, j) @@ -127,7 +127,7 @@ def parse_dates(s, debug=False, sane_start=datetime.datetime(1750, 1, 1), hit_default_day_and_month = (d.month == DEFAULT.month and d.day == DEFAULT.day) if not any([hit_default_year, hit_default_day_and_month]): if debug: - print "Item %s parsed as: %s" % (item, d) + print("Item %s parsed as: %s" % (item, d)) if sane_start < d < sane_end: dates.append(d) except OverflowError: diff --git a/juriscraper/lib/html_utils.py b/juriscraper/lib/html_utils.py index ea32424ba..82bb9c891 100644 --- a/juriscraper/lib/html_utils.py +++ b/juriscraper/lib/html_utils.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # encoding: utf-8 -from urlparse import urlsplit -from urlparse import urlunsplit +from six import text_type +from six.moves.urllib.parse import urlsplit, urlunsplit import re from lxml import html @@ -78,7 +78,11 @@ def set_response_encoding(request): # HTTP headers. This way it is done before r.text is accessed # (which would do it with vanilla chardet). This is a big # performance boon, and can be removed once requests is upgraded - request.encoding = chardet.detect(request.content)['encoding'] + if isinstance(request.content, text_type): + as_bytes = request.content.encode() + request.encoding = chardet.detect(as_bytes)['encoding'] + else: + request.encoding = chardet.detect(request.content)['encoding'] def clean_html(text): @@ -100,7 +104,7 @@ def clean_html(text): # attribute, but we remove it in all cases, as there's no downside to # removing it. 
This moves our encoding detection to chardet, rather than # lxml. - if isinstance(text, unicode): + if isinstance(text, text_type): text = re.sub(r'^\s*<\?xml\s+.*?\?>', '', text) # Fix
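For context on the html_utils.py change above: chardet.detect() only accepts bytes, so the patched set_response_encoding() now encodes any text payload before sniffing. Below is a minimal standalone sketch of the same guard; the detect_encoding helper name is made up for illustration and is not part of the patch:

import chardet
from six import text_type


def detect_encoding(payload):
    """Guess the encoding of a response payload with chardet.

    chardet.detect() requires bytes. Under Python 3, mocked or
    pre-decoded responses can hand us text instead of bytes, so
    encode text payloads before sniffing.
    """
    if isinstance(payload, text_type):
        payload = payload.encode()
    return chardet.detect(payload)['encoding']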
diff --git a/juriscraper/lib/importer.py b/juriscraper/lib/importer.py index 0c1cb2cc7..7ff935a04 100644 --- a/juriscraper/lib/importer.py +++ b/juriscraper/lib/importer.py @@ -34,9 +34,9 @@ def find_all_attr_or_punt(court_id): # juriscraper.opinions.united_states.federal_appellate.ca1, # therefore, we add it to our list! module_strings.append(court_id) - except ImportError, e: + except ImportError as e: # Something has gone wrong with the import - print "Import error: %s" % e + print("Import error: %s" % e) return [] find_all_attr_or_punt(court_id) @@ -51,5 +51,5 @@ def site_yielder(iterable, mod): try: site._download_backwards(i) yield site - except HTTPError, e: + except HTTPError as e: continue diff --git a/juriscraper/lib/log_tools.py b/juriscraper/lib/log_tools.py index ecdbe690e..f80fb6613 100644 --- a/juriscraper/lib/log_tools.py +++ b/juriscraper/lib/log_tools.py @@ -24,28 +24,28 @@ def make_default_logger(file_path=LOG_FILENAME): maxBytes=5120000, backupCount=7 ) - except IOError, e: + except IOError as e: if e.errno == 2: - print "\nWarning: %s: %s. " \ + print("\nWarning: %s: %s. " \ "Have you created the directory for the log?" % ( e.strerror, file_path, - ) + )) elif e.errno == 13: - print "\nWarning: %s: %s. " \ + print("\nWarning: %s: %s. " \ "Cannot access file as user: %s" % ( e.strerror, file_path, getpass.getuser(), - ) + )) else: - print "\nIOError [%s]: %s\n%s" % ( + print("\nIOError [%s]: %s\n%s" % ( e.errno, e.strerror, traceback.format_exc() - ) - print "Juriscraper will continue to run, and all logs will be " \ - "sent to stdout." + )) + print("Juriscraper will continue to run, and all logs will be " \ + "sent to stdout.") handler = logging.StreamHandler(sys.stdout) handler.setFormatter( logging.Formatter('%(asctime)s - %(levelname)s: %(message)s') diff --git a/juriscraper/lib/string_utils.py b/juriscraper/lib/string_utils.py index 7d724dfd2..848cce642 100644 --- a/juriscraper/lib/string_utils.py +++ b/juriscraper/lib/string_utils.py @@ -4,33 +4,40 @@ import string import calendar import geonamescache +import six from dateutil import parser from datetime import timedelta # For use in titlecase -BIG = ('3D|AFL|AKA|A/K/A|BMG|CBS|CDC|CDT|CEO|CIO|CNMI|D/B/A|DOJ|DVA|EFF|FCC|' - 'FTC|HSBC|IBM|II|III|IV|JJ|LLC|LLP|MCI|MJL|MSPB|ND|NLRB|PTO|SD|UPS|RSS|SEC|UMG|US|USA|USC|' - 'USPS|WTO') -SMALL = 'a|an|and|as|at|but|by|en|for|if|in|is|of|on|or|the|to|v\.?|via|vs\.?' -NUMS = '0123456789' -PUNCT = r"""!"#$¢%&'‘()*+,\-./:;?@[\\\]_—`{|}~""" -WEIRD_CHARS = r'¼½¾§ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜßàáâãäåæçèéêëìíîïñòóôœõöøùúûüÿ' -BIG_WORDS = re.compile(r'^(%s)[%s]?$' % (BIG, PUNCT), re.I) -SMALL_WORDS = re.compile(r'^(%s)$' % SMALL, re.I) -SMALL_WORD_INLINE = re.compile(r'(^|\s)(%s)(\s|$)' % SMALL, re.I) -INLINE_PERIOD = re.compile(r'[a-z][.][a-z]', re.I) -INLINE_SLASH = re.compile(r'[a-z][/][a-z]', re.I) -INLINE_AMPERSAND = re.compile(r'([a-z][&][a-z])(.*)', re.I) -UC_ELSEWHERE = re.compile(r'[%s]*?[a-zA-Z]+[A-Z]+?' 
% PUNCT) -CAPFIRST = re.compile(r"^[%s]*?([A-Za-z])" % PUNCT) -SMALL_FIRST = re.compile(r'^([%s]*)(%s)\b' % (PUNCT, SMALL), re.I) -SMALL_LAST = re.compile(r'\b(%s)[%s]?$' % (SMALL, PUNCT), re.I) -SUBPHRASE = re.compile(r'([:;?!][ ])(%s)' % SMALL) -APOS_SECOND = re.compile(r"^[dol]{1}['‘]{1}[a-z]+$", re.I) -ALL_CAPS = re.compile(r'^[A-Z\s%s%s%s]+$' % (PUNCT, WEIRD_CHARS, NUMS)) -UC_INITIALS = re.compile(r"^(?:[A-Z]{1}\.{1}|[A-Z]{1}\.{1}[A-Z]{1})+,?$") -MAC_MC = re.compile(r'^([Mm]a?c)(\w+.*)') +if six.PY2: + # Python 3.x doesn't like the old ur'' notation, so we need to hide it. + from .string_utils_py2 import * +else: + BIG = ('3D|AFL|AKA|A/K/A|BMG|CBS|CDC|CDT|CEO|CIO|CNMI|D/B/A|DOJ|DVA|EFF|' + 'FCC|FTC|HSBC|IBM|II|III|IV|JJ|LLC|LLP|MCI|MJL|MSPB|ND|NLRB|PTO|' + 'SD|UPS|RSS|SEC|UMG|US|USA|USC|USPS|WTO') + SMALL = 'a|an|and|as|at|but|by|en|for|if|in|is|of|on|or|the|to|v\.?|via|vs\.?' + NUMS = '0123456789' + PUNCT = r"""!"#$¢%&'‘()*+,\-./:;?@[\\\]_—`{|}~""" + WEIRD_CHARS = r'¼½¾§ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜßàáâãäåæçèéêëìíîïñòóôœõöøùúûüÿ' + BIG_WORDS = re.compile(r'^(%s)[%s]?$' % (BIG, PUNCT), re.I | re.U) + SMALL_WORDS = re.compile(r'^(%s)$' % SMALL, re.I) + SMALL_WORD_INLINE = re.compile(r'(^|\s)(%s)(\s|$)' % SMALL, re.I | re.U) + INLINE_PERIOD = re.compile(r'[a-z][.][a-z]', re.I) + INLINE_SLASH = re.compile(r'[a-z][/][a-z]', re.I) + INLINE_AMPERSAND = re.compile(r'([a-z][&][a-z])(.*)', re.I) + UC_ELSEWHERE = re.compile(r'[%s]*?[a-zA-Z]+[A-Z]+?' % PUNCT, re.U) + CAPFIRST = re.compile(r"^[%s]*?([A-Za-z])" % PUNCT) + SMALL_FIRST = re.compile(r'^([%s]*)(%s)\b' % (PUNCT, SMALL), re.I | re.U) + SMALL_LAST = re.compile(r'\b(%s)[%s]?$' % (SMALL, PUNCT), re.I | re.U) + SUBPHRASE = re.compile(r'([:;?!][ ])(%s)' % SMALL) + APOS_SECOND = re.compile(r"^[dol]{1}['‘]{1}[a-z]+$", re.I) + ALL_CAPS = re.compile(r'^[A-Z\s%s%s%s]+$' % (PUNCT, WEIRD_CHARS, NUMS)) + UC_INITIALS = re.compile(r"^(?:[A-Z]{1}\.{1}|[A-Z]{1}\.{1}[A-Z]{1})+,?$") + MAC_MC = re.compile(r'^([Mm]a?c)(\w+.*)') + + def titlecase(text, DEBUG=False): """Titlecases input text @@ -50,10 +57,10 @@ def titlecase(text, DEBUG=False): # if, after removing small words, the entire string is uppercase, # we lowercase it if DEBUG: - print "Entire string is uppercase, thus lowercasing." + print("Entire string is uppercase, thus lowercasing.") text = text.lower() elif not text_sans_small_words.isupper() and DEBUG: - print "Entire string not upper case. Not lowercasing: %s" % text + print("Entire string not upper case. Not lowercasing: %s" % text) lines = re.split('[\r\n]+', text) processed = [] @@ -61,31 +68,31 @@ def titlecase(text, DEBUG=False): all_caps = ALL_CAPS.match(line) words = re.split('[\t ]', line) tc_line = [] - for word in words: + for i, word in enumerate(words): if DEBUG: - print "Word: " + word + print("Word: " + word) if all_caps: if UC_INITIALS.match(word): if DEBUG: - print " UC_INITIALS match for: " + word + print(" UC_INITIALS match for: " + word) tc_line.append(word) continue else: if DEBUG: - print " Not initials. Lowercasing: " + word + print(" Not initials. Lowercasing: " + word) word = word.lower() if APOS_SECOND.match(word): # O'Reiley, L'Oreal, D'Angelo if DEBUG: - print " APOS_SECOND matched. Fixing it: " + word + print(" APOS_SECOND matched. Fixing it: " + word) word = word[0:3].upper() + word[3:] tc_line.append(word) continue if INLINE_PERIOD.search(word): if DEBUG: - print " INLINE_PERIOD matched. Uppercasing if == 1 char: " + word + print(" INLINE_PERIOD matched. 
Uppercasing if == 1 char: " + word) parts = word.split('.') new_parts = [] for part in parts: @@ -102,7 +109,7 @@ def titlecase(text, DEBUG=False): if INLINE_SLASH.search(word): # This repeats INLINE_PERIOD. Could be more elegant. if DEBUG: - print " INLINE_SLASH matched. Uppercasing if == 1 char: " + word + print(" INLINE_SLASH matched. Uppercasing if == 1 char: " + word) parts = word.split('/') new_parts = [] for part in parts: @@ -119,40 +126,40 @@ def titlecase(text, DEBUG=False): amp_match = INLINE_AMPERSAND.match(word) if amp_match: if DEBUG: - print " INLINE_AMPERSAND matched. Uppercasing: " + word + print(" INLINE_AMPERSAND matched. Uppercasing: " + word) tc_line.append("%s%s" % (amp_match.group(1).upper(), amp_match.group(2))) continue if UC_ELSEWHERE.match(word): if DEBUG: - print " UC_ELSEWHERE matched. Leaving unchanged: " + word + print(" UC_ELSEWHERE matched. Leaving unchanged: " + word) tc_line.append(word) continue if SMALL_WORDS.match(word): if DEBUG: - print " SMALL_WORDS matched. Lowercasing: " + word + print(" SMALL_WORDS matched. Lowercasing: " + word) tc_line.append(word.lower()) continue if BIG_WORDS.match(word): if DEBUG: - print " BIG_WORDS matched. Uppercasing: " + word + print(" BIG_WORDS matched. Uppercasing: " + word) tc_line.append(word.upper()) continue match = MAC_MC.match(word) if match and (word not in ['mack', 'machine']): if DEBUG: - print " MAC_MAC matched. Capitlizing: " + word + print(" MAC_MAC matched. Capitalizing: " + word) tc_line.append("%s%s" % (match.group(1).capitalize(), match.group(2).capitalize())) continue hyphenated = [] for item in word.split('-'): - hyphenated.append(CAPFIRST.sub(lambda m: m.group(0).upper(), item)) + hyphenated.append(_uppercase_word(item)) tc_line.append("-".join(hyphenated)) result = " ".join(tc_line) @@ -175,6 +182,21 @@ def titlecase(text, DEBUG=False): return text +def _uppercase_word(word): + """ + Helper function for uppercasing a word if it doesn't begin with Unicode characters. + + This is needed due to differences between Python 2 and 3. + :param word: unicode string to uppercase + """ + #if six.PY2: + # if word[0] in (u'\u2019',): + # return CAPFIRST.sub(lambda m: m.group(0).upper(), word[1:]) + + return CAPFIRST.sub(lambda m: m.group(0).upper(), word) + + + def fix_camel_case(s): """Sometimes courts provide nasty camel-cased content instead of real words. This code attempts to fix that.""" @@ -182,7 +204,7 @@ def fix_camel_case(s): s_out = s else: s_out = s[0] - for i in xrange(1, len(s)): + for i in range(1, len(s)): # Iterate over the letters, starting with the second one. if s[i - 1].isupper() and s[i].isupper(): # A pattern like 'PAPublic' --> PA Public @@ -248,6 +270,8 @@ def fix_camel_case(s): '|respond(e|a)nts?(--?|/)appell(ee|ant)s?|cross(--?|/)respondents?|crosss?(--?|/)petitioners?' + \ '|cross(--?|/)appell(ees|ant)s?|deceased' BAD_WORDS = re.compile(r'^(%s)(,|\.)?$' % BW, re.I) + + def harmonize(text): """Fixes case names so they are cleaner. @@ -363,20 +387,20 @@ def clean_string(s): def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'): - # Borrows heavily from django.utils.encoding.force_unicde. + # Borrows heavily from django.utils.encoding.force_unicode. # This should be applied to *input* not *output*! # Handle the common case first, saves 30-40% in performance when s # is an instance of unicode. This function gets called often in that # setting. 
- if isinstance(s, unicode): + if isinstance(s, six.text_type): return s try: - if not isinstance(s, basestring,): + if not isinstance(s, six.string_types): if hasattr(s, '__unicode__'): - s = unicode(s) + s = six.text_type(s) else: try: - s = unicode(str(s), encoding, errors) + s = six.text_type(str(s), encoding, errors) except UnicodeEncodeError: if not isinstance(s, Exception): raise @@ -388,12 +412,12 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'): # output should be. s = ' '.join([force_unicode(arg, encoding, strings_only, errors) for arg in s]) - elif not isinstance(s, unicode): + elif not isinstance(s, six.text_type): # Note: We use .decode() here, instead of unicode(s, encoding, # errors), so that if s is a SafeString, it ends up being a # SafeUnicode at the end. s = s.decode(encoding, errors) - except UnicodeDecodeError, e: + except UnicodeDecodeError as e: if not isinstance(s, Exception): raise else: @@ -458,7 +482,7 @@ def split_date_range_string(date_range_string): end_date = convert_date_string('%s %d, %s' % (month2, last_day, year)) delta = end_date - start_date dates_in_range = [start_date + timedelta(d) for d in range(delta.days + 1)] - return dates_in_range[len(dates_in_range) / 2] + return dates_in_range[int(len(dates_in_range) / 2)] def normalize_dashes(raw_string): diff --git a/juriscraper/lib/string_utils_py2.py b/juriscraper/lib/string_utils_py2.py new file mode 100644 index 000000000..169fb4bde --- /dev/null +++ b/juriscraper/lib/string_utils_py2.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- +""" +Python 2.x Regular Expression patterns for string_utils.py +""" +import re + +BIG = ('3D|AFL|AKA|A/K/A|BMG|CBS|CDC|CDT|CEO|CIO|CNMI|D/B/A|DOJ|DVA|EFF|FCC|' + 'FTC|HSBC|IBM|II|III|IV|JJ|LLC|LLP|MCI|MJL|MSPB|ND|NLRB|PTO|SD|UPS|RSS|SEC|UMG|US|USA|USC|' + 'USPS|WTO') +SMALL = u'a|an|and|as|at|but|by|en|for|if|in|is|of|on|or|the|to|v\.?|via|vs\.?' +NUMS = u'0123456789' +PUNCT = ur"""!"#$¢%&'‘()*+,\-./:;?@[\\\]_—`{|}~""" +WEIRD_CHARS = ur'¼½¾§ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜßàáâãäåæçèéêëìíîïñòóôœõöøùúûüÿ' +BIG_WORDS = re.compile(ur'^(%s)[%s]?$' % (BIG, PUNCT), re.I | re.U) +SMALL_WORDS = re.compile(r'^(%s)$' % SMALL, re.I) +SMALL_WORD_INLINE = re.compile(r'(^|\s)(%s)(\s|$)' % SMALL, re.I | re.U) +INLINE_PERIOD = re.compile(r'[a-z][.][a-z]', re.I) +INLINE_SLASH = re.compile(r'[a-z][/][a-z]', re.I) +INLINE_AMPERSAND = re.compile(r'([a-z][&][a-z])(.*)', re.I) +UC_ELSEWHERE = re.compile(ur'[%s]*?[a-zA-Z]+[A-Z]+?' % PUNCT, re.U) +CAPFIRST = re.compile(ur"^[%s]*?([A-Za-z])" % PUNCT) +SMALL_FIRST = re.compile(ur'^([%s]*)(%s)\b' % (PUNCT, SMALL), re.I | re.U) +SMALL_LAST = re.compile(r'\b(%s)[%s]?$' % (SMALL, PUNCT), re.I | re.U) +SUBPHRASE = re.compile(r'([:;?!][ ])(%s)' % SMALL) +APOS_SECOND = re.compile(r"^[dol]{1}['‘]{1}[a-z]+$", re.I) +ALL_CAPS = re.compile(ur'^[A-Z\s%s%s%s]+$' % (PUNCT, WEIRD_CHARS, NUMS)) +UC_INITIALS = re.compile(r"^(?:[A-Z]{1}\.{1}|[A-Z]{1}\.{1}[A-Z]{1})+,?$") +MAC_MC = re.compile(r'^([Mm]a?c)(\w+.*)') \ No newline at end of file diff --git a/juriscraper/lib/test_utils.py b/juriscraper/lib/test_utils.py index 9d15b581c..9f373e4cc 100644 --- a/juriscraper/lib/test_utils.py +++ b/juriscraper/lib/test_utils.py @@ -19,11 +19,12 @@ def __init__(self, url=None): def get(self): r = Response() try: - r._content = open(self.url).read() - #: Integer Code of responded HTTP Status.
- r.status_code = 200 - if self.url.endswith('json'): - r.headers['content-type'] = 'application/json' + with open(self.url) as stream: + r._content = stream.read() + #: Integer Code of responded HTTP Status. + r.status_code = 200 + if self.url.endswith('json'): + r.headers['content-type'] = 'application/json' except IOError as e: r.status_code = 404 raise ConnectionError(e) diff --git a/juriscraper/opinions/united_states/federal_district/ed_louisiana.py b/juriscraper/opinions/united_states/federal_district/ed_louisiana.py index 606ce8696..99be1e23a 100644 --- a/juriscraper/opinions/united_states/federal_district/ed_louisiana.py +++ b/juriscraper/opinions/united_states/federal_district/ed_louisiana.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- import datetime -import urlparse +from six.moves.urllib import parse as urlparse import sys from dateutil.rrule import rrule, DAILY @@ -97,4 +97,3 @@ def _download_backwards(self, d): # Setting status is important because it prevents the download # function from being run a second time by the parse method. self.status = 200 - diff --git a/juriscraper/opinions/united_states/state/vt_u.py b/juriscraper/opinions/united_states/state/vt_u.py index 5b1c9bfd2..a1114230c 100644 --- a/juriscraper/opinions/united_states/state/vt_u.py +++ b/juriscraper/opinions/united_states/state/vt_u.py @@ -7,7 +7,7 @@ """ from datetime import datetime -from urlparse import urlsplit +from six.moves.urllib.parse import urlsplit from juriscraper.OpinionSite import OpinionSite from lxml import html diff --git a/juriscraper/opinions/united_states_backscrapers/federal_appellate/ca4.py b/juriscraper/opinions/united_states_backscrapers/federal_appellate/ca4.py index ec63b825a..38a3bbe9e 100644 --- a/juriscraper/opinions/united_states_backscrapers/federal_appellate/ca4.py +++ b/juriscraper/opinions/united_states_backscrapers/federal_appellate/ca4.py @@ -67,7 +67,7 @@ def _get_precedential_statuses(self): def _download_backwards(self, dt): self.end_date = dt + timedelta(days=6) - self.resource_org_end_date = date(2007, 07, 31) + self.resource_org_end_date = date(2007, 7, 31) # We only get unpublished docs when we're in a period of time during which we have resource.org docs.
self.only_get_unpublished = (self.end_date < self.resource_org_end_date) self.parameters['FROMDATE'] = dt.strftime('%m-%d-%Y') diff --git a/juriscraper/opinions/united_states_backscrapers/federal_appellate/ca5.py b/juriscraper/opinions/united_states_backscrapers/federal_appellate/ca5.py index c9a7cbc73..9eedaa4c9 100644 --- a/juriscraper/opinions/united_states_backscrapers/federal_appellate/ca5.py +++ b/juriscraper/opinions/united_states_backscrapers/federal_appellate/ca5.py @@ -23,7 +23,7 @@ def __init__(self, *args, **kwargs): self.back_scrape_iterable = [i.date() for i in rrule( DAILY, interval=self.interval, # Every interval days - dtstart=date(1992, 05, 14), + dtstart=date(1992, 5, 14), until=date(2015, 1, 1), )] self.uses_selenium = True diff --git a/juriscraper/opinions/united_states_backscrapers/state/ind_2005.py b/juriscraper/opinions/united_states_backscrapers/state/ind_2005.py index d36569525..cb4e16710 100644 --- a/juriscraper/opinions/united_states_backscrapers/state/ind_2005.py +++ b/juriscraper/opinions/united_states_backscrapers/state/ind_2005.py @@ -41,7 +41,7 @@ def _get_case_dates(self): dates.append(date.fromtimestamp( time.mktime(time.strptime(date_string, '%m%d%y')))) else: - print date_string + print(date_string) return dates def _get_download_urls(self): diff --git a/juriscraper/oral_args/united_states/federal_appellate/ca6.py b/juriscraper/oral_args/united_states/federal_appellate/ca6.py index 666ae71cf..3c62489cb 100644 --- a/juriscraper/oral_args/united_states/federal_appellate/ca6.py +++ b/juriscraper/oral_args/united_states/federal_appellate/ca6.py @@ -10,7 +10,7 @@ import re from datetime import datetime -from urlparse import urlparse, urljoin, parse_qs +from six.moves.urllib.parse import urlparse, urljoin, parse_qs from juriscraper.OralArgumentSite import OralArgumentSite from juriscraper.lib.string_utils import convert_date_string diff --git a/juriscraper/pacer/__init__.py b/juriscraper/pacer/__init__.py index d2a6f8a7f..fcc76cf40 100644 --- a/juriscraper/pacer/__init__.py +++ b/juriscraper/pacer/__init__.py @@ -1,4 +1,4 @@ -from free_documents import FreeOpinionReport -from docket_report import DocketReport +from .free_documents import FreeOpinionReport +from .docket_report import DocketReport __all__ = ['FreeOpinionReport', 'DocketReport'] diff --git a/juriscraper/pacer/auth.py b/juriscraper/pacer/auth.py deleted file mode 100644 index 3e099bf95..000000000 --- a/juriscraper/pacer/auth.py +++ /dev/null @@ -1,55 +0,0 @@ -import re - -import requests - -from exceptions import BadLoginException -from juriscraper.pacer.free_documents import logger - - -def make_pacer_cookie_dict(name, value): - """Make a cookie dict with the provided name and value""" - return { - "version": 0, - 'name': name, - 'value': value, - 'port': None, - 'domain': '.uscourts.gov', - 'path': '/', - 'secure': True, - 'rest': { - 'hostonly': False, - 'httponly': False, - } - } - - -def make_login_url(court_id): - """Make a login URL for a given court id.""" - if court_id == 'psc': - return 'https://dcecf.psc.uscourts.gov/cgi-bin/login.pl' - else: - return 'https://ecf.%s.uscourts.gov/cgi-bin/login.pl' % court_id - - -def login(court_id, username, password): - """Log into a PACER jurisdiction. 
Return cookies for the user.""" - s = requests.session() - url = make_login_url(court_id) - logger.info("Logging into: %s at %s" % (court_id, url)) - r = s.post( - url, - headers={'User-Agent': 'Juriscraper'}, - verify=False, - timeout=60, - files={ - 'login': ('', username), - 'key': ('', password) - }, - ) - if 'Invalid ID or password' in r.text: - raise BadLoginException(r.text) - - # The cookie value is in the HTML. Extract it. - m = re.search('PacerSession=(\w+);', r.text) - if m is not None: - return make_pacer_cookie_dict('PacerSession', m.group(1)) diff --git a/juriscraper/pacer/docket_report.py b/juriscraper/pacer/docket_report.py index 1a388aca1..804fc52a9 100644 --- a/juriscraper/pacer/docket_report.py +++ b/juriscraper/pacer/docket_report.py @@ -1,14 +1,12 @@ -import requests from juriscraper.lib.log_tools import make_default_logger logger = make_default_logger() class DocketReport(object): - def __init__(self, court_id, cookie): + def __init__(self, court_id, pacer_session): self.court_id = court_id - self.session = requests.session() - self.session.cookies.set(**cookie) + self.session = pacer_session super(DocketReport, self).__init__() @property @@ -66,47 +64,44 @@ def query(self, pacer_case_id, date_range_type='Filed', date_start='', raise ValueError("Cannot show terminated parties if parties and " "counsel are not also requested.") - files_params = { - 'all_case_ids': ('', pacer_case_id), - 'sort1': ('', order_by), - 'date_range_type': ('', date_range_type), - 'output_format': ('', output_format), + query_params = { + 'all_case_ids': pacer_case_id, + 'sort1': order_by, + 'date_range_type': date_range_type, + 'output_format': output_format, # Any value works in this parameter, but it cannot be blank. # Normally this would have a value like '3:12-cv-3879', but that's # not even necessary. - 'case_num': ('', ' '), + 'case_num': ' ' # These fields seem to be unnecessary/unused. 
- # 'view_comb_doc_text': ('', ''), - # 'PreResetField': ('', ''), - # 'PreResetFields': ('', ''), + # 'view_comb_doc_text': '', + # 'PreResetField': '', + # 'PreResetFields': '', } if date_start: - files_params['date_from'] = ('', date_start.strftime('%m/%d/%Y')) + query_params['date_from'] = date_start.strftime('%m/%d/%Y') if date_end: - files_params['date_to'] = ('', date_end.strftime('%m/%d/%Y')) + query_params['date_to'] = date_end.strftime('%m/%d/%Y') if doc_num_start: - files_params['documents_numbered_from_'] = ('', str(int(doc_num_start))) + query_params['documents_numbered_from_'] = str(int(doc_num_start)) if doc_num_end: - files_params['documents_numbered_to_'] = ('', str(int(doc_num_end))) + query_params['documents_numbered_to_'] = str(int(doc_num_end)) if show_parties_and_counsel is True: - files_params['list_of_parties_and_counsel'] = ('', 'on') + query_params['list_of_parties_and_counsel'] = 'on' if show_terminated_parties is True: - files_params['terminated_parties'] = ('', 'on') + query_params['terminated_parties'] = 'on' if show_list_of_member_cases is True: - files_params['list_of_member_cases'] = ('', 'on') + query_params['list_of_member_cases'] = 'on' if include_pdf_headers is True: - files_params['pdf_header'] = ('', '1') + query_params['pdf_header'] = '1' if show_multiple_docs is True: - files_params['view_multi_docs'] = ('', 'on') + query_params['view_multi_docs'] = 'on' logger.info("Querying docket report for case ID '%s' with params %s" % - (pacer_case_id, files_params)) - return self.session.post( - self.url + '?1-L_1_0-1', - headers={'User-Agent': 'Juriscraper'}, - verify=False, - timeout=300, - files=files_params, - ) + (pacer_case_id, query_params)) + + return self.session.post(self.url + '?1-L_1_0-1', + data=query_params, + timeout=300) diff --git a/juriscraper/pacer/exceptions.py b/juriscraper/pacer/exceptions.py deleted file mode 100644 index 8f20fb851..000000000 --- a/juriscraper/pacer/exceptions.py +++ /dev/null @@ -1,5 +0,0 @@ -class BadLoginException(Exception): - """The document could not be formed""" - - def __init__(self, message): - Exception.__init__(self, message) diff --git a/juriscraper/pacer/free_documents.py b/juriscraper/pacer/free_documents.py index 1451f9bd9..6f961d029 100644 --- a/juriscraper/pacer/free_documents.py +++ b/juriscraper/pacer/free_documents.py @@ -1,6 +1,3 @@ -import re - -import requests from dateutil.rrule import rrule, DAILY from lxml.html import tostring @@ -24,10 +21,10 @@ class FreeOpinionReport(object): EXCLUDED_COURT_IDS = ['casb', 'ganb', 'innb', 'mieb', 'miwb', 'nmib', 'nvb', 'ohsb', 'tnwb', 'vib'] - def __init__(self, court_id, cookie): + def __init__(self, court_id, pacer_session): self.court_id = court_id - self.session = requests.session() - self.session.cookies.set(**cookie) + self.session = pacer_session + super(FreeOpinionReport, self).__init__() @property @@ -44,32 +41,31 @@ def query(self, start, end): "not provided by the court or is in disuse." % self.court_id) return [] + dates = [d.strftime('%m/%d/%Y') for d in rrule( DAILY, interval=1, dtstart=start, until=end)] responses = [] + for d in dates: # Iterate one day at a time. Any more and PACER chokes. 
logger.info("Querying written opinions report for '%s' between %s " "and %s" % (self.court_id, d, d)) - responses.append(self.session.post( - self.url + '?1-L_1_0-1', - headers={'User-Agent': 'Juriscraper'}, - verify=False, - timeout=300, - files={ - 'filed_from': ('', d), - 'filed_to': ('', d), - 'ShowFull': ('', '1'), - 'Key1': ('', 'cs_sort_case_numb'), - 'all_case_ids': ('', '0'), - } - )) + data = { + 'filed_from': d, + 'filed_to': d, + 'ShowFull': '1', + 'Key1': 'cs_sort_case_numb', + 'all_case_ids': '0' + } + response = self.session.post(self.url + '?1-L_1_0-1', data=data) + responses.append(response) + return responses @staticmethod def parse(responses): - """Using a list of responses, parse out useful information and return it as - a list of dicts. + """Using a list of responses, parse out useful information and return + it as a list of dicts. """ results = [] court_id = "Court not yet set." @@ -103,26 +99,25 @@ def download_pdf(self, pacer_case_id, pacer_document_number): Note that this doesn't support attachments yet. """ + timeout = (60, 300) url = make_doc1_url(self.court_id, pacer_document_number, True) data = { 'caseid': pacer_case_id, 'got_receipt': '1', } + logger.info("GETting PDF at URL: %s with params: %s" % (url, data)) - r = self.session.get( - url, - params=data, - headers={'User-Agent': 'Juriscraper'}, - verify=False, - timeout=300, - ) + r = self.session.get(url, params=data, timeout=timeout) + # The request above sometimes generates an HTML page with an iframe # containing the PDF, and other times returns the PDF. Our task is thus # to either get the src of the iframe and download the PDF or just # return the pdf. r.raise_for_status() if is_pdf(r): + logger.info('Got PDF binary data for case %s at: %s' % (url, data)) return r + text = clean_html(r.text) tree = get_html_parsed_text(text) tree.rewrite_links(fix_links_in_lxml_tree, @@ -135,12 +130,12 @@ def download_pdf(self, pacer_case_id, pacer_document_number): "directly in HTML. URL: %s, caseid: %s" % (url, pacer_case_id)) return None - r = self.session.get( - iframe_src, - headers={'User-Agent': 'Juriscraper'}, - verify=False, - timeout=300, - ) + + r = self.session.get(iframe_src, timeout=timeout) + if is_pdf(r): + msg = 'Got iframed PDF data for case %s at: %s' % (url, iframe_src) + logger.info(msg) + return r diff --git a/juriscraper/pacer/http.py b/juriscraper/pacer/http.py new file mode 100644 index 000000000..b2bb4046f --- /dev/null +++ b/juriscraper/pacer/http.py @@ -0,0 +1,177 @@ +""" +Functions for Authenticating with PACER +""" +import re +import requests + +from juriscraper.lib.log_tools import make_default_logger + +logger = make_default_logger() + + +class PacerSession(requests.Session): + """ + Extension of requests.Session to handle PACER oddities making it easier + for folks to just POST data to PACER endpoints/apis + """ + + def __init__(self, pacer_token=None, cookie_jar=None): + """ + Instantiate a new PACER HTTP Session with some Juriscraper defaults + :param pacer_token: a PACER_SESSION token value + """ + super(PacerSession, self).__init__() + self.headers['User-Agent'] = 'Juriscraper' + self.verify = False + + if pacer_token: + self.cookies.set('PacerSession', + pacer_token, + domain='.uscourts.gov', + path='/') + + if cookie_jar: + self.cookies = cookie_jar + + def post(self, url, data=None, json=None, **kwargs): + """ + Overrides requests.Session.post with PACER-specific fun. 
+ + Will automatically convert data dict into proper multi-part form data + and pass to the files parameter instead. + + Will set a timeout of 300 if not provided. + + All other uses or parameters will pass through untouched + :param url: url string to post to + :param data: post data + :param json: json object to post + :param kwargs: assorted keyword arguments + :return: requests.Response + """ + kwargs.setdefault('timeout', 300) + + if data: + pacer_data = self._prepare_multipart_form_data(data) + return super(PacerSession, self).post(url, files=pacer_data, **kwargs) + + return super(PacerSession, self).post(url, data=data, json=json, **kwargs) + + @staticmethod + def _prepare_multipart_form_data(data): + """ + Transforms a data dictionary into the multi-part form data that PACER + expects as the POST body + :param data: dict of data to transform + :return: dict with values wrapped into tuples like:(None, ) + """ + output = dict() + for key in data: + output[key] = (None, data[key]) + return output + + +def _make_login_url(court_id): + """Make a login URL for a given court id.""" + if court_id == 'psc': + # training account + return 'https://dcecf.psc.uscourts.gov/cgi-bin/login.pl' + return 'https://pacer.login.uscourts.gov/csologin/login.jsf?pscCourtId=%s' % court_id + + +def login(court_id, username, password): + """ + Log into a PACER jurisdiction via the main PACER portal which should set our global PacerSession + + :param court_id: id of the court to authenticate with + :param username: PACER username + :param password: PACER password + :return: new PacerSession configured with PacerSession token in cookie + """ + if court_id == 'psc': + return _login_training(court_id, username, password) + + url = _make_login_url(court_id) + logger.info("Logging into: %s at %s" % (court_id, url)) + + login_session = requests.Session() + login_session.headers['User-Agent'] = 'Juriscraper' + login_session.verify = False + + # initial GET to login page to get JSESSIONID + r = login_session.get(url, timeout=60) + if not r.status_code == 200: + msg = 'Could not navigate to PACER central login url: %s' % url + logger.error(msg) + raise PacerLoginException(msg) + + # with our JSESSIONID, try the login + login_data = { + 'login': 'login', + 'login:loginName': username, + 'login:password': password, + 'login:clientCode': '', + 'login:fbtnLogin': '', + 'javax.faces.ViewState': 'stateless' + } + r = login_session.post(url, timeout=60, data=login_data, allow_redirects=False) + + if r.status_code == 302: + # we should be redirected on success with cookies! + if not login_session.cookies.get('PacerSession', None, '.uscourts.gov', '/'): + logger.error('Failed to get a PacerSession token!') + raise PacerLoginException('Failed to get a PacerSession token!') + else: + msg = 'Unknown PACER login error: http status %s' % r.status_code + if 'Invalid ID or password' in r.text: + msg = 'Invalid PACER ID or password.' + logger.error(msg) + raise PacerLoginException(msg) + + logger.info('New PacerSession established.') + return PacerSession(cookie_jar=login_session.cookies) + + +def _login_training(court_id, username, password): + """ + Attempt to log into the PACER training site. 
+ :param court_id: training court_id + :param username: training username + :param password: training password + :return: + """ + url = _make_login_url(court_id) + logger.info('attempting PACER Training Site login') + r = requests.post( + url, + headers={'User-Agent': 'Juriscraper'}, + verify=False, + timeout=60, + data={ + 'login': username, + 'key': password + }, + ) + if 'Invalid ID or password' in r.text: + raise BadPacerCredentials(r.text) + + # The cookie value is in the HTML. Extract it. + m = re.search('PacerSession=(\w+);', r.text) + if m is not None: + return PacerSession(pacer_token=m.group(1)) + + raise PacerLoginException('could not create new training PacerSession') + + +class PacerLoginException(Exception): + """Raised when the system cannot authenticate with PACER""" + + def __init__(self, message): + Exception.__init__(self, message) + + +class BadPacerCredentials(Exception): + """Raised when the credentials failed to authenticate the client to PACER""" + + def __init__(self, message): + Exception.__init__(self, message) diff --git a/juriscraper/pacer/utils.py b/juriscraper/pacer/utils.py index 4fb0688c6..a98456a2d 100644 --- a/juriscraper/pacer/utils.py +++ b/juriscraper/pacer/utils.py @@ -1,4 +1,3 @@ -import certifi import re import requests import tldextract diff --git a/requirements-dev.txt b/requirements-dev.txt index af996cf7e..a20746bb1 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1 +1,4 @@ +mock +vcrpy twine +tox diff --git a/requirements.txt b/requirements.txt index 268c78f2a..27b6b665d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ -argparse==1.2.1 -cchardet +six +argparse +cchardet>=1.1.2 certifi chardet geonamescache==0.20 @@ -9,5 +10,3 @@ python-dateutil==2.5.0 requests==2.9.1 selenium==2.53.6 tldextract -wsgiref==0.1.2 -vcrpy diff --git a/setup.py b/setup.py index 4f661e2e2..511803b80 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,10 @@ def read(*parts): with codecs.open(os.path.join(HERE, *parts), "rb", "utf-8") as f: return f.read() +requirements = [ + str(r.req) for r in + parse_requirements('requirements.txt', session=False) +] setup( name="juriscraper", @@ -42,14 +46,15 @@ def read(*parts): "Programming Language :: Python :: 2", "Programming Language :: Python :: 2.6", "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", "Topic :: Software Development :: Libraries :: Python Modules", ], - install_requires=[ - str(r.req) for r in - parse_requirements('requirements.txt', session=False) - ], + install_requires=requirements, + tests_require=['mock', 'vcrpy'], include_package_data=True, test_suite='tests', ) diff --git a/test_pacer.py/__init__.py b/test_pacer.py/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/__init__.py b/tests/__init__.py index 8b1378917..aab4d157e 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1 +1,9 @@ +import os +import juriscraper + +JURISCRAPER_ROOT = os.path.realpath( + os.path.join( + os.path.realpath(juriscraper.__file__), + '..')) +TESTS_ROOT = os.path.realpath(os.path.join(JURISCRAPER_ROOT, '../tests')) \ No newline at end of file diff --git a/tests/test_everything.py b/tests/test_everything.py index 9eee43395..7657a0185 100755 --- a/tests/test_everything.py +++ b/tests/test_everything.py @@ -9,6 +9,7 @@ 
import unittest import vcr +from . import TESTS_ROOT from juriscraper.lib.importer import build_module_list from juriscraper.lib.date_utils import ( parse_dates, quarter, is_first_month_in_quarter, fix_future_year_typo @@ -20,7 +21,7 @@ from juriscraper.opinions.united_states.state import alaska, colo, mass, massappct, nh, pa from juriscraper.oral_args.united_states.federal_appellate import ca6 -vcr = vcr.VCR(cassette_library_dir='tests/fixtures/cassettes') +vcr = vcr.VCR(cassette_library_dir=os.path.join(TESTS_ROOT, 'fixtures/cassettes')) class SlownessException(Exception): @@ -43,7 +44,7 @@ def test_various_date_extractions(self): [datetime.datetime(1924, 9, 19)]), # Using 'Term' as an indicator. ('November Term 2004.', - [datetime.datetime(2004, 11, 01)]), + [datetime.datetime(2004, 11, 1)]), (u'April 26, 1961.[†]', [datetime.datetime(1961, 4, 26)]), ) @@ -60,7 +61,7 @@ def test_fix_future_year_typo(self): '12/01/2806': '12/01/2806', # Should not change '12/01/2886': '12/01/2886', # Should not change } - for before, after in expectations.iteritems(): + for before, after in expectations.items(): fixed_date = fix_future_year_typo(convert_date_string(before)) self.assertEqual(fixed_date, convert_date_string(after)) @@ -83,8 +84,8 @@ def test_scrape_all_example_files(self): module_strings = build_module_list('juriscraper') num_scrapers = len([s for s in module_strings if 'backscraper' not in s]) - print "Testing {count} scrapers against their example files:".format( - count=num_scrapers) + msg = "Testing {count} scrapers against their example files:" + print(msg.format(count=num_scrapers)) max_len_mod_string = max(len(mod) for mod in module_strings if 'backscraper' not in mod) + 2 num_example_files = 0 @@ -155,19 +156,16 @@ def test_scrape_all_example_files(self): else: msg = '' - print '(%s test(s) in %0.1f seconds%s)' % ( - num_tests, speed, msg - ) + print('(%s test(s) in %0.1f seconds%s)' % (num_tests, speed, msg)) print ("\n{num_scrapers} scrapers tested successfully against " "{num_example_files} example files, with {num_warnings} " "speed warnings.".format( - num_scrapers=num_scrapers, - num_example_files=num_example_files, - num_warnings=num_warnings, - )) + num_scrapers=num_scrapers, + num_example_files=num_example_files, + num_warnings=num_warnings,)) if num_warnings: - print ("\nAt least one speed warning was triggered during the " + print("\nAt least one speed warning was triggered during the " "tests. If this is due to a slow scraper you wrote, we " "suggest attempting to speed it up, as it will be slow " "both in production and while running tests. This is " @@ -175,8 +173,8 @@ def test_scrape_all_example_files(self): "future as performance requirements are tightened.") else: # Someday, this line of code will be run. That day is not today. - print "\nNo speed warnings detected. That's great, keep up the " \ - "good work!" + print("\nNo speed warnings detected. 
That's great, keep up the " \ + "good work!") class StringUtilTest(unittest.TestCase): @@ -387,7 +385,7 @@ def test_make_short_name(self): def test_quarter(self): answers = {1: 1, 2: 1, 3: 1, 4: 2, 5: 2, 6: 2, 7: 3, 8: 3, 9: 3, 10: 4, 11: 4, 12: 4} - for month, q in answers.iteritems(): + for month, q in answers.items(): self.assertEqual(quarter(month), q) def test_is_first_month_in_quarter(self): @@ -400,7 +398,7 @@ def test_is_first_month_in_quarter(self): 6: False, 7: True, } - for month, is_first in answers.iteritems(): + for month, is_first in answers.items(): self.assertEqual(is_first_month_in_quarter(month), is_first) def test_harmonize_and_clean_string_tests(self): @@ -586,9 +584,9 @@ def test_titlecase(self): u'iTunes Should Be Unmolested'], ['Reading between the lines of steve jobs’s ‘thoughts on music’', # Tests unicode - u'Reading Between the Lines of Steve Jobs’s ‘thoughts on Music’'], + u'Reading Between the Lines of Steve Jobs’s ‘Thoughts on Music’'], ['seriously, ‘repair permissions’ is voodoo', # Tests unicode - u'Seriously, ‘repair Permissions’ is Voodoo'], + u'Seriously, ‘Repair Permissions’ is Voodoo'], [ 'generalissimo francisco franco: still dead; kieren McCarthy: ' 'still a jackass', @@ -616,9 +614,10 @@ def test_titlecase(self): u'United States 07-693l And'], ['CARVER v. US', u'Carver v. US']] + for pair in test_pairs: - self.assertEqual(titlecase(force_unicode(pair[0])), - pair[1]) + unicode_string = force_unicode(pair[0]) + self.assertEqual(titlecase(unicode_string, DEBUG=False), pair[1]) def test_fixing_camel_case(self): """Can we correctly identify and fix camelCase?""" @@ -907,7 +906,7 @@ def test_colo_coloctapp(self): } scraper = colo.Site() - for raw_string, data in tests.iteritems(): + for raw_string, data in tests.items(): for field in ['docket', 'name']: attribute = '_extract_%s_from_text' % field result = getattr(scraper, attribute)(raw_string) diff --git a/tests/test_pacer.py b/tests/test_pacer.py index 0738011df..ce03b9d97 100644 --- a/tests/test_pacer.py +++ b/tests/test_pacer.py @@ -4,79 +4,161 @@ import unittest from datetime import timedelta, date +import mock import vcr from requests import ConnectionError from juriscraper.lib.html_utils import get_html_parsed_text from juriscraper.lib.string_utils import convert_date_string from juriscraper.pacer import DocketReport, FreeOpinionReport -from juriscraper.pacer.auth import login +from juriscraper.pacer.http import login, PacerSession, PacerLoginException from juriscraper.pacer.utils import ( get_courts_from_json, get_court_id_from_url, get_pacer_case_id_from_docket_url, get_pacer_document_number_from_doc1_url, reverse_goDLS_function, make_doc1_url ) +from . import JURISCRAPER_ROOT, TESTS_ROOT -vcr = vcr.VCR(cassette_library_dir='tests/fixtures/cassettes') +vcr = vcr.VCR(cassette_library_dir=os.path.join(TESTS_ROOT, 'fixtures/cassettes')) -def get_pacer_credentials_or_skip(): - try: - username = os.environ['PACER_USERNAME'] - password = os.environ['PACER_PASSWORD'] - except KeyError: - msg = ("Unable to run PACER tests. Please set PACER_USERNAME and " - "PACER_PASSWORD environment variables.") - raise unittest.SkipTest(msg) - else: - return username, password +PACER_USERNAME = os.environ.get('PACER_USERNAME', None) +PACER_PASSWORD = os.environ.get('PACER_PASSWORD', None) +PACER_SETTINGS_MSG = "Skipping test. Please set PACER_USERNAME and " \ + "PACER_PASSWORD environment variables to run this test." 
+SKIP_IF_NO_PACER_LOGIN = unittest.skipUnless( + (PACER_USERNAME and PACER_PASSWORD), + reason=PACER_SETTINGS_MSG) + + +class PacerSessionTest(unittest.TestCase): + """ + Test the PacerSession wrapper class + """ + + def setUp(self): + self.session = PacerSession() + + def test_data_transformation(self): + """ + Test our data transformation routine for building out PACER-compliant + multi-part form data + """ + data = {'case_id': 123, 'case_type': 'something'} + expected = {'case_id': (None, 123), 'case_type': (None, 'something')} + output = self.session._prepare_multipart_form_data(data) + self.assertEqual(output, expected) + + @mock.patch('juriscraper.pacer.http.requests.Session.post') + def test_ignores_non_data_posts(self, mock_post): + """ + Test that POSTs without a data parameter just pass through as normal. + + :param mock_post: mocked Session.post method + """ + data = {'name': ('filename', 'junk')} + + self.session.post('https://free.law', files=data) + + self.assertTrue(mock_post.called, + 'requests.Session.post should be called') + self.assertEqual(data, mock_post.call_args[1]['files'], + 'the data should not be changed if using a files call') + + @mock.patch('juriscraper.pacer.http.requests.Session.post') + def test_transforms_data_on_post(self, mock_post): + """ + Test that POSTs using the data parameter get transformed into PACER's + delightfully odd multi-part form data. + + :param mock_post: mocked Session.post method + """ + data = {'name': 'dave', 'age': 33} + expected = {'name': (None, 'dave'), 'age': (None, 33)} + + self.session.post('https://free.law', data=data) + + self.assertTrue(mock_post.called, + 'requests.Session.post should be called') + self.assertNotIn('data', mock_post.call_args[1], + 'we should intercept data arguments') + self.assertEqual(expected, mock_post.call_args[1]['files'], + 'we should transform and populate the files argument') + + @mock.patch('juriscraper.pacer.http.requests.Session.post') + def test_sets_default_timeout(self, mock_post): + self.session.post('https://free.law', data={}) + + self.assertTrue(mock_post.called, + 'requests.Session.post should be called') + self.assertIn('timeout', mock_post.call_args[1], + 'we should add a default timeout automatically') + self.assertEqual(300, mock_post.call_args[1]['timeout'], + 'default should be 300') class PacerAuthTest(unittest.TestCase): """Test the authentication methods""" - def setUp(self): - # Get the latest court info from our Heroku app.
- with open('juriscraper/pacer/courts.json') as j: - self.courts = get_courts_from_json(json.load(j)) - self.username, self.password = get_pacer_credentials_or_skip() + @SKIP_IF_NO_PACER_LOGIN + def test_logging_into_pacer(self): + court_id = 'ca1' + try: + pacer_session = login(court_id, PACER_USERNAME, PACER_PASSWORD) + self.assertIsNotNone(pacer_session) + self.assertIsNotNone(pacer_session.cookies.get( + 'PacerSession', None, domain='.uscourts.gov', path='/')) - @vcr.use_cassette() - def test_logging_in(self): - for court in self.courts: - court_id = get_court_id_from_url(court['court_link']) - login(court_id, self.username, self.password) + except PacerLoginException: + self.fail('Could not log into court %s' % court_id) + + def test_logging_into_test_site(self): + try: + pacer_session = login('psc', 'tr1234', 'Pass!234') + self.assertIsNotNone(pacer_session) + self.assertIsNotNone(pacer_session.cookies.get( + 'PacerSession', None, domain='.uscourts.gov', path='/')) + + except PacerLoginException: + self.fail('Could not log into PACER test site!') class PacerFreeOpinionsTest(unittest.TestCase): """A variety of tests relating to the Free Written Opinions report""" - def setUp(self): - self.username, self.password = get_pacer_credentials_or_skip() - # CAND chosen at random - self.cookie = login('cand', self.username, self.password) - with open('juriscraper/pacer/courts.json') as j: - self.courts = get_courts_from_json(json.load(j)) - self.reports = {} - for court in self.courts: + @classmethod + def setUpClass(cls): + pacer_session = PacerSession() + + if PACER_USERNAME and PACER_PASSWORD: + # CAND chosen at random + pacer_session = login('cand', PACER_USERNAME, PACER_PASSWORD) + + with open(os.path.join(JURISCRAPER_ROOT, 'pacer/courts.json')) as j: + cls.courts = get_courts_from_json(json.load(j)) + + with open(os.path.join(TESTS_ROOT, 'fixtures/valid_free_opinion_dates.json')) as j: + cls.valid_dates = json.load(j) + + cls.reports = {} + for court in cls.courts: court_id = get_court_id_from_url(court['court_link']) - self.reports[court_id] = FreeOpinionReport(court_id, - self.cookie) + cls.reports[court_id] = FreeOpinionReport(court_id, pacer_session) + @unittest.skip('disabling during refactor') @vcr.use_cassette(record_mode='new_episodes') def test_extract_written_documents_report(self): """Do all the written reports work?""" - with open('tests/fixtures/valid_free_opinion_dates.json') as j: - valid_dates = json.load(j) + for court in self.courts: if court['type'] == "U.S. Courts of Appeals": continue court_id = get_court_id_from_url(court['court_link']) - if court_id in valid_dates: + if court_id in self.valid_dates: results = [] report = self.reports[court_id] - some_date = convert_date_string(valid_dates[court_id]) + some_date = convert_date_string(self.valid_dates[court_id]) retry_count = 1 max_retries = 5 # We'll try five times total while not results and retry_count <= max_retries: @@ -90,13 +172,13 @@ def test_extract_written_documents_report(self): responses = report.query(some_date, some_date) except ConnectionError as e: if retry_count <= max_retries: - print ("%s. Trying again (%s of %s)" % - (e, retry_count, max_retries)) + print("%s. Trying again (%s of %s)" % + (e, retry_count, max_retries)) time.sleep(15) # Give the server a moment of rest. retry_count += 1 continue else: - print "%s: Repeated errors at this court." % e + print("%s: Repeated errors at this court." % e) raise e if not responses: break # Not a supported court. 
@@ -123,23 +205,75 @@ self.assertEqual(r.headers['Content-Type'], 'application/pdf') + @SKIP_IF_NO_PACER_LOGIN @vcr.use_cassette(record_mode='new_episodes') - def test_download_a_free_document(self): - """Can we download a free document?""" + def test_download_iframed_report(self): + """Can we download a PDF document returned in an iframe?""" report = self.reports['vib'] r = report.download_pdf('1507', '1921141093') self.assertEqual(r.headers['Content-Type'], 'application/pdf') + @SKIP_IF_NO_PACER_LOGIN + def test_download_direct_report(self): + """Can we download a PDF document returned directly?""" + report = self.reports['alnb'] + r = report.download_pdf('602431', '018129511556') + self.assertEqual(r.headers['Content-Type'], 'application/pdf') + + @SKIP_IF_NO_PACER_LOGIN + def test_query_can_get_multiple_results(self): + """ + Can we run a query that gets multiple rows and parse them all? + """ + court_id = 'paeb' + report = self.reports[court_id] + some_date = convert_date_string(self.valid_dates[court_id]) + responses = report.query(some_date, some_date) + results = report.parse(responses) + self.assertEqual(3, len(results), 'should get 3 responses for paeb') + + @SKIP_IF_NO_PACER_LOGIN + def test_query_using_last_good_row(self): + """ + Can we run a query that triggers no content in the first cell? + """ + court_id = 'ksb' + report = self.reports[court_id] + some_date = convert_date_string(self.valid_dates[court_id]) + responses = report.query(some_date, some_date) + results = report.parse(responses) + self.assertEqual(2, len(results), 'should get 2 responses for ksb') + + def test_catch_excluded_court_ids(self): + """Do we properly catch and prevent a query against disused courts?""" + mock_session = mock.MagicMock() + + report = self.reports['ganb'] + report.session = mock_session + + some_date = convert_date_string('1/1/2015') + + results = report.query(some_date, some_date) + self.assertEqual([], results, 'should have empty result set') + self.assertFalse(mock_session.post.called, 'should not trigger a POST query') + + report = self.reports['cand'] + report.session = mock_session + report.query(some_date, some_date) + self.assertTrue(mock_session.post.called, 'good court should POST') + class PacerDocketReportTest(unittest.TestCase): """A variety of tests for the docket report""" - def setUp(self): - self.cookie = login('psc', 'tr1234', 'Pass!234') - self.report = DocketReport('psc', self.cookie) - self.pacer_case_id = '62866' + @classmethod + def setUpClass(cls): + pacer_session = login('psc', 'tr1234', 'Pass!234') + cls.report = DocketReport('psc', pacer_session) + cls.pacer_case_id = '62866' - def count_rows(self, html): + @staticmethod + def _count_rows(html): """Count the rows in the docket report. :param html: The HTML of the docket report. @@ -156,28 +290,28 @@ def test_queries(self): msg="Super basic query failed") r = self.report.query(self.pacer_case_id, date_start=date(2007, 2, 7)) - row_count = self.count_rows(r.text) + row_count = self._count_rows(r.text) self.assertEqual(row_count, 25, msg="Didn't get expected number of " "rows when filtering by start " "date. Got %s." % row_count) r = self.report.query(self.pacer_case_id, date_start=date(2007, 2, 7), date_end=date(2007, 2, 8)) - row_count = self.count_rows(r.text) + row_count = self._count_rows(r.text) self.assertEqual(row_count, 2, msg="Didn't get expected number of " "rows when filtering by start and " "end dates. Got %s."
% row_count) r = self.report.query(self.pacer_case_id, doc_num_start=5, doc_num_end=5) - row_count = self.count_rows(r.text) + row_count = self._count_rows(r.text) self.assertEqual(row_count, 1, msg="Didn't get expected number of rows " "when filtering by doc number. Got " "%s" % row_count) r = self.report.query(self.pacer_case_id, date_start=date(2007, 2, 7), date_end=date(2007, 2, 8), date_range_type="Entered") - row_count = self.count_rows(r.text) + row_count = self._count_rows(r.text) self.assertEqual(row_count, 2, msg="Didn't get expected number of rows " "when filtering by start and end " "dates and date_range_type of " diff --git a/tox.ini b/tox.ini new file mode 100644 index 000000000..f5821d2f3 --- /dev/null +++ b/tox.ini @@ -0,0 +1,6 @@ +[tox] +envlist=py27,py3 + +[testenv] +passenv=PACER_USERNAME PACER_PASSWORD +commands=python setup.py test
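For reviewers, a quick usage sketch of the refactored API introduced by this patch. It assumes the court and case IDs are placeholders, the psc credentials are the public training-site ones already used in tests/test_pacer.py, and it is not guaranteed that the training site actually serves the free opinions report:

from datetime import date

from juriscraper.pacer import FreeOpinionReport
from juriscraper.pacer.http import login, PacerLoginException

try:
    # login() now returns a PacerSession instead of a cookie dict.
    # 'psc' routes to the legacy training site; any other court_id
    # goes through the central PACER login service.
    session = login('psc', 'tr1234', 'Pass!234')
except PacerLoginException as e:
    raise SystemExit('PACER login failed: %s' % e)

# Reports now take the session directly rather than building their own.
report = FreeOpinionReport('psc', session)
responses = report.query(date(2017, 1, 1), date(2017, 1, 1))
results = report.parse(responses)

# Dict-style POST bodies sent through the session are converted to
# PACER's multi-part form encoding automatically, e.g. {'a': 'b'}
# becomes {'a': (None, 'b')} and is passed via the files= parameter.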