Py3 upgrade and Pacer Refactoring #171

Merged: 16 commits, Feb 2, 2017
2 changes: 2 additions & 0 deletions .gitignore
@@ -15,3 +15,5 @@ juriscraper.egg-info/
 # Private PACER stuff and test fixtures
 juriscraper/pacer/private_settings.py
 tests/fixtures/cassettes/
+
+.tox
2 changes: 2 additions & 0 deletions .travis.yml
@@ -2,6 +2,8 @@ sudo: false
 language: python
 python:
 - '2.7'
+- '3.5'
+- '3.6'
 script: python setup.py test
 install: pip install -U setuptools ; pip install .
 cache: pip
19 changes: 11 additions & 8 deletions juriscraper/AbstractSite.py
@@ -1,9 +1,8 @@
 import re
 import json
 import certifi
 import hashlib
 import requests

+import six

 from datetime import date, datetime
 from requests.adapters import HTTPAdapter
@@ -139,7 +138,7 @@ def _clean_attributes(self):
             if attr == 'download_urls':
                 sub_item = sub_item.strip()
             else:
-                if isinstance(sub_item, basestring):
+                if isinstance(sub_item, six.string_types):
                     sub_item = clean_string(sub_item)
                 elif isinstance(sub_item, datetime):
                     sub_item = sub_item.date()
@@ -178,7 +177,7 @@ def _check_sanity(self):
         for attr in self._all_attrs:
             if self.__getattribute__(attr) is not None:
                 lengths[attr] = len(self.__getattribute__(attr))
-        values = lengths.values()
+        values = list(lengths.values())
         if values.count(values[0]) != len(values):
             # Are all elements equal?
             raise InsanityException("%s: Scraped meta data fields have differing"
@@ -236,10 +235,10 @@ def _date_sort(self):
         obj_list_attrs = [self.__getattribute__(attr) for attr in
                           self._all_attrs if
                           isinstance(self.__getattribute__(attr), list)]
-        zipped = zip(*obj_list_attrs)
+        zipped = list(zip(*obj_list_attrs))
         zipped.sort(reverse=True)
         i = 0
-        obj_list_attrs = zip(*zipped)
+        obj_list_attrs = list(zip(*zipped))
         for attr in self._all_attrs:
             if isinstance(self.__getattribute__(attr), list):
                 self.__setattr__(attr, obj_list_attrs[i][:])
@@ -249,7 +248,7 @@ def _make_hash(self):
         """Make a unique ID. ETag and Last-Modified from courts cannot be
         trusted
         """
-        self.hash = hashlib.sha1(str(self.case_names)).hexdigest()
+        self.hash = hashlib.sha1(str(self.case_names).encode()).hexdigest()

     def _get_adapter_instance(self):
         """Hook for returning a custom HTTPAdapter
@@ -339,7 +338,7 @@ def _return_request_text_object(self):
         if 'json' in self.request['request'].headers.get('content-type', ''):
             return self.request['request'].json()
         else:
-            text = self._clean_text(self.request['request'].text)
+            payload = self.request['request'].content
+            if six.PY2:
+                payload = self.request['request'].text
+
+            text = self._clean_text(payload)
             html_tree = self._make_html_tree(text)
             html_tree.rewrite_links(fix_links_in_lxml_tree,
                                     base_href=self.request['url'])
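Most of the changes in this file trace back to two Python 3 behaviors: hashlib only accepts bytes, and zip()/dict.values() return one-shot iterators instead of lists. A minimal sketch of both, using made-up sample data rather than anything from the diff:

import hashlib

# 1. Hashing a str raises TypeError on Python 3; encode to bytes first.
#    case_names is hypothetical sample data.
case_names = ['Foo v. Bar', 'Baz v. Qux']
digest = hashlib.sha1(str(case_names).encode()).hexdigest()

# 2. zip() returns a lazy iterator on Python 3, with no .sort() and no len();
#    wrapping it in list() restores the Python 2 semantics that _date_sort()
#    and _check_sanity() rely on.
zipped = list(zip([3, 1, 2], ['c', 'a', 'b']))
zipped.sort(reverse=True)  # would be an AttributeError on a bare zip object

print(digest, zipped)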
2 changes: 1 addition & 1 deletion juriscraper/OpinionSite.py
@@ -1,4 +1,4 @@
-from AbstractSite import AbstractSite
+from juriscraper.AbstractSite import AbstractSite


 class OpinionSite(AbstractSite):
2 changes: 1 addition & 1 deletion juriscraper/OralArgumentSite.py
@@ -1,4 +1,4 @@
-from AbstractSite import AbstractSite
+from juriscraper.AbstractSite import AbstractSite


 class OralArgumentSite(AbstractSite):
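Both of these one-line changes fix the same incompatibility: from AbstractSite import AbstractSite is an implicit relative import, which PEP 328 removed in Python 3. A sketch of the distinction, assuming juriscraper is installed as a package:

# Python 2 only: resolved implicitly against the current package directory.
#     from AbstractSite import AbstractSite

# Python 2 and 3: absolute import, resolved against sys.path.
from juriscraper.AbstractSite import AbstractSite

On Python 2, adding from __future__ import absolute_import to a module forces the same behavior, which makes these breakages visible before a Py3 switch.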
9 changes: 5 additions & 4 deletions juriscraper/lib/date_utils.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import six
 from math import ceil

 from dateutil.parser import _timelex, parser, parserinfo
@@ -108,11 +109,11 @@ def parse_dates(s, debug=False, sane_start=datetime.datetime(1750, 1, 1),

     # Ditch unicode (_timelex() flips out on unicode if the system has
     # cStringIO installed -- the default)
-    if isinstance(s, unicode):
-        s = s.encode('ascii', 'ignore')
+    #if isinstance(s, six.text_type):
+    #    s = s.encode('ascii', 'ignore')
Review comment (Member): Why commented out?

Reply (Contributor, author): Trying to remember... but I believe it was doing nothing but causing problems, since Py3 works fine with unicode and Py2 does better. I couldn't figure out the point of forcing things to ASCII, as it didn't seem to impact the tests. Either I'm missing something or it's legacy stuff.

Reply (Member): OK. This code isn't used often, so we can leave this and uncomment it if needed.


     # Fix misspellings
-    for i, j in MISSPELLINGS.iteritems():
+    for i, j in six.iteritems(MISSPELLINGS):
Review comment (Member): Can't we just do .items() in Python 2.7 and in 3.x?

         s = s.replace(i, j)


@@ -127,7 +128,7 @@ def parse_dates(s, debug=False, sane_start=datetime.datetime(1750, 1, 1),
             hit_default_day_and_month = (d.month == DEFAULT.month and d.day == DEFAULT.day)
             if not any([hit_default_year, hit_default_day_and_month]):
                 if debug:
-                    print "Item %s parsed as: %s" % (item, d)
+                    print("Item %s parsed as: %s" % (item, d))
                 if sane_start < d < sane_end:
                     dates.append(d)
         except OverflowError:
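On the reviewer's .items() question: it is portable. Python 2 returns a list and Python 3 returns a view, and both iterate the same way; six.iteritems() only avoids materializing the Python 2 list, which hardly matters for a small table. A sketch with a stand-in dict (not the real MISSPELLINGS table):

MISSPELLINGS = {'Febuary': 'February', 'Wendsday': 'Wednesday'}  # hypothetical entries

s = 'Filed Febuary 1'
for wrong, right in MISSPELLINGS.items():  # list on Py2, view on Py3; both iterable
    s = s.replace(wrong, right)
print(s)  # Filed February 1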
12 changes: 8 additions & 4 deletions juriscraper/lib/html_utils.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # encoding: utf-8
-from urlparse import urlsplit
-from urlparse import urlunsplit
+from six import text_type
+from six.moves.urllib.parse import urlsplit, urlunsplit

 import re
 from lxml import html
@@ -78,7 +78,11 @@ def set_response_encoding(request):
         # HTTP headers. This way it is done before r.text is accessed
         # (which would do it with vanilla chardet). This is a big
         # performance boon, and can be removed once requests is upgraded
-        request.encoding = chardet.detect(request.content)['encoding']
+        if isinstance(request.content, text_type):
+            as_bytes = request.content.encode()
+            request.encoding = chardet.detect(as_bytes)['encoding']
+        else:
+            request.encoding = chardet.detect(request.content)['encoding']


 def clean_html(text):
@@ -100,7 +104,7 @@ def clean_html(text):
     # attribute, but we remove it in all cases, as there's no downside to
     # removing it. This moves our encoding detection to chardet, rather than
     # lxml.
-    if isinstance(text, unicode):
+    if isinstance(text, text_type):
         text = re.sub(r'^\s*<\?xml\s+.*?\?>', '', text)

     # Fix </br>
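The new branch in set_response_encoding() exists because chardet.detect() only accepts bytes, while request.content can apparently arrive as text in some code paths (replayed test fixtures, for example); encoding it first keeps chardet working on both interpreters. A small sketch of the constraint, with an illustrative byte string:

import chardet

payload = u'caf\xe9 filings'.encode('utf-8')  # bytes, as requests normally provides
print(chardet.detect(payload)['encoding'])    # a guess, e.g. 'utf-8'

# Handing chardet a str/unicode value instead of bytes errors out, hence the
# isinstance(request.content, text_type) guard in the diff.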
6 changes: 3 additions & 3 deletions juriscraper/lib/importer.py
@@ -34,9 +34,9 @@ def find_all_attr_or_punt(court_id):
             # juriscraper.opinions.united_states.federal_appellate.ca1,
             # therefore, we add it to our list!
             module_strings.append(court_id)
-        except ImportError, e:
+        except ImportError as e:
             # Something has gone wrong with the import
-            print "Import error: %s" % e
+            print("Import error: %s" % e)
             return []

     find_all_attr_or_punt(court_id)
@@ -51,5 +51,5 @@ def site_yielder(iterable, mod):
         try:
             site._download_backwards(i)
             yield site
-        except HTTPError, e:
+        except HTTPError as e:
             continue
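Both hunks are the same mechanical fix: the comma form of except was removed in Python 3, while except ... as ... works on Python 2.6+ and 3. A quick sketch:

# Python 2 only, and a SyntaxError on Python 3:
#     except ImportError, e:
try:
    import not_a_real_module  # hypothetical module name, to force the error
except ImportError as e:      # portable on Python 2.6+ and all of Python 3
    print("Import error: %s" % e)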
18 changes: 9 additions & 9 deletions juriscraper/lib/log_tools.py
@@ -24,28 +24,28 @@ def make_default_logger(file_path=LOG_FILENAME):
             maxBytes=5120000,
             backupCount=7
         )
-    except IOError, e:
+    except IOError as e:
         if e.errno == 2:
-            print "\nWarning: %s: %s. " \
+            print("\nWarning: %s: %s. " \
                   "Have you created the directory for the log?" % (
                       e.strerror,
                       file_path,
-                  )
+                  ))
         elif e.errno == 13:
-            print "\nWarning: %s: %s. " \
+            print("\nWarning: %s: %s. " \
                   "Cannot access file as user: %s" % (
                       e.strerror,
                       file_path,
                       getpass.getuser(),
-                  )
+                  ))
         else:
-            print "\nIOError [%s]: %s\n%s" % (
+            print("\nIOError [%s]: %s\n%s" % (
                 e.errno,
                 e.strerror,
                 traceback.format_exc()
-            )
-        print "Juriscraper will continue to run, and all logs will be " \
-              "sent to stdout."
+            ))
+        print("Juriscraper will continue to run, and all logs will be " \
+              "sent to stdout.")
         handler = logging.StreamHandler(sys.stdout)
         handler.setFormatter(
             logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')
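Every change in this file is the same conversion: print is a statement on Python 2 and a function on Python 3, and the diff leans on the fact that print(...) with a single parenthesized argument parses correctly under both. A sketch of the portable pattern, with hypothetical values standing in for the real errno and path:

from __future__ import print_function  # optional on Py3; gives Py2 the function form

import getpass

print("\nWarning: %s: %s. "
      "Cannot access file as user: %s" % ('Permission denied',
                                          '/var/log/juriscraper.log',
                                          getpass.getuser()))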