Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
sbma44 committed Dec 8, 2011
0 parents commit 5551778
Show file tree
Hide file tree
Showing 17 changed files with 14,237 additions and 0 deletions.
1,390 changes: 1,390 additions & 0 deletions 617.html

Large diffs are not rendered by default.

514 changes: 514 additions & 0 deletions 617_scores.html

Large diffs are not rendered by default.

1,336 changes: 1,336 additions & 0 deletions 618.html

Large diffs are not rendered by default.

493 changes: 493 additions & 0 deletions 618_scores.html

Large diffs are not rendered by default.

1,462 changes: 1,462 additions & 0 deletions 619.html

Large diffs are not rendered by default.

542 changes: 542 additions & 0 deletions 619_scores.html

Large diffs are not rendered by default.

1,462 changes: 1,462 additions & 0 deletions 620.html

Large diffs are not rendered by default.

542 changes: 542 additions & 0 deletions 620_scores.html

Large diffs are not rendered by default.

1,300 changes: 1,300 additions & 0 deletions 621.html

Large diffs are not rendered by default.

479 changes: 479 additions & 0 deletions 621_scores.html

Large diffs are not rendered by default.

1,497 changes: 1,497 additions & 0 deletions 732.html

Large diffs are not rendered by default.

555 changes: 555 additions & 0 deletions 732_scores.html

Large diffs are not rendered by default.

1,480 changes: 1,480 additions & 0 deletions 736.html

Large diffs are not rendered by default.

549 changes: 549 additions & 0 deletions 736_scores.html

Large diffs are not rendered by default.

400 changes: 400 additions & 0 deletions alex.py

Large diffs are not rendered by default.

111 changes: 111 additions & 0 deletions metacategories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import re

# Matches every character that is NOT a digit, an upper-case ASCII letter,
# an ampersand or whitespace; such characters are stripped from names.
normalizer = re.compile(r'[^\dA-Z&\s]')

# Matches a run of one or more whitespace characters.  (The quantifier must
# sit outside the character class: r'[\s+]' would match a single whitespace
# character or a literal '+', and would never collapse a run.)
whitespace = re.compile(r'\s+')


def normalize_category_name(s):
    """
    Return a canonical form of the category name ``s``.

    The name is stripped of surrounding whitespace and upper-cased, the HTML
    entity for an ampersand is decoded, every character other than digits,
    A-Z, '&' and whitespace is removed, and each run of whitespace is
    collapsed to a single space.

    >>> normalize_category_name('  13-letter   words ')
    '13LETTER WORDS'
    >>> normalize_category_name('Lakes &amp; Rivers')
    'LAKES & RIVERS'
    """
    upper = s.strip().upper()
    # The text is already upper-cased at this point, so an HTML-escaped
    # ampersand appears as '&AMP;'.  Decode it before the punctuation strip,
    # which would otherwise leave a stray '&AMP' behind.
    unescaped = upper.replace('&AMP;', '&')
    return whitespace.sub(' ', normalizer.sub('', unescaped))

# Maps each metacategory name to the list of category names grouped under it.
# The entries look like output of normalize_category_name() (upper-case,
# punctuation removed, e.g. '13LETTER WORDS') -- confirm against the code
# that performs the lookup before adding new names in a different form.
METACATEGORIES = {
    'SCIENCE': [
        'PHYSICS',
        'THE SENSES',
        'BIRDS',
        'BIOLOGY',
        'GEMS',
        'ANIMALS',
        'THE BODY HUMAN',
        'CONSTELLATIONS',
        'PLANTS',
        'TECHNOLOGY',
        'MAMMALS',
    ],
    'BUSINESS': [
        'MUSIC BUSINESS',
        'BUSINESS & INDUSTRY',
        'CORPORATE AMERICA',
        'MONEY',
    ],
    'RELIGION': [
        'YE GODS',
        'THE BIBLE',
        'RELIGION',
    ],
    'ARTS': [
        'POETIC TERMS',
        'FICTIONAL CHARACTERS',
        '20TH CENTURY POETRY',
        'HISTORICAL DRAMAS',
        'SHAKESPEAREAN TRIVIA',
    ],
    'POP CULTURE': [
        'MR MOVIES',
        'STORYTELLERS',
        'THE HUSBAND MARRIED',
        'TOUGH TV TRIVIA',
        'THE OLYMPICS',
        'STARRY SONGS',
        'TV COMMERCIALS',
        'SPORTS STADIUMS',
        '60S BRITISH ROCK',
        'MOVIE AUTHORS',
        'SPORTS',
        'FASHION',
        'LYRICS',
        'ACTORS & ROLE',
        'TV CAPTAINS',
        'SILLY SONGS',
    ],
    'GEOGRAPHY': [
        'NEIGHBORHOODS',
        'WORLD CAPITALS',
        'WORLD GEOGRAPHY',
        'ISLANDS',
        'LAKES & RIVERS',
        'AMERICAN RIVERS',
        'US STATES',
        'EUROPE',
        'NEPAL',
        'ARKANSAS',
        'MEMPHIS',
    ],
    'HISTORY': [
        'APOLLO 11',
        'ANCIENT VIPS',
        'PRESIDENTS',
        'GREAT DAMES',
        'WORLD WAR II',
        'COLONIAL AMERICA',
        'ROYALTY',
        'REVOLUTIONARY WAR',
        'DYNASTIES',
        'NATIVE AMER WOMEN',
        'INDIANS',
        'FIRST LADIES',
        '1946',
        'DEMOCRATS',
    ],
    'WORDPLAY': [
        'STARS WITH C',
        'STARTS WITH P',
        'WORDS',
        'LETTER PERFECT',
        '13LETTER WORDS',
        'AC',
        'DC',
        'NAMES THE SAME',
        'HOMOPHONIC PAIRS',
    ],
    'MISC': [
        'MEATS',
        'THE MAIN INGREDIENT',
        'NEWSPAPERS',
        'TRANSPORTATION',
        'SUMMER',
        'AUTOMOBILES',
        'LESSERKNOWN NAMES',
        'GOING IN STYLE',
        'AKA',
        'FAMOUS QUOTES',
        'HOLIDAYS',
    ],
}
125 changes: 125 additions & 0 deletions soupselect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
"""
soupselect.py

CSS selector support for BeautifulSoup.

soup = BeautifulSoup('<html>...')

select(soup, 'div')
    - returns a list of div elements

select(soup, 'div#main ul a')
    - returns a list of links inside a ul inside div#main
"""

import re

# A bare tag token: one or more lower-case ASCII letters or digits only.
tag_re = re.compile('^[a-z0-9]+$')

# An attribute-selector token such as  a[href]  a[href="x"]  a[href^="x"].
# Named groups: optional tag, attribute name, optional one-character
# operator, and the (possibly empty, optionally double-quoted) value.
attribselect_re = re.compile(
    r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)'
    r'=?"?(?P<value>[^\]"]*)"?\]$'
)

# /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
#   \---/  \---/\-------------/    \-------/
#     |      |         |               |
#     |      |         |           The value
#     |      |    ~,|,^,$,* or =
#     |  Attribute
#    Tag

def attribute_checker(operator, attribute, value=''):
    """
    Build a predicate for one CSS attribute selector.

    ``operator`` is the selector operator ('=', '~', '^', '$', '*' or '|'),
    ``attribute`` the attribute name and ``value`` the text to compare with.
    The returned function takes an element and returns True when the element
    satisfies the selector.  Any other operator (including the empty string)
    yields a predicate that only checks that the attribute is present.
    """
    def attr_text(el):
        # Missing attributes are treated as the empty string.
        return el.get(attribute, '')

    def equals(el):
        # Exact match; a missing attribute (None) never equals a string.
        return el.get(attribute) == value

    def has_token(el):
        # Value is one of the whitespace-separated tokens of the attribute.
        return value in attr_text(el).split()

    def has_prefix(el):
        return attr_text(el).startswith(value)

    def has_suffix(el):
        return attr_text(el).endswith(value)

    def has_substring(el):
        return value in attr_text(el)

    def equals_or_dash_prefix(el):
        # Attribute is exactly value, or starts with "value-".
        text = attr_text(el)
        return text == value or text.startswith('%s-' % value)

    def is_present(el):
        return el.has_key(attribute)

    checkers = {
        '=': equals,
        '~': has_token,
        '^': has_prefix,
        '$': has_suffix,
        '*': has_substring,
        '|': equals_or_dash_prefix,
    }
    return checkers.get(operator, is_present)


def select(soup, selector):
    """
    Return the list of elements under ``soup`` matched by ``selector``.

    ``soup`` should be a BeautifulSoup instance (anything offering ``find``
    and ``findAll``); ``selector`` is a CSS selector made of
    whitespace-separated tokens.  Each token narrows the result of the
    previous one by searching inside the elements matched so far.

    Supported token forms:

    * ``tag[attr]``, ``tag[attr=value]`` and the ``~= ^= $= *= |=`` variants
      (the tag is optional)
    * ``tag#id`` / ``#id``
    * ``tag.class`` / ``.class``
    * ``*``
    * a plain lower-case tag name

    An empty selector returns ``[soup]``.  A token that is not understood,
    or an ID that cannot be found, makes the whole call return ``[]``.
    """
    current_context = [soup]
    for token in selector.split():
        m = attribselect_re.match(token)
        if m:
            # Attribute selector
            tag, attribute, operator, value = m.groups()
            checker = attribute_checker(operator, attribute, value)
            found = []
            for context in current_context:
                # A missing tag means "any tag", which findAll spells True.
                found.extend(
                    el for el in context.findAll(tag or True) if checker(el)
                )
            current_context = found
            continue

        if '#' in token:
            # ID selector
            tag, element_id = token.split('#', 1)
            # Look in every element matched so far, not just the first one,
            # and stop at the first hit.  Iterating also avoids an IndexError
            # when an earlier token matched nothing.
            for context in current_context:
                el = context.find(tag or True, {'id': element_id})
                if el is not None:
                    break
            else:
                return []  # No match
            current_context = [el]
            continue

        if '.' in token:
            # Class selector
            tag, klass = token.split('.', 1)
            found = []
            for context in current_context:
                found.extend(
                    context.findAll(
                        tag or True,
                        {'class': lambda attr: attr and klass in attr.split()}
                    )
                )
            current_context = found
            continue

        if token == '*':
            # Star selector
            found = []
            for context in current_context:
                found.extend(context.findAll(True))
            current_context = found
            continue

        # Here we should just have a regular tag
        if not tag_re.match(token):
            return []
        found = []
        for context in current_context:
            found.extend(context.findAll(token))
        current_context = found
    return current_context

def monkeypatch(BeautifulSoupClass=None):
    """
    Attach ``select`` to a BeautifulSoup class as its ``findSelect`` method.

    When no class is given, the ``BeautifulSoup`` class from the
    ``BeautifulSoup`` module (its most common import location) is patched.
    """
    target = BeautifulSoupClass
    if not target:
        # Imported lazily so the module is only required when actually used.
        import BeautifulSoup as bs_module
        target = bs_module.BeautifulSoup
    setattr(target, 'findSelect', select)

def unmonkeypatch(BeautifulSoupClass=None):
    """
    Remove the ``findSelect`` attribute from a BeautifulSoup class.

    When no class is given, the ``BeautifulSoup`` class from the
    ``BeautifulSoup`` module is used.  Raises AttributeError if the class
    has no ``findSelect`` attribute of its own.
    """
    target = BeautifulSoupClass
    if not target:
        # Imported lazily so the module is only required when actually used.
        import BeautifulSoup as bs_module
        target = bs_module.BeautifulSoup
    del target.findSelect

0 comments on commit 5551778

Please sign in to comment.