first commit

sunlightlabs · Dec 8, 2011 · 5551778 · 5551778
commit 5551778
Show file tree

Hide file tree

Showing 17 changed files with 14,237 additions and 0 deletions.
diff --git a/617.html b/617.html
diff --git a/617_scores.html b/617_scores.html
diff --git a/618.html b/618.html
diff --git a/618_scores.html b/618_scores.html
diff --git a/619.html b/619.html
diff --git a/619_scores.html b/619_scores.html
diff --git a/620.html b/620.html
diff --git a/620_scores.html b/620_scores.html
diff --git a/621.html b/621.html
diff --git a/621_scores.html b/621_scores.html
diff --git a/732.html b/732.html
diff --git a/732_scores.html b/732_scores.html
diff --git a/736.html b/736.html
diff --git a/736_scores.html b/736_scores.html
diff --git a/alex.py b/alex.py
diff --git a/metacategories.py b/metacategories.py
@@ -0,0 +1,111 @@
+import re
+
+# Matches every character that is not a digit, an uppercase ASCII letter, an
+# ampersand or whitespace. It is applied after upper-casing, so lowercase
+# letters survive as their uppercase form instead of being dropped.
+normalizer = re.compile(r'[^\dA-Z&\s]')
+# Matches a run of one or more whitespace characters so the whole run can be
+# collapsed to a single space (stripping punctuation can leave such runs).
+whitespace = re.compile(r'\s+')
+
+def normalize_category_name(s):
+    """
+    Return the canonical form of the category name s.
+
+    Surrounding whitespace is removed, the text is upper-cased, the HTML
+    entity for an ampersand is decoded to '&', every character other than
+    digits, A-Z, '&' and whitespace is dropped, and each run of whitespace
+    is replaced by one space.
+    """
+    # The entity is matched in its upper-cased spelling because upper() has
+    # already been applied at that point.
+    cleaned = s.strip().upper().replace('&AMP;', '&')
+    return whitespace.sub(' ', normalizer.sub('', cleaned))
+
+# Maps each metacategory name to the list of category names grouped under it.
+# NOTE(review): the entries look like output of normalize_category_name
+# (uppercase, punctuation stripped) -- confirm against the code that does the
+# lookup before adding names in any other form.
+METACATEGORIES = {
+    'SCIENCE': [
+        'PHYSICS',
+        'THE SENSES',
+        'BIRDS',
+        'BIOLOGY',
+        'GEMS',
+        'ANIMALS',
+        'THE BODY HUMAN',
+        'CONSTELLATIONS',
+        'PLANTS',
+        'TECHNOLOGY',
+        'MAMMALS',
+    ],
+    'BUSINESS': [
+        'MUSIC BUSINESS',
+        'BUSINESS & INDUSTRY',
+        'CORPORATE AMERICA',
+        'MONEY',
+    ],
+    'RELIGION': [
+        'YE GODS',
+        'THE BIBLE',
+        'RELIGION',
+    ],
+    'ARTS': [
+        'POETIC TERMS',
+        'FICTIONAL CHARACTERS',
+        '20TH CENTURY POETRY',
+        'HISTORICAL DRAMAS',
+        'SHAKESPEAREAN TRIVIA',
+    ],
+    'POP CULTURE': [
+        'MR MOVIES',
+        'STORYTELLERS',
+        'THE HUSBAND MARRIED',
+        'TOUGH TV TRIVIA',
+        'THE OLYMPICS',
+        'STARRY SONGS',
+        'TV COMMERCIALS',
+        'SPORTS STADIUMS',
+        '60S BRITISH ROCK',
+        'MOVIE AUTHORS',
+        'SPORTS',
+        'FASHION',
+        'LYRICS',
+        'ACTORS & ROLE',
+        'TV CAPTAINS',
+        'SILLY SONGS',
+    ],
+    'GEOGRAPHY': [
+        'NEIGHBORHOODS',
+        'WORLD CAPITALS',
+        'WORLD GEOGRAPHY',
+        'ISLANDS',
+        'LAKES & RIVERS',
+        'AMERICAN RIVERS',
+        'US STATES',
+        'EUROPE',
+        'NEPAL',
+        'ARKANSAS',
+        'MEMPHIS',
+    ],
+    'HISTORY': [
+        'APOLLO 11',
+        'ANCIENT VIPS',
+        'PRESIDENTS',
+        'GREAT DAMES',
+        'WORLD WAR II',
+        'COLONIAL AMERICA',
+        'ROYALTY',
+        'REVOLUTIONARY WAR',
+        'DYNASTIES',
+        'NATIVE AMER WOMEN',
+        'INDIANS',
+        'FIRST LADIES',
+        '1946',
+        'DEMOCRATS',
+    ],
+    'WORDPLAY': [
+        'STARS WITH C',
+        'STARTS WITH P',
+        'WORDS',
+        'LETTER PERFECT',
+        '13LETTER WORDS',
+        'AC',
+        'DC',
+        'NAMES THE SAME',
+        'HOMOPHONIC PAIRS',
+    ],
+    'MISC': [
+        'MEATS',
+        'THE MAIN INGREDIENT',
+        'NEWSPAPERS',
+        'TRANSPORTATION',
+        'SUMMER',
+        'AUTOMOBILES',
+        'LESSERKNOWN NAMES',
+        'GOING IN STYLE',
+        'AKA',
+        'FAMOUS QUOTES',
+        'HOLIDAYS',
+    ],
+}
diff --git a/soupselect.py b/soupselect.py
@@ -0,0 +1,125 @@
+"""
+soupselect.py
+
+CSS selector support for BeautifulSoup.
+
+soup = BeautifulSoup('<html>...')
+select(soup, 'div')
+- returns a list of div elements
+
+select(soup, 'div#main ul a')
+- returns a list of links inside a ul inside div#main
+
+"""
+
+import re
+
+# Matches a token consisting only of lowercase ASCII letters and digits,
+# i.e. a bare tag name with no id, class or attribute part.
+tag_re = re.compile('^[a-z0-9]+$')
+
+# Matches an attribute selector token such as [href], a[rel=nofollow] or
+# a[href^="http"]. The named groups are: tag (optional), attribute,
+# operator (one of = ~ | ^ $ * or empty) and value (quotes excluded).
+attribselect_re = re.compile(
+    r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' + 
+    r'=?"?(?P<value>[^\]"]*)"?\]$'
+)
+
+# /^(\w+)?\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
+#   \---/   \---/\-------------/    \-------/
+#     |       |         |               |
+#     |       |         |           The value
+#     |       |    ~,|,^,$,* or =
+#     |    Attribute
+#  Tag (optional)
+
+def attribute_checker(operator, attribute, value=''):
+    """
+    Build the predicate for a single attribute selector.
+
+    operator picks the comparison ('=', '~', '^', '$', '*' or '|'); any
+    other operator, including the empty string, yields a predicate that only
+    tests whether the element carries the attribute at all. The returned
+    function takes one element and reports whether it matches.
+    """
+    def attr_text(el):
+        # A missing attribute is treated as the empty string
+        return el.get(attribute, '')
+
+    def equals(el):
+        return el.get(attribute) == value
+
+    def has_token(el):
+        # attribute includes value as one of a set of space separated tokens
+        return value in attr_text(el).split()
+
+    def has_prefix(el):
+        # attribute starts with value
+        return attr_text(el).startswith(value)
+
+    def has_suffix(el):
+        # attribute ends with value
+        return attr_text(el).endswith(value)
+
+    def contains(el):
+        # attribute contains value
+        return value in attr_text(el)
+
+    def equals_or_dash_prefixed(el):
+        # attribute is either exactly value or starts with value-
+        text = attr_text(el)
+        return text == value or text.startswith('%s-' % value)
+
+    def is_present(el):
+        return el.has_key(attribute)
+
+    predicates = {
+        '=': equals,
+        '~': has_token,
+        '^': has_prefix,
+        '$': has_suffix,
+        '*': contains,
+        '|': equals_or_dash_prefixed,
+    }
+    return predicates.get(operator, is_present)
+
+
+def select(soup, selector):
+    """
+    Return the list of elements under soup that match a CSS selector.
+
+    soup should be a BeautifulSoup instance (the code only relies on its
+    find and findAll methods); selector is a whitespace separated chain of
+    simple selectors, each one narrowing the previous matches to their
+    descendants. Supported tokens are tag, tag#id, tag.class and
+    tag[attribute<op>value] (the tag is optional in each), plus *.
+
+    An empty list is returned when nothing matches or when a token is not
+    one of the supported forms. An empty selector returns [soup].
+    """
+    current_context = [soup]
+    for token in selector.split():
+        m = attribselect_re.match(token)
+        if m:
+            # Attribute selector
+            tag, attribute, operator, value = m.groups()
+            checker = attribute_checker(operator, attribute, value)
+            found = []
+            for context in current_context:
+                # findAll(True) matches every tag when no tag name was given
+                found.extend(
+                    [el for el in context.findAll(tag or True) if checker(el)]
+                )
+            current_context = found
+            continue
+        if '#' in token:
+            # ID selector
+            tag, element_id = token.split('#', 1)
+            el = None
+            # Look inside every current context, not just the first one, and
+            # cope with there being no context left after an earlier token
+            # matched nothing (indexing [0] would raise IndexError there).
+            for context in current_context:
+                el = context.find(tag or True, {'id': element_id})
+                if el:
+                    break
+            if not el:
+                return [] # No match
+            current_context = [el]
+            continue
+        if '.' in token:
+            # Class selector
+            tag, klass = token.split('.', 1)
+            found = []
+            for context in current_context:
+                found.extend(
+                    context.findAll(tag or True,
+                        {'class': lambda attr: attr and klass in attr.split()}
+                    )
+                )
+            current_context = found
+            continue
+        if token == '*':
+            # Star selector
+            found = []
+            for context in current_context:
+                found.extend(context.findAll(True))
+            current_context = found
+            continue
+        # Here we should just have a regular tag
+        if not tag_re.match(token):
+            return []
+        found = []
+        for context in current_context:
+            found.extend(context.findAll(token))
+        current_context = found
+    return current_context
+
+def monkeypatch(BeautifulSoupClass=None):
+    """
+    Attach select to a soup class as its findSelect attribute.
+
+    When no class is given, the BeautifulSoup class from the BeautifulSoup
+    module (its most common import location) is patched.
+    """
+    if not BeautifulSoupClass:
+        # Imported lazily so the module is only needed when the default is used
+        from BeautifulSoup import BeautifulSoup as BeautifulSoupClass
+    setattr(BeautifulSoupClass, 'findSelect', select)
+
+def unmonkeypatch(BeautifulSoupClass=None):
+    """
+    Remove the findSelect attribute from a soup class.
+
+    When no class is given, the BeautifulSoup class from the BeautifulSoup
+    module is used. Raises AttributeError if the class has no findSelect
+    attribute of its own.
+    """
+    if not BeautifulSoupClass:
+        from BeautifulSoup import BeautifulSoup as BeautifulSoupClass
+    del BeautifulSoupClass.findSelect