Commit

refactor (#206)
* po

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint
dogweather authored Dec 31, 2023
1 parent 2b06762 commit 68289e1
Showing 16 changed files with 378 additions and 361 deletions.
435 changes: 216 additions & 219 deletions poetry.lock

Large diffs are not rendered by default.

15 changes: 0 additions & 15 deletions public_law/flipped.py

This file was deleted.

38 changes: 38 additions & 0 deletions public_law/html.py
@@ -0,0 +1,38 @@
from typing import Any
from toolz.functoolz import curry

from scrapy.selector.unified import Selector, SelectorList
from scrapy.http.response.xml import XmlResponse

from .exceptions import ParseException


def node_name(node: Selector) -> str | None:
    return node.xpath("name()").get()

def just_text(node: Selector | SelectorList | Any) -> str | None:
    return node.xpath("text()").get()

def xpath(selector: str, dom: XmlResponse) -> str:
    """
    Extracts the text content from the XML response using the given XPath selector.
    It does this by appending "/text()" to the selector and returning the first
    match. If no match is found, it raises a ParseException.

    Args:
        selector (str): The XPath selector to match the desired elements.
        dom (XmlResponse): The XML response object.

    Returns:
        str: The extracted text content.

    Raises:
        ParseException: If the specified XPath selector cannot be found in the XML response.
    """
    match dom.xpath(selector + "/text()").get():
        case str(value):
            return value
        case None:
            raise ParseException(f"Could not find {selector} in {dom.url}")

xpath = curry(xpath)  # type: ignore
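Because xpath() is curried, it can be partially applied with just a selector string and then composed into pipelines, which is how crs.py uses it below. A minimal usage sketch (the response construction is hypothetical test scaffolding, not part of the commit):

# Hypothetical usage sketch for the curried html.xpath().
from scrapy.http.response.xml import XmlResponse

from public_law import html

response = XmlResponse(
    url="https://example.com/title-01.xml",
    body=b"<TITLE><TITLE-TEXT>Criminal Code</TITLE-TEXT></TITLE>",
    encoding="utf-8",
)

get_title_text = html.xpath("//TITLE-TEXT")  # partial application
print(get_title_text(response))              # -> "Criminal Code"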
5 changes: 3 additions & 2 deletions public_law/models/glossary.py
@@ -1,7 +1,8 @@
-from dataclasses import dataclass
+import dataclasses
+from dataclasses import dataclass
 from functools import cache
-from typing import Any, Iterable, Callable, TypeAlias
+from typing import Any, Callable, Iterable, TypeAlias
 
 from scrapy.http.response.html import HtmlResponse
 
 from ..metadata import Metadata
17 changes: 12 additions & 5 deletions public_law/parsers/aus/dv_glossary.py
@@ -1,12 +1,16 @@
 from typing import Any, Iterable, cast
-from toolz.functoolz import pipe  # type: ignore
-from public_law.flipped import rstrip
 
 from scrapy.http.response.html import HtmlResponse
+from toolz.functoolz import pipe  # type: ignore
+
+from public_law import text
 
 from ...metadata import Metadata, Subject
 from ...models.glossary import GlossaryEntry, GlossaryParseResult
-from ...text import URL, LoCSubject, NonemptyString as String
-from ...text import Sentence, ensure_ends_with_period, make_soup, normalize_nonempty
+from ...text import URL, LoCSubject
+from ...text import NonemptyString as String
+from ...text import (Sentence, ensure_ends_with_period, make_soup,
+                     normalize_nonempty)
 
 
 def parse_glossary(html: HtmlResponse) -> GlossaryParseResult:
@@ -44,7 +48,10 @@ def __parse_entries(html: HtmlResponse) -> Iterable[GlossaryEntry]:
     """TODO: Refactor into a parent class"""
 
     for phrase, defn in __raw_entries(html):
-        fixed_phrase: String = cast(Sentence, pipe(phrase, rstrip(": "), String))  # type: ignore
+        fixed_phrase = text.pipe(
+            phrase
+            , text.rstrip(": ")  # type: ignore
+        )
 
         fixed_definition: Sentence = cast(Sentence, pipe(defn, ensure_ends_with_period, normalize_nonempty, Sentence))
 
38 changes: 17 additions & 21 deletions public_law/parsers/irl/courts_glossary.py
@@ -4,16 +4,13 @@
 from scrapy.http.response.html import HtmlResponse
 from toolz.functoolz import pipe  # type: ignore
 
-from ...flipped import lstrip, rstrip
+from public_law import text
 
 from ...metadata import Metadata, Subject
 from ...models.glossary import GlossaryEntry, GlossaryParseResult
-from ...text import URL, LoCSubject, NonemptyString as String, WikidataTopic
-from ...text import (
-    Sentence,
-    capitalize_first_char,
-    ensure_ends_with_period,
-    normalize_nonempty,
-)
+from ...text import URL, LoCSubject
+from ...text import NonemptyString as String
+from ...text import (Sentence, WikidataTopic)
 
 
 def parse_glossary(html: HtmlResponse) -> GlossaryParseResult:
@@ -52,23 +49,22 @@ def _parse_entries(html: HtmlResponse) -> Iterable[GlossaryEntry]:
     functions for cleaning up the definitions and phrases.
     """
 
-    def cleanup_definition(defn: str) -> Sentence:
+    def cleanup_definition(definition: str) -> Sentence:
         return pipe(
-            defn,
-            normalize_nonempty,
-            lstrip(":"),  # type: ignore
-            ensure_ends_with_period,
-            normalize_nonempty,
-            capitalize_first_char,
-            Sentence,
+            definition
+            , text.normalize_nonempty
+            , text.lstrip(":")  # type: ignore
+            , text.ensure_ends_with_period
+            , text.normalize_nonempty
+            , text.capitalize_first_char
+            , Sentence
         )
 
     def cleanup_phrase(phrase: str) -> String:
-        return pipe(
-            phrase,
-            rstrip(":"),  # type: ignore
-            normalize_nonempty,
-            String,
+        return text.pipe(
+            phrase
+            , text.rstrip(":")  # type: ignore
+            , text.normalize_nonempty
         )
 
     for phrase, defn in _raw_entries(html):
55 changes: 21 additions & 34 deletions public_law/parsers/usa/colorado/crs.py
@@ -1,66 +1,53 @@
-from scrapy.selector.unified import Selector
-from scrapy.http.response.xml import XmlResponse
+from typing import Optional, Protocol
 
-from typing import Any, Optional, cast, Protocol
-from toolz.functoolz import curry, flip, pipe  # type: ignore
+from scrapy.http.response.xml import XmlResponse
+from scrapy.selector.unified import Selector
 
+from public_law import html, seq, text
 from public_law.exceptions import ParseException
-from public_law.selector_util import xpath_get
-from public_law.text import NonemptyString, URL, titleize
-import public_law.text as text
 from public_law.items.crs import Article, Division, Title
 from public_law.parsers.usa.colorado.crs_articles import parse_articles
 from public_law.parsers.usa.colorado.crs_divisions import parse_divisions
 
-split = curry(flip(str.split))
-xpath_get = curry(xpath_get)
-
-def second(x: list[Any]) -> Any:
-    return x[1]
 
 class Logger(Protocol):
     """Defines a simple shape-based logger interface."""
     def warn(self, message: str) -> None: ...
 
 
 def parse_title_bang(dom: XmlResponse, logger: Logger) -> Title:
     match parse_title(dom, logger):
         case None:
-            raise Exception("Could not parse title")
+            raise ParseException("Could not parse title")
         case title:
             return title
 
 
 def parse_title(dom: XmlResponse, logger: Logger) -> Optional[Title]:
     try:
-        name = string_pipe(
-            "//TITLE-TEXT/text()",
-            xpath_get(dom),
-            titleize
+        name = text.pipe(
+            dom
+            , html.xpath("//TITLE-TEXT")  # type: ignore
+            , text.titleize
         )
-        number = string_pipe(
-            "//TITLE-NUM/text()",
-            xpath_get(dom),
-            text.split_on_space,
-            second
+        number = text.pipe(
+            dom
+            , html.xpath("//TITLE-NUM")  # type: ignore
+            , text.split(" ")  # type: ignore
+            , seq.get(1)  # type: ignore
        )
         children = _parse_divisions_or_articles(number, dom, logger)
-        url = source_url(number)
+        url = _source_url(number)
 
         return Title(name, number, children, url)
 
     except ParseException as e:
         logger.warn(f"Could not parse the title: {e}")
         return None
 
 
-def string_pipe(*args: Any) -> NonemptyString:
-    """A wrapper around pipe() that casts the result to a NonemptyString."""
-    args_with_string: Any = args + (NonemptyString,)
-
-    return cast(NonemptyString, pipe(*args_with_string))
-
-
-def _parse_divisions_or_articles(title_number: NonemptyString, dom: Selector | XmlResponse, logger: Logger) -> list[Division] | list[Article]:
+def _parse_divisions_or_articles(title_number: text.NonemptyString, dom: Selector | XmlResponse, logger: Logger) -> list[Division] | list[Article]:
     division_nodes = dom.xpath("//T-DIV")
     article_nodes = dom.xpath("//TA-LIST")
 
@@ -75,6 +62,6 @@ def _parse_divisions_or_articles(title_number: NonemptyString, dom: Selector | X
     return parse_fun(title_number, dom, logger)
 
 
-def source_url(title_number: NonemptyString) -> URL:
+def _source_url(title_number: text.NonemptyString) -> text.URL:
     url_number = title_number.rjust(2, "0")
-    return URL(f"https://leg.colorado.gov/sites/default/files/images/olls/crs2022-title-{url_number}.pdf")
+    return text.URL(f"https://leg.colorado.gov/sites/default/files/images/olls/crs2022-title-{url_number}.pdf")
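The shape of the refactor is clearest in this file: the ad-hoc string_pipe() wrapper, the second() helper, and the module-level curried aliases are replaced by curried, module-qualified functions (html.xpath, text.split, seq.get) threaded through text.pipe. The commit doesn't show text.pipe itself; here is a sketch of how such a wrapper could look, assuming it mirrors the old string_pipe() on top of toolz:

# Hypothetical sketch, not the actual public_law.text code: a data-first
# pipe() that validates its final result the way string_pipe() did.
from typing import Any

from toolz.functoolz import pipe as toolz_pipe

from public_law.text import NonemptyString


def pipe(*args: Any) -> NonemptyString:
    # args is (initial_value, fn1, fn2, ...); thread the value through
    # each function, then require a non-empty string at the end.
    return NonemptyString(toolz_pipe(*args))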
2 changes: 1 addition & 1 deletion public_law/parsers/usa/colorado/crs_articles.py
@@ -8,7 +8,7 @@
 from scrapy.selector.unified import Selector
 from scrapy.http.response.xml import XmlResponse
 
-from public_law.selector_util import node_name
+from public_law.html import node_name
 from public_law.items.crs import *
 from public_law.text import remove_trailing_period, normalize_whitespace, NonemptyString
 
2 changes: 1 addition & 1 deletion public_law/parsers/usa/colorado/crs_divisions.py
@@ -5,7 +5,7 @@
 from itertools import takewhile, dropwhile
 
 
-from public_law.selector_util import just_text
+from public_law.html import just_text
 from public_law.text import NonemptyString
 from public_law.items.crs import Division, Subdivision
 from public_law.parsers.usa.colorado.crs_articles import div_name_text, parse_articles_from_division
2 changes: 1 addition & 1 deletion public_law/parsers/usa/colorado/crs_sections.py
@@ -6,7 +6,7 @@
 from scrapy.http.response.xml import XmlResponse
 from scrapy.selector.unified import Selector
 
-from public_law.selector_util import just_text
+from public_law.html import just_text
 from public_law.items.crs import Section
 from public_law.text import remove_trailing_period, normalize_whitespace, NonemptyString
 
12 changes: 6 additions & 6 deletions public_law/parsers/usa/uscis_glossary.py
@@ -11,6 +11,7 @@
     capitalize_first_char,
     normalize_nonempty,
 )
+from public_law import text
 
 
 def parse_glossary(html: HtmlResponse) -> GlossaryParseResult:
@@ -64,12 +65,11 @@ def cleanup_definition(defn: str) -> Sentence:
     def cleanup_phrase(phrase: str) -> String:
         assert isinstance(phrase, str)
 
-        return pipe(
-            phrase,
-            normalize_nonempty,
-            String,
-        )  # type: ignore
+        return text.pipe(
+            phrase
+            , normalize_nonempty
+        )
 
     for phrase, defn in _raw_entries(html):
         assert isinstance(phrase, str)
         assert isinstance(defn, str)
22 changes: 0 additions & 22 deletions public_law/selector_util.py

This file was deleted.

12 changes: 12 additions & 0 deletions public_law/seq.py
@@ -0,0 +1,12 @@
"""Sequence (Iterables, Lists, etc.) functions."""


from typing import Any

from toolz.functoolz import curry


def get(index: int, x: list[Any]) -> Any:
    return x[index]

get = curry(get)
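As with html.xpath, currying get() lets callers bind just the index, which is how crs.py uses seq.get(1) to take the second word of a title number. A small usage sketch (the example data is made up):

from public_law import seq

second = seq.get(1)                 # partially applied: waits for the list
print(second(["TITLE", "42"]))      # -> "42"
print(seq.get(0)(["a", "b", "c"]))  # -> "a"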