test: add integration tests

alphanome-ai · Dec 30, 2023 · 86d39d8 · 86d39d8
1 parent 1f40110
commit 86d39d8
Show file tree

Hide file tree

Showing 11 changed files with 183 additions and 5 deletions.
diff --git a/Taskfile.yml b/Taskfile.yml
@@ -17,7 +17,7 @@ tasks:
     cmds:
       # Recommended coverage viewer in VSCode: https://marketplace.visualstudio.com/items?itemName=ryanluker.vscode-coverage-gutters
       # Note: also update .codecov.yml when changing the target coverage.
-      - poetry run ptw -- -- {{.CLI_ARGS}} -rx --cov --cov-report=lcov:lcov.info --cov-report=term:skip-covered --cov-fail-under=90 tests/unit/
+      - poetry run ptw -- -- {{.CLI_ARGS}} -rx --cov --cov-report=lcov:lcov.info --cov-report=term:skip-covered --cov-fail-under=90 tests/unit/ tests/integration/
 
   monitor-accuracy-tests:
     desc: "Run unit tests and rerun them immediately upon file modification."
@@ -108,7 +108,11 @@ tasks:
     cmds:
       # Recommended coverage viewer in VSCode: https://marketplace.visualstudio.com/items?itemName=ryanluker.vscode-coverage-gutters
       # Note: also update .codecov.yml when changing the target coverage.
-      - poetry run pytest -rx --cov --cov-report=lcov:lcov.info --cov-report=term:skip-covered --cov-fail-under=90 {{.CLI_ARGS}} tests/unit/
+      - poetry run pytest -rx --cov --cov-report=lcov:lcov.info --cov-report=term:skip-covered --cov-fail-under=90 {{.CLI_ARGS}} tests/unit/ tests/integration/
+
+  integration-tests: # Execute integration tests.
+    # Runs only tests/integration/ and passes no --cov flags, so this command
+    # is not subject to a --cov-fail-under threshold.
+    cmds:
+      - poetry run pytest -rx {{.CLI_ARGS}} tests/integration/
 
   lint: # Perform linting on the code and automatically fix issues.
     cmds:

diff --git a/tests/accuracy/structure_and_text/test_structure_and_text.py b/tests/accuracy/structure_and_text/test_structure_and_text.py
@@ -111,6 +111,10 @@ def test_structure_and_text(
         if actual_element not in expected_elements_json
     ]
 
+    # STEP: Sanity check
+    if not elements_not_found_in_actual and not elements_not_expected_but_present:
+        assert actual_json == expected_elements_json
+
     # STEP: Report and save the results
     total_expected = len(expected_elements_json)
     total_missing = len(elements_not_found_in_actual)

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -32,6 +32,12 @@ def pytest_addoption(parser):
         default=False,
         help="Create missing files. Overwrite files that were previously generated automatically.",
     )
+    parser.addoption(
+        "--create-missing",
+        action="store_true",
+        default=False,
+        help="Create missing files.",
+    )
 
 
 @pytest.fixture(scope="session")

diff --git a/tests/unit/integration/__init__.py → tests/integration/__init__.py b/tests/unit/integration/__init__.py → tests/integration/__init__.py
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
@@ -0,0 +1,123 @@
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING, Callable
+
+import pytest
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from sec_parser.processing_engine.core import AbstractSemanticElementParser
+    from sec_parser.semantic_elements.abstract_semantic_element import (
+        AbstractSemanticElement,
+    )
+
+
+@pytest.fixture(scope="session")
+def check() -> (
+    Callable[[AbstractSemanticElementParser, Path, pytest.FixtureRequest], None]
+):
+    """Return a helper that snapshot-tests a parser against one HTML file.
+
+    The returned callable parses ``html_path`` with ``parser``, converts the
+    resulting elements to dicts and compares them with the JSON snapshot
+    stored next to the HTML file (same name, ``.json`` suffix).
+
+    Snapshot handling:
+    - ``--update``: the snapshot is always (re)written and the test is skipped.
+    - ``--create-missing``: the snapshot is written only when it does not
+      exist yet, and the test is skipped.
+    - otherwise a missing snapshot fails the test.
+
+    When a snapshot is available, the test fails with a report listing the
+    unexpected and/or missing elements, if there are any.
+    """
+
+    def _check(
+        parser: AbstractSemanticElementParser,
+        html_path: Path,
+        request: pytest.FixtureRequest,
+    ) -> None:
+        # Arrange
+        assert html_path.exists(), f"{html_path} does not exist"
+        with html_path.open("r") as file:
+            html_content = file.read()
+
+        # Act
+        elements = parser.parse(html_content)
+        actual_elements_dicts = _elements_to_dicts(elements)
+
+        # Pre-Assert: Load expected results or save actual results as expected
+        json_file = html_path.with_suffix(".json")
+        # Remember the state before writing so the skip message is accurate.
+        json_file_existed = json_file.exists()
+        if (
+            not json_file_existed and request.config.getoption("--create-missing")
+        ) or request.config.getoption("--update"):
+            # ensure_ascii=False emits raw non-ASCII characters, so the file
+            # encoding is pinned instead of relying on the platform default.
+            with json_file.open("w", encoding="utf-8") as f:
+                json.dump(
+                    actual_elements_dicts,
+                    f,
+                    indent=4,
+                    ensure_ascii=False,
+                    sort_keys=True,
+                )
+            if json_file_existed:
+                pytest.skip(f"{json_file} has been updated.")
+            pytest.skip(f"{json_file} was missing and has been created.")
+        elif not json_file_existed:
+            pytest.fail(f"{json_file} is missing. Use --create-missing to create it.")
+
+        # Assert
+        with json_file.open("r", encoding="utf-8") as f:
+            expected_elements_dicts = json.load(f)
+        missing, unexpected = _compare_elements(
+            expected_elements_dicts,
+            actual_elements_dicts,
+        )
+        error_messages = []
+        # Each section reports its own list; unexpected elements come first.
+        for label, offending in (("Unexpected", unexpected), ("Missing", missing)):
+            if not offending:
+                continue
+            e = json.dumps(
+                offending,
+                indent=4,
+                ensure_ascii=False,
+                sort_keys=True,
+            )
+            error_messages.append(
+                f"{label} elements in {html_path}:\n{e}",
+            )
+        if error_messages:
+            pytest.fail("\n\n".join(error_messages))
+
+    return _check
+
+
+def _elements_to_dicts(elements: list[AbstractSemanticElement]) -> list[dict]:
+    """Convert each element to a dict via its ``to_dict`` method.
+
+    Contents are included and previews are excluded; the order of
+    ``elements`` is preserved in the returned list.
+    """
+    return [
+        e.to_dict(
+            include_previews=False,
+            include_contents=True,
+        )
+        for e in elements
+    ]
+
+
+def _compare_elements(
+    expected_elements_dicts: list[dict],
+    actual_elements_dicts: list[dict],
+) -> tuple[list[dict], list[dict]]:
+    """Compare expected and actual element dicts.
+
+    Returns a ``(missing, unexpected)`` tuple:
+
+    - ``missing``: expected elements for which no equal element was found in
+      the actual list at or after the position following the previous match
+      (an ordered, forward-only scan).
+    - ``unexpected``: actual elements that are not equal to any expected
+      element (plain membership test; order and multiplicity are ignored).
+
+    Raises ``AssertionError`` if both lists are empty but the two inputs still
+    differ (for example, an element duplicated in the actual list).
+    """
+    # STEP: Compare the actual elements to the expected elements
+    index_of_last_matched_element = 0
+    elements_not_found_in_actual = []
+    for expected_element in expected_elements_dicts:
+        # Resume scanning right after the previous match so that matches must
+        # appear in the same relative order as the expected elements.
+        for index_in_actual in range(
+            index_of_last_matched_element,
+            len(actual_elements_dicts),
+        ):
+            if actual_elements_dicts[index_in_actual] == expected_element:
+                index_of_last_matched_element = index_in_actual + 1
+                break
+        else:
+            # The inner loop finished without a break: no match was found.
+            elements_not_found_in_actual.append(expected_element)
+    elements_not_expected_but_present = [
+        actual_element
+        for actual_element in actual_elements_dicts
+        if actual_element not in expected_elements_dicts
+    ]
+
+    # STEP: Sanity check
+    # The two checks above can both come back empty while the lists still
+    # differ, so require full equality in that case.
+    if not elements_not_found_in_actual and not elements_not_expected_but_present:
+        assert actual_elements_dicts == expected_elements_dicts
+
+    return elements_not_found_in_actual, elements_not_expected_but_present
diff --git a/...nit/integration/section_title/__init__.py → tests/integration/section_title/__init__.py b/...nit/integration/section_title/__init__.py → tests/integration/section_title/__init__.py
diff --git a/tests/integration/section_title/data/AA_0001193125-18-236766_01.html b/tests/integration/section_title/data/AA_0001193125-18-236766_01.html
@@ -0,0 +1,13 @@
+<P STYLE="margin-top:0pt; margin-bottom:0pt; font-size:10pt; font-family:Times New Roman">
+    <B>D. Restructuring and Other Charges</B> &#150; In the second quarter and
+    <FONT STYLE="white-space:nowrap">six-month</FONT> period of 2018, Alcoa Corporation
+    recorded Restructuring and other charges of $231 and $212, respectively, which were
+    comprised of the following components: $167 and $144 (net), respectively, related
+    to settlements and/or curtailments of certain pension and other postretirement employee
+    benefits (see Note K); $80 and $84, respectively, for additional costs related to the
+    curtailed Wenatchee (Washington) smelter, including $73 in both periods
+    associated with recent management decisions (see below); a $15 net benefit in both
+    periods related to the Portovesme (Italy) smelter (see &#147;Italy 148&#148; in the
+    Litigation section of Note N); and a $1 net benefit in both periods for
+    miscellaneous items.
+</P>
diff --git a/tests/integration/section_title/data/AA_0001193125-18-236766_01.json b/tests/integration/section_title/data/AA_0001193125-18-236766_01.json
@@ -0,0 +1,6 @@
+[
+    {
+        "cls_name": "TextElement",
+        "text_content": "D. Restructuring and Other Charges  In the second quarter and\n    six-month period of 2018, Alcoa Corporation\n    recorded Restructuring and other charges of $231 and $212, respectively, which were\n    comprised of the following components: $167 and $144 (net), respectively, related\n    to settlements and/or curtailments of certain pension and other postretirement employee\n    benefits (see Note K); $80 and $84, respectively, for additional costs related to the\n    curtailed Wenatchee (Washington) smelter, including $73 in both periods\n    associated with recent management decisions (see below); a $15 net benefit in both\n    periods related to the Portovesme (Italy) smelter (see Italy 148 in the\n    Litigation section of Note N); and a $1 net benefit in both periods for\n    miscellaneous items."
+    }
+]
diff --git a/tests/integration/section_title/test_section_title.py b/tests/integration/section_title/test_section_title.py
@@ -0,0 +1,25 @@
+import json
+from pathlib import Path
+from typing import Callable
+
+import pytest
+
+from sec_parser.processing_engine.core import (
+    AbstractSemanticElementParser,
+    Edgar10QParser,
+)
+
+CURRENT_DIR = Path(__file__).resolve().parent
+
+
+# One test case is generated per *.html file in the "data" directory that sits
+# next to this test module.
+@pytest.mark.parametrize("html_path", list((CURRENT_DIR / "data").glob("*.html")))
+def test_bold_titles(
+    html_path: Path,
+    check: Callable[[AbstractSemanticElementParser, Path, pytest.FixtureRequest], None],
+    request: pytest.FixtureRequest,
+):
+    """Parse ``html_path`` with ``Edgar10QParser`` and verify it via ``check``."""
+
+    def get_steps():
+        # NOTE(review): the `if True` filter keeps every default step, so this
+        # currently returns the default steps unchanged -- presumably a
+        # placeholder for selecting a subset of steps; confirm intent.
+        return [k for k in Edgar10QParser().get_default_steps() if True]
+
+    parser = Edgar10QParser(get_steps)
+    check(parser, html_path, request)
diff --git a/tests/unit/integration/section_title/data/AA_0001193125-18-236766_01.html b/tests/unit/integration/section_title/data/AA_0001193125-18-236766_01.html
diff --git a/tests/unit/integration/section_title/test_bold_titles.py b/tests/unit/integration/section_title/test_bold_titles.py