Customized stats command #113

Open · wants to merge 3 commits into base: develop
87 changes: 87 additions & 0 deletions src/corppa/poetry_detection/annotation/command_recipes.py
@@ -0,0 +1,87 @@
from collections import Counter, defaultdict
Collaborator:

Would be helpful to add a docstring either at the top or with the recipe explaining how you run this and showing some sample output.

Contributor Author:

Thanks, that's a good idea, although the styling itself is a bit outside of my understanding (I believe it's rendered by an external library).
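For illustration, a minimal sketch of the kind of module docstring being suggested; the invocation and sample wording below are assumptions, not part of this PR:

    """
    Custom Prodigy recipe reporting annotation progress for a poetry-detection dataset.

    Assuming this file is loaded via Prodigy's -F flag, the recipe could be run as:

        prodigy page-stats <dataset-id> -F command_recipes.py

    It prints an overall progress table (e.g. 100 pages with 1 annotation, 50 pages
    with 2 annotations, 200 annotations in total) and a per-session progress table.
    """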


from prodigy.components.db import connect
from prodigy.core import Arg, recipe
from prodigy.errors import RecipeError
from prodigy.util import SESSION_ID_ATTR, msg


@recipe(
    "page-stats",
    dataset=Arg(help="Prodigy dataset ID"),
)
def ppa_stats(dataset: str) -> None:
    # Load examples
    DB = connect()
    if dataset not in DB:
        raise RecipeError(f"Can't find dataset '{dataset}' in database")
    examples = DB.get_dataset_examples(dataset)
    n_examples = len(examples)
    msg.good(f"Loaded {n_examples} annotations from {dataset} dataset")

    # Get stats
    examples_by_page = Counter()
    examples_by_session = defaultdict(list)
    for ex in examples:
        # Skip examples without answer or (page) id
        if "answer" not in ex and "id" not in ex:
            # Ignore "unanswered" examples
            continue
        page_id = ex["id"]
        examples_by_page[page_id] += 1
        session_id = ex[SESSION_ID_ATTR]
        examples_by_session[session_id].append(page_id)
    # Get frequencies of page-level annotation counts
    count_freqs = Counter()
    total = 0
    for count in examples_by_page.values():
        count_freqs[count] += 1
        total += count
Comment on lines +34 to +39
Collaborator:

What you're doing here wasn't immediately obvious based on the variable names - am I understanding correctly that you're counting the number of examples/pages that have the same number of annotations, so you can report something like 100 examples have 1 annotation each, 50 have 2 annotations each, etc.?

You can let Counter do the aggregation for you by using Counter(examples_by_page.values()).

I think it would be more readable to tally this way:

Suggested change
-    # Get frequencies of page-level annotation counts
-    count_freqs = Counter()
-    total = 0
-    for count in examples_by_page.values():
-        count_freqs[count] += 1
-        total += count
+    # Get frequencies of page-level annotation counts
+    count_freqs = Counter(examples_by_page.values())
+    total = sum(examples_by_page.values())

Contributor Author:

These are the count frequencies, the frequency at which each page image has been annotated so far (like you described)

Good point on the Counter / sum usage. Not sure why I missed that.
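As a quick standalone illustration of the suggested Counter / sum usage (made-up data, not from this PR):

    from collections import Counter

    # hypothetical per-page annotation counts: three pages annotated once, two pages twice
    examples_by_page = Counter({"page-a": 1, "page-b": 1, "page-c": 1, "page-d": 2, "page-e": 2})

    count_freqs = Counter(examples_by_page.values())  # Counter({1: 3, 2: 2})
    total = sum(examples_by_page.values())  # 7 annotations in total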


    # Build overall table
    header = ["# Annotations"]
    row = ["# Pages"]
    for key, val in sorted(count_freqs.items()):
        header.append(f"{key}")
        row.append(val)
    header.append("Total")
    row.append(total)
    aligns = ["r", "r", "r", "r"]
    msg.table(
        [row],
        title="Overall Annotation Progress",
        header=header,
        aligns=aligns,
        divider=True,
    )

    # Build session table
    data = []
    total = 0
    for session, pages in sorted(examples_by_session.items()):
Collaborator:

Is this sorting so you display sessions in alpha order?

Contributor Author:

I guess so? I don't remember the reason for this; it might be residual code since the other commands sorted things.

        count = len(pages)
        unique = len(set(pages))
        total += count
Collaborator:

Is the total cumulative here? Maybe worth renaming the variable to clarify

Contributor Author:

Fair enough, this is also residual code in terms of naming conventions for total.

Contributor Author:

...but yes, it's the total number of annotations collected, as described here.

Contributor Author:

I can rename the variable to cumulative_total if it helps, but that seems too long for the reporting output itself.

Collaborator:

That makes sense. 👍 to renaming as cumulative_total
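A sketch of what the renamed loop might look like (illustrative only, not a committed change):

    # Build session table
    data = []
    cumulative_total = 0
    for session, pages in sorted(examples_by_session.items()):
        count = len(pages)
        unique = len(set(pages))
        cumulative_total += count
        data.append([session, count, unique, cumulative_total])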

        row = [session, count, unique, total]
        data.append(row)
    header = [
        "Session",
        "Count",
        "Unique",
        "Total",
    ]
    aligns = ["l", "r", "r", "r"]
    # info = {
    #     "Session": "Session name",
    #     "Count": "Completed annotations",
    #     "Unique": "Unique annotations (distinct pages)",
    #     "Total": "Total annotations collected",
    # }
    # msg.table(info, title="Legend")
Comment on lines +74 to +80
Collaborator:

leftover comments to be cleaned up?

Contributor Author:

No, it's a design choice. Uncommented, this prints out the legend for this table, but it's fairly verbose.

Collaborator:

ah, I see. Maybe add a comment about the comment, then, so someone else doesn't clean it up?

Contributor Author:

I mean, I'm happy to remove it, but this was more of a "do we want a legend" question that I had forgotten about.
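A sketch of the kind of explanatory note being suggested, so the commented-out block is not mistaken for leftover code (wording is illustrative):

    # NOTE: intentionally commented out; uncomment the block below to print a
    # legend for the session table (left off by default because it is verbose).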

    msg.table(
        data,
        title="Session Annotation Progress",
        header=header,
        aligns=aligns,
        divider=True,
    )
@@ -24,7 +24,7 @@
mock_prodigy_preprocess = MagicMock()
sys.modules["prodigy.components.preprocess"] = mock_prodigy_preprocess

-from corppa.poetry_detection.annotation.recipe import (
+from corppa.poetry_detection.annotation.annotation_recipes import (
ReviewStream,
add_image,
add_images,
@@ -57,7 +57,7 @@ def test_add_image():
assert example["image"] == f"prefix/{example['image_path']}"


@patch("corppa.poetry_detection.annotation.recipe.add_image")
@patch("corppa.poetry_detection.annotation.annotation_recipes.add_image")
def test_add_images(mock_add_image):
examples = [{"image_path": "a"}, {"image_path": "b"}]

@@ -74,7 +74,7 @@ def test_add_images(mock_add_image):
)


@patch("corppa.poetry_detection.annotation.recipe.add_image")
@patch("corppa.poetry_detection.annotation.annotation_recipes.add_image")
def test_remove_image_data(mock_add_image):
# Empty list (i.e. no examples)
assert remove_image_data([]) == []
@@ -113,7 +113,10 @@ def test_remove_image_data(mock_add_image):
assert mock_add_image.call_args == call(mixed_examples[-1], image_prefix="prefix")


@patch("corppa.poetry_detection.annotation.recipe.SESSION_ID_ATTR", "session_id")
@patch(
"corppa.poetry_detection.annotation.annotation_recipes.SESSION_ID_ATTR",
"session_id",
)
def test_get_session_name():
# Typical case: drop db prefix
example = {"session_id": "db-id-alice"}
@@ -136,8 +139,8 @@ def test_remove_label_prefix():
assert remove_label_prefix("no_prefix") == "no_prefix"


@patch("corppa.poetry_detection.annotation.recipe.get_session_name")
@patch("corppa.poetry_detection.annotation.recipe.add_label_prefix")
@patch("corppa.poetry_detection.annotation.annotation_recipes.get_session_name")
@patch("corppa.poetry_detection.annotation.annotation_recipes.add_label_prefix")
def test_add_session_prefix(mock_add_label_prefix, mock_get_session_name):
mock_get_session_name.return_value = "session"

@@ -157,7 +160,7 @@ def test_add_session_prefix(mock_add_label_prefix, mock_get_session_name):
mock_add_label_prefix.assert_has_calls([call("a", "session"), call("b", "session")])


@patch("corppa.poetry_detection.annotation.recipe.remove_label_prefix")
@patch("corppa.poetry_detection.annotation.annotation_recipes.remove_label_prefix")
def test_remove_session_prefix(mock_remove_label_prefix):
# example without spans
example = {"text": "some text..."}
@@ -172,7 +175,7 @@ def test_remove_session_prefix(mock_remove_label_prefix):
mock_remove_label_prefix.assert_has_calls([call("a"), call("b")])


@patch("corppa.poetry_detection.annotation.recipe.remove_label_prefix")
@patch("corppa.poetry_detection.annotation.annotation_recipes.remove_label_prefix")
def test_has_span_overlap(mock_remove_label_prefix):
# example without spans
ex_no_spans = {}
@@ -226,7 +229,7 @@ def test_has_span_overlap(mock_remove_label_prefix):
mock_remove_label_prefix.assert_not_called()


@patch("corppa.poetry_detection.annotation.recipe.has_span_overlap")
@patch("corppa.poetry_detection.annotation.annotation_recipes.has_span_overlap")
def test_validate_review_stream(mock_has_span_overlap):
mock_has_span_overlap.side_effect = [False, True, False]

@@ -263,8 +266,8 @@ def test_init(self, mock_get_data):
mock_get_data.assert_called_once()
assert mock_get_data.call_args == call(data, "pfx", "fetch_media")

@patch("corppa.poetry_detection.annotation.recipe.add_session_prefix")
@patch("corppa.poetry_detection.annotation.recipe.get_session_name")
@patch("corppa.poetry_detection.annotation.annotation_recipes.add_session_prefix")
@patch("corppa.poetry_detection.annotation.annotation_recipes.get_session_name")
def test_create_review_example(
self, mock_get_session_name, mock_add_session_prefix
):
@@ -329,7 +332,7 @@ def test_create_review_example(
}

@patch.object(ReviewStream, "create_review_example")
@patch("corppa.poetry_detection.annotation.recipe.add_image")
@patch("corppa.poetry_detection.annotation.annotation_recipes.add_image")
def test_get_data(self, mock_add_image, mock_create_review_example):
mock_prodigy_preprocess.fetch_media.reset_mock(
return_value=True, side_effect=True
@@ -371,8 +374,11 @@ def test_get_data(self, mock_add_image, mock_create_review_example):
mock_add_image.assert_has_calls([call("review", None) for _ in range(3)])


@patch("corppa.poetry_detection.annotation.recipe.INPUT_HASH_ATTR", "input_hash")
@patch("corppa.poetry_detection.annotation.recipe.ReviewStream")
@patch(
"corppa.poetry_detection.annotation.annotation_recipes.INPUT_HASH_ATTR",
"input_hash",
)
@patch("corppa.poetry_detection.annotation.annotation_recipes.ReviewStream")
def test_get_review_stream(mock_stream):
mock_prodigy.set_hashes.reset_mock(return_value=True, side_effect=True)
mock_prodigy.set_hashes.side_effect = lambda x, overwrite, input_keys, task_keys: x