Skip to content

Commit

Permalink
[proofing] Add project stats
Browse files Browse the repository at this point in the history
These statistics let us quantify the work required for a project, which
helps us with pricing projects for proofreaders.

Test plan: unit tests
  • Loading branch information
akprasad authored Apr 7, 2023
1 parent 3712258 commit 73c5a17
Show file tree
Hide file tree
Showing 7 changed files with 171 additions and 4 deletions.
9 changes: 5 additions & 4 deletions ambuda/templates/macros/components.html
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,15 @@

{# A list of tabs. #}
{% macro tabs(urls, active) %}
<nav class="border-b mb-8">
<ul class="flex a-hover-underline">
<nav class="md:border-b mb-8">
<ul class="border-l pl-4 md:border-0 md:p-0 md:flex a-hover-underline">
{% for name, label, url in urls %}
{% if name == active %}
<li class="p-4 font-semibold border-b-2 border-black">{{ label }}</li>
<li class="md:p-4 font-semibold underline
md:no-underline md:border-b-2 md:border-black">{{ label }}</li>
{% else %}
<li class="font-semibold border-b-2 border-transparent text-slate-400">
<a class="block p-4" href="{{ url }}">{{ label }}</a>
<a class="block md:p-4" href="{{ url }}">{{ label }}</a>
</li>
{% endif %}
{% endfor %}
Expand Down
1 change: 1 addition & 0 deletions ambuda/templates/macros/proofing.html
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ <h1 class="mt-2 font-bold">{{ _('Contribute') }}</h1>
("download", _("Download"), url_for("proofing.project.download", slug=project.slug)),
] %}
{% if is_mod %}{% set routes = routes + [
("stats", _("Stats"), url_for("proofing.project.stats", slug=project.slug)),
("admin", _("Admin"), url_for("proofing.project.admin", slug=project.slug)),
] %}{% endif %}

Expand Down
40 changes: 40 additions & 0 deletions ambuda/templates/proofing/projects/stats.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
{% extends 'proofing/base-sidebar.html' %}
{% import "macros/proofing.html" as m %}


{% block title %}Stats: {{ project.title }} | Ambuda{% endblock %}


{% block sidebar %}{{ m.main_nav('projects', current_user=current_user) }}{% endblock %}


{% block content %}
{{ m.project_header_nested('Stats', project) }}
{{ m.project_tabs(project=project, active='stats', is_mod=current_user.is_moderator) }}

{# Per-page figures use floor division by the page count. Fall back to 1 so
   that a project with no pages renders instead of raising ZeroDivisionError. #}
{% set page_divisor = stats.num_pages or 1 %}

<div class="prose">
  <table>
    <tr><th>&nbsp;</th><th>Total</th><th>Per page</th></tr>
    <tr>
      <th>Pages</th>
      <td>{{ "{:,}".format(stats.num_pages) }}</td>
      <td>&ndash;</td>
    </tr>
    <tr>
      <th>Words</th>
      <td>{{ "{:,}".format(stats.num_words) }}</td>
      <td>{{ "{:,}".format(stats.num_words // page_divisor) }}</td>
    </tr>
    <tr>
      <th>Roman characters</th>
      <td>{{ "{:,}".format(stats.num_roman_characters) }}</td>
      <td>{{ "{:,}".format(stats.num_roman_characters // page_divisor) }}</td>
    </tr>
    <tr>
      <th>Aksharas</th>
      <td>{{ "{:,}".format(stats.num_aksharas) }}</td>
      <td>{{ "{:,}".format(stats.num_aksharas // page_divisor) }}</td>
    </tr>
  </table>
</div>
{% endblock %}
19 changes: 19 additions & 0 deletions ambuda/views/proofing/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from ambuda.utils import project_utils, proofing_utils
from ambuda.utils.revisions import add_revision
from ambuda.views.proofing.decorators import moderator_required, p2_required
from ambuda.views.proofing.stats import calculate_stats

bp = Blueprint("project", __name__)
LOG = logging.getLogger(__name__)
Expand Down Expand Up @@ -275,6 +276,24 @@ def download_as_xml(slug):
return response


@bp.route("/<slug>/stats")
@moderator_required
def stats(slug):
    """Render the statistics page for the project identified by `slug`.

    Nothing displayed here is sensitive at the moment, but the page is kept
    moderator-only in case that changes later.

    Aborts with a 404 if no project matches `slug`.
    """
    project_ = q.project(slug)
    if project_ is None:
        abort(404)

    return render_template(
        "proofing/projects/stats.html",
        project=project_,
        stats=calculate_stats(project_),
    )


@bp.route("/<slug>/search")
@login_required
def search(slug):
Expand Down
81 changes: 81 additions & 0 deletions ambuda/views/proofing/stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
"""Utilities for calculating project statistics.
These stats are useful for estimating the size of a project, which helps us
with cost estimation for our proofers.
"""

import re
from collections.abc import Iterable
from dataclasses import dataclass

from indic_transliteration import detect, sanscript

from ambuda.database import Project

#: Matches a run of one or more whitespace characters.
RE_SPACE = re.compile(r"\s+", flags=re.MULTILINE)
#: Matches a single Sanskrit vowel as written in SLP1.
RE_VOWEL = re.compile(r"[aAiIuUfFxXeEoO]")

#: Names of the romanized schemes, i.e. the ones that are not written in
#: aksharas. The list is copied from `indic_transliteration`.
ROMAN_SCHEMES = {"hk", "iast", "itrans", "kolkata_v2", "slp1", "velthuis"}


@dataclass
class Stats:
    """Size measurements aggregated over one project.

    Attributes:
        num_pages: how many pages were counted.
        num_words: how many words were counted, where a "word" is any
            continuous span of characters that contains no whitespace.
        num_roman_characters: how many Roman characters were counted.
        num_aksharas: how many aksharas (syllables) were counted in
            Devanagari, Kannada, or some other Brahmic script.
    """

    num_pages: int
    num_words: int
    num_roman_characters: int
    num_aksharas: int


def _calculate_stats_for_strings(strings: Iterable[str]) -> Stats:
    """Compute aggregate statistics over an iterable of page texts.

    Each string counts as one page. Words are whitespace-delimited tokens, so
    an empty or whitespace-only page contributes zero words. A page whose
    detected scheme is in `ROMAN_SCHEMES` adds its non-whitespace characters
    to the Roman character count. Any other page is transliterated to SLP1
    and adds one akshara per vowel matched by `RE_VOWEL`.

    :param strings: the text of each page, one string per page.
    :return: the totals across all pages.
    """
    num_pages = 0
    num_words = 0
    num_roman_characters = 0
    num_aksharas = 0
    for page_text in strings:
        num_pages += 1
        # Count actual tokens rather than "whitespace runs + 1": the latter
        # reports a word for an empty page and an extra word for each run of
        # leading or trailing whitespace (e.g. a final newline).
        words = page_text.split()
        num_words += len(words)

        encoding = detect.detect(page_text)
        if encoding in ROMAN_SCHEMES:
            # Equal to the page length minus its whitespace characters.
            num_roman_characters += sum(len(word) for word in words)
        else:
            slp1_text = sanscript.transliterate(page_text, encoding, "slp1")
            num_aksharas += len(RE_VOWEL.findall(slp1_text))

    return Stats(
        num_pages=num_pages,
        num_words=num_words,
        num_roman_characters=num_roman_characters,
        num_aksharas=num_aksharas,
    )


def _iter_page_strings(project: Project) -> Iterable[str]:
    """Yield one string per page of `project`.

    The string is the content of the last entry in the page's `revisions`,
    or the empty string if the page has no revisions.
    """
    for page in project.pages:
        revisions = page.revisions
        if revisions:
            yield revisions[-1].content
        else:
            yield ""


def calculate_stats(project: Project) -> Stats:
    """Return the statistics computed over the page texts of `project`."""
    page_texts = _iter_page_strings(project)
    return _calculate_stats_for_strings(page_texts)
9 changes: 9 additions & 0 deletions test/ambuda/views/proofing/test_project.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,15 @@ def test_download_as_xml__bad_project(client):
assert resp.status_code == 404


def test_stats(moderator_client, rama_client):
    """The stats page returns 200 for `moderator_client` and 302 for `rama_client`."""
    url = "/proofing/test-project/stats"

    moderator_resp = moderator_client.get(url)
    assert moderator_resp.status_code == 200
    assert "Roman characters" in moderator_resp.text

    rama_resp = rama_client.get(url)
    assert rama_resp.status_code == 302


def test_search(rama_client):
resp = rama_client.get("/proofing/test-project/search")
assert "Search:" in resp.text
Expand Down
16 changes: 16 additions & 0 deletions test/ambuda/views/proofing/test_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from ambuda.views.proofing.stats import _calculate_stats_for_strings


def test_calculate_stats():
    """Check the totals for a small mix of Roman and Devanagari pages."""
    pages = [
        "Test",
        "अहम्",
        "astyuttarasyāṃ diśi devatātmā",
        "foo bar baz",
    ]

    result = _calculate_stats_for_strings(pages)

    assert result.num_pages == 4
    assert result.num_words == 8
    assert result.num_aksharas == 2
    # Roman pages only, whitespace excluded: 4 + (29 - 2) + (11 - 2) = 40
    assert result.num_roman_characters == 40

0 comments on commit 73c5a17

Please sign in to comment.