forked from ambuda-org/ambuda
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
These statistics let us quantify the work required for a project, which helps us with pricing projects for proofreaders. Test plan: unit tests
- Loading branch information
Showing
7 changed files
with
171 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
{% extends 'proofing/base-sidebar.html' %}
{% import "macros/proofing.html" as m %}


{% block title %}Stats: {{ project.title }} | Ambuda{% endblock %}


{% block sidebar %}{{ m.main_nav('projects', current_user=current_user) }}{% endblock %}


{% block content %}
{{ m.project_header_nested('Stats', project) }}
{{ m.project_tabs(project=project, active='stats', is_mod=current_user.is_moderator) }}

{# Summary table of project size. Per-page figures use integer division; for a
   project with no pages they are shown as a dash, because dividing by a zero
   page count would make the template fail to render. #}
<div class="prose">
<table>
  <tr><th> </th><th>Total</th><th>Per page</th></tr>
  <tr>
    <th>Pages</th>
    <td>{{ "{:,}".format(stats.num_pages) }}</td>
    <td>–</td>
  </tr>
  <tr>
    <th>Words</th>
    <td>{{ "{:,}".format(stats.num_words) }}</td>
    <td>{{ "{:,}".format(stats.num_words // stats.num_pages) if stats.num_pages else "–" }}</td>
  </tr>
  <tr>
    <th>Roman characters</th>
    <td>{{ "{:,}".format(stats.num_roman_characters) }}</td>
    <td>{{ "{:,}".format(stats.num_roman_characters // stats.num_pages) if stats.num_pages else "–" }}</td>
  </tr>
  <tr>
    <th>Aksharas</th>
    <td>{{ "{:,}".format(stats.num_aksharas) }}</td>
    <td>{{ "{:,}".format(stats.num_aksharas // stats.num_pages) if stats.num_pages else "–" }}</td>
  </tr>
</table>
</div>
{% endblock %}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
"""Utilities for calculating project statistics. | ||
These stats are useful for estimating the size of a project, which helps us | ||
with cost estimation for our proofers. | ||
""" | ||
|
||
import re | ||
from collections.abc import Iterable | ||
from dataclasses import dataclass | ||
|
||
from indic_transliteration import detect, sanscript | ||
|
||
from ambuda.database import Project | ||
|
||
#: Pattern for a run of one or more whitespace characters.
RE_SPACE = re.compile(r"\s+", re.MULTILINE)
#: Pattern for a single Sanskrit vowel as written in SLP1.
RE_VOWEL = re.compile(r"[aAiIuUfFxXeEoO]")

#: Names of the romanization schemes, i.e. the scripts that don't use aksharas.
#: (The list is copied from `indic_transliteration`.)
ROMAN_SCHEMES = {"hk", "iast", "itrans", "kolkata_v2", "slp1", "velthuis"}
|
||
|
||
@dataclass
class Stats:
    """Aggregate size measurements for a single project."""

    #: How many pages were counted.
    num_pages: int
    #: How many words were counted. A "word" is any unbroken run of
    #: non-whitespace characters.
    num_words: int
    #: How many Roman characters were counted.
    num_roman_characters: int
    #: How many aksharas (syllables) were counted in text written in
    #: Devanagari, Kannada, or another Brahmic script.
    num_aksharas: int
|
||
|
||
def _calculate_stats_for_strings(strings: Iterable[str]) -> Stats:
    """Compute aggregate statistics over the text of a project's pages.

    Each item of `strings` is treated as the text of one page. For every page
    the script is detected once: pages detected as one of `ROMAN_SCHEMES`
    contribute their non-whitespace characters to the Roman character count,
    and all other pages are transliterated to SLP1 and contribute one akshara
    per SLP1 vowel.

    Args:
        strings: the page texts, one string per page. Blank pages are counted
            as pages but contribute no words, characters, or aksharas.

    Returns:
        The totals across all pages.
    """
    num_pages = 0
    num_words = 0
    num_roman_characters = 0
    num_aksharas = 0
    for page_text in strings:
        num_pages += 1

        # Count the actual tokens instead of "whitespace runs + 1". The latter
        # reports one word for an empty page and overcounts whenever a page
        # starts or ends with whitespace (e.g. a trailing newline).
        words = [word for word in RE_SPACE.split(page_text) if word]
        num_words += len(words)
        if not words:
            # Blank page: there is nothing to detect or transliterate.
            continue

        encoding = detect.detect(page_text)
        if encoding in ROMAN_SCHEMES:
            # Every non-whitespace character of a romanized page counts.
            num_roman_characters += sum(len(word) for word in words)
        else:
            # In SLP1 each vowel is a single character, so counting vowels
            # counts syllables.
            slp1_text = sanscript.transliterate(page_text, encoding, "slp1")
            num_aksharas += len(RE_VOWEL.findall(slp1_text))

    return Stats(
        num_pages=num_pages,
        num_words=num_words,
        num_roman_characters=num_roman_characters,
        num_aksharas=num_aksharas,
    )
|
||
|
||
def _iter_page_strings(project: Project) -> Iterable[str]:
    """Yield one string per page of `project`, in the order of `project.pages`.

    The string is the content of the last entry in the page's `revisions`, or
    an empty string when the page has no revisions.
    """
    for page in project.pages:
        if page.revisions:
            last_revision = page.revisions[-1]
            yield last_revision.content
        else:
            yield ""
|
||
|
||
def calculate_stats(project: Project) -> Stats:
    """Compute size statistics for `project` from the text of its pages."""
    page_strings = _iter_page_strings(project)
    return _calculate_stats_for_strings(page_strings)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
from ambuda.views.proofing.stats import _calculate_stats_for_strings | ||
|
||
|
||
def test_calculate_stats():
    """Check the totals for a mix of Roman and Devanagari pages."""
    pages = [
        "Test",
        "अहम्",
        "astyuttarasyāṃ diśi devatātmā",
        "foo bar baz",
    ]
    result = _calculate_stats_for_strings(pages)

    # Every input string counts as one page.
    assert result.num_pages == 4
    # 1 + 1 + 3 + 3 whitespace-separated words.
    assert result.num_words == 8
    # Only the Devanagari page contributes aksharas.
    assert result.num_aksharas == 2
    # Whitespace is not counted: 4 + (29 - 2) + (11 - 2) = 40
    assert result.num_roman_characters == 40