From c06344a10a8521627bfef431173d2756fe5b0931 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hynek=20Kydl=C3=AD=C4=8Dek?= Date: Mon, 2 Sep 2024 14:16:29 +0200 Subject: [PATCH] improve formatting of readme + small nit in stats docs --- README.md | 14 +++++++------- src/datatrove/pipeline/stats/line_stats.py | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 498454a5..9d859a43 100644 --- a/README.md +++ b/README.md @@ -332,15 +332,15 @@ stats["cnn.com"].mean ``` Following stats are available: -- `contamination_stats.py`: `word_contamination_{words[0]}: Frequency of words contamination in the document. +- `contamination_stats.py`: `word_contamination_{words[0]}`: Frequency of words contamination in the document. - `doc_stats.py`: `length`: Length of the document, `white_space_ratio`: Ratio of whitespace characters, `non_alpha_digit_ratio`: Ratio of non-alphabetic and non-digit characters, `digit_ratio`: Ratio of digits, `uppercase_ratio`: Ratio of uppercase letters, `elipsis_ratio`: Ratio of elipsis characters, `punctuation_ratio`: Punctuation ratio -- `lang_stats.py`: `fasttext_{language}`: Language of the document using fastText -- `line_stats.py`: `n_lines`: Number of lines per doc, `avg_line_length`: Average length of line per doc, `long_line_ratio_words`: Ratio of lines with more than k chars, `short_line_ratio_chars`: Ratio of lines with more than k chars, `bullet_point_lines_ratio`: Ratio of bullet points, `line_duplicates`: Ratio of lines that are duplicates, `line_char_duplicates`: Ratio of chars in duplicated lines -- `paragraph_stats.py`: `n_paragraphs`: Number of paragraphs, `avg_paragraph_length`: Average paragraph length, `short_paragraph_ratio_{chars}`: Ratio of short paragraphs (<{chars} chars), `long_paragraph_ratio_{chars}`: Ratio of long paragraphs (>{chars} chars) -- `perplexity_stats.py`: `ccnet_perplexity_{model_dataset}_{language}`: Perplexity of the document using the CCNet model for {model} on 
{dataset} in {language} -- `sentence_stats.py`: `n_sentences`: Number of sentences, `avg_sentence_length`: Average sentence length, `short_sentence_ratio_{chars}`: Ratio of short sentences (<{chars} chars), `long_sentence_ratio_{chars}`: Ratio of long sentences (>{chars} chars) +- `lang_stats.py`: `fasttext_{language}`: Score of document being written in `language`. Score is computed using FastText model. +- `line_stats.py`: `n_lines`: Number of lines per doc, `avg_line_length`: Average length of line per doc, `long_line_ratio_chars_{chars}`: Ratio of lines with more than k chars, `short_line_ratio_chars_{chars}`: Ratio of lines with less than k chars, `bullet_point_lines_ratio`: Ratio of lines starting with a bullet point, `line_duplicates`: Ratio of lines that are duplicates, `line_char_duplicates`: Ratio of chars in duplicated lines to all chars. +- `paragraph_stats.py`: `n_paragraphs`: Number of paragraphs, `avg_paragraph_length`: Average paragraph length, `short_paragraph_ratio_{chars}`: Ratio of short paragraphs (`<{chars}` chars), `long_paragraph_ratio_{chars}`: Ratio of long paragraphs (`>{chars}` chars) +- `perplexity_stats.py`: `ccnet_perplexity_{model_dataset}_{language}`: Perplexity of the document using the CCNet model for `{model}` on `{dataset}` in `{language}` +- `sentence_stats.py`: `n_sentences`: Number of sentences, `avg_sentence_length`: Average sentence length, `short_sentence_ratio_{chars}`: Ratio of short sentences (`<{chars}` chars), `long_sentence_ratio_{chars}`: Ratio of long sentences (`>{chars}` chars) - `token_stats.py`:`token_count`: Number of tokens in the document -- `word_stats.py`: `n_words`: Number of words in the document, `avg_word_length`: Average length of words in the document, `avg_words_per_line`: Average number of words per line in the document, `short_word_ratio_{chars}`: Ratio of words shorter than {chars} characters, `stop_word_ratio`: Ratio of stop words, `long_word_ratio_{chars}`: Ratio of words longer than {chars} 
characters, `type_token_ratio`: Number of unique words / Number of tokens, `capitalized_word_ratio`: Ratio of capitalized words, `uppercase_word_ratio`: Ratio of uppercase words +- `word_stats.py`: `n_words`: Number of words in the document, `avg_word_length`: Average length of words in the document, `avg_words_per_line`: Average number of words per line in the document, `short_word_ratio_{chars}`: Ratio of words shorter than `{chars}` characters, `stop_word_ratio`: Ratio of stop words, `long_word_ratio_{chars}`: Ratio of words longer than `{chars}` characters, `type_token_ratio`: Number of unique words / Number of tokens, `capitalized_word_ratio`: Ratio of capitalized words, `uppercase_word_ratio`: Ratio of uppercase words diff --git a/src/datatrove/pipeline/stats/line_stats.py b/src/datatrove/pipeline/stats/line_stats.py index 0b84b619..fb6a33c7 100644 --- a/src/datatrove/pipeline/stats/line_stats.py +++ b/src/datatrove/pipeline/stats/line_stats.py @@ -29,8 +29,8 @@ class LineStats(BaseStats): Available stats: n_lines: Number of lines per doc avg_line_length: Average length of line per doc - long_line_ratio_words: Ratio of lines with more than k chars - short_line_ratio_chars: Ratio of lines with more than k chars + long_line_ratio_chars_{chars}: Ratio of lines with more than k chars + short_line_ratio_chars_{chars}: Ratio of lines with less than k chars bullet_point_lines_ratio: Ratio of bullet points line_duplicates: Ratio of lines that are duplicates line_char_duplicates: Ratio of chars in duplicated lines