sytle: read op description from markdown

modelscope · Nov 16, 2023 · 29fee1e · 29fee1e
1 parent 8f29460
commit 29fee1e
Showing 1 changed file with 13 additions and 90 deletions.
diff --git a/demos/overview_scan/app.py b/demos/overview_scan/app.py
@@ -82,98 +82,21 @@
 `config_all.yaml` which includes **all** ops and default arguments. You just need to **remove** ops that you won't use and refine some arguments of ops.
 '''
 
-op_desc = '''
-The operators in Data-Juicer are categorized into 5 types.
-
-| Type                              | Number | Description                                     |
-|-----------------------------------|:------:|-------------------------------------------------|
-| Formatter         |   7    | Discovers, loads, and canonicalizes source data |
-| Mapper            |   21   | Edits and transforms samples                    |
-| Filter            |   20   | Filters out low-quality samples                 |
-| Deduplicator      |   4    | Detects and removes duplicate samples           |
-| Selector          |   2    | Selects top samples based on ranking            |
-'''
+with open(os.path.join(project_path, 'docs/Operators.md'), 'r') as f:
+    op_text = f.read()
+
+def extract_op_desp(markdown_text,  header):
+    start_index = markdown_text.find(header)
+    end_index = markdown_text.find("\n##", start_index + len(header)) 
+    return markdown_text[start_index+ len(header):end_index].strip()
 
+op_desc = extract_op_desp(op_text, '## Overview')
 op_list_desc = {
-    'formatter':
-    '''
-| Operator          | Domain  |  Lang  | Description                                                        |
-|-------------------|---------|--------|--------------------------------------------------------------------|
-| remote_formatter  | General | en, zh | Prepares datasets from remote (e.g., HuggingFace)                  |
-| csv_formatter     | General | en, zh | Prepares local `.csv` files                                        |
-| tsv_formatter     | General | en, zh | Prepares local `.tsv` files                                        |
-| json_formatter    | General | en, zh | Prepares local `.json`, `.jsonl`, `.jsonl.zst` files               |
-| parquet_formatter | General | en, zh | Prepares local `.parquet` files                                    |
-| text_formatter    | General | en, zh | Prepares other local text files ([complete list](../data_juicer/format/text_formatter.py#L63,73)) |
-| mixture_formatter | General | en, zh | Handles a mixture of all the supported local file types
-''',
-    'mapper':
-    '''
-| Operator                                      | Domain             | Lang   | Description                                                                                                    |
-|-----------------------------------------------|--------------------|--------|----------------------------------------------------------------------------------------------------------------|
-| chinese_convert_mapper                              | General            | zh     | Convert Chinese between Traditional Chinese, Simplified Chinese and Japanese Kanji (by [opencc](https://github.com/BYVoid/OpenCC))                 |
-| clean_copyright_mapper                              | Code               | en, zh | Removes copyright notice at the beginning of code files (:warning: must contain the word *copyright*)          |
-| clean_email_mapper                                  | General            | en, zh | Removes email information                                                                                      |
-| clean_html_mapper                                   | General            | en, zh | Removes HTML tags and returns plain text of all the nodes                                                      |
-| clean_ip_mapper                                     | General            | en, zh | Removes IP addresses                                                                                           |
-| clean_links_mapper                                  | General, Code      | en, zh | Removes links, such as those starting with http or ftp                                                         |
-| expand_macro_mapper                                 | LaTeX              | en, zh | Expands macros usually defined at the top of TeX documents                                                     |
-| fix_unicode_mapper                                  | General            | en, zh | Fixes broken Unicodes (by [ftfy](https://ftfy.readthedocs.io/))                                                |
-| nlpaug_en_mapper                                    | General            | en     | Simply augment texts in English based on the `nlpaug` library                                                  |
-| nlpcda_zh_mapper                                    | General            | zh     | Simply augment texts in Chinese based on the `nlpcda` library                                                  |
-| punctuation_normalization_mapper                    | General            | en, zh | Normalizes various Unicode punctuations to their ASCII equivalents                                             |
-| remove_bibliography_mapper                          | LaTeX              | en, zh | Removes the bibliography of TeX documents                                                                      |
-| remove_comments_mapper                              | LaTeX              | en, zh | Removes the comments of TeX documents                                                                          |
-| remove_header_mapper                                | LaTeX              | en, zh | Removes the running headers of TeX documents, e.g., titles, chapter or section numbers/names                   |
-| remove_long_words_mapper                            | General            | en, zh | Removes words with length outside the specified range                                                          |
-| remove_non_chinese_character_mapper                 | General            | en, zh | Remove non Chinese character in text samples. |
-| remove_specific_chars_mapper                        | General            | en, zh | Removes any user-specified characters or substrings                                                            |
-| remove_table_text_mapper                            | General, Financial | en     | Detects and removes possible table contents (:warning: relies on regular expression matching and thus fragile) |
-| remove_words_with_incorrect_<br />substrings_mapper | General            | en, zh | Removes words containing specified substrings                                                                  |
-| sentence_split_mapper                               | General            | en     | Splits and reorganizes sentences according to semantics                                                        |
-| whitespace_normalization_mapper                     | General            | en, zh | Normalizes various Unicode whitespaces to the normal ASCII space (U+0020)                                      |
-''',
-    'filter':
-    '''
-| Operator                       | Domain  | Lang   | Description                                                                                |
-|--------------------------------|---------|--------|--------------------------------------------------------------------------------------------|
-| alphanumeric_filter            | General | en, zh | Keeps samples with alphanumeric ratio within the specified range                           |
-| average_line_length_filter     | Code    | en, zh | Keeps samples with average line length within the specified range                          |
-| character_repetition_filter    | General | en, zh | Keeps samples with char-level n-gram repetition ratio within the specified range           |
-| clip_similarity_filter         | Multimodal |   -    |  Keeps samples with similarity between text and images within the specified range       |
-| flagged_words_filter           | General | en, zh | Keeps samples with flagged-word ratio below the specified threshold                        |
-| image_aspect_ratio_filter      | Image   |   -    | Keeps samples contains images with aspect ratios within specific range                     |
-| image_shape_filter             | Image   |   -    | Keeps samples contains images with widths and heights within specific ranges               |
-| image_size_filter      | Image   |   -    | Keeps samples contains images whose size in bytes are within specific range                     |
-| language_id_score_filter       | General | en, zh | Keeps samples of the specified language, judged by a predicted confidence score            |
-| maximum_line_length_filter     | Code    | en, zh | Keeps samples with maximum line length within the specified range                          |
-| perplexity_filter              | General | en, zh | Keeps samples with perplexity score below the specified threshold                          |
-| special_characters_filter      | General | en, zh | Keeps samples with special-char ratio within the specified range                           |
-| specified_field_filter         | General | en, zh | Filters samples based on field, with value lies in the specified targets                   |
-| specified_numeric_field_filter | General | en, zh | Filters samples based on field, with value lies in the specified range (for numeric types) |
-| stopwords_filter               | General | en, zh | Keeps samples with stopword ratio above the specified threshold                            |
-| suffix_filter                  | General | en, zh | Keeps samples with specified suffixes                                                      |
-| text_length_filter             | General | en, zh | Keeps samples with total text length within the specified range                            |
-| token_num_filter               | General | en, zh | Keeps samples with token count within the specified range                                  |
-| word_num_filter                | General | en, zh | Keeps samples with word count within the specified range                                   |
-| word_repetition_filter         | General | en, zh | Keeps samples with word-level n-gram repetition ratio within the specified range           |
-''',
-    'deduplicator':
-    '''
-| Operator                      | Domain  | Lang   | Description                                                 |
-|-------------------------------|---------|--------|-------------------------------------------------------------|
-| document_deduplicator         | General | en, zh | Deduplicate samples at document-level by comparing MD5 hash |
-| document_minhash_deduplicator | General | en, zh | Deduplicate samples at document-level using MinHashLSH      |
-| document_simhash_deduplicator | General | en, zh | Deduplicate samples at document-level using SimHash         |
-| image_deduplicator            | Image   |   -    | Deduplicate samples at document-level using exact matching of images between documents |
-''',
-    'selector':
-    '''
-| Operator                           | Domain  | Lang   | Description                                                           |
-|------------------------------------|---------|--------|-----------------------------------------------------------------------|
-| topk_specified_field_selector      | General | en, zh | Selects top samples by comparing the values of the specified field    |
-| frequency_specified_field_selector | General | en, zh | Selects top samples by comparing the frequency of the specified field |
-'''
+    'formatter': extract_op_desp(op_text, '## Formatter <a name="formatter"/>'),
+    'mapper':extract_op_desp(op_text, '## Mapper <a name="mapper"/>'),
+    'filter':extract_op_desp(op_text, '## Filter <a name="filter"/>'),
+    'deduplicator':extract_op_desp(op_text, '## Deduplicator <a name="deduplicator"/>'),
+    'selector':extract_op_desp(op_text, '## Selector <a name="selector"/>'),
 }
 
 demo_desc = '''