From 2514aabe6e74a7be0557c848da28c335f10e3236 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Thu, 14 Nov 2024 11:53:15 -0800
Subject: [PATCH 01/16] add packaging

Signed-off-by: Sarah Yurick
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index a69dd716..2037facf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -57,6 +57,7 @@ dependencies = [
   "nemo_toolkit[nlp]>=1.23.0",
   "numpy<2",
   "openai",
+  "packaging",
   "peft",
   "presidio-analyzer==2.2.351",
   "presidio-anonymizer==2.2.351",

From 7e147ed35437282f193c09866b4cdebdffbf5cf1 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Thu, 14 Nov 2024 11:55:56 -0800
Subject: [PATCH 02/16] move to requires

Signed-off-by: Sarah Yurick
---
 pyproject.toml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 2037facf..5578992f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,7 @@
 # limitations under the License.

 [build-system]
-requires = ["setuptools"]
+requires = ["setuptools", "packaging"]
 build-backend = "setuptools.build_meta"

 [project]
@@ -57,7 +57,6 @@ dependencies = [
   "nemo_toolkit[nlp]>=1.23.0",
   "numpy<2",
   "openai",
-  "packaging",
   "peft",
   "presidio-analyzer==2.2.351",
   "presidio-anonymizer==2.2.351",

From 320a1b4bb2648c0d97d1e5ce4cf586c4219f21dd Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Thu, 14 Nov 2024 11:57:50 -0800
Subject: [PATCH 03/16] move to github ci file

Signed-off-by: Sarah Yurick
---
 .github/workflows/test.yml | 2 +-
 pyproject.toml             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index baa968f4..af83e53c 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -39,7 +39,7 @@ jobs:
       # https://github.com/facebookresearch/fastText/issues/512#issuecomment-1837367666
       # Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94
       run: |
-        pip install wheel cython
+        pip install wheel cython packaging
         pip install --no-cache-dir .
         pip install pytest
     - name: Run tests
diff --git a/pyproject.toml b/pyproject.toml
index 5578992f..a69dd716 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,7 @@
 # limitations under the License.

 [build-system]
-requires = ["setuptools", "packaging"]
+requires = ["setuptools"]
 build-backend = "setuptools.build_meta"

 [project]

From d232be481b4d6b889a794002c248b07faeb0080d Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Thu, 14 Nov 2024 13:11:22 -0800
Subject: [PATCH 04/16] add pin

Signed-off-by: Sarah Yurick
---
 .github/workflows/test.yml | 2 +-
 pyproject.toml             | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index af83e53c..baa968f4 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -39,7 +39,7 @@ jobs:
       # https://github.com/facebookresearch/fastText/issues/512#issuecomment-1837367666
       # Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94
       run: |
-        pip install wheel cython packaging
+        pip install wheel cython
         pip install --no-cache-dir .
         pip install pytest
     - name: Run tests
diff --git a/pyproject.toml b/pyproject.toml
index a69dd716..f82fe1a6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -54,7 +54,8 @@ dependencies = [
   "lxml_html_clean",
   "mecab-python3",
   "mwparserfromhell==0.6.5",
-  "nemo_toolkit[nlp]>=1.23.0",
+  # TODO: Pin until dependencies from https://github.com/NVIDIA/NeMo?tab=readme-ov-file#install-nemo-framework are updated
+  "nemo_toolkit[nlp]<2.0.0",
   "numpy<2",
   "openai",
   "peft",

From 9089871f4d7c4dad3f6d966b9d022850ca490f6a Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Fri, 15 Nov 2024 11:25:56 -0800
Subject: [PATCH 05/16] add torch

Signed-off-by: Sarah Yurick
---
 .github/workflows/test.yml | 2 +-
 pyproject.toml             | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index baa968f4..16e2c026 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -39,7 +39,7 @@ jobs:
       # https://github.com/facebookresearch/fastText/issues/512#issuecomment-1837367666
       # Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94
       run: |
-        pip install wheel cython
+        pip install wheel cython packaging torch
         pip install --no-cache-dir .
         pip install pytest
     - name: Run tests
diff --git a/pyproject.toml b/pyproject.toml
index f82fe1a6..a69dd716 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -54,8 +54,7 @@ dependencies = [
   "lxml_html_clean",
   "mecab-python3",
   "mwparserfromhell==0.6.5",
-  # TODO: Pin until dependencies from https://github.com/NVIDIA/NeMo?tab=readme-ov-file#install-nemo-framework are updated
-  "nemo_toolkit[nlp]<2.0.0",
+  "nemo_toolkit[nlp]>=1.23.0",
   "numpy<2",
   "openai",
   "peft",

From 763128ba81f14cbbab388a2340431cd479a8c6cc Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Fri, 15 Nov 2024 11:31:10 -0800
Subject: [PATCH 06/16] add suggestion from mamba readme

Signed-off-by: Sarah Yurick
---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 16e2c026..2e0ec29f 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -39,7 +39,7 @@ jobs:
       # https://github.com/facebookresearch/fastText/issues/512#issuecomment-1837367666
       # Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94
       run: |
-        pip install wheel cython packaging torch
+        pip install wheel cython packaging torch --no-build-isolation
         pip install --no-cache-dir .
         pip install pytest
     - name: Run tests

From 238763649d540bdafcbda6e7b4c2a10541e2e1e1 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Fri, 15 Nov 2024 11:37:06 -0800
Subject: [PATCH 07/16] try github install

Signed-off-by: Sarah Yurick
---
 .github/workflows/test.yml | 2 +-
 pyproject.toml             | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 2e0ec29f..baa968f4 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -39,7 +39,7 @@ jobs:
       # https://github.com/facebookresearch/fastText/issues/512#issuecomment-1837367666
       # Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94
       run: |
-        pip install wheel cython packaging torch --no-build-isolation
+        pip install wheel cython
         pip install --no-cache-dir .
         pip install pytest
     - name: Run tests
diff --git a/pyproject.toml b/pyproject.toml
index a69dd716..95d68d02 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,6 +48,7 @@ dependencies = [
   "distributed>=2021.7.1",
   "fasttext==0.9.2",
   "ftfy==6.1.1",
+  "git+https://github.com/state-spaces/mamba.git"
   "in-place==0.5.0",
   "jieba==0.42.1",
   "justext==3.0.1",

From 81024d9edf4700344ef011dc61c0defcab32cfa3 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Fri, 15 Nov 2024 11:38:23 -0800
Subject: [PATCH 08/16] add comma

Signed-off-by: Sarah Yurick
---
 pyproject.toml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 95d68d02..0fe25b16 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,7 +48,8 @@ dependencies = [
   "distributed>=2021.7.1",
   "fasttext==0.9.2",
   "ftfy==6.1.1",
-  "git+https://github.com/state-spaces/mamba.git"
+  # TODO: Remove after 2.2.3 release
+  "git+https://github.com/state-spaces/mamba.git",
   "in-place==0.5.0",
   "jieba==0.42.1",
   "justext==3.0.1",

From c827d95874136d939e6c8fa9266bb64780bfa376 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Fri, 15 Nov 2024 11:43:16 -0800
Subject: [PATCH 09/16] another attempt

Signed-off-by: Sarah Yurick
---
 .github/workflows/test.yml | 3 ++-
 pyproject.toml             | 2 --
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index baa968f4..2fd13a93 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -39,7 +39,8 @@ jobs:
       # https://github.com/facebookresearch/fastText/issues/512#issuecomment-1837367666
       # Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94
       run: |
-        pip install wheel cython
+        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+        pip install wheel cython packaging
         pip install --no-cache-dir .
         pip install pytest
     - name: Run tests
diff --git a/pyproject.toml b/pyproject.toml
index 0fe25b16..a69dd716 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,8 +48,6 @@ dependencies = [
   "distributed>=2021.7.1",
   "fasttext==0.9.2",
   "ftfy==6.1.1",
-  # TODO: Remove after 2.2.3 release
-  "git+https://github.com/state-spaces/mamba.git",
   "in-place==0.5.0",
   "jieba==0.42.1",
   "justext==3.0.1",

From c1311d6f9a7c23a76a18489597249db4bcc6e237 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Fri, 15 Nov 2024 12:04:57 -0800
Subject: [PATCH 10/16] remove nemo toolkit

Signed-off-by: Sarah Yurick
---
 .github/workflows/test.yml | 3 +--
 pyproject.toml             | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 2fd13a93..baa968f4 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -39,8 +39,7 @@ jobs:
       # https://github.com/facebookresearch/fastText/issues/512#issuecomment-1837367666
       # Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94
       run: |
-        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-        pip install wheel cython packaging
+        pip install wheel cython
         pip install --no-cache-dir .
         pip install pytest
     - name: Run tests
diff --git a/pyproject.toml b/pyproject.toml
index a69dd716..f1efe670 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -54,7 +54,6 @@ dependencies = [
   "lxml_html_clean",
   "mecab-python3",
   "mwparserfromhell==0.6.5",
-  "nemo_toolkit[nlp]>=1.23.0",
   "numpy<2",
   "openai",
   "peft",

From ad10ab2717baf66cbafb33f63ccc02108b90618c Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Fri, 15 Nov 2024 12:11:09 -0800
Subject: [PATCH 11/16] add datasets

Signed-off-by: Sarah Yurick
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index f1efe670..89f5c61e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,6 +45,7 @@ dependencies = [
   "crossfit>=0.0.6",
   "dask-mpi>=2021.11.0",
   "dask[complete]>=2021.7.1",
+  "datasets",
   "distributed>=2021.7.1",
   "fasttext==0.9.2",
   "ftfy==6.1.1",

From 4e3fd8f6e31d9aaaf93ac2c1fd09def3f2d45217 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Fri, 15 Nov 2024 12:19:30 -0800
Subject: [PATCH 12/16] try removing cython

Signed-off-by: Sarah Yurick
---
 .github/workflows/test.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index baa968f4..1d8cc925 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -37,9 +37,8 @@ jobs:
       # Installing wheel beforehand due to fasttext issue:
       # https://github.com/facebookresearch/fastText/issues/512#issuecomment-1837367666
-      # Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94
       run: |
-        pip install wheel cython
+        pip install wheel
         pip install --no-cache-dir .
         pip install pytest
     - name: Run tests

From 87997e4d3b8564cdba91db9b0322d335446c2121 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Fri, 15 Nov 2024 12:25:32 -0800
Subject: [PATCH 13/16] remove cython

Signed-off-by: Sarah Yurick
---
 Dockerfile | 2 +-
 README.md  | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 16ddd54a..51fe7be4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -38,7 +38,7 @@ RUN conda create -y --name curator -c conda-forge -c nvidia \
     libcusparse \
     libcusolver && \
     source activate curator && \
-    pip install --upgrade cython pytest pip
+    pip install --upgrade pytest pip

 RUN \
     --mount=type=bind,source=/opt/NeMo-Curator/nemo_curator/__init__.py,target=/opt/NeMo-Curator/nemo_curator/__init__.py,from=curator-update \
diff --git a/README.md b/README.md
index 5cba9d10..4513c7af 100644
--- a/README.md
+++ b/README.md
@@ -83,14 +83,12 @@ You can get NeMo-Curator in 3 ways.
#### PyPi ```bash -pip install cython pip install --extra-index-url https://pypi.nvidia.com nemo-curator[all] ``` #### Source ```bash git clone https://github.com/NVIDIA/NeMo-Curator.git -pip install cython pip install --extra-index-url https://pypi.nvidia.com "./NeMo-Curator[all]" ``` From 45f2a447232c2e27f1a4f1ffef1a16b7ab780c7b Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 15 Nov 2024 13:13:36 -0800 Subject: [PATCH 14/16] sentencepiece Signed-off-by: Sarah Yurick --- docs/user-guide/image/gettingstarted.rst | 2 - nemo_curator/filters/code.py | 5 +- .../filters/sentencepiece_tokenizer.py | 267 ++++++++++++++++++ pyproject.toml | 1 + tutorials/image-curation/image-curation.ipynb | 2 +- ...ta Generation - Hello World Examples.ipynb | 4 +- 6 files changed, 275 insertions(+), 6 deletions(-) create mode 100644 nemo_curator/filters/sentencepiece_tokenizer.py diff --git a/docs/user-guide/image/gettingstarted.rst b/docs/user-guide/image/gettingstarted.rst index dae4240d..2ccacb25 100644 --- a/docs/user-guide/image/gettingstarted.rst +++ b/docs/user-guide/image/gettingstarted.rst @@ -33,7 +33,6 @@ NeMo Curator's PyPi page can be found `here = self.original_vocab_size: + tokens.append(self.id_to_special_token[id]) + else: + tokens.append(self.tokenizer.id_to_piece(id)) + return tokens + + def tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: + if isinstance(tokens, str): + tokens = [tokens] + ids = [] + for token in tokens: + ids.append(self.token_to_id(token)) + return ids + + def add_special_tokens(self, special_tokens): + if not self.legacy: + raise AttributeError("Special Token addition does not work when legacy is set to False.") + + if isinstance(special_tokens, list): + for token in special_tokens: + if ( + self.tokenizer.piece_to_id(token) == self.tokenizer.unk_id() + and token not in self.special_token_to_id + ): + self.special_token_to_id[token] = self.vocab_size + self.id_to_special_token[self.vocab_size] = token + self.vocab_size += 1 + elif isinstance(special_tokens, dict): + for token_name, token in special_tokens.items(): + setattr(self, token_name, token) + if ( + self.tokenizer.piece_to_id(token) == self.tokenizer.unk_id() + and token not in self.special_token_to_id + ): + self.special_token_to_id[token] = self.vocab_size + self.id_to_special_token[self.vocab_size] = token + self.vocab_size += 1 + + @property + def pad_id(self): + if self.legacy: + pad_id = self.tokens_to_ids([self.pad_token])[0] + else: + pad_id = self.tokenizer.pad_id() + return pad_id + + @property + def bos_token_id(self): + if self.legacy: + bos_id = self.tokens_to_ids([self.bos_token])[0] + else: + bos_id = self.tokenizer.bos_id() + return bos_id + + @property + def eos_token_id(self): + if self.legacy: + eos_id = self.tokens_to_ids([self.eos_token])[0] + else: + eos_id = self.tokenizer.eos_id() + return eos_id + + @property + def sep_id(self): + if self.legacy: + return self.tokens_to_ids([self.sep_token])[0] + else: + raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.") + + @property + def cls_id(self): + if self.legacy: + return self.tokens_to_ids([self.cls_token])[0] + else: + raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.") + + @property + def mask_id(self): + if self.legacy: + return self.tokens_to_ids([self.mask_token])[0] + else: + raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.") + + @property + def 
unk_id(self): + return self.tokenizer.unk_id() + + @property + def additional_special_tokens_ids(self): + """Returns a list of the additional special tokens (excluding bos, eos, pad, unk). Used to return sentinel tokens for e.g. T5.""" + special_tokens = set( + [self.bos_token, self.eos_token, self.pad_token, self.mask_token, self.cls_token, self.sep_token] + ) + return [v for k, v in self.special_token_to_id.items() if k not in special_tokens] + + @property + def vocab(self): + main_vocab = [self.tokenizer.id_to_piece(id) for id in range(self.tokenizer.get_piece_size())] + special_tokens = [ + self.id_to_special_token[self.original_vocab_size + i] + for i in range(self.vocab_size - self.original_vocab_size) + ] + return main_vocab + special_tokens + + ### Below are a few methods that mimic transformers.PreTrainedTokenizer for vLLM + + def convert_ids_to_tokens(self, ids, skip_special_tokens: bool = False): + return self.ids_to_tokens(ids) # TODO: support skip_special_tokens + + def convert_tokens_to_string(self, tokens: List[str]): + return self.tokens_to_text(tokens) + + def __len__(self): + return self.vocab_size + + @property + def is_fast(self): + return True + + def get_added_vocab(self): + return None diff --git a/pyproject.toml b/pyproject.toml index 89f5c61e..21cc8eac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,6 +62,7 @@ dependencies = [ "presidio-anonymizer==2.2.351", "pycld2", "resiliparse", + "sentencepiece", "spacy>=3.6.0, <3.8.0", "unidic-lite==1.0.8", "usaddress==0.5.10", diff --git a/tutorials/image-curation/image-curation.ipynb b/tutorials/image-curation/image-curation.ipynb index 947fbfef..1ac3c102 100644 --- a/tutorials/image-curation/image-curation.ipynb +++ b/tutorials/image-curation/image-curation.ipynb @@ -49,7 +49,7 @@ }, "outputs": [], "source": [ - "!pip install cython ipywidgets aiofiles\n", + "!pip install ipywidgets aiofiles\n", "# Install from source by default\n", "!pip install --extra-index-url https://pypi.nvidia.com ../../[image]\n", "%env DASK_DATAFRAME__QUERY_PLANNING False" diff --git a/tutorials/synthetic-data-hello-world/Synthetic Data Generation - Hello World Examples.ipynb b/tutorials/synthetic-data-hello-world/Synthetic Data Generation - Hello World Examples.ipynb index bbe0ed8c..1bc14d14 100644 --- a/tutorials/synthetic-data-hello-world/Synthetic Data Generation - Hello World Examples.ipynb +++ b/tutorials/synthetic-data-hello-world/Synthetic Data Generation - Hello World Examples.ipynb @@ -58,11 +58,11 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "!pip install -qU wheel cython" + "!pip install -qU wheel" ] }, { From 3f20ecb56ba16f08991ddb269362256f6cbfbbf2 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 15 Nov 2024 13:23:34 -0800 Subject: [PATCH 15/16] run black Signed-off-by: Sarah Yurick --- .../filters/sentencepiece_tokenizer.py | 43 +++++++++++++++---- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/nemo_curator/filters/sentencepiece_tokenizer.py b/nemo_curator/filters/sentencepiece_tokenizer.py index e47b1c66..c242fa9e 100644 --- a/nemo_curator/filters/sentencepiece_tokenizer.py +++ b/nemo_curator/filters/sentencepiece_tokenizer.py @@ -32,7 +32,10 @@ class SentencePieceTokenizer: """ def __init__( - self, model_path: str, special_tokens: Optional[Union[Dict[str, str], List[str]]] = None, legacy: bool = False + self, + model_path: str, + special_tokens: Optional[Union[Dict[str, str], List[str]]] = None, + legacy: bool = False, ): 
if not model_path or not os.path.exists(model_path): raise ValueError(f"model_path: {model_path} is invalid") @@ -50,7 +53,9 @@ def __init__( "Special tokens must be None when legacy is set to False. Provide special tokens at train time." ) self.add_special_tokens(special_tokens) - self.space_sensitive = self.text_to_tokens('x y') != self.text_to_tokens('x') + self.text_to_tokens('y') + self.space_sensitive = self.text_to_tokens("x y") != self.text_to_tokens( + "x" + ) + self.text_to_tokens("y") def text_to_tokens(self, text): if self.legacy: @@ -160,7 +165,9 @@ def tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: def add_special_tokens(self, special_tokens): if not self.legacy: - raise AttributeError("Special Token addition does not work when legacy is set to False.") + raise AttributeError( + "Special Token addition does not work when legacy is set to False." + ) if isinstance(special_tokens, list): for token in special_tokens: @@ -211,21 +218,27 @@ def sep_id(self): if self.legacy: return self.tokens_to_ids([self.sep_token])[0] else: - raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.") + raise NameError( + "Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos." + ) @property def cls_id(self): if self.legacy: return self.tokens_to_ids([self.cls_token])[0] else: - raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.") + raise NameError( + "Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos." + ) @property def mask_id(self): if self.legacy: return self.tokens_to_ids([self.mask_token])[0] else: - raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.") + raise NameError( + "Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos." + ) @property def unk_id(self): @@ -235,13 +248,25 @@ def unk_id(self): def additional_special_tokens_ids(self): """Returns a list of the additional special tokens (excluding bos, eos, pad, unk). Used to return sentinel tokens for e.g. 
T5.""" special_tokens = set( - [self.bos_token, self.eos_token, self.pad_token, self.mask_token, self.cls_token, self.sep_token] + [ + self.bos_token, + self.eos_token, + self.pad_token, + self.mask_token, + self.cls_token, + self.sep_token, + ] ) - return [v for k, v in self.special_token_to_id.items() if k not in special_tokens] + return [ + v for k, v in self.special_token_to_id.items() if k not in special_tokens + ] @property def vocab(self): - main_vocab = [self.tokenizer.id_to_piece(id) for id in range(self.tokenizer.get_piece_size())] + main_vocab = [ + self.tokenizer.id_to_piece(id) + for id in range(self.tokenizer.get_piece_size()) + ] special_tokens = [ self.id_to_special_token[self.original_vocab_size + i] for i in range(self.vocab_size - self.original_vocab_size) From 959b6b4d0d96b30d2c585e2e3fc2ab8d72710075 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 15 Nov 2024 14:26:15 -0800 Subject: [PATCH 16/16] apply ryan's suggestion Signed-off-by: Sarah Yurick --- nemo_curator/filters/code.py | 11 +- .../filters/sentencepiece_tokenizer.py | 292 ------------------ 2 files changed, 4 insertions(+), 299 deletions(-) delete mode 100644 nemo_curator/filters/sentencepiece_tokenizer.py diff --git a/nemo_curator/filters/code.py b/nemo_curator/filters/code.py index 57b284c3..487cae77 100644 --- a/nemo_curator/filters/code.py +++ b/nemo_curator/filters/code.py @@ -15,6 +15,7 @@ import csv import warnings +import sentencepiece from bs4 import BeautifulSoup from comment_parser import comment_parser @@ -102,22 +103,18 @@ def keep_document(self, score): class TokenizerFertilityFilter(DocumentFilter): def __init__(self, path_to_tokenizer=None, min_char_to_token_ratio=2.5): - try: - from nemo.collections.common.tokenizers import SentencePieceTokenizer - except (ImportError, ModuleNotFoundError): - from .sentencepiece_tokenizer import SentencePieceTokenizer - if path_to_tokenizer is None: raise ValueError( "Must provide a valid path to a SentencePiece " "tokenizer" ) - self._tokenizer = SentencePieceTokenizer(path_to_tokenizer) + self._tokenizer = sentencepiece.SentencePieceProcessor() + self._tokenizer.Load(path_to_tokenizer) self._threshold = min_char_to_token_ratio self._name = "tokenizer_fertility" def score_document(self, source): - tokens = self._tokenizer.text_to_tokens(source) + tokens = self._tokenizer.encode_as_pieces(source) num_chars = len(source) num_tokens = len(tokens) if num_tokens == 0: diff --git a/nemo_curator/filters/sentencepiece_tokenizer.py b/nemo_curator/filters/sentencepiece_tokenizer.py deleted file mode 100644 index c242fa9e..00000000 --- a/nemo_curator/filters/sentencepiece_tokenizer.py +++ /dev/null @@ -1,292 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import Dict, List, Optional, Union - -import numpy as np -import sentencepiece -import torch - - -class SentencePieceTokenizer: - """ - SentencePieceTokenizer https://github.com/google/sentencepiece - - Args: - model_path: path to sentence piece tokenizer model. - special_tokens: either list of special tokens or dictionary of token name to token value - legacy: when set to True, the previous behavior of the SentecePiece wrapper will be restored, - including the possibility to add special tokens inside wrapper. - """ - - def __init__( - self, - model_path: str, - special_tokens: Optional[Union[Dict[str, str], List[str]]] = None, - legacy: bool = False, - ): - if not model_path or not os.path.exists(model_path): - raise ValueError(f"model_path: {model_path} is invalid") - self.tokenizer = sentencepiece.SentencePieceProcessor() - self.tokenizer.Load(model_path) - - self.original_vocab_size = self.tokenizer.get_piece_size() - self.vocab_size = self.tokenizer.get_piece_size() - self.legacy = legacy - self.special_token_to_id = {} - self.id_to_special_token = {} - if special_tokens: - if not self.legacy: - raise ValueError( - "Special tokens must be None when legacy is set to False. Provide special tokens at train time." - ) - self.add_special_tokens(special_tokens) - self.space_sensitive = self.text_to_tokens("x y") != self.text_to_tokens( - "x" - ) + self.text_to_tokens("y") - - def text_to_tokens(self, text): - if self.legacy: - tokens = [] - idx = 0 - - while 1: - indices = {} - - for token in self.special_token_to_id: - try: - indices[token] = text[idx:].index(token) - except ValueError: - continue - - if len(indices) == 0: - break - - next_token = min(indices, key=indices.get) - next_idx = idx + indices[next_token] - - tokens.extend(self.tokenizer.encode_as_pieces(text[idx:next_idx])) - tokens.append(next_token) - idx = next_idx + len(next_token) - - tokens.extend(self.tokenizer.encode_as_pieces(text[idx:])) - return tokens - - return self.tokenizer.encode_as_pieces(text) - - def encode(self, text): - if self.legacy: - ids = [] - idx = 0 - - while 1: - indices = {} - - for token in self.special_token_to_id: - try: - indices[token] = text[idx:].index(token) - except ValueError: - continue - - if len(indices) == 0: - break - - next_token = min(indices, key=indices.get) - next_idx = idx + indices[next_token] - - ids.extend(self.tokenizer.encode_as_ids(text[idx:next_idx])) - ids.append(self.special_token_to_id[next_token]) - idx = next_idx + len(next_token) - - ids.extend(self.tokenizer.encode_as_ids(text[idx:])) - return ids - - return self.tokenizer.encode_as_ids(text) - - def tokens_to_text(self, tokens): - if isinstance(tokens, np.ndarray): - tokens = tokens.tolist() - - return self.tokenizer.decode_pieces(tokens) - - def batch_decode(self, ids): - if isinstance(ids, np.ndarray) or torch.is_tensor(ids): - ids = ids.tolist() - - if self.legacy: - text = "" - last_i = 0 - - for i, id in enumerate(ids): - if id in self.id_to_special_token: - text += self.tokenizer.decode_ids(ids[last_i:i]) + " " - text += self.id_to_special_token[id] + " " - last_i = i + 1 - - text += self.tokenizer.decode_ids(ids[last_i:]) - return text.strip() - - return self.tokenizer.decode(ids) - - def token_to_id(self, token): - if self.legacy and token in self.special_token_to_id: - return self.special_token_to_id[token] - - return self.tokenizer.piece_to_id(token) - - def ids_to_tokens(self, ids): - tokens = [] - for id in ids: - if id >= self.original_vocab_size: - 
tokens.append(self.id_to_special_token[id]) - else: - tokens.append(self.tokenizer.id_to_piece(id)) - return tokens - - def tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: - if isinstance(tokens, str): - tokens = [tokens] - ids = [] - for token in tokens: - ids.append(self.token_to_id(token)) - return ids - - def add_special_tokens(self, special_tokens): - if not self.legacy: - raise AttributeError( - "Special Token addition does not work when legacy is set to False." - ) - - if isinstance(special_tokens, list): - for token in special_tokens: - if ( - self.tokenizer.piece_to_id(token) == self.tokenizer.unk_id() - and token not in self.special_token_to_id - ): - self.special_token_to_id[token] = self.vocab_size - self.id_to_special_token[self.vocab_size] = token - self.vocab_size += 1 - elif isinstance(special_tokens, dict): - for token_name, token in special_tokens.items(): - setattr(self, token_name, token) - if ( - self.tokenizer.piece_to_id(token) == self.tokenizer.unk_id() - and token not in self.special_token_to_id - ): - self.special_token_to_id[token] = self.vocab_size - self.id_to_special_token[self.vocab_size] = token - self.vocab_size += 1 - - @property - def pad_id(self): - if self.legacy: - pad_id = self.tokens_to_ids([self.pad_token])[0] - else: - pad_id = self.tokenizer.pad_id() - return pad_id - - @property - def bos_token_id(self): - if self.legacy: - bos_id = self.tokens_to_ids([self.bos_token])[0] - else: - bos_id = self.tokenizer.bos_id() - return bos_id - - @property - def eos_token_id(self): - if self.legacy: - eos_id = self.tokens_to_ids([self.eos_token])[0] - else: - eos_id = self.tokenizer.eos_id() - return eos_id - - @property - def sep_id(self): - if self.legacy: - return self.tokens_to_ids([self.sep_token])[0] - else: - raise NameError( - "Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos." - ) - - @property - def cls_id(self): - if self.legacy: - return self.tokens_to_ids([self.cls_token])[0] - else: - raise NameError( - "Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos." - ) - - @property - def mask_id(self): - if self.legacy: - return self.tokens_to_ids([self.mask_token])[0] - else: - raise NameError( - "Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos." - ) - - @property - def unk_id(self): - return self.tokenizer.unk_id() - - @property - def additional_special_tokens_ids(self): - """Returns a list of the additional special tokens (excluding bos, eos, pad, unk). Used to return sentinel tokens for e.g. 
T5.""" - special_tokens = set( - [ - self.bos_token, - self.eos_token, - self.pad_token, - self.mask_token, - self.cls_token, - self.sep_token, - ] - ) - return [ - v for k, v in self.special_token_to_id.items() if k not in special_tokens - ] - - @property - def vocab(self): - main_vocab = [ - self.tokenizer.id_to_piece(id) - for id in range(self.tokenizer.get_piece_size()) - ] - special_tokens = [ - self.id_to_special_token[self.original_vocab_size + i] - for i in range(self.vocab_size - self.original_vocab_size) - ] - return main_vocab + special_tokens - - ### Below are a few methods that mimic transformers.PreTrainedTokenizer for vLLM - - def convert_ids_to_tokens(self, ids, skip_special_tokens: bool = False): - return self.ids_to_tokens(ids) # TODO: support skip_special_tokens - - def convert_tokens_to_string(self, tokens: List[str]): - return self.tokens_to_text(tokens) - - def __len__(self): - return self.vocab_size - - @property - def is_fast(self): - return True - - def get_added_vocab(self): - return None