From 2514aabe6e74a7be0557c848da28c335f10e3236 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Thu, 14 Nov 2024 11:53:15 -0800
Subject: [PATCH 01/16] add packaging

Signed-off-by: Sarah Yurick
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index a69dd716..2037facf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -57,6 +57,7 @@ dependencies = [
   "nemo_toolkit[nlp]>=1.23.0",
   "numpy<2",
   "openai",
+  "packaging",
   "peft",
   "presidio-analyzer==2.2.351",
   "presidio-anonymizer==2.2.351",

From 7e147ed35437282f193c09866b4cdebdffbf5cf1 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Thu, 14 Nov 2024 11:55:56 -0800
Subject: [PATCH 02/16] move to requires

Signed-off-by: Sarah Yurick
---
 pyproject.toml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 2037facf..5578992f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,7 @@
 # limitations under the License.

 [build-system]
-requires = ["setuptools"]
+requires = ["setuptools", "packaging"]
 build-backend = "setuptools.build_meta"

 [project]
@@ -57,7 +57,6 @@ dependencies = [
   "nemo_toolkit[nlp]>=1.23.0",
   "numpy<2",
   "openai",
-  "packaging",
   "peft",
   "presidio-analyzer==2.2.351",
   "presidio-anonymizer==2.2.351",

From 320a1b4bb2648c0d97d1e5ce4cf586c4219f21dd Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Thu, 14 Nov 2024 11:57:50 -0800
Subject: [PATCH 03/16] move to github ci file

Signed-off-by: Sarah Yurick
---
 .github/workflows/test.yml | 2 +-
 pyproject.toml             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index baa968f4..af83e53c 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -39,7 +39,7 @@ jobs:
       # https://github.com/facebookresearch/fastText/issues/512#issuecomment-1837367666
       # Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94
       run: |
-        pip install wheel cython
+        pip install wheel cython packaging
         pip install --no-cache-dir .
         pip install pytest
     - name: Run tests
diff --git a/pyproject.toml b/pyproject.toml
index 5578992f..a69dd716 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,7 @@
 # limitations under the License.

 [build-system]
-requires = ["setuptools", "packaging"]
+requires = ["setuptools"]
 build-backend = "setuptools.build_meta"

 [project]

From d232be481b4d6b889a794002c248b07faeb0080d Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Thu, 14 Nov 2024 13:11:22 -0800
Subject: [PATCH 04/16] add pin

Signed-off-by: Sarah Yurick
---
 .github/workflows/test.yml | 2 +-
 pyproject.toml             | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index af83e53c..baa968f4 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -39,7 +39,7 @@ jobs:
       # https://github.com/facebookresearch/fastText/issues/512#issuecomment-1837367666
       # Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94
       run: |
-        pip install wheel cython packaging
+        pip install wheel cython
         pip install --no-cache-dir .
         pip install pytest
     - name: Run tests
diff --git a/pyproject.toml b/pyproject.toml
index a69dd716..f82fe1a6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -54,7 +54,8 @@ dependencies = [
   "lxml_html_clean",
   "mecab-python3",
   "mwparserfromhell==0.6.5",
-  "nemo_toolkit[nlp]>=1.23.0",
+  # TODO: Pin until dependencies from https://github.com/NVIDIA/NeMo?tab=readme-ov-file#install-nemo-framework are updated
+  "nemo_toolkit[nlp]<2.0.0",
   "numpy<2",
   "openai",
   "peft",

From 9089871f4d7c4dad3f6d966b9d022850ca490f6a Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Fri, 15 Nov 2024 11:25:56 -0800
Subject: [PATCH 05/16] add torch

Signed-off-by: Sarah Yurick
---
 .github/workflows/test.yml | 2 +-
 pyproject.toml             | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index baa968f4..16e2c026 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -39,7 +39,7 @@ jobs:
       # https://github.com/facebookresearch/fastText/issues/512#issuecomment-1837367666
       # Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94
       run: |
-        pip install wheel cython
+        pip install wheel cython packaging torch
         pip install --no-cache-dir .
         pip install pytest
     - name: Run tests
diff --git a/pyproject.toml b/pyproject.toml
index f82fe1a6..a69dd716 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -54,8 +54,7 @@ dependencies = [
   "lxml_html_clean",
   "mecab-python3",
   "mwparserfromhell==0.6.5",
-  # TODO: Pin until dependencies from https://github.com/NVIDIA/NeMo?tab=readme-ov-file#install-nemo-framework are updated
-  "nemo_toolkit[nlp]<2.0.0",
+  "nemo_toolkit[nlp]>=1.23.0",
   "numpy<2",
   "openai",
   "peft",

From 763128ba81f14cbbab388a2340431cd479a8c6cc Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Fri, 15 Nov 2024 11:31:10 -0800
Subject: [PATCH 06/16] add suggestion from mamba readme

Signed-off-by: Sarah Yurick
---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 16e2c026..2e0ec29f 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -39,7 +39,7 @@ jobs:
       # https://github.com/facebookresearch/fastText/issues/512#issuecomment-1837367666
       # Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94
       run: |
-        pip install wheel cython packaging torch
+        pip install wheel cython packaging torch --no-build-isolation
         pip install --no-cache-dir .
         pip install pytest
     - name: Run tests

From 238763649d540bdafcbda6e7b4c2a10541e2e1e1 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Fri, 15 Nov 2024 11:37:06 -0800
Subject: [PATCH 07/16] try github install

Signed-off-by: Sarah Yurick
---
 .github/workflows/test.yml | 2 +-
 pyproject.toml             | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 2e0ec29f..baa968f4 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -39,7 +39,7 @@ jobs:
       # https://github.com/facebookresearch/fastText/issues/512#issuecomment-1837367666
       # Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94
       run: |
-        pip install wheel cython packaging torch --no-build-isolation
+        pip install wheel cython
         pip install --no-cache-dir .
         pip install pytest
     - name: Run tests
diff --git a/pyproject.toml b/pyproject.toml
index a69dd716..95d68d02 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,6 +48,7 @@ dependencies = [
   "distributed>=2021.7.1",
   "fasttext==0.9.2",
   "ftfy==6.1.1",
+  "git+https://github.com/state-spaces/mamba.git"
   "in-place==0.5.0",
   "jieba==0.42.1",
   "justext==3.0.1",

From 81024d9edf4700344ef011dc61c0defcab32cfa3 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Fri, 15 Nov 2024 11:38:23 -0800
Subject: [PATCH 08/16] add comma

Signed-off-by: Sarah Yurick
---
 pyproject.toml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 95d68d02..0fe25b16 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,7 +48,8 @@ dependencies = [
   "distributed>=2021.7.1",
   "fasttext==0.9.2",
   "ftfy==6.1.1",
-  "git+https://github.com/state-spaces/mamba.git"
+  # TODO: Remove after 2.2.3 release
+  "git+https://github.com/state-spaces/mamba.git",
   "in-place==0.5.0",
   "jieba==0.42.1",
   "justext==3.0.1",

From c827d95874136d939e6c8fa9266bb64780bfa376 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Fri, 15 Nov 2024 11:43:16 -0800
Subject: [PATCH 09/16] another attempt

Signed-off-by: Sarah Yurick
---
 .github/workflows/test.yml | 3 ++-
 pyproject.toml             | 2 --
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index baa968f4..2fd13a93 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -39,7 +39,8 @@ jobs:
       # https://github.com/facebookresearch/fastText/issues/512#issuecomment-1837367666
       # Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94
       run: |
-        pip install wheel cython
+        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+        pip install wheel cython packaging
         pip install --no-cache-dir .
         pip install pytest
     - name: Run tests
diff --git a/pyproject.toml b/pyproject.toml
index 0fe25b16..a69dd716 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,8 +48,6 @@ dependencies = [
   "distributed>=2021.7.1",
   "fasttext==0.9.2",
   "ftfy==6.1.1",
-  # TODO: Remove after 2.2.3 release
-  "git+https://github.com/state-spaces/mamba.git",
   "in-place==0.5.0",
   "jieba==0.42.1",
   "justext==3.0.1",

From c1311d6f9a7c23a76a18489597249db4bcc6e237 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Fri, 15 Nov 2024 12:04:57 -0800
Subject: [PATCH 10/16] remove nemo toolkit

Signed-off-by: Sarah Yurick
---
 .github/workflows/test.yml | 3 +--
 pyproject.toml             | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 2fd13a93..baa968f4 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -39,8 +39,7 @@ jobs:
       # https://github.com/facebookresearch/fastText/issues/512#issuecomment-1837367666
       # Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94
       run: |
-        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-        pip install wheel cython packaging
+        pip install wheel cython
         pip install --no-cache-dir .
         pip install pytest
     - name: Run tests
diff --git a/pyproject.toml b/pyproject.toml
index a69dd716..f1efe670 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -54,7 +54,6 @@ dependencies = [
   "lxml_html_clean",
   "mecab-python3",
   "mwparserfromhell==0.6.5",
-  "nemo_toolkit[nlp]>=1.23.0",
   "numpy<2",
   "openai",
   "peft",

From ad10ab2717baf66cbafb33f63ccc02108b90618c Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Fri, 15 Nov 2024 12:11:09 -0800
Subject: [PATCH 11/16] add datasets

Signed-off-by: Sarah Yurick
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index f1efe670..89f5c61e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,6 +45,7 @@ dependencies = [
   "crossfit>=0.0.6",
   "dask-mpi>=2021.11.0",
   "dask[complete]>=2021.7.1",
+  "datasets",
   "distributed>=2021.7.1",
   "fasttext==0.9.2",
   "ftfy==6.1.1",

From 4e3fd8f6e31d9aaaf93ac2c1fd09def3f2d45217 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Fri, 15 Nov 2024 12:19:30 -0800
Subject: [PATCH 12/16] try removing cython

Signed-off-by: Sarah Yurick
---
 .github/workflows/test.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index baa968f4..1d8cc925 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -37,9 +37,8 @@ jobs:
       # Installing wheel beforehand due to fasttext issue:
       # https://github.com/facebookresearch/fastText/issues/512#issuecomment-1837367666
-      # Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94
       run: |
-        pip install wheel cython
+        pip install wheel
         pip install --no-cache-dir .
         pip install pytest
     - name: Run tests

From 87997e4d3b8564cdba91db9b0322d335446c2121 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Fri, 15 Nov 2024 12:25:32 -0800
Subject: [PATCH 13/16] remove cython

Signed-off-by: Sarah Yurick
---
 Dockerfile | 2 +-
 README.md  | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 16ddd54a..51fe7be4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -38,7 +38,7 @@ RUN conda create -y --name curator -c conda-forge -c nvidia \
     libcusparse \
     libcusolver && \
     source activate curator && \
-    pip install --upgrade cython pytest pip
+    pip install --upgrade pytest pip

 RUN \
     --mount=type=bind,source=/opt/NeMo-Curator/nemo_curator/__init__.py,target=/opt/NeMo-Curator/nemo_curator/__init__.py,from=curator-update \
diff --git a/README.md b/README.md
index 5cba9d10..4513c7af 100644
--- a/README.md
+++ b/README.md
@@ -83,14 +83,12 @@ You can get NeMo-Curator in 3 ways.
#### PyPi ```bash -pip install cython pip install --extra-index-url https://pypi.nvidia.com nemo-curator[all] ``` #### Source ```bash git clone https://github.com/NVIDIA/NeMo-Curator.git -pip install cython pip install --extra-index-url https://pypi.nvidia.com "./NeMo-Curator[all]" ``` From 45f2a447232c2e27f1a4f1ffef1a16b7ab780c7b Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 15 Nov 2024 13:13:36 -0800 Subject: [PATCH 14/16] sentencepiece Signed-off-by: Sarah Yurick --- docs/user-guide/image/gettingstarted.rst | 2 - nemo_curator/filters/code.py | 5 +- .../filters/sentencepiece_tokenizer.py | 267 ++++++++++++++++++ pyproject.toml | 1 + tutorials/image-curation/image-curation.ipynb | 2 +- ...ta Generation - Hello World Examples.ipynb | 4 +- 6 files changed, 275 insertions(+), 6 deletions(-) create mode 100644 nemo_curator/filters/sentencepiece_tokenizer.py diff --git a/docs/user-guide/image/gettingstarted.rst b/docs/user-guide/image/gettingstarted.rst index dae4240d..2ccacb25 100644 --- a/docs/user-guide/image/gettingstarted.rst +++ b/docs/user-guide/image/gettingstarted.rst @@ -33,7 +33,6 @@ NeMo Curator's PyPi page can be found `here = self.original_vocab_size: + tokens.append(self.id_to_special_token[id]) + else: + tokens.append(self.tokenizer.id_to_piece(id)) + return tokens + + def tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: + if isinstance(tokens, str): + tokens = [tokens] + ids = [] + for token in tokens: + ids.append(self.token_to_id(token)) + return ids + + def add_special_tokens(self, special_tokens): + if not self.legacy: + raise AttributeError("Special Token addition does not work when legacy is set to False.") + + if isinstance(special_tokens, list): + for token in special_tokens: + if ( + self.tokenizer.piece_to_id(token) == self.tokenizer.unk_id() + and token not in self.special_token_to_id + ): + self.special_token_to_id[token] = self.vocab_size + self.id_to_special_token[self.vocab_size] = token + self.vocab_size += 1 + elif isinstance(special_tokens, dict): + for token_name, token in special_tokens.items(): + setattr(self, token_name, token) + if ( + self.tokenizer.piece_to_id(token) == self.tokenizer.unk_id() + and token not in self.special_token_to_id + ): + self.special_token_to_id[token] = self.vocab_size + self.id_to_special_token[self.vocab_size] = token + self.vocab_size += 1 + + @property + def pad_id(self): + if self.legacy: + pad_id = self.tokens_to_ids([self.pad_token])[0] + else: + pad_id = self.tokenizer.pad_id() + return pad_id + + @property + def bos_token_id(self): + if self.legacy: + bos_id = self.tokens_to_ids([self.bos_token])[0] + else: + bos_id = self.tokenizer.bos_id() + return bos_id + + @property + def eos_token_id(self): + if self.legacy: + eos_id = self.tokens_to_ids([self.eos_token])[0] + else: + eos_id = self.tokenizer.eos_id() + return eos_id + + @property + def sep_id(self): + if self.legacy: + return self.tokens_to_ids([self.sep_token])[0] + else: + raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.") + + @property + def cls_id(self): + if self.legacy: + return self.tokens_to_ids([self.cls_token])[0] + else: + raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.") + + @property + def mask_id(self): + if self.legacy: + return self.tokens_to_ids([self.mask_token])[0] + else: + raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.") + + @property + def 
unk_id(self): + return self.tokenizer.unk_id() + + @property + def additional_special_tokens_ids(self): + """Returns a list of the additional special tokens (excluding bos, eos, pad, unk). Used to return sentinel tokens for e.g. T5.""" + special_tokens = set( + [self.bos_token, self.eos_token, self.pad_token, self.mask_token, self.cls_token, self.sep_token] + ) + return [v for k, v in self.special_token_to_id.items() if k not in special_tokens] + + @property + def vocab(self): + main_vocab = [self.tokenizer.id_to_piece(id) for id in range(self.tokenizer.get_piece_size())] + special_tokens = [ + self.id_to_special_token[self.original_vocab_size + i] + for i in range(self.vocab_size - self.original_vocab_size) + ] + return main_vocab + special_tokens + + ### Below are a few methods that mimic transformers.PreTrainedTokenizer for vLLM + + def convert_ids_to_tokens(self, ids, skip_special_tokens: bool = False): + return self.ids_to_tokens(ids) # TODO: support skip_special_tokens + + def convert_tokens_to_string(self, tokens: List[str]): + return self.tokens_to_text(tokens) + + def __len__(self): + return self.vocab_size + + @property + def is_fast(self): + return True + + def get_added_vocab(self): + return None diff --git a/pyproject.toml b/pyproject.toml index 89f5c61e..21cc8eac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,6 +62,7 @@ dependencies = [ "presidio-anonymizer==2.2.351", "pycld2", "resiliparse", + "sentencepiece", "spacy>=3.6.0, <3.8.0", "unidic-lite==1.0.8", "usaddress==0.5.10", diff --git a/tutorials/image-curation/image-curation.ipynb b/tutorials/image-curation/image-curation.ipynb index 947fbfef..1ac3c102 100644 --- a/tutorials/image-curation/image-curation.ipynb +++ b/tutorials/image-curation/image-curation.ipynb @@ -49,7 +49,7 @@ }, "outputs": [], "source": [ - "!pip install cython ipywidgets aiofiles\n", + "!pip install ipywidgets aiofiles\n", "# Install from source by default\n", "!pip install --extra-index-url https://pypi.nvidia.com ../../[image]\n", "%env DASK_DATAFRAME__QUERY_PLANNING False" diff --git a/tutorials/synthetic-data-hello-world/Synthetic Data Generation - Hello World Examples.ipynb b/tutorials/synthetic-data-hello-world/Synthetic Data Generation - Hello World Examples.ipynb index bbe0ed8c..1bc14d14 100644 --- a/tutorials/synthetic-data-hello-world/Synthetic Data Generation - Hello World Examples.ipynb +++ b/tutorials/synthetic-data-hello-world/Synthetic Data Generation - Hello World Examples.ipynb @@ -58,11 +58,11 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "!pip install -qU wheel cython" + "!pip install -qU wheel" ] }, { From 3f20ecb56ba16f08991ddb269362256f6cbfbbf2 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 15 Nov 2024 13:23:34 -0800 Subject: [PATCH 15/16] run black Signed-off-by: Sarah Yurick --- .../filters/sentencepiece_tokenizer.py | 43 +++++++++++++++---- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/nemo_curator/filters/sentencepiece_tokenizer.py b/nemo_curator/filters/sentencepiece_tokenizer.py index e47b1c66..c242fa9e 100644 --- a/nemo_curator/filters/sentencepiece_tokenizer.py +++ b/nemo_curator/filters/sentencepiece_tokenizer.py @@ -32,7 +32,10 @@ class SentencePieceTokenizer: """ def __init__( - self, model_path: str, special_tokens: Optional[Union[Dict[str, str], List[str]]] = None, legacy: bool = False + self, + model_path: str, + special_tokens: Optional[Union[Dict[str, str], List[str]]] = None, + legacy: bool = False, ): 
if not model_path or not os.path.exists(model_path): raise ValueError(f"model_path: {model_path} is invalid") @@ -50,7 +53,9 @@ def __init__( "Special tokens must be None when legacy is set to False. Provide special tokens at train time." ) self.add_special_tokens(special_tokens) - self.space_sensitive = self.text_to_tokens('x y') != self.text_to_tokens('x') + self.text_to_tokens('y') + self.space_sensitive = self.text_to_tokens("x y") != self.text_to_tokens( + "x" + ) + self.text_to_tokens("y") def text_to_tokens(self, text): if self.legacy: @@ -160,7 +165,9 @@ def tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: def add_special_tokens(self, special_tokens): if not self.legacy: - raise AttributeError("Special Token addition does not work when legacy is set to False.") + raise AttributeError( + "Special Token addition does not work when legacy is set to False." + ) if isinstance(special_tokens, list): for token in special_tokens: @@ -211,21 +218,27 @@ def sep_id(self): if self.legacy: return self.tokens_to_ids([self.sep_token])[0] else: - raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.") + raise NameError( + "Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos." + ) @property def cls_id(self): if self.legacy: return self.tokens_to_ids([self.cls_token])[0] else: - raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.") + raise NameError( + "Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos." + ) @property def mask_id(self): if self.legacy: return self.tokens_to_ids([self.mask_token])[0] else: - raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.") + raise NameError( + "Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos." + ) @property def unk_id(self): @@ -235,13 +248,25 @@ def unk_id(self): def additional_special_tokens_ids(self): """Returns a list of the additional special tokens (excluding bos, eos, pad, unk). Used to return sentinel tokens for e.g. 
T5.""" special_tokens = set( - [self.bos_token, self.eos_token, self.pad_token, self.mask_token, self.cls_token, self.sep_token] + [ + self.bos_token, + self.eos_token, + self.pad_token, + self.mask_token, + self.cls_token, + self.sep_token, + ] ) - return [v for k, v in self.special_token_to_id.items() if k not in special_tokens] + return [ + v for k, v in self.special_token_to_id.items() if k not in special_tokens + ] @property def vocab(self): - main_vocab = [self.tokenizer.id_to_piece(id) for id in range(self.tokenizer.get_piece_size())] + main_vocab = [ + self.tokenizer.id_to_piece(id) + for id in range(self.tokenizer.get_piece_size()) + ] special_tokens = [ self.id_to_special_token[self.original_vocab_size + i] for i in range(self.vocab_size - self.original_vocab_size) From 959b6b4d0d96b30d2c585e2e3fc2ab8d72710075 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 15 Nov 2024 14:26:15 -0800 Subject: [PATCH 16/16] apply ryan's suggestion Signed-off-by: Sarah Yurick --- nemo_curator/filters/code.py | 11 +- .../filters/sentencepiece_tokenizer.py | 292 ------------------ 2 files changed, 4 insertions(+), 299 deletions(-) delete mode 100644 nemo_curator/filters/sentencepiece_tokenizer.py diff --git a/nemo_curator/filters/code.py b/nemo_curator/filters/code.py index 57b284c3..487cae77 100644 --- a/nemo_curator/filters/code.py +++ b/nemo_curator/filters/code.py @@ -15,6 +15,7 @@ import csv import warnings +import sentencepiece from bs4 import BeautifulSoup from comment_parser import comment_parser @@ -102,22 +103,18 @@ def keep_document(self, score): class TokenizerFertilityFilter(DocumentFilter): def __init__(self, path_to_tokenizer=None, min_char_to_token_ratio=2.5): - try: - from nemo.collections.common.tokenizers import SentencePieceTokenizer - except (ImportError, ModuleNotFoundError): - from .sentencepiece_tokenizer import SentencePieceTokenizer - if path_to_tokenizer is None: raise ValueError( "Must provide a valid path to a SentencePiece " "tokenizer" ) - self._tokenizer = SentencePieceTokenizer(path_to_tokenizer) + self._tokenizer = sentencepiece.SentencePieceProcessor() + self._tokenizer.Load(path_to_tokenizer) self._threshold = min_char_to_token_ratio self._name = "tokenizer_fertility" def score_document(self, source): - tokens = self._tokenizer.text_to_tokens(source) + tokens = self._tokenizer.encode_as_pieces(source) num_chars = len(source) num_tokens = len(tokens) if num_tokens == 0: diff --git a/nemo_curator/filters/sentencepiece_tokenizer.py b/nemo_curator/filters/sentencepiece_tokenizer.py deleted file mode 100644 index c242fa9e..00000000 --- a/nemo_curator/filters/sentencepiece_tokenizer.py +++ /dev/null @@ -1,292 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import Dict, List, Optional, Union - -import numpy as np -import sentencepiece -import torch - - -class SentencePieceTokenizer: - """ - SentencePieceTokenizer https://github.com/google/sentencepiece - - Args: - model_path: path to sentence piece tokenizer model. - special_tokens: either list of special tokens or dictionary of token name to token value - legacy: when set to True, the previous behavior of the SentecePiece wrapper will be restored, - including the possibility to add special tokens inside wrapper. - """ - - def __init__( - self, - model_path: str, - special_tokens: Optional[Union[Dict[str, str], List[str]]] = None, - legacy: bool = False, - ): - if not model_path or not os.path.exists(model_path): - raise ValueError(f"model_path: {model_path} is invalid") - self.tokenizer = sentencepiece.SentencePieceProcessor() - self.tokenizer.Load(model_path) - - self.original_vocab_size = self.tokenizer.get_piece_size() - self.vocab_size = self.tokenizer.get_piece_size() - self.legacy = legacy - self.special_token_to_id = {} - self.id_to_special_token = {} - if special_tokens: - if not self.legacy: - raise ValueError( - "Special tokens must be None when legacy is set to False. Provide special tokens at train time." - ) - self.add_special_tokens(special_tokens) - self.space_sensitive = self.text_to_tokens("x y") != self.text_to_tokens( - "x" - ) + self.text_to_tokens("y") - - def text_to_tokens(self, text): - if self.legacy: - tokens = [] - idx = 0 - - while 1: - indices = {} - - for token in self.special_token_to_id: - try: - indices[token] = text[idx:].index(token) - except ValueError: - continue - - if len(indices) == 0: - break - - next_token = min(indices, key=indices.get) - next_idx = idx + indices[next_token] - - tokens.extend(self.tokenizer.encode_as_pieces(text[idx:next_idx])) - tokens.append(next_token) - idx = next_idx + len(next_token) - - tokens.extend(self.tokenizer.encode_as_pieces(text[idx:])) - return tokens - - return self.tokenizer.encode_as_pieces(text) - - def encode(self, text): - if self.legacy: - ids = [] - idx = 0 - - while 1: - indices = {} - - for token in self.special_token_to_id: - try: - indices[token] = text[idx:].index(token) - except ValueError: - continue - - if len(indices) == 0: - break - - next_token = min(indices, key=indices.get) - next_idx = idx + indices[next_token] - - ids.extend(self.tokenizer.encode_as_ids(text[idx:next_idx])) - ids.append(self.special_token_to_id[next_token]) - idx = next_idx + len(next_token) - - ids.extend(self.tokenizer.encode_as_ids(text[idx:])) - return ids - - return self.tokenizer.encode_as_ids(text) - - def tokens_to_text(self, tokens): - if isinstance(tokens, np.ndarray): - tokens = tokens.tolist() - - return self.tokenizer.decode_pieces(tokens) - - def batch_decode(self, ids): - if isinstance(ids, np.ndarray) or torch.is_tensor(ids): - ids = ids.tolist() - - if self.legacy: - text = "" - last_i = 0 - - for i, id in enumerate(ids): - if id in self.id_to_special_token: - text += self.tokenizer.decode_ids(ids[last_i:i]) + " " - text += self.id_to_special_token[id] + " " - last_i = i + 1 - - text += self.tokenizer.decode_ids(ids[last_i:]) - return text.strip() - - return self.tokenizer.decode(ids) - - def token_to_id(self, token): - if self.legacy and token in self.special_token_to_id: - return self.special_token_to_id[token] - - return self.tokenizer.piece_to_id(token) - - def ids_to_tokens(self, ids): - tokens = [] - for id in ids: - if id >= self.original_vocab_size: - 
tokens.append(self.id_to_special_token[id]) - else: - tokens.append(self.tokenizer.id_to_piece(id)) - return tokens - - def tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: - if isinstance(tokens, str): - tokens = [tokens] - ids = [] - for token in tokens: - ids.append(self.token_to_id(token)) - return ids - - def add_special_tokens(self, special_tokens): - if not self.legacy: - raise AttributeError( - "Special Token addition does not work when legacy is set to False." - ) - - if isinstance(special_tokens, list): - for token in special_tokens: - if ( - self.tokenizer.piece_to_id(token) == self.tokenizer.unk_id() - and token not in self.special_token_to_id - ): - self.special_token_to_id[token] = self.vocab_size - self.id_to_special_token[self.vocab_size] = token - self.vocab_size += 1 - elif isinstance(special_tokens, dict): - for token_name, token in special_tokens.items(): - setattr(self, token_name, token) - if ( - self.tokenizer.piece_to_id(token) == self.tokenizer.unk_id() - and token not in self.special_token_to_id - ): - self.special_token_to_id[token] = self.vocab_size - self.id_to_special_token[self.vocab_size] = token - self.vocab_size += 1 - - @property - def pad_id(self): - if self.legacy: - pad_id = self.tokens_to_ids([self.pad_token])[0] - else: - pad_id = self.tokenizer.pad_id() - return pad_id - - @property - def bos_token_id(self): - if self.legacy: - bos_id = self.tokens_to_ids([self.bos_token])[0] - else: - bos_id = self.tokenizer.bos_id() - return bos_id - - @property - def eos_token_id(self): - if self.legacy: - eos_id = self.tokens_to_ids([self.eos_token])[0] - else: - eos_id = self.tokenizer.eos_id() - return eos_id - - @property - def sep_id(self): - if self.legacy: - return self.tokens_to_ids([self.sep_token])[0] - else: - raise NameError( - "Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos." - ) - - @property - def cls_id(self): - if self.legacy: - return self.tokens_to_ids([self.cls_token])[0] - else: - raise NameError( - "Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos." - ) - - @property - def mask_id(self): - if self.legacy: - return self.tokens_to_ids([self.mask_token])[0] - else: - raise NameError( - "Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos." - ) - - @property - def unk_id(self): - return self.tokenizer.unk_id() - - @property - def additional_special_tokens_ids(self): - """Returns a list of the additional special tokens (excluding bos, eos, pad, unk). Used to return sentinel tokens for e.g. 
T5.""" - special_tokens = set( - [ - self.bos_token, - self.eos_token, - self.pad_token, - self.mask_token, - self.cls_token, - self.sep_token, - ] - ) - return [ - v for k, v in self.special_token_to_id.items() if k not in special_tokens - ] - - @property - def vocab(self): - main_vocab = [ - self.tokenizer.id_to_piece(id) - for id in range(self.tokenizer.get_piece_size()) - ] - special_tokens = [ - self.id_to_special_token[self.original_vocab_size + i] - for i in range(self.vocab_size - self.original_vocab_size) - ] - return main_vocab + special_tokens - - ### Below are a few methods that mimic transformers.PreTrainedTokenizer for vLLM - - def convert_ids_to_tokens(self, ids, skip_special_tokens: bool = False): - return self.ids_to_tokens(ids) # TODO: support skip_special_tokens - - def convert_tokens_to_string(self, tokens: List[str]): - return self.tokens_to_text(tokens) - - def __len__(self): - return self.vocab_size - - @property - def is_fast(self): - return True - - def get_added_vocab(self): - return None