Fix prompt responses for OS models #315

Merged 1 commit on Oct 4, 2023.
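In short, as inferred from the diff below: by default, Hugging Face `text-generation` pipelines and `model.generate()` return the prompt together with the completion, so spacy-llm saw its own prompt echoed back at the start of every model response. This PR passes `return_full_text=False` where the pipeline API is used (Dolly, Falcon, Llama 2) and slices the prompt tokens off the generated output where models are called directly (OpenLLaMA, StableLM). A minimal sketch of the default behavior, with an illustrative model name:

```python
# Minimal sketch of the prompt-echo behavior this PR fixes. The model name
# is illustrative; any causal LM works. By default, a text-generation
# pipeline returns the prompt concatenated with the completion.
import transformers

pipe = transformers.pipeline("text-generation", model="databricks/dolly-v2-3b")

# Default: 'generated_text' starts with the prompt itself.
full = pipe("What is spaCy?")
# With return_full_text=False, the pipeline returns only the new tokens.
completion = pipe("What is spaCy?", return_full_text=False)
```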
spacy_llm/models/hf/dolly.py (3 additions, 1 deletion)

```diff
@@ -14,7 +14,9 @@ def init_model(self) -> Any:
         """Sets up HF model and needed utilities.
         RETURNS (Any): HF model.
         """
-        return transformers.pipeline(model=self._name, **self._config_init)
+        return transformers.pipeline(
+            model=self._name, return_full_text=False, **self._config_init
+        )

     def __call__(self, prompts: Iterable[str]) -> Iterable[str]:  # type: ignore[override]
         """Queries Dolly HF model.
```
spacy_llm/models/hf/falcon.py (1 addition, 0 deletions)

```diff
@@ -38,6 +38,7 @@ def init_model(self) -> Any:
             "text-generation",
             model=self._name,
             tokenizer=self._tokenizer,
+            return_full_text=False,
             **self._config_init,
         )

```
spacy_llm/models/hf/llama2.py (1 addition, 0 deletions)

```diff
@@ -32,6 +32,7 @@ def init_model(self) -> Any:
             "text-generation",
             model=self._name,
             use_auth_token=True,
+            return_full_text=False,
             **self._config_init,
         )

```
Expand Down
spacy_llm/models/hf/openllama.py (3 additions, 1 deletion)

```diff
@@ -54,7 +54,9 @@ def __call__(self, prompts: Iterable[str]) -> Iterable[str]:  # type: ignore[override]
         assert hasattr(self._model, "generate")
         return [
             self._tokenizer.decode(
-                self._model.generate(input_ids=tii, **self._config_run)[0],
+                self._model.generate(input_ids=tii, **self._config_run)[
+                    :, tii.shape[1] :
+                ][0],
             )
             for tii in tokenized_input_ids
         ]
```
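For models called via `model.generate()` directly (OpenLLaMA above, StableLM below), there is no `return_full_text` switch; instead the PR slices the prompt tokens off the output, since `generate()` returns the input ids followed by the newly generated ids. A sketch of that pattern, with an illustrative model name:

```python
# Sketch of the slicing fix: generate() returns prompt ids + new ids, so
# dropping the first input_ids.shape[1] positions leaves only the completion.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_3b")
model = AutoModelForCausalLM.from_pretrained("openlm-research/open_llama_3b")

input_ids = tokenizer("What is spaCy?", return_tensors="pt").input_ids
output = model.generate(input_ids=input_ids, max_new_tokens=32)
completion_ids = output[:, input_ids.shape[1]:]  # strip the echoed prompt tokens
print(tokenizer.decode(completion_ids[0], skip_special_tokens=True))
```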
spacy_llm/models/hf/stablelm.py (7 additions, 5 deletions)

```diff
@@ -68,8 +68,8 @@ def hf_account(self) -> str:

     def __call__(self, prompts: Iterable[str]) -> Iterable[str]:  # type: ignore[override]
         assert callable(self._tokenizer)
-        tokenized_prompts = [
-            self._tokenizer(prompt, return_tensors="pt")
+        tokenized_input_ids = [
+            self._tokenizer(prompt, return_tensors="pt").input_ids
             for prompt in (
                 # Add prompt formatting for tuned model.
                 prompts
@@ -81,15 +81,17 @@ def __call__(self, prompts: Iterable[str]) -> Iterable[str]:  # type: ignore[override]
             )
         ]
         if self._device:
-            tokenized_prompts = [tp.to(self._device) for tp in tokenized_prompts]
+            tokenized_input_ids = [tp.to(self._device) for tp in tokenized_input_ids]

         assert hasattr(self._model, "generate")
         return [
             self._tokenizer.decode(
-                self._model.generate(**prompt, **self._config_run)[0],
+                self._model.generate(input_ids=tii, **self._config_run)[
+                    :, tii.shape[1] :
+                ][0],
                 skip_special_tokens=True,
             )
-            for prompt in tokenized_prompts
+            for tii in tokenized_input_ids
         ]

     @staticmethod
```
spacy_llm/tests/models/test_dolly.py (7 additions, 4 deletions)

```diff
@@ -13,6 +13,7 @@
         "name": "dolly-v2-3b",
     },
     "task": {"@llm_tasks": "spacy.NoOp.v1"},
+    "save_io": True,
 }

 _NLP_CONFIG = """
@@ -26,6 +27,7 @@

 [components.llm]
 factory = "llm"
+save_io = True

 [components.llm.task]
 @llm_tasks = "spacy.NoOp.v1"
@@ -41,12 +43,13 @@
 def test_init():
     """Test initialization and simple run."""
     nlp = spacy.blank("en")
-    cfg = copy.deepcopy(_PIPE_CFG)
-    cfg["model"]["@llm_models"] = "spacy.Dolly.v1"
-    nlp.add_pipe("llm", config=cfg)
-    nlp("This is a test.")
+    nlp.add_pipe("llm", config=_PIPE_CFG)
+    doc = nlp("This is a test.")
     nlp.get_pipe("llm")._model.get_model_names()
     torch.cuda.empty_cache()
+    assert not doc.user_data["llm_io"]["llm"]["response"].startswith(
+        doc.user_data["llm_io"]["llm"]["prompt"]
+    )


 @pytest.mark.gpu
```
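Each test file gains the same regression check: with `save_io` enabled, the `llm` component records the prompt and raw response on each doc, so the tests can assert that the response no longer starts with (i.e. echoes) the prompt. A condensed sketch of that shared pattern, with config values taken from the Dolly test above:

```python
# Condensed sketch of the shared regression check. With save_io=True, the
# prompt and raw response are stored in doc.user_data["llm_io"], keyed by
# the component name.
import spacy

nlp = spacy.blank("en")
nlp.add_pipe(
    "llm",
    config={
        "model": {"@llm_models": "spacy.Dolly.v1", "name": "dolly-v2-3b"},
        "task": {"@llm_tasks": "spacy.NoOp.v1"},
        "save_io": True,
    },
)
doc = nlp("This is a test.")
io = doc.user_data["llm_io"]["llm"]
# Before this PR, the response began with the prompt itself; now it must not.
assert not io["response"].startswith(io["prompt"])
```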
spacy_llm/tests/models/test_falcon.py (5 additions, 1 deletion)

```diff
@@ -13,6 +13,7 @@
         "name": "falcon-rw-1b",
     },
     "task": {"@llm_tasks": "spacy.NoOp.v1"},
+    "save_io": True,
 }

 _NLP_CONFIG = """
@@ -43,8 +44,11 @@ def test_init():
     nlp = spacy.blank("en")
     cfg = copy.deepcopy(_PIPE_CFG)
     nlp.add_pipe("llm", config=cfg)
-    nlp("This is a test.")
+    doc = nlp("This is a test.")
     torch.cuda.empty_cache()
+    assert not doc.user_data["llm_io"]["llm"]["response"].startswith(
+        doc.user_data["llm_io"]["llm"]["prompt"]
+    )


 @pytest.mark.gpu
```
spacy_llm/tests/models/test_llama2.py (5 additions, 1 deletion)

```diff
@@ -13,6 +13,7 @@
         "name": "Llama-2-7b-hf",
     },
     "task": {"@llm_tasks": "spacy.NoOp.v1"},
+    "save_io": True,
 }

 _NLP_CONFIG = """
@@ -44,8 +45,11 @@ def test_init():
     nlp = spacy.blank("en")
     cfg = copy.deepcopy(_PIPE_CFG)
     nlp.add_pipe("llm", config=cfg)
-    nlp("This is a test.")
+    doc = nlp("This is a test.")
     torch.cuda.empty_cache()
+    assert not doc.user_data["llm_io"]["llm"]["response"].startswith(
+        doc.user_data["llm_io"]["llm"]["prompt"]
+    )


 @pytest.mark.skip(reason="CI runner needs more GPU memory")
```
spacy_llm/tests/models/test_openllama.py (10 additions, 2 deletions)

```diff
@@ -13,6 +13,7 @@
         "name": "open_llama_3b",
     },
     "task": {"@llm_tasks": "spacy.NoOp.v1"},
+    "save_io": True,
 }

 _NLP_CONFIG = """
@@ -25,6 +26,7 @@

 [components.llm]
 factory = "llm"
+save_io = True

 [components.llm.task]
 @llm_tasks = "spacy.NoOp.v1"
@@ -41,8 +43,11 @@ def test_init():
     """Test initialization and simple run."""
     nlp = spacy.blank("en")
     nlp.add_pipe("llm", config=_PIPE_CFG)
-    nlp("This is a test.")
+    doc = nlp("This is a test.")
     torch.cuda.empty_cache()
+    assert not doc.user_data["llm_io"]["llm"]["response"].startswith(
+        doc.user_data["llm_io"]["llm"]["prompt"]
+    )


 @pytest.mark.gpu
@@ -53,8 +58,11 @@ def test_init_with_set_config():
     cfg = copy.deepcopy(_PIPE_CFG)
     cfg["model"]["config_run"] = {"max_new_tokens": 32}
     nlp.add_pipe("llm", config=cfg)
-    nlp("This is a test.")
+    doc = nlp("This is a test.")
     torch.cuda.empty_cache()
+    assert not doc.user_data["llm_io"]["llm"]["response"].startswith(
+        doc.user_data["llm_io"]["llm"]["prompt"]
+    )


 @pytest.mark.gpu
```
spacy_llm/tests/models/test_stablelm.py (6 additions, 2 deletions)

```diff
@@ -13,6 +13,7 @@
         "name": "stablelm-base-alpha-3b",
     },
     "task": {"@llm_tasks": "spacy.NoOp.v1"},
+    "save_io": True,
 }

 _NLP_CONFIG = """
@@ -44,10 +45,13 @@ def test_init(name: str):
     """
     nlp = spacy.blank("en")
     cfg = copy.deepcopy(_PIPE_CFG)
-    cfg["model"]["name"] = name
+    cfg["model"]["name"] = name  # type: ignore[index]
     nlp.add_pipe("llm", config=cfg)
-    nlp("This is a test.")
+    doc = nlp("This is a test.")
     torch.cuda.empty_cache()
+    assert not doc.user_data["llm_io"]["llm"]["response"].startswith(
+        doc.user_data["llm_io"]["llm"]["prompt"]
+    )


 @pytest.mark.gpu
```