diff --git a/setup.py b/setup.py
index eb5d835984..a1152d5f16 100644
--- a/setup.py
+++ b/setup.py
@@ -99,7 +99,7 @@ def _parse_requirements_file(file_path):
     "black==22.12.0",
     "flake8>=3.8.3",
     "isort>=5.7.0",
-    "flaky~=3.7.0",
+    "pytest-rerunfailures>=13.0",
     "ndjson>=0.3.1",
     "wheel>=0.36.2",
     "pytest>=6.0.0",
diff --git a/src/deepsparse/transformers/pipelines/text_generation/prep_for_generation.py b/src/deepsparse/transformers/pipelines/text_generation/prep_for_generation.py
index 3318ec88c5..66b0c2a79b 100644
--- a/src/deepsparse/transformers/pipelines/text_generation/prep_for_generation.py
+++ b/src/deepsparse/transformers/pipelines/text_generation/prep_for_generation.py
@@ -101,6 +101,7 @@ def run(
             else [],
             "finished_reason": [],
             "token_generator": token_generator,
+            "past_tokens_queue": copy.copy(tokens),
         }
 
         if kv_cache is None:
diff --git a/src/deepsparse/transformers/pipelines/text_generation/process_outputs.py b/src/deepsparse/transformers/pipelines/text_generation/process_outputs.py
index 6033e10ea4..cae7e24599 100644
--- a/src/deepsparse/transformers/pipelines/text_generation/process_outputs.py
+++ b/src/deepsparse/transformers/pipelines/text_generation/process_outputs.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import datetime
-from typing import Optional
+from typing import List, Optional
 
 import numpy
 
@@ -54,6 +54,33 @@ def _create_generated_text_output(
             finished=False,
         )
 
+    def _generate_streamed_text_from_past_tokens(
+        self, generated_tokens: numpy.ndarray, past_tokens_queue: List[int]
+    ) -> List[str]:
+        """
+        An auxiliary method that helps to properly generate the streamed text.
+        Some models, such as llama2 and mistral, use LlamaTokenizer, which is
+        based on the SentencePiece tokenizer. That tokenizer does not reliably
+        emit prefix spaces when decoding token by token. Decoding works as
+        expected if the previously processed tokens are included, which allows
+        the tokenizer to infer the appropriate prefix spaces from the last n
+        consecutive tokens.
+
+        :param generated_tokens: the generated tokens from the engine
+        :param past_tokens_queue: the queue of the last n tokens (n is the
+            original prompt length in tokens)
+        :return: the newly generated text as a single-element list
+        """
+        string_from_n_tokens = self.tokenizer.decode(
+            past_tokens_queue, skip_special_tokens=True
+        )
+        past_tokens_queue.append(generated_tokens[0])
+        string_from_n_plus_1_tokens = self.tokenizer.decode(
+            past_tokens_queue, skip_special_tokens=True
+        )
+        past_tokens_queue.pop(0)
+        return [string_from_n_plus_1_tokens[len(string_from_n_tokens) :]]
+
     def run(
         self,
         generated_tokens: numpy.ndarray,
@@ -64,9 +91,24 @@ def run(
     ):
         generation_config = inference_state.current_state.get("generation_config")
         generated_logits = generated_logits if generation_config.output_scores else None
-        sequences = self.tokenizer.batch_decode(
-            generated_tokens, skip_special_tokens=True
-        )
+
+        import transformers
+
+        # Fix for Llama-specific models when running streaming.
+        # TODO: make streaming a conditional input to this operator; using inference
+        # state is a quick fix.
+        if isinstance(
+            self.tokenizer,
+            (transformers.LlamaTokenizer, transformers.LlamaTokenizerFast),
+        ) and inference_state.current_state.get("streaming"):
+            past_tokens_queue = inference_state.current_state.get("past_tokens_queue")
+            sequences = self._generate_streamed_text_from_past_tokens(
+                generated_tokens, past_tokens_queue
+            )
+        else:
+            sequences = self.tokenizer.batch_decode(
+                generated_tokens, skip_special_tokens=True
+            )
 
         try:
             finished_reason = [f[-1] for f in finished_reason]
diff --git a/src/deepsparse/version.py b/src/deepsparse/version.py
index 848f460af3..0554b5183f 100644
--- a/src/deepsparse/version.py
+++ b/src/deepsparse/version.py
@@ -39,7 +39,7 @@
     from deepsparse.generated_version import is_enterprise, is_release, splash, version
 except Exception:
     # otherwise, fall back to version info in this file
-    version = "1.7.0"
+    version = "1.7.1"
     is_release = False
     is_enterprise = False
     splash = (
diff --git a/tests/deepsparse/pipelines/test_pipeline.py b/tests/deepsparse/pipelines/test_pipeline.py
index 6ad1c71fe4..3406d13815 100644
--- a/tests/deepsparse/pipelines/test_pipeline.py
+++ b/tests/deepsparse/pipelines/test_pipeline.py
@@ -16,7 +16,6 @@
 from concurrent.futures import ThreadPoolExecutor
 from unittest import mock
 
-import flaky
 import pytest
 
 from deepsparse.legacy.base_pipeline import BasePipeline
@@ -125,7 +124,7 @@ def test_pipeline_executor_num_workers():
     assert executor._max_workers >= 1
 
 
-@flaky.flaky(max_runs=2, min_passes=1)
+@pytest.mark.flaky(reruns=2, min_passes=1)
 @mock_engine(rng_seed=0)
 def test_pipeline_call_is_async(engine_mock):
     # attempts to verify that pipeline calls to engine are async
diff --git a/tests/server/test_legacy_loggers.py b/tests/server/test_legacy_loggers.py
index e52e6fc4d9..ce3a9b9aec 100644
--- a/tests/server/test_legacy_loggers.py
+++ b/tests/server/test_legacy_loggers.py
@@ -16,6 +16,7 @@
 from collections import Counter
 from unittest import mock
 
+import pytest
 from deepsparse.legacy.loggers import PythonLogger
 from deepsparse.legacy.loggers.config import (
     PipelineSystemLoggingConfig,
@@ -30,7 +31,6 @@
 from deepsparse.server.deepsparse_server import DeepsparseServer
 from deepsparse.server.helpers import server_logger_from_config
 from fastapi.testclient import TestClient
-from flaky import flaky
 from tests.deepsparse.legacy.loggers.helpers import fetch_leaf_logger
 from tests.helpers import find_free_port
 from tests.test_data.server_test_data import SAMPLE_LOGS_DICT
@@ -106,7 +106,7 @@ def test_data_logging_from_predefined():
     assert log == expected_log
 
 
-@flaky(max_runs=4, min_passes=3)
+@pytest.mark.flaky(reruns=4, min_passes=3)
 def test_logging_only_system_info():
     server_config = ServerConfig(
         endpoints=[EndpointConfig(task=task, name=name, model=stub)],
@@ -195,7 +195,7 @@ def test_multiple_targets_logging():
     )
 
 
-@flaky(max_runs=3, min_passes=2)
+@pytest.mark.flaky(reruns=3, min_passes=2)
 def test_function_metric_with_target_loggers():
     server_config = ServerConfig(
         endpoints=[
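
Note (not part of the patch): a minimal standalone sketch of the past-token-queue decoding trick that the process_outputs.py change above relies on, assuming a SentencePiece-based (Llama-style) tokenizer loaded through transformers. The checkpoint name and the stream_decode helper are illustrative assumptions and do not appear in the diff.

    from transformers import AutoTokenizer

    # Illustrative checkpoint only; any Llama-style SentencePiece tokenizer behaves the same way.
    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")


    def stream_decode(prompt_ids, generated_ids):
        # Keep a rolling window of the last n token ids (n = prompt length) so the
        # tokenizer can reconstruct the prefix space of each newly generated token,
        # mirroring _generate_streamed_text_from_past_tokens above.
        queue = list(prompt_ids)
        for token_id in generated_ids:
            before = tokenizer.decode(queue, skip_special_tokens=True)
            queue.append(token_id)
            after = tokenizer.decode(queue, skip_special_tokens=True)
            queue.pop(0)
            yield after[len(before):]  # only the newly generated piece of text

Decoding each new token against the window of preceding tokens restores the leading spaces that a bare token-by-token tokenizer.decode call would otherwise drop.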