From 3e00175375ad101c419f5cf18dc5c2995ae8f309 Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Thu, 26 Oct 2023 13:22:35 -0400 Subject: [PATCH 01/43] Pipelines Refactor - Initial Impl (#1287) --- src/deepsparse/v2/__init__.py | 21 ++++ src/deepsparse/v2/operators/__init__.py | 17 +++ src/deepsparse/v2/operators/operator.py | 90 ++++++++++++++++ src/deepsparse/v2/pipeline.py | 102 ++++++++++++++++++ src/deepsparse/v2/routers/__init__.py | 17 +++ src/deepsparse/v2/routers/router.py | 95 ++++++++++++++++ src/deepsparse/v2/schedulers/__init__.py | 18 ++++ src/deepsparse/v2/schedulers/scheduler.py | 63 +++++++++++ .../v2/schedulers/scheduler_group.py | 64 +++++++++++ src/deepsparse/v2/utils/__init__.py | 18 ++++ src/deepsparse/v2/utils/context.py | 42 ++++++++ src/deepsparse/v2/utils/types.py | 28 +++++ tests/deepsparse/v2/__init__.py | 0 tests/deepsparse/v2/test_basic_pipeline.py | 45 ++++++++ 14 files changed, 620 insertions(+) create mode 100644 src/deepsparse/v2/__init__.py create mode 100644 src/deepsparse/v2/operators/__init__.py create mode 100644 src/deepsparse/v2/operators/operator.py create mode 100644 src/deepsparse/v2/pipeline.py create mode 100644 src/deepsparse/v2/routers/__init__.py create mode 100644 src/deepsparse/v2/routers/router.py create mode 100644 src/deepsparse/v2/schedulers/__init__.py create mode 100644 src/deepsparse/v2/schedulers/scheduler.py create mode 100644 src/deepsparse/v2/schedulers/scheduler_group.py create mode 100644 src/deepsparse/v2/utils/__init__.py create mode 100644 src/deepsparse/v2/utils/context.py create mode 100644 src/deepsparse/v2/utils/types.py create mode 100644 tests/deepsparse/v2/__init__.py create mode 100644 tests/deepsparse/v2/test_basic_pipeline.py diff --git a/src/deepsparse/v2/__init__.py b/src/deepsparse/v2/__init__.py new file mode 100644 index 0000000000..4a897be06f --- /dev/null +++ b/src/deepsparse/v2/__init__.py @@ -0,0 +1,21 @@ +# flake8: noqa + +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .pipeline import * +from .operators import * +from .routers import * +from .schedulers import * +from .utils import * diff --git a/src/deepsparse/v2/operators/__init__.py b/src/deepsparse/v2/operators/__init__.py new file mode 100644 index 0000000000..8f7e6a169d --- /dev/null +++ b/src/deepsparse/v2/operators/__init__.py @@ -0,0 +1,17 @@ +# flake8: noqa + +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .operator import * diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py new file mode 100644 index 0000000000..30e1a48379 --- /dev/null +++ b/src/deepsparse/v2/operators/operator.py @@ -0,0 +1,90 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import Optional, Type + +from pydantic import BaseModel + +from deepsparse.v2.utils import Context, OperatorSchema + + +__all__ = ["Operator"] + + +class Operator(ABC): + """ + Base operator class - can represent any part of an ML pipeline + """ + + # expected structured input and output types, to be defined by child classes + input_schema: Optional[Type[OperatorSchema]] = None + output_schema: Optional[Type[OperatorSchema]] = None + + @abstractmethod + def run(self, inp: OperatorSchema, context: Context) -> OperatorSchema: + """ + :param inp: operator input, as the defined input schema if applicable + :param context: pipeline context of already run operators + :return: result of this operator as the defined output schema if applicable + """ + raise NotImplementedError + + @classmethod + def has_input_schema(cls) -> bool: + """ + :return: True if this class has a defined pydantic input schema + """ + return issubclass(cls.input_schema, BaseModel) + + @classmethod + def has_output_schema(cls) -> bool: + """ + :return: True if this class has a defined pydantic input schema + """ + return issubclass(cls.output_schema, BaseModel) + + def __call__( + self, + *args, + context: Optional[Context] = None, + **kwargs, + ) -> OperatorSchema: + """ + Parses inputs to this Operator and runs the run() method of this operator + + :param args: an unnamed arg may only be provided + if it is of the type of the input_schema + :param context: pipeline context to pass to operator + :param kwargs: kwargs when not initializing from an instantiated schema + :return: operator output + """ + if len(args) > 1: + raise ValueError( + f"Only 1 unnamed arg may be supplied to an Operator, found {len(args)}" + ) + + if len(args) == 1: + if self.input_schema is not None and isinstance(args[0], self.input_schema): + inference_input = args[0] + else: + raise ValueError( + f"1 arg supplied to Operator {self.__class__.__name__} but was not " + f"of expected type {self.input_schema}, found {type(args[0])}" + ) + elif self.has_input_schema(): + inference_input = self.input_schema(**kwargs) + else: + inference_input = kwargs + return self.run(inference_input, context=context) diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py new file mode 100644 index 0000000000..0ec580687d --- /dev/null +++ b/src/deepsparse/v2/pipeline.py @@ -0,0 +1,102 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
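For orientation, a minimal sketch of the Operator contract above (illustrative only; IntSchema and AddOneOperator are toy names mirroring the dummy operators in the tests added later in this patch):

from pydantic import BaseModel
from deepsparse.v2.operators import Operator
from deepsparse.v2.utils import Context

class IntSchema(BaseModel):
    value: int

class AddOneOperator(Operator):
    # declare the expected structured input/output types
    input_schema = IntSchema
    output_schema = IntSchema

    def run(self, inp: IntSchema, context: Context) -> IntSchema:
        return IntSchema(value=inp.value + 1)

op = AddOneOperator()
# __call__ accepts either an instantiated input schema or raw kwargs
assert op(IntSchema(value=1), context=Context()).value == 2
assert op(value=1, context=Context()).value == 2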
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import List + +from pydantic import BaseModel, Field, PrivateAttr + +from deepsparse.v2.operators import Operator +from deepsparse.v2.routers import Router +from deepsparse.v2.schedulers import OperatorScheduler, SchedulerGroup + + +__all__ = ["Pipeline"] + + +class Pipeline(BaseModel): + """ + Pipeline accepts a series of operators, schedulers, and a router which define + an end to end ML transformation. + + Calling a pipeline runs these transformations + """ + + stages: List[Operator] = Field( + required=True, + description="In-order list of operators that make up this pipeline", + ) + router: Router = Field( + default_factor=Router, + description="Router object to determine order and run the stages. " + "Defaults to the base Router object", + ) + schedulers: List[OperatorScheduler] = Field( + default_factor=lambda: [OperatorScheduler()], + description="List of schedulers to run operators in order of priority", + ) + + _scheduler_group: SchedulerGroup = PrivateAttr() + + class Config: + arbitrary_types_allowed = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.validate() + + # SchedulerGroup handles running all schedulers in order of priority + self._scheduler_group = SchedulerGroup(self.schedulers) + + def __call__(self, *args, return_context: bool = False, **kwargs): + """ + :param return_context: if True, retrns tuple of the pipelien output + and entire context. Default False + :return: output of the pipeline stages ran with the router for the given input + """ + if len(args) > 1: + raise ValueError( + "Only 1 in-line argument may be supplied to Pipeline which " + f"must be a Schema, found: {len(args)}" + ) + if args and kwargs: + raise ValueError( + "Pipeline can only run either a single in-line argument schema or a " + f"series of kwargs, found {len(args)} args and {len(kwargs)} kwargs" + ) + + pipeline_input = args[0] or kwargs + pipeline_output, context = self.router.run( + inp=pipeline_input, + operators=self.stages, + scheduler=self._scheduler_group, + ) + + if return_context: + return pipeline_output, context + + return pipeline_output + + def validate(self): + router_validation = self.router.validate(self.stages) + + if router_validation is False: + # default error message + stage_types = [type(stage) for stage in self.stages] + raise ValueError( + f"Invalid Router: {type(self.router)} for stages: {stage_types}" + ) + elif isinstance(router_validation, str): + raise ValueError(f"Invalid Router for stages: {router_validation}") diff --git a/src/deepsparse/v2/routers/__init__.py b/src/deepsparse/v2/routers/__init__.py new file mode 100644 index 0000000000..8718bedeb4 --- /dev/null +++ b/src/deepsparse/v2/routers/__init__.py @@ -0,0 +1,17 @@ +# flake8: noqa + +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
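A hedged end-to-end sketch of the Pipeline API above, reusing the toy IntSchema/AddOneOperator from the previous sketch; the real exercise of this API lives in tests/deepsparse/v2/test_basic_pipeline.py further down:

from deepsparse.v2 import OperatorScheduler, Pipeline, Router

pipeline = Pipeline(
    stages=[AddOneOperator(), AddOneOperator()],
    router=Router(),
    schedulers=[OperatorScheduler()],
)
output, context = pipeline(IntSchema(value=5), return_context=True)
assert output.value == 7
# the returned Context records each executed operator with its input and output
print([type(stage.operator).__name__ for stage in context.stages_executed])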
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .router import * diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py new file mode 100644 index 0000000000..284c348c10 --- /dev/null +++ b/src/deepsparse/v2/routers/router.py @@ -0,0 +1,95 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import List, Tuple, Union + +from deepsparse.v2.operators import Operator +from deepsparse.v2.schedulers import OperatorScheduler +from deepsparse.v2.utils import Context, OperatorSchema + + +__all__ = ["Router"] + + +class Router: + """ + Routers must implement a run method which runs a series of operators + for a pipeline for a given input. Base Router runs operators linearly + in a series + """ + + @staticmethod + def run( + inp: OperatorSchema, + operators: List[Operator], + scheduler: OperatorScheduler, + ) -> Tuple[OperatorSchema, Context]: + """ + :param inp: input to the first operator of the series + :param operators: list of operators to run + :param scheduler: scheudler to submit operators to + :return: final output of the operators + """ + context = Context() + + # run operators linearly + operator_input = inp + for operator in operators: + output_future = scheduler.submit( + operator=operator, operator_input=operator_input, context=context + ) + + # wait for future to resolve + operator_output = output_future.result() + + # update context + context.update( + operator=operator, + input=operator_input, + output=operator_output, + ) + + # previous output becomes next input + operator_input = operator_output + + return operator_output, context + + @staticmethod + def validate(operators: List[Operator]) -> Union[bool, str]: + """ + :param operators: operators that this Router could potentially run over + :return: True if this Router can run this series of operators. Base Router + runs any series of operators that is non empty and whose input and output + schemas align. 
If not valid, either False or an error string will be + returned + """ + if len(operators) < 1: + return "No operators found" + + for idx in range(len(operators) - 1): + current_output_schema = operators[idx].output_schema + next_input_schema = operators[idx + 1].input_schema + + if current_output_schema is None or next_input_schema is None: + # if no input/output schema defined, assume operator can run + # without schema + continue + + if current_output_schema != next_input_schema: + return ( + f"Operator at idx {idx}: {type(operators[idx])} has invalid " + f"output schema {current_output_schema} for next operator " + f"{type(operators[idx + 1])} which requires {next_input_schema}" + ) diff --git a/src/deepsparse/v2/schedulers/__init__.py b/src/deepsparse/v2/schedulers/__init__.py new file mode 100644 index 0000000000..04c37077e1 --- /dev/null +++ b/src/deepsparse/v2/schedulers/__init__.py @@ -0,0 +1,18 @@ +# flake8: noqa + +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .scheduler import * +from .scheduler_group import * diff --git a/src/deepsparse/v2/schedulers/scheduler.py b/src/deepsparse/v2/schedulers/scheduler.py new file mode 100644 index 0000000000..53f0c8f625 --- /dev/null +++ b/src/deepsparse/v2/schedulers/scheduler.py @@ -0,0 +1,63 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from concurrent.futures import Future, ThreadPoolExecutor + +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import Context, OperatorSchema + + +__all__ = ["OperatorScheduler"] + + +class OperatorScheduler: + """ + OperatorSchedulers should implement a `submit` function that asynchronously + runs an operator and its input and returns a Future. 
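A short sketch of that submit contract, again assuming the toy IntSchema/AddOneOperator from the earlier sketches:

from deepsparse.v2 import OperatorScheduler
from deepsparse.v2.utils import Context

scheduler = OperatorScheduler(max_workers=1)
future = scheduler.submit(
    operator=AddOneOperator(),
    operator_input=IntSchema(value=3),
    context=Context(),
)
# result() blocks until the threadpool has finished running the operator
assert future.result().value == 4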
Priority of operators + to run and resources they are run on are deferred to specific OperatorScheduler + implementations + + Base OperatorScheduler behaves as a simple queue deferring to ThreadPoolExecutor + + :param max_workers: maximum number of threads to execute at once + """ + + def __init__(self, max_workers: int = 1): + self._threadpool = ThreadPoolExecutor(max_workers=max_workers) + + def submit( + self, + operator: Operator, + operator_input: OperatorSchema, + context: Context, + ) -> Future: + """ + :param operator: operator to run + :param operator_input: input schema to the operator + :param context: context of already run operators + :return: future referencing the asynchronously run output of the operator + """ + if isinstance(operator_input, dict): + return self._threadpool.submit(operator, context=context, **operator_input) + return self._threadpool.submit(operator, operator_input, context=context) + + def can_process(self, operator: Operator, operator_input: OperatorSchema) -> bool: + """ + :param operator: operator to check + :param operator_input: operator_input to check + :return: True if this Operator can process the given operator and input. + Base OperatorScheduler always returns True + """ + return True diff --git a/src/deepsparse/v2/schedulers/scheduler_group.py b/src/deepsparse/v2/schedulers/scheduler_group.py new file mode 100644 index 0000000000..2f797b30c7 --- /dev/null +++ b/src/deepsparse/v2/schedulers/scheduler_group.py @@ -0,0 +1,64 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from concurrent.futures import Future +from typing import List + +from deepsparse.v2.operators import Operator +from deepsparse.v2.schedulers.scheduler import OperatorScheduler +from deepsparse.v2.utils import Context, OperatorSchema + + +__all__ = ["SchedulerGroup"] + + +class SchedulerGroup(OperatorScheduler): + """ + Wrapper for a series of schedulers. Runs submitted operators on the first + scheduler that can process a given input + + :param schedulers: list of schedulers to pass operators to + """ + + def __init__(self, schedulers: List[OperatorScheduler]): + self.schedulers = schedulers + + def submit( + self, + operator: Operator, + operator_input: OperatorSchema, + context: Context, + ) -> Future: + """ + :param operator: operator to run + :param operator_input: input schema to the operator + :param context: context of already run operators + :return: future referencing the asynchronously run output of the operator + """ + for scheduler in self.schedulers: + if scheduler.can_process(operator, operator_input): + return scheduler.submit(operator, operator_input, context) + + def can_process(self, operator: Operator, operator_input: OperatorSchema) -> bool: + """ + :param operator: operator to check + :param operator_input: operator_input to check + :return: True if this Operator can process the given operator and input. 
+ SchedulerGroup always returns True + """ + return any( + scheduler.can_process(operator, operator_input) + for scheduler in self.schedulers + ) diff --git a/src/deepsparse/v2/utils/__init__.py b/src/deepsparse/v2/utils/__init__.py new file mode 100644 index 0000000000..4f36eeb448 --- /dev/null +++ b/src/deepsparse/v2/utils/__init__.py @@ -0,0 +1,18 @@ +# flake8: noqa + +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .context import * +from .types import * diff --git a/src/deepsparse/v2/utils/context.py b/src/deepsparse/v2/utils/context.py new file mode 100644 index 0000000000..81fe26de61 --- /dev/null +++ b/src/deepsparse/v2/utils/context.py @@ -0,0 +1,42 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Callable, List, NamedTuple + +from deepsparse.v2.utils.types import OperatorSchema + + +__all__ = ["Context"] + + +class StageInfo(NamedTuple): + operator: Callable + input: OperatorSchema + output: OperatorSchema + + +class Context: + """ + Context contains the full history of operators and their inputs and outputs + in a pipeline + """ + + def __init__(self): + self.stages_executed: List[StageInfo] = [] + + def update(self, operator: Callable, input: OperatorSchema, output: OperatorSchema): + self.stages_executed.append( + StageInfo(operator=operator, input=input, output=output) + ) diff --git a/src/deepsparse/v2/utils/types.py b/src/deepsparse/v2/utils/types.py new file mode 100644 index 0000000000..3e4b974453 --- /dev/null +++ b/src/deepsparse/v2/utils/types.py @@ -0,0 +1,28 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
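For reference, the Context above is a plain append-only log; a short sketch of writing and reading it back (toy operator and schema as in the earlier sketches):

from deepsparse.v2.utils import Context

context = Context()
context.update(
    operator=AddOneOperator(),
    input=IntSchema(value=1),
    output=IntSchema(value=2),
)
for stage in context.stages_executed:
    # each entry is a StageInfo(operator, input, output) namedtuple
    print(type(stage.operator).__name__, stage.input, "->", stage.output)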
+ +""" +Types to support deepsparse pipelines +""" + +from typing import Any, Dict, Union + +from pydantic import BaseModel + + +__all__ = ["OperatorSchema"] + + +# Operator inputs and outputs may either be a pydantic base model or a dict of kwargs +OperatorSchema = Union[BaseModel, Dict[str, Any]] diff --git a/tests/deepsparse/v2/__init__.py b/tests/deepsparse/v2/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/deepsparse/v2/test_basic_pipeline.py b/tests/deepsparse/v2/test_basic_pipeline.py new file mode 100644 index 0000000000..d39bc61c8c --- /dev/null +++ b/tests/deepsparse/v2/test_basic_pipeline.py @@ -0,0 +1,45 @@ +""" +Simple example and test of a dummy pipeline +""" + +from pydantic import BaseModel + +from deepsparse.v2 import Pipeline +from deepsparse.v2.operators import Operator +from deepsparse.v2.routers import Router +from deepsparse.v2.schedulers import OperatorScheduler +from deepsparse.v2.utils import Context, OperatorSchema + + +class IntSchema(BaseModel): + value: int + + +class AddOneOperator(Operator): + input_schema = IntSchema + output_schema = IntSchema + + def run(self, inp: IntSchema, context: Context) -> OperatorSchema: + return IntSchema(value=inp.value + 1) + + +class AddTwoOperator(Operator): + input_schema = IntSchema + output_schema = IntSchema + + def run(self, inp: IntSchema, context: Context) -> OperatorSchema: + return IntSchema(value=inp.value + 2) + + +AddThreePipeline = Pipeline( + stages=[AddOneOperator(), AddTwoOperator()], + router=Router(), + schedulers=[OperatorScheduler()], +) + + +def test_run_simple_pipeline(): + pipeline_input = IntSchema(value=5) + pipeline_output = AddThreePipeline(pipeline_input) + + assert pipeline_output.value == 8 From 224e116dbd0dea213021ccf87b12577d8a408b55 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 31 Oct 2023 16:24:10 -0400 Subject: [PATCH 02/43] [Pipeline Refactor] Additional functionality, engine operator, linear router and image classification pipeline/operators/example (#1325) * initial functionality and working example with image classification * remove testing image * update args * initial functionality and working example with image classification * remove testing image * pr comments * defines schemas for operators and test * add image classification test, PR comments * fix input/output handling in pipeline and operator base classes to be more generic; remove context * add additional operator input message * typo fix --- src/deepsparse/v2/__init__.py | 2 +- .../v2/image_classification/__init__.py | 20 +++ .../v2/image_classification/pipeline.py | 62 ++++++++ .../postprocess_operator.py | 81 ++++++++++ .../preprocess_operator.py | 149 ++++++++++++++++++ .../v2/operators/engine_operator.py | 133 ++++++++++++++++ src/deepsparse/v2/operators/operator.py | 92 +++++++---- src/deepsparse/v2/pipeline.py | 130 ++++++++------- src/deepsparse/v2/routers/router.py | 88 ++++++----- src/deepsparse/v2/schedulers/scheduler.py | 14 +- .../v2/schedulers/scheduler_group.py | 16 +- src/deepsparse/v2/utils/__init__.py | 1 - src/deepsparse/v2/utils/context.py | 42 ----- tests/deepsparse/v2/__init__.py | 13 ++ tests/deepsparse/v2/test_basic_pipeline.py | 31 +++- .../v2/test_image_classification.py | 39 +++++ 16 files changed, 709 insertions(+), 204 deletions(-) create mode 100644 src/deepsparse/v2/image_classification/__init__.py create mode 100644 src/deepsparse/v2/image_classification/pipeline.py create mode 100644 src/deepsparse/v2/image_classification/postprocess_operator.py create 
mode 100644 src/deepsparse/v2/image_classification/preprocess_operator.py create mode 100644 src/deepsparse/v2/operators/engine_operator.py delete mode 100644 src/deepsparse/v2/utils/context.py create mode 100644 tests/deepsparse/v2/test_image_classification.py diff --git a/src/deepsparse/v2/__init__.py b/src/deepsparse/v2/__init__.py index 4a897be06f..29fcd4126c 100644 --- a/src/deepsparse/v2/__init__.py +++ b/src/deepsparse/v2/__init__.py @@ -14,8 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .pipeline import * from .operators import * +from .pipeline import * from .routers import * from .schedulers import * from .utils import * diff --git a/src/deepsparse/v2/image_classification/__init__.py b/src/deepsparse/v2/image_classification/__init__.py new file mode 100644 index 0000000000..8668227df7 --- /dev/null +++ b/src/deepsparse/v2/image_classification/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# flake8: noqa +from .postprocess_operator import * +from .preprocess_operator import * + + +from .pipeline import * # isort:skip diff --git a/src/deepsparse/v2/image_classification/pipeline.py b/src/deepsparse/v2/image_classification/pipeline.py new file mode 100644 index 0000000000..3d7887a701 --- /dev/null +++ b/src/deepsparse/v2/image_classification/pipeline.py @@ -0,0 +1,62 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import warnings +from typing import Dict, Optional, Tuple, Union + +from deepsparse.v2.image_classification.postprocess_operator import ( + ImageClassificationPostProcess, +) +from deepsparse.v2.image_classification.preprocess_operator import ( + ImageClassificationPreProcess, +) +from deepsparse.v2.operators.engine_operator import EngineOperator +from deepsparse.v2.pipeline import Pipeline +from deepsparse.v2.routers.router import LinearRouter +from deepsparse.v2.schedulers.scheduler import OperatorScheduler + + +_LOGGER = logging.getLogger(__name__) + +__all__ = ["ImageClassificationPipeline"] + + +class ImageClassificationPipeline(Pipeline): + def __init__( + self, + model_path: str, + engine_kwargs: Optional[Dict] = None, + class_names: Union[None, str, Dict[str, str]] = None, + image_size: Optional[Tuple[int]] = None, + top_k: int = 1, + ): + if not engine_kwargs: + engine_kwargs = {} + engine_kwargs["model_path"] = model_path + elif engine_kwargs.get("model_path") != model_path: + warnings.warn(f"Updating engine_kwargs to include {model_path}") + + engine = EngineOperator(**engine_kwargs) + preproces = ImageClassificationPreProcess( + model_path=engine.model_path, image_size=image_size + ) + postprocess = ImageClassificationPostProcess( + top_k=top_k, class_names=class_names + ) + + ops = [preproces, engine, postprocess] + router = LinearRouter(end_route=len(ops)) + scheduler = [OperatorScheduler()] + super().__init__(ops=ops, router=router, schedulers=scheduler) diff --git a/src/deepsparse/v2/image_classification/postprocess_operator.py b/src/deepsparse/v2/image_classification/postprocess_operator.py new file mode 100644 index 0000000000..9231113368 --- /dev/null +++ b/src/deepsparse/v2/image_classification/postprocess_operator.py @@ -0,0 +1,81 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from typing import Dict, List, Union + +import numpy +from pydantic import BaseModel, Field + +from deepsparse.v2.operators import Operator + + +class ImageClassificationOutput(BaseModel): + """ + Output model for image classification + """ + + labels: List[Union[int, str, List[int], List[str]]] = Field( + description="List of labels, one for each prediction" + ) + scores: List[Union[float, List[float]]] = Field( + description="List of scores, one for each prediction" + ) + + +__all__ = ["ImageClassificationPostProcess"] + + +class ImageClassificationPostProcess(Operator): + """ + Image Classification post-processing Operator. This Operator is responsible for + processing outputs from the engine and returning the classification results to + the user, using the ImageClassifcationOutput structure. 
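A hedged sketch of constructing and calling the image classification pipeline above; the SparseZoo stub and image path are placeholders, not values taken from this PR:

from deepsparse.v2.image_classification import ImageClassificationPipeline

pipeline = ImageClassificationPipeline(
    model_path="zoo:cv/classification/resnet_v1-50/pytorch/sparseml/imagenet/pruned95-none",
    top_k=3,
)
# images may be file paths, raw arrays, or PIL images per the preprocess operator
output = pipeline(images=["sample_image.jpg"])
print(output.labels, output.scores)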
+ """ + + input_schema = None + output_schema = ImageClassificationOutput + + def __init__( + self, top_k: int = 1, class_names: Union[None, str, Dict[str, str]] = None + ): + self.top_k = top_k + if isinstance(class_names, str) and class_names.endswith(".json"): + self._class_names = json.load(open(class_names)) + elif isinstance(class_names, dict): + self._class_names = class_names + else: + self._class_names = None + + def run(self, inp: "EngineOperatorOutputs", **kwargs) -> Dict: # noqa: F821 + labels, scores = [], [] + inp = inp.engine_outputs + for prediction_batch in inp[0]: + label = (-prediction_batch).argsort()[: self.top_k] + score = prediction_batch[label] + labels.append(label) + scores.append(score.tolist()) + + if self._class_names is not None: + labels = numpy.vectorize(self._class_names.__getitem__)(labels) + labels = labels.tolist() + + if isinstance(labels[0], numpy.ndarray): + labels = [label.tolist() for label in labels] + + if len(labels) == 1: + labels = labels[0] + scores = scores[0] + + return {"scores": scores, "labels": labels} diff --git a/src/deepsparse/v2/image_classification/preprocess_operator.py b/src/deepsparse/v2/image_classification/preprocess_operator.py new file mode 100644 index 0000000000..9b4517a44c --- /dev/null +++ b/src/deepsparse/v2/image_classification/preprocess_operator.py @@ -0,0 +1,149 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Optional, Tuple + +import numpy +import onnx +from PIL import Image +from torchvision import transforms + +from deepsparse.image_classification.constants import ( + IMAGENET_RGB_MEANS, + IMAGENET_RGB_STDS, +) +from deepsparse.pipelines.computer_vision import ComputerVisionSchema +from deepsparse.v2.operators import Operator + + +class ImageClassificationInput(ComputerVisionSchema): + """ + Input model for image classification + """ + + +__all__ = ["ImageClassificationPreProcess"] + + +class ImageClassificationPreProcess(Operator): + """ + Image Classification pre-processing operator. This Operator is expected to process + the user inputs and prepare them for the engine. Inputs to this Operator are + expected to follow the ImageClassificationInput schema. 
+ """ + + input_schema = ImageClassificationInput + output_schema = None + + def __init__(self, model_path: str, image_size: Optional[Tuple[int]] = None): + self.model_path = model_path + self._image_size = image_size or self._infer_image_size() + non_rand_resize_scale = 256.0 / 224.0 # standard used + self._pre_normalization_transforms = transforms.Compose( + [ + transforms.Resize( + tuple( + [ + round(non_rand_resize_scale * size) + for size in self._image_size + ] + ) + ), + transforms.CenterCrop(self._image_size), + ] + ) + + def run(self, inp: ImageClassificationInput, **kwargs) -> Dict: + """ + Pre-Process the Inputs for DeepSparse Engine + + :param inputs: input model + :return: list of preprocessed numpy arrays + """ + + if isinstance(inp.images, numpy.ndarray): + image_batch = inp.images + else: + if isinstance(inp.images, str): + inp.images = [inp.images] + + image_batch = list(map(self._preprocess_image, inp.images)) + + # build batch + image_batch = numpy.stack(image_batch, axis=0) + + original_dtype = image_batch.dtype + image_batch = numpy.ascontiguousarray(image_batch, dtype=numpy.float32) + + if original_dtype == numpy.uint8: + image_batch /= 255 + # normalize entire batch + image_batch -= numpy.asarray(IMAGENET_RGB_MEANS).reshape((-1, 3, 1, 1)) + image_batch /= numpy.asarray(IMAGENET_RGB_STDS).reshape((-1, 3, 1, 1)) + + return {"engine_inputs": [image_batch]} + + def _preprocess_image(self, image) -> numpy.ndarray: + if isinstance(image, List): + # image given as raw list + image = numpy.asarray(image) + if image.dtype == numpy.float32: + # image is already processed, append and continue + return image + # assume raw image input + # put image in PIL format for torchvision processing + image = image.astype(numpy.uint8) + if image.shape[0] < image.shape[-1]: + # put channel last + image = numpy.einsum("cwh->whc", image) + image = Image.fromarray(image) + elif isinstance(image, str): + # load image from string filepath + image = Image.open(image).convert("RGB") + elif isinstance(image, numpy.ndarray): + image = image.astype(numpy.uint8) + if image.shape[0] < image.shape[-1]: + # put channel last + image = numpy.einsum("cwh->whc", image) + image = Image.fromarray(image) + + if not isinstance(image, Image.Image): + raise ValueError( + f"inputs to {self.__class__.__name__} must be a string image " + "file path(s), a list representing a raw image, " + "PIL.Image.Image object(s), or a numpy array representing" + f"the entire pre-processed batch. Found {type(image)}" + ) + + # apply resize and center crop + image = self._pre_normalization_transforms(image) + image_numpy = numpy.array(image) + image.close() + + # make channel first dimension + image_numpy = image_numpy.transpose(2, 0, 1) + return image_numpy + + def _infer_image_size(self) -> Tuple[int, ...]: + """ + Infer and return the expected shape of the input tensor + + :return: The expected shape of the input tensor from onnx graph + """ + onnx_model = onnx.load(self.model_path) + input_tensor = onnx_model.graph.input[0] + return ( + input_tensor.type.tensor_type.shape.dim[2].dim_value, + input_tensor.type.tensor_type.shape.dim[3].dim_value, + ) diff --git a/src/deepsparse/v2/operators/engine_operator.py b/src/deepsparse/v2/operators/engine_operator.py new file mode 100644 index 0000000000..aac94a7697 --- /dev/null +++ b/src/deepsparse/v2/operators/engine_operator.py @@ -0,0 +1,133 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Optional, Union + +from pydantic import BaseModel, Field + +from deepsparse import Context, Engine, MultiModelEngine, Scheduler +from deepsparse.benchmark import ORTEngine +from deepsparse.utils import join_engine_outputs, model_to_path, split_engine_inputs +from deepsparse.v2.operators import Operator + + +DEEPSPARSE_ENGINE = "deepsparse" +ORT_ENGINE = "onnxruntime" + +SUPPORTED_PIPELINE_ENGINES = [DEEPSPARSE_ENGINE, ORT_ENGINE] + +__all__ = ["EngineOperator"] + + +class EngineOperatorInputs(BaseModel): + engine_inputs: List = Field(description="engine_inputs") + + +class EngineOperatorOutputs(BaseModel): + engine_outputs: List = Field(description="engine outputs") + + +class EngineOperator(Operator): + input_schema = EngineOperatorInputs + output_schema = EngineOperatorOutputs + + def __init__( + self, + model_path: str, + engine_type: str = DEEPSPARSE_ENGINE, + batch_size: Optional[int] = 1, + num_cores: int = None, + num_streams: int = None, + scheduler: Scheduler = None, + input_shapes: List[List[int]] = None, + engine_context: Optional[Context] = None, + ): + + self._batch_size = batch_size + self.model_path = model_to_path(model_path) + self.engine_context = engine_context + + if self.engine_context is not None: + num_cores = num_cores or self.engine_context.num_cores + if self.engine_context.num_cores != num_cores: + raise ValueError( + f"num_cores mismatch. Expected {self.engine_context.num_cores} " + f"from passed context, but got {num_cores} while " + f"instantiating Pipeline" + ) + + engine_args = dict( + batch_size=self._batch_size, + num_cores=num_cores, + input_shapes=input_shapes, + ) + if engine_type.lower() == DEEPSPARSE_ENGINE: + engine_args["scheduler"] = scheduler + engine_args["num_streams"] = num_streams + + self.engine = self._create_engine(self.model_path, engine_type, engine_args) + + def _create_engine( + self, onnx_file_path: str, engine_type: str, engine_args: Dict + ) -> Union[Engine, MultiModelEngine, ORTEngine]: + """ + Create an inference engine for a given ONNX model + + :param onnx_file_path: path to ONNX model file + :param engine_type: type of engine to create. + :param engine_args: arguments to pass to engine constructor + :param context: context to use for engine + :return: inference engine + """ + engine_type = engine_type.lower() + + if engine_type == DEEPSPARSE_ENGINE: + if self.engine_context is not None and isinstance( + self.engine_context, Context + ): + engine_args.pop("num_cores", None) + engine_args.pop("scheduler", None) + engine_args.pop("num_streams", None) + engine_args["context"] = self.engien_context + return MultiModelEngine( + model=onnx_file_path, + **engine_args, + ) + engine_args.pop("cache_output_bools", None) + return Engine(onnx_file_path, **engine_args) + + if engine_type == ORT_ENGINE: + return ORTEngine(onnx_file_path, **engine_args) + + raise ValueError( + f"Unknown engine_type {engine_type}. 
Supported values include: " + f"{SUPPORTED_PIPELINE_ENGINES}" + ) + + def run(self, inp: EngineOperatorInputs) -> Dict: + inp = inp.engine_inputs + batches, orig_batch_size = self.expand_inputs(engine_inputs=inp) + batches_outputs = list(map(self.engine, batches)) + engine_outputs = self.condense_inputs( + batch_outputs=batches_outputs, orig_batch_size=orig_batch_size + ) + return {"engine_outputs": engine_outputs} + + def expand_inputs(self, **kwargs): + return split_engine_inputs(kwargs["engine_inputs"], self._batch_size) + + def condense_inputs(self, **kwargs): + batch_outputs = kwargs["batch_outputs"] + orig_batch_size = kwargs["orig_batch_size"] + return join_engine_outputs(batch_outputs, orig_batch_size) diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py index 30e1a48379..c3a3e28b78 100644 --- a/src/deepsparse/v2/operators/operator.py +++ b/src/deepsparse/v2/operators/operator.py @@ -13,39 +13,32 @@ # limitations under the License. from abc import ABC, abstractmethod -from typing import Optional, Type +from typing import Any, Optional, Type from pydantic import BaseModel -from deepsparse.v2.utils import Context, OperatorSchema - __all__ = ["Operator"] class Operator(ABC): """ - Base operator class - can represent any part of an ML pipeline + Base operator class - an operator should be defined for each atomic, functional + part of the pipeline. """ # expected structured input and output types, to be defined by child classes - input_schema: Optional[Type[OperatorSchema]] = None - output_schema: Optional[Type[OperatorSchema]] = None - - @abstractmethod - def run(self, inp: OperatorSchema, context: Context) -> OperatorSchema: - """ - :param inp: operator input, as the defined input schema if applicable - :param context: pipeline context of already run operators - :return: result of this operator as the defined output schema if applicable - """ - raise NotImplementedError + input_schema: Optional[Type[BaseModel]] = None + output_schema: Optional[Type[BaseModel]] = None @classmethod def has_input_schema(cls) -> bool: """ :return: True if this class has a defined pydantic input schema """ + if not cls.input_schema: + return False + return issubclass(cls.input_schema, BaseModel) @classmethod @@ -53,38 +46,73 @@ def has_output_schema(cls) -> bool: """ :return: True if this class has a defined pydantic input schema """ + if not cls.output_schema: + return False + return issubclass(cls.output_schema, BaseModel) def __call__( self, *args, - context: Optional[Context] = None, **kwargs, - ) -> OperatorSchema: + ) -> Any: """ Parses inputs to this Operator and runs the run() method of this operator - :param args: an unnamed arg may only be provided - if it is of the type of the input_schema + :param args: an unnamed arg may only be provided if it is of the type of the + input_schema :param context: pipeline context to pass to operator :param kwargs: kwargs when not initializing from an instantiated schema :return: operator output """ - if len(args) > 1: - raise ValueError( - f"Only 1 unnamed arg may be supplied to an Operator, found {len(args)}" - ) - - if len(args) == 1: - if self.input_schema is not None and isinstance(args[0], self.input_schema): + if self.has_input_schema(): + if len(args) > 1: + raise ValueError( + f"The operator requires an {self.input_schema}. Too many arguments" + "provided." 
+ ) + elif args and isinstance(args[0], self.input_schema): inference_input = args[0] + elif kwargs: + inference_input = self.input_schema(**kwargs) else: raise ValueError( - f"1 arg supplied to Operator {self.__class__.__name__} but was not " - f"of expected type {self.input_schema}, found {type(args[0])}" + "Can't resolve inputs. The values for the schema must be provided" + "in the form of a dictionary or an instance of the input_schema" + "object" ) - elif self.has_input_schema(): - inference_input = self.input_schema(**kwargs) + + run_output = self.run(inference_input) else: - inference_input = kwargs - return self.run(inference_input, context=context) + run_output = self.run(*args, **kwargs) + + if self.has_output_schema(): + return self.output_schema(**run_output) + return run_output + + @abstractmethod + def run(self, *args, **kwargs) -> Any: + """ + :param inp: operator input, as the defined input schema if applicable + :param context: pipeline context of already run operators + :return: result of this operator as the defined output schema if applicable + """ + raise NotImplementedError + + def expand_inputs(self, **kwargs): + """ + Generic function to handle expanding values. + """ + raise NotImplementedError + + def condense_inputs(self, **kwargs): + """ + Generic function to handle condensing values. + """ + raise NotImplementedError + + def yaml(self): + pass + + def json(self): + pass diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py index 0ec580687d..e58f8a5191 100644 --- a/src/deepsparse/v2/pipeline.py +++ b/src/deepsparse/v2/pipeline.py @@ -13,9 +13,7 @@ # limitations under the License. -from typing import List - -from pydantic import BaseModel, Field, PrivateAttr +from typing import Dict, List, Union from deepsparse.v2.operators import Operator from deepsparse.v2.routers import Router @@ -25,78 +23,90 @@ __all__ = ["Pipeline"] -class Pipeline(BaseModel): +class Pipeline(Operator): """ - Pipeline accepts a series of operators, schedulers, and a router which define - an end to end ML transformation. + Pipeline accepts a series of operators, schedulers, and a router. Calling a pipeline + will use the router to run through all the defined operators. The operators should + be implemented using the Operator class and each implemented Operator should be + responsible for a functional component of the pipelines. The flow of inputs/outputs + between the operators and the steps in the pipeline should be defined by the router, + (based off of the Router class), which dicates the next operator in the pipeline. + Execution of the operators will be handled by the provided schedulers. + + :param ops: Operators to run within the pipeline. Can either be a list of operators + or dictionary of operators. + :param router: A Router which dictates the next operator to call. + :param schedulers: A list of schedulers to run operators. - Calling a pipeline runs these transformations """ - stages: List[Operator] = Field( - required=True, - description="In-order list of operators that make up this pipeline", - ) - router: Router = Field( - default_factor=Router, - description="Router object to determine order and run the stages. 
" - "Defaults to the base Router object", - ) - schedulers: List[OperatorScheduler] = Field( - default_factor=lambda: [OperatorScheduler()], - description="List of schedulers to run operators in order of priority", - ) - - _scheduler_group: SchedulerGroup = PrivateAttr() - - class Config: - arbitrary_types_allowed = True - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__( + self, + ops: Union[Dict[str, Operator], List[Operator]], + router: Router, + schedulers: List[OperatorScheduler], + ): + self.ops = ops + self.router = router + self.schedulers = schedulers self.validate() # SchedulerGroup handles running all schedulers in order of priority self._scheduler_group = SchedulerGroup(self.schedulers) - def __call__(self, *args, return_context: bool = False, **kwargs): + def run(self, *args, **kwargs): + """ + Run through the operators using the provided router and scheduler. Update the + context to reflect each step of the router. The input to a given operator is the + output of the previous operator. + + :param inp: input to the operator. expected to be of any type that is + expected by the operator. + :param context: context to store the current the inputs, outputs, and operator + for each step of the router. + + """ + next_step = self.router.START_ROUTE + operator_output = None + while next_step != self.router.END_ROUTE: + # Either a dictionary key or valid index + operator = self.ops[next_step] + if next_step == self.router.START_ROUTE: + output_future = self._scheduler_group.submit( + *args, operator=operator, **kwargs + ) + else: + if isinstance(operator_output, dict): + output_future = self._scheduler_group.submit( + operator=operator, **operator_output + ) + else: + output_future = self._scheduler_group.submit( + operator_output, operator=operator + ) + + # wait for future to resolve + operator_output = output_future.result() + next_step = self.router.next(next_step, self.ops) + return operator_output + + def __call__(self, *args, **kwargs): """ - :param return_context: if True, retrns tuple of the pipelien output - and entire context. Default False - :return: output of the pipeline stages ran with the router for the given input + :return: output of the pipeline operators ran with the router for the given + input """ - if len(args) > 1: - raise ValueError( - "Only 1 in-line argument may be supplied to Pipeline which " - f"must be a Schema, found: {len(args)}" - ) - if args and kwargs: - raise ValueError( - "Pipeline can only run either a single in-line argument schema or a " - f"series of kwargs, found {len(args)} args and {len(kwargs)} kwargs" - ) - - pipeline_input = args[0] or kwargs - pipeline_output, context = self.router.run( - inp=pipeline_input, - operators=self.stages, - scheduler=self._scheduler_group, - ) - - if return_context: - return pipeline_output, context - - return pipeline_output + return self.run(*args, **kwargs) def validate(self): - router_validation = self.router.validate(self.stages) + """ + Validate that compatability of the router and operators provided. 
+ """ + router_validation = self.router.validate(self.ops) if router_validation is False: # default error message - stage_types = [type(stage) for stage in self.stages] - raise ValueError( - f"Invalid Router: {type(self.router)} for stages: {stage_types}" - ) + op_types = [type(op) for op in self.ops] + raise ValueError(f"Invalid Router: {type(self.router)} for ops: {op_types}") elif isinstance(router_validation, str): - raise ValueError(f"Invalid Router for stages: {router_validation}") + raise ValueError(f"Invalid Router for operators: {router_validation}") diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py index 284c348c10..6050803b5e 100644 --- a/src/deepsparse/v2/routers/router.py +++ b/src/deepsparse/v2/routers/router.py @@ -13,61 +13,70 @@ # limitations under the License. -from typing import List, Tuple, Union +import logging +from abc import abstractmethod +from typing import Dict, List, Union from deepsparse.v2.operators import Operator -from deepsparse.v2.schedulers import OperatorScheduler -from deepsparse.v2.utils import Context, OperatorSchema -__all__ = ["Router"] +_LOGGER = logging.getLogger(__name__) + +__all__ = ["Router", "LinearRouter"] class Router: """ - Routers must implement a run method which runs a series of operators - for a pipeline for a given input. Base Router runs operators linearly - in a series + Routers dicate the next operator to run. Each Router must implement a next function, + which dictates the index or key of the next operator to run. + + :param start_route: the start index or key of the router + :param end_route: the end index or key of the router + """ - @staticmethod - def run( - inp: OperatorSchema, - operators: List[Operator], - scheduler: OperatorScheduler, - ) -> Tuple[OperatorSchema, Context]: + def __init__(self, end_route: Union[str, int], start_route: Union[str, int]): + self.START_ROUTE = start_route + self.END_ROUTE = end_route + + @abstractmethod + def next( + self, past: Union[str, int], ops: Union[List[Operator], Dict[str, Operator]] + ) -> Union[str, int]: """ - :param inp: input to the first operator of the series - :param operators: list of operators to run - :param scheduler: scheudler to submit operators to - :return: final output of the operators + Determines the index or dictionary key for the next operator which should run. + + :param past: the previous index or key. This should uniquely determine the next + operator to run + :param ops: list or dictionary of operators + :returns: the next index or dictionary key for the next operator to run """ - context = Context() + raise NotImplementedError + + def yaml(self): + pass - # run operators linearly - operator_input = inp - for operator in operators: - output_future = scheduler.submit( - operator=operator, operator_input=operator_input, context=context - ) + def json(self): + pass - # wait for future to resolve - operator_output = output_future.result() - # update context - context.update( - operator=operator, - input=operator_input, - output=operator_output, - ) +class LinearRouter(Router): + """ + LinearRouterruns a list of Operators in sequential order. end_route should + be the length of the list and the start_route should be the start index. 
+ """ - # previous output becomes next input - operator_input = operator_output + def __init__(self, end_route: int, start_route: int = 0): + super().__init__(end_route=end_route, start_route=start_route) - return operator_output, context + def next(self, past: int, ops: List[Operator]) -> int: + new_index = past + 1 + if new_index < self.END_ROUTE: + return new_index + return self.END_ROUTE @staticmethod - def validate(operators: List[Operator]) -> Union[bool, str]: + def validate(operators: List[Operator]) -> bool: """ :param operators: operators that this Router could potentially run over :return: True if this Router can run this series of operators. Base Router @@ -76,7 +85,8 @@ def validate(operators: List[Operator]) -> Union[bool, str]: returned """ if len(operators) < 1: - return "No operators found" + _LOGGER.info("No operators provided") + return False for idx in range(len(operators) - 1): current_output_schema = operators[idx].output_schema @@ -88,8 +98,10 @@ def validate(operators: List[Operator]) -> Union[bool, str]: continue if current_output_schema != next_input_schema: - return ( + _LOGGER.info( f"Operator at idx {idx}: {type(operators[idx])} has invalid " f"output schema {current_output_schema} for next operator " f"{type(operators[idx + 1])} which requires {next_input_schema}" ) + return False + return True diff --git a/src/deepsparse/v2/schedulers/scheduler.py b/src/deepsparse/v2/schedulers/scheduler.py index 53f0c8f625..7d4f249444 100644 --- a/src/deepsparse/v2/schedulers/scheduler.py +++ b/src/deepsparse/v2/schedulers/scheduler.py @@ -16,7 +16,6 @@ from concurrent.futures import Future, ThreadPoolExecutor from deepsparse.v2.operators import Operator -from deepsparse.v2.utils import Context, OperatorSchema __all__ = ["OperatorScheduler"] @@ -37,23 +36,16 @@ class OperatorScheduler: def __init__(self, max_workers: int = 1): self._threadpool = ThreadPoolExecutor(max_workers=max_workers) - def submit( - self, - operator: Operator, - operator_input: OperatorSchema, - context: Context, - ) -> Future: + def submit(self, *args, operator: Operator, **kwargs) -> Future: """ :param operator: operator to run :param operator_input: input schema to the operator :param context: context of already run operators :return: future referencing the asynchronously run output of the operator """ - if isinstance(operator_input, dict): - return self._threadpool.submit(operator, context=context, **operator_input) - return self._threadpool.submit(operator, operator_input, context=context) + return self._threadpool.submit(operator, *args, **kwargs) - def can_process(self, operator: Operator, operator_input: OperatorSchema) -> bool: + def can_process(self, *args, operator: Operator, **kwargs) -> bool: """ :param operator: operator to check :param operator_input: operator_input to check diff --git a/src/deepsparse/v2/schedulers/scheduler_group.py b/src/deepsparse/v2/schedulers/scheduler_group.py index 2f797b30c7..7f00a3c17c 100644 --- a/src/deepsparse/v2/schedulers/scheduler_group.py +++ b/src/deepsparse/v2/schedulers/scheduler_group.py @@ -18,7 +18,6 @@ from deepsparse.v2.operators import Operator from deepsparse.v2.schedulers.scheduler import OperatorScheduler -from deepsparse.v2.utils import Context, OperatorSchema __all__ = ["SchedulerGroup"] @@ -35,12 +34,7 @@ class SchedulerGroup(OperatorScheduler): def __init__(self, schedulers: List[OperatorScheduler]): self.schedulers = schedulers - def submit( - self, - operator: Operator, - operator_input: OperatorSchema, - context: Context, - ) -> Future: 
+ def submit(self, *args, operator: Operator, **kwargs) -> Future: """ :param operator: operator to run :param operator_input: input schema to the operator @@ -48,10 +42,10 @@ def submit( :return: future referencing the asynchronously run output of the operator """ for scheduler in self.schedulers: - if scheduler.can_process(operator, operator_input): - return scheduler.submit(operator, operator_input, context) + if scheduler.can_process(*args, operator=operator, **kwargs): + return scheduler.submit(*args, operator=operator, **kwargs) - def can_process(self, operator: Operator, operator_input: OperatorSchema) -> bool: + def can_process(self, *args, operator: Operator, **kwargs) -> bool: """ :param operator: operator to check :param operator_input: operator_input to check @@ -59,6 +53,6 @@ def can_process(self, operator: Operator, operator_input: OperatorSchema) -> boo SchedulerGroup always returns True """ return any( - scheduler.can_process(operator, operator_input) + scheduler.can_process(*args, operator=operator, **kwargs) for scheduler in self.schedulers ) diff --git a/src/deepsparse/v2/utils/__init__.py b/src/deepsparse/v2/utils/__init__.py index 4f36eeb448..a36d8e92ec 100644 --- a/src/deepsparse/v2/utils/__init__.py +++ b/src/deepsparse/v2/utils/__init__.py @@ -14,5 +14,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .context import * from .types import * diff --git a/src/deepsparse/v2/utils/context.py b/src/deepsparse/v2/utils/context.py deleted file mode 100644 index 81fe26de61..0000000000 --- a/src/deepsparse/v2/utils/context.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Callable, List, NamedTuple - -from deepsparse.v2.utils.types import OperatorSchema - - -__all__ = ["Context"] - - -class StageInfo(NamedTuple): - operator: Callable - input: OperatorSchema - output: OperatorSchema - - -class Context: - """ - Context contains the full history of operators and their inputs and outputs - in a pipeline - """ - - def __init__(self): - self.stages_executed: List[StageInfo] = [] - - def update(self, operator: Callable, input: OperatorSchema, output: OperatorSchema): - self.stages_executed.append( - StageInfo(operator=operator, input=input, output=output) - ) diff --git a/tests/deepsparse/v2/__init__.py b/tests/deepsparse/v2/__init__.py index e69de29bb2..0c44f887a4 100644 --- a/tests/deepsparse/v2/__init__.py +++ b/tests/deepsparse/v2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/deepsparse/v2/test_basic_pipeline.py b/tests/deepsparse/v2/test_basic_pipeline.py index d39bc61c8c..9f85e4976e 100644 --- a/tests/deepsparse/v2/test_basic_pipeline.py +++ b/tests/deepsparse/v2/test_basic_pipeline.py @@ -1,14 +1,29 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ Simple example and test of a dummy pipeline """ +from typing import Dict + from pydantic import BaseModel from deepsparse.v2 import Pipeline from deepsparse.v2.operators import Operator -from deepsparse.v2.routers import Router +from deepsparse.v2.routers import LinearRouter from deepsparse.v2.schedulers import OperatorScheduler -from deepsparse.v2.utils import Context, OperatorSchema class IntSchema(BaseModel): @@ -19,21 +34,21 @@ class AddOneOperator(Operator): input_schema = IntSchema output_schema = IntSchema - def run(self, inp: IntSchema, context: Context) -> OperatorSchema: - return IntSchema(value=inp.value + 1) + def run(self, inp: IntSchema) -> Dict: + return {"value": inp.value + 1} class AddTwoOperator(Operator): input_schema = IntSchema output_schema = IntSchema - def run(self, inp: IntSchema, context: Context) -> OperatorSchema: - return IntSchema(value=inp.value + 2) + def run(self, inp: IntSchema) -> Dict: + return {"value": inp.value + 2} AddThreePipeline = Pipeline( - stages=[AddOneOperator(), AddTwoOperator()], - router=Router(), + ops=[AddOneOperator(), AddTwoOperator()], + router=LinearRouter(end_route=2), schedulers=[OperatorScheduler()], ) diff --git a/tests/deepsparse/v2/test_image_classification.py b/tests/deepsparse/v2/test_image_classification.py new file mode 100644 index 0000000000..03e2807454 --- /dev/null +++ b/tests/deepsparse/v2/test_image_classification.py @@ -0,0 +1,39 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
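For reference, a minimal sketch of how the AddThreePipeline constructed in the basic-pipeline test above is expected to be invoked; the call semantics are an assumption based on Pipeline and Operator accepting a single unnamed argument of the operator's input_schema type, and the expected value simply follows from the two operators adding 1 and then 2.

    # IntSchema, AddOneOperator, AddTwoOperator and AddThreePipeline are the
    # definitions from the test above; this only illustrates the invocation
    output = AddThreePipeline(IntSchema(value=5))
    assert isinstance(output, IntSchema)
    assert output.value == 8
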
+ +import numpy + +import pytest +from deepsparse.v2.image_classification import ImageClassificationPipeline +from deepsparse.v2.image_classification.preprocess_operator import ( + ImageClassificationInput, +) +from tests.deepsparse.pipelines.data_helpers import computer_vision + + +@pytest.fixture +def get_images(): + batch_size = 2 + images = computer_vision(batch_size=batch_size) + return images.get("images") + + +def test_image_classification(get_images): + model_path = ( + "zoo:cv/classification/resnet_v1-50/pytorch/sparseml/imagenet/pruned95-none" + ) + pipeline = ImageClassificationPipeline(model_path=model_path) + output = pipeline(ImageClassificationInput(images=get_images)) + assert output.labels == [[207], [670]] + assert numpy.allclose(output.scores, [[21.85], [17.33]], atol=0.01) From 58b075888e756dd853d2b279d9a98858962ca31d Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Wed, 1 Nov 2023 10:53:10 -0400 Subject: [PATCH 03/43] [v2] EngineOperator updates to make continuous batching easier (#1371) * [v2] EngineOperator updates to make continuous batching easier * test fixes --- .../v2/operators/engine_operator.py | 42 +++++++++++++++---- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/src/deepsparse/v2/operators/engine_operator.py b/src/deepsparse/v2/operators/engine_operator.py index aac94a7697..2c61755df9 100644 --- a/src/deepsparse/v2/operators/engine_operator.py +++ b/src/deepsparse/v2/operators/engine_operator.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from copy import deepcopy from typing import Dict, List, Optional, Union from pydantic import BaseModel, Field @@ -32,6 +33,13 @@ class EngineOperatorInputs(BaseModel): engine_inputs: List = Field(description="engine_inputs") + engine: Optional[Engine] = Field( + description="override the engine to run forward pass with", + default=None, + ) + + class Config: + arbitrary_types_allowed = True class EngineOperatorOutputs(BaseModel): @@ -76,21 +84,33 @@ def __init__( engine_args["scheduler"] = scheduler engine_args["num_streams"] = num_streams - self.engine = self._create_engine(self.model_path, engine_type, engine_args) + self._engine_args = engine_args + self._engine_type = engine_type + + self.engine = self.create_engine() + + @property + def batch_size(self) -> int: + """ + :return: the batch size this engine operator is compiled at + """ + return self._batch_size - def _create_engine( - self, onnx_file_path: str, engine_type: str, engine_args: Dict + def create_engine( + self, + **kwargs, ) -> Union[Engine, MultiModelEngine, ORTEngine]: """ Create an inference engine for a given ONNX model - :param onnx_file_path: path to ONNX model file - :param engine_type: type of engine to create. 
- :param engine_args: arguments to pass to engine constructor - :param context: context to use for engine + :param kwargs: overrides to engine_args used as kwargs for engine + constructor/compilation :return: inference engine """ - engine_type = engine_type.lower() + onnx_file_path = self.model_path + engine_args = deepcopy(self._engine_args) + engine_args.update(kwargs) + engine_type = self._engine_type.lower() if engine_type == DEEPSPARSE_ENGINE: if self.engine_context is not None and isinstance( @@ -116,6 +136,12 @@ def _create_engine( ) def run(self, inp: EngineOperatorInputs) -> Dict: + if inp.engine: + # run with custom engine, do not split/join since custom engine + # may run at any batch size, returning here as code below has a + # planned refactor + engine_outputs = inp.engine(inp.engine_inputs) + return {"engine_outputs": engine_outputs} inp = inp.engine_inputs batches, orig_batch_size = self.expand_inputs(engine_inputs=inp) batches_outputs = list(map(self.engine, batches)) From e1ff108f76b2765e71fc6ee236892ea26b6c7205 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Thu, 2 Nov 2023 20:47:25 -0400 Subject: [PATCH 04/43] [Pipeline Refactor] Update routes, text generation initial functionality (#1348) * initial functionality and working example with image classification * remove testing image * rebase fixes * initial functionality and working example with image classification * text gen * updates func * prompt inference, initial functionality * remove image; update state docstring * Fix typo * add todo for split/join * remove context, clean-up args, remove prefill_preprocess_operaator * fix docstrings --- src/deepsparse/v2/operators/__init__.py | 1 - .../v2/operators/engine_operator.py | 18 +- src/deepsparse/v2/operators/operator.py | 30 ++- src/deepsparse/v2/pipeline.py | 70 ++++-- src/deepsparse/v2/routers/router.py | 57 ++++- src/deepsparse/v2/schedulers/scheduler.py | 23 +- .../v2/schedulers/scheduler_group.py | 35 ++- src/deepsparse/v2/text_generation/__init__.py | 24 ++ .../autoregressive_preprocess_operator.py | 100 ++++++++ .../v2/text_generation/compile_logits.py | 43 ++++ .../v2/text_generation/kv_cache_operator.py | 70 ++++++ .../multi_engine_prefill_operator.py | 135 +++++++++++ .../v2/text_generation/nl_engine_operator.py | 191 ++++++++++++++++ src/deepsparse/v2/text_generation/pipeline.py | 213 ++++++++++++++++++ .../v2/text_generation/prep_for_prefill.py | 57 +++++ .../v2/text_generation/process_inputs.py | 121 ++++++++++ src/deepsparse/v2/utils/__init__.py | 2 +- src/deepsparse/v2/utils/state.py | 64 ++++++ tests/deepsparse/v2/test_basic_pipeline.py | 4 +- 19 files changed, 1203 insertions(+), 55 deletions(-) create mode 100644 src/deepsparse/v2/text_generation/__init__.py create mode 100644 src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py create mode 100644 src/deepsparse/v2/text_generation/compile_logits.py create mode 100644 src/deepsparse/v2/text_generation/kv_cache_operator.py create mode 100644 src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py create mode 100644 src/deepsparse/v2/text_generation/nl_engine_operator.py create mode 100644 src/deepsparse/v2/text_generation/pipeline.py create mode 100644 src/deepsparse/v2/text_generation/prep_for_prefill.py create mode 100644 src/deepsparse/v2/text_generation/process_inputs.py create mode 100644 src/deepsparse/v2/utils/state.py diff --git a/src/deepsparse/v2/operators/__init__.py b/src/deepsparse/v2/operators/__init__.py index 8f7e6a169d..9d1a9812ac 100644 --- 
a/src/deepsparse/v2/operators/__init__.py +++ b/src/deepsparse/v2/operators/__init__.py @@ -13,5 +13,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from .operator import * diff --git a/src/deepsparse/v2/operators/engine_operator.py b/src/deepsparse/v2/operators/engine_operator.py index 2c61755df9..b7d920a686 100644 --- a/src/deepsparse/v2/operators/engine_operator.py +++ b/src/deepsparse/v2/operators/engine_operator.py @@ -17,7 +17,8 @@ from pydantic import BaseModel, Field -from deepsparse import Context, Engine, MultiModelEngine, Scheduler +from deepsparse import Context as EngineContext +from deepsparse import Engine, MultiModelEngine, Scheduler from deepsparse.benchmark import ORTEngine from deepsparse.utils import join_engine_outputs, model_to_path, split_engine_inputs from deepsparse.v2.operators import Operator @@ -54,16 +55,15 @@ def __init__( self, model_path: str, engine_type: str = DEEPSPARSE_ENGINE, - batch_size: Optional[int] = 1, num_cores: int = None, num_streams: int = None, scheduler: Scheduler = None, input_shapes: List[List[int]] = None, - engine_context: Optional[Context] = None, + engine_context: Optional[EngineContext] = None, + engine_kwargs: Dict = None, ): - - self._batch_size = batch_size self.model_path = model_to_path(model_path) + self._batch_size = 1 self.engine_context = engine_context if self.engine_context is not None: @@ -87,7 +87,7 @@ def __init__( self._engine_args = engine_args self._engine_type = engine_type - self.engine = self.create_engine() + self.engine = self.create_engine(**engine_kwargs) @property def batch_size(self) -> int: @@ -114,12 +114,12 @@ def create_engine( if engine_type == DEEPSPARSE_ENGINE: if self.engine_context is not None and isinstance( - self.engine_context, Context + self.engine_context, EngineContext ): engine_args.pop("num_cores", None) engine_args.pop("scheduler", None) engine_args.pop("num_streams", None) - engine_args["context"] = self.engien_context + engine_args["context"] = self.engine_context return MultiModelEngine( model=onnx_file_path, **engine_args, @@ -135,7 +135,7 @@ def create_engine( f"{SUPPORTED_PIPELINE_ENGINES}" ) - def run(self, inp: EngineOperatorInputs) -> Dict: + def run(self, inp: EngineOperatorInputs, **kwargs) -> Dict: if inp.engine: # run with custom engine, do not split/join since custom engine # may run at any batch size, returning here as code below has a diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py index c3a3e28b78..b3963d8223 100644 --- a/src/deepsparse/v2/operators/operator.py +++ b/src/deepsparse/v2/operators/operator.py @@ -17,6 +17,8 @@ from pydantic import BaseModel +from deepsparse.v2.utils import InferenceState, PipelineState + __all__ = ["Operator"] @@ -54,6 +56,8 @@ def has_output_schema(cls) -> bool: def __call__( self, *args, + inference_state: InferenceState, + pipeline_state: PipelineState, **kwargs, ) -> Any: """ @@ -61,7 +65,9 @@ def __call__( :param args: an unnamed arg may only be provided if it is of the type of the input_schema - :param context: pipeline context to pass to operator + :param inference_state: inference_state for the pipeline. + :param pipeline_state: pipeline_state for the pipeline. The values in the state + are created during pipeline creation and are read-only during inference. 
:param kwargs: kwargs when not initializing from an instantiated schema :return: operator output """ @@ -81,10 +87,18 @@ def __call__( "in the form of a dictionary or an instance of the input_schema" "object" ) - - run_output = self.run(inference_input) + run_output = self.run( + inference_input, + inference_state=inference_state, + pipeline_state=pipeline_state, + ) else: - run_output = self.run(*args, **kwargs) + run_output = self.run( + *args, + inference_state=inference_state, + pipeline_state=pipeline_state, + **kwargs, + ) if self.has_output_schema(): return self.output_schema(**run_output) @@ -93,12 +107,16 @@ def __call__( @abstractmethod def run(self, *args, **kwargs) -> Any: """ - :param inp: operator input, as the defined input schema if applicable - :param context: pipeline context of already run operators :return: result of this operator as the defined output schema if applicable """ raise NotImplementedError + def can_operate(self, inp: Any) -> bool: + """ + Whether or not the given operator can run, based on input + """ + return True + def expand_inputs(self, **kwargs): """ Generic function to handle expanding values. diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py index e58f8a5191..0a8c8b2f93 100644 --- a/src/deepsparse/v2/pipeline.py +++ b/src/deepsparse/v2/pipeline.py @@ -18,6 +18,7 @@ from deepsparse.v2.operators import Operator from deepsparse.v2.routers import Router from deepsparse.v2.schedulers import OperatorScheduler, SchedulerGroup +from deepsparse.v2.utils import InferenceState, PipelineState __all__ = ["Pipeline"] @@ -27,7 +28,7 @@ class Pipeline(Operator): """ Pipeline accepts a series of operators, schedulers, and a router. Calling a pipeline will use the router to run through all the defined operators. The operators should - be implemented using the Operator class and each implemented Operator should be + be implemented using the Operator class and each implemented operator should be responsible for a functional component of the pipelines. The flow of inputs/outputs between the operators and the steps in the pipeline should be defined by the router, (based off of the Router class), which dicates the next operator in the pipeline. @@ -37,6 +38,7 @@ class Pipeline(Operator): or dictionary of operators. :param router: A Router which dictates the next operator to call. :param schedulers: A list of schedulers to run operators. + :param pipeline_state: pipeline_state created during pipeline initialization """ @@ -45,57 +47,93 @@ def __init__( ops: Union[Dict[str, Operator], List[Operator]], router: Router, schedulers: List[OperatorScheduler], + pipeline_state: PipelineState = None, ): self.ops = ops self.router = router self.schedulers = schedulers + self.pipeline_state = pipeline_state self.validate() # SchedulerGroup handles running all schedulers in order of priority self._scheduler_group = SchedulerGroup(self.schedulers) - def run(self, *args, **kwargs): + def run( + self, + *args, + inference_state: InferenceState, + pipeline_state: PipelineState, + **kwargs, + ): """ - Run through the operators using the provided router and scheduler. Update the - context to reflect each step of the router. The input to a given operator is the - output of the previous operator. - - :param inp: input to the operator. expected to be of any type that is - expected by the operator. - :param context: context to store the current the inputs, outputs, and operator - for each step of the router. 
+ Run through the operators using the provided router and scheduler. + The input to a given operator is the output of the previous operator. + :param inference_state: inference_state for the pipeline. + :param pipeline_state: pipeline_state for the pipeline. The values in the state + are created during pipeline creation and are read-only during inference. """ next_step = self.router.START_ROUTE operator_output = None + while next_step != self.router.END_ROUTE: # Either a dictionary key or valid index operator = self.ops[next_step] if next_step == self.router.START_ROUTE: output_future = self._scheduler_group.submit( - *args, operator=operator, **kwargs + *args, + inference_state=inference_state, + operator=operator, + pipeline_state=pipeline_state, + **kwargs, ) else: if isinstance(operator_output, dict): output_future = self._scheduler_group.submit( - operator=operator, **operator_output + inference_state=inference_state, + operator=operator, + pipeline_state=pipeline_state, + **operator_output, ) else: output_future = self._scheduler_group.submit( - operator_output, operator=operator + operator_output, + inference_state=inference_state, + pipeline_state=pipeline_state, + operator=operator, ) - # wait for future to resolve operator_output = output_future.result() - next_step = self.router.next(next_step, self.ops) + if isinstance(operator_output, tuple): + state_update = operator_output[-1] + operator_output = operator_output[0] + inference_state.update_state(state_update) + + next_step = self.router.next(next_step, self.ops, operator_output) + return operator_output def __call__(self, *args, **kwargs): """ + Consolidate any provided inference_state or pipeline_state objects and pass + any other operator inputs to run(). + :return: output of the pipeline operators ran with the router for the given - input + input """ + if kwargs.get("inference_state"): + inference_state = kwargs.pop("inference_state") + else: + inference_state = InferenceState() + inference_state.create_state({}) + + if "pipeline_state" in kwargs: + self.pipeline_state = kwargs.get("pipeline_state") + + kwargs["inference_state"] = inference_state + kwargs["pipeline_state"] = self.pipeline_state + return self.run(*args, **kwargs) def validate(self): diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py index 6050803b5e..d1110d4ca7 100644 --- a/src/deepsparse/v2/routers/router.py +++ b/src/deepsparse/v2/routers/router.py @@ -15,14 +15,14 @@ import logging from abc import abstractmethod -from typing import Dict, List, Union +from typing import Any, Dict, List, Optional, Union from deepsparse.v2.operators import Operator _LOGGER = logging.getLogger(__name__) -__all__ = ["Router", "LinearRouter"] +__all__ = ["Router", "LinearRouter", "GraphRouter"] class Router: @@ -32,23 +32,34 @@ class Router: :param start_route: the start index or key of the router :param end_route: the end index or key of the router + :param route: the route that the router has to traverse through """ - def __init__(self, end_route: Union[str, int], start_route: Union[str, int]): + def __init__( + self, + end_route: Union[str, int], + start_route: Union[str, int], + route: Optional[Dict] = None, + ): self.START_ROUTE = start_route self.END_ROUTE = end_route + self.route = route @abstractmethod def next( - self, past: Union[str, int], ops: Union[List[Operator], Dict[str, Operator]] + self, + past: Union[str, int], + ops: Optional[Union[List[Operator], Dict[str, Operator]]], + inp: Optional[Any], ) -> Union[str, int]: """ 
Determines the index or dictionary key for the next operator which should run. :param past: the previous index or key. This should uniquely determine the next - operator to run + operator to run :param ops: list or dictionary of operators + :param inp: operator input :returns: the next index or dictionary key for the next operator to run """ raise NotImplementedError @@ -69,7 +80,9 @@ class LinearRouter(Router): def __init__(self, end_route: int, start_route: int = 0): super().__init__(end_route=end_route, start_route=start_route) - def next(self, past: int, ops: List[Operator]) -> int: + def next( + self, past: int, ops: Optional[List[Operator]] = None, inp: Optional[Any] = None + ) -> int: new_index = past + 1 if new_index < self.END_ROUTE: return new_index @@ -105,3 +118,35 @@ def validate(operators: List[Operator]) -> bool: ) return False return True + + +class GraphRouter(Router): + """ + Router for a DAG. Expects graphs be presented in the form of a dictionary, where + keys are the nodes of the graph and the values are the connected nodes. For + nodes with multiple ouput edges, all the nodes will be visited and the first node + where `can_operate` returns True will run. Paths should be deterministic. + """ + + def __init__(self, end_route: str, start_route: str, route: Dict): + super().__init__(end_route=end_route, start_route=start_route, route=route) + + def next( + self, + past: str, + ops: Dict[str, Operator], + inp: Any, + ) -> int: + node = past + if isinstance(self.route[node], str): + return self.route[node] + else: + for neighbour_node in self.route[node]: + neighbour_node_op = ops[neighbour_node] + if neighbour_node_op.can_operate(inp): + return neighbour_node + raise ValueError("Cannot operate on any of the nodes") + + @staticmethod + def validate(ops) -> bool: + pass diff --git a/src/deepsparse/v2/schedulers/scheduler.py b/src/deepsparse/v2/schedulers/scheduler.py index 7d4f249444..78a58e3389 100644 --- a/src/deepsparse/v2/schedulers/scheduler.py +++ b/src/deepsparse/v2/schedulers/scheduler.py @@ -36,19 +36,30 @@ class OperatorScheduler: def __init__(self, max_workers: int = 1): self._threadpool = ThreadPoolExecutor(max_workers=max_workers) - def submit(self, *args, operator: Operator, **kwargs) -> Future: + def submit( + self, + *args, + operator: Operator, + **kwargs, + ) -> Future: """ :param operator: operator to run - :param operator_input: input schema to the operator - :param context: context of already run operators :return: future referencing the asynchronously run output of the operator """ - return self._threadpool.submit(operator, *args, **kwargs) + return self._threadpool.submit( + operator, + *args, + **kwargs, + ) - def can_process(self, *args, operator: Operator, **kwargs) -> bool: + def can_process( + self, + *args, + operator: Operator, + **kwargs, + ) -> bool: """ :param operator: operator to check - :param operator_input: operator_input to check :return: True if this Operator can process the given operator and input. 
Base OperatorScheduler always returns True """ diff --git a/src/deepsparse/v2/schedulers/scheduler_group.py b/src/deepsparse/v2/schedulers/scheduler_group.py index 7f00a3c17c..40b5695f22 100644 --- a/src/deepsparse/v2/schedulers/scheduler_group.py +++ b/src/deepsparse/v2/schedulers/scheduler_group.py @@ -34,25 +34,44 @@ class SchedulerGroup(OperatorScheduler): def __init__(self, schedulers: List[OperatorScheduler]): self.schedulers = schedulers - def submit(self, *args, operator: Operator, **kwargs) -> Future: + def submit( + self, + *args, + operator: Operator, + **kwargs, + ) -> Future: """ :param operator: operator to run - :param operator_input: input schema to the operator - :param context: context of already run operators :return: future referencing the asynchronously run output of the operator """ for scheduler in self.schedulers: - if scheduler.can_process(*args, operator=operator, **kwargs): - return scheduler.submit(*args, operator=operator, **kwargs) + if scheduler.can_process( + *args, + operator=operator, + **kwargs, + ): + return scheduler.submit( + *args, + operator=operator, + **kwargs, + ) - def can_process(self, *args, operator: Operator, **kwargs) -> bool: + def can_process( + self, + *args, + operator: Operator, + **kwargs, + ) -> bool: """ :param operator: operator to check - :param operator_input: operator_input to check :return: True if this Operator can process the given operator and input. SchedulerGroup always returns True """ return any( - scheduler.can_process(*args, operator=operator, **kwargs) + scheduler.can_process( + *args, + operator=operator, + **kwargs, + ) for scheduler in self.schedulers ) diff --git a/src/deepsparse/v2/text_generation/__init__.py b/src/deepsparse/v2/text_generation/__init__.py new file mode 100644 index 0000000000..37ac88d02f --- /dev/null +++ b/src/deepsparse/v2/text_generation/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# flake8: noqa +from .autoregressive_preprocess_operator import * +from .compile_logits import * +from .kv_cache_operator import * +from .multi_engine_prefill_operator import * +from .nl_engine_operator import * +from .prep_for_prefill import * +from .process_inputs import * + + +from .pipeline import * # isort:skip diff --git a/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py b/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py new file mode 100644 index 0000000000..cfe7cb531b --- /dev/null +++ b/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py @@ -0,0 +1,100 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from typing import Any + +import numpy + +from deepsparse.transformers.utils.helpers import create_causal_mask +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import PipelineState + + +_LOGGER = logging.getLogger(__name__) + +__all__ = ["AutoRegressiveOperatorPreprocess"] + + +class AutoRegressiveOperatorPreprocess(Operator): + def __init__(self, sequence_length: int, prompt_sequence_length: int): + """ + Prepare the tokens for the single-token engine. This requires creating the + attention mask, positions, and causal mask. The output contains these three + arrays to be passed into the single-token engine. + """ + self.sequence_length = sequence_length + self.prompt_sequence_length = prompt_sequence_length + self.set_capacity = False + + _LOGGER.warn( + "This operator requires the PipelineState to be set-up with the " + "onnx_input_names_no_cache attribute set from the NLEngineOperator." + ) + + def can_operate(self, inp: Any) -> bool: + """ + Can run this Operator if the number of tokens left to process is greater than + 0 but less than the self.prompt_sequence_length. + """ + tokens = inp.get("tokens") + kv_cache = inp.get("kv_cache") + + remaining_tokens = len(tokens) - kv_cache.total_num_processed_tokens + if remaining_tokens > 0 and remaining_tokens < self.prompt_sequence_length: + return True + return False + + def run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwargs): + + if not self.set_capacity: + self.set_capacity = True + kv_cache.set_capacity(self.sequence_length - 1) + + num_total_processed_tokens = kv_cache.total_num_processed_tokens + new_token = tokens[num_total_processed_tokens] + engine_input_names = pipeline_state.current_state.get( + "onnx_input_names_no_cache" + ) + + # padding is added to left, so attention mask is 1s from the + # right up to the number of total tokens (prompt + generated) + attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) + num_attention_entries_to_unmask = min( + num_total_processed_tokens + 1, self.sequence_length + ) # cap by seq len + attention_mask[:, -num_attention_entries_to_unmask:] = 1 + positions = numpy.array([[num_total_processed_tokens]], dtype=numpy.int64) + input_ids = numpy.array([[new_token]]) + causal_mask = create_causal_mask(input_ids, attention_mask) + + engine_inputs_map = dict( + input_ids=input_ids, + attention_mask=attention_mask, + causal_mask=causal_mask, + positions=positions, + ) + + engine_inputs = [engine_inputs_map[name] for name in engine_input_names] + + onnx_input_names_no_cache = pipeline_state.current_state.get( + "onnx_input_names_no_cache" + ) + engine_inputs = [engine_inputs_map[name] for name in onnx_input_names_no_cache] + + return { + "engine_inputs": engine_inputs, + "kv_cache": kv_cache, + "tokens": tokens, + } diff --git a/src/deepsparse/v2/text_generation/compile_logits.py b/src/deepsparse/v2/text_generation/compile_logits.py new file mode 100644 index 0000000000..55c87d791d --- /dev/null +++ b/src/deepsparse/v2/text_generation/compile_logits.py @@ -0,0 +1,43 @@ +# Copyright (c) 2021 - present / 
Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import InferenceState + + +__all__ = ["CompilePromptLogits"] + + +class CompilePromptLogits(Operator): + """ + Combine the prompt logits. Currently relying on the inference state to store the + prompt logits for each token or multi-token batch processed. This operator will + take prompt logits from each iteration run and update the inference state. + """ + + def run(self, logits, inference_state: InferenceState, **kwargs): + logit_type = "prompt_logits" + + if inference_state.current_state.get(logit_type) is not None: + current_logits = inference_state.current_state.get(logit_type).copy() + current_logits.append(logits) + else: + current_logits = [logits] + + state_update = {logit_type: current_logits} + return { + "kv_cache": kwargs.get("kv_cache"), + "tokens": kwargs.get("tokens"), + }, state_update diff --git a/src/deepsparse/v2/text_generation/kv_cache_operator.py b/src/deepsparse/v2/text_generation/kv_cache_operator.py new file mode 100644 index 0000000000..0b232402b3 --- /dev/null +++ b/src/deepsparse/v2/text_generation/kv_cache_operator.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
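CompilePromptLogits above relies on the convention, handled in Pipeline.run, that an operator may return an (output, state_update) tuple whose second element is merged into the shared InferenceState. A rough, self-contained illustration of that accumulation follows; the fake logits arrays and the empty kv_cache/tokens placeholders are assumptions for illustration, and InferenceState is assumed to merge update_state() dicts into current_state in the way Pipeline.run expects (its implementation lives in utils/state.py, which is not shown in this hunk).

    import numpy

    from deepsparse.v2.text_generation import CompilePromptLogits
    from deepsparse.v2.utils import InferenceState

    state = InferenceState()
    state.create_state({})

    compile_logits = CompilePromptLogits()
    # pretend two prefill iterations produced these logits chunks
    for chunk_logits in [numpy.ones((1, 4, 50)), numpy.ones((1, 1, 50))]:
        out, state_update = compile_logits.run(
            chunk_logits, inference_state=state, kv_cache=None, tokens=[]
        )
        # Pipeline.run performs this merge whenever an operator returns a tuple
        state.update_state(state_update)

    # both chunks are now available under "prompt_logits" in the inference state
    assert len(state.current_state.get("prompt_logits")) == 2
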
+ +from typing import Any + +from pydantic import BaseModel, Field + +from deepsparse.transformers.utils import DecoderKVCache +from deepsparse.transformers.utils.helpers import ( + initialize_kv_cache_state, + prepends_bos_token, +) +from deepsparse.v2.operators import Operator + + +__all__ = ["KVCacheCreator"] + + +class KVCacheCreatorOutput(BaseModel): + kv_cache: Any = Field(description="KV Cache Created") # DecoderKVCache + + +class KVCacheCreatorInput(BaseModel): + cache_shape: Any = Field(description="shape") + kv_cache_data_type: Any = Field(description="data type") + output_names: Any = Field(description="output names") + + +class KVCacheCreator(Operator): + input_schema = KVCacheCreatorInput + output_schema = KVCacheCreatorOutput + + def __init__( + self, + tokenizer, + sequence_length: int, + prompt_sequence_length: int, + internal_kv_cache: bool, + ): + self.tokenizer = tokenizer + self.prompt_sequence_length = prompt_sequence_length + self.internal_kv_cache = internal_kv_cache + self.sequence_length = sequence_length + + def run(self, cache_shape, kv_cache_data_type: str, output_names: list, **kwargs): + kv_cache_state = initialize_kv_cache_state( + cache_shape=cache_shape, + kv_cache_data_type=kv_cache_data_type, + output_names=output_names, + length=self.sequence_length - self.prompt_sequence_length, + empty=bool(self.internal_kv_cache), + ) + + kv_cache = DecoderKVCache(self.internal_kv_cache) + kv_cache.setup( + state=kv_cache_state, + freeze_first_position=prepends_bos_token(self.tokenizer), + ) + return {"kv_cache": kv_cache} diff --git a/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py b/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py new file mode 100644 index 0000000000..41ee830a8a --- /dev/null +++ b/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py @@ -0,0 +1,135 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from enum import Enum +from typing import Any + +import numpy + +from deepsparse.transformers.utils.helpers import create_causal_mask +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import PipelineState + + +_LOGGER = logging.getLogger(__name__) + +__all__ = ["MultiEnginePrefill"] + + +class OnnxInputNames(Enum): + INPUT_IDS = "input_ids" + ATTN_MASK = "attention_mask" + CAUSAL_MASK = "causal_mask" + POSITIONS = "positions" + + +# NOTE: A possible clean-up could involve combining this Operator and the +# autoregressive_preprocess_operator + + +class MultiEnginePrefill(Operator): + def __init__(self, prompt_sequence_length, sequence_length): + """ + Prepare the tokens for the multi-token engine. This requires creating the + attention mask, positions, and causal mask. The output contains these three + arrays to be passed into the multi-token engine. 
+ """ + self.prompt_sequence_length = prompt_sequence_length + self.sequence_length = sequence_length + self.cases = { + OnnxInputNames.ATTN_MASK.value: self._case_attn_mask, + OnnxInputNames.POSITIONS.value: self._case_positions, + } + _LOGGER.warn( + "This operator requires the PipelineState to be set-up with the " + "onnx_input_names_no_cache attribute set from the NLEngineOperator." + ) + + def can_operate(self, inp: Any): + """ + Can only run if the number of prompt tokens left to process is greater than + or equal to the self.prompt_sequence_length. + """ + kv_cache = inp.get("kv_cache") + tokens = inp.get("tokens") + + if len(tokens) < self.prompt_sequence_length: + return False + + if ( + len(tokens) - kv_cache.total_num_processed_tokens + >= self.prompt_sequence_length + ): + return True + return False + + def _case_attn_mask(self, num_total_processed_tokens: int): + # create an empty attention mask + engine_input = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) + # calculate the number of entries in attention mask that should be set to 1 + num_attention_entries_to_unmask = min( + num_total_processed_tokens + self.prompt_sequence_length, + self.sequence_length, + ) + engine_input[:, -num_attention_entries_to_unmask:] = 1 + return engine_input + + def _case_positions(self, num_total_processed_tokens: int): + return ( + numpy.arange( + num_total_processed_tokens, + num_total_processed_tokens + self.prompt_sequence_length, + ) + .reshape(1, -1) + .astype(numpy.int64) + ) + + def run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwargs): + + onnx_input_names_no_cache = pipeline_state.current_state.get( + "onnx_input_names_no_cache" + ) + + num_total_processed_tokens = kv_cache.total_num_processed_tokens + start = num_total_processed_tokens + end = start + self.prompt_sequence_length + token_batch = tokens[start:end] + + engine_inputs = [] + for name in onnx_input_names_no_cache: + if name == OnnxInputNames.INPUT_IDS.value: + engine_input = numpy.array([token_batch]) + elif ( + name == OnnxInputNames.ATTN_MASK.value + or name == OnnxInputNames.POSITIONS.value + ): + engine_input = self.cases[name](num_total_processed_tokens) + elif name == OnnxInputNames.CAUSAL_MASK.value: + continue + + engine_inputs.append(engine_input) + + if OnnxInputNames.CAUSAL_MASK.value in onnx_input_names_no_cache: + causal_mask = create_causal_mask( + input_ids=engine_inputs[0], + attention_mask=engine_inputs[1], + ) + engine_inputs.append(causal_mask) + + return { + "engine_inputs": engine_inputs, + "kv_cache": kv_cache, + "tokens": tokens, + } diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py new file mode 100644 index 0000000000..6c1ad1966e --- /dev/null +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -0,0 +1,191 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
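The can_operate checks on MultiEnginePrefill above and on AutoRegressiveOperatorPreprocess earlier split prompt processing into fixed prompt_sequence_length windows handled by the multi-token engine, followed by a single-token tail. A small self-contained restatement of that dispatch rule, using example numbers only (ten prompt tokens, a window of four):

    def choose_engine(num_tokens: int, num_processed: int,
                      prompt_sequence_length: int = 4) -> str:
        # mirrors the two can_operate checks above
        remaining = num_tokens - num_processed
        if remaining >= prompt_sequence_length:
            return "multi_engine_prefill"       # full window -> multi-token engine
        if remaining > 0:
            return "autoregressive_preprocess"  # tail -> single-token engine
        return "done"

    steps = [choose_engine(10, processed) for processed in (0, 4, 8, 9, 10)]
    assert steps == [
        "multi_engine_prefill",
        "multi_engine_prefill",
        "autoregressive_preprocess",
        "autoregressive_preprocess",
        "done",
    ]
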
+ +import copy +import os +from typing import Any, List, Tuple + +from pydantic import BaseModel, Field + +from deepsparse.utils.onnx import ( + CACHE_INPUT_PREFIX, + overwrite_onnx_model_inputs_for_kv_cache_models, +) +from deepsparse.v2.operators.engine_operator import ( + DEEPSPARSE_ENGINE, + EngineOperator, + EngineOperatorInputs, +) + + +__all__ = ["NLEngineOperator"] + + +class NlEngineInput(BaseModel): + engine_inputs: List = Field(description="engine inputs") + kv_cache: Any = Field(description="kv_cache object") + tokens: List = Field(description="tokens") + + +class NLEngineOperator(EngineOperator): + + """ + Operator for the NL Decoder Engine. This Operator inherits from the EngineOperator. + Specific updates to engine attributes are made through this operator, as well + as updating the kv_cache. This Operator is used for both the single-token and + multi-token case. + """ + + input_schema = NlEngineInput + output_schema = None + + def __init__( + self, + sequence_length: int, + input_ids_length: int, + internal_kv_cache: bool = False, + **kwargs, + ): + + self.kv_cache_data_type = None + ( + onnx_file_path, + output_indices_to_be_cached, + kv_cache_data_type, + ) = overwrite_onnx_model_inputs_for_kv_cache_models( + onnx_file_path=kwargs.get("model_path"), + batch_size=kwargs.get("batch_size", 1), + sequence_length=sequence_length, + input_ids_length=input_ids_length, + ) + + engine_kwargs = kwargs.get("engine_kwargs", {}) + if kwargs.get("engine_type", DEEPSPARSE_ENGINE) == DEEPSPARSE_ENGINE: + if "WAND_OPT_FLAGS" not in os.environ: + os.environ["WAND_OPT_FLAGS"] = "default,~pyramids" + + if any(output_indices_to_be_cached): + self.kv_cache_data_type = kv_cache_data_type + if ( + internal_kv_cache + and kwargs.get("engine_type", DEEPSPARSE_ENGINE) == DEEPSPARSE_ENGINE + ): + engine_kwargs["cached_outputs"] = output_indices_to_be_cached + + kwargs["engine_kwargs"] = engine_kwargs + kwargs["model_path"] = onnx_file_path + super().__init__(**kwargs) + + self.input_ids_length = input_ids_length + + def run(self, inp: NlEngineInput, **kwargs) -> Any: + engine_input = inp.engine_inputs + kv_cache = inp.kv_cache + + inputs = self._add_kv_cache_to_input(engine_input, kv_cache) + if bool(kv_cache.engine_internal_cache): + # conventionally, before dispatching + # inputs to the engine, we validate them + # if val_inp=True. However, in this case + # we want to pass the empty kv cache inputs + # (batch_size=0) to the engine. 
Therefore, + # we skip the validation + out = self.engine._eng_net.execute_list_out( + inputs, kv_cache.engine_internal_cache + ) + else: + # run the engine without the LIB.kv_cache object + out = ( + super() + .run(EngineOperatorInputs(engine_inputs=inputs), **kwargs) + .get("engine_outputs") + ) + + logits, *kv_cache_state = out + self._update_kv_cache( + kv_cache_state=kv_cache_state, + input_ids_len=self.input_ids_length, + kv_cache=kv_cache, + ) + + output = {"logits": logits, "kv_cache": kv_cache, "tokens": inp.tokens} + return output + + def _add_kv_cache_to_input(self, engine_input, kv_cache): + kv_cache_state = copy.copy(kv_cache.cached_inputs) + + for idx, input_name in enumerate(self.onnx_input_names_no_cache): + kv_cache_state[input_name] = engine_input[idx] + + new_inp = [kv_cache_state[name] for name in self.engine.input_names] + return new_inp + + def _update_kv_cache(self, kv_cache_state, input_ids_len, kv_cache): + if bool(kv_cache.engine_internal_cache): + kv_cache.total_num_processed_tokens += input_ids_len + return + + kv_cache_state = { + name: array + for name, array in zip(self.onnx_input_names_cached, kv_cache_state) + } + + kv_cache.update( + state=kv_cache_state, + input_ids_len=input_ids_len, + ) + + @property + def onnx_input_names_no_cache(self) -> List[str]: + """ + :return: The input names for the onnx model, excluding + the potential kv cache inputs + """ + return [ + name + for name in self.engine.input_names + if not name.startswith(CACHE_INPUT_PREFIX) + ] + + @property + def onnx_input_names_cached(self) -> List[str]: + """ + :return: The cached input names for the onnx model + """ + return [ + name + for name in self.engine.input_names + if name.startswith(CACHE_INPUT_PREFIX) + ] + + @property + def cache_shape(self) -> Tuple[int, int, int, int]: + """ + :return: The shape of the kv cache inputs + for the onnx model. The shape is + (batch_size, num_heads, sequence_length, hidden_size) + """ + cache_engine_input_index = next( + i + for i, name in enumerate(self.engine.input_names) + if CACHE_INPUT_PREFIX in name + ) + return self.engine.input_shapes[cache_engine_input_index] + + @property + def output_names(self) -> List[str]: + """ + :return: The output names for the onnx model + """ + return self.engine.output_names diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py new file mode 100644 index 0000000000..9878aa0061 --- /dev/null +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -0,0 +1,213 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
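NLEngineOperator above partitions the compiled model's input names into cached and non-cached groups using CACHE_INPUT_PREFIX, and _add_kv_cache_to_input reassembles the full engine input list from the two groups. A minimal stand-alone sketch of that bookkeeping follows; the concrete input names and the prefix value are assumptions for illustration only, not taken from a real model.

    CACHE_INPUT_PREFIX = "past_key_values"  # assumed value of the prefix constant

    input_names = [
        "input_ids",
        "attention_mask",
        "positions",
        "causal_mask",
        "past_key_values.0.key",
        "past_key_values.0.value",
    ]

    onnx_input_names_no_cache = [
        name for name in input_names if not name.startswith(CACHE_INPUT_PREFIX)
    ]
    onnx_input_names_cached = [
        name for name in input_names if name.startswith(CACHE_INPUT_PREFIX)
    ]

    # engine_inputs built by the preprocess operators follow the no-cache order;
    # _add_kv_cache_to_input then fills the cached names from the DecoderKVCache
    assert onnx_input_names_no_cache == [
        "input_ids", "attention_mask", "positions", "causal_mask"
    ]
    assert onnx_input_names_cached == [
        "past_key_values.0.key", "past_key_values.0.value"
    ]
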
+ +from typing import Dict + +from deepsparse.transformers.utils.helpers import process_generation_config +from deepsparse.v2.operators import Operator +from deepsparse.v2.pipeline import Pipeline +from deepsparse.v2.routers import GraphRouter +from deepsparse.v2.schedulers import OperatorScheduler +from deepsparse.v2.text_generation import ( + AutoRegressiveOperatorPreprocess, + CompilePromptLogits, + KVCacheCreator, + MultiEnginePrefill, + NLEngineOperator, + PrepareforPrefill, + ProcessInputsTextGeneration, +) +from deepsparse.v2.utils import PipelineState + + +class TextGenerationPipeline(Pipeline): + def __init__( + self, + model_path: str, + prompt_sequence_length: int = 16, + sequence_length: int = 1024, + internal_kv_cache: bool = True, + force_max_tokens: bool = False, + generation_config=None, + engine_kwargs: Dict = None, + ): + + pipeline_state = PipelineState() + pipeline_state_vals = {} + + # TODO: The code below will be replaced with a transformers set-up Operator. + self.tokenizer = None + model_path = self.setup_onnx_file_path(model_path, sequence_length) + self.tokenizer.padding_side = "left" + if not self.tokenizer.pad_token: + self.tokenizer.pad_token = self.tokenizer.eos_token + + if not engine_kwargs: + engine_kwargs = {} + engine_kwargs["model_path"] = model_path + + if internal_kv_cache and engine_kwargs.get("engine_type") == "onnxruntime": + internal_kv_cache = False + + single_engine_operator = NLEngineOperator( + sequence_length=sequence_length, + internal_kv_cache=internal_kv_cache, + input_ids_length=1, + **engine_kwargs, + ) + + multi_engine_operator = NLEngineOperator( + sequence_length=sequence_length, + internal_kv_cache=internal_kv_cache, + input_ids_length=prompt_sequence_length, + **engine_kwargs, + ) + + # NOTE: Currently using pipeline state. Can swap to simply pass in the + # attributes to the specific Operator that neeed them, as class attributes. + pipeline_state_vals[ + "onnx_input_names_no_cache" + ] = single_engine_operator.onnx_input_names_no_cache + pipeline_state_vals["cache_shape"] = single_engine_operator.cache_shape + pipeline_state_vals["output_names"] = single_engine_operator.output_names + pipeline_state_vals[ + "kv_cache_data_type" + ] = single_engine_operator.kv_cache_data_type + pipeline_state.create_state(pipeline_state_vals) + + process_inputs = ProcessInputsTextGeneration( + generation_config=process_generation_config(generation_config), + sequence_length=sequence_length, + tokenizer=self.tokenizer, + ) + + kv_cache_creator = KVCacheCreator( + sequence_length=sequence_length, + tokenizer=self.tokenizer, + prompt_sequence_length=prompt_sequence_length, + internal_kv_cache=internal_kv_cache, + ) + + # NOTE: Can also have the KVCacheCreator be initialized inside this Operator. + # Relies on pipeline state variables set-up above (can be swapped to be class + # attributes instead of using the state. 
+ engine_inputs_for_prefill = PrepareforPrefill(kv_cache_creator=kv_cache_creator) + + multi_engine_prefill = MultiEnginePrefill( + prompt_sequence_length=prompt_sequence_length, + sequence_length=sequence_length, + ) + compile_prompt_logits = CompilePromptLogits() + """ + prep_for_single_engine = PrepareforSingleEngine( + prompt_sequence_length=prompt_sequence_length, + sequence_length=sequence_length, + ) + """ + autoregressive_preprocess = AutoRegressiveOperatorPreprocess( + sequence_length=sequence_length, + prompt_sequence_length=prompt_sequence_length, + ) + final_step = FinalStep() + + ops = { + "process_input": process_inputs, + "single_engine": single_engine_operator, + "multi_engine": multi_engine_operator, + "kv_cache_creator": kv_cache_creator, + "prepare_prefill": engine_inputs_for_prefill, + "multi_engine_prefill": multi_engine_prefill, + "compile_logits": compile_prompt_logits, + "autoregressive_preprocess": autoregressive_preprocess, + "final_step": final_step, + } + + routes = { + "process_input": "prepare_prefill", + "prepare_prefill": ["multi_engine_prefill", "autoregressive_preprocess"], + "multi_engine_prefill": "multi_engine", + "multi_engine": "compile_logits", + "compile_logits": [ + "multi_engine_prefill", + "autoregressive_preprocess", + "final_step", + ], + "autoregressive_preprocess": "single_engine", + "single_engine": "compile_logits", + "final_step": "STOP", + } + + router = GraphRouter( + end_route="STOP", start_route="process_input", route=routes + ) + scheduler = [OperatorScheduler()] + super().__init__( + ops=ops, router=router, schedulers=scheduler, pipeline_state=pipeline_state + ) + + # TODO: Move to be part of a generic transformers set-up Operator. + def setup_onnx_file_path(self, model_path, sequence_length) -> str: + import logging + + import transformers + from transformers import AutoTokenizer + + from deepsparse.transformers.helpers import get_deployment_path + + """ + Parses ONNX model from the `model_path` provided. It additionally + creates config and tokenizer objects from the `deployment path`, + derived from the `model_path` provided. + + :return: file path to the processed ONNX file for the engine to compile + """ + deployment_path, onnx_path = get_deployment_path(model_path) + + hf_logger = logging.getLogger("transformers") + hf_logger_level = hf_logger.level + hf_logger.setLevel(logging.ERROR) + self.config = transformers.PretrainedConfig.from_pretrained( + deployment_path, + finetuning_task=self.task if hasattr(self, "task") else None, + ) + hf_logger.setLevel(hf_logger_level) + + self._trust_remote_code = False + self.tokenizer = AutoTokenizer.from_pretrained( + deployment_path, + trust_remote_code=self._trust_remote_code, + model_max_length=sequence_length, + ) + + if not self.config or not self.tokenizer: + raise RuntimeError( + "Invalid config or tokenizer provided. Please provide " + "paths to the files or ensure they exist in the `model_path` provided. " + "See `tokenizer` and `config` arguments for details." + ) + return onnx_path + + +# NOTE: This is a dummy last step which will be removed. Used as a final step +# for the current routes. 
+class FinalStep(Operator): + def can_operate(self, *args, **kwargs): + return True + + def run(self, *args, **kwargs): + import numpy + + inference_state = kwargs.get("inference_state") + prompt_logits = inference_state.current_state.get("prompt_logits") + return numpy.concatenate(prompt_logits, axis=1) diff --git a/src/deepsparse/v2/text_generation/prep_for_prefill.py b/src/deepsparse/v2/text_generation/prep_for_prefill.py new file mode 100644 index 0000000000..2f9eb15797 --- /dev/null +++ b/src/deepsparse/v2/text_generation/prep_for_prefill.py @@ -0,0 +1,57 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from typing import Any + +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import PipelineState + + +_LOGGER = logging.getLogger(__name__) + +__all__ = ["PrepareforPrefill"] + + +class PrepareforPrefill(Operator): + def __init__(self, kv_cache_creator: Operator): + """ + Operator before prefill. Responsible for creating the kv_cache based on engine + variables. Currently, this operator expects that the kv_cache_creator is + provided during initization and then uses pipeline_state to run the + kv_cache_operator. + """ + # NOTE: Alternatively, we can initialize the kv_cache_creater operator here, + # instead of at the pipeline level. + self.kv_cache_creator = kv_cache_creator + + _LOGGER.warn( + "This operator requires the PipelineState to be set-up with the " + "cache_shape, output_names, kv_cache_data_type attributes to be set " + "from the NLEngineOperator" + ) + + def run(self, tokens: Any, pipeline_state: PipelineState, **kwargs): + # NOTE: Can potentially just be class attributes instead of relying on + # pipeline state. + cache_shape = pipeline_state.current_state.get("cache_shape") + data_type = pipeline_state.current_state.get("kv_cache_data_type") + output_names = pipeline_state.current_state.get("output_names") + + kv_cache = self.kv_cache_creator.run( + cache_shape=cache_shape, + kv_cache_data_type=data_type, + output_names=output_names, + ).get("kv_cache") + return {"tokens": tokens, "kv_cache": kv_cache} diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py new file mode 100644 index 0000000000..528dcee0b7 --- /dev/null +++ b/src/deepsparse/v2/text_generation/process_inputs.py @@ -0,0 +1,121 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pathlib
+from typing import Dict, Union
+
+import transformers
+
+from deepsparse.transformers.pipelines.text_generation import TextGenerationInput
+from deepsparse.transformers.utils.helpers import (
+    check_and_return_generation_config,
+    override_config,
+    repeat_inputs,
+)
+from deepsparse.v2.operators import Operator
+
+
+class GenerationDefaults:
+    num_return_sequences = 1
+    max_length = 1024
+    max_new_tokens = None
+    output_scores = False
+    top_k = 0
+    top_p = 0.0
+    repetition_penalty = 0.0
+    do_sample = False
+    temperature = 1.0
+
+
+__all__ = ["ProcessInputsTextGeneration"]
+
+
+class ProcessInputsTextGeneration(Operator):
+    """
+    Input processing operator. Responsible for tokenizing the input, handling the
+    generation_config (if provided), updating the inference_state for later use,
+    and returning the tokens for prompt inference. The expected input is defined by
+    the input_schema, which for this operator is TextGenerationInput.
+    """
+
+    input_schema = TextGenerationInput
+
+    def __init__(
+        self,
+        tokenizer: transformers.PreTrainedTokenizerBase,
+        generation_config: Union[
+            str, pathlib.Path, Dict, transformers.GenerationConfig
+        ],
+        sequence_length: int,
+    ):
+        self.generation_config = generation_config
+        self.tokenizer = tokenizer
+        self.sequence_length = sequence_length
+
+    def run(self, inp: TextGenerationInput, **kwargs):
+        generation_config = check_and_return_generation_config(
+            self.generation_config, inp.generation_config, GenerationDefaults()
+        )
+
+        generation_config = override_config(inp.generation_kwargs, generation_config)
+
+        original_inputs = inp.sequences
+        if generation_config.num_return_sequences > 1:
+            if isinstance(inp.sequences, str):
+                inp.sequences = [inp.sequences]
+            inp.sequences = repeat_inputs(
+                inp.sequences, generation_config.num_return_sequences
+            )
+
+        if inp.fixed_sequences_length:
+            # to enforce a fixed sequence length, we need to
+            # truncate the input to the maximum sequence length
+            # and/or pad it to the maximum sequence length
+            truncate, padding = True, "max_length"
+        else:
+            # otherwise, we do not need to truncate the input
+            # and we can pad it to the longest sequence
+            # in the batch (so that the engine can process multiple inputs
+            # at once)
+            truncate, padding = False, "longest"
+
+        input_tokens = self.tokenizer(
+            inp.sequences,
+            return_tensors="np",
+            max_length=self.sequence_length,
+            padding=padding,
+            truncation=truncate,
+        )
+
+        input_ids = input_tokens["input_ids"]
+        attention_mask = input_tokens["attention_mask"]
+
+        inference_state_update = dict(
+            prompts=original_inputs,
+            streaming=inp.streaming,
+            generation_config=generation_config,
+            include_prompt_logits=inp.include_prompt_logits,
+            callback=inp.callback,
+            stop=inp.stop,
+            top_p=generation_config.top_p,
+            top_k=generation_config.top_k,
+            presence_penalty=inp.presence_penalty,
+            frequency_penalty=generation_config.repetition_penalty,
+        )
+
+        # TODO: move this step to prep_for_prefill and add attention mask to the output
+        # this will allow us to split/join more easily when processing multiple prompts
+        # in parallel
+        tokens = input_ids[attention_mask.nonzero()].tolist()
+        return {"tokens": tokens}, inference_state_update
diff --git a/src/deepsparse/v2/utils/__init__.py b/src/deepsparse/v2/utils/__init__.py
index a36d8e92ec..358405d7af 100644
--- a/src/deepsparse/v2/utils/__init__.py
+++ b/src/deepsparse/v2/utils/__init__.py
@@ -13,5 +13,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+from .state import *
 from .types import *
diff --git a/src/deepsparse/v2/utils/state.py b/src/deepsparse/v2/utils/state.py
new file mode 100644
index 0000000000..b54b890acf
--- /dev/null
+++ b/src/deepsparse/v2/utils/state.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+from abc import ABC
+from typing import Any, Union
+
+
+__all__ = ["State", "PipelineState", "InferenceState"]
+
+
+class State(ABC):
+    """
+    Abstract class to store pipeline-level and inference-level state variables which
+    are generated by some Operator, and required by some other Operator.
+    """
+
+    def __init__(self):
+        self._current_state = None
+
+    @property
+    def current_state(self):
+        return self._current_state
+
+
+class PipelineState(State):
+    """
+    Created during pipeline initialization. Pipeline state values are read-only
+    during inference.
+    """
+
+    def create_state(self, new_state: dict):
+        if self._current_state:
+            raise ValueError("State creation is only allowed during initialization.")
+        self._current_state = new_state
+
+
+class InferenceState(State):
+    """
+    Inference state, created during every inference run.
+ """ + + def create_state(self, new_state: dict): + if self._current_state: + warnings.warn("Current state already exists, overriding.") + self._current_state = new_state + + def update_value(self, attribute: str, value: Union[str, int, list]): + if not self._current_state.get(attribute): + raise ValueError(f"{attribute} is not a valid state attribute") + self._current_state[attribute] = value + + def update_state(self, value: Any): + self._current_state.update(value) diff --git a/tests/deepsparse/v2/test_basic_pipeline.py b/tests/deepsparse/v2/test_basic_pipeline.py index 9f85e4976e..bedddd537a 100644 --- a/tests/deepsparse/v2/test_basic_pipeline.py +++ b/tests/deepsparse/v2/test_basic_pipeline.py @@ -34,7 +34,7 @@ class AddOneOperator(Operator): input_schema = IntSchema output_schema = IntSchema - def run(self, inp: IntSchema) -> Dict: + def run(self, inp: IntSchema, **kwargs) -> Dict: return {"value": inp.value + 1} @@ -42,7 +42,7 @@ class AddTwoOperator(Operator): input_schema = IntSchema output_schema = IntSchema - def run(self, inp: IntSchema) -> Dict: + def run(self, inp: IntSchema, **kwargs) -> Dict: return {"value": inp.value + 2} From 59457b7ca7967c54aad1d33d7db1d6ef83924a87 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Fri, 3 Nov 2023 11:15:00 -0400 Subject: [PATCH 05/43] [Pipeline Refactor] Additional Operators, Route update and completed generation functionality (#1356) * initial functionality and working example with image classification * remove testing image * rebase fixes * initial functionality and working example with image classification * text gen * updates func * prompt inference, initial functionality * remove image; update state docstring * Fix typo * add todo for split/join * remove context, clean-up args, remove prefill_preprocess_operaator * fix docstrings * initial functionality and working example with image classification * updates func * prompt inference, initial functionality * finish generation operators and update routes * further breakdown operators * add operators * fix can_operate condition * update can_operate to not rely on the inference_state * rebase + update * fix condition * fix capacity settting again * typo fixes --- .../v2/operators/engine_operator.py | 3 + src/deepsparse/v2/text_generation/__init__.py | 7 + .../autoregressive_preprocess_operator.py | 20 ++- .../compile_generated_tokens.py | 56 +++++++ .../v2/text_generation/compile_generations.py | 55 +++++++ .../v2/text_generation/compile_logits.py | 6 + .../v2/text_generation/generate_new_token.py | 90 +++++++++++ .../multi_engine_prefill_operator.py | 1 + .../v2/text_generation/nl_engine_operator.py | 8 +- src/deepsparse/v2/text_generation/pipeline.py | 61 ++++---- .../v2/text_generation/prep_for_generation.py | 140 ++++++++++++++++++ .../v2/text_generation/process_inputs.py | 2 +- .../v2/text_generation/process_outputs.py | 88 +++++++++++ .../v2/text_generation/token_generator.py | 30 ++++ 14 files changed, 529 insertions(+), 38 deletions(-) create mode 100644 src/deepsparse/v2/text_generation/compile_generated_tokens.py create mode 100644 src/deepsparse/v2/text_generation/compile_generations.py create mode 100644 src/deepsparse/v2/text_generation/generate_new_token.py create mode 100644 src/deepsparse/v2/text_generation/prep_for_generation.py create mode 100644 src/deepsparse/v2/text_generation/process_outputs.py create mode 100644 src/deepsparse/v2/text_generation/token_generator.py diff --git a/src/deepsparse/v2/operators/engine_operator.py 
b/src/deepsparse/v2/operators/engine_operator.py index b7d920a686..c2fc562c63 100644 --- a/src/deepsparse/v2/operators/engine_operator.py +++ b/src/deepsparse/v2/operators/engine_operator.py @@ -87,6 +87,9 @@ def __init__( self._engine_args = engine_args self._engine_type = engine_type + if not engine_kwargs: + engine_kwargs = {} + self.engine = self.create_engine(**engine_kwargs) @property diff --git a/src/deepsparse/v2/text_generation/__init__.py b/src/deepsparse/v2/text_generation/__init__.py index 37ac88d02f..21cd7e2acd 100644 --- a/src/deepsparse/v2/text_generation/__init__.py +++ b/src/deepsparse/v2/text_generation/__init__.py @@ -13,12 +13,19 @@ # limitations under the License. # flake8: noqa from .autoregressive_preprocess_operator import * +from .compile_generated_tokens import * +from .compile_generations import * from .compile_logits import * +from .generate_new_token import * from .kv_cache_operator import * from .multi_engine_prefill_operator import * from .nl_engine_operator import * from .prep_for_prefill import * from .process_inputs import * +from .process_outputs import * +from .token_generator import * # isort:skip +from .prep_for_generation import * # isort:skip + from .pipeline import * # isort:skip diff --git a/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py b/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py index cfe7cb531b..6e97412e43 100644 --- a/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py +++ b/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py @@ -36,7 +36,6 @@ def __init__(self, sequence_length: int, prompt_sequence_length: int): """ self.sequence_length = sequence_length self.prompt_sequence_length = prompt_sequence_length - self.set_capacity = False _LOGGER.warn( "This operator requires the PipelineState to be set-up with the " @@ -51,16 +50,19 @@ def can_operate(self, inp: Any) -> bool: tokens = inp.get("tokens") kv_cache = inp.get("kv_cache") + if inp.get("in_generation"): + return True + remaining_tokens = len(tokens) - kv_cache.total_num_processed_tokens - if remaining_tokens > 0 and remaining_tokens < self.prompt_sequence_length: + can_process = ( + remaining_tokens > 0 and remaining_tokens < self.prompt_sequence_length + ) + if can_process and inp.get("in_generation") is None: return True return False def run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwargs): - - if not self.set_capacity: - self.set_capacity = True - kv_cache.set_capacity(self.sequence_length - 1) + kv_cache.set_capacity(self.sequence_length - 1) num_total_processed_tokens = kv_cache.total_num_processed_tokens new_token = tokens[num_total_processed_tokens] @@ -88,13 +90,9 @@ def run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwarg engine_inputs = [engine_inputs_map[name] for name in engine_input_names] - onnx_input_names_no_cache = pipeline_state.current_state.get( - "onnx_input_names_no_cache" - ) - engine_inputs = [engine_inputs_map[name] for name in onnx_input_names_no_cache] - return { "engine_inputs": engine_inputs, "kv_cache": kv_cache, "tokens": tokens, + "in_generation": kwargs.get("in_generation"), } diff --git a/src/deepsparse/v2/text_generation/compile_generated_tokens.py b/src/deepsparse/v2/text_generation/compile_generated_tokens.py new file mode 100644 index 0000000000..c87436ab3a --- /dev/null +++ b/src/deepsparse/v2/text_generation/compile_generated_tokens.py @@ -0,0 +1,56 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import InferenceState + + +__all__ = ["CompileGeneratedTokens"] + + +class CompileGeneratedTokens(Operator): + def run( + self, + new_token, + logits, + finish_reason, + kv_cache, + tokens, + inference_state: InferenceState, + **kwargs, + ): + in_generation = True + + generated_tokens = inference_state.current_state.get("generated_tokens") + generated_logits = inference_state.current_state.get("generated_logits") + finished_reason = inference_state.current_state.get("finished_reason") + + generated_tokens.append(new_token) + generated_logits.append(logits) + finished_reason.append(finish_reason) + + if finish_reason is not None: + in_generation = False + + state_update = { # TODO: check if necessary + "finished_reason": finished_reason, + "generated_tokens": generated_tokens, + "generated_logits": generated_logits, + } + + output = { + "tokens": tokens, + "kv_cache": kv_cache, + "in_generation": in_generation, + } + return output, state_update diff --git a/src/deepsparse/v2/text_generation/compile_generations.py b/src/deepsparse/v2/text_generation/compile_generations.py new file mode 100644 index 0000000000..ed8297ac01 --- /dev/null +++ b/src/deepsparse/v2/text_generation/compile_generations.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
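# ---- Illustrative aside, not part of the patch -----------------------------------
# CompileGeneratedTokens above returns an `(output, state_update)` tuple; the v2
# Pipeline merges the second element into the shared InferenceState before routing to
# the next operator. A minimal sketch of that contract, assuming only the
# InferenceState API added earlier in this PR (`create_state`, `update_state`,
# `current_state`); the operator and values below are made up for illustration and are
# not code from the diff.

from deepsparse.v2.operators import Operator
from deepsparse.v2.utils import InferenceState


class AppendTokenStub(Operator):
    """Toy operator mimicking the accumulate-then-update pattern used above."""

    def run(self, new_token: int, inference_state: InferenceState, **kwargs):
        generated = inference_state.current_state.get("generated_tokens")
        generated.append(new_token)
        # first element: payload passed to the next operator in the route
        # second element: dict merged into InferenceState by the pipeline
        return {"in_generation": True}, {"generated_tokens": generated}


state = InferenceState()
state.create_state({"generated_tokens": []})
output, state_update = AppendTokenStub().run(42, inference_state=state)
state.update_state(state_update)  # this is what the pipeline does with the tuple
assert state.current_state["generated_tokens"] == [42]
# -----------------------------------------------------------------------------------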
+from typing import Any + +import numpy +from pydantic import BaseModel, Field + +from deepsparse.transformers.pipelines.text_generation import FinishReason +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import InferenceState + + +__all__ = ["CompileGenerations", "CompileGenerationsOutput"] + + +class CompileGenerationsOutput(BaseModel): + generated_tokens: Any = Field(description="generated_tokens") + generated_logits: Any = Field(description="generated_logits") + finished_reason: Any = Field(description="finished_reason") + + +class CompileGenerations(Operator): + output_schema = CompileGenerationsOutput + + def can_operate(self, inp: Any): + if inp.get("in_generation") is False: + return True + return False + + def run(self, inference_state: InferenceState, **kwargs): + generated_tokens = inference_state.current_state.get("generated_tokens") + generated_logits = inference_state.current_state.get("generated_logits") + finished_reason = inference_state.current_state.get("finished_reason") + + if len(finished_reason) == 0: + finished_reason.append(FinishReason.LENGTH) + + generated_tokens = numpy.array([generated_tokens]) + generated_logits = numpy.concatenate(generated_logits, axis=1) + return { + "generated_tokens": generated_tokens, + "generated_logits": generated_logits, + "finished_reason": finished_reason, + } diff --git a/src/deepsparse/v2/text_generation/compile_logits.py b/src/deepsparse/v2/text_generation/compile_logits.py index 55c87d791d..21bd50e03e 100644 --- a/src/deepsparse/v2/text_generation/compile_logits.py +++ b/src/deepsparse/v2/text_generation/compile_logits.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Any from deepsparse.v2.operators import Operator from deepsparse.v2.utils import InferenceState @@ -27,6 +28,11 @@ class CompilePromptLogits(Operator): take prompt logits from each iteration run and update the inference state. """ + def can_operate(self, inp: Any): + if inp.get("in_generation") is None: + return True + return False + def run(self, logits, inference_state: InferenceState, **kwargs): logit_type = "prompt_logits" diff --git a/src/deepsparse/v2/text_generation/generate_new_token.py b/src/deepsparse/v2/text_generation/generate_new_token.py new file mode 100644 index 0000000000..33ab546e39 --- /dev/null +++ b/src/deepsparse/v2/text_generation/generate_new_token.py @@ -0,0 +1,90 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
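# ---- Illustrative aside, not part of the patch -----------------------------------
# The `can_operate` hooks defined on the operators in this commit (CompilePromptLogits,
# CompileGenerations above, GenerateNewTokenOperator below) are what the GraphRouter
# uses to pick one successor when a route lists several candidates, e.g.
# "single_engine": ["compile_logits", "generate_new_token"] in pipeline.py. A rough
# sketch of that selection step, assuming the router returns the first candidate whose
# `can_operate` accepts the previous operator's output; this illustrates the idea and
# is not the exact implementation in deepsparse/v2/routers/router.py.

from typing import Any, Dict, List, Union


def pick_next(
    current: str,
    routes: Dict[str, Union[str, List[str]]],
    ops: Dict[str, Any],
    output: Any,
) -> str:
    """Return the name of the node to run after `current` for the given output."""
    candidates = routes[current]
    if isinstance(candidates, str):
        return candidates  # single outgoing edge, nothing to decide
    for name in candidates:
        if ops[name].can_operate(output):
            return name
    raise RuntimeError(f"no candidate after {current} can process the given output")


# During prefill the engine output has in_generation=None, so CompilePromptLogits
# accepts it and "compile_logits" is chosen; once generation starts
# (in_generation=True), the same edge routes to "generate_new_token" instead.
# -----------------------------------------------------------------------------------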
+from typing import Any, Sequence, Union + +import transformers + +from deepsparse.transformers.pipelines.text_generation import FinishReason +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import InferenceState + + +__all__ = ["GenerateNewTokenOperator"] + + +class GenerateNewTokenOperator(Operator): + def __init__( + self, tokenizer: transformers.PreTrainedTokenizerBase, force_max_tokens: bool + ): + self.force_max_tokens = force_max_tokens + self.tokenizer = tokenizer + + def can_operate(self, inp: Any): + if inp.get("in_generation"): + return True + return False + + def run(self, logits, kv_cache, inference_state: InferenceState, **kwargs): + token_generator = inference_state.current_state.get("token_generator") + token = token_generator.generate(logits=logits[0, -1, :]) + finish_reason = None + + callback = inference_state.current_state.get("callback") + stop = inference_state.current_state.get("stop") + + if token == self.tokenizer.eos_token_id and not self.force_max_tokens: + finish_reason = FinishReason.STOP + + if self._stop_token_generated(token, stop_tokens=stop): + print( + "Stop token %s generated. Stopping generation." + % self.tokenizer.decode(token) + ) + finish_reason = FinishReason.STOP + + if callback is not None and callback(token) is False: + print( + "callback %s returned False, stopping generation." + % callback.__qualname__ + ) + finish_reason = FinishReason.CALLBACK + + max_tokens = inference_state.current_state.get("max_tokens") + if len(inference_state.current_state.get("generated_tokens")) + 1 == max_tokens: + finish_reason = inference_state.current_state.get("length_finish_reason") + + state_update = { + "token_generator": token_generator, + } + + new_generation = { + "logits": logits, + "new_token": token, + "finish_reason": finish_reason, + } + output = {"tokens": token_generator.tokens, "kv_cache": kv_cache} + output.update(new_generation) + return output, state_update + + def _stop_token_generated( + self, token, stop_tokens: Union[None, str, Sequence[str]] + ) -> bool: + if stop_tokens is None: + return False + + decoded_token = self.tokenizer.decode(token) + decoded_token = ( + decoded_token if decoded_token.isspace() else decoded_token.strip() + ) + return decoded_token in stop_tokens diff --git a/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py b/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py index 41ee830a8a..9a885c2355 100644 --- a/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py +++ b/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py @@ -97,6 +97,7 @@ def _case_positions(self, num_total_processed_tokens: int): ) def run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwargs): + kv_cache.set_capacity(self.sequence_length - self.prompt_sequence_length) onnx_input_names_no_cache = pipeline_state.current_state.get( "onnx_input_names_no_cache" diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index 6c1ad1966e..0bd9098a40 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -36,6 +36,7 @@ class NlEngineInput(BaseModel): engine_inputs: List = Field(description="engine inputs") kv_cache: Any = Field(description="kv_cache object") tokens: List = Field(description="tokens") + in_generation: bool = Field(description="in_generation", default=None) class NLEngineOperator(EngineOperator): @@ 
-119,7 +120,12 @@ def run(self, inp: NlEngineInput, **kwargs) -> Any: kv_cache=kv_cache, ) - output = {"logits": logits, "kv_cache": kv_cache, "tokens": inp.tokens} + output = { + "logits": logits, + "kv_cache": kv_cache, + "tokens": inp.tokens, + "in_generation": inp.in_generation, + } return output def _add_kv_cache_to_input(self, engine_input, kv_cache): diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 9878aa0061..49826b8af7 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -15,18 +15,23 @@ from typing import Dict from deepsparse.transformers.utils.helpers import process_generation_config -from deepsparse.v2.operators import Operator from deepsparse.v2.pipeline import Pipeline from deepsparse.v2.routers import GraphRouter from deepsparse.v2.schedulers import OperatorScheduler from deepsparse.v2.text_generation import ( AutoRegressiveOperatorPreprocess, + CompileGeneratedTokens, + CompileGenerations, CompilePromptLogits, + GenerateNewTokenOperator, KVCacheCreator, MultiEnginePrefill, NLEngineOperator, PrepareforPrefill, + PrepareGeneration, ProcessInputsTextGeneration, + ProcessOutputs, + TokenGeneratorOperator, ) from deepsparse.v2.utils import PipelineState @@ -109,17 +114,23 @@ def __init__( sequence_length=sequence_length, ) compile_prompt_logits = CompilePromptLogits() - """ - prep_for_single_engine = PrepareforSingleEngine( - prompt_sequence_length=prompt_sequence_length, + + autoregressive_preprocess = AutoRegressiveOperatorPreprocess( sequence_length=sequence_length, + prompt_sequence_length=prompt_sequence_length, ) - """ - autoregressive_preprocess = AutoRegressiveOperatorPreprocess( + token_generator = TokenGeneratorOperator() + prep_for_generation = PrepareGeneration( sequence_length=sequence_length, prompt_sequence_length=prompt_sequence_length, + token_generator=token_generator, + ) + generate_new_token = GenerateNewTokenOperator( + tokenizer=self.tokenizer, force_max_tokens=force_max_tokens ) - final_step = FinalStep() + process_output = ProcessOutputs(tokenizer=self.tokenizer) + compile_generations = CompileGenerations() + compile_generated_tokens = CompileGeneratedTokens() ops = { "process_input": process_inputs, @@ -130,7 +141,11 @@ def __init__( "multi_engine_prefill": multi_engine_prefill, "compile_logits": compile_prompt_logits, "autoregressive_preprocess": autoregressive_preprocess, - "final_step": final_step, + "prep_for_generation": prep_for_generation, + "generate_new_token": generate_new_token, + "process_outputs": process_output, + "compile_generations": compile_generations, + "compile_generated_tokens": compile_generated_tokens, } routes = { @@ -140,12 +155,22 @@ def __init__( "multi_engine": "compile_logits", "compile_logits": [ "multi_engine_prefill", + "prep_for_generation", "autoregressive_preprocess", - "final_step", ], "autoregressive_preprocess": "single_engine", - "single_engine": "compile_logits", - "final_step": "STOP", + "single_engine": [ + "compile_logits", + "generate_new_token", + ], + "prep_for_generation": "autoregressive_preprocess", + "generate_new_token": "compile_generated_tokens", + "compile_generated_tokens": [ + "autoregressive_preprocess", + "compile_generations", + ], + "compile_generations": "process_outputs", + "process_outputs": "STOP", } router = GraphRouter( @@ -197,17 +222,3 @@ def setup_onnx_file_path(self, model_path, sequence_length) -> str: "See `tokenizer` and `config` arguments for details." 
) return onnx_path - - -# NOTE: This is a dummy last step which will be removed. Used as a final step -# for the current routes. -class FinalStep(Operator): - def can_operate(self, *args, **kwargs): - return True - - def run(self, *args, **kwargs): - import numpy - - inference_state = kwargs.get("inference_state") - prompt_logits = inference_state.current_state.get("prompt_logits") - return numpy.concatenate(prompt_logits, axis=1) diff --git a/src/deepsparse/v2/text_generation/prep_for_generation.py b/src/deepsparse/v2/text_generation/prep_for_generation.py new file mode 100644 index 0000000000..544af43980 --- /dev/null +++ b/src/deepsparse/v2/text_generation/prep_for_generation.py @@ -0,0 +1,140 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any + +import numpy + +from deepsparse.transformers.pipelines.text_generation import FinishReason +from deepsparse.v2.operators import Operator +from deepsparse.v2.text_generation import TokenGeneratorOperator +from deepsparse.v2.utils import InferenceState + + +__all__ = ["PrepareGeneration"] + + +class PrepareGeneration(Operator): + def __init__( + self, + token_generator: TokenGeneratorOperator, + prompt_sequence_length: int, + sequence_length: int, + ): + self.prompt_sequence_length = prompt_sequence_length + self.sequence_length = sequence_length + self.token_generator_creator = token_generator + + def can_operate(self, inp: Any): + kv_cache = inp.get("kv_cache") + tokens = inp.get("tokens") + + # If the number of prompt tokens is greater than what we've processed, + # don't start generation. Should be equal when started as all prompt logits + # should be accounted for and we should have updated the kv_cache for the single + # token engine. + if len(tokens) == kv_cache.total_num_processed_tokens: + return True + return False + + @staticmethod + def set_generated_length( + max_length: int, + prompt_tokens_length: int, + sequence_length: int, + prompt_sequence_length: int, + max_new_tokens: int, + finish_reason_choices: "FinishReason", # noqa + ): + """ + Determine the length of the generated tokens. The hard cap on the total number + of tokens is based on the sequence length. If max_length is provided and is less + than the sequence length, it will be used to cap the total number of tokens + generated. If it is not provided, the max_new_tokens attribute will be used and + also capped by the sequence length. 
+ + :param max_length: max_length attribute, provided as input during inference + :param prompt_tokens_length: the number of prompt tokens used as part of the + generated output + :param sequence_length: the sequence length used for the pipeline + :param prompt_sequence_length: the prompt sequence length used for the pipeline + :param max_new_tokens: the max_new_tokens attribute, which may be provided + as part of the input during inference + """ + if max_length: + # if max_length provided, use that to cap total tokens generated + max_tokens = max_length + finish_reason = finish_reason_choices.LENGTH + else: + # if not provided, max tokens is based on max_new_tokens + prompt tokens + max_tokens = ( + min(max_new_tokens, sequence_length - prompt_sequence_length) + + prompt_tokens_length + ) + finish_reason = finish_reason_choices.MAX_NEW_TOKENS + + # hard model/pipeline cap + return ( + (sequence_length, finish_reason_choices.CAPACITY) + if sequence_length < max_tokens + else (max_tokens, finish_reason) + ) + + def run( + self, tokens: Any, kv_cache: Any, inference_state: InferenceState, **kwargs + ): + prompt_logits = inference_state.current_state.get("prompt_logits") + prompt_logits = numpy.concatenate(prompt_logits, axis=1) + # TODO: clean this up such that dont have to keep writing current_state + # everywhere + + generation_config = inference_state.current_state.get("generation_config") + include_prompt_logits = inference_state.current_state.get( + "include_prompt_logits" + ) + + token_generator_creator_output = self.token_generator_creator.run( + logits_shape=prompt_logits[0, -1, :].shape, + deterministic=not generation_config.do_sample, + sampling_temperature=generation_config.temperature, + tokens=tokens, + **inference_state.current_state, + ) + token_generator = token_generator_creator_output.get("token_generator") + token_generator.generate(prompt_logits[0, -1, :]) + + max_tokens, length_finish_reason = PrepareGeneration.set_generated_length( + max_length=generation_config.max_length, + prompt_tokens_length=1, + max_new_tokens=generation_config.max_new_tokens, + sequence_length=self.sequence_length, + prompt_sequence_length=self.prompt_sequence_length, + finish_reason_choices=FinishReason, + ) + state_update = { + "max_tokens": max_tokens, + "length_finish_reason": length_finish_reason, + "generated_tokens": [token_generator.tokens[-1]], + "generated_logits": [prompt_logits] + if include_prompt_logits + else [numpy.expand_dims(prompt_logits[:, -1, :], 0)], + "finished_reason": [], + "token_generator": token_generator, + } + + output = { + "tokens": token_generator.tokens, + "kv_cache": kv_cache, + "in_generation": True, + } + return output, state_update diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py index 528dcee0b7..e57e402983 100644 --- a/src/deepsparse/v2/text_generation/process_inputs.py +++ b/src/deepsparse/v2/text_generation/process_inputs.py @@ -28,7 +28,7 @@ class GenerationDefaults: num_return_sequences = 1 - max_length = 1024 + max_length = 100 max_new_tokens = None output_scores = False top_k = 0 diff --git a/src/deepsparse/v2/text_generation/process_outputs.py b/src/deepsparse/v2/text_generation/process_outputs.py new file mode 100644 index 0000000000..ca1cf78521 --- /dev/null +++ b/src/deepsparse/v2/text_generation/process_outputs.py @@ -0,0 +1,88 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import datetime +from typing import Optional + +import numpy + +from deepsparse.transformers.pipelines.text_generation import ( + FinishReason, + GeneratedText, + TextGenerationOutput, +) +from deepsparse.v2.operators import Operator +from deepsparse.v2.text_generation.compile_generations import CompileGenerationsOutput +from deepsparse.v2.utils import InferenceState + + +class ProcessOutputs(Operator): + output_schema = TextGenerationOutput + + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + def _create_generated_text_output( + self, + sequence: str, + finish_reason: Optional[FinishReason] = None, + logits: Optional[numpy.array] = None, + ): + if finish_reason: + return GeneratedText( + text=sequence, + score=logits, + finished=True, + finished_reason=finish_reason.value, + ) + return GeneratedText( + text=sequence, + score=logits, + finished=False, + ) + + def run( + self, inp: CompileGenerationsOutput, inference_state: InferenceState, **kwargs + ): + generation_config = inference_state.current_state.get("generation_config") + generated_tokens = inp.generated_tokens + generated_logits = ( + inp.generated_logits if generation_config.output_scores else None + ) + finished_reason = inp.finished_reason + sequences = self.tokenizer.batch_decode( + generated_tokens, skip_special_tokens=True + ) + + finished_reason = [f for f in finished_reason if f] + + if generated_logits is not None: + generations = list( + map( + self._create_generated_text_output, + sequences, + finished_reason, + generated_logits, + ) + ) + else: + generations = list( + map(self._create_generated_text_output, sequences, finished_reason) + ) + outputs = dict( + created=datetime.datetime.now(), + prompts=inference_state.current_state.get("prompts"), + generations=generations, + ) + + return outputs diff --git a/src/deepsparse/v2/text_generation/token_generator.py b/src/deepsparse/v2/text_generation/token_generator.py new file mode 100644 index 0000000000..9148d71cc8 --- /dev/null +++ b/src/deepsparse/v2/text_generation/token_generator.py @@ -0,0 +1,30 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
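# ---- Illustrative aside, not part of the patch -----------------------------------
# The token budget computed in prep_for_generation.py earlier in this commit is easier
# to follow with concrete numbers. A small worked example of
# PrepareGeneration.set_generated_length as defined above; the argument values here
# are made up for illustration only.

from deepsparse.transformers.pipelines.text_generation import FinishReason
from deepsparse.v2.text_generation import PrepareGeneration

# No explicit max_length, so max_new_tokens plus the single prompt token already
# produced drives the cap: min(10, 128 - 16) + 1 == 11.
max_tokens, length_reason = PrepareGeneration.set_generated_length(
    max_length=None,
    prompt_tokens_length=1,
    sequence_length=128,
    prompt_sequence_length=16,
    max_new_tokens=10,
    finish_reason_choices=FinishReason,
)
assert (max_tokens, length_reason) == (11, FinishReason.MAX_NEW_TOKENS)
# Had the computed cap exceeded sequence_length, the helper would instead return
# (sequence_length, FinishReason.CAPACITY).
# -----------------------------------------------------------------------------------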
+from deepsparse.transformers.utils.token_generator import TokenGenerator +from deepsparse.v2.operators import Operator + + +__all__ = ["TokenGeneratorOperator"] + + +class TokenGeneratorOperator(Operator): + def run(self, logits_shape, deterministic, tokens, sampling_temperature, **kwargs): + token_generator = TokenGenerator( + logits_shape=logits_shape, + deterministic=deterministic, + tokens=tokens, + sampling_temperature=sampling_temperature, + **kwargs, + ) + return {"token_generator": token_generator} From f18d5f3c4a3a6f9431787ae36a3cdfcabaacdd91 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Fri, 3 Nov 2023 15:24:15 -0400 Subject: [PATCH 06/43] add split/join functionality --- .../v2/operators/engine_operator.py | 18 +---- src/deepsparse/v2/operators/operator.py | 13 ---- src/deepsparse/v2/pipeline.py | 71 ++++++++++++++++++- src/deepsparse/v2/routers/router.py | 4 +- src/deepsparse/v2/schedulers/scheduler.py | 17 +++++ .../v2/schedulers/scheduler_group.py | 27 +++---- src/deepsparse/v2/text_generation/__init__.py | 1 + .../v2/text_generation/join_output.py | 70 ++++++++++++++++++ src/deepsparse/v2/text_generation/pipeline.py | 28 ++++++-- .../v2/text_generation/prep_for_prefill.py | 9 ++- .../v2/text_generation/process_inputs.py | 9 ++- .../v2/text_generation/process_outputs.py | 16 ++--- 12 files changed, 216 insertions(+), 67 deletions(-) create mode 100644 src/deepsparse/v2/text_generation/join_output.py diff --git a/src/deepsparse/v2/operators/engine_operator.py b/src/deepsparse/v2/operators/engine_operator.py index c2fc562c63..bd58aefafa 100644 --- a/src/deepsparse/v2/operators/engine_operator.py +++ b/src/deepsparse/v2/operators/engine_operator.py @@ -20,7 +20,7 @@ from deepsparse import Context as EngineContext from deepsparse import Engine, MultiModelEngine, Scheduler from deepsparse.benchmark import ORTEngine -from deepsparse.utils import join_engine_outputs, model_to_path, split_engine_inputs +from deepsparse.utils import model_to_path from deepsparse.v2.operators import Operator @@ -145,18 +145,6 @@ def run(self, inp: EngineOperatorInputs, **kwargs) -> Dict: # planned refactor engine_outputs = inp.engine(inp.engine_inputs) return {"engine_outputs": engine_outputs} - inp = inp.engine_inputs - batches, orig_batch_size = self.expand_inputs(engine_inputs=inp) - batches_outputs = list(map(self.engine, batches)) - engine_outputs = self.condense_inputs( - batch_outputs=batches_outputs, orig_batch_size=orig_batch_size - ) - return {"engine_outputs": engine_outputs} - def expand_inputs(self, **kwargs): - return split_engine_inputs(kwargs["engine_inputs"], self._batch_size) - - def condense_inputs(self, **kwargs): - batch_outputs = kwargs["batch_outputs"] - orig_batch_size = kwargs["orig_batch_size"] - return join_engine_outputs(batch_outputs, orig_batch_size) + engine_outputs = self.engine(inp.engine_inputs) + return {"engine_outputs": engine_outputs} diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py index b3963d8223..5bb0be841a 100644 --- a/src/deepsparse/v2/operators/operator.py +++ b/src/deepsparse/v2/operators/operator.py @@ -99,7 +99,6 @@ def __call__( pipeline_state=pipeline_state, **kwargs, ) - if self.has_output_schema(): return self.output_schema(**run_output) return run_output @@ -117,18 +116,6 @@ def can_operate(self, inp: Any) -> bool: """ return True - def expand_inputs(self, **kwargs): - """ - Generic function to handle expanding values. 
- """ - raise NotImplementedError - - def condense_inputs(self, **kwargs): - """ - Generic function to handle condensing values. - """ - raise NotImplementedError - def yaml(self): pass diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py index 0a8c8b2f93..79667fdc3a 100644 --- a/src/deepsparse/v2/pipeline.py +++ b/src/deepsparse/v2/pipeline.py @@ -13,7 +13,9 @@ # limitations under the License. -from typing import Dict, List, Union +import copy +from functools import partial +from typing import Any, Dict, List, Union from deepsparse.v2.operators import Operator from deepsparse.v2.routers import Router @@ -59,6 +61,55 @@ def __init__( # SchedulerGroup handles running all schedulers in order of priority self._scheduler_group = SchedulerGroup(self.schedulers) + def _run_sequential( + self, + inp: Any, + inference_state: InferenceState, + pipeline_state: PipelineState, + start: str, + end: str, + ): + # TODO: somehow refactor to prevent repeat code. + next_step = start + while next_step != end: + operator = self.ops[next_step] + if isinstance(inp, dict): + operator_output = operator( + pipeline_state=pipeline_state, + inference_state=inference_state, + **inp, + ) + else: + operator_output = operator( + inp, pipeline_state=pipeline_state, inference_state=inference_state + ) + if isinstance(operator_output, tuple): + state_update = operator_output[-1] + operator_output = operator_output[0] + inference_state.update_state(state_update) + + next_step = self.router.next(next_step, self.ops, operator_output) + inp = operator_output + return inp + + def _apply_split(self, inp: Any, inference_state: InferenceState): + + batches, orig_batch_size = self.expand_inputs(inp, 1) + run_with_state = partial( + self._run_sequential, + pipeline_state=self.pipeline_state, + start=self.router.route[self.router.SPLIT_ROUTE], + end=self.router.END_SPLIT, + ) + inference_state_list = [ + copy.deepcopy(inference_state) for x in range(len(batches)) + ] + outputs = self._scheduler_group.map( + batches, inference_state_list, func=run_with_state + ) + outputs = self.condense_inputs(outputs) + return outputs + def run( self, *args, @@ -78,7 +129,11 @@ def run( operator_output = None while next_step != self.router.END_ROUTE: - # Either a dictionary key or valid index + # Split_Route should be after Start_Route + if next_step == self.router.SPLIT_ROUTE: + operator_output = self._apply_split(operator_output, inference_state) + next_step = self.router.route[self.router.END_SPLIT] + operator = self.ops[next_step] if next_step == self.router.START_ROUTE: output_future = self._scheduler_group.submit( @@ -136,6 +191,18 @@ def __call__(self, *args, **kwargs): return self.run(*args, **kwargs) + def expand_inputs(self, *args, **kwargs): + """ + Generic function to handle expanding values. + """ + raise NotImplementedError + + def condense_inputs(self, *args, **kwargs): + """ + Generic function to handle condensing values. + """ + raise NotImplementedError + def validate(self): """ Validate that compatability of the router and operators provided. diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py index d1110d4ca7..93bc059ddb 100644 --- a/src/deepsparse/v2/routers/router.py +++ b/src/deepsparse/v2/routers/router.py @@ -128,8 +128,10 @@ class GraphRouter(Router): where `can_operate` returns True will run. Paths should be deterministic. 
""" - def __init__(self, end_route: str, start_route: str, route: Dict): + def __init__(self, end_route: str, start_route: str, route: Dict, **kwargs): super().__init__(end_route=end_route, start_route=start_route, route=route) + self.SPLIT_ROUTE = kwargs.get("split_route") + self.END_SPLIT = kwargs.get("end_split") def next( self, diff --git a/src/deepsparse/v2/schedulers/scheduler.py b/src/deepsparse/v2/schedulers/scheduler.py index 78a58e3389..617936d509 100644 --- a/src/deepsparse/v2/schedulers/scheduler.py +++ b/src/deepsparse/v2/schedulers/scheduler.py @@ -14,6 +14,7 @@ from concurrent.futures import Future, ThreadPoolExecutor +from typing import Callable from deepsparse.v2.operators import Operator @@ -52,6 +53,22 @@ def submit( **kwargs, ) + def can_map(self, *args): + """ + args containing list of inputs to be used for each worker. This function if we + have sufficient workes available + """ + if len(args[0]) <= self._threadpool._max_workers: + return True + return False + + def map(self, *args, func: Callable): + """ + :param func: Callable to run as part of the map function + args containing a list of function variables to map + """ + return list(self._threadpool.map(func, *args)) + def can_process( self, *args, diff --git a/src/deepsparse/v2/schedulers/scheduler_group.py b/src/deepsparse/v2/schedulers/scheduler_group.py index 40b5695f22..8557325c9a 100644 --- a/src/deepsparse/v2/schedulers/scheduler_group.py +++ b/src/deepsparse/v2/schedulers/scheduler_group.py @@ -14,7 +14,7 @@ from concurrent.futures import Future -from typing import List +from typing import Callable, List from deepsparse.v2.operators import Operator from deepsparse.v2.schedulers.scheduler import OperatorScheduler @@ -56,22 +56,13 @@ def submit( **kwargs, ) - def can_process( - self, - *args, - operator: Operator, - **kwargs, - ) -> bool: + def map(self, *args, func: Callable): """ - :param operator: operator to check - :return: True if this Operator can process the given operator and input. - SchedulerGroup always returns True + :param operator: operator to run + :return: list of outputs from multiple workers """ - return any( - scheduler.can_process( - *args, - operator=operator, - **kwargs, - ) - for scheduler in self.schedulers - ) + for scheduler in self.schedulers: + if scheduler.can_map( + args[0], + ): + return scheduler.map(*args, func=func) diff --git a/src/deepsparse/v2/text_generation/__init__.py b/src/deepsparse/v2/text_generation/__init__.py index 21cd7e2acd..08836b8bbe 100644 --- a/src/deepsparse/v2/text_generation/__init__.py +++ b/src/deepsparse/v2/text_generation/__init__.py @@ -17,6 +17,7 @@ from .compile_generations import * from .compile_logits import * from .generate_new_token import * +from .join_output import * from .kv_cache_operator import * from .multi_engine_prefill_operator import * from .nl_engine_operator import * diff --git a/src/deepsparse/v2/text_generation/join_output.py b/src/deepsparse/v2/text_generation/join_output.py new file mode 100644 index 0000000000..8a6c77a2f1 --- /dev/null +++ b/src/deepsparse/v2/text_generation/join_output.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +import numpy + +from deepsparse.transformers.utils.helpers import pad_to_fixed_length +from deepsparse.v2.operators import Operator +from deepsparse.v2.text_generation.compile_generations import CompileGenerationsOutput + + +__all__ = ["JoinOutput"] + + +class JoinOutput(Operator): + """ + Run this operator to combine the results from multiple prompts. + """ + + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + def run(self, inp: List[CompileGenerationsOutput], **kwargs): + batch_outputs = [x for x in inp[0]] + generated_tokens = [x.generated_tokens for x in batch_outputs] + generated_logits = [x.generated_logits for x in batch_outputs] + finished_reason = [x.finished_reason for x in batch_outputs] + + max_len = max(token.shape[1] for token in generated_tokens) + + # pad all tokens to the same length + tokens = [ + pad_to_fixed_length( + array=prediction, + max_len=max_len, + value=self.tokenizer.pad_token_id, + axis=1, + ) + for prediction in generated_tokens + ] + + # find the longest sequence in the batch of logits + max_len = max(logits.shape[1] for logits in generated_logits) + + # pad all logits to the same length + logits = [ + pad_to_fixed_length(array=single_logits, max_len=max_len, axis=1) + for single_logits in generated_logits + ] + + tokens = numpy.concatenate(tokens) + logits = numpy.concatenate(logits) + + return { + "generated_tokens": tokens, + "generated_logits": logits, + "finished_reason": finished_reason, + } diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 49826b8af7..a24c37ed90 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -15,6 +15,7 @@ from typing import Dict from deepsparse.transformers.utils.helpers import process_generation_config +from deepsparse.utils import split_engine_inputs from deepsparse.v2.pipeline import Pipeline from deepsparse.v2.routers import GraphRouter from deepsparse.v2.schedulers import OperatorScheduler @@ -24,6 +25,7 @@ CompileGenerations, CompilePromptLogits, GenerateNewTokenOperator, + JoinOutput, KVCacheCreator, MultiEnginePrefill, NLEngineOperator, @@ -131,6 +133,7 @@ def __init__( process_output = ProcessOutputs(tokenizer=self.tokenizer) compile_generations = CompileGenerations() compile_generated_tokens = CompileGeneratedTokens() + join_output = JoinOutput(tokenizer=self.tokenizer) ops = { "process_input": process_inputs, @@ -146,10 +149,12 @@ def __init__( "process_outputs": process_output, "compile_generations": compile_generations, "compile_generated_tokens": compile_generated_tokens, + "join_output": join_output, } routes = { - "process_input": "prepare_prefill", + "process_input": "SPLIT", + "SPLIT": "prepare_prefill", "prepare_prefill": ["multi_engine_prefill", "autoregressive_preprocess"], "multi_engine_prefill": "multi_engine", "multi_engine": "compile_logits", @@ -169,18 +174,33 @@ def __init__( "autoregressive_preprocess", "compile_generations", ], - "compile_generations": "process_outputs", + "compile_generations": "JOIN", + "JOIN": 
"join_output", + "join_output": "process_outputs", "process_outputs": "STOP", } router = GraphRouter( - end_route="STOP", start_route="process_input", route=routes + end_route="STOP", + start_route="process_input", + route=routes, + split_route="SPLIT", + end_split="JOIN", ) - scheduler = [OperatorScheduler()] + scheduler = [OperatorScheduler(), OperatorScheduler(max_workers=4)] super().__init__( ops=ops, router=router, schedulers=scheduler, pipeline_state=pipeline_state ) + def expand_inputs(self, items, batch_size): + items = [items.get(key) for key in items.keys()] + out, orig_batch_size = split_engine_inputs(items, batch_size) + combined_batches = [{"input_ids": b[0], "attention_mask": b[1]} for b in out] + return combined_batches, orig_batch_size + + def condense_inputs(self, *args, **kwargs): + return args[0], kwargs + # TODO: Move to be part of a generic transformers set-up Operator. def setup_onnx_file_path(self, model_path, sequence_length) -> str: import logging diff --git a/src/deepsparse/v2/text_generation/prep_for_prefill.py b/src/deepsparse/v2/text_generation/prep_for_prefill.py index 2f9eb15797..2e5fecb3e8 100644 --- a/src/deepsparse/v2/text_generation/prep_for_prefill.py +++ b/src/deepsparse/v2/text_generation/prep_for_prefill.py @@ -42,13 +42,20 @@ def __init__(self, kv_cache_creator: Operator): "from the NLEngineOperator" ) - def run(self, tokens: Any, pipeline_state: PipelineState, **kwargs): + def run( + self, + input_ids: Any, + attention_mask: Any, + pipeline_state: PipelineState, + **kwargs, + ): # NOTE: Can potentially just be class attributes instead of relying on # pipeline state. cache_shape = pipeline_state.current_state.get("cache_shape") data_type = pipeline_state.current_state.get("kv_cache_data_type") output_names = pipeline_state.current_state.get("output_names") + tokens = input_ids[attention_mask.nonzero()].tolist() kv_cache = self.kv_cache_creator.run( cache_shape=cache_shape, kv_cache_data_type=data_type, diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py index e57e402983..5d47c8ff39 100644 --- a/src/deepsparse/v2/text_generation/process_inputs.py +++ b/src/deepsparse/v2/text_generation/process_inputs.py @@ -114,8 +114,7 @@ def run(self, inp: TextGenerationInput, **kwargs): frequency_penalty=generation_config.repetition_penalty, ) - # TODO: move this step to prep_for_prefill and add attention mask to the output - # this will allow us to split/join more easily when processing multiple prompts - # in parallel - tokens = input_ids[attention_mask.nonzero()].tolist() - return {"tokens": tokens}, inference_state_update + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + }, inference_state_update diff --git a/src/deepsparse/v2/text_generation/process_outputs.py b/src/deepsparse/v2/text_generation/process_outputs.py index ca1cf78521..ba301bdae6 100644 --- a/src/deepsparse/v2/text_generation/process_outputs.py +++ b/src/deepsparse/v2/text_generation/process_outputs.py @@ -22,7 +22,6 @@ TextGenerationOutput, ) from deepsparse.v2.operators import Operator -from deepsparse.v2.text_generation.compile_generations import CompileGenerationsOutput from deepsparse.v2.utils import InferenceState @@ -52,19 +51,20 @@ def _create_generated_text_output( ) def run( - self, inp: CompileGenerationsOutput, inference_state: InferenceState, **kwargs + self, + generated_tokens: numpy.ndarray, + generated_logits: numpy.ndarray, + finished_reason: list, + inference_state: InferenceState, + 
**kwargs, ): generation_config = inference_state.current_state.get("generation_config") - generated_tokens = inp.generated_tokens - generated_logits = ( - inp.generated_logits if generation_config.output_scores else None - ) - finished_reason = inp.finished_reason + generated_logits = generated_logits if generation_config.output_scores else None sequences = self.tokenizer.batch_decode( generated_tokens, skip_special_tokens=True ) - finished_reason = [f for f in finished_reason if f] + finished_reason = [f[-1] for f in finished_reason] if generated_logits is not None: generations = list( From 2c4d23124427e5f99400cc5ce4c79508e6ae436f Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 7 Nov 2023 16:21:55 -0500 Subject: [PATCH 07/43] update router to include split/join in parent class, refactor pipeline code to remove repeat code, update map function --- src/deepsparse/v2/pipeline.py | 136 +++++++++++------- src/deepsparse/v2/routers/router.py | 13 +- src/deepsparse/v2/schedulers/scheduler.py | 17 --- .../v2/schedulers/scheduler_group.py | 13 +- src/deepsparse/v2/text_generation/pipeline.py | 8 +- 5 files changed, 101 insertions(+), 86 deletions(-) diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py index 79667fdc3a..f56680d2b9 100644 --- a/src/deepsparse/v2/pipeline.py +++ b/src/deepsparse/v2/pipeline.py @@ -14,8 +14,9 @@ import copy +from concurrent.futures import Future from functools import partial -from typing import Any, Dict, List, Union +from typing import Any, Callable, Dict, List, Union from deepsparse.v2.operators import Operator from deepsparse.v2.routers import Router @@ -58,7 +59,6 @@ def __init__( self.pipeline_state = pipeline_state self.validate() - # SchedulerGroup handles running all schedulers in order of priority self._scheduler_group = SchedulerGroup(self.schedulers) def _run_sequential( @@ -69,46 +69,77 @@ def _run_sequential( start: str, end: str, ): - # TODO: somehow refactor to prevent repeat code. next_step = start while next_step != end: - operator = self.ops[next_step] - if isinstance(inp, dict): - operator_output = operator( - pipeline_state=pipeline_state, - inference_state=inference_state, - **inp, - ) - else: - operator_output = operator( - inp, pipeline_state=pipeline_state, inference_state=inference_state - ) - if isinstance(operator_output, tuple): - state_update = operator_output[-1] - operator_output = operator_output[0] + outputs = self._run_next_step( + func=self.ops[next_step], + next_step=next_step, + input=inp, + pipeline_state=pipeline_state, + inference_state=inference_state, + ) + next_step, operator_output, state_update = outputs + if state_update: inference_state.update_state(state_update) - - next_step = self.router.next(next_step, self.ops, operator_output) inp = operator_output return inp def _apply_split(self, inp: Any, inference_state: InferenceState): + """ + Split inputs using the pipeline's expand_inputs function. Inputs are split + into a batch size of one when a SPLIT_ROUTE node is found in a given pipeline's + provided router. The split batches are run asynchronously and then joined when + a JOIN_ROUTE node is found, using the pipeline's condense_inputs function. 
+ """ batches, orig_batch_size = self.expand_inputs(inp, 1) run_with_state = partial( self._run_sequential, pipeline_state=self.pipeline_state, start=self.router.route[self.router.SPLIT_ROUTE], - end=self.router.END_SPLIT, + end=self.router.JOIN_ROUTE, ) inference_state_list = [ copy.deepcopy(inference_state) for x in range(len(batches)) ] - outputs = self._scheduler_group.map( - batches, inference_state_list, func=run_with_state + futures = self._scheduler_group.map( + batches, + inference_state_list, + func=run_with_state, ) - outputs = self.condense_inputs(outputs) - return outputs + return self.condense_inputs([x.result() for x in futures]) + + def _run_next_step( + self, + *args, + func: Callable, + next_step: Union[str, int], + input: Any = None, + **kwargs, + ): + """ + Generic function to run a given func, process the output and determine the next + step. + """ + if input: + operator_output = ( + func(*args, **kwargs, **input) + if isinstance(input, dict) + else func(input, *args, **kwargs) + ) + else: + operator_output = func(*args, **kwargs) + + if isinstance(operator_output, Future): + operator_output = operator_output.result() + + state_update = None + if isinstance(operator_output, tuple): + state_update = operator_output[-1] + operator_output = operator_output[0] + + next_step = self.router.next(next_step, self.ops, operator_output) + return next_step, operator_output, state_update def run( self, @@ -129,44 +160,34 @@ def run( operator_output = None while next_step != self.router.END_ROUTE: - # Split_Route should be after Start_Route + # NOTE: split_route should only appear after the start route node if next_step == self.router.SPLIT_ROUTE: operator_output = self._apply_split(operator_output, inference_state) - next_step = self.router.route[self.router.END_SPLIT] + next_step = self.router.route[self.router.JOIN_ROUTE] - operator = self.ops[next_step] if next_step == self.router.START_ROUTE: - output_future = self._scheduler_group.submit( + outputs = self._run_next_step( *args, + next_step=next_step, + func=self._scheduler_group.submit, inference_state=inference_state, - operator=operator, + operator=self.ops[next_step], pipeline_state=pipeline_state, **kwargs, ) else: - if isinstance(operator_output, dict): - output_future = self._scheduler_group.submit( - inference_state=inference_state, - operator=operator, - pipeline_state=pipeline_state, - **operator_output, - ) - else: - output_future = self._scheduler_group.submit( - operator_output, - inference_state=inference_state, - pipeline_state=pipeline_state, - operator=operator, - ) - - operator_output = output_future.result() - if isinstance(operator_output, tuple): - state_update = operator_output[-1] - operator_output = operator_output[0] - inference_state.update_state(state_update) - - next_step = self.router.next(next_step, self.ops, operator_output) + outputs = self._run_next_step( + func=self._scheduler_group.submit, + input=operator_output, + next_step=next_step, + inference_state=inference_state, + operator=self.ops[next_step], + pipeline_state=pipeline_state, + ) + next_step, operator_output, state_update = outputs + if state_update: + inference_state.update_state(state_update) return operator_output def __call__(self, *args, **kwargs): @@ -195,13 +216,22 @@ def expand_inputs(self, *args, **kwargs): """ Generic function to handle expanding values. """ - raise NotImplementedError + raise NotImplementedError( + "This function should be implemented for any router with split or join" + "nodes. 
expand_inputs will be called prior to the split node (stored in " + "the router's SPLIT_ROUTE attribute), expanding outputs for each output " + "such that there is a batch size of one per thread." + ) def condense_inputs(self, *args, **kwargs): """ Generic function to handle condensing values. """ - raise NotImplementedError + raise NotImplementedError( + "This function should be implemented for any router with split or join " + "nodes. condense_inputs will be called after the join node (stored in the " + "router's JOIN_ROUTE attribute), condensing outputs from multiple threads." + ) def validate(self): """ diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py index 93bc059ddb..1b70164002 100644 --- a/src/deepsparse/v2/routers/router.py +++ b/src/deepsparse/v2/routers/router.py @@ -41,9 +41,13 @@ def __init__( end_route: Union[str, int], start_route: Union[str, int], route: Optional[Dict] = None, + split_route: str = "SPLIT", + join_route: str = "JOIN", ): self.START_ROUTE = start_route self.END_ROUTE = end_route + self.SPLIT_ROUTE = split_route + self.JOIN_ROUTE = join_route self.route = route @abstractmethod @@ -79,6 +83,9 @@ class LinearRouter(Router): def __init__(self, end_route: int, start_route: int = 0): super().__init__(end_route=end_route, start_route=start_route) + self.SPLIT_ROUTE = None + self.JOIN_ROUTE = None + _LOGGER.warn("SPLIT and JOIN are not yet supported for the LinearRouter.") def next( self, past: int, ops: Optional[List[Operator]] = None, inp: Optional[Any] = None @@ -129,9 +136,9 @@ class GraphRouter(Router): """ def __init__(self, end_route: str, start_route: str, route: Dict, **kwargs): - super().__init__(end_route=end_route, start_route=start_route, route=route) - self.SPLIT_ROUTE = kwargs.get("split_route") - self.END_SPLIT = kwargs.get("end_split") + super().__init__( + end_route=end_route, start_route=start_route, route=route, **kwargs + ) def next( self, diff --git a/src/deepsparse/v2/schedulers/scheduler.py b/src/deepsparse/v2/schedulers/scheduler.py index 617936d509..78a58e3389 100644 --- a/src/deepsparse/v2/schedulers/scheduler.py +++ b/src/deepsparse/v2/schedulers/scheduler.py @@ -14,7 +14,6 @@ from concurrent.futures import Future, ThreadPoolExecutor -from typing import Callable from deepsparse.v2.operators import Operator @@ -53,22 +52,6 @@ def submit( **kwargs, ) - def can_map(self, *args): - """ - args containing list of inputs to be used for each worker. 
This function if we - have sufficient workes available - """ - if len(args[0]) <= self._threadpool._max_workers: - return True - return False - - def map(self, *args, func: Callable): - """ - :param func: Callable to run as part of the map function - args containing a list of function variables to map - """ - return list(self._threadpool.map(func, *args)) - def can_process( self, *args, diff --git a/src/deepsparse/v2/schedulers/scheduler_group.py b/src/deepsparse/v2/schedulers/scheduler_group.py index 8557325c9a..d426f830b2 100644 --- a/src/deepsparse/v2/schedulers/scheduler_group.py +++ b/src/deepsparse/v2/schedulers/scheduler_group.py @@ -58,11 +58,10 @@ def submit( def map(self, *args, func: Callable): """ - :param operator: operator to run - :return: list of outputs from multiple workers + :param func: generic callable run for each arg + :return: list of futures for each submit """ - for scheduler in self.schedulers: - if scheduler.can_map( - args[0], - ): - return scheduler.map(*args, func=func) + futures = [] + for _, values in enumerate(zip(*args)): + futures.append(self.submit(*values, operator=func)) + return futures diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index a24c37ed90..240da04907 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -181,13 +181,9 @@ def __init__( } router = GraphRouter( - end_route="STOP", - start_route="process_input", - route=routes, - split_route="SPLIT", - end_split="JOIN", + end_route="STOP", start_route="process_input", route=routes ) - scheduler = [OperatorScheduler(), OperatorScheduler(max_workers=4)] + scheduler = [OperatorScheduler()] super().__init__( ops=ops, router=router, schedulers=scheduler, pipeline_state=pipeline_state ) From 672ca2048145bacfa936627da3bb2a6f0f56666e Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 7 Nov 2023 16:38:24 -0500 Subject: [PATCH 08/43] process multiple generations --- src/deepsparse/v2/text_generation/process_outputs.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/deepsparse/v2/text_generation/process_outputs.py b/src/deepsparse/v2/text_generation/process_outputs.py index ba301bdae6..7173b8e256 100644 --- a/src/deepsparse/v2/text_generation/process_outputs.py +++ b/src/deepsparse/v2/text_generation/process_outputs.py @@ -79,6 +79,15 @@ def run( generations = list( map(self._create_generated_text_output, sequences, finished_reason) ) + + num_preds = generation_config.num_return_sequences + if num_preds > 1: + grouped_generations = [ + generations[n : n + num_preds] + for n in range(0, len(generations), num_preds) + ] + generations = grouped_generations + outputs = dict( created=datetime.datetime.now(), prompts=inference_state.current_state.get("prompts"), From 304eb358a17923269ab2d1338a9113d32a268ce0 Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 8 Nov 2023 13:36:40 +0000 Subject: [PATCH 09/43] initial commit --- src/deepsparse/transformers/helpers.py | 97 +++++++++++++++++-- src/deepsparse/utils/onnx.py | 8 +- src/deepsparse/v2/text_generation/pipeline.py | 68 +++---------- 3 files changed, 103 insertions(+), 70 deletions(-) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index d7acc71a99..78543baf12 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -17,24 +17,26 @@ """ +import logging import os import re from pathlib import Path from tempfile import NamedTemporaryFile 
-from typing import List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import numpy import onnx +import transformers from onnx import ModelProto from deepsparse.log import get_main_logger -from deepsparse.utils.onnx import _MODEL_DIR_ONNX_NAME, truncate_onnx_model +from deepsparse.utils.onnx import MODEL_ONNX_NAME, truncate_onnx_model from sparsezoo import Model from sparsezoo.utils import save_onnx __all__ = [ - "get_deployment_path", + "setup_transformers_pipeline", "overwrite_transformer_onnx_model_inputs", "fix_numpy_types", "get_transformer_layer_init_names", @@ -44,6 +46,81 @@ _LOGGER = get_main_logger() +def setup_transformers_pipeline( + model_path: str, + sequence_length: int, + tokenizer_padding_side: str = "left", + engine_kwargs: Optional[Dict] = None, + onnx_model_name: Optional[str] = None, +) -> Tuple[ + str, transformers.PretrainedConfig, transformers.PreTrainedTokenizer, Dict[str, Any] +]: + """ + A helper function that sets up the model path, config, tokenizer, + and engine kwargs for a transformers model. + :param model_path: The path to the model to load + :param sequence_length: The sequence length to use for the model + :param tokenizer_padding_side: The side to pad on for the tokenizer, + either "left" or "right" + :param engine_kwargs: The kwargs to pass to the engine + :param onnx_model_name: The name of the onnx model to be loaded. + If not specified, defaults are used (see setup_onnx_file_path) + :return The model path, config, tokenizer, and engine kwargs + """ + model_path, config, tokenizer = setup_onnx_file_path( + model_path, sequence_length, onnx_model_name + ) + + tokenizer.padding_side = tokenizer_padding_side + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + engine_kwargs = engine_kwargs or {} + engine_kwargs["model_path"] = model_path + return model_path, config, tokenizer, engine_kwargs + + +def setup_onnx_file_path( + model_path: str, + sequence_length: int, + onnx_model_name: Optional[str] = None, +) -> Tuple[str, transformers.PretrainedConfig, transformers.PreTrainedTokenizer]: + """ + Parses ONNX model from the `model_path` provided. It additionally + creates config and tokenizer objects from the `deployment path`, + derived from the `model_path` provided. + :param model_path: path to the model to be parsed + :param sequence_length: maximum sequence length of the model + :param onnx_model_name: optionally, the precise name of the ONNX model + of interest may be specified. If not specified, the default ONNX model + name will be used (refer to `get_deployment_path` for details) + :return: file path to the processed ONNX file for the engine to compile + """ + deployment_path, onnx_path = get_deployment_path(model_path, onnx_model_name) + + hf_logger = logging.getLogger("transformers") + hf_logger_level = hf_logger.level + hf_logger.setLevel(logging.ERROR) + + config = transformers.PretrainedConfig.from_pretrained(deployment_path) + hf_logger.setLevel(hf_logger_level) + + trust_remote_code = False + tokenizer = transformers.AutoTokenizer.from_pretrained( + deployment_path, + trust_remote_code=trust_remote_code, + model_max_length=sequence_length, + ) + + if not config or not tokenizer: + raise RuntimeError( + "Invalid config or tokenizer provided. Please provide " + "paths to the files or ensure they exist in the `model_path` provided. " + "See `tokenizer` and `config` arguments for details." 
+ ) + return onnx_path, config, tokenizer + + def get_deployment_path(model_path: str) -> Tuple[str, str]: """ Returns the path to the deployment directory @@ -63,26 +140,26 @@ def get_deployment_path(model_path: str) -> Tuple[str, str]: if os.path.isdir(model_path): model_files = os.listdir(model_path) - if _MODEL_DIR_ONNX_NAME not in model_files: + if MODEL_ONNX_NAME not in model_files: raise ValueError( - f"{_MODEL_DIR_ONNX_NAME} not found in transformers model directory " + f"{MODEL_ONNX_NAME} not found in transformers model directory " f"{model_path}. Be sure that an export of the model is written to " - f"{os.path.join(model_path, _MODEL_DIR_ONNX_NAME)}" + f"{os.path.join(model_path, MODEL_ONNX_NAME)}" ) - return model_path, os.path.join(model_path, _MODEL_DIR_ONNX_NAME) + return model_path, os.path.join(model_path, MODEL_ONNX_NAME) elif model_path.startswith("zoo:"): zoo_model = Model(model_path) deployment_path = zoo_model.deployment_directory_path - return deployment_path, os.path.join(deployment_path, _MODEL_DIR_ONNX_NAME) + return deployment_path, os.path.join(deployment_path, MODEL_ONNX_NAME) elif model_path.startswith("hf:"): from huggingface_hub import snapshot_download deployment_path = snapshot_download(repo_id=model_path.replace("hf:", "", 1)) - onnx_path = os.path.join(deployment_path, _MODEL_DIR_ONNX_NAME) + onnx_path = os.path.join(deployment_path, MODEL_ONNX_NAME) if not os.path.isfile(onnx_path): raise ValueError( - f"{_MODEL_DIR_ONNX_NAME} not found in transformers model directory " + f"{MODEL_ONNX_NAME} not found in transformers model directory " f"{deployment_path}. Be sure that an export of the model is written to " f"{onnx_path}" ) diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index e69bf67321..f518620c2f 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -56,12 +56,12 @@ "has_model_kv_cache", "CACHE_INPUT_PREFIX", "CACHE_OUTPUT_PREFIX", - "_MODEL_DIR_ONNX_NAME", + "MODEL_ONNX_NAME", ] _LOGGER = logging.getLogger(__name__) -_MODEL_DIR_ONNX_NAME = "model.onnx" +MODEL_ONNX_NAME = "model.onnx" CACHE_INPUT_PREFIX = "past_key_values" CACHE_OUTPUT_PREFIX = "present" @@ -132,7 +132,7 @@ def model_to_path(model: Union[str, Model, File]) -> str: model.deployment_directory_path # default to the main onnx file for the model - model = model.deployment.get_file(_MODEL_DIR_ONNX_NAME).path + model = model.deployment.get_file(MODEL_ONNX_NAME).path elif File is not object and isinstance(model, File): # get the downloaded_path -- will auto download if not on local system @@ -146,7 +146,7 @@ def model_to_path(model: Union[str, Model, File]) -> str: model_path = Path(model) if model_path.is_dir(): - return str(model_path / _MODEL_DIR_ONNX_NAME) + return str(model_path / MODEL_ONNX_NAME) return model diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 49826b8af7..fdb31f1c6c 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
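For orientation before the pipeline diff below, a hedged sketch of how the new setup_transformers_pipeline helper is meant to be called; the model stub and sequence length are placeholders, and the call only succeeds with deepsparse and transformers installed at this state of the branch.

from deepsparse.transformers.helpers import setup_transformers_pipeline

# the helper resolves the ONNX path, loads the config and tokenizer, and
# pre-populates the engine kwargs with the resolved model path
model_path, config, tokenizer, engine_kwargs = setup_transformers_pipeline(
    model_path="hf:mgoin/TinyStories-1M-deepsparse",
    sequence_length=128,
)
print(engine_kwargs["model_path"], tokenizer.padding_side)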
-from typing import Dict +from typing import Dict, Optional +from deepsparse.transformers.helpers import setup_transformers_pipeline from deepsparse.transformers.utils.helpers import process_generation_config from deepsparse.v2.pipeline import Pipeline from deepsparse.v2.routers import GraphRouter @@ -45,23 +46,20 @@ def __init__( internal_kv_cache: bool = True, force_max_tokens: bool = False, generation_config=None, - engine_kwargs: Dict = None, + engine_kwargs: Optional[Dict] = None, ): + ( + self.model_path, + self.config, + self.tokenizer, + engine_kwargs, + ) = setup_transformers_pipeline( + model_path, sequence_length, engine_kwargs=engine_kwargs + ) pipeline_state = PipelineState() pipeline_state_vals = {} - # TODO: The code below will be replaced with a transformers set-up Operator. - self.tokenizer = None - model_path = self.setup_onnx_file_path(model_path, sequence_length) - self.tokenizer.padding_side = "left" - if not self.tokenizer.pad_token: - self.tokenizer.pad_token = self.tokenizer.eos_token - - if not engine_kwargs: - engine_kwargs = {} - engine_kwargs["model_path"] = model_path - if internal_kv_cache and engine_kwargs.get("engine_type") == "onnxruntime": internal_kv_cache = False @@ -80,7 +78,7 @@ def __init__( ) # NOTE: Currently using pipeline state. Can swap to simply pass in the - # attributes to the specific Operator that neeed them, as class attributes. + # attributes to the specific Operator that need them, as class attributes. pipeline_state_vals[ "onnx_input_names_no_cache" ] = single_engine_operator.onnx_input_names_no_cache @@ -180,45 +178,3 @@ def __init__( super().__init__( ops=ops, router=router, schedulers=scheduler, pipeline_state=pipeline_state ) - - # TODO: Move to be part of a generic transformers set-up Operator. - def setup_onnx_file_path(self, model_path, sequence_length) -> str: - import logging - - import transformers - from transformers import AutoTokenizer - - from deepsparse.transformers.helpers import get_deployment_path - - """ - Parses ONNX model from the `model_path` provided. It additionally - creates config and tokenizer objects from the `deployment path`, - derived from the `model_path` provided. - - :return: file path to the processed ONNX file for the engine to compile - """ - deployment_path, onnx_path = get_deployment_path(model_path) - - hf_logger = logging.getLogger("transformers") - hf_logger_level = hf_logger.level - hf_logger.setLevel(logging.ERROR) - self.config = transformers.PretrainedConfig.from_pretrained( - deployment_path, - finetuning_task=self.task if hasattr(self, "task") else None, - ) - hf_logger.setLevel(hf_logger_level) - - self._trust_remote_code = False - self.tokenizer = AutoTokenizer.from_pretrained( - deployment_path, - trust_remote_code=self._trust_remote_code, - model_max_length=sequence_length, - ) - - if not self.config or not self.tokenizer: - raise RuntimeError( - "Invalid config or tokenizer provided. Please provide " - "paths to the files or ensure they exist in the `model_path` provided. " - "See `tokenizer` and `config` arguments for details." 
- ) - return onnx_path From 71515ac774eb5c70296798cbad4f460a84d7e0ce Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 8 Nov 2023 13:48:16 +0000 Subject: [PATCH 10/43] fix error --- src/deepsparse/transformers/helpers.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index 78543baf12..70a1e9523d 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -121,7 +121,9 @@ def setup_onnx_file_path( return onnx_path, config, tokenizer -def get_deployment_path(model_path: str) -> Tuple[str, str]: +def get_deployment_path( + model_path: str, onnx_model_name: Optional[str] = None +) -> Tuple[str, str]: """ Returns the path to the deployment directory for the given model path and the path to the mandatory @@ -130,9 +132,12 @@ def get_deployment_path(model_path: str) -> Tuple[str, str]: for running the transformers model in the deepsparse pipeline :param model_path: path to model directory, sparsezoo stub, or ONNX file + :param onnx_model_name: name of the ONNX file to look for in the deployment + directory. Defaults to MODEL_ONNX_NAME :return: path to the deployment directory and path to the ONNX file inside the deployment directory """ + onnx_model_name = onnx_model_name or MODEL_ONNX_NAME if os.path.isfile(model_path): # return the parent directory of the ONNX file return os.path.dirname(model_path), model_path @@ -140,26 +145,26 @@ def get_deployment_path(model_path: str) -> Tuple[str, str]: if os.path.isdir(model_path): model_files = os.listdir(model_path) - if MODEL_ONNX_NAME not in model_files: + if onnx_model_name not in model_files: raise ValueError( - f"{MODEL_ONNX_NAME} not found in transformers model directory " + f"{onnx_model_name} not found in transformers model directory " f"{model_path}. Be sure that an export of the model is written to " - f"{os.path.join(model_path, MODEL_ONNX_NAME)}" + f"{os.path.join(model_path, onnx_model_name)}" ) - return model_path, os.path.join(model_path, MODEL_ONNX_NAME) + return model_path, os.path.join(model_path, onnx_model_name) elif model_path.startswith("zoo:"): zoo_model = Model(model_path) deployment_path = zoo_model.deployment_directory_path - return deployment_path, os.path.join(deployment_path, MODEL_ONNX_NAME) + return deployment_path, os.path.join(deployment_path, onnx_model_name) elif model_path.startswith("hf:"): from huggingface_hub import snapshot_download deployment_path = snapshot_download(repo_id=model_path.replace("hf:", "", 1)) - onnx_path = os.path.join(deployment_path, MODEL_ONNX_NAME) + onnx_path = os.path.join(deployment_path, onnx_model_name) if not os.path.isfile(onnx_path): raise ValueError( - f"{MODEL_ONNX_NAME} not found in transformers model directory " + f"{onnx_model_name} not found in transformers model directory " f"{deployment_path}. 
Be sure that an export of the model is written to " f"{onnx_path}" ) From 041174b835231099771326ffa32c45742525b62c Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Thu, 9 Nov 2023 09:49:21 -0500 Subject: [PATCH 11/43] [Pipeline Refactor] Split/Join Functionality for multiple prompts (#1384) * add split/join functionality * update router to include split/join in parent class, refactor pipeline code to remove repeat code, update map function * process multiple generations * move map to base class --- .../v2/operators/engine_operator.py | 18 +-- src/deepsparse/v2/operators/operator.py | 13 -- src/deepsparse/v2/pipeline.py | 153 ++++++++++++++---- src/deepsparse/v2/routers/router.py | 13 +- src/deepsparse/v2/schedulers/scheduler.py | 11 ++ .../v2/schedulers/scheduler_group.py | 20 --- src/deepsparse/v2/text_generation/__init__.py | 1 + .../v2/text_generation/join_output.py | 70 ++++++++ src/deepsparse/v2/text_generation/pipeline.py | 20 ++- .../v2/text_generation/prep_for_prefill.py | 9 +- .../v2/text_generation/process_inputs.py | 9 +- .../v2/text_generation/process_outputs.py | 25 ++- 12 files changed, 268 insertions(+), 94 deletions(-) create mode 100644 src/deepsparse/v2/text_generation/join_output.py diff --git a/src/deepsparse/v2/operators/engine_operator.py b/src/deepsparse/v2/operators/engine_operator.py index c2fc562c63..bd58aefafa 100644 --- a/src/deepsparse/v2/operators/engine_operator.py +++ b/src/deepsparse/v2/operators/engine_operator.py @@ -20,7 +20,7 @@ from deepsparse import Context as EngineContext from deepsparse import Engine, MultiModelEngine, Scheduler from deepsparse.benchmark import ORTEngine -from deepsparse.utils import join_engine_outputs, model_to_path, split_engine_inputs +from deepsparse.utils import model_to_path from deepsparse.v2.operators import Operator @@ -145,18 +145,6 @@ def run(self, inp: EngineOperatorInputs, **kwargs) -> Dict: # planned refactor engine_outputs = inp.engine(inp.engine_inputs) return {"engine_outputs": engine_outputs} - inp = inp.engine_inputs - batches, orig_batch_size = self.expand_inputs(engine_inputs=inp) - batches_outputs = list(map(self.engine, batches)) - engine_outputs = self.condense_inputs( - batch_outputs=batches_outputs, orig_batch_size=orig_batch_size - ) - return {"engine_outputs": engine_outputs} - def expand_inputs(self, **kwargs): - return split_engine_inputs(kwargs["engine_inputs"], self._batch_size) - - def condense_inputs(self, **kwargs): - batch_outputs = kwargs["batch_outputs"] - orig_batch_size = kwargs["orig_batch_size"] - return join_engine_outputs(batch_outputs, orig_batch_size) + engine_outputs = self.engine(inp.engine_inputs) + return {"engine_outputs": engine_outputs} diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py index b3963d8223..5bb0be841a 100644 --- a/src/deepsparse/v2/operators/operator.py +++ b/src/deepsparse/v2/operators/operator.py @@ -99,7 +99,6 @@ def __call__( pipeline_state=pipeline_state, **kwargs, ) - if self.has_output_schema(): return self.output_schema(**run_output) return run_output @@ -117,18 +116,6 @@ def can_operate(self, inp: Any) -> bool: """ return True - def expand_inputs(self, **kwargs): - """ - Generic function to handle expanding values. - """ - raise NotImplementedError - - def condense_inputs(self, **kwargs): - """ - Generic function to handle condensing values. 
- """ - raise NotImplementedError - def yaml(self): pass diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py index 0a8c8b2f93..f56680d2b9 100644 --- a/src/deepsparse/v2/pipeline.py +++ b/src/deepsparse/v2/pipeline.py @@ -13,7 +13,10 @@ # limitations under the License. -from typing import Dict, List, Union +import copy +from concurrent.futures import Future +from functools import partial +from typing import Any, Callable, Dict, List, Union from deepsparse.v2.operators import Operator from deepsparse.v2.routers import Router @@ -56,9 +59,88 @@ def __init__( self.pipeline_state = pipeline_state self.validate() - # SchedulerGroup handles running all schedulers in order of priority self._scheduler_group = SchedulerGroup(self.schedulers) + def _run_sequential( + self, + inp: Any, + inference_state: InferenceState, + pipeline_state: PipelineState, + start: str, + end: str, + ): + next_step = start + while next_step != end: + outputs = self._run_next_step( + func=self.ops[next_step], + next_step=next_step, + input=inp, + pipeline_state=pipeline_state, + inference_state=inference_state, + ) + next_step, operator_output, state_update = outputs + if state_update: + inference_state.update_state(state_update) + inp = operator_output + return inp + + def _apply_split(self, inp: Any, inference_state: InferenceState): + """ + Split inputs using the pipeline's expand_inputs function. Inputs are split + into a batch size of one when a SPLIT_ROUTE node is found in a given pipeline's + provided router. The split batches are run asynchronously and then joined when + a JOIN_ROUTE node is found, using the pipeline's condense_inputs function. + """ + + batches, orig_batch_size = self.expand_inputs(inp, 1) + run_with_state = partial( + self._run_sequential, + pipeline_state=self.pipeline_state, + start=self.router.route[self.router.SPLIT_ROUTE], + end=self.router.JOIN_ROUTE, + ) + inference_state_list = [ + copy.deepcopy(inference_state) for x in range(len(batches)) + ] + futures = self._scheduler_group.map( + batches, + inference_state_list, + func=run_with_state, + ) + return self.condense_inputs([x.result() for x in futures]) + + def _run_next_step( + self, + *args, + func: Callable, + next_step: Union[str, int], + input: Any = None, + **kwargs, + ): + """ + Generic function to run a given func, process the output and determine the next + step. 
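A plain-Python illustration of the dispatch rules this helper applies (dict inputs become keyword arguments, other inputs are positional, Futures are unwrapped, and a trailing tuple element is treated as a state update); this is a sketch, not the patch code itself.

from concurrent.futures import Future


def call_step(func, step_input=None, **kwargs):
    # dict inputs are splatted as keyword arguments; other inputs are positional
    if isinstance(step_input, dict):
        output = func(**step_input, **kwargs)
    elif step_input is not None:
        output = func(step_input, **kwargs)
    else:
        output = func(**kwargs)
    # schedulers may hand back a Future; block until the result is ready
    if isinstance(output, Future):
        output = output.result()
    # operators may return an (output, state_update) tuple
    state_update = None
    if isinstance(output, tuple):
        output, state_update = output[0], output[-1]
    return output, state_update


output, state_update = call_step(lambda x, scale: x * scale, step_input=3, scale=2)
assert output == 6 and state_update is None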
+ """ + if input: + operator_output = ( + func(*args, **kwargs, **input) + if isinstance(input, dict) + else func(input, *args, **kwargs) + ) + else: + operator_output = func(*args, **kwargs) + + if isinstance(operator_output, Future): + operator_output = operator_output.result() + + state_update = None + if isinstance(operator_output, tuple): + state_update = operator_output[-1] + operator_output = operator_output[0] + + next_step = self.router.next(next_step, self.ops, operator_output) + return next_step, operator_output, state_update + def run( self, *args, @@ -78,40 +160,34 @@ def run( operator_output = None while next_step != self.router.END_ROUTE: - # Either a dictionary key or valid index - operator = self.ops[next_step] + # NOTE: split_route should only appear after the start route node + if next_step == self.router.SPLIT_ROUTE: + operator_output = self._apply_split(operator_output, inference_state) + next_step = self.router.route[self.router.JOIN_ROUTE] + if next_step == self.router.START_ROUTE: - output_future = self._scheduler_group.submit( + outputs = self._run_next_step( *args, + next_step=next_step, + func=self._scheduler_group.submit, inference_state=inference_state, - operator=operator, + operator=self.ops[next_step], pipeline_state=pipeline_state, **kwargs, ) else: - if isinstance(operator_output, dict): - output_future = self._scheduler_group.submit( - inference_state=inference_state, - operator=operator, - pipeline_state=pipeline_state, - **operator_output, - ) - else: - output_future = self._scheduler_group.submit( - operator_output, - inference_state=inference_state, - pipeline_state=pipeline_state, - operator=operator, - ) - - operator_output = output_future.result() - if isinstance(operator_output, tuple): - state_update = operator_output[-1] - operator_output = operator_output[0] - inference_state.update_state(state_update) - - next_step = self.router.next(next_step, self.ops, operator_output) + outputs = self._run_next_step( + func=self._scheduler_group.submit, + input=operator_output, + next_step=next_step, + inference_state=inference_state, + operator=self.ops[next_step], + pipeline_state=pipeline_state, + ) + next_step, operator_output, state_update = outputs + if state_update: + inference_state.update_state(state_update) return operator_output def __call__(self, *args, **kwargs): @@ -136,6 +212,27 @@ def __call__(self, *args, **kwargs): return self.run(*args, **kwargs) + def expand_inputs(self, *args, **kwargs): + """ + Generic function to handle expanding values. + """ + raise NotImplementedError( + "This function should be implemented for any router with split or join" + "nodes. expand_inputs will be called prior to the split node (stored in " + "the router's SPLIT_ROUTE attribute), expanding outputs for each output " + "such that there is a batch size of one per thread." + ) + + def condense_inputs(self, *args, **kwargs): + """ + Generic function to handle condensing values. + """ + raise NotImplementedError( + "This function should be implemented for any router with split or join " + "nodes. condense_inputs will be called after the join node (stored in the " + "router's JOIN_ROUTE attribute), condensing outputs from multiple threads." + ) + def validate(self): """ Validate that compatability of the router and operators provided. 
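Before the router changes below, it may help to see the shape of a route dictionary that uses the new SPLIT/JOIN sentinels. The step names here are invented for illustration; the real text generation routes appear later in this patch.

route = {
    "preprocess": "SPLIT",    # the pipeline splits the batch when it reaches SPLIT
    "SPLIT": "engine",        # first step executed for each split batch
    "engine": "postprocess",
    "postprocess": "JOIN",    # per-batch execution stops at JOIN
    "JOIN": "merge_results",  # execution resumes here on the joined output
    "merge_results": "STOP",
}

# walking the linear portion of the route
step, visited = "preprocess", ["preprocess"]
while route[step] != "STOP":
    step = route[step]
    visited.append(step)
print(visited)  # ['preprocess', 'SPLIT', 'engine', 'postprocess', 'JOIN', 'merge_results']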
diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py index d1110d4ca7..1b70164002 100644 --- a/src/deepsparse/v2/routers/router.py +++ b/src/deepsparse/v2/routers/router.py @@ -41,9 +41,13 @@ def __init__( end_route: Union[str, int], start_route: Union[str, int], route: Optional[Dict] = None, + split_route: str = "SPLIT", + join_route: str = "JOIN", ): self.START_ROUTE = start_route self.END_ROUTE = end_route + self.SPLIT_ROUTE = split_route + self.JOIN_ROUTE = join_route self.route = route @abstractmethod @@ -79,6 +83,9 @@ class LinearRouter(Router): def __init__(self, end_route: int, start_route: int = 0): super().__init__(end_route=end_route, start_route=start_route) + self.SPLIT_ROUTE = None + self.JOIN_ROUTE = None + _LOGGER.warn("SPLIT and JOIN are not yet supported for the LinearRouter.") def next( self, past: int, ops: Optional[List[Operator]] = None, inp: Optional[Any] = None @@ -128,8 +135,10 @@ class GraphRouter(Router): where `can_operate` returns True will run. Paths should be deterministic. """ - def __init__(self, end_route: str, start_route: str, route: Dict): - super().__init__(end_route=end_route, start_route=start_route, route=route) + def __init__(self, end_route: str, start_route: str, route: Dict, **kwargs): + super().__init__( + end_route=end_route, start_route=start_route, route=route, **kwargs + ) def next( self, diff --git a/src/deepsparse/v2/schedulers/scheduler.py b/src/deepsparse/v2/schedulers/scheduler.py index 78a58e3389..5313683107 100644 --- a/src/deepsparse/v2/schedulers/scheduler.py +++ b/src/deepsparse/v2/schedulers/scheduler.py @@ -14,6 +14,7 @@ from concurrent.futures import Future, ThreadPoolExecutor +from typing import Callable from deepsparse.v2.operators import Operator @@ -64,3 +65,13 @@ def can_process( Base OperatorScheduler always returns True """ return True + + def map(self, *args, func: Callable): + """ + :param func: generic callable run for each arg + :return: list of futures for each submit + """ + futures = [] + for _, values in enumerate(zip(*args)): + futures.append(self.submit(*values, operator=func)) + return futures diff --git a/src/deepsparse/v2/schedulers/scheduler_group.py b/src/deepsparse/v2/schedulers/scheduler_group.py index 40b5695f22..14d869a0f2 100644 --- a/src/deepsparse/v2/schedulers/scheduler_group.py +++ b/src/deepsparse/v2/schedulers/scheduler_group.py @@ -55,23 +55,3 @@ def submit( operator=operator, **kwargs, ) - - def can_process( - self, - *args, - operator: Operator, - **kwargs, - ) -> bool: - """ - :param operator: operator to check - :return: True if this Operator can process the given operator and input. 
- SchedulerGroup always returns True - """ - return any( - scheduler.can_process( - *args, - operator=operator, - **kwargs, - ) - for scheduler in self.schedulers - ) diff --git a/src/deepsparse/v2/text_generation/__init__.py b/src/deepsparse/v2/text_generation/__init__.py index 21cd7e2acd..08836b8bbe 100644 --- a/src/deepsparse/v2/text_generation/__init__.py +++ b/src/deepsparse/v2/text_generation/__init__.py @@ -17,6 +17,7 @@ from .compile_generations import * from .compile_logits import * from .generate_new_token import * +from .join_output import * from .kv_cache_operator import * from .multi_engine_prefill_operator import * from .nl_engine_operator import * diff --git a/src/deepsparse/v2/text_generation/join_output.py b/src/deepsparse/v2/text_generation/join_output.py new file mode 100644 index 0000000000..8a6c77a2f1 --- /dev/null +++ b/src/deepsparse/v2/text_generation/join_output.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +import numpy + +from deepsparse.transformers.utils.helpers import pad_to_fixed_length +from deepsparse.v2.operators import Operator +from deepsparse.v2.text_generation.compile_generations import CompileGenerationsOutput + + +__all__ = ["JoinOutput"] + + +class JoinOutput(Operator): + """ + Run this operator to combine the results from multiple prompts. 
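To make the class docstring concrete: generations produced for different prompts can have different lengths, so the join step pads them to a common length before stacking them into one batch. A small numpy sketch of that idea, with a pad value of 0 standing in for the tokenizer's pad token id:

import numpy

generated = [numpy.array([[1, 2, 3]]), numpy.array([[4, 5]])]
max_len = max(arr.shape[1] for arr in generated)

# right-pad every sequence to the longest length, then stack into one batch
padded = [
    numpy.pad(arr, ((0, 0), (0, max_len - arr.shape[1])), constant_values=0)
    for arr in generated
]
tokens = numpy.concatenate(padded)
assert tokens.shape == (2, 3)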
+ """ + + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + def run(self, inp: List[CompileGenerationsOutput], **kwargs): + batch_outputs = [x for x in inp[0]] + generated_tokens = [x.generated_tokens for x in batch_outputs] + generated_logits = [x.generated_logits for x in batch_outputs] + finished_reason = [x.finished_reason for x in batch_outputs] + + max_len = max(token.shape[1] for token in generated_tokens) + + # pad all tokens to the same length + tokens = [ + pad_to_fixed_length( + array=prediction, + max_len=max_len, + value=self.tokenizer.pad_token_id, + axis=1, + ) + for prediction in generated_tokens + ] + + # find the longest sequence in the batch of logits + max_len = max(logits.shape[1] for logits in generated_logits) + + # pad all logits to the same length + logits = [ + pad_to_fixed_length(array=single_logits, max_len=max_len, axis=1) + for single_logits in generated_logits + ] + + tokens = numpy.concatenate(tokens) + logits = numpy.concatenate(logits) + + return { + "generated_tokens": tokens, + "generated_logits": logits, + "finished_reason": finished_reason, + } diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 49826b8af7..240da04907 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -15,6 +15,7 @@ from typing import Dict from deepsparse.transformers.utils.helpers import process_generation_config +from deepsparse.utils import split_engine_inputs from deepsparse.v2.pipeline import Pipeline from deepsparse.v2.routers import GraphRouter from deepsparse.v2.schedulers import OperatorScheduler @@ -24,6 +25,7 @@ CompileGenerations, CompilePromptLogits, GenerateNewTokenOperator, + JoinOutput, KVCacheCreator, MultiEnginePrefill, NLEngineOperator, @@ -131,6 +133,7 @@ def __init__( process_output = ProcessOutputs(tokenizer=self.tokenizer) compile_generations = CompileGenerations() compile_generated_tokens = CompileGeneratedTokens() + join_output = JoinOutput(tokenizer=self.tokenizer) ops = { "process_input": process_inputs, @@ -146,10 +149,12 @@ def __init__( "process_outputs": process_output, "compile_generations": compile_generations, "compile_generated_tokens": compile_generated_tokens, + "join_output": join_output, } routes = { - "process_input": "prepare_prefill", + "process_input": "SPLIT", + "SPLIT": "prepare_prefill", "prepare_prefill": ["multi_engine_prefill", "autoregressive_preprocess"], "multi_engine_prefill": "multi_engine", "multi_engine": "compile_logits", @@ -169,7 +174,9 @@ def __init__( "autoregressive_preprocess", "compile_generations", ], - "compile_generations": "process_outputs", + "compile_generations": "JOIN", + "JOIN": "join_output", + "join_output": "process_outputs", "process_outputs": "STOP", } @@ -181,6 +188,15 @@ def __init__( ops=ops, router=router, schedulers=scheduler, pipeline_state=pipeline_state ) + def expand_inputs(self, items, batch_size): + items = [items.get(key) for key in items.keys()] + out, orig_batch_size = split_engine_inputs(items, batch_size) + combined_batches = [{"input_ids": b[0], "attention_mask": b[1]} for b in out] + return combined_batches, orig_batch_size + + def condense_inputs(self, *args, **kwargs): + return args[0], kwargs + # TODO: Move to be part of a generic transformers set-up Operator. 
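The expand_inputs override above leans on split_engine_inputs; its effect on a two-prompt batch can be pictured with plain numpy (the token values and padding below are made up for the example):

import numpy

# fake tokenizer output for two left-padded prompts of length 5
input_ids = numpy.array([[0, 0, 15496, 11, 995], [0, 0, 0, 0, 15496]])
attention_mask = numpy.array([[0, 0, 1, 1, 1], [0, 0, 0, 0, 1]])

# split into batch-size-1 pieces, one dict of engine inputs per prompt
batches = [
    {"input_ids": input_ids[i : i + 1], "attention_mask": attention_mask[i : i + 1]}
    for i in range(input_ids.shape[0])
]
assert len(batches) == 2
assert batches[0]["input_ids"].shape == (1, 5)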
def setup_onnx_file_path(self, model_path, sequence_length) -> str: import logging diff --git a/src/deepsparse/v2/text_generation/prep_for_prefill.py b/src/deepsparse/v2/text_generation/prep_for_prefill.py index 2f9eb15797..2e5fecb3e8 100644 --- a/src/deepsparse/v2/text_generation/prep_for_prefill.py +++ b/src/deepsparse/v2/text_generation/prep_for_prefill.py @@ -42,13 +42,20 @@ def __init__(self, kv_cache_creator: Operator): "from the NLEngineOperator" ) - def run(self, tokens: Any, pipeline_state: PipelineState, **kwargs): + def run( + self, + input_ids: Any, + attention_mask: Any, + pipeline_state: PipelineState, + **kwargs, + ): # NOTE: Can potentially just be class attributes instead of relying on # pipeline state. cache_shape = pipeline_state.current_state.get("cache_shape") data_type = pipeline_state.current_state.get("kv_cache_data_type") output_names = pipeline_state.current_state.get("output_names") + tokens = input_ids[attention_mask.nonzero()].tolist() kv_cache = self.kv_cache_creator.run( cache_shape=cache_shape, kv_cache_data_type=data_type, diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py index e57e402983..5d47c8ff39 100644 --- a/src/deepsparse/v2/text_generation/process_inputs.py +++ b/src/deepsparse/v2/text_generation/process_inputs.py @@ -114,8 +114,7 @@ def run(self, inp: TextGenerationInput, **kwargs): frequency_penalty=generation_config.repetition_penalty, ) - # TODO: move this step to prep_for_prefill and add attention mask to the output - # this will allow us to split/join more easily when processing multiple prompts - # in parallel - tokens = input_ids[attention_mask.nonzero()].tolist() - return {"tokens": tokens}, inference_state_update + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + }, inference_state_update diff --git a/src/deepsparse/v2/text_generation/process_outputs.py b/src/deepsparse/v2/text_generation/process_outputs.py index ca1cf78521..7173b8e256 100644 --- a/src/deepsparse/v2/text_generation/process_outputs.py +++ b/src/deepsparse/v2/text_generation/process_outputs.py @@ -22,7 +22,6 @@ TextGenerationOutput, ) from deepsparse.v2.operators import Operator -from deepsparse.v2.text_generation.compile_generations import CompileGenerationsOutput from deepsparse.v2.utils import InferenceState @@ -52,19 +51,20 @@ def _create_generated_text_output( ) def run( - self, inp: CompileGenerationsOutput, inference_state: InferenceState, **kwargs + self, + generated_tokens: numpy.ndarray, + generated_logits: numpy.ndarray, + finished_reason: list, + inference_state: InferenceState, + **kwargs, ): generation_config = inference_state.current_state.get("generation_config") - generated_tokens = inp.generated_tokens - generated_logits = ( - inp.generated_logits if generation_config.output_scores else None - ) - finished_reason = inp.finished_reason + generated_logits = generated_logits if generation_config.output_scores else None sequences = self.tokenizer.batch_decode( generated_tokens, skip_special_tokens=True ) - finished_reason = [f for f in finished_reason if f] + finished_reason = [f[-1] for f in finished_reason] if generated_logits is not None: generations = list( @@ -79,6 +79,15 @@ def run( generations = list( map(self._create_generated_text_output, sequences, finished_reason) ) + + num_preds = generation_config.num_return_sequences + if num_preds > 1: + grouped_generations = [ + generations[n : n + num_preds] + for n in range(0, len(generations), num_preds) + ] + 
generations = grouped_generations + outputs = dict( created=datetime.datetime.now(), prompts=inference_state.current_state.get("prompts"), From a508342daee2ffb715379bc4307a5d752dcc4055 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Mon, 6 Nov 2023 17:44:56 -0500 Subject: [PATCH 12/43] unit testing for text generation operators --- .../v2/unit/test_text_generation.py | 326 ++++++++++++++++++ 1 file changed, 326 insertions(+) create mode 100644 tests/deepsparse/v2/unit/test_text_generation.py diff --git a/tests/deepsparse/v2/unit/test_text_generation.py b/tests/deepsparse/v2/unit/test_text_generation.py new file mode 100644 index 0000000000..d54331162e --- /dev/null +++ b/tests/deepsparse/v2/unit/test_text_generation.py @@ -0,0 +1,326 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect + +import numpy + +import pytest +from deepsparse.v2.text_generation import TextGenerationPipeline +from deepsparse.transformers.utils.helpers import prepends_bos_token +from deepsparse.transformers.helpers import get_deployment_path +from transformers import AutoTokenizer +from deepsparse.transformers.pipelines.text_generation import TextGenerationInput +from deepsparse.v2.text_generation.process_inputs import GenerationDefaults +from deepsparse.v2.utils import InferenceState +from deepsparse.v2.text_generation import PrepareGeneration, TokenGeneratorOperator, InferenceState +import copy + + +@pytest.fixture +def text_generation_attributes(): + sequence_length = 5 + prompt_sequence_length = 2 + model_path = "hf:mgoin/TinyStories-1M-deepsparse" + deployment_path, model_path = get_deployment_path(model_path) + + tokenizer = AutoTokenizer.from_pretrained( + deployment_path, + trust_remote_code=False, + model_max_length=sequence_length, + ) + + tokenizer.padding_side = "left" + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + return sequence_length, prompt_sequence_length, model_path, tokenizer + + +@pytest.fixture +def single_token_engine_no_internal_cache(text_generation_attributes): + from deepsparse.v2.text_generation import NLEngineOperator + seq_length, _, model_path, _ = text_generation_attributes + nl_engine_operator = NLEngineOperator( + sequence_length=seq_length, + input_ids_length=1, + model_path=model_path + ) + return nl_engine_operator + +@pytest.fixture +def pipeline_state(single_token_engine_no_internal_cache): + from deepsparse.v2.utils import PipelineState + + pipeline_state = PipelineState() + pipeline_state_vals = {} + pipeline_state_vals[ + "onnx_input_names_no_cache" + ] = single_token_engine_no_internal_cache.onnx_input_names_no_cache + pipeline_state_vals["cache_shape"] = single_token_engine_no_internal_cache.cache_shape + pipeline_state_vals["output_names"] = single_token_engine_no_internal_cache.output_names + print(pipeline_state_vals) + pipeline_state_vals[ + "kv_cache_data_type" + ] = single_token_engine_no_internal_cache.kv_cache_data_type + 
pipeline_state.create_state(pipeline_state_vals) + return pipeline_state + +@pytest.fixture +def large_prompt(): + prompt = "Hello, how are you doing today?" + generation_config = {"top_p": 0, "top_k": 0, "max_length": 10} + return TextGenerationInput(prompt=prompt, generation_config=generation_config) + +@pytest.fixture +def small_prompt(): + prompt = "Hello" + return TextGenerationInput(prompt=prompt) + +@pytest.fixture +def mock_kv_cache(): + from deepsparse.transformers.utils import DecoderKVCache + kv_cache = DecoderKVCache() + kv_cache.setup( + state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])}, + ) + return kv_cache + +@pytest.fixture +def mock_kv_cache_full(): + from deepsparse.transformers.utils import DecoderKVCache + kv_cache = DecoderKVCache() + kv_cache.setup( + state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])}, + num_processed_tokens=3 + ) + return kv_cache + +""" +@pytest.fixture +def mock_kv_cache_engine(pipeline_state): + from deepsparse.transformers.utils import DecoderKVCache + kv_cache = DecoderKVCache() + kv_cache_state = initialize_kv_cache_state( + cache_shape=pipeline_state.current_state.get("cache_shape"), + kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"), + output_names=pipeline_state.current_state.get("output_names"), + length=self.sequence_length - self.prompt_sequence_length, + empty=bool(self.internal_kv_cache), + ) + print(state) + return kv_cache +""" + +@pytest.fixture +def mock_tokens(): + return [15496] + +@pytest.fixture +def mock_tokens_multiple(): + return [15496, 15496, 15496] + +@pytest.fixture +def mock_inference_state(): + generation_config = GenerationDefaults() + inference_state = InferenceState() + inference_state.create_state({}) + inference_state.update_state({ + "generation_config": generation_config}) + return inference_state + +@pytest.fixture +def mock_token_generator(text_generation_attributes, mock_tokens_multiple): + _, _, _, tokenizer = text_generation_attributes + token_generator_creator = TokenGeneratorOperator() + prompt_logits = numpy.random.rand(1, len(mock_tokens_multiple), len(tokenizer)) + token_generator_creator_output = token_generator_creator.run( + logits_shape=prompt_logits[0, -1, :].shape, + deterministic=True, + sampling_temperature=1.0, + tokens=copy.copy(mock_tokens_multiple), + ) + return token_generator_creator_output.get("token_generator") + +@pytest.fixture +def mock_logits(text_generation_attributes): + _, _, _, tokenizer = text_generation_attributes + return numpy.random.rand(1, 1, len(tokenizer)) + + +def test_process_inputs(text_generation_attributes, small_prompt, large_prompt): + sequence_length, _, _, tokenizer = text_generation_attributes + from deepsparse.v2.text_generation.process_inputs import ProcessInputsTextGeneration + process_inputs = ProcessInputsTextGeneration( + sequence_length=sequence_length, + tokenizer=tokenizer + ) + + outputs, state_update = process_inputs.run(small_prompt) + assert len(outputs.get("tokens")) == 1 + assert isinstance(state_update.get("generation_config"), GenerationDefaults) + assert state_update.get("prompts") == small_prompt.sequences + + outputs, state_update = process_inputs.run(large_prompt) + + assert not isinstance(state_update.get("generation_config"), GenerationDefaults) + assert state_update.get("generation_config").max_length == large_prompt.generation_config.get("max_length") + assert outputs.get("tokens") + assert state_update.get("top_k") == large_prompt.generation_config.get("top_k") + + +def 
test_nl_single_token_engine_no_internal(single_token_engine_no_internal_cache): + assert single_token_engine_no_internal_cache.input_ids_length == 1 + +def test_kv_cache_creation(pipeline_state, text_generation_attributes): + from deepsparse.v2.text_generation import KVCacheCreator, KVCacheCreatorInput + seq_length, prompt_sequence_length, model_path, tokenizer = text_generation_attributes + kv_cache_creator = KVCacheCreator( + tokenizer=tokenizer, + prompt_sequence_length=prompt_sequence_length, + sequence_length=seq_length, + internal_kv_cache=False + ) + + assert kv_cache_creator.input_schema == KVCacheCreatorInput + kv_cache = kv_cache_creator.run( + cache_shape=pipeline_state.current_state.get("cache_shape"), + kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"), + output_names=pipeline_state.current_state.get("output_names") + ) + assert kv_cache.get("kv_cache") + assert kv_cache.get("kv_cache").total_num_processed_tokens == 0 + + +def test_autoreg_preproces_can_run(text_generation_attributes, pipeline_state, mock_tokens, mock_kv_cache): + seq_len, prompt_seq_len, _, _ = text_generation_attributes + from deepsparse.v2.text_generation.autoregressive_preprocess_operator import AutoRegressiveOperatorPreprocess + autoreg_prep = AutoRegressiveOperatorPreprocess( + sequence_length=seq_len, + prompt_sequence_length=prompt_seq_len + ) + inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache} + + assert autoreg_prep.can_operate(inputs) + outputs = autoreg_prep.run( + tokens=mock_tokens, + kv_cache=mock_kv_cache, + pipeline_state=pipeline_state + ) + + assert len(outputs.get("engine_inputs")) == 4 # tokens, attention mask, causal, positions + tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") + print(outputs.get("engine_inputs")) + assert tokens.shape[-1] == 1 + assert attention_mask.shape[-1] == seq_len + assert positions[0] == mock_kv_cache.total_num_processed_tokens + assert outputs.get("in_generation") is None + +def test_autoreg_preproces_cant_run(text_generation_attributes, mock_kv_cache, mock_tokens_multiple): + seq_len, _, _, _ = text_generation_attributes + from deepsparse.v2.text_generation.autoregressive_preprocess_operator import AutoRegressiveOperatorPreprocess + autoreg_prep = AutoRegressiveOperatorPreprocess( + sequence_length=seq_len, + prompt_sequence_length=2 + ) + inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache} + assert not autoreg_prep.can_operate(inputs) + +def test_mult_engine_preprocess(text_generation_attributes, mock_kv_cache, mock_tokens_multiple, pipeline_state): + seq_len, prompt_seq_len, _, _ = text_generation_attributes + from deepsparse.v2.text_generation.multi_engine_prefill_operator import MultiEnginePrefill + multi_prep = MultiEnginePrefill( + sequence_length=seq_len, + prompt_sequence_length=prompt_seq_len + ) + inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache} + assert multi_prep.can_operate(inputs) + outputs = multi_prep.run(tokens=mock_tokens_multiple, kv_cache=mock_kv_cache, pipeline_state=pipeline_state) + assert len(outputs.get("engine_inputs")) == 4 # tokens, attention mask, causal, positions + tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") + assert tokens.shape[-1] == prompt_seq_len + assert attention_mask.shape[-1] == seq_len + assert positions.shape[-1] == prompt_seq_len + +def test_multi_engine_preprocess_cant_operate(text_generation_attributes, mock_kv_cache, mock_tokens): + seq_len, prompt_seq_len, _, _ = 
text_generation_attributes + from deepsparse.v2.text_generation.multi_engine_prefill_operator import MultiEnginePrefill + multi_prep = MultiEnginePrefill( + sequence_length=seq_len, + prompt_sequence_length=prompt_seq_len + ) + inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache} + assert not multi_prep.can_operate(inputs) + +""" +def test_run_single_engine_once(single_token_engine_no_internal_cache, mock_kv_cache_engine): + from deepsparse.v2.text_generation.nl_engine_operator import NlEngineInput + + mock_engine_inputs = [numpy.array([[15496]]), numpy.array([[0, 0, 0, 0, 1]]), numpy.array([[0]]), numpy.array([[[[0, 0, 0, 0, 1]]]])] + inputs = NlEngineInput( + engine_inputs=mock_engine_inputs, + kv_cache=mock_kv_cache_engine, + tokens=mock_engine_inputs[0].tolist() + ) + print(single_token_engine_no_internal_cache.run(inputs)) +""" + +def test_prep_for_generation(mock_tokens_multiple, mock_kv_cache_full, text_generation_attributes, mock_inference_state): + seq_len, prompt_seq_len, _, tokenizer = text_generation_attributes + prep_for_generation = PrepareGeneration( + token_generator=TokenGeneratorOperator(), + sequence_length=seq_len, + prompt_sequence_length=prompt_seq_len + ) + inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache_full} + assert prep_for_generation.can_operate(inputs) + + prompt_logits = [numpy.random.rand(1, len(mock_tokens_multiple), len(tokenizer))] + mock_inference_state.update_state({"prompt_logits": prompt_logits}) + outputs, state = prep_for_generation.run( + tokens=mock_tokens_multiple, + kv_cache=mock_kv_cache, + inference_state=mock_inference_state + ) + assert len(outputs.get("tokens")) == len(mock_tokens_multiple) + 1 + assert outputs.get("in_generation") + assert numpy.array_equal(state.get("generated_logits")[0], numpy.expand_dims(prompt_logits[0][:, -1, :], 0)) + +def test_generate_new_token(mock_token_generator, text_generation_attributes, mock_kv_cache, mock_inference_state, mock_logits, mock_tokens): + _, _, _, tokenizer = text_generation_attributes + from deepsparse.v2.text_generation import GenerateNewTokenOperator + generate_new_token = GenerateNewTokenOperator( + force_max_tokens=False, + tokenizer=tokenizer + ) + mock_inference_state.update_state({"token_generator": mock_token_generator, "generated_tokens": [mock_token_generator.tokens]}) + outputs, state = generate_new_token.run( + logits=mock_logits, + kv_cache=mock_kv_cache, + inference_state=mock_inference_state + ) + assert outputs.get("new_token") == state.get("token_generator").tokens[-1] + + +def test_compile_logits(mock_logits, mock_inference_state): + from deepsparse.v2.text_generation import CompilePromptLogits + mock_inference_state.update_state({"prompt_logits": [mock_logits]}) + compile_prompt_logits = CompilePromptLogits() + assert compile_prompt_logits.can_operate({}) + output, state = compile_prompt_logits.run( + logits=mock_logits, + inference_state=mock_inference_state + ) + assert len(state.get("prompt_logits")) == len([mock_logits]) + 1 + print(state.get("prompt_logits")) \ No newline at end of file From cbb0e86f5d4e588e8afa1f873c3411ef29d7ed2f Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 7 Nov 2023 11:05:13 -0500 Subject: [PATCH 13/43] additional changes --- src/deepsparse/v2/text_generation/kv_cache_operator.py | 3 ++- src/deepsparse/v2/text_generation/prep_for_generation.py | 4 ++-- src/deepsparse/v2/text_generation/process_inputs.py | 7 ++++--- tests/deepsparse/v2/unit/test_text_generation.py | 3 +-- 4 files changed, 9 insertions(+), 8 
deletions(-) diff --git a/src/deepsparse/v2/text_generation/kv_cache_operator.py b/src/deepsparse/v2/text_generation/kv_cache_operator.py index 0b232402b3..5811f44b32 100644 --- a/src/deepsparse/v2/text_generation/kv_cache_operator.py +++ b/src/deepsparse/v2/text_generation/kv_cache_operator.py @@ -24,7 +24,7 @@ from deepsparse.v2.operators import Operator -__all__ = ["KVCacheCreator"] +__all__ = ["KVCacheCreator", "KVCacheCreatorInput"] class KVCacheCreatorOutput(BaseModel): @@ -61,6 +61,7 @@ def run(self, cache_shape, kv_cache_data_type: str, output_names: list, **kwargs length=self.sequence_length - self.prompt_sequence_length, empty=bool(self.internal_kv_cache), ) + print(kv_cache_state.get("past_key_values.0.key").shape) kv_cache = DecoderKVCache(self.internal_kv_cache) kv_cache.setup( diff --git a/src/deepsparse/v2/text_generation/prep_for_generation.py b/src/deepsparse/v2/text_generation/prep_for_generation.py index 544af43980..887f81e173 100644 --- a/src/deepsparse/v2/text_generation/prep_for_generation.py +++ b/src/deepsparse/v2/text_generation/prep_for_generation.py @@ -14,6 +14,7 @@ from typing import Any import numpy +import copy from deepsparse.transformers.pipelines.text_generation import FinishReason from deepsparse.v2.operators import Operator @@ -107,7 +108,7 @@ def run( logits_shape=prompt_logits[0, -1, :].shape, deterministic=not generation_config.do_sample, sampling_temperature=generation_config.temperature, - tokens=tokens, + tokens=copy.copy(tokens), **inference_state.current_state, ) token_generator = token_generator_creator_output.get("token_generator") @@ -131,7 +132,6 @@ def run( "finished_reason": [], "token_generator": token_generator, } - output = { "tokens": token_generator.tokens, "kv_cache": kv_cache, diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py index 5d47c8ff39..31f5aa0504 100644 --- a/src/deepsparse/v2/text_generation/process_inputs.py +++ b/src/deepsparse/v2/text_generation/process_inputs.py @@ -28,7 +28,7 @@ class GenerationDefaults: num_return_sequences = 1 - max_length = 100 + max_length = 10 max_new_tokens = None output_scores = False top_k = 0 @@ -54,10 +54,11 @@ class ProcessInputsTextGeneration(Operator): def __init__( self, tokenizer: transformers.PreTrainedTokenizerBase, + sequence_length: int, generation_config: Union[ str, pathlib.Path, Dict, transformers.GenerationConfig - ], - sequence_length: int, + ] = None, + ): self.generation_config = generation_config self.tokenizer = tokenizer diff --git a/tests/deepsparse/v2/unit/test_text_generation.py b/tests/deepsparse/v2/unit/test_text_generation.py index d54331162e..f33776e010 100644 --- a/tests/deepsparse/v2/unit/test_text_generation.py +++ b/tests/deepsparse/v2/unit/test_text_generation.py @@ -322,5 +322,4 @@ def test_compile_logits(mock_logits, mock_inference_state): logits=mock_logits, inference_state=mock_inference_state ) - assert len(state.get("prompt_logits")) == len([mock_logits]) + 1 - print(state.get("prompt_logits")) \ No newline at end of file + assert len(state.get("prompt_logits")) == len([mock_logits]) + 1 \ No newline at end of file From 254158162868709290aa944ed3fdb090a43a431c Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 7 Nov 2023 21:27:00 -0500 Subject: [PATCH 14/43] unit testing completion --- .../v2/text_generation/nl_engine_operator.py | 2 +- src/deepsparse/v2/text_generation/pipeline.py | 1 - .../v2/text_generation/prep_for_generation.py | 48 +-- .../v2/text_generation/process_inputs.py | 7 
+- .../v2/unit/test_text_generation.py | 283 +++++++++++------- 5 files changed, 178 insertions(+), 163 deletions(-) diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index 0bd9098a40..7549f986d9 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -29,7 +29,7 @@ ) -__all__ = ["NLEngineOperator"] +__all__ = ["NLEngineOperator", "NlEngineInput"] class NlEngineInput(BaseModel): diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 240da04907..1c2972859b 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -124,7 +124,6 @@ def __init__( token_generator = TokenGeneratorOperator() prep_for_generation = PrepareGeneration( sequence_length=sequence_length, - prompt_sequence_length=prompt_sequence_length, token_generator=token_generator, ) generate_new_token = GenerateNewTokenOperator( diff --git a/src/deepsparse/v2/text_generation/prep_for_generation.py b/src/deepsparse/v2/text_generation/prep_for_generation.py index 887f81e173..75f4aa9db2 100644 --- a/src/deepsparse/v2/text_generation/prep_for_generation.py +++ b/src/deepsparse/v2/text_generation/prep_for_generation.py @@ -11,12 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import copy from typing import Any import numpy -import copy from deepsparse.transformers.pipelines.text_generation import FinishReason +from deepsparse.transformers.utils.helpers import set_generated_length from deepsparse.v2.operators import Operator from deepsparse.v2.text_generation import TokenGeneratorOperator from deepsparse.v2.utils import InferenceState @@ -29,10 +30,8 @@ class PrepareGeneration(Operator): def __init__( self, token_generator: TokenGeneratorOperator, - prompt_sequence_length: int, sequence_length: int, ): - self.prompt_sequence_length = prompt_sequence_length self.sequence_length = sequence_length self.token_generator_creator = token_generator @@ -48,49 +47,6 @@ def can_operate(self, inp: Any): return True return False - @staticmethod - def set_generated_length( - max_length: int, - prompt_tokens_length: int, - sequence_length: int, - prompt_sequence_length: int, - max_new_tokens: int, - finish_reason_choices: "FinishReason", # noqa - ): - """ - Determine the length of the generated tokens. The hard cap on the total number - of tokens is based on the sequence length. If max_length is provided and is less - than the sequence length, it will be used to cap the total number of tokens - generated. If it is not provided, the max_new_tokens attribute will be used and - also capped by the sequence length. 
- - :param max_length: max_length attribute, provided as input during inference - :param prompt_tokens_length: the number of prompt tokens used as part of the - generated output - :param sequence_length: the sequence length used for the pipeline - :param prompt_sequence_length: the prompt sequence length used for the pipeline - :param max_new_tokens: the max_new_tokens attribute, which may be provided - as part of the input during inference - """ - if max_length: - # if max_length provided, use that to cap total tokens generated - max_tokens = max_length - finish_reason = finish_reason_choices.LENGTH - else: - # if not provided, max tokens is based on max_new_tokens + prompt tokens - max_tokens = ( - min(max_new_tokens, sequence_length - prompt_sequence_length) - + prompt_tokens_length - ) - finish_reason = finish_reason_choices.MAX_NEW_TOKENS - - # hard model/pipeline cap - return ( - (sequence_length, finish_reason_choices.CAPACITY) - if sequence_length < max_tokens - else (max_tokens, finish_reason) - ) - def run( self, tokens: Any, kv_cache: Any, inference_state: InferenceState, **kwargs ): diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py index 31f5aa0504..059ed06f14 100644 --- a/src/deepsparse/v2/text_generation/process_inputs.py +++ b/src/deepsparse/v2/text_generation/process_inputs.py @@ -26,6 +26,9 @@ from deepsparse.v2.operators import Operator +__all__ = ["ProcessInputsTextGeneration", "GenerationDefaults"] + + class GenerationDefaults: num_return_sequences = 1 max_length = 10 @@ -38,9 +41,6 @@ class GenerationDefaults: temperature = 1.0 -__all__ = ["ProcessInputsTextGeneration"] - - class ProcessInputsTextGeneration(Operator): """ Input processing operator. Responsible for tokenizing the input, handling the @@ -58,7 +58,6 @@ def __init__( generation_config: Union[ str, pathlib.Path, Dict, transformers.GenerationConfig ] = None, - ): self.generation_config = generation_config self.tokenizer = tokenizer diff --git a/tests/deepsparse/v2/unit/test_text_generation.py b/tests/deepsparse/v2/unit/test_text_generation.py index f33776e010..410bcffdd1 100644 --- a/tests/deepsparse/v2/unit/test_text_generation.py +++ b/tests/deepsparse/v2/unit/test_text_generation.py @@ -12,27 +12,44 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import inspect +import copy import numpy +from transformers import AutoTokenizer import pytest -from deepsparse.v2.text_generation import TextGenerationPipeline -from deepsparse.transformers.utils.helpers import prepends_bos_token from deepsparse.transformers.helpers import get_deployment_path -from transformers import AutoTokenizer from deepsparse.transformers.pipelines.text_generation import TextGenerationInput -from deepsparse.v2.text_generation.process_inputs import GenerationDefaults -from deepsparse.v2.utils import InferenceState -from deepsparse.v2.text_generation import PrepareGeneration, TokenGeneratorOperator, InferenceState -import copy +from deepsparse.transformers.utils import DecoderKVCache +from deepsparse.transformers.utils.helpers import initialize_kv_cache_state +from deepsparse.v2 import InferenceState, PipelineState +from deepsparse.v2.text_generation import ( + AutoRegressiveOperatorPreprocess, + CompilePromptLogits, + GenerateNewTokenOperator, + GenerationDefaults, + KVCacheCreator, + KVCacheCreatorInput, + MultiEnginePrefill, + NlEngineInput, + NLEngineOperator, + PrepareGeneration, + ProcessInputsTextGeneration, + TokenGeneratorOperator, +) @pytest.fixture def text_generation_attributes(): sequence_length = 5 - prompt_sequence_length = 2 + prompt_sequence_length = 1 + return sequence_length, prompt_sequence_length + + +@pytest.fixture +def model_attributes(text_generation_attributes): model_path = "hf:mgoin/TinyStories-1M-deepsparse" + sequence_length, prompt_sequence_length = text_generation_attributes deployment_path, model_path = get_deployment_path(model_path) tokenizer = AutoTokenizer.from_pretrained( @@ -45,104 +62,109 @@ def text_generation_attributes(): if not tokenizer.pad_token: tokenizer.pad_token = tokenizer.eos_token - return sequence_length, prompt_sequence_length, model_path, tokenizer + return tokenizer, model_path @pytest.fixture -def single_token_engine_no_internal_cache(text_generation_attributes): - from deepsparse.v2.text_generation import NLEngineOperator - seq_length, _, model_path, _ = text_generation_attributes +def single_token_engine_no_internal_cache(text_generation_attributes, model_attributes): + seq_length, _ = text_generation_attributes + _, model_path = model_attributes + nl_engine_operator = NLEngineOperator( - sequence_length=seq_length, - input_ids_length=1, - model_path=model_path + sequence_length=seq_length, input_ids_length=1, model_path=model_path ) return nl_engine_operator + @pytest.fixture def pipeline_state(single_token_engine_no_internal_cache): - from deepsparse.v2.utils import PipelineState - pipeline_state = PipelineState() pipeline_state_vals = {} pipeline_state_vals[ "onnx_input_names_no_cache" ] = single_token_engine_no_internal_cache.onnx_input_names_no_cache - pipeline_state_vals["cache_shape"] = single_token_engine_no_internal_cache.cache_shape - pipeline_state_vals["output_names"] = single_token_engine_no_internal_cache.output_names - print(pipeline_state_vals) + pipeline_state_vals[ + "cache_shape" + ] = single_token_engine_no_internal_cache.cache_shape + pipeline_state_vals[ + "output_names" + ] = single_token_engine_no_internal_cache.output_names pipeline_state_vals[ "kv_cache_data_type" ] = single_token_engine_no_internal_cache.kv_cache_data_type pipeline_state.create_state(pipeline_state_vals) return pipeline_state + @pytest.fixture def large_prompt(): prompt = "Hello, how are you doing today?" 
generation_config = {"top_p": 0, "top_k": 0, "max_length": 10} return TextGenerationInput(prompt=prompt, generation_config=generation_config) + @pytest.fixture def small_prompt(): prompt = "Hello" return TextGenerationInput(prompt=prompt) + @pytest.fixture def mock_kv_cache(): - from deepsparse.transformers.utils import DecoderKVCache kv_cache = DecoderKVCache() kv_cache.setup( state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])}, ) return kv_cache + @pytest.fixture def mock_kv_cache_full(): - from deepsparse.transformers.utils import DecoderKVCache kv_cache = DecoderKVCache() kv_cache.setup( state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])}, - num_processed_tokens=3 + num_processed_tokens=3, ) return kv_cache -""" + @pytest.fixture -def mock_kv_cache_engine(pipeline_state): - from deepsparse.transformers.utils import DecoderKVCache +def mock_kv_cache_engine(pipeline_state, text_generation_attributes): + seq_len, _ = text_generation_attributes kv_cache = DecoderKVCache() kv_cache_state = initialize_kv_cache_state( cache_shape=pipeline_state.current_state.get("cache_shape"), kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"), output_names=pipeline_state.current_state.get("output_names"), - length=self.sequence_length - self.prompt_sequence_length, - empty=bool(self.internal_kv_cache), + length=seq_len - 1, + empty=False, ) - print(state) + kv_cache.setup(state=kv_cache_state) return kv_cache -""" + @pytest.fixture def mock_tokens(): return [15496] + @pytest.fixture def mock_tokens_multiple(): return [15496, 15496, 15496] + @pytest.fixture def mock_inference_state(): generation_config = GenerationDefaults() inference_state = InferenceState() inference_state.create_state({}) - inference_state.update_state({ - "generation_config": generation_config}) + inference_state.update_state({"generation_config": generation_config}) return inference_state + @pytest.fixture -def mock_token_generator(text_generation_attributes, mock_tokens_multiple): - _, _, _, tokenizer = text_generation_attributes +def mock_token_generator(model_attributes, mock_tokens_multiple): + tokenizer, _ = model_attributes token_generator_creator = TokenGeneratorOperator() prompt_logits = numpy.random.rand(1, len(mock_tokens_multiple), len(tokenizer)) token_generator_creator_output = token_generator_creator.run( @@ -153,18 +175,20 @@ def mock_token_generator(text_generation_attributes, mock_tokens_multiple): ) return token_generator_creator_output.get("token_generator") + @pytest.fixture -def mock_logits(text_generation_attributes): - _, _, _, tokenizer = text_generation_attributes +def mock_logits(model_attributes): + tokenizer, _ = model_attributes return numpy.random.rand(1, 1, len(tokenizer)) -def test_process_inputs(text_generation_attributes, small_prompt, large_prompt): - sequence_length, _, _, tokenizer = text_generation_attributes - from deepsparse.v2.text_generation.process_inputs import ProcessInputsTextGeneration +def test_process_inputs( + text_generation_attributes, model_attributes, small_prompt, large_prompt +): + sequence_length, _ = text_generation_attributes + tokenizer, _ = model_attributes process_inputs = ProcessInputsTextGeneration( - sequence_length=sequence_length, - tokenizer=tokenizer + sequence_length=sequence_length, tokenizer=tokenizer ) outputs, state_update = process_inputs.run(small_prompt) @@ -173,115 +197,142 @@ def test_process_inputs(text_generation_attributes, small_prompt, large_prompt): assert state_update.get("prompts") == 
small_prompt.sequences outputs, state_update = process_inputs.run(large_prompt) - + assert not isinstance(state_update.get("generation_config"), GenerationDefaults) - assert state_update.get("generation_config").max_length == large_prompt.generation_config.get("max_length") + assert state_update.get( + "generation_config" + ).max_length == large_prompt.generation_config.get("max_length") assert outputs.get("tokens") assert state_update.get("top_k") == large_prompt.generation_config.get("top_k") def test_nl_single_token_engine_no_internal(single_token_engine_no_internal_cache): assert single_token_engine_no_internal_cache.input_ids_length == 1 - -def test_kv_cache_creation(pipeline_state, text_generation_attributes): - from deepsparse.v2.text_generation import KVCacheCreator, KVCacheCreatorInput - seq_length, prompt_sequence_length, model_path, tokenizer = text_generation_attributes + + +def test_kv_cache_creation( + pipeline_state, text_generation_attributes, model_attributes +): + seq_length, prompt_seq_len = text_generation_attributes + tokenizer, _ = model_attributes kv_cache_creator = KVCacheCreator( tokenizer=tokenizer, - prompt_sequence_length=prompt_sequence_length, + prompt_sequence_length=prompt_seq_len, sequence_length=seq_length, - internal_kv_cache=False + internal_kv_cache=False, ) - + assert kv_cache_creator.input_schema == KVCacheCreatorInput kv_cache = kv_cache_creator.run( cache_shape=pipeline_state.current_state.get("cache_shape"), kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"), - output_names=pipeline_state.current_state.get("output_names") + output_names=pipeline_state.current_state.get("output_names"), ) assert kv_cache.get("kv_cache") assert kv_cache.get("kv_cache").total_num_processed_tokens == 0 -def test_autoreg_preproces_can_run(text_generation_attributes, pipeline_state, mock_tokens, mock_kv_cache): - seq_len, prompt_seq_len, _, _ = text_generation_attributes - from deepsparse.v2.text_generation.autoregressive_preprocess_operator import AutoRegressiveOperatorPreprocess +def test_autoreg_preproces_can_run( + text_generation_attributes, pipeline_state, mock_tokens, mock_kv_cache +): + seq_len, _ = text_generation_attributes autoreg_prep = AutoRegressiveOperatorPreprocess( - sequence_length=seq_len, - prompt_sequence_length=prompt_seq_len + sequence_length=seq_len, prompt_sequence_length=len(mock_tokens) + 1 ) inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache} assert autoreg_prep.can_operate(inputs) outputs = autoreg_prep.run( - tokens=mock_tokens, - kv_cache=mock_kv_cache, - pipeline_state=pipeline_state + tokens=mock_tokens, kv_cache=mock_kv_cache, pipeline_state=pipeline_state ) - assert len(outputs.get("engine_inputs")) == 4 # tokens, attention mask, causal, positions + assert ( + len(outputs.get("engine_inputs")) == 4 + ) # tokens, attention mask, causal, positions tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") - print(outputs.get("engine_inputs")) + assert tokens.shape[-1] == 1 assert attention_mask.shape[-1] == seq_len assert positions[0] == mock_kv_cache.total_num_processed_tokens assert outputs.get("in_generation") is None -def test_autoreg_preproces_cant_run(text_generation_attributes, mock_kv_cache, mock_tokens_multiple): - seq_len, _, _, _ = text_generation_attributes - from deepsparse.v2.text_generation.autoregressive_preprocess_operator import AutoRegressiveOperatorPreprocess + +def test_autoreg_preproces_cant_run( + text_generation_attributes, mock_kv_cache, mock_tokens_multiple +): + 
seq_len, _ = text_generation_attributes autoreg_prep = AutoRegressiveOperatorPreprocess( - sequence_length=seq_len, - prompt_sequence_length=2 + sequence_length=seq_len, prompt_sequence_length=len(mock_tokens_multiple) ) inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache} assert not autoreg_prep.can_operate(inputs) - -def test_mult_engine_preprocess(text_generation_attributes, mock_kv_cache, mock_tokens_multiple, pipeline_state): - seq_len, prompt_seq_len, _, _ = text_generation_attributes - from deepsparse.v2.text_generation.multi_engine_prefill_operator import MultiEnginePrefill + + +def test_mult_engine_preprocess( + text_generation_attributes, mock_kv_cache, mock_tokens_multiple, pipeline_state +): + seq_len, _ = text_generation_attributes multi_prep = MultiEnginePrefill( - sequence_length=seq_len, - prompt_sequence_length=prompt_seq_len + sequence_length=seq_len, prompt_sequence_length=len(mock_tokens_multiple) ) inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache} assert multi_prep.can_operate(inputs) - outputs = multi_prep.run(tokens=mock_tokens_multiple, kv_cache=mock_kv_cache, pipeline_state=pipeline_state) - assert len(outputs.get("engine_inputs")) == 4 # tokens, attention mask, causal, positions + outputs = multi_prep.run( + tokens=mock_tokens_multiple, + kv_cache=mock_kv_cache, + pipeline_state=pipeline_state, + ) + assert ( + len(outputs.get("engine_inputs")) == 4 + ) # tokens, attention mask, causal, positions tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") - assert tokens.shape[-1] == prompt_seq_len + assert tokens.shape[-1] == len(mock_tokens_multiple) assert attention_mask.shape[-1] == seq_len - assert positions.shape[-1] == prompt_seq_len + assert positions.shape[-1] == len(mock_tokens_multiple) -def test_multi_engine_preprocess_cant_operate(text_generation_attributes, mock_kv_cache, mock_tokens): - seq_len, prompt_seq_len, _, _ = text_generation_attributes - from deepsparse.v2.text_generation.multi_engine_prefill_operator import MultiEnginePrefill + +def test_multi_engine_preprocess_cant_operate( + text_generation_attributes, mock_kv_cache, mock_tokens +): + seq_len, _ = text_generation_attributes multi_prep = MultiEnginePrefill( - sequence_length=seq_len, - prompt_sequence_length=prompt_seq_len + sequence_length=seq_len, prompt_sequence_length=len(mock_tokens) + 1 ) inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache} assert not multi_prep.can_operate(inputs) -""" -def test_run_single_engine_once(single_token_engine_no_internal_cache, mock_kv_cache_engine): - from deepsparse.v2.text_generation.nl_engine_operator import NlEngineInput - mock_engine_inputs = [numpy.array([[15496]]), numpy.array([[0, 0, 0, 0, 1]]), numpy.array([[0]]), numpy.array([[[[0, 0, 0, 0, 1]]]])] +def test_run_single_engine_once( + single_token_engine_no_internal_cache, + mock_kv_cache_engine, +): + + mock_engine_inputs = [ + numpy.array([[15496]]), + numpy.array([[0, 0, 0, 0, 1]]), + numpy.array([[0]]), + numpy.array([[[[0, 0, 0, 0, 1]]]]), + ] inputs = NlEngineInput( engine_inputs=mock_engine_inputs, kv_cache=mock_kv_cache_engine, - tokens=mock_engine_inputs[0].tolist() + tokens=mock_engine_inputs[0].tolist(), ) - print(single_token_engine_no_internal_cache.run(inputs)) -""" - -def test_prep_for_generation(mock_tokens_multiple, mock_kv_cache_full, text_generation_attributes, mock_inference_state): - seq_len, prompt_seq_len, _, tokenizer = text_generation_attributes + output = single_token_engine_no_internal_cache.run(inputs) + 
assert output + + +def test_prep_for_generation( + mock_tokens_multiple, + mock_kv_cache_full, + text_generation_attributes, + mock_inference_state, + model_attributes, +): + seq_len, _ = text_generation_attributes + tokenizer, _ = model_attributes prep_for_generation = PrepareGeneration( - token_generator=TokenGeneratorOperator(), - sequence_length=seq_len, - prompt_sequence_length=prompt_seq_len + token_generator=TokenGeneratorOperator(), sequence_length=seq_len ) inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache_full} assert prep_for_generation.can_operate(inputs) @@ -291,35 +342,45 @@ def test_prep_for_generation(mock_tokens_multiple, mock_kv_cache_full, text_gene outputs, state = prep_for_generation.run( tokens=mock_tokens_multiple, kv_cache=mock_kv_cache, - inference_state=mock_inference_state - ) + inference_state=mock_inference_state, + ) assert len(outputs.get("tokens")) == len(mock_tokens_multiple) + 1 assert outputs.get("in_generation") - assert numpy.array_equal(state.get("generated_logits")[0], numpy.expand_dims(prompt_logits[0][:, -1, :], 0)) + assert numpy.array_equal( + state.get("generated_logits")[0], + numpy.expand_dims(prompt_logits[0][:, -1, :], 0), + ) + -def test_generate_new_token(mock_token_generator, text_generation_attributes, mock_kv_cache, mock_inference_state, mock_logits, mock_tokens): - _, _, _, tokenizer = text_generation_attributes - from deepsparse.v2.text_generation import GenerateNewTokenOperator +def test_generate_new_token( + mock_token_generator, + model_attributes, + mock_kv_cache, + mock_inference_state, + mock_logits, + mock_tokens, +): + tokenizer, _ = model_attributes generate_new_token = GenerateNewTokenOperator( - force_max_tokens=False, - tokenizer=tokenizer + force_max_tokens=False, tokenizer=tokenizer + ) + mock_inference_state.update_state( + { + "token_generator": mock_token_generator, + "generated_tokens": [mock_token_generator.tokens], + } ) - mock_inference_state.update_state({"token_generator": mock_token_generator, "generated_tokens": [mock_token_generator.tokens]}) outputs, state = generate_new_token.run( - logits=mock_logits, - kv_cache=mock_kv_cache, - inference_state=mock_inference_state + logits=mock_logits, kv_cache=mock_kv_cache, inference_state=mock_inference_state ) assert outputs.get("new_token") == state.get("token_generator").tokens[-1] def test_compile_logits(mock_logits, mock_inference_state): - from deepsparse.v2.text_generation import CompilePromptLogits mock_inference_state.update_state({"prompt_logits": [mock_logits]}) compile_prompt_logits = CompilePromptLogits() assert compile_prompt_logits.can_operate({}) output, state = compile_prompt_logits.run( - logits=mock_logits, - inference_state=mock_inference_state + logits=mock_logits, inference_state=mock_inference_state ) - assert len(state.get("prompt_logits")) == len([mock_logits]) + 1 \ No newline at end of file + assert len(state.get("prompt_logits")) == len([mock_logits]) + 1 From 8c8989d03655efe15023432db21dd21023a14f43 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 7 Nov 2023 21:29:01 -0500 Subject: [PATCH 15/43] remove debug --- src/deepsparse/v2/text_generation/process_inputs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py index 059ed06f14..214b8526e3 100644 --- a/src/deepsparse/v2/text_generation/process_inputs.py +++ b/src/deepsparse/v2/text_generation/process_inputs.py @@ -31,7 +31,7 @@ class 
GenerationDefaults: num_return_sequences = 1 - max_length = 10 + max_length = 100 max_new_tokens = None output_scores = False top_k = 0 From f8d75e3fdbd5eac3b8cd041beac82d0339058ed2 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 7 Nov 2023 21:41:11 -0500 Subject: [PATCH 16/43] fix --- src/deepsparse/v2/text_generation/pipeline.py | 1 + .../v2/text_generation/prep_for_generation.py | 4 +++- .../deepsparse/v2/unit/test_text_generation.py | 17 +++++++++-------- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 1c2972859b..240da04907 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -124,6 +124,7 @@ def __init__( token_generator = TokenGeneratorOperator() prep_for_generation = PrepareGeneration( sequence_length=sequence_length, + prompt_sequence_length=prompt_sequence_length, token_generator=token_generator, ) generate_new_token = GenerateNewTokenOperator( diff --git a/src/deepsparse/v2/text_generation/prep_for_generation.py b/src/deepsparse/v2/text_generation/prep_for_generation.py index 75f4aa9db2..0ea4a06a02 100644 --- a/src/deepsparse/v2/text_generation/prep_for_generation.py +++ b/src/deepsparse/v2/text_generation/prep_for_generation.py @@ -30,10 +30,12 @@ class PrepareGeneration(Operator): def __init__( self, token_generator: TokenGeneratorOperator, + prompt_sequence_length: int, sequence_length: int, ): self.sequence_length = sequence_length self.token_generator_creator = token_generator + self.prompt_sequence_length = prompt_sequence_length def can_operate(self, inp: Any): kv_cache = inp.get("kv_cache") @@ -70,7 +72,7 @@ def run( token_generator = token_generator_creator_output.get("token_generator") token_generator.generate(prompt_logits[0, -1, :]) - max_tokens, length_finish_reason = PrepareGeneration.set_generated_length( + max_tokens, length_finish_reason = set_generated_length( max_length=generation_config.max_length, prompt_tokens_length=1, max_new_tokens=generation_config.max_new_tokens, diff --git a/tests/deepsparse/v2/unit/test_text_generation.py b/tests/deepsparse/v2/unit/test_text_generation.py index 410bcffdd1..97ed4fef95 100644 --- a/tests/deepsparse/v2/unit/test_text_generation.py +++ b/tests/deepsparse/v2/unit/test_text_generation.py @@ -211,7 +211,7 @@ def test_nl_single_token_engine_no_internal(single_token_engine_no_internal_cach def test_kv_cache_creation( - pipeline_state, text_generation_attributes, model_attributes + text_generation_attributes, model_attributes, pipeline_state ): seq_length, prompt_seq_len = text_generation_attributes tokenizer, _ = model_attributes @@ -269,7 +269,7 @@ def test_autoreg_preproces_cant_run( def test_mult_engine_preprocess( - text_generation_attributes, mock_kv_cache, mock_tokens_multiple, pipeline_state + text_generation_attributes, pipeline_state, mock_kv_cache, mock_tokens_multiple ): seq_len, _ = text_generation_attributes multi_prep = MultiEnginePrefill( @@ -323,16 +323,18 @@ def test_run_single_engine_once( def test_prep_for_generation( + text_generation_attributes, + model_attributes, mock_tokens_multiple, mock_kv_cache_full, - text_generation_attributes, mock_inference_state, - model_attributes, ): - seq_len, _ = text_generation_attributes + seq_len, prompt_seq_len = text_generation_attributes tokenizer, _ = model_attributes prep_for_generation = PrepareGeneration( - token_generator=TokenGeneratorOperator(), sequence_length=seq_len + 
prompt_sequence_length=prompt_seq_len, + token_generator=TokenGeneratorOperator(), + sequence_length=seq_len, ) inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache_full} assert prep_for_generation.can_operate(inputs) @@ -353,12 +355,11 @@ def test_prep_for_generation( def test_generate_new_token( - mock_token_generator, model_attributes, + mock_token_generator, mock_kv_cache, mock_inference_state, mock_logits, - mock_tokens, ): tokenizer, _ = model_attributes generate_new_token = GenerateNewTokenOperator( From fd1e466363f8fb0162640ae5f08aef964e58c084 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 7 Nov 2023 21:45:16 -0500 Subject: [PATCH 17/43] add todo --- src/deepsparse/v2/routers/router.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py index 1b70164002..6b0d851aef 100644 --- a/src/deepsparse/v2/routers/router.py +++ b/src/deepsparse/v2/routers/router.py @@ -158,4 +158,5 @@ def next( @staticmethod def validate(ops) -> bool: + # TODO: still needs to be implemented for the GraphRouter pass From 64c055266a50dc6ee65ef897783195f2006f943e Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 8 Nov 2023 09:58:53 -0500 Subject: [PATCH 18/43] more clean-up --- .../v2/text_generation/kv_cache_operator.py | 1 - .../deepsparse/v2/unit/test_text_generation.py | 18 +++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/deepsparse/v2/text_generation/kv_cache_operator.py b/src/deepsparse/v2/text_generation/kv_cache_operator.py index 5811f44b32..3c15d0ff5a 100644 --- a/src/deepsparse/v2/text_generation/kv_cache_operator.py +++ b/src/deepsparse/v2/text_generation/kv_cache_operator.py @@ -61,7 +61,6 @@ def run(self, cache_shape, kv_cache_data_type: str, output_names: list, **kwargs length=self.sequence_length - self.prompt_sequence_length, empty=bool(self.internal_kv_cache), ) - print(kv_cache_state.get("past_key_values.0.key").shape) kv_cache = DecoderKVCache(self.internal_kv_cache) kv_cache.setup( diff --git a/tests/deepsparse/v2/unit/test_text_generation.py b/tests/deepsparse/v2/unit/test_text_generation.py index 97ed4fef95..0d0c4ef3be 100644 --- a/tests/deepsparse/v2/unit/test_text_generation.py +++ b/tests/deepsparse/v2/unit/test_text_generation.py @@ -49,7 +49,7 @@ def text_generation_attributes(): @pytest.fixture def model_attributes(text_generation_attributes): model_path = "hf:mgoin/TinyStories-1M-deepsparse" - sequence_length, prompt_sequence_length = text_generation_attributes + sequence_length, _ = text_generation_attributes deployment_path, model_path = get_deployment_path(model_path) tokenizer = AutoTokenizer.from_pretrained( @@ -129,14 +129,14 @@ def mock_kv_cache_full(): @pytest.fixture -def mock_kv_cache_engine(pipeline_state, text_generation_attributes): - seq_len, _ = text_generation_attributes +def mock_kv_cache_single_token_engine(pipeline_state, text_generation_attributes): + seq_len, prompt_seq_len = text_generation_attributes kv_cache = DecoderKVCache() kv_cache_state = initialize_kv_cache_state( cache_shape=pipeline_state.current_state.get("cache_shape"), kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"), output_names=pipeline_state.current_state.get("output_names"), - length=seq_len - 1, + length=seq_len - prompt_seq_len, empty=False, ) kv_cache.setup(state=kv_cache_state) @@ -235,7 +235,7 @@ def test_kv_cache_creation( def test_autoreg_preproces_can_run( text_generation_attributes, pipeline_state, mock_tokens, mock_kv_cache 
): - seq_len, _ = text_generation_attributes + seq_len, prompt_seq_len = text_generation_attributes autoreg_prep = AutoRegressiveOperatorPreprocess( sequence_length=seq_len, prompt_sequence_length=len(mock_tokens) + 1 ) @@ -251,7 +251,7 @@ def test_autoreg_preproces_can_run( ) # tokens, attention mask, causal, positions tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") - assert tokens.shape[-1] == 1 + assert tokens.shape[-1] == prompt_seq_len assert attention_mask.shape[-1] == seq_len assert positions[0] == mock_kv_cache.total_num_processed_tokens assert outputs.get("in_generation") is None @@ -302,9 +302,9 @@ def test_multi_engine_preprocess_cant_operate( assert not multi_prep.can_operate(inputs) -def test_run_single_engine_once( +def test_run_single_token_engine_once( single_token_engine_no_internal_cache, - mock_kv_cache_engine, + mock_kv_cache_single_token_engine, ): mock_engine_inputs = [ @@ -315,7 +315,7 @@ def test_run_single_engine_once( ] inputs = NlEngineInput( engine_inputs=mock_engine_inputs, - kv_cache=mock_kv_cache_engine, + kv_cache=mock_kv_cache_single_token_engine, tokens=mock_engine_inputs[0].tolist(), ) output = single_token_engine_no_internal_cache.run(inputs) From 913665aea6af951a8bafdfa63ca568a14937f066 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 8 Nov 2023 10:07:44 -0500 Subject: [PATCH 19/43] fix test --- tests/deepsparse/v2/unit/test_text_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/deepsparse/v2/unit/test_text_generation.py b/tests/deepsparse/v2/unit/test_text_generation.py index 0d0c4ef3be..2d2edda94e 100644 --- a/tests/deepsparse/v2/unit/test_text_generation.py +++ b/tests/deepsparse/v2/unit/test_text_generation.py @@ -319,7 +319,7 @@ def test_run_single_token_engine_once( tokens=mock_engine_inputs[0].tolist(), ) output = single_token_engine_no_internal_cache.run(inputs) - assert output + assert output.get("logits") is not None def test_prep_for_generation( From e15521fc53bbe073504cd5ec13fcbbe639702a9f Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 8 Nov 2023 10:38:19 -0500 Subject: [PATCH 20/43] add docstrings/comments --- .../v2/unit/test_text_generation.py | 87 ++++++++++++++++--- 1 file changed, 75 insertions(+), 12 deletions(-) diff --git a/tests/deepsparse/v2/unit/test_text_generation.py b/tests/deepsparse/v2/unit/test_text_generation.py index 2d2edda94e..59204cb2d0 100644 --- a/tests/deepsparse/v2/unit/test_text_generation.py +++ b/tests/deepsparse/v2/unit/test_text_generation.py @@ -119,7 +119,7 @@ def mock_kv_cache(): @pytest.fixture -def mock_kv_cache_full(): +def mock_kv_cache_three_tokens_processed(): kv_cache = DecoderKVCache() kv_cache.setup( state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])}, @@ -185,6 +185,10 @@ def mock_logits(model_attributes): def test_process_inputs( text_generation_attributes, model_attributes, small_prompt, large_prompt ): + """ + Check if the ProcessInputsTextGeneration Operator successfully processes the + inputs and generation config. + """ sequence_length, _ = text_generation_attributes tokenizer, _ = model_attributes process_inputs = ProcessInputsTextGeneration( @@ -213,6 +217,10 @@ def test_nl_single_token_engine_no_internal(single_token_engine_no_internal_cach def test_kv_cache_creation( text_generation_attributes, model_attributes, pipeline_state ): + """ + Check if the KVCacheCreator successfully creates a kv_cache object, given the + single_token_engine attributes stored in the pipeline_state. 
+ """ seq_length, prompt_seq_len = text_generation_attributes tokenizer, _ = model_attributes kv_cache_creator = KVCacheCreator( @@ -235,23 +243,29 @@ def test_kv_cache_creation( def test_autoreg_preproces_can_run( text_generation_attributes, pipeline_state, mock_tokens, mock_kv_cache ): - seq_len, prompt_seq_len = text_generation_attributes + """ + Check if the single-token engine preprocess operator can run based on the provided + tokens and prompt_sequence_length. + """ + + seq_len, _ = text_generation_attributes autoreg_prep = AutoRegressiveOperatorPreprocess( sequence_length=seq_len, prompt_sequence_length=len(mock_tokens) + 1 ) inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache} + # The prompt_sequence_length is greater than the number of tokens that are to be + # operated on. Therefore, use the single_token_engine and can_operate() should be + # True. assert autoreg_prep.can_operate(inputs) outputs = autoreg_prep.run( tokens=mock_tokens, kv_cache=mock_kv_cache, pipeline_state=pipeline_state ) - - assert ( - len(outputs.get("engine_inputs")) == 4 - ) # tokens, attention mask, causal, positions + # Assert 4 engine inputs: tokens, attention mask, causal, positions + assert len(outputs.get("engine_inputs")) == 4 tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") - assert tokens.shape[-1] == prompt_seq_len + assert tokens.shape[-1] == 1 assert attention_mask.shape[-1] == seq_len assert positions[0] == mock_kv_cache.total_num_processed_tokens assert outputs.get("in_generation") is None @@ -260,32 +274,47 @@ def test_autoreg_preproces_can_run( def test_autoreg_preproces_cant_run( text_generation_attributes, mock_kv_cache, mock_tokens_multiple ): + """ + Check if the single-token engine preprocess operator can run based on the provided + tokens and prompt_sequence_length. + """ + seq_len, _ = text_generation_attributes autoreg_prep = AutoRegressiveOperatorPreprocess( sequence_length=seq_len, prompt_sequence_length=len(mock_tokens_multiple) ) inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache} + # can_operate() should be False as the prompt_sequence_length is equal to the + # number of tokens we want to operate on. Therefore, the multi-token engine + # should run instead. assert not autoreg_prep.can_operate(inputs) def test_mult_engine_preprocess( text_generation_attributes, pipeline_state, mock_kv_cache, mock_tokens_multiple ): + """ + Check if the multi-token engine preprocess operator can run based on the provided + tokens and prompt_sequence_length. + """ + seq_len, _ = text_generation_attributes multi_prep = MultiEnginePrefill( sequence_length=seq_len, prompt_sequence_length=len(mock_tokens_multiple) ) inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache} + # The number of tokens is equal to the prompt_sequence_length. + # Therefore, the multi_token_engine can run and can_operate() should be True. 
assert multi_prep.can_operate(inputs) outputs = multi_prep.run( tokens=mock_tokens_multiple, kv_cache=mock_kv_cache, pipeline_state=pipeline_state, ) - assert ( - len(outputs.get("engine_inputs")) == 4 - ) # tokens, attention mask, causal, positions + # Expect 4 engine inputs: tokens, attention mask, causal, positions + assert len(outputs.get("engine_inputs")) == 4 tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") + # Assert proper shapes for all engine_inputs assert tokens.shape[-1] == len(mock_tokens_multiple) assert attention_mask.shape[-1] == seq_len assert positions.shape[-1] == len(mock_tokens_multiple) @@ -294,11 +323,18 @@ def test_mult_engine_preprocess( def test_multi_engine_preprocess_cant_operate( text_generation_attributes, mock_kv_cache, mock_tokens ): + """ + Check if the multi-token engine preprocess operator can run based on the provided + tokens and prompt_sequence_length. + """ seq_len, _ = text_generation_attributes multi_prep = MultiEnginePrefill( sequence_length=seq_len, prompt_sequence_length=len(mock_tokens) + 1 ) inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache} + # The prompt_sequence_length is one greater than the total number of tokens we're + # processing. Therefore, this operator should not run and can_operate() should be + # False. assert not multi_prep.can_operate(inputs) @@ -306,6 +342,10 @@ def test_run_single_token_engine_once( single_token_engine_no_internal_cache, mock_kv_cache_single_token_engine, ): + """ + This operator runs through the single-token NLEngine once, given engine_inputs and + kv_cache. + """ mock_engine_inputs = [ numpy.array([[15496]]), @@ -326,9 +366,13 @@ def test_prep_for_generation( text_generation_attributes, model_attributes, mock_tokens_multiple, - mock_kv_cache_full, + mock_kv_cache_three_tokens_processed, mock_inference_state, ): + """ + This test will assess the PrepareGeneration, which runs after prompt_inference + and before generation. + """ seq_len, prompt_seq_len = text_generation_attributes tokenizer, _ = model_attributes prep_for_generation = PrepareGeneration( @@ -336,7 +380,13 @@ def test_prep_for_generation( token_generator=TokenGeneratorOperator(), sequence_length=seq_len, ) - inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache_full} + inputs = { + "tokens": mock_tokens_multiple, + "kv_cache": mock_kv_cache_three_tokens_processed, + } + # can_operate() if the total number of prompt tokens is equal to the + # number of processed tokens stored in the kv_cache, indicating prompt inference is + # complete and generation can begin. assert prep_for_generation.can_operate(inputs) prompt_logits = [numpy.random.rand(1, len(mock_tokens_multiple), len(tokenizer))] @@ -361,6 +411,11 @@ def test_generate_new_token( mock_inference_state, mock_logits, ): + """ + This test is responsible for testing the GenerateNewTokenOperator, which generates + one new token, given a token_generator (stored in the inference_state) and logits + from the engine. 
+ """ tokenizer, _ = model_attributes generate_new_token = GenerateNewTokenOperator( force_max_tokens=False, tokenizer=tokenizer @@ -374,14 +429,22 @@ def test_generate_new_token( outputs, state = generate_new_token.run( logits=mock_logits, kv_cache=mock_kv_cache, inference_state=mock_inference_state ) + # The new_token generated/returned by ths operator should match the last token in + # token_generator assert outputs.get("new_token") == state.get("token_generator").tokens[-1] def test_compile_logits(mock_logits, mock_inference_state): mock_inference_state.update_state({"prompt_logits": [mock_logits]}) compile_prompt_logits = CompilePromptLogits() + # Can operate as long as we're not in generation but in prompt_inference. This + # can_operate() will check for the `in_generation` flag in the input. assert compile_prompt_logits.can_operate({}) output, state = compile_prompt_logits.run( logits=mock_logits, inference_state=mock_inference_state ) + # The CompilePromptLogits is responsible for updating a list of prompt logits + # calculated at each step during prompt inference. After one step of running this + # operator, the total number of prompt_logits in the inference state should be + # the current length of prompt logits + 1 assert len(state.get("prompt_logits")) == len([mock_logits]) + 1 From 379481e159186434b482df82d17d5893a4a23071 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Thu, 9 Nov 2023 16:53:41 -0500 Subject: [PATCH 21/43] break out tests to individual unit test files; add conftest and make scope of fixtures module to help with speed --- .../v2/unit/test_text_generation.py | 450 ------------------ .../v2/unit/text_generation/conftest.py | 173 +++++++ .../v2/unit/text_generation/test_kv_cache.py | 41 ++ .../v2/unit/text_generation/test_msic.py | 31 ++ .../text_generation/test_process_inputs.py | 47 ++ .../test_single_token_engine.py | 98 ++++ .../text_generation/test_token_generation.py | 92 ++++ .../text_multi_token_engine.py | 63 +++ 8 files changed, 545 insertions(+), 450 deletions(-) delete mode 100644 tests/deepsparse/v2/unit/test_text_generation.py create mode 100644 tests/deepsparse/v2/unit/text_generation/conftest.py create mode 100644 tests/deepsparse/v2/unit/text_generation/test_kv_cache.py create mode 100644 tests/deepsparse/v2/unit/text_generation/test_msic.py create mode 100644 tests/deepsparse/v2/unit/text_generation/test_process_inputs.py create mode 100644 tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py create mode 100644 tests/deepsparse/v2/unit/text_generation/test_token_generation.py create mode 100644 tests/deepsparse/v2/unit/text_generation/text_multi_token_engine.py diff --git a/tests/deepsparse/v2/unit/test_text_generation.py b/tests/deepsparse/v2/unit/test_text_generation.py deleted file mode 100644 index 59204cb2d0..0000000000 --- a/tests/deepsparse/v2/unit/test_text_generation.py +++ /dev/null @@ -1,450 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy - -import numpy -from transformers import AutoTokenizer - -import pytest -from deepsparse.transformers.helpers import get_deployment_path -from deepsparse.transformers.pipelines.text_generation import TextGenerationInput -from deepsparse.transformers.utils import DecoderKVCache -from deepsparse.transformers.utils.helpers import initialize_kv_cache_state -from deepsparse.v2 import InferenceState, PipelineState -from deepsparse.v2.text_generation import ( - AutoRegressiveOperatorPreprocess, - CompilePromptLogits, - GenerateNewTokenOperator, - GenerationDefaults, - KVCacheCreator, - KVCacheCreatorInput, - MultiEnginePrefill, - NlEngineInput, - NLEngineOperator, - PrepareGeneration, - ProcessInputsTextGeneration, - TokenGeneratorOperator, -) - - -@pytest.fixture -def text_generation_attributes(): - sequence_length = 5 - prompt_sequence_length = 1 - return sequence_length, prompt_sequence_length - - -@pytest.fixture -def model_attributes(text_generation_attributes): - model_path = "hf:mgoin/TinyStories-1M-deepsparse" - sequence_length, _ = text_generation_attributes - deployment_path, model_path = get_deployment_path(model_path) - - tokenizer = AutoTokenizer.from_pretrained( - deployment_path, - trust_remote_code=False, - model_max_length=sequence_length, - ) - - tokenizer.padding_side = "left" - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - - return tokenizer, model_path - - -@pytest.fixture -def single_token_engine_no_internal_cache(text_generation_attributes, model_attributes): - seq_length, _ = text_generation_attributes - _, model_path = model_attributes - - nl_engine_operator = NLEngineOperator( - sequence_length=seq_length, input_ids_length=1, model_path=model_path - ) - return nl_engine_operator - - -@pytest.fixture -def pipeline_state(single_token_engine_no_internal_cache): - pipeline_state = PipelineState() - pipeline_state_vals = {} - pipeline_state_vals[ - "onnx_input_names_no_cache" - ] = single_token_engine_no_internal_cache.onnx_input_names_no_cache - pipeline_state_vals[ - "cache_shape" - ] = single_token_engine_no_internal_cache.cache_shape - pipeline_state_vals[ - "output_names" - ] = single_token_engine_no_internal_cache.output_names - pipeline_state_vals[ - "kv_cache_data_type" - ] = single_token_engine_no_internal_cache.kv_cache_data_type - pipeline_state.create_state(pipeline_state_vals) - return pipeline_state - - -@pytest.fixture -def large_prompt(): - prompt = "Hello, how are you doing today?" 
- generation_config = {"top_p": 0, "top_k": 0, "max_length": 10} - return TextGenerationInput(prompt=prompt, generation_config=generation_config) - - -@pytest.fixture -def small_prompt(): - prompt = "Hello" - return TextGenerationInput(prompt=prompt) - - -@pytest.fixture -def mock_kv_cache(): - kv_cache = DecoderKVCache() - kv_cache.setup( - state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])}, - ) - return kv_cache - - -@pytest.fixture -def mock_kv_cache_three_tokens_processed(): - kv_cache = DecoderKVCache() - kv_cache.setup( - state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])}, - num_processed_tokens=3, - ) - return kv_cache - - -@pytest.fixture -def mock_kv_cache_single_token_engine(pipeline_state, text_generation_attributes): - seq_len, prompt_seq_len = text_generation_attributes - kv_cache = DecoderKVCache() - kv_cache_state = initialize_kv_cache_state( - cache_shape=pipeline_state.current_state.get("cache_shape"), - kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"), - output_names=pipeline_state.current_state.get("output_names"), - length=seq_len - prompt_seq_len, - empty=False, - ) - kv_cache.setup(state=kv_cache_state) - return kv_cache - - -@pytest.fixture -def mock_tokens(): - return [15496] - - -@pytest.fixture -def mock_tokens_multiple(): - return [15496, 15496, 15496] - - -@pytest.fixture -def mock_inference_state(): - generation_config = GenerationDefaults() - inference_state = InferenceState() - inference_state.create_state({}) - inference_state.update_state({"generation_config": generation_config}) - return inference_state - - -@pytest.fixture -def mock_token_generator(model_attributes, mock_tokens_multiple): - tokenizer, _ = model_attributes - token_generator_creator = TokenGeneratorOperator() - prompt_logits = numpy.random.rand(1, len(mock_tokens_multiple), len(tokenizer)) - token_generator_creator_output = token_generator_creator.run( - logits_shape=prompt_logits[0, -1, :].shape, - deterministic=True, - sampling_temperature=1.0, - tokens=copy.copy(mock_tokens_multiple), - ) - return token_generator_creator_output.get("token_generator") - - -@pytest.fixture -def mock_logits(model_attributes): - tokenizer, _ = model_attributes - return numpy.random.rand(1, 1, len(tokenizer)) - - -def test_process_inputs( - text_generation_attributes, model_attributes, small_prompt, large_prompt -): - """ - Check if the ProcessInputsTextGeneration Operator successfully processes the - inputs and generation config. 
- """ - sequence_length, _ = text_generation_attributes - tokenizer, _ = model_attributes - process_inputs = ProcessInputsTextGeneration( - sequence_length=sequence_length, tokenizer=tokenizer - ) - - outputs, state_update = process_inputs.run(small_prompt) - assert len(outputs.get("tokens")) == 1 - assert isinstance(state_update.get("generation_config"), GenerationDefaults) - assert state_update.get("prompts") == small_prompt.sequences - - outputs, state_update = process_inputs.run(large_prompt) - - assert not isinstance(state_update.get("generation_config"), GenerationDefaults) - assert state_update.get( - "generation_config" - ).max_length == large_prompt.generation_config.get("max_length") - assert outputs.get("tokens") - assert state_update.get("top_k") == large_prompt.generation_config.get("top_k") - - -def test_nl_single_token_engine_no_internal(single_token_engine_no_internal_cache): - assert single_token_engine_no_internal_cache.input_ids_length == 1 - - -def test_kv_cache_creation( - text_generation_attributes, model_attributes, pipeline_state -): - """ - Check if the KVCacheCreator successfully creates a kv_cache object, given the - single_token_engine attributes stored in the pipeline_state. - """ - seq_length, prompt_seq_len = text_generation_attributes - tokenizer, _ = model_attributes - kv_cache_creator = KVCacheCreator( - tokenizer=tokenizer, - prompt_sequence_length=prompt_seq_len, - sequence_length=seq_length, - internal_kv_cache=False, - ) - - assert kv_cache_creator.input_schema == KVCacheCreatorInput - kv_cache = kv_cache_creator.run( - cache_shape=pipeline_state.current_state.get("cache_shape"), - kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"), - output_names=pipeline_state.current_state.get("output_names"), - ) - assert kv_cache.get("kv_cache") - assert kv_cache.get("kv_cache").total_num_processed_tokens == 0 - - -def test_autoreg_preproces_can_run( - text_generation_attributes, pipeline_state, mock_tokens, mock_kv_cache -): - """ - Check if the single-token engine preprocess operator can run based on the provided - tokens and prompt_sequence_length. - """ - - seq_len, _ = text_generation_attributes - autoreg_prep = AutoRegressiveOperatorPreprocess( - sequence_length=seq_len, prompt_sequence_length=len(mock_tokens) + 1 - ) - inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache} - - # The prompt_sequence_length is greater than the number of tokens that are to be - # operated on. Therefore, use the single_token_engine and can_operate() should be - # True. - assert autoreg_prep.can_operate(inputs) - outputs = autoreg_prep.run( - tokens=mock_tokens, kv_cache=mock_kv_cache, pipeline_state=pipeline_state - ) - # Assert 4 engine inputs: tokens, attention mask, causal, positions - assert len(outputs.get("engine_inputs")) == 4 - tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") - - assert tokens.shape[-1] == 1 - assert attention_mask.shape[-1] == seq_len - assert positions[0] == mock_kv_cache.total_num_processed_tokens - assert outputs.get("in_generation") is None - - -def test_autoreg_preproces_cant_run( - text_generation_attributes, mock_kv_cache, mock_tokens_multiple -): - """ - Check if the single-token engine preprocess operator can run based on the provided - tokens and prompt_sequence_length. 
- """ - - seq_len, _ = text_generation_attributes - autoreg_prep = AutoRegressiveOperatorPreprocess( - sequence_length=seq_len, prompt_sequence_length=len(mock_tokens_multiple) - ) - inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache} - # can_operate() should be False as the prompt_sequence_length is equal to the - # number of tokens we want to operate on. Therefore, the multi-token engine - # should run instead. - assert not autoreg_prep.can_operate(inputs) - - -def test_mult_engine_preprocess( - text_generation_attributes, pipeline_state, mock_kv_cache, mock_tokens_multiple -): - """ - Check if the multi-token engine preprocess operator can run based on the provided - tokens and prompt_sequence_length. - """ - - seq_len, _ = text_generation_attributes - multi_prep = MultiEnginePrefill( - sequence_length=seq_len, prompt_sequence_length=len(mock_tokens_multiple) - ) - inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache} - # The number of tokens is equal to the prompt_sequence_length. - # Therefore, the multi_token_engine can run and can_operate() should be True. - assert multi_prep.can_operate(inputs) - outputs = multi_prep.run( - tokens=mock_tokens_multiple, - kv_cache=mock_kv_cache, - pipeline_state=pipeline_state, - ) - # Expect 4 engine inputs: tokens, attention mask, causal, positions - assert len(outputs.get("engine_inputs")) == 4 - tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") - # Assert proper shapes for all engine_inputs - assert tokens.shape[-1] == len(mock_tokens_multiple) - assert attention_mask.shape[-1] == seq_len - assert positions.shape[-1] == len(mock_tokens_multiple) - - -def test_multi_engine_preprocess_cant_operate( - text_generation_attributes, mock_kv_cache, mock_tokens -): - """ - Check if the multi-token engine preprocess operator can run based on the provided - tokens and prompt_sequence_length. - """ - seq_len, _ = text_generation_attributes - multi_prep = MultiEnginePrefill( - sequence_length=seq_len, prompt_sequence_length=len(mock_tokens) + 1 - ) - inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache} - # The prompt_sequence_length is one greater than the total number of tokens we're - # processing. Therefore, this operator should not run and can_operate() should be - # False. - assert not multi_prep.can_operate(inputs) - - -def test_run_single_token_engine_once( - single_token_engine_no_internal_cache, - mock_kv_cache_single_token_engine, -): - """ - This operator runs through the single-token NLEngine once, given engine_inputs and - kv_cache. - """ - - mock_engine_inputs = [ - numpy.array([[15496]]), - numpy.array([[0, 0, 0, 0, 1]]), - numpy.array([[0]]), - numpy.array([[[[0, 0, 0, 0, 1]]]]), - ] - inputs = NlEngineInput( - engine_inputs=mock_engine_inputs, - kv_cache=mock_kv_cache_single_token_engine, - tokens=mock_engine_inputs[0].tolist(), - ) - output = single_token_engine_no_internal_cache.run(inputs) - assert output.get("logits") is not None - - -def test_prep_for_generation( - text_generation_attributes, - model_attributes, - mock_tokens_multiple, - mock_kv_cache_three_tokens_processed, - mock_inference_state, -): - """ - This test will assess the PrepareGeneration, which runs after prompt_inference - and before generation. 
- """ - seq_len, prompt_seq_len = text_generation_attributes - tokenizer, _ = model_attributes - prep_for_generation = PrepareGeneration( - prompt_sequence_length=prompt_seq_len, - token_generator=TokenGeneratorOperator(), - sequence_length=seq_len, - ) - inputs = { - "tokens": mock_tokens_multiple, - "kv_cache": mock_kv_cache_three_tokens_processed, - } - # can_operate() if the total number of prompt tokens is equal to the - # number of processed tokens stored in the kv_cache, indicating prompt inference is - # complete and generation can begin. - assert prep_for_generation.can_operate(inputs) - - prompt_logits = [numpy.random.rand(1, len(mock_tokens_multiple), len(tokenizer))] - mock_inference_state.update_state({"prompt_logits": prompt_logits}) - outputs, state = prep_for_generation.run( - tokens=mock_tokens_multiple, - kv_cache=mock_kv_cache, - inference_state=mock_inference_state, - ) - assert len(outputs.get("tokens")) == len(mock_tokens_multiple) + 1 - assert outputs.get("in_generation") - assert numpy.array_equal( - state.get("generated_logits")[0], - numpy.expand_dims(prompt_logits[0][:, -1, :], 0), - ) - - -def test_generate_new_token( - model_attributes, - mock_token_generator, - mock_kv_cache, - mock_inference_state, - mock_logits, -): - """ - This test is responsible for testing the GenerateNewTokenOperator, which generates - one new token, given a token_generator (stored in the inference_state) and logits - from the engine. - """ - tokenizer, _ = model_attributes - generate_new_token = GenerateNewTokenOperator( - force_max_tokens=False, tokenizer=tokenizer - ) - mock_inference_state.update_state( - { - "token_generator": mock_token_generator, - "generated_tokens": [mock_token_generator.tokens], - } - ) - outputs, state = generate_new_token.run( - logits=mock_logits, kv_cache=mock_kv_cache, inference_state=mock_inference_state - ) - # The new_token generated/returned by ths operator should match the last token in - # token_generator - assert outputs.get("new_token") == state.get("token_generator").tokens[-1] - - -def test_compile_logits(mock_logits, mock_inference_state): - mock_inference_state.update_state({"prompt_logits": [mock_logits]}) - compile_prompt_logits = CompilePromptLogits() - # Can operate as long as we're not in generation but in prompt_inference. This - # can_operate() will check for the `in_generation` flag in the input. - assert compile_prompt_logits.can_operate({}) - output, state = compile_prompt_logits.run( - logits=mock_logits, inference_state=mock_inference_state - ) - # The CompilePromptLogits is responsible for updating a list of prompt logits - # calculated at each step during prompt inference. After one step of running this - # operator, the total number of prompt_logits in the inference state should be - # the current length of prompt logits + 1 - assert len(state.get("prompt_logits")) == len([mock_logits]) + 1 diff --git a/tests/deepsparse/v2/unit/text_generation/conftest.py b/tests/deepsparse/v2/unit/text_generation/conftest.py new file mode 100644 index 0000000000..5d8483e5f6 --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/conftest.py @@ -0,0 +1,173 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy + +import numpy +from transformers import AutoTokenizer + +import pytest +from deepsparse.transformers.helpers import get_deployment_path +from deepsparse.transformers.pipelines.text_generation import TextGenerationInput +from deepsparse.transformers.utils import DecoderKVCache +from deepsparse.transformers.utils.helpers import initialize_kv_cache_state +from deepsparse.v2 import InferenceState, PipelineState +from deepsparse.v2.text_generation import ( + GenerationDefaults, + NLEngineOperator, + TokenGeneratorOperator, +) + + +@pytest.fixture(scope="module") +def text_generation_attributes(): + sequence_length = 5 + prompt_sequence_length = 1 + return sequence_length, prompt_sequence_length + + +@pytest.fixture(scope="module") +def model_attributes(text_generation_attributes): + model_path = "hf:mgoin/TinyStories-1M-deepsparse" + sequence_length, _ = text_generation_attributes + deployment_path, model_path = get_deployment_path(model_path) + + tokenizer = AutoTokenizer.from_pretrained( + deployment_path, + trust_remote_code=False, + model_max_length=sequence_length, + ) + + tokenizer.padding_side = "left" + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + return tokenizer, model_path + + +@pytest.fixture(scope="module") +def single_token_engine_no_internal_cache(text_generation_attributes, model_attributes): + seq_length, _ = text_generation_attributes + _, model_path = model_attributes + + nl_engine_operator = NLEngineOperator( + sequence_length=seq_length, input_ids_length=1, model_path=model_path + ) + return nl_engine_operator + + +@pytest.fixture(scope="module") +def pipeline_state(single_token_engine_no_internal_cache): + pipeline_state = PipelineState() + pipeline_state_vals = {} + pipeline_state_vals[ + "onnx_input_names_no_cache" + ] = single_token_engine_no_internal_cache.onnx_input_names_no_cache + pipeline_state_vals[ + "cache_shape" + ] = single_token_engine_no_internal_cache.cache_shape + pipeline_state_vals[ + "output_names" + ] = single_token_engine_no_internal_cache.output_names + pipeline_state_vals[ + "kv_cache_data_type" + ] = single_token_engine_no_internal_cache.kv_cache_data_type + pipeline_state.create_state(pipeline_state_vals) + return pipeline_state + + +@pytest.fixture(scope="module") +def large_prompt(): + prompt = "Hello, how are you doing today?" 
+ generation_config = {"top_p": 0, "top_k": 0, "max_length": 10} + return TextGenerationInput(prompt=prompt, generation_config=generation_config) + + +@pytest.fixture(scope="module") +def small_prompt(): + prompt = "Hello" + return TextGenerationInput(prompt=prompt) + + +@pytest.fixture(scope="module") +def mock_kv_cache(): + kv_cache = DecoderKVCache() + kv_cache.setup( + state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])}, + ) + return kv_cache + + +@pytest.fixture(scope="module") +def mock_kv_cache_three_tokens_processed(): + kv_cache = DecoderKVCache() + kv_cache.setup( + state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])}, + num_processed_tokens=3, + ) + return kv_cache + + +@pytest.fixture(scope="module") +def mock_kv_cache_single_token_engine(pipeline_state, text_generation_attributes): + seq_len, prompt_seq_len = text_generation_attributes + kv_cache = DecoderKVCache() + kv_cache_state = initialize_kv_cache_state( + cache_shape=pipeline_state.current_state.get("cache_shape"), + kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"), + output_names=pipeline_state.current_state.get("output_names"), + length=seq_len - prompt_seq_len, + empty=False, + ) + kv_cache.setup(state=kv_cache_state) + return kv_cache + + +@pytest.fixture(scope="module") +def mock_tokens(): + return [15496] + + +@pytest.fixture(scope="module") +def mock_tokens_multiple(): + return [15496, 15496, 15496] + + +@pytest.fixture(scope="module") +def mock_inference_state(): + generation_config = GenerationDefaults() + inference_state = InferenceState() + inference_state.create_state({}) + inference_state.update_state({"generation_config": generation_config}) + return inference_state + + +@pytest.fixture(scope="module") +def mock_token_generator(model_attributes, mock_tokens_multiple): + tokenizer, _ = model_attributes + token_generator_creator = TokenGeneratorOperator() + prompt_logits = numpy.random.rand(1, len(mock_tokens_multiple), len(tokenizer)) + token_generator_creator_output = token_generator_creator.run( + logits_shape=prompt_logits[0, -1, :].shape, + deterministic=True, + sampling_temperature=1.0, + tokens=copy.copy(mock_tokens_multiple), + ) + return token_generator_creator_output.get("token_generator") + + +@pytest.fixture(scope="module") +def mock_logits(model_attributes): + tokenizer, _ = model_attributes + return numpy.random.rand(1, 1, len(tokenizer)) diff --git a/tests/deepsparse/v2/unit/text_generation/test_kv_cache.py b/tests/deepsparse/v2/unit/text_generation/test_kv_cache.py new file mode 100644 index 0000000000..0c6e42503a --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/test_kv_cache.py @@ -0,0 +1,41 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from deepsparse.v2.text_generation import KVCacheCreator, KVCacheCreatorInput + + +def test_kv_cache_creation( + text_generation_attributes, model_attributes, pipeline_state +): + """ + Check if the KVCacheCreator successfully creates a kv_cache object, given the + single_token_engine attributes stored in the pipeline_state. + """ + seq_length, prompt_seq_len = text_generation_attributes + tokenizer, _ = model_attributes + kv_cache_creator = KVCacheCreator( + tokenizer=tokenizer, + prompt_sequence_length=prompt_seq_len, + sequence_length=seq_length, + internal_kv_cache=False, + ) + + assert kv_cache_creator.input_schema == KVCacheCreatorInput + kv_cache = kv_cache_creator.run( + cache_shape=pipeline_state.current_state.get("cache_shape"), + kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"), + output_names=pipeline_state.current_state.get("output_names"), + ) + assert kv_cache.get("kv_cache") + assert kv_cache.get("kv_cache").total_num_processed_tokens == 0 diff --git a/tests/deepsparse/v2/unit/text_generation/test_msic.py b/tests/deepsparse/v2/unit/text_generation/test_msic.py new file mode 100644 index 0000000000..caa0cc2efd --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/test_msic.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from deepsparse.v2.text_generation import CompilePromptLogits + + +def test_compile_logits(mock_logits, mock_inference_state): + mock_inference_state.update_state({"prompt_logits": [mock_logits]}) + compile_prompt_logits = CompilePromptLogits() + # Can operate as long as we're not in generation but in prompt_inference. This + # can_operate() will check for the `in_generation` flag in the input. + assert compile_prompt_logits.can_operate({}) + output, state = compile_prompt_logits.run( + logits=mock_logits, inference_state=mock_inference_state + ) + # The CompilePromptLogits is responsible for updating a list of prompt logits + # calculated at each step during prompt inference. After one step of running this + # operator, the total number of prompt_logits in the inference state should be + # the current length of prompt logits + 1 + assert len(state.get("prompt_logits")) == len([mock_logits]) + 1 diff --git a/tests/deepsparse/v2/unit/text_generation/test_process_inputs.py b/tests/deepsparse/v2/unit/text_generation/test_process_inputs.py new file mode 100644 index 0000000000..be59db7475 --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/test_process_inputs.py @@ -0,0 +1,47 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from deepsparse.v2.text_generation import ( + GenerationDefaults, + ProcessInputsTextGeneration, +) + + +def test_process_inputs( + text_generation_attributes, model_attributes, small_prompt, large_prompt +): + """ + Check if the ProcessInputsTextGeneration Operator successfully processes the + inputs and generation config. + """ + sequence_length, _ = text_generation_attributes + tokenizer, _ = model_attributes + process_inputs = ProcessInputsTextGeneration( + sequence_length=sequence_length, tokenizer=tokenizer + ) + + outputs, state_update = process_inputs.run(small_prompt) + assert len(outputs.get("input_ids")) == 1 + assert len(outputs.get("attention_mask")) == 1 + assert isinstance(state_update.get("generation_config"), GenerationDefaults) + assert state_update.get("prompts") == small_prompt.sequences + + outputs, state_update = process_inputs.run(large_prompt) + + assert not isinstance(state_update.get("generation_config"), GenerationDefaults) + assert state_update.get( + "generation_config" + ).max_length == large_prompt.generation_config.get("max_length") + assert outputs.get("input_ids") is not None + assert state_update.get("top_k") == large_prompt.generation_config.get("top_k") diff --git a/tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py b/tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py new file mode 100644 index 0000000000..335a28fbe3 --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py @@ -0,0 +1,98 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy + +from deepsparse.v2.text_generation import ( + AutoRegressiveOperatorPreprocess, + NlEngineInput, +) + + +def test_autoreg_preproces_can_run( + text_generation_attributes, pipeline_state, mock_tokens, mock_kv_cache +): + """ + Check if the single-token engine preprocess operator can run based on the provided + tokens and prompt_sequence_length. + """ + + seq_len, _ = text_generation_attributes + autoreg_prep = AutoRegressiveOperatorPreprocess( + sequence_length=seq_len, prompt_sequence_length=len(mock_tokens) + 1 + ) + inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache} + + # The prompt_sequence_length is greater than the number of tokens that are to be + # operated on. Therefore, use the single_token_engine and can_operate() should be + # True. 
+ assert autoreg_prep.can_operate(inputs) + outputs = autoreg_prep.run( + tokens=mock_tokens, kv_cache=mock_kv_cache, pipeline_state=pipeline_state + ) + # Assert 4 engine inputs: tokens, attention mask, causal, positions + assert len(outputs.get("engine_inputs")) == 4 + tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") + + assert tokens.shape[-1] == 1 + assert attention_mask.shape[-1] == seq_len + assert positions[0] == mock_kv_cache.total_num_processed_tokens + assert outputs.get("in_generation") is None + + +def test_autoreg_preproces_cant_run( + text_generation_attributes, mock_kv_cache, mock_tokens_multiple +): + """ + Check if the single-token engine preprocess operator can run based on the provided + tokens and prompt_sequence_length. + """ + + seq_len, _ = text_generation_attributes + autoreg_prep = AutoRegressiveOperatorPreprocess( + sequence_length=seq_len, prompt_sequence_length=len(mock_tokens_multiple) + ) + inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache} + # can_operate() should be False as the prompt_sequence_length is equal to the + # number of tokens we want to operate on. Therefore, the multi-token engine + # should run instead. + assert not autoreg_prep.can_operate(inputs) + + +def test_nl_single_token_engine_no_internal(single_token_engine_no_internal_cache): + assert single_token_engine_no_internal_cache.input_ids_length == 1 + + +def test_run_single_token_engine_once( + single_token_engine_no_internal_cache, + mock_kv_cache_single_token_engine, +): + """ + This operator runs through the single-token NLEngine once, given engine_inputs and + kv_cache. + """ + + mock_engine_inputs = [ + numpy.array([[15496]]), + numpy.array([[0, 0, 0, 0, 1]]), + numpy.array([[0]]), + numpy.array([[[[0, 0, 0, 0, 1]]]]), + ] + inputs = NlEngineInput( + engine_inputs=mock_engine_inputs, + kv_cache=mock_kv_cache_single_token_engine, + tokens=mock_engine_inputs[0].tolist(), + ) + output = single_token_engine_no_internal_cache.run(inputs) + assert output.get("logits") is not None diff --git a/tests/deepsparse/v2/unit/text_generation/test_token_generation.py b/tests/deepsparse/v2/unit/text_generation/test_token_generation.py new file mode 100644 index 0000000000..fbd9e06778 --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/test_token_generation.py @@ -0,0 +1,92 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy + +from deepsparse.v2.text_generation import ( + GenerateNewTokenOperator, + PrepareGeneration, + TokenGeneratorOperator, +) + + +def test_prep_for_generation( + text_generation_attributes, + model_attributes, + mock_tokens_multiple, + mock_kv_cache_three_tokens_processed, + mock_inference_state, +): + """ + This test will assess the PrepareGeneration, which runs after prompt_inference + and before generation. 
+ """ + seq_len, prompt_seq_len = text_generation_attributes + tokenizer, _ = model_attributes + prep_for_generation = PrepareGeneration( + prompt_sequence_length=prompt_seq_len, + token_generator=TokenGeneratorOperator(), + sequence_length=seq_len, + ) + inputs = { + "tokens": mock_tokens_multiple, + "kv_cache": mock_kv_cache_three_tokens_processed, + } + # can_operate() if the total number of prompt tokens is equal to the + # number of processed tokens stored in the kv_cache, indicating prompt inference is + # complete and generation can begin. + assert prep_for_generation.can_operate(inputs) + + prompt_logits = [numpy.random.rand(1, len(mock_tokens_multiple), len(tokenizer))] + mock_inference_state.update_state({"prompt_logits": prompt_logits}) + outputs, state = prep_for_generation.run( + tokens=mock_tokens_multiple, + kv_cache=mock_kv_cache_three_tokens_processed, + inference_state=mock_inference_state, + ) + assert len(outputs.get("tokens")) == len(mock_tokens_multiple) + 1 + assert outputs.get("in_generation") + assert numpy.array_equal( + state.get("generated_logits")[0], + numpy.expand_dims(prompt_logits[0][:, -1, :], 0), + ) + + +def test_generate_new_token( + model_attributes, + mock_token_generator, + mock_kv_cache, + mock_inference_state, + mock_logits, +): + """ + This test is responsible for testing the GenerateNewTokenOperator, which generates + one new token, given a token_generator (stored in the inference_state) and logits + from the engine. + """ + tokenizer, _ = model_attributes + generate_new_token = GenerateNewTokenOperator( + force_max_tokens=False, tokenizer=tokenizer + ) + mock_inference_state.update_state( + { + "token_generator": mock_token_generator, + "generated_tokens": [mock_token_generator.tokens], + } + ) + outputs, state = generate_new_token.run( + logits=mock_logits, kv_cache=mock_kv_cache, inference_state=mock_inference_state + ) + # The new_token generated/returned by ths operator should match the last token in + # token_generator + assert outputs.get("new_token") == state.get("token_generator").tokens[-1] diff --git a/tests/deepsparse/v2/unit/text_generation/text_multi_token_engine.py b/tests/deepsparse/v2/unit/text_generation/text_multi_token_engine.py new file mode 100644 index 0000000000..d2c822af4c --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/text_multi_token_engine.py @@ -0,0 +1,63 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from deepsparse.v2.text_generation import MultiEnginePrefill + + +def test_mult_engine_preprocess( + text_generation_attributes, pipeline_state, mock_kv_cache, mock_tokens_multiple +): + """ + Check if the multi-token engine preprocess operator can run based on the provided + tokens and prompt_sequence_length. 
+ """ + + seq_len, _ = text_generation_attributes + multi_prep = MultiEnginePrefill( + sequence_length=seq_len, prompt_sequence_length=len(mock_tokens_multiple) + ) + inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache} + # The number of tokens is equal to the prompt_sequence_length. + # Therefore, the multi_token_engine can run and can_operate() should be True. + assert multi_prep.can_operate(inputs) + outputs = multi_prep.run( + tokens=mock_tokens_multiple, + kv_cache=mock_kv_cache, + pipeline_state=pipeline_state, + ) + # Expect 4 engine inputs: tokens, attention mask, causal, positions + assert len(outputs.get("engine_inputs")) == 4 + tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") + # Assert proper shapes for all engine_inputs + assert tokens.shape[-1] == len(mock_tokens_multiple) + assert attention_mask.shape[-1] == seq_len + assert positions.shape[-1] == len(mock_tokens_multiple) + + +def test_multi_engine_preprocess_cant_operate( + text_generation_attributes, mock_kv_cache, mock_tokens +): + """ + Check if the multi-token engine preprocess operator can run based on the provided + tokens and prompt_sequence_length. + """ + seq_len, _ = text_generation_attributes + multi_prep = MultiEnginePrefill( + sequence_length=seq_len, prompt_sequence_length=len(mock_tokens) + 1 + ) + inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache} + # The prompt_sequence_length is one greater than the total number of tokens we're + # processing. Therefore, this operator should not run and can_operate() should be + # False. + assert not multi_prep.can_operate(inputs) From 0a50d1dee8a1abe32c4f1c40e27ab16589d32bc2 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Fri, 10 Nov 2023 09:11:00 -0500 Subject: [PATCH 22/43] [Pipeline Refactor] Unit Testing for Text Generation Operators (#1392) * unit testing for text generation operators * additional changes * unit testing completion * remove debug * fix * add todo * more clean-up * fix test * add docstrings/comments * break out tests to individual unit test files; add conftest and make scope of fixtures module to help with speed * fix name --- src/deepsparse/v2/routers/router.py | 1 + .../v2/text_generation/kv_cache_operator.py | 2 +- .../v2/text_generation/nl_engine_operator.py | 2 +- .../v2/text_generation/prep_for_generation.py | 52 +----- .../v2/text_generation/process_inputs.py | 10 +- .../v2/unit/text_generation/conftest.py | 173 ++++++++++++++++++ .../v2/unit/text_generation/test_kv_cache.py | 41 +++++ .../v2/unit/text_generation/test_misc.py | 31 ++++ .../text_generation/test_process_inputs.py | 47 +++++ .../test_single_token_engine.py | 98 ++++++++++ .../text_generation/test_token_generation.py | 92 ++++++++++ .../text_multi_token_engine.py | 63 +++++++ 12 files changed, 558 insertions(+), 54 deletions(-) create mode 100644 tests/deepsparse/v2/unit/text_generation/conftest.py create mode 100644 tests/deepsparse/v2/unit/text_generation/test_kv_cache.py create mode 100644 tests/deepsparse/v2/unit/text_generation/test_misc.py create mode 100644 tests/deepsparse/v2/unit/text_generation/test_process_inputs.py create mode 100644 tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py create mode 100644 tests/deepsparse/v2/unit/text_generation/test_token_generation.py create mode 100644 tests/deepsparse/v2/unit/text_generation/text_multi_token_engine.py diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py index 1b70164002..6b0d851aef 100644 --- 
a/src/deepsparse/v2/routers/router.py +++ b/src/deepsparse/v2/routers/router.py @@ -158,4 +158,5 @@ def next( @staticmethod def validate(ops) -> bool: + # TODO: still needs to be implemented for the GraphRouter pass diff --git a/src/deepsparse/v2/text_generation/kv_cache_operator.py b/src/deepsparse/v2/text_generation/kv_cache_operator.py index 0b232402b3..3c15d0ff5a 100644 --- a/src/deepsparse/v2/text_generation/kv_cache_operator.py +++ b/src/deepsparse/v2/text_generation/kv_cache_operator.py @@ -24,7 +24,7 @@ from deepsparse.v2.operators import Operator -__all__ = ["KVCacheCreator"] +__all__ = ["KVCacheCreator", "KVCacheCreatorInput"] class KVCacheCreatorOutput(BaseModel): diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index 0bd9098a40..7549f986d9 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -29,7 +29,7 @@ ) -__all__ = ["NLEngineOperator"] +__all__ = ["NLEngineOperator", "NlEngineInput"] class NlEngineInput(BaseModel): diff --git a/src/deepsparse/v2/text_generation/prep_for_generation.py b/src/deepsparse/v2/text_generation/prep_for_generation.py index 544af43980..0ea4a06a02 100644 --- a/src/deepsparse/v2/text_generation/prep_for_generation.py +++ b/src/deepsparse/v2/text_generation/prep_for_generation.py @@ -11,11 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import copy from typing import Any import numpy from deepsparse.transformers.pipelines.text_generation import FinishReason +from deepsparse.transformers.utils.helpers import set_generated_length from deepsparse.v2.operators import Operator from deepsparse.v2.text_generation import TokenGeneratorOperator from deepsparse.v2.utils import InferenceState @@ -31,9 +33,9 @@ def __init__( prompt_sequence_length: int, sequence_length: int, ): - self.prompt_sequence_length = prompt_sequence_length self.sequence_length = sequence_length self.token_generator_creator = token_generator + self.prompt_sequence_length = prompt_sequence_length def can_operate(self, inp: Any): kv_cache = inp.get("kv_cache") @@ -47,49 +49,6 @@ def can_operate(self, inp: Any): return True return False - @staticmethod - def set_generated_length( - max_length: int, - prompt_tokens_length: int, - sequence_length: int, - prompt_sequence_length: int, - max_new_tokens: int, - finish_reason_choices: "FinishReason", # noqa - ): - """ - Determine the length of the generated tokens. The hard cap on the total number - of tokens is based on the sequence length. If max_length is provided and is less - than the sequence length, it will be used to cap the total number of tokens - generated. If it is not provided, the max_new_tokens attribute will be used and - also capped by the sequence length. 
- - :param max_length: max_length attribute, provided as input during inference - :param prompt_tokens_length: the number of prompt tokens used as part of the - generated output - :param sequence_length: the sequence length used for the pipeline - :param prompt_sequence_length: the prompt sequence length used for the pipeline - :param max_new_tokens: the max_new_tokens attribute, which may be provided - as part of the input during inference - """ - if max_length: - # if max_length provided, use that to cap total tokens generated - max_tokens = max_length - finish_reason = finish_reason_choices.LENGTH - else: - # if not provided, max tokens is based on max_new_tokens + prompt tokens - max_tokens = ( - min(max_new_tokens, sequence_length - prompt_sequence_length) - + prompt_tokens_length - ) - finish_reason = finish_reason_choices.MAX_NEW_TOKENS - - # hard model/pipeline cap - return ( - (sequence_length, finish_reason_choices.CAPACITY) - if sequence_length < max_tokens - else (max_tokens, finish_reason) - ) - def run( self, tokens: Any, kv_cache: Any, inference_state: InferenceState, **kwargs ): @@ -107,13 +66,13 @@ def run( logits_shape=prompt_logits[0, -1, :].shape, deterministic=not generation_config.do_sample, sampling_temperature=generation_config.temperature, - tokens=tokens, + tokens=copy.copy(tokens), **inference_state.current_state, ) token_generator = token_generator_creator_output.get("token_generator") token_generator.generate(prompt_logits[0, -1, :]) - max_tokens, length_finish_reason = PrepareGeneration.set_generated_length( + max_tokens, length_finish_reason = set_generated_length( max_length=generation_config.max_length, prompt_tokens_length=1, max_new_tokens=generation_config.max_new_tokens, @@ -131,7 +90,6 @@ def run( "finished_reason": [], "token_generator": token_generator, } - output = { "tokens": token_generator.tokens, "kv_cache": kv_cache, diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py index 5d47c8ff39..214b8526e3 100644 --- a/src/deepsparse/v2/text_generation/process_inputs.py +++ b/src/deepsparse/v2/text_generation/process_inputs.py @@ -26,6 +26,9 @@ from deepsparse.v2.operators import Operator +__all__ = ["ProcessInputsTextGeneration", "GenerationDefaults"] + + class GenerationDefaults: num_return_sequences = 1 max_length = 100 @@ -38,9 +41,6 @@ class GenerationDefaults: temperature = 1.0 -__all__ = ["ProcessInputsTextGeneration"] - - class ProcessInputsTextGeneration(Operator): """ Input processing operator. Responsible for tokenizing the input, handling the @@ -54,10 +54,10 @@ class ProcessInputsTextGeneration(Operator): def __init__( self, tokenizer: transformers.PreTrainedTokenizerBase, + sequence_length: int, generation_config: Union[ str, pathlib.Path, Dict, transformers.GenerationConfig - ], - sequence_length: int, + ] = None, ): self.generation_config = generation_config self.tokenizer = tokenizer diff --git a/tests/deepsparse/v2/unit/text_generation/conftest.py b/tests/deepsparse/v2/unit/text_generation/conftest.py new file mode 100644 index 0000000000..5d8483e5f6 --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/conftest.py @@ -0,0 +1,173 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy + +import numpy +from transformers import AutoTokenizer + +import pytest +from deepsparse.transformers.helpers import get_deployment_path +from deepsparse.transformers.pipelines.text_generation import TextGenerationInput +from deepsparse.transformers.utils import DecoderKVCache +from deepsparse.transformers.utils.helpers import initialize_kv_cache_state +from deepsparse.v2 import InferenceState, PipelineState +from deepsparse.v2.text_generation import ( + GenerationDefaults, + NLEngineOperator, + TokenGeneratorOperator, +) + + +@pytest.fixture(scope="module") +def text_generation_attributes(): + sequence_length = 5 + prompt_sequence_length = 1 + return sequence_length, prompt_sequence_length + + +@pytest.fixture(scope="module") +def model_attributes(text_generation_attributes): + model_path = "hf:mgoin/TinyStories-1M-deepsparse" + sequence_length, _ = text_generation_attributes + deployment_path, model_path = get_deployment_path(model_path) + + tokenizer = AutoTokenizer.from_pretrained( + deployment_path, + trust_remote_code=False, + model_max_length=sequence_length, + ) + + tokenizer.padding_side = "left" + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + return tokenizer, model_path + + +@pytest.fixture(scope="module") +def single_token_engine_no_internal_cache(text_generation_attributes, model_attributes): + seq_length, _ = text_generation_attributes + _, model_path = model_attributes + + nl_engine_operator = NLEngineOperator( + sequence_length=seq_length, input_ids_length=1, model_path=model_path + ) + return nl_engine_operator + + +@pytest.fixture(scope="module") +def pipeline_state(single_token_engine_no_internal_cache): + pipeline_state = PipelineState() + pipeline_state_vals = {} + pipeline_state_vals[ + "onnx_input_names_no_cache" + ] = single_token_engine_no_internal_cache.onnx_input_names_no_cache + pipeline_state_vals[ + "cache_shape" + ] = single_token_engine_no_internal_cache.cache_shape + pipeline_state_vals[ + "output_names" + ] = single_token_engine_no_internal_cache.output_names + pipeline_state_vals[ + "kv_cache_data_type" + ] = single_token_engine_no_internal_cache.kv_cache_data_type + pipeline_state.create_state(pipeline_state_vals) + return pipeline_state + + +@pytest.fixture(scope="module") +def large_prompt(): + prompt = "Hello, how are you doing today?" 
+ generation_config = {"top_p": 0, "top_k": 0, "max_length": 10} + return TextGenerationInput(prompt=prompt, generation_config=generation_config) + + +@pytest.fixture(scope="module") +def small_prompt(): + prompt = "Hello" + return TextGenerationInput(prompt=prompt) + + +@pytest.fixture(scope="module") +def mock_kv_cache(): + kv_cache = DecoderKVCache() + kv_cache.setup( + state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])}, + ) + return kv_cache + + +@pytest.fixture(scope="module") +def mock_kv_cache_three_tokens_processed(): + kv_cache = DecoderKVCache() + kv_cache.setup( + state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])}, + num_processed_tokens=3, + ) + return kv_cache + + +@pytest.fixture(scope="module") +def mock_kv_cache_single_token_engine(pipeline_state, text_generation_attributes): + seq_len, prompt_seq_len = text_generation_attributes + kv_cache = DecoderKVCache() + kv_cache_state = initialize_kv_cache_state( + cache_shape=pipeline_state.current_state.get("cache_shape"), + kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"), + output_names=pipeline_state.current_state.get("output_names"), + length=seq_len - prompt_seq_len, + empty=False, + ) + kv_cache.setup(state=kv_cache_state) + return kv_cache + + +@pytest.fixture(scope="module") +def mock_tokens(): + return [15496] + + +@pytest.fixture(scope="module") +def mock_tokens_multiple(): + return [15496, 15496, 15496] + + +@pytest.fixture(scope="module") +def mock_inference_state(): + generation_config = GenerationDefaults() + inference_state = InferenceState() + inference_state.create_state({}) + inference_state.update_state({"generation_config": generation_config}) + return inference_state + + +@pytest.fixture(scope="module") +def mock_token_generator(model_attributes, mock_tokens_multiple): + tokenizer, _ = model_attributes + token_generator_creator = TokenGeneratorOperator() + prompt_logits = numpy.random.rand(1, len(mock_tokens_multiple), len(tokenizer)) + token_generator_creator_output = token_generator_creator.run( + logits_shape=prompt_logits[0, -1, :].shape, + deterministic=True, + sampling_temperature=1.0, + tokens=copy.copy(mock_tokens_multiple), + ) + return token_generator_creator_output.get("token_generator") + + +@pytest.fixture(scope="module") +def mock_logits(model_attributes): + tokenizer, _ = model_attributes + return numpy.random.rand(1, 1, len(tokenizer)) diff --git a/tests/deepsparse/v2/unit/text_generation/test_kv_cache.py b/tests/deepsparse/v2/unit/text_generation/test_kv_cache.py new file mode 100644 index 0000000000..0c6e42503a --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/test_kv_cache.py @@ -0,0 +1,41 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
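
For reference, a minimal sketch of an additional check these module-scoped fixtures could back (the test name is illustrative, and it assumes DecoderKVCache.setup() exposes the num_processed_tokens it was given as total_num_processed_tokens, the counter the generation operators later compare against the prompt length):

def test_kv_cache_fixture_bookkeeping(
    mock_kv_cache_three_tokens_processed, mock_tokens_multiple
):
    # the primed cache should report as many processed tokens as the
    # three-token mock prompt, which is the condition PrepareGeneration's
    # can_operate() checks before generation starts
    cache = mock_kv_cache_three_tokens_processed
    assert cache.total_num_processed_tokens == len(mock_tokens_multiple)
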
+ +from deepsparse.v2.text_generation import KVCacheCreator, KVCacheCreatorInput + + +def test_kv_cache_creation( + text_generation_attributes, model_attributes, pipeline_state +): + """ + Check if the KVCacheCreator successfully creates a kv_cache object, given the + single_token_engine attributes stored in the pipeline_state. + """ + seq_length, prompt_seq_len = text_generation_attributes + tokenizer, _ = model_attributes + kv_cache_creator = KVCacheCreator( + tokenizer=tokenizer, + prompt_sequence_length=prompt_seq_len, + sequence_length=seq_length, + internal_kv_cache=False, + ) + + assert kv_cache_creator.input_schema == KVCacheCreatorInput + kv_cache = kv_cache_creator.run( + cache_shape=pipeline_state.current_state.get("cache_shape"), + kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"), + output_names=pipeline_state.current_state.get("output_names"), + ) + assert kv_cache.get("kv_cache") + assert kv_cache.get("kv_cache").total_num_processed_tokens == 0 diff --git a/tests/deepsparse/v2/unit/text_generation/test_misc.py b/tests/deepsparse/v2/unit/text_generation/test_misc.py new file mode 100644 index 0000000000..caa0cc2efd --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/test_misc.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from deepsparse.v2.text_generation import CompilePromptLogits + + +def test_compile_logits(mock_logits, mock_inference_state): + mock_inference_state.update_state({"prompt_logits": [mock_logits]}) + compile_prompt_logits = CompilePromptLogits() + # Can operate as long as we're not in generation but in prompt_inference. This + # can_operate() will check for the `in_generation` flag in the input. + assert compile_prompt_logits.can_operate({}) + output, state = compile_prompt_logits.run( + logits=mock_logits, inference_state=mock_inference_state + ) + # The CompilePromptLogits is responsible for updating a list of prompt logits + # calculated at each step during prompt inference. After one step of running this + # operator, the total number of prompt_logits in the inference state should be + # the current length of prompt logits + 1 + assert len(state.get("prompt_logits")) == len([mock_logits]) + 1 diff --git a/tests/deepsparse/v2/unit/text_generation/test_process_inputs.py b/tests/deepsparse/v2/unit/text_generation/test_process_inputs.py new file mode 100644 index 0000000000..be59db7475 --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/test_process_inputs.py @@ -0,0 +1,47 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from deepsparse.v2.text_generation import ( + GenerationDefaults, + ProcessInputsTextGeneration, +) + + +def test_process_inputs( + text_generation_attributes, model_attributes, small_prompt, large_prompt +): + """ + Check if the ProcessInputsTextGeneration Operator successfully processes the + inputs and generation config. + """ + sequence_length, _ = text_generation_attributes + tokenizer, _ = model_attributes + process_inputs = ProcessInputsTextGeneration( + sequence_length=sequence_length, tokenizer=tokenizer + ) + + outputs, state_update = process_inputs.run(small_prompt) + assert len(outputs.get("input_ids")) == 1 + assert len(outputs.get("attention_mask")) == 1 + assert isinstance(state_update.get("generation_config"), GenerationDefaults) + assert state_update.get("prompts") == small_prompt.sequences + + outputs, state_update = process_inputs.run(large_prompt) + + assert not isinstance(state_update.get("generation_config"), GenerationDefaults) + assert state_update.get( + "generation_config" + ).max_length == large_prompt.generation_config.get("max_length") + assert outputs.get("input_ids") is not None + assert state_update.get("top_k") == large_prompt.generation_config.get("top_k") diff --git a/tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py b/tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py new file mode 100644 index 0000000000..335a28fbe3 --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py @@ -0,0 +1,98 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy + +from deepsparse.v2.text_generation import ( + AutoRegressiveOperatorPreprocess, + NlEngineInput, +) + + +def test_autoreg_preproces_can_run( + text_generation_attributes, pipeline_state, mock_tokens, mock_kv_cache +): + """ + Check if the single-token engine preprocess operator can run based on the provided + tokens and prompt_sequence_length. + """ + + seq_len, _ = text_generation_attributes + autoreg_prep = AutoRegressiveOperatorPreprocess( + sequence_length=seq_len, prompt_sequence_length=len(mock_tokens) + 1 + ) + inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache} + + # The prompt_sequence_length is greater than the number of tokens that are to be + # operated on. Therefore, use the single_token_engine and can_operate() should be + # True. 
+ assert autoreg_prep.can_operate(inputs) + outputs = autoreg_prep.run( + tokens=mock_tokens, kv_cache=mock_kv_cache, pipeline_state=pipeline_state + ) + # Assert 4 engine inputs: tokens, attention mask, causal, positions + assert len(outputs.get("engine_inputs")) == 4 + tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") + + assert tokens.shape[-1] == 1 + assert attention_mask.shape[-1] == seq_len + assert positions[0] == mock_kv_cache.total_num_processed_tokens + assert outputs.get("in_generation") is None + + +def test_autoreg_preproces_cant_run( + text_generation_attributes, mock_kv_cache, mock_tokens_multiple +): + """ + Check if the single-token engine preprocess operator can run based on the provided + tokens and prompt_sequence_length. + """ + + seq_len, _ = text_generation_attributes + autoreg_prep = AutoRegressiveOperatorPreprocess( + sequence_length=seq_len, prompt_sequence_length=len(mock_tokens_multiple) + ) + inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache} + # can_operate() should be False as the prompt_sequence_length is equal to the + # number of tokens we want to operate on. Therefore, the multi-token engine + # should run instead. + assert not autoreg_prep.can_operate(inputs) + + +def test_nl_single_token_engine_no_internal(single_token_engine_no_internal_cache): + assert single_token_engine_no_internal_cache.input_ids_length == 1 + + +def test_run_single_token_engine_once( + single_token_engine_no_internal_cache, + mock_kv_cache_single_token_engine, +): + """ + This operator runs through the single-token NLEngine once, given engine_inputs and + kv_cache. + """ + + mock_engine_inputs = [ + numpy.array([[15496]]), + numpy.array([[0, 0, 0, 0, 1]]), + numpy.array([[0]]), + numpy.array([[[[0, 0, 0, 0, 1]]]]), + ] + inputs = NlEngineInput( + engine_inputs=mock_engine_inputs, + kv_cache=mock_kv_cache_single_token_engine, + tokens=mock_engine_inputs[0].tolist(), + ) + output = single_token_engine_no_internal_cache.run(inputs) + assert output.get("logits") is not None diff --git a/tests/deepsparse/v2/unit/text_generation/test_token_generation.py b/tests/deepsparse/v2/unit/text_generation/test_token_generation.py new file mode 100644 index 0000000000..fbd9e06778 --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/test_token_generation.py @@ -0,0 +1,92 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy + +from deepsparse.v2.text_generation import ( + GenerateNewTokenOperator, + PrepareGeneration, + TokenGeneratorOperator, +) + + +def test_prep_for_generation( + text_generation_attributes, + model_attributes, + mock_tokens_multiple, + mock_kv_cache_three_tokens_processed, + mock_inference_state, +): + """ + This test will assess the PrepareGeneration, which runs after prompt_inference + and before generation. 
+ """ + seq_len, prompt_seq_len = text_generation_attributes + tokenizer, _ = model_attributes + prep_for_generation = PrepareGeneration( + prompt_sequence_length=prompt_seq_len, + token_generator=TokenGeneratorOperator(), + sequence_length=seq_len, + ) + inputs = { + "tokens": mock_tokens_multiple, + "kv_cache": mock_kv_cache_three_tokens_processed, + } + # can_operate() if the total number of prompt tokens is equal to the + # number of processed tokens stored in the kv_cache, indicating prompt inference is + # complete and generation can begin. + assert prep_for_generation.can_operate(inputs) + + prompt_logits = [numpy.random.rand(1, len(mock_tokens_multiple), len(tokenizer))] + mock_inference_state.update_state({"prompt_logits": prompt_logits}) + outputs, state = prep_for_generation.run( + tokens=mock_tokens_multiple, + kv_cache=mock_kv_cache_three_tokens_processed, + inference_state=mock_inference_state, + ) + assert len(outputs.get("tokens")) == len(mock_tokens_multiple) + 1 + assert outputs.get("in_generation") + assert numpy.array_equal( + state.get("generated_logits")[0], + numpy.expand_dims(prompt_logits[0][:, -1, :], 0), + ) + + +def test_generate_new_token( + model_attributes, + mock_token_generator, + mock_kv_cache, + mock_inference_state, + mock_logits, +): + """ + This test is responsible for testing the GenerateNewTokenOperator, which generates + one new token, given a token_generator (stored in the inference_state) and logits + from the engine. + """ + tokenizer, _ = model_attributes + generate_new_token = GenerateNewTokenOperator( + force_max_tokens=False, tokenizer=tokenizer + ) + mock_inference_state.update_state( + { + "token_generator": mock_token_generator, + "generated_tokens": [mock_token_generator.tokens], + } + ) + outputs, state = generate_new_token.run( + logits=mock_logits, kv_cache=mock_kv_cache, inference_state=mock_inference_state + ) + # The new_token generated/returned by ths operator should match the last token in + # token_generator + assert outputs.get("new_token") == state.get("token_generator").tokens[-1] diff --git a/tests/deepsparse/v2/unit/text_generation/text_multi_token_engine.py b/tests/deepsparse/v2/unit/text_generation/text_multi_token_engine.py new file mode 100644 index 0000000000..d2c822af4c --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/text_multi_token_engine.py @@ -0,0 +1,63 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from deepsparse.v2.text_generation import MultiEnginePrefill + + +def test_mult_engine_preprocess( + text_generation_attributes, pipeline_state, mock_kv_cache, mock_tokens_multiple +): + """ + Check if the multi-token engine preprocess operator can run based on the provided + tokens and prompt_sequence_length. 
+ """ + + seq_len, _ = text_generation_attributes + multi_prep = MultiEnginePrefill( + sequence_length=seq_len, prompt_sequence_length=len(mock_tokens_multiple) + ) + inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache} + # The number of tokens is equal to the prompt_sequence_length. + # Therefore, the multi_token_engine can run and can_operate() should be True. + assert multi_prep.can_operate(inputs) + outputs = multi_prep.run( + tokens=mock_tokens_multiple, + kv_cache=mock_kv_cache, + pipeline_state=pipeline_state, + ) + # Expect 4 engine inputs: tokens, attention mask, causal, positions + assert len(outputs.get("engine_inputs")) == 4 + tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") + # Assert proper shapes for all engine_inputs + assert tokens.shape[-1] == len(mock_tokens_multiple) + assert attention_mask.shape[-1] == seq_len + assert positions.shape[-1] == len(mock_tokens_multiple) + + +def test_multi_engine_preprocess_cant_operate( + text_generation_attributes, mock_kv_cache, mock_tokens +): + """ + Check if the multi-token engine preprocess operator can run based on the provided + tokens and prompt_sequence_length. + """ + seq_len, _ = text_generation_attributes + multi_prep = MultiEnginePrefill( + sequence_length=seq_len, prompt_sequence_length=len(mock_tokens) + 1 + ) + inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache} + # The prompt_sequence_length is one greater than the total number of tokens we're + # processing. Therefore, this operator should not run and can_operate() should be + # False. + assert not multi_prep.can_operate(inputs) From 4f248ddba0b6a2776ceaa5a7662251a8f8e59b4e Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Mon, 13 Nov 2023 18:24:10 +0100 Subject: [PATCH 23/43] Delete tests/deepsparse/v2/unit/text_generation/test_msic.py --- .../v2/unit/text_generation/test_msic.py | 31 ------------------- 1 file changed, 31 deletions(-) delete mode 100644 tests/deepsparse/v2/unit/text_generation/test_msic.py diff --git a/tests/deepsparse/v2/unit/text_generation/test_msic.py b/tests/deepsparse/v2/unit/text_generation/test_msic.py deleted file mode 100644 index caa0cc2efd..0000000000 --- a/tests/deepsparse/v2/unit/text_generation/test_msic.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from deepsparse.v2.text_generation import CompilePromptLogits - - -def test_compile_logits(mock_logits, mock_inference_state): - mock_inference_state.update_state({"prompt_logits": [mock_logits]}) - compile_prompt_logits = CompilePromptLogits() - # Can operate as long as we're not in generation but in prompt_inference. This - # can_operate() will check for the `in_generation` flag in the input. 
- assert compile_prompt_logits.can_operate({}) - output, state = compile_prompt_logits.run( - logits=mock_logits, inference_state=mock_inference_state - ) - # The CompilePromptLogits is responsible for updating a list of prompt logits - # calculated at each step during prompt inference. After one step of running this - # operator, the total number of prompt_logits in the inference state should be - # the current length of prompt logits + 1 - assert len(state.get("prompt_logits")) == len([mock_logits]) + 1 From 20980a769f2abe89f0247c5f7e58193286af9217 Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Mon, 13 Nov 2023 16:12:35 -0500 Subject: [PATCH 24/43] [Continuous Batching] Queue Implementation to support batching grouping and prioritization (#1373) * [Continuous Batching] Queue Implementation to support batching grouping and prioritization * has_key method * thread safety * add blocking option for pop_batch * update docstring * allow mutex to be shared across continuous batching objects * revert last commit --- src/deepsparse/v2/operators/__init__.py | 2 + .../v2/schedulers/utils/__init__.py | 18 ++ .../utils/continuous_batching_queues.py | 220 ++++++++++++++++++ tests/deepsparse/v2/schedulers/__init__.py | 13 ++ .../v2/schedulers/utils/__init__.py | 13 ++ .../utils/test_continuous_batching_queues.py | 177 ++++++++++++++ 6 files changed, 443 insertions(+) create mode 100644 src/deepsparse/v2/schedulers/utils/__init__.py create mode 100644 src/deepsparse/v2/schedulers/utils/continuous_batching_queues.py create mode 100644 tests/deepsparse/v2/schedulers/__init__.py create mode 100644 tests/deepsparse/v2/schedulers/utils/__init__.py create mode 100644 tests/deepsparse/v2/schedulers/utils/test_continuous_batching_queues.py diff --git a/src/deepsparse/v2/operators/__init__.py b/src/deepsparse/v2/operators/__init__.py index 9d1a9812ac..bf58018493 100644 --- a/src/deepsparse/v2/operators/__init__.py +++ b/src/deepsparse/v2/operators/__init__.py @@ -1,4 +1,5 @@ # flake8: noqa +# isort: skip_file # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # @@ -14,3 +15,4 @@ # See the License for the specific language governing permissions and # limitations under the License. from .operator import * +from .engine_operator import * diff --git a/src/deepsparse/v2/schedulers/utils/__init__.py b/src/deepsparse/v2/schedulers/utils/__init__.py new file mode 100644 index 0000000000..e2e25b1c90 --- /dev/null +++ b/src/deepsparse/v2/schedulers/utils/__init__.py @@ -0,0 +1,18 @@ +# flake8: noqa +# isort: skip_file + +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
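
As a quick illustration of the grouping rule this patch introduces, a minimal sketch (assuming the ContinuousBatchingQueue class lands as in the diff that follows): with valid batch sizes of 1 and 4, six queued items are dispatched as a batch of 4 and the remaining two wait for a later dispatch.

from deepsparse.v2.schedulers.utils import ContinuousBatchingQueue

queue = ContinuousBatchingQueue(batch_sizes=[1, 4])
for item in range(6):
    queue.put(item)

assert queue.has_batch()
assert queue.max_queued_batch_size() == 4  # largest valid size that can be filled
batch = queue.pop_batch()  # dequeues exactly 4 items
assert len(batch) == 4
assert queue.qsize() == 2
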
+ +from .continuous_batching_queues import * diff --git a/src/deepsparse/v2/schedulers/utils/continuous_batching_queues.py b/src/deepsparse/v2/schedulers/utils/continuous_batching_queues.py new file mode 100644 index 0000000000..84d4f38e3d --- /dev/null +++ b/src/deepsparse/v2/schedulers/utils/continuous_batching_queues.py @@ -0,0 +1,220 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from concurrent.futures import Future +from queue import Queue +from threading import Condition, Lock +from time import time +from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple + + +__all__ = [ + "ContinuousBatchingQueue", + "ContinuousBatchingQueues", + "QueueEntry", +] + + +# maximum wait time of longest item in queue before it is prioritized +_MAX_WAIT_MS = 100 + + +class QueueEntry(NamedTuple): + value: Any + future: Optional[Future] + entry_time_ms: float + + def time_elapsed(self) -> float: + return _current_time_ms() - self.entry_time_ms + + +class ContinuousBatchingQueue(Queue): + """ + Extension of queue.Queue with helper functions for dequeueing valid + batch sizes for continuous batching + + :param batch_sizes: valid batch sizes that can be grouped for continuous + batching + """ + + def __init__(self, batch_sizes: List[int], *args, **kwargs): + super().__init__(*args, **kwargs) + + self._batch_sizes = batch_sizes + self._min_batch_size = min(self.batch_sizes) + + @property + def batch_sizes(self) -> List[int]: + """ + :return: valid batch sizes that this queue can return + """ + return self._batch_sizes + + def pop_batch(self) -> List[Any]: + """ + :return: + """ + batch_size = self.max_queued_batch_size() + if batch_size == 0: + raise RuntimeError( + f"Cannot create a batch with {self.qsize()} entries and valid " + f"batch sizes: {self.batch_sizes}" + ) + + return [self.get() for _ in range(batch_size)] + + def has_batch(self) -> bool: + """ + :return: True if a batch of valid size can be filled with the current qsize + """ + return self.qsize() >= self._min_batch_size + + def max_queued_batch_size(self) -> int: + """ + :return: the maximum batch size that can be filled by members of this queue + """ + num_entries = self.qsize() + max_size = 0 + + for batch_size in self.batch_sizes: + if num_entries >= batch_size > max_size: + # current batch size can be satisfied and is the largest so far + max_size = batch_size + + return max_size + + def peek(self): + """ + :return: threadsafe peek of the first item in the queue + """ + with self.mutex: + return self.queue[0] + + +class ContinuousBatchingQueues: + """ + Threadsafe collection of Queues designed to support continuous batching. + Each Queue should be keyed by an operator where possible, however keys + are kept generic. + + On request for next - a job will be returned with an operator key and + a batch of inputs. 
The default heuristic for the next job will be + a combination of wait time and largest batch that can be run + """ + + def __init__(self): + self._queues = {} # Dict[Any, ContinuousBatchingQueue] + self._mutex = Lock() + + # add condition for wait/notify when an item is added to any queue + self._item_added = Condition(self._mutex) + + def __contains__(self, key: Any) -> bool: + """ + :param key: key to look up + :return: True if the given key has a queue in this group + """ + with self._mutex: + return key in self._queues + + def add_queue(self, key: Any, batch_sizes: List[int]): + """ + Adds a queue for a single operator that can be run at multiple batch sizes + + :param key: key to identify queue with, preferably the engine operator + :param batch_sizes: batch sizes that the operator can be run at + """ + with self._mutex: + self._queues[key] = ContinuousBatchingQueue(batch_sizes=batch_sizes) + + def add_queue_item(self, key: Any, item: Any, future: Optional[Future] = None): + """ + Adds an item to the given queue + + :param key: key for queue to add to + :param item: item to add in queue + :param future: optional future that should be used for resolution of value + """ + if key not in self: + raise KeyError(f"Cannot add item to queue for unregistered key {key}") + + entry = QueueEntry(value=item, future=future, entry_time_ms=_current_time_ms()) + + with self._mutex: + self._queues[key].put(entry) + self._item_added.notify() + + def has_next_batch(self) -> bool: + """ + :return: true if any Queue has enough entries to fill a valid batch size + """ + with self._mutex: + return any(queue.has_batch() for queue in self._queues.values()) + + def pop_batch( + self, + select_fn: Callable[[Dict[Any, ContinuousBatchingQueue]], Any] = None, + block: bool = True, + ) -> Tuple[Any, List[QueueEntry]]: + """ + :param select_fn: function that takes in a dictionary of queue key + (i.e. EngineOperator) to its ContinuousBatchingQueue of QueueItem + objects and returns the key of the queue that should be returned. + Only keys with queues large enough to fill a batch will be given. + If not provided, the default select_fn will return the queue that + can fill the largest batch size, or the queue that has the first item + with the longest wait time if that time is over 100ms. + :param block: if True, will wait for a valid batch to be in a queue before + popping and returning, if False, will raise an error if a full batch + cannot be popped. 
Default True + :return: Tuple of the queue key (EngineOperator) and + batch of QueueEntry objects as a list that have been popped and should + be run as a batch + """ + with self._mutex: + while not (valid_queues := self._filter_empty_queues()): + if block: + # wait to search for a valid queue again until a new item is added + self._item_added.wait() + else: + raise RuntimeError( + "Cannot pop_batch when no queues have enough items to fill " + "a valid batch size, check with has_next_batch before calling " + "pop_batch" + ) + + select_fn = select_fn or _default_select_fn + selected_key = select_fn(valid_queues) + + return selected_key, self._queues[selected_key].pop_batch() + + def _filter_empty_queues(self) -> Dict[Any, ContinuousBatchingQueue]: + return {key: queue for key, queue in self._queues.items() if queue.has_batch()} + + +def _default_select_fn(queues: Dict[Any, ContinuousBatchingQueue]) -> Any: + # find the maximum wait time of a queue + wait_times = [(key, queue.peek().time_elapsed()) for key, queue in queues.items()] + max_wait_key, max_wait = max(wait_times, key=lambda x: x[1]) # key on time + + if max_wait >= _MAX_WAIT_MS: + # if max time is greater than the threshold return that queue + return max_wait_key + + # default to the largest batch size that can be satisfied + return max(queues.keys(), key=lambda key: queues[key].max_queued_batch_size()) + + +def _current_time_ms(): + return time() * 1000 diff --git a/tests/deepsparse/v2/schedulers/__init__.py b/tests/deepsparse/v2/schedulers/__init__.py new file mode 100644 index 0000000000..0c44f887a4 --- /dev/null +++ b/tests/deepsparse/v2/schedulers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/deepsparse/v2/schedulers/utils/__init__.py b/tests/deepsparse/v2/schedulers/utils/__init__.py new file mode 100644 index 0000000000..0c44f887a4 --- /dev/null +++ b/tests/deepsparse/v2/schedulers/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/deepsparse/v2/schedulers/utils/test_continuous_batching_queues.py b/tests/deepsparse/v2/schedulers/utils/test_continuous_batching_queues.py new file mode 100644 index 0000000000..1713d54f82 --- /dev/null +++ b/tests/deepsparse/v2/schedulers/utils/test_continuous_batching_queues.py @@ -0,0 +1,177 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +from threading import Thread + +import pytest +from deepsparse.v2.schedulers.utils import ( + ContinuousBatchingQueue, + ContinuousBatchingQueues, + QueueEntry, +) + + +@pytest.mark.parametrize( + "batch_sizes,num_entries,expected_batch_size", + [ + ([1, 4, 8], 20, 8), + ([1, 4, 8], 6, 4), + ([1, 4, 8], 4, 4), + ([1, 4, 8], 3, 1), + ([4], 5, 4), + ], +) +def test_queue_single_pop(batch_sizes, num_entries, expected_batch_size): + queue = ContinuousBatchingQueue(batch_sizes=batch_sizes) + assert not queue.has_batch() + for i in range(num_entries): + queue.put(i) + + assert queue.has_batch() + assert queue.max_queued_batch_size() == expected_batch_size + + batch = queue.pop_batch() + assert len(batch) == expected_batch_size + assert batch == list(range(expected_batch_size)) + + +def test_queue_multi_pop(): + queue = ContinuousBatchingQueue(batch_sizes=[2, 4, 8]) + + for i in range(23): + if i < 2: + assert not queue.has_batch() + else: + assert queue.has_batch() + queue.put(i) + + def pop_and_assert_queue_size_and_pop(expected_qsize, expected_batch_size): + assert queue.qsize() == expected_qsize + assert queue.has_batch() + assert queue.max_queued_batch_size() == expected_batch_size + assert len(queue.pop_batch()) == expected_batch_size + + # pop items from queue, checkign remaining qsize and correct batch size is popped + pop_and_assert_queue_size_and_pop(23, 8) + pop_and_assert_queue_size_and_pop(15, 8) + pop_and_assert_queue_size_and_pop(7, 4) + pop_and_assert_queue_size_and_pop(3, 2) + + assert not queue.has_batch() + queue.put(23) + pop_and_assert_queue_size_and_pop(2, 2) + + assert queue.empty() + + +def test_queue_invalid_pop(): + queue = ContinuousBatchingQueue(batch_sizes=[4, 8]) + for i in range(3): + queue.put(i) + + with pytest.raises(RuntimeError): + # queue size 3, min batch size 4 + queue.pop_batch() + + +def test_queues_pop_batch_max_valid_batch(): + queues = ContinuousBatchingQueues() + + queues.add_queue("key_1", [2, 4]) + queues.add_queue("key_2", [3]) + + assert not queues.has_next_batch() + + queues.add_queue_item("key_1", 1) + queues.add_queue_item("key_1", 2) + assert queues.has_next_batch() + + queues.add_queue_item("key_2", 1) + queues.add_queue_item("key_2", 2) + queues.add_queue_item("key_2", 3) + # NOTE - if this block takes more than 100ms, test may fail + # as timeout may lead key_1 to be popped first + + # key_2 should be popped first because it has larger loaded batch size + first_popped_key, first_popped_batch = queues.pop_batch() + assert first_popped_key == "key_2" + assert len(first_popped_batch) == 3 + assert all(isinstance(item, QueueEntry) for item in first_popped_batch) + + assert queues.has_next_batch() + + second_popped_key, second_popped_batch = queues.pop_batch() + assert second_popped_key == "key_1" + assert len(second_popped_batch) == 2 + assert all(isinstance(item, QueueEntry) for item in second_popped_batch) + + +def test_queues_pop_batch_time_elapsed_priority(): + 
queues = ContinuousBatchingQueues() + + queues.add_queue("key_1", [2, 4]) + queues.add_queue("key_2", [3]) + + assert not queues.has_next_batch() + + queues.add_queue_item("key_1", 1) + queues.add_queue_item("key_1", 2) + assert queues.has_next_batch() + + # sleep 150ms (time threshold is 100ms) + time.sleep(0.15) + + queues.add_queue_item("key_2", 1) + queues.add_queue_item("key_2", 2) + queues.add_queue_item("key_2", 3) + + # key 1 should be popped first because its first item has been waiting longer + # than the time threshold and key_2 was just added + + popped_key, popped_batch = queues.pop_batch() + assert popped_key == "key_1" + assert len(popped_batch) == 2 + + +def test_queues_pop_batch_blocking(): + queues = ContinuousBatchingQueues() + queues.add_queue("key_1", [2]) + + def test_fn(): + # pop batch and block until true + key, batch = queues.pop_batch(block=True) + # compare to expected results + assert key == "key_1" + assert batch == [1, 2] + + # start a thread to pop batch + # it should hang indefinitely because block=True and there are no items yet in queue + thread = Thread(target=queues.pop_batch) + thread.start() + + # confirm thread is still running + assert thread.is_alive() + time.sleep(0.15) + # sleep and confirm thread is still hanging + assert thread.is_alive() + + # confirm thread still runs after a single insertion (min batch size is 2) + queues.add_queue_item("key_1", 1) + assert thread.is_alive() + + # add a second item and assert thread finishes + queues.add_queue_item("key_1", 2) + time.sleep(0.1) + assert not thread.is_alive() From d81012d0e942d10ce5462027f14d87dd1cdf77bf Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Mon, 13 Nov 2023 16:22:30 -0500 Subject: [PATCH 25/43] [Continuous Batching] Executor thread for running continuous batching (#1374) * [Continuous Batching] Executor thread for running continuous batching * quality * ensure that executor stops when main thread does - clean up test hack --- .../v2/operators/engine_operator.py | 32 +++++++ .../v2/schedulers/utils/__init__.py | 1 + .../utils/continuous_batching_executor.py | 79 ++++++++++++++++++ .../test_continuous_batching_executor.py | 83 +++++++++++++++++++ 4 files changed, 195 insertions(+) create mode 100644 src/deepsparse/v2/schedulers/utils/continuous_batching_executor.py create mode 100644 tests/deepsparse/v2/schedulers/utils/test_continuous_batching_executor.py diff --git a/src/deepsparse/v2/operators/engine_operator.py b/src/deepsparse/v2/operators/engine_operator.py index bd58aefafa..9ee8d734c5 100644 --- a/src/deepsparse/v2/operators/engine_operator.py +++ b/src/deepsparse/v2/operators/engine_operator.py @@ -39,6 +39,28 @@ class EngineOperatorInputs(BaseModel): default=None, ) + @classmethod + def join(cls, inputs: List["EngineOperatorInputs"]) -> "EngineOperatorInputs": + """ + :param inputs: list of separate EngineOperatorInputs, batch size must be 1 + :return: list of inputs joined into a single input with a multi batch size + """ + all_engine_inputs = [engine_input.engine_inputs for engine_input in inputs] + + for engine_inputs in all_engine_inputs: + if engine_inputs[0].shape[0] != 1: + raise RuntimeError( + "join requires all inputs to have batch size 1, found input with " + f"batch size {engine_inputs[0].shape[0]}" + ) + + # use join_engine_outputs since dtype is the same + joined_engine_inputs = join_engine_outputs( + all_engine_inputs, len(all_engine_inputs) + ) + + return cls(engine_inputs=joined_engine_inputs) + class Config: arbitrary_types_allowed = True @@ -46,6 
+68,16 @@ class Config: class EngineOperatorOutputs(BaseModel): engine_outputs: List = Field(description="engine outputs") + def split(self) -> List["EngineOperatorOutputs"]: + """ + :return: list of the current outputs split to a batch size of 1 each + """ + # using split_engine_inputs since input/output dtypes + # are the same (List[ndarray]) + split_outputs, _ = split_engine_inputs(self.engine_outputs, batch_size=1) + + return [self.__class__(engine_outputs=outputs) for outputs in split_outputs] + class EngineOperator(Operator): input_schema = EngineOperatorInputs diff --git a/src/deepsparse/v2/schedulers/utils/__init__.py b/src/deepsparse/v2/schedulers/utils/__init__.py index e2e25b1c90..521341a7fc 100644 --- a/src/deepsparse/v2/schedulers/utils/__init__.py +++ b/src/deepsparse/v2/schedulers/utils/__init__.py @@ -16,3 +16,4 @@ # limitations under the License. from .continuous_batching_queues import * +from .continuous_batching_executor import * diff --git a/src/deepsparse/v2/schedulers/utils/continuous_batching_executor.py b/src/deepsparse/v2/schedulers/utils/continuous_batching_executor.py new file mode 100644 index 0000000000..86afdf309c --- /dev/null +++ b/src/deepsparse/v2/schedulers/utils/continuous_batching_executor.py @@ -0,0 +1,79 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
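+# A minimal usage sketch, mirroring the unit test for this module
+# (`engine_operator` is assumed to be an EngineOperator compiled at
+# batch size 2 elsewhere):
+#
+#   queues = ContinuousBatchingQueues()
+#   queues.add_queue(engine_operator, batch_sizes=[2])
+#   worker = ContinuousBatchingExecutorThread(
+#       queues, {engine_operator: {2: engine_operator.engine}}
+#   )
+#   worker.start()  # daemon thread; pops batches, runs them, resolves futures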
+ +from threading import Thread +from typing import Dict + +from deepsparse import Engine +from deepsparse.v2.operators import EngineOperator +from deepsparse.v2.schedulers.utils.continuous_batching_queues import ( + ContinuousBatchingQueues, +) + + +__all__ = [ + "ContinuousBatchingExecutorThread", +] + + +class ContinuousBatchingExecutorThread(Thread): + """ + Thread that when started runs indefinitely, grabbing a valid batch from + the queues when possible and running them in the correct engine + + :param queues: ContinuousBatchingQueues object containing a queue for + each valid engine + :param operators_to_engines: dictionary mapping valid engine operators + to a dictionary of its valid batch sizes mapped to an engine compiled + for that batch size + """ + + def __init__( + self, + queues: ContinuousBatchingQueues, + operators_to_engines: Dict[EngineOperator, Dict[int, Engine]], + ): + self._queues = queues + self._operators_to_engines = operators_to_engines + self._should_stop = False + + super().__init__(target=self._working_loop) + self.daemon = True # worker thread should exit when main thread exits + + def _working_loop(self): + # indefinitely wait for batch, run batch, split and resolve futures + while True: + # wait for next batch to be available + engine_operator, batch = self._queues.pop_batch(block=True) + + # unpack batch of QueueEntry objects + engine_inputs, futures, _ = list(zip(*batch)) + batch_size = len(engine_inputs) + + # type is EngineOperatorInputs + joined_inputs = engine_operator.input_schema.join(engine_inputs) + + # get engine for this operator compiled to the popped batch size + # and set the inputs to execute with it + joined_inputs.engine = self._operators_to_engines[engine_operator][ + batch_size + ] + + # run the engine operator with the given engine at the joined batch size + joined_outputs = engine_operator(joined_inputs) + + # split outputs and return the results to their respective futures + split_outputs = joined_outputs.split() + for output, future in zip(split_outputs, futures): + future.set_result(output) diff --git a/tests/deepsparse/v2/schedulers/utils/test_continuous_batching_executor.py b/tests/deepsparse/v2/schedulers/utils/test_continuous_batching_executor.py new file mode 100644 index 0000000000..1d5ed9d92b --- /dev/null +++ b/tests/deepsparse/v2/schedulers/utils/test_continuous_batching_executor.py @@ -0,0 +1,83 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import time +from concurrent.futures import Future + +import numpy + +from deepsparse.v2.operators import EngineOperator +from deepsparse.v2.schedulers.utils import ( + ContinuousBatchingExecutorThread, + ContinuousBatchingQueues, +) + + +def test_continuous_batching_executor_thread(): + # mobilenet model with batch_size=2 + engine_operator = EngineOperator("zoo:mobilenet_v2-1.0-imagenet-base", batch_size=2) + + # create queues object and add operator + queues = ContinuousBatchingQueues() + queues.add_queue(engine_operator, batch_sizes=[2]) + + # create engine map + operators_to_engines = {engine_operator: {2: engine_operator.engine}} + + worker_thread = ContinuousBatchingExecutorThread(queues, operators_to_engines) + + # thread not started yet + assert not worker_thread.is_alive() + + # start and assert thread is alive + worker_thread.start() + assert worker_thread.is_alive() + + # create first input and add it to queue + input_1 = engine_operator.input_schema( + engine_inputs=[numpy.random.randn(1, 3, 224, 224).astype(numpy.float32)] + ) + future_1 = Future() + queues.add_queue_item(engine_operator, input_1, future=future_1) + + # assert that future is not yet resolved + assert not future_1.done() + + # create second input and add it to queue + input_2 = engine_operator.input_schema( + engine_inputs=[numpy.random.randn(1, 3, 224, 224).astype(numpy.float32)] + ) + future_2 = Future() + queues.add_queue_item(engine_operator, input_2, future=future_2) + + # wait 1 second to give engine time to complete + time.sleep(1) + + assert future_1.done() + assert future_2.done() + + result_1 = future_1.result() + result_2 = future_2.result() + + assert isinstance(result_1, engine_operator.output_schema) + assert isinstance(result_2, engine_operator.output_schema) + + def assert_batch_size_one(arrays): + for array in arrays: + assert array.shape[0] == 1 + + # make sure only a single batch item was returned to each future + # TODO: test that the correct bs1 item is returned (can test against bs1 engine) + assert_batch_size_one(result_1.engine_outputs) + assert_batch_size_one(result_2.engine_outputs) From 5c48505eacc1ef49635d1b0c865aa5c10f768381 Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Mon, 13 Nov 2023 16:24:17 -0500 Subject: [PATCH 26/43] [ContinuousBatching] ContinuousBatchingScheduler Implementation (#1375) * [ContinuousBatching] ContinuousBatchingScheduler Implementation * cleanup unnecessary stop condition --- src/deepsparse/v2/schedulers/__init__.py | 2 + .../continuous_batching_scheduler.py | 141 ++++++++++++++++++ .../test_continuous_batching_scheduler.py | 48 ++++++ 3 files changed, 191 insertions(+) create mode 100644 src/deepsparse/v2/schedulers/continuous_batching_scheduler.py create mode 100644 tests/deepsparse/v2/schedulers/test_continuous_batching_scheduler.py diff --git a/src/deepsparse/v2/schedulers/__init__.py b/src/deepsparse/v2/schedulers/__init__.py index 04c37077e1..b4d78521ab 100644 --- a/src/deepsparse/v2/schedulers/__init__.py +++ b/src/deepsparse/v2/schedulers/__init__.py @@ -1,4 +1,5 @@ # flake8: noqa +# isort: skip_file # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
 #
@@ -16,3 +17,4 @@
 from .scheduler import *
 from .scheduler_group import *
+from .continuous_batching_scheduler import *
diff --git a/src/deepsparse/v2/schedulers/continuous_batching_scheduler.py b/src/deepsparse/v2/schedulers/continuous_batching_scheduler.py
new file mode 100644
index 0000000000..96e0a502b6
--- /dev/null
+++ b/src/deepsparse/v2/schedulers/continuous_batching_scheduler.py
@@ -0,0 +1,141 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from concurrent.futures import Future
+from threading import Lock
+from typing import List
+
+from deepsparse.v2.operators import EngineOperator, Operator
+from deepsparse.v2.schedulers.scheduler import OperatorScheduler
+from deepsparse.v2.schedulers.utils import (
+    ContinuousBatchingExecutorThread,
+    ContinuousBatchingQueues,
+)
+
+
+__all__ = ["ContinuousBatchingScheduler"]
+
+
+class ContinuousBatchingScheduler(OperatorScheduler):
+    """
+    Manages EngineOperator jobs that should be run with continuous batching.
+    Groups requests for the same engine into larger batches and returns
+    the result to the respective request threads after scheduled completion
+
+    :param max_workers: maximum number of threads to execute at once, default 1
+    """
+
+    def __init__(self, max_workers: int = 1):
+        self._max_workers = max_workers
+
+        self._mutex = Lock()
+
+        # Dict[EngineOperator, Dict[batch_size, Engine]]
+        self._operators_to_engines = {}  # EngineOperator -> Dict[batch_size, Engine]
+        self._queues = ContinuousBatchingQueues()
+
+        # create and start max number of worker threads
+        self._threads = [
+            ContinuousBatchingExecutorThread(self._queues, self._operators_to_engines)
+            for _ in range(self.max_workers)
+        ]
+        for worker_thread in self._threads:
+            worker_thread.start()
+
+    @property
+    def max_workers(self) -> int:
+        """
+        :return: maximum number of threads to execute at once
+        """
+        return self._max_workers
+
+    def submit(self, *args, operator: Operator, **kwargs) -> Future:
+        """
+        :param operator: operator to run
+        :param operator_input: input schema to the operator
+        :return: future referencing the asynchronously run output of the operator
+        """
+        inputs = args[0]
+        if not isinstance(inputs, operator.input_schema):
+            raise ValueError(
+                "Inputs to ContinuousBatchingScheduler must be the specific "
+                f"input schema to the given operator. Expected {operator.input_schema}, "
+                f"found {type(inputs)}"
+            )
+
+        future = Future()
+        self._queues.add_queue_item(key=operator, item=inputs, future=future)
+
+        return future
+
+    def can_process(self, *args, operator: Operator, **kwargs) -> bool:
+        """
+        :param operator: operator to check
+        :param operator_input: operator_input to check
+        :return: True if this Operator can process the given operator and input.
+            Only operators registered via add_engine_operator are supported
+        """
+        return operator in self._operators_to_engines and operator in self._queues
+
+    def add_engine_operator(
+        self, engine_operator: EngineOperator, batch_sizes: List[int]
+    ):
+        """
+        Adds tracking for an engine operator to this scheduler
+        with continuous batching for the given sizes
+
+        :param engine_operator: an EngineOperator, must be compiled with
+            batch_size=1
+        :param batch_sizes: batch sizes to use for continuous batching
+        """
+        # lock updates to _operators_to_engines while updating
+        self._mutex.acquire()
+
+        # validation
+        if engine_operator in self._operators_to_engines:
+            # operator already added
+            return
+
+        if not isinstance(engine_operator, EngineOperator):
+            raise ValueError(
+                f"Expected an EngineOperator instance, found {type(engine_operator)}"
+            )
+        if engine_operator.batch_size != 1:
+            raise ValueError(
+                "For continuous batching, EngineOperator must have batch_size=1. "
+                f"found batch_size={engine_operator.batch_size}"
+            )
+
+        # build EngineOperator -> List[batch_size] dict
+        operator_engines = {}
+        # base engine, expected batch size is 1
+        operator_engines[engine_operator.batch_size] = engine_operator.engine
+
+        # compile auxiliary engines for continuous batching
+        # (assumes the engine operator can compile an engine for a given batch size)
+        for batch_size in batch_sizes:
+            if batch_size == 1:
+                continue  # already added
+            operator_engines[batch_size] = engine_operator.create_engine(
+                batch_size=batch_size
+            )
+
+        self._operators_to_engines[engine_operator] = operator_engines
+        self._queues.add_queue(
+            key=engine_operator,
+            batch_sizes=list(operator_engines.keys()),
+        )
+
+        # release lock
+        self._mutex.release()
diff --git a/tests/deepsparse/v2/schedulers/test_continuous_batching_scheduler.py b/tests/deepsparse/v2/schedulers/test_continuous_batching_scheduler.py
new file mode 100644
index 0000000000..7ed49de004
--- /dev/null
+++ b/tests/deepsparse/v2/schedulers/test_continuous_batching_scheduler.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from concurrent.futures import Future + +import numpy + +from deepsparse.v2.operators import EngineOperator +from deepsparse.v2.schedulers import ContinuousBatchingScheduler + + +def test_continuous_batching_executor_thread(): + # simple test that ContinuousBatchingScheduler can be instantiated and return + # a result from a request, for testing multi-batch execution, making enough + # concurrent requests guarantee batched execution is out of scope + scheduler = ContinuousBatchingScheduler() + + # mobilenet model with batch_size=2 + engine_operator = EngineOperator( + "zoo:mobilenet_v2-1.0-imagenet-base", + batch_size=1, + ) + + scheduler.add_engine_operator(engine_operator, [1]) + + # submit job to scheduler and expect future to be returned + engine_input = engine_operator.input_schema( + engine_inputs=[numpy.random.randn(1, 3, 224, 224).astype(numpy.float32)] + ) + future = scheduler.submit(engine_input, operator=engine_operator) + assert isinstance(future, Future) + assert not future.done() # assume this runs before engine has a chance to complete + + # assert that output resolves and contains a numpy array + engine_output = future.result() + assert isinstance(engine_output, engine_operator.output_schema) + assert isinstance(engine_output.engine_outputs[0], numpy.ndarray) From e1b7f3703fc91429f20fd7b79e06487898f4fa6e Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Mon, 13 Nov 2023 16:27:26 -0500 Subject: [PATCH 27/43] [continuous batching] singleton pattern for scheduler (#1391) * [continuous batching] singleton pattern for scheduler * catch from review --- .../continuous_batching_scheduler.py | 36 ++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/src/deepsparse/v2/schedulers/continuous_batching_scheduler.py b/src/deepsparse/v2/schedulers/continuous_batching_scheduler.py index 96e0a502b6..669c5922a0 100644 --- a/src/deepsparse/v2/schedulers/continuous_batching_scheduler.py +++ b/src/deepsparse/v2/schedulers/continuous_batching_scheduler.py @@ -28,11 +28,32 @@ __all__ = ["ContinuousBatchingScheduler"] +_GLOBAL_SCHEDULER = None + + class ContinuousBatchingScheduler(OperatorScheduler): """ Manages EngineOperator jobs that should be run with continuous batching. Groups requests for the same engine into larger batches and returns - the result to the respeictive request threads after scheduled completion + the result to the respective request threads after scheduled completion + + Example code for getting or creating a shared instance for scheduling + between pipelines and adding an engine operator to the scheduler + within a pipeline + + ```python + + class MyPipeline(Pipeline): + + def __init__(self): + ... + engine_operator = EngineOperator(...) + ... + continuous_batching_scheduler = ContinuousBatchingScheduler.get_instance() + continuous_batching_scheduler.add_engine_operator(engine_operator) + + super.__init__(...) + ``` :param max_workers: maximum number of threads to execute at once, default 1 """ @@ -54,6 +75,19 @@ def __init__(self, max_workers: int = 1): for worker_thread in self._threads: worker_thread.start() + @classmethod + def get_instance(cls) -> "ContinuousBatchingScheduler": + """ + :return: global instance of the continuous batching scheduler. 
If one + does not exist yet, a scheduler with a single worker thread to + schedule all jobs is created and started + """ + if _GLOBAL_SCHEDULER is not None: + return _GLOBAL_SCHEDULER # noqa: F823 + + _GLOBAL_SCHEDULER = cls(max_workers=1) + return _GLOBAL_SCHEDULER + @property def max_workers(self) -> int: """ From bbd534da76610a8bc0d6d55352cf0fb65737985e Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Tue, 14 Nov 2023 10:47:40 +0100 Subject: [PATCH 28/43] [Pipeline Refactor][Text-Generation] Create a helper function for creating engine_inputs (#1364) * rebasing off my initial commit * cleanups * unit testing for text generation operators * additional changes * unit testing completion * remove debug * fix * add todo * more clean-up * fix test * add docstrings/comments * break out tests to individual unit test files; add conftest and make scope of fixtures module to help with speed * Delete tests/deepsparse/v2/unit/text_generation/test_msic.py --------- Co-authored-by: Dipika Sikka --- src/deepsparse/transformers/utils/helpers.py | 92 ++++++++++++++++++- .../autoregressive_preprocess_operator.py | 34 ++----- .../multi_engine_prefill_operator.py | 81 +++------------- .../transformers/utils/test_helpers.py | 74 +++++++++++++++ 4 files changed, 185 insertions(+), 96 deletions(-) diff --git a/src/deepsparse/transformers/utils/helpers.py b/src/deepsparse/transformers/utils/helpers.py index 38e3ec4a4c..648bdef9cf 100644 --- a/src/deepsparse/transformers/utils/helpers.py +++ b/src/deepsparse/transformers/utils/helpers.py @@ -14,7 +14,7 @@ import logging import pathlib import uuid -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy from transformers import AutoTokenizer, GenerationConfig @@ -33,6 +33,7 @@ "override_config", "process_generation_config", "validate_session_ids", + "compute_engine_inputs", "set_generated_length", ] @@ -82,6 +83,95 @@ def set_generated_length( ) +def compute_engine_inputs(onnx_input_names: str, **kwargs) -> List[numpy.ndarray]: + """ + Given the names of the onnx inputs, compute the inputs + to the engine. The inputs will be calculating from the + passed kwargs. The information about the required kwargs + can be found in the docstring of the individual compute + functions. 
+ + :param onnx_input_names: The names of the onnx inputs + :param kwargs: The kwargs to compute the inputs from + :return: The computed inputs to the engine + """ + engine_inputs = [] + for input_name in onnx_input_names: + if input_name == "causal_mask": + # delay the computation of the causal mask + continue + # fetch the compute function for the + # given input_name + compute_func = _get_compute_func(input_name) + # compute the engine input from the kwargs + # and append it to the engine_inputs + engine_inputs.append(compute_func(**kwargs)) + + if "causal_mask" in onnx_input_names: + # compute the causal mask and append it to the engine_inputs + input_ids, attention_mask, *_ = engine_inputs + engine_inputs.append(create_causal_mask(input_ids, attention_mask)) + + return engine_inputs + + +def _get_compute_func(input_name: str) -> Callable[..., numpy.ndarray]: + # given the input_name, return the appropriate compute function + compute_func = { + "input_ids": _compute_input_ids, + "attention_mask": _compute_attention_mask, + "positions": _compute_positions, + }.get(input_name) + if compute_func is None: + raise ValueError( + "Could not find compute function " f"for the input_name: {input_name}" + ) + return compute_func + + +def _compute_input_ids(token_batch: List[int], **kwargs) -> numpy.ndarray: + # convert the token_batch to a numpy array + return numpy.array([token_batch]) + + +def _compute_attention_mask( + sequence_length: int, + prompt_sequence_length: int, + num_total_processed_tokens: int, + **kwargs, +) -> numpy.ndarray: + # create a fully masked attention mask with the appropriate + # shape (equal to the sequence_length) + attention_mask = numpy.zeros((1, sequence_length), dtype=numpy.int64) + # unmask the appropriate number of tokens, the sum of + # - the number of tokens already processed and cached (num_total_processed_tokens) + # - the number of tokens currently processed (prompt_sequence_length) + # the sum cannot exceed the maximum length of the attention_mask + num_attention_entries_to_unmask = min( + num_total_processed_tokens + prompt_sequence_length, sequence_length + ) + # unmask the bits from the right-hand side + attention_mask[:, -num_attention_entries_to_unmask:] = 1 + return attention_mask + + +def _compute_positions( + num_total_processed_tokens: int, prompt_sequence_length: int, **kwargs +): + # create the positions array with the appropriate shape + # positions count starts from the number of tokens already processed + # and ends at the number of tokens already processed + the number of tokens + # currently processed + return ( + numpy.arange( + num_total_processed_tokens, + num_total_processed_tokens + prompt_sequence_length, + ) + .reshape(1, -1) + .astype(numpy.int64) + ) + + def validate_session_ids( session_ids: Optional[str], other_attributes: Dict[str, Any] ) -> Optional[List[str]]: diff --git a/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py b/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py index 6e97412e43..17d8dd662c 100644 --- a/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py +++ b/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py @@ -15,9 +15,7 @@ import logging from typing import Any -import numpy - -from deepsparse.transformers.utils.helpers import create_causal_mask +from deepsparse.transformers.utils.helpers import compute_engine_inputs from deepsparse.v2.operators import Operator from deepsparse.v2.utils import PipelineState @@ -66,30 +64,16 @@ def 
run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwarg num_total_processed_tokens = kv_cache.total_num_processed_tokens new_token = tokens[num_total_processed_tokens] - engine_input_names = pipeline_state.current_state.get( - "onnx_input_names_no_cache" - ) - - # padding is added to left, so attention mask is 1s from the - # right up to the number of total tokens (prompt + generated) - attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) - num_attention_entries_to_unmask = min( - num_total_processed_tokens + 1, self.sequence_length - ) # cap by seq len - attention_mask[:, -num_attention_entries_to_unmask:] = 1 - positions = numpy.array([[num_total_processed_tokens]], dtype=numpy.int64) - input_ids = numpy.array([[new_token]]) - causal_mask = create_causal_mask(input_ids, attention_mask) - engine_inputs_map = dict( - input_ids=input_ids, - attention_mask=attention_mask, - causal_mask=causal_mask, - positions=positions, + engine_inputs = compute_engine_inputs( + onnx_input_names=pipeline_state.current_state.get( + "onnx_input_names_no_cache" + ), + token_batch=[new_token], + prompt_sequence_length=1, + sequence_length=self.sequence_length, + num_total_processed_tokens=num_total_processed_tokens, ) - - engine_inputs = [engine_inputs_map[name] for name in engine_input_names] - return { "engine_inputs": engine_inputs, "kv_cache": kv_cache, diff --git a/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py b/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py index 9a885c2355..513c34dfc2 100644 --- a/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py +++ b/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py @@ -13,12 +13,9 @@ # limitations under the License. import logging -from enum import Enum from typing import Any -import numpy - -from deepsparse.transformers.utils.helpers import create_causal_mask +from deepsparse.transformers.utils.helpers import compute_engine_inputs from deepsparse.v2.operators import Operator from deepsparse.v2.utils import PipelineState @@ -28,34 +25,14 @@ __all__ = ["MultiEnginePrefill"] -class OnnxInputNames(Enum): - INPUT_IDS = "input_ids" - ATTN_MASK = "attention_mask" - CAUSAL_MASK = "causal_mask" - POSITIONS = "positions" - - -# NOTE: A possible clean-up could involve combining this Operator and the -# autoregressive_preprocess_operator - - class MultiEnginePrefill(Operator): def __init__(self, prompt_sequence_length, sequence_length): """ Prepare the tokens for the multi-token engine. This requires creating the - attention mask, positions, and causal mask. The output contains these three - arrays to be passed into the multi-token engine. + appropriate engine_inputsto be passed into the multi-token engine. """ self.prompt_sequence_length = prompt_sequence_length self.sequence_length = sequence_length - self.cases = { - OnnxInputNames.ATTN_MASK.value: self._case_attn_mask, - OnnxInputNames.POSITIONS.value: self._case_positions, - } - _LOGGER.warn( - "This operator requires the PipelineState to be set-up with the " - "onnx_input_names_no_cache attribute set from the NLEngineOperator." 
- ) def can_operate(self, inp: Any): """ @@ -75,59 +52,23 @@ def can_operate(self, inp: Any): return True return False - def _case_attn_mask(self, num_total_processed_tokens: int): - # create an empty attention mask - engine_input = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) - # calculate the number of entries in attention mask that should be set to 1 - num_attention_entries_to_unmask = min( - num_total_processed_tokens + self.prompt_sequence_length, - self.sequence_length, - ) - engine_input[:, -num_attention_entries_to_unmask:] = 1 - return engine_input - - def _case_positions(self, num_total_processed_tokens: int): - return ( - numpy.arange( - num_total_processed_tokens, - num_total_processed_tokens + self.prompt_sequence_length, - ) - .reshape(1, -1) - .astype(numpy.int64) - ) - def run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwargs): kv_cache.set_capacity(self.sequence_length - self.prompt_sequence_length) - onnx_input_names_no_cache = pipeline_state.current_state.get( - "onnx_input_names_no_cache" - ) - num_total_processed_tokens = kv_cache.total_num_processed_tokens start = num_total_processed_tokens end = start + self.prompt_sequence_length token_batch = tokens[start:end] - engine_inputs = [] - for name in onnx_input_names_no_cache: - if name == OnnxInputNames.INPUT_IDS.value: - engine_input = numpy.array([token_batch]) - elif ( - name == OnnxInputNames.ATTN_MASK.value - or name == OnnxInputNames.POSITIONS.value - ): - engine_input = self.cases[name](num_total_processed_tokens) - elif name == OnnxInputNames.CAUSAL_MASK.value: - continue - - engine_inputs.append(engine_input) - - if OnnxInputNames.CAUSAL_MASK.value in onnx_input_names_no_cache: - causal_mask = create_causal_mask( - input_ids=engine_inputs[0], - attention_mask=engine_inputs[1], - ) - engine_inputs.append(causal_mask) + engine_inputs = compute_engine_inputs( + onnx_input_names=pipeline_state.current_state.get( + "onnx_input_names_no_cache" + ), + token_batch=token_batch, + prompt_sequence_length=self.prompt_sequence_length, + sequence_length=self.sequence_length, + num_total_processed_tokens=num_total_processed_tokens, + ) return { "engine_inputs": engine_inputs, diff --git a/tests/deepsparse/transformers/utils/test_helpers.py b/tests/deepsparse/transformers/utils/test_helpers.py index 7fcadcbf9c..95e4ee7fa7 100644 --- a/tests/deepsparse/transformers/utils/test_helpers.py +++ b/tests/deepsparse/transformers/utils/test_helpers.py @@ -16,12 +16,86 @@ import pytest from deepsparse.transformers.utils.helpers import ( + compute_engine_inputs, create_causal_mask, initialize_kv_cache_state, validate_session_ids, ) +@pytest.mark.parametrize( + "onnx_input_names, " + "token_batch, " + "prompt_sequence_length, " + "sequence_length, " + "num_total_processed_tokens, " + "expected_engine_inputs", + [ + ( + ["input_ids", "attention_mask", "positions"], + [1, 2, 3], + 3, + 6, + 2, + [ + numpy.array([[1, 2, 3]]), + numpy.array([[0, 1, 1, 1, 1, 1]]), + numpy.array([[2, 3, 4]]), + ], + ), + ( + ["input_ids", "attention_mask", "positions", "causal_mask"], + [1, 2, 3], + 3, + 6, + 2, + [ + numpy.array([[1, 2, 3]]), + numpy.array([[0, 1, 1, 1, 1, 1]]), + numpy.array([[2, 3, 4]]), + create_causal_mask( + input_ids=numpy.array([[1, 2, 3]]), + attention_mask=numpy.array([[0, 1, 1, 1, 1, 1]]), + ), + ], + ), + ( + ["input_ids", "attention_mask", "positions", "causal_mask"], + [15], + 1, + 5, + 3, + [ + numpy.array([[15]]), + numpy.array([[0, 1, 1, 1, 1]]), + numpy.array([[3]]), + create_causal_mask( + 
input_ids=numpy.array([[15]]), + attention_mask=numpy.array([[0, 1, 1, 1, 1]]), + ), + ], + ), + ], +) +def test_compute_engine_inputs( + onnx_input_names, + token_batch, + prompt_sequence_length, + sequence_length, + num_total_processed_tokens, + expected_engine_inputs, +): + engine_inputs = compute_engine_inputs( + onnx_input_names=onnx_input_names, + token_batch=token_batch, + prompt_sequence_length=prompt_sequence_length, + sequence_length=sequence_length, + num_total_processed_tokens=num_total_processed_tokens, + ) + for x, y in zip(engine_inputs, expected_engine_inputs): + assert numpy.array_equal(x, y) + + @pytest.mark.parametrize( "input_ids, attention_mask, expected_causal_mask", [ From 51c4ee68523978aa84eb66f39925bd24bdf6a617 Mon Sep 17 00:00:00 2001 From: Damian Date: Fri, 17 Nov 2023 14:52:48 +0000 Subject: [PATCH 29/43] pipeline runs, but incorrectly --- .../transformers/utils/token_generator.py | 10 +-- .../v2/text_generation/join_output.py | 3 + .../v2/text_generation/nl_engine_operator.py | 45 +++++++++- src/deepsparse/v2/text_generation/pipeline.py | 85 +++++++++++++++++-- .../v2/text_generation/prep_for_generation.py | 1 + .../v2/unit/text_generation/conftest.py | 4 +- tests/testdata/gsm8k-v0-greedy_until | 1 + tests/testdata/gsm8k-v0-res.json | 1 + 8 files changed, 135 insertions(+), 15 deletions(-) create mode 100644 tests/testdata/gsm8k-v0-greedy_until create mode 100644 tests/testdata/gsm8k-v0-res.json diff --git a/src/deepsparse/transformers/utils/token_generator.py b/src/deepsparse/transformers/utils/token_generator.py index 5fa82b7bc4..76f922de11 100644 --- a/src/deepsparse/transformers/utils/token_generator.py +++ b/src/deepsparse/transformers/utils/token_generator.py @@ -77,16 +77,16 @@ def generate(self, logits: numpy.ndarray) -> numpy.ndarray: :param logits: the logits from the model with shape (vocab_size,) :return: the sampled token """ - if self.top_k: - logits = self.apply_top_k(logits) - if self.top_p: - logits = self.apply_top_p(logits) - if self.deterministic: token = numpy.argmax(logits) self.tokens.append(token) return token + if self.top_k: + logits = self.apply_top_k(logits) + if self.top_p: + logits = self.apply_top_p(logits) + if self.sampling_temperature != 1.0: logits /= self.sampling_temperature diff --git a/src/deepsparse/v2/text_generation/join_output.py b/src/deepsparse/v2/text_generation/join_output.py index 8a6c77a2f1..29c086d713 100644 --- a/src/deepsparse/v2/text_generation/join_output.py +++ b/src/deepsparse/v2/text_generation/join_output.py @@ -33,6 +33,9 @@ def __init__(self, tokenizer): self.tokenizer = tokenizer def run(self, inp: List[CompileGenerationsOutput], **kwargs): + + if not isinstance(inp, list): + inp = [[inp]] batch_outputs = [x for x in inp[0]] generated_tokens = [x.generated_tokens for x in batch_outputs] generated_logits = [x.generated_logits for x in batch_outputs] diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index 7549f986d9..9c33cb1f93 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -18,6 +18,7 @@ from pydantic import BaseModel, Field +from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs from deepsparse.utils.onnx import ( CACHE_INPUT_PREFIX, overwrite_onnx_model_inputs_for_kv_cache_models, @@ -29,7 +30,12 @@ ) -__all__ = ["NLEngineOperator", "NlEngineInput"] +__all__ = [ + "NlEngineOperator", + 
"NlEngineOperatorNoCache", + "NlEngineInputNoCache", + "NlEngineInput", +] class NlEngineInput(BaseModel): @@ -39,7 +45,12 @@ class NlEngineInput(BaseModel): in_generation: bool = Field(description="in_generation", default=None) -class NLEngineOperator(EngineOperator): +class NlEngineInputNoCache(BaseModel): + input_ids: Any + attention_mask: Any + + +class NlEngineOperator(EngineOperator): """ Operator for the NL Decoder Engine. This Operator inherits from the EngineOperator. @@ -195,3 +206,33 @@ def output_names(self) -> List[str]: :return: The output names for the onnx model """ return self.engine.output_names + + +class NlEngineOperatorNoCache(EngineOperator): + + input_schema = NlEngineInputNoCache + output_schema = None + + def __init__(self, sequence_length, **kwargs): + model_path, *_ = overwrite_transformer_onnx_model_inputs( + path=kwargs.get("model_path"), + max_length=sequence_length, + batch_size=kwargs.get("batch_size", 1), + ) + super().__init__(**kwargs) + + def run(self, inp: NlEngineInputNoCache, **kwargs) -> Any: + engine_inputs = [inp.input_ids, inp.attention_mask] + logits = ( + super() + .run(EngineOperatorInputs(engine_inputs=engine_inputs), **kwargs) + .get("engine_outputs") + ) + return { + "logits": logits, + "logits_shape": None, + "deterministic": None, + "kv_cache": None, + "tokens": None, + "sampling_temperature": None, + }, {"prompt_logits": logits} diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 4695220819..0f1c3cf559 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -17,8 +17,9 @@ from deepsparse.transformers.helpers import setup_transformers_pipeline from deepsparse.transformers.utils.helpers import process_generation_config from deepsparse.utils import split_engine_inputs +from deepsparse.utils.onnx import default_cached_outputs from deepsparse.v2.pipeline import Pipeline -from deepsparse.v2.routers import GraphRouter +from deepsparse.v2.routers import GraphRouter, LinearRouter from deepsparse.v2.schedulers import OperatorScheduler from deepsparse.v2.text_generation import ( AutoRegressiveOperatorPreprocess, @@ -29,7 +30,8 @@ JoinOutput, KVCacheCreator, MultiEnginePrefill, - NLEngineOperator, + NlEngineOperator, + NlEngineOperatorNoCache, PrepareforPrefill, PrepareGeneration, ProcessInputsTextGeneration, @@ -39,6 +41,79 @@ from deepsparse.v2.utils import PipelineState +class TextGenerationPipelineNoCache(Pipeline): + def __init__( + self, + model_path: str, + sequence_length: int = 1024, + engine_kwargs: Optional[Dict] = None, + onnx_model_name: Optional[str] = None, + generation_config=None, # TODO: Typing here + **kwargs, + ): + + ( + self.model_path, + self.config, + self.tokenizer, + engine_kwargs, + ) = setup_transformers_pipeline( + model_path, + sequence_length, + onnx_model_name=onnx_model_name, + engine_kwargs=engine_kwargs, + ) + self.verify_no_kv_cache_present() + + token_generator = TokenGeneratorOperator() + + ops = [ + ProcessInputsTextGeneration( + generation_config=process_generation_config(generation_config), + sequence_length=sequence_length, + tokenizer=self.tokenizer, + ), + NlEngineOperatorNoCache(sequence_length=sequence_length, **engine_kwargs), + PrepareGeneration( + sequence_length=sequence_length, + prompt_sequence_length=1, + token_generator=token_generator, + ), + GenerateNewTokenOperator(tokenizer=self.tokenizer, force_max_tokens=True), + CompileGeneratedTokens(), + CompileGenerations(), + 
JoinOutput(tokenizer=self.tokenizer), + ProcessOutputs(tokenizer=self.tokenizer), + ] + router = LinearRouter(end_route=len(ops)) + scheduler = [OperatorScheduler()] + super().__init__( + ops=ops, + router=router, + schedulers=scheduler, + ) + + def run(self, *args, **kwargs): + # we need to set the fixed_sequences_length flag to True + # for the non-kv cache pipeline + kwargs.update(dict(fixed_sequences_length=True)) + return super().run(*args, **kwargs) + + def verify_no_kv_cache_present(self) -> bool: + """ + Verifies that the ONNX model does not have + KV cache inputs/outputs present. + :return: True if compatible, False otherwise + """ + is_kv_cache_present = any(default_cached_outputs(self.model_path)) + if is_kv_cache_present: + raise ValueError( + f"The model: {self.model_path} has KV cache inputs/outputs present. " + "Please use the TextGenerationPipeline instead." + ) + return not is_kv_cache_present + + class TextGenerationPipeline(Pipeline): def __init__( self, @@ -65,14 +140,14 @@ def __init__( if internal_kv_cache and engine_kwargs.get("engine_type") == "onnxruntime": internal_kv_cache = False - single_engine_operator = NLEngineOperator( + single_engine_operator = NlEngineOperator( sequence_length=sequence_length, internal_kv_cache=internal_kv_cache, input_ids_length=1, **engine_kwargs, ) - multi_engine_operator = NLEngineOperator( + multi_engine_operator = NlEngineOperator( sequence_length=sequence_length, internal_kv_cache=internal_kv_cache, input_ids_length=prompt_sequence_length, @@ -194,5 +269,3 @@ def expand_inputs(self, items, batch_size): def condense_inputs(self, *args, **kwargs): return args[0], kwargs - - \ No newline at end of file diff --git a/src/deepsparse/v2/text_generation/prep_for_generation.py b/src/deepsparse/v2/text_generation/prep_for_generation.py index 0ea4a06a02..9b63946c16 100644 --- a/src/deepsparse/v2/text_generation/prep_for_generation.py +++ b/src/deepsparse/v2/text_generation/prep_for_generation.py @@ -91,6 +91,7 @@ def run( "token_generator": token_generator, } output = { + "logits": prompt_logits, "tokens": token_generator.tokens, "kv_cache": kv_cache, "in_generation": True, diff --git a/tests/deepsparse/v2/unit/text_generation/conftest.py b/tests/deepsparse/v2/unit/text_generation/conftest.py index 5d8483e5f6..7524db454a 100644 --- a/tests/deepsparse/v2/unit/text_generation/conftest.py +++ b/tests/deepsparse/v2/unit/text_generation/conftest.py @@ -25,7 +25,7 @@ from deepsparse.v2 import InferenceState, PipelineState from deepsparse.v2.text_generation import ( GenerationDefaults, - NLEngineOperator, + NlEngineOperator, TokenGeneratorOperator, ) @@ -61,7 +61,7 @@ def single_token_engine_no_internal_cache(text_generation_attributes, model_attr seq_length, _ = text_generation_attributes _, model_path = model_attributes - nl_engine_operator = NLEngineOperator( + nl_engine_operator = NlEngineOperator( sequence_length=seq_length, input_ids_length=1, model_path=model_path ) return nl_engine_operator diff --git a/tests/testdata/gsm8k-v0-greedy_until b/tests/testdata/gsm8k-v0-greedy_until new file mode 100644 index 0000000000..09a6a1eadb --- /dev/null +++ b/tests/testdata/gsm8k-v0-greedy_until @@ -0,0 +1 @@ +3b4bf5c7d1504339aa06bcb50212dba05ff761d30de6faf720fdc818b16316ad \ No newline at end of file diff --git a/tests/testdata/gsm8k-v0-res.json b/tests/testdata/gsm8k-v0-res.json new file mode 100644 index 0000000000..fb6514a0e7 --- /dev/null +++ b/tests/testdata/gsm8k-v0-res.json @@ -0,0 +1 @@ +{"results": {"gsm8k": {"acc": 0.0, "acc_stderr": 0.0}}, 
"versions": {"gsm8k": 0}} \ No newline at end of file From fa96efb7105962607c9b27dd0f24e2e89314a973 Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 20 Nov 2023 13:26:32 +0000 Subject: [PATCH 30/43] it works for a single sequence --- .../v2/text_generation/nl_engine_operator.py | 15 ++++++--------- src/deepsparse/v2/text_generation/pipeline.py | 4 ++-- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index 9c33cb1f93..cb27f69cc0 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -16,9 +16,9 @@ import os from typing import Any, List, Tuple +import numpy from pydantic import BaseModel, Field -from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs from deepsparse.utils.onnx import ( CACHE_INPUT_PREFIX, overwrite_onnx_model_inputs_for_kv_cache_models, @@ -213,12 +213,7 @@ class NlEngineOperatorNoCache(EngineOperator): input_schema = NlEngineInputNoCache output_schema = None - def __init__(self, sequence_length, **kwargs): - model_path, *_ = overwrite_transformer_onnx_model_inputs( - path=kwargs.get("model_path"), - max_length=sequence_length, - batch_size=kwargs.get("batch_size", 1), - ) + def __init__(self, **kwargs): super().__init__(**kwargs) def run(self, inp: NlEngineInputNoCache, **kwargs) -> Any: @@ -228,11 +223,13 @@ def run(self, inp: NlEngineInputNoCache, **kwargs) -> Any: .run(EngineOperatorInputs(engine_inputs=engine_inputs), **kwargs) .get("engine_outputs") ) + + logits = numpy.compress(inp.attention_mask[0], logits[0], axis=1) return { - "logits": logits, + "logits": [logits], "logits_shape": None, "deterministic": None, "kv_cache": None, "tokens": None, "sampling_temperature": None, - }, {"prompt_logits": logits} + }, {"prompt_logits": [logits]} diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 0f1c3cf559..d36dabab5d 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -60,6 +60,7 @@ def __init__( ) = setup_transformers_pipeline( model_path, sequence_length, + tokenizer_padding_side="right", onnx_model_name=onnx_model_name, engine_kwargs=engine_kwargs, ) @@ -73,14 +74,13 @@ def __init__( sequence_length=sequence_length, tokenizer=self.tokenizer, ), - NlEngineOperatorNoCache(sequence_length=sequence_length, **engine_kwargs), + NlEngineOperatorNoCache(**engine_kwargs), PrepareGeneration( sequence_length=sequence_length, prompt_sequence_length=1, token_generator=token_generator, ), GenerateNewTokenOperator(tokenizer=self.tokenizer, force_max_tokens=True), - CompileGeneratedTokens(), CompileGenerations(), JoinOutput(tokenizer=self.tokenizer), ProcessOutputs(tokenizer=self.tokenizer), From e41ddf891662cea1ddfa1e6af08a90a4dfddf918 Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 20 Nov 2023 14:06:07 +0000 Subject: [PATCH 31/43] cleanup. 
now let's figure out how to run multiple sequences

---
 .../v2/text_generation/join_output.py        |  6 +++++-
 .../v2/text_generation/nl_engine_operator.py | 18 ++++++++++--------
 tests/testdata/gsm8k-v0-greedy_until         |  1 -
 tests/testdata/gsm8k-v0-res.json             |  1 -
 4 files changed, 15 insertions(+), 11 deletions(-)
 delete mode 100644 tests/testdata/gsm8k-v0-greedy_until
 delete mode 100644 tests/testdata/gsm8k-v0-res.json

diff --git a/src/deepsparse/v2/text_generation/join_output.py b/src/deepsparse/v2/text_generation/join_output.py
index 29c086d713..5813702f46 100644
--- a/src/deepsparse/v2/text_generation/join_output.py
+++ b/src/deepsparse/v2/text_generation/join_output.py
@@ -32,10 +32,14 @@ class JoinOutput(Operator):
     def __init__(self, tokenizer):
         self.tokenizer = tokenizer
 
-    def run(self, inp: List[CompileGenerationsOutput], **kwargs):
+    def run(self, inp: List[List[CompileGenerationsOutput]], **kwargs):
 
         if not isinstance(inp, list):
+            # when running without KV Cache
+            # this will be a single
+            # CompileGenerationsOutput for now
             inp = [[inp]]
+
         batch_outputs = [x for x in inp[0]]
         generated_tokens = [x.generated_tokens for x in batch_outputs]
         generated_logits = [x.generated_logits for x in batch_outputs]
diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py
index cb27f69cc0..fe28bdfe2c 100644
--- a/src/deepsparse/v2/text_generation/nl_engine_operator.py
+++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py
@@ -209,6 +209,11 @@ def output_names(self) -> List[str]:
 
 
 class NlEngineOperatorNoCache(EngineOperator):
+    """
+    Operator for the Natural Language Engine that operates without
+    KV Cache. This means that this operator merely maps input_ids
+    and attention_mask to logits
+    """
     input_schema = NlEngineInputNoCache
     output_schema = None
 
@@ -224,12 +229,9 @@ def run(self, inp: NlEngineInputNoCache, **kwargs) -> Any:
             .get("engine_outputs")
         )
 
+        # By default, the engine outputs logits for all tokens in the sequence.
+        # Let's filter out the logits for the padding tokens.
logits = numpy.compress(inp.attention_mask[0], logits[0], axis=1) - return { - "logits": [logits], - "logits_shape": None, - "deterministic": None, - "kv_cache": None, - "tokens": None, - "sampling_temperature": None, - }, {"prompt_logits": [logits]} + return {"logits": [logits], "kv_cache": None, "tokens": None}, { + "prompt_logits": [logits] + } diff --git a/tests/testdata/gsm8k-v0-greedy_until b/tests/testdata/gsm8k-v0-greedy_until deleted file mode 100644 index 09a6a1eadb..0000000000 --- a/tests/testdata/gsm8k-v0-greedy_until +++ /dev/null @@ -1 +0,0 @@ -3b4bf5c7d1504339aa06bcb50212dba05ff761d30de6faf720fdc818b16316ad \ No newline at end of file diff --git a/tests/testdata/gsm8k-v0-res.json b/tests/testdata/gsm8k-v0-res.json deleted file mode 100644 index fb6514a0e7..0000000000 --- a/tests/testdata/gsm8k-v0-res.json +++ /dev/null @@ -1 +0,0 @@ -{"results": {"gsm8k": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"gsm8k": 0}} \ No newline at end of file From b80a417a3fd5f46035ee3a73dfc4cffae074d3c6 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Mon, 20 Nov 2023 21:55:11 +0100 Subject: [PATCH 32/43] [Pipeline Refactor][Text-Generation] Refactor `transformers` helpers functions (#1394) * add split/join functionality * update router to include split/join in parent class, refactor pipeline code to remove repeat code, update map function * process multiple generations * initial commit * fix error * unit testing for text generation operators * additional changes * unit testing completion * remove debug * fix * add todo * more clean-up * fix test * add docstrings/comments * break out tests to individual unit test files; add conftest and make scope of fixtures module to help with speed * Delete tests/deepsparse/v2/unit/text_generation/test_msic.py * pipeline runs, but incorrectly * Revert "pipeline runs, but incorrectly" This reverts commit 51c4ee68523978aa84eb66f39925bd24bdf6a617. 
* PR review comments --------- Co-authored-by: Dipika Sikka --- src/deepsparse/transformers/helpers.py | 114 ++++++++++++++++-- .../transformers/pipelines/pipeline.py | 38 ++---- src/deepsparse/utils/onnx.py | 8 +- src/deepsparse/v2/text_generation/pipeline.py | 68 ++--------- 4 files changed, 130 insertions(+), 98 deletions(-) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index d7acc71a99..7273b61406 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -17,24 +17,26 @@ """ +import logging import os import re from pathlib import Path from tempfile import NamedTemporaryFile -from typing import List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import numpy import onnx +import transformers from onnx import ModelProto from deepsparse.log import get_main_logger -from deepsparse.utils.onnx import _MODEL_DIR_ONNX_NAME, truncate_onnx_model +from deepsparse.utils.onnx import MODEL_ONNX_NAME, truncate_onnx_model from sparsezoo import Model from sparsezoo.utils import save_onnx __all__ = [ - "get_deployment_path", + "setup_transformers_pipeline", "overwrite_transformer_onnx_model_inputs", "fix_numpy_types", "get_transformer_layer_init_names", @@ -44,7 +46,94 @@ _LOGGER = get_main_logger() -def get_deployment_path(model_path: str) -> Tuple[str, str]: +def setup_transformers_pipeline( + model_path: str, + sequence_length: int, + tokenizer_padding_side: str = "left", + engine_kwargs: Optional[Dict] = None, + onnx_model_name: Optional[str] = None, +) -> Tuple[ + str, transformers.PretrainedConfig, transformers.PreTrainedTokenizer, Dict[str, Any] +]: + """ + A helper function that sets up the model path, config, tokenizer, + and engine kwargs for a transformers model. + :param model_path: The path to the model to load + :param sequence_length: The sequence length to use for the model + :param tokenizer_padding_side: The side to pad on for the tokenizer, + either "left" or "right" + :param engine_kwargs: The kwargs to pass to the engine + :param onnx_model_name: The name of the onnx model to be loaded. + If not specified, defaults are used (see setup_onnx_file_path) + :return The model path, config, tokenizer, and engine kwargs + """ + model_path, config, tokenizer = setup_onnx_file_path( + model_path, sequence_length, onnx_model_name + ) + + tokenizer.padding_side = tokenizer_padding_side + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + engine_kwargs = engine_kwargs or {} + if engine_kwargs.get("model_path"): + raise ValueError( + "The engine kwargs already specify " + f"a model path: {engine_kwargs['model_path']}, " + f"but a model path was also provided: {model_path}. " + "Please only provide one." + ) + engine_kwargs["model_path"] = model_path + return model_path, config, tokenizer, engine_kwargs + + +def setup_onnx_file_path( + model_path: str, + sequence_length: int, + onnx_model_name: Optional[str] = None, + task: Optional[str] = None, +) -> Tuple[str, transformers.PretrainedConfig, transformers.PreTrainedTokenizer]: + """ + Parses ONNX model from the `model_path` provided. It additionally + creates config and tokenizer objects from the `deployment path`, + derived from the `model_path` provided. + :param model_path: path to the model to be parsed + :param sequence_length: maximum sequence length of the model + :param onnx_model_name: optionally, the precise name of the ONNX model + of interest may be specified. 
If not specified, the default ONNX model + name will be used (refer to `get_deployment_path` for details) + :return: file path to the processed ONNX file for the engine to compile + """ + deployment_path, onnx_path = get_deployment_path(model_path, onnx_model_name) + + hf_logger = logging.getLogger("transformers") + hf_logger_level = hf_logger.level + hf_logger.setLevel(logging.ERROR) + + config = transformers.PretrainedConfig.from_pretrained( + deployment_path, finetuning_task=task + ) + hf_logger.setLevel(hf_logger_level) + + trust_remote_code = False + tokenizer = transformers.AutoTokenizer.from_pretrained( + deployment_path, + trust_remote_code=trust_remote_code, + model_max_length=sequence_length, + ) + + if not config or not tokenizer: + raise RuntimeError( + "Invalid config or tokenizer provided. Please provide " + "paths to the files or ensure they exist in the `model_path` provided. " + "See `tokenizer` and `config` arguments for details." + ) + return onnx_path, config, tokenizer + + +def get_deployment_path( + model_path: str, onnx_model_name: Optional[str] = None +) -> Tuple[str, str]: """ Returns the path to the deployment directory for the given model path and the path to the mandatory @@ -53,9 +142,12 @@ def get_deployment_path(model_path: str) -> Tuple[str, str]: for running the transformers model in the deepsparse pipeline :param model_path: path to model directory, sparsezoo stub, or ONNX file + :param onnx_model_name: name of the ONNX file to look for in the deployment + directory. Defaults to MODEL_ONNX_NAME :return: path to the deployment directory and path to the ONNX file inside the deployment directory """ + onnx_model_name = onnx_model_name or MODEL_ONNX_NAME if os.path.isfile(model_path): # return the parent directory of the ONNX file return os.path.dirname(model_path), model_path @@ -63,26 +155,26 @@ def get_deployment_path(model_path: str) -> Tuple[str, str]: if os.path.isdir(model_path): model_files = os.listdir(model_path) - if _MODEL_DIR_ONNX_NAME not in model_files: + if onnx_model_name not in model_files: raise ValueError( - f"{_MODEL_DIR_ONNX_NAME} not found in transformers model directory " + f"{onnx_model_name} not found in transformers model directory " f"{model_path}. Be sure that an export of the model is written to " - f"{os.path.join(model_path, _MODEL_DIR_ONNX_NAME)}" + f"{os.path.join(model_path, onnx_model_name)}" ) - return model_path, os.path.join(model_path, _MODEL_DIR_ONNX_NAME) + return model_path, os.path.join(model_path, onnx_model_name) elif model_path.startswith("zoo:"): zoo_model = Model(model_path) deployment_path = zoo_model.deployment_directory_path - return deployment_path, os.path.join(deployment_path, _MODEL_DIR_ONNX_NAME) + return deployment_path, os.path.join(deployment_path, onnx_model_name) elif model_path.startswith("hf:"): from huggingface_hub import snapshot_download deployment_path = snapshot_download(repo_id=model_path.replace("hf:", "", 1)) - onnx_path = os.path.join(deployment_path, _MODEL_DIR_ONNX_NAME) + onnx_path = os.path.join(deployment_path, onnx_model_name) if not os.path.isfile(onnx_path): raise ValueError( - f"{_MODEL_DIR_ONNX_NAME} not found in transformers model directory " + f"{onnx_model_name} not found in transformers model directory " f"{deployment_path}. 
Be sure that an export of the model is written to " f"{onnx_path}" ) diff --git a/src/deepsparse/transformers/pipelines/pipeline.py b/src/deepsparse/transformers/pipelines/pipeline.py index 065a26ce71..ac54c4a3db 100644 --- a/src/deepsparse/transformers/pipelines/pipeline.py +++ b/src/deepsparse/transformers/pipelines/pipeline.py @@ -16,19 +16,18 @@ Base Pipeline class for transformers inference pipeline """ -import logging + import warnings from pathlib import Path from typing import Any, Dict, List, Mapping, Optional, Union import numpy import transformers -from transformers.models.auto import AutoTokenizer from deepsparse import Bucketable, Pipeline +from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs from deepsparse.transformers.helpers import ( - get_deployment_path, - overwrite_transformer_onnx_model_inputs, + setup_onnx_file_path as setup_onnx_file_path_v2, ) @@ -124,24 +123,15 @@ def setup_onnx_file_path(self) -> str: :return: file path to the processed ONNX file for the engine to compile """ - deployment_path, onnx_path = get_deployment_path(self.model_path) - - # temporarily set transformers logger to ERROR to avoid - # printing misleading warnings - hf_logger = logging.getLogger("transformers") - hf_logger_level = hf_logger.level - hf_logger.setLevel(logging.ERROR) - self.config = transformers.PretrainedConfig.from_pretrained( - deployment_path, - finetuning_task=self.task if hasattr(self, "task") else None, - ) - hf_logger.setLevel(hf_logger_level) - - self.tokenizer = AutoTokenizer.from_pretrained( - deployment_path, - trust_remote_code=self._trust_remote_code, - model_max_length=self.sequence_length, + # we will be soon retiring V1 pipelines. This is why I am deciding + # to reuse the functions from V2 pipelines in the (soon) legacy pipelines + onnx_path, config, tokenizer = setup_onnx_file_path_v2( + model_path=self.model_path, + sequence_length=self.sequence_length, + task=self.task if hasattr(self, "task") else None, ) + self.config = config + self.tokenizer = tokenizer if not self._delay_overwriting_inputs: # overwrite onnx graph to given required input shape @@ -153,12 +143,6 @@ def setup_onnx_file_path(self) -> str: onnx_path, max_length=self.sequence_length ) - if not self.config or not self.tokenizer: - raise RuntimeError( - "Invalid config or tokenizer provided. Please provide " - "paths to the files or ensure they exist in the `model_path` provided. " - "See `tokenizer` and `config` arguments for details." 
- ) return onnx_path def tokens_to_engine_input( diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index e69bf67321..f518620c2f 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -56,12 +56,12 @@ "has_model_kv_cache", "CACHE_INPUT_PREFIX", "CACHE_OUTPUT_PREFIX", - "_MODEL_DIR_ONNX_NAME", + "MODEL_ONNX_NAME", ] _LOGGER = logging.getLogger(__name__) -_MODEL_DIR_ONNX_NAME = "model.onnx" +MODEL_ONNX_NAME = "model.onnx" CACHE_INPUT_PREFIX = "past_key_values" CACHE_OUTPUT_PREFIX = "present" @@ -132,7 +132,7 @@ def model_to_path(model: Union[str, Model, File]) -> str: model.deployment_directory_path # default to the main onnx file for the model - model = model.deployment.get_file(_MODEL_DIR_ONNX_NAME).path + model = model.deployment.get_file(MODEL_ONNX_NAME).path elif File is not object and isinstance(model, File): # get the downloaded_path -- will auto download if not on local system @@ -146,7 +146,7 @@ def model_to_path(model: Union[str, Model, File]) -> str: model_path = Path(model) if model_path.is_dir(): - return str(model_path / _MODEL_DIR_ONNX_NAME) + return str(model_path / MODEL_ONNX_NAME) return model diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 240da04907..5ab73f7a48 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict +from typing import Dict, Optional +from deepsparse.transformers.helpers import setup_transformers_pipeline from deepsparse.transformers.utils.helpers import process_generation_config from deepsparse.utils import split_engine_inputs from deepsparse.v2.pipeline import Pipeline @@ -47,23 +48,20 @@ def __init__( internal_kv_cache: bool = True, force_max_tokens: bool = False, generation_config=None, - engine_kwargs: Dict = None, + engine_kwargs: Optional[Dict] = None, ): + ( + self.model_path, + self.config, + self.tokenizer, + engine_kwargs, + ) = setup_transformers_pipeline( + model_path, sequence_length, engine_kwargs=engine_kwargs + ) pipeline_state = PipelineState() pipeline_state_vals = {} - # TODO: The code below will be replaced with a transformers set-up Operator. - self.tokenizer = None - model_path = self.setup_onnx_file_path(model_path, sequence_length) - self.tokenizer.padding_side = "left" - if not self.tokenizer.pad_token: - self.tokenizer.pad_token = self.tokenizer.eos_token - - if not engine_kwargs: - engine_kwargs = {} - engine_kwargs["model_path"] = model_path - if internal_kv_cache and engine_kwargs.get("engine_type") == "onnxruntime": internal_kv_cache = False @@ -82,7 +80,7 @@ def __init__( ) # NOTE: Currently using pipeline state. Can swap to simply pass in the - # attributes to the specific Operator that neeed them, as class attributes. + # attributes to the specific Operator that need them, as class attributes. pipeline_state_vals[ "onnx_input_names_no_cache" ] = single_engine_operator.onnx_input_names_no_cache @@ -196,45 +194,3 @@ def expand_inputs(self, items, batch_size): def condense_inputs(self, *args, **kwargs): return args[0], kwargs - - # TODO: Move to be part of a generic transformers set-up Operator. 
- def setup_onnx_file_path(self, model_path, sequence_length) -> str: - import logging - - import transformers - from transformers import AutoTokenizer - - from deepsparse.transformers.helpers import get_deployment_path - - """ - Parses ONNX model from the `model_path` provided. It additionally - creates config and tokenizer objects from the `deployment path`, - derived from the `model_path` provided. - - :return: file path to the processed ONNX file for the engine to compile - """ - deployment_path, onnx_path = get_deployment_path(model_path) - - hf_logger = logging.getLogger("transformers") - hf_logger_level = hf_logger.level - hf_logger.setLevel(logging.ERROR) - self.config = transformers.PretrainedConfig.from_pretrained( - deployment_path, - finetuning_task=self.task if hasattr(self, "task") else None, - ) - hf_logger.setLevel(hf_logger_level) - - self._trust_remote_code = False - self.tokenizer = AutoTokenizer.from_pretrained( - deployment_path, - trust_remote_code=self._trust_remote_code, - model_max_length=sequence_length, - ) - - if not self.config or not self.tokenizer: - raise RuntimeError( - "Invalid config or tokenizer provided. Please provide " - "paths to the files or ensure they exist in the `model_path` provided. " - "See `tokenizer` and `config` arguments for details." - ) - return onnx_path From 1b9238a28e664cef1bb6fc2a57c9193fb3d55ce8 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Mon, 20 Nov 2023 21:55:53 +0100 Subject: [PATCH 33/43] [Text Generation][V2] End-to-end tests (#1402) * initial commit * initial commit * its working now * beautification * thank you Dipika <3 * ready to review --- .../transformers/utils/token_generator.py | 11 +- .../v2/text_generation/process_inputs.py | 19 +- .../v2/integration_tests/__init__.py | 13 + .../v2/integration_tests/configs/codegen.yaml | 6 + .../v2/integration_tests/configs/gpt_neo.yaml | 6 + .../v2/integration_tests/configs/opt.yaml | 6 + .../v2/integration_tests/helpers.py | 137 +++++++ .../v2/integration_tests/test_llms.py | 368 ++++++++++++++++++ 8 files changed, 547 insertions(+), 19 deletions(-) create mode 100644 tests/deepsparse/v2/integration_tests/__init__.py create mode 100644 tests/deepsparse/v2/integration_tests/configs/codegen.yaml create mode 100644 tests/deepsparse/v2/integration_tests/configs/gpt_neo.yaml create mode 100644 tests/deepsparse/v2/integration_tests/configs/opt.yaml create mode 100644 tests/deepsparse/v2/integration_tests/helpers.py create mode 100644 tests/deepsparse/v2/integration_tests/test_llms.py diff --git a/src/deepsparse/transformers/utils/token_generator.py b/src/deepsparse/transformers/utils/token_generator.py index 5fa82b7bc4..0421da06e2 100644 --- a/src/deepsparse/transformers/utils/token_generator.py +++ b/src/deepsparse/transformers/utils/token_generator.py @@ -77,16 +77,17 @@ def generate(self, logits: numpy.ndarray) -> numpy.ndarray: :param logits: the logits from the model with shape (vocab_size,) :return: the sampled token """ - if self.top_k: - logits = self.apply_top_k(logits) - if self.top_p: - logits = self.apply_top_p(logits) - if self.deterministic: token = numpy.argmax(logits) self.tokens.append(token) return token + if self.top_k: + logits = self.apply_top_k(logits) + + if self.top_p: + logits = self.apply_top_p(logits) + if self.sampling_temperature != 1.0: logits /= self.sampling_temperature diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py index 
214b8526e3..0f9147f916 100644 --- a/src/deepsparse/v2/text_generation/process_inputs.py +++ b/src/deepsparse/v2/text_generation/process_inputs.py @@ -17,7 +17,10 @@ import transformers -from deepsparse.transformers.pipelines.text_generation import TextGenerationInput +from deepsparse.transformers.pipelines.text_generation import ( + GenerationDefaults, + TextGenerationInput, +) from deepsparse.transformers.utils.helpers import ( check_and_return_generation_config, override_config, @@ -26,19 +29,7 @@ from deepsparse.v2.operators import Operator -__all__ = ["ProcessInputsTextGeneration", "GenerationDefaults"] - - -class GenerationDefaults: - num_return_sequences = 1 - max_length = 100 - max_new_tokens = None - output_scores = False - top_k = 0 - top_p = 0.0 - repetition_penalty = 0.0 - do_sample = False - temperature = 1.0 +__all__ = ["ProcessInputsTextGeneration"] class ProcessInputsTextGeneration(Operator): diff --git a/tests/deepsparse/v2/integration_tests/__init__.py b/tests/deepsparse/v2/integration_tests/__init__.py new file mode 100644 index 0000000000..0c44f887a4 --- /dev/null +++ b/tests/deepsparse/v2/integration_tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
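For context before the new integration-test assets below: the `setup_transformers_pipeline` helper introduced in the previous patch can also be called on its own. The following is a minimal, hypothetical usage sketch; the SparseZoo stub, sequence length, and engine kwargs are illustrative placeholders, not values prescribed by this patch series.

# Hypothetical standalone usage of the refactored helper (placeholder values).
from deepsparse.transformers.helpers import setup_transformers_pipeline

model_path, config, tokenizer, engine_kwargs = setup_transformers_pipeline(
    model_path="zoo:nlg/text_generation/opt-1.3b/pytorch/huggingface/opt_pretrain/base-none",
    sequence_length=128,
    tokenizer_padding_side="left",
    engine_kwargs={"engine_type": "deepsparse"},
)
# The helper resolves the deployment directory, loads the config and tokenizer,
# and injects the resolved ONNX path into engine_kwargs["model_path"], so the
# returned kwargs can be handed straight to an engine operator.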
diff --git a/tests/deepsparse/v2/integration_tests/configs/codegen.yaml b/tests/deepsparse/v2/integration_tests/configs/codegen.yaml new file mode 100644 index 0000000000..904358b55f --- /dev/null +++ b/tests/deepsparse/v2/integration_tests/configs/codegen.yaml @@ -0,0 +1,6 @@ +cadence: "nightly" +model_path: "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none" +torch_model_name: "salesforce/codegen-350m-mono" +prompt: "\ndef Fibonacci(n):\n # Check if input is 0 then it will\n # print incorrect input" +precision: 0.0001 +internal_kv_cache: [True, False] \ No newline at end of file diff --git a/tests/deepsparse/v2/integration_tests/configs/gpt_neo.yaml b/tests/deepsparse/v2/integration_tests/configs/gpt_neo.yaml new file mode 100644 index 0000000000..b422efc831 --- /dev/null +++ b/tests/deepsparse/v2/integration_tests/configs/gpt_neo.yaml @@ -0,0 +1,6 @@ +cadence: "commit" +model_path: "hf:mgoin/TinyStories-1M-ds" +torch_model_name: "roneneldan/TinyStories-1M" +prompt: "Didn't know what time it was, the lights were low\n I leaned back on my radio" +precision: 0.001 +internal_kv_cache: [True, False] \ No newline at end of file diff --git a/tests/deepsparse/v2/integration_tests/configs/opt.yaml b/tests/deepsparse/v2/integration_tests/configs/opt.yaml new file mode 100644 index 0000000000..ff2350dbe7 --- /dev/null +++ b/tests/deepsparse/v2/integration_tests/configs/opt.yaml @@ -0,0 +1,6 @@ +cadence: "nightly" +model_path: "zoo:nlg/text_generation/opt-1.3b/pytorch/huggingface/opt_pretrain/base-none" +torch_model_name: "facebook/opt-1.3b" +prompt: "Didn't know what time it was, the lights were low\n I leaned back on my radio" +precision: 0.0001 +internal_kv_cache: [True, False] \ No newline at end of file diff --git a/tests/deepsparse/v2/integration_tests/helpers.py b/tests/deepsparse/v2/integration_tests/helpers.py new file mode 100644 index 0000000000..8d7f3d58d2 --- /dev/null +++ b/tests/deepsparse/v2/integration_tests/helpers.py @@ -0,0 +1,137 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +from typing import Any, Dict, List, Tuple, Union + +import numpy +import yaml +from transformers import AutoModelForCausalLM, AutoTokenizer + +import pytest + + +class TorchGroundTruthSource: + """ + An object that generates ground truth logits and + cache states from a prompt. 
This object can + generate tokens in an autoregressive manner, and thus + will output: + - prompt logits, + - generated logits, + - prompt cache state, + - generated sequence + """ + + def __init__(self, num_tokens_to_generate: int, model_name: str): + + self.model = AutoModelForCausalLM.from_pretrained(model_name) + self.tokenizer = self._create_tokenizer(model_name) + + self.num_tokens_to_generate = num_tokens_to_generate + + def tokenize(self, prompt: str): + return self.tokenizer(prompt, return_tensors="pt") + + def __call__( + self, prompt: str + ) -> Tuple[numpy.ndarray, numpy.ndarray, List[numpy.ndarray], str]: + # afaik it is not possible to get 'past_key_values' from + # the generate method, so we have to run the model twice + out = self.model.generate( + self.tokenize(prompt).input_ids, + max_new_tokens=self.num_tokens_to_generate, + output_scores=True, + return_dict_in_generate=True, + use_cache=True, + ) + generated_text = self.tokenizer.decode( + out.sequences[0], skip_special_tokens=True + ) + generated_logits = numpy.concatenate( + [[score.numpy() for score in out.scores]] + ).transpose( + 1, 0, 2 + ) # (1, num_tokens_to_generate, vocab_size) + + out = self.model(**self.tokenize(prompt)) + prompt_logits = out.logits.detach().numpy()[ + :, :-1, : + ] # (1, prompt_length, vocab_size) + prompt_cache = [ + entry.detach().numpy() + for key_value_tuple in out.past_key_values + for entry in key_value_tuple + ] # List[(1, num_heads, past_length, head_dim)] + + return generated_logits, prompt_logits, prompt_cache, generated_text + + @staticmethod + def _create_tokenizer(model_name): + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer.padding_side = "left" + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + return tokenizer + + +def parse_params(configs_directory: str) -> List[Dict[str, Any]]: + # parses the config file provided + assert os.path.isdir( + configs_directory + ), f"Config_directory {configs_directory} is not a directory" + + config_dicts = [] + for file in os.listdir(configs_directory): + if file.endswith(".yaml"): + config_path = os.path.join(configs_directory, file) + # reads the yaml file + with open(config_path, "r") as f: + config = yaml.safe_load(f) + + cadence = os.environ.get("CADENCE", "commit") + expected_cadence = config["cadence"] + + if not isinstance(expected_cadence, list): + expected_cadence = [expected_cadence] + if cadence in expected_cadence: + config_dicts.append(config) + else: + logging.info( + f"Skipping testing model: {config['model_path']} " + f"for cadence: {config['cadence']}" + ) + else: + raise FileNotFoundError( + f"Could not find a yaml file in {configs_directory}" + ) + return config_dicts + + +def validate_internal_kv_cache( + internal_kv_cache, available_kv_cache_types: Union[str, List[str]] +) -> bool: + if internal_kv_cache and True not in available_kv_cache_types: + pytest.skip( + "The tests for running the pipeline with " + "internal kv cache management are disabled." + ) + if not internal_kv_cache and False not in available_kv_cache_types: + pytest.skip( + "The tests for running the pipeline with " + "external kv cache management are disabled." + ) + return internal_kv_cache diff --git a/tests/deepsparse/v2/integration_tests/test_llms.py b/tests/deepsparse/v2/integration_tests/test_llms.py new file mode 100644 index 0000000000..34a8f7a258 --- /dev/null +++ b/tests/deepsparse/v2/integration_tests/test_llms.py @@ -0,0 +1,368 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This test suite consumes config files to test the text generation pipeline +for various scenarios. + +A sample config file is a yaml that r_equires the following fields: + cadence: The cadence of the tests. The available options are: + "nightly", "weekly" and "commit". By default, only + the tests that have cadence "commit" will be run + in GHA. This parameter can be both a string or a + list of strings. + model_path: The path to the model to be tested + (sparsezoo stub/hf model path/local_path) + torch_model_name: The name of the torch model + (to generate ground truth info) + prompt: The prompt to use for testing + precision: The precision for the logits/kv_cache entries + comparison + internal_kv_cache: The type of the internal KV cache + management. Is a list that can contain the following + values: [True], [False] or [True, False] (to test both + external and internal KV cache management) +""" +import os +from typing import List, Tuple + +import numpy + +import pytest +from deepsparse.transformers.pipelines.text_generation import TextGenerationOutput +from deepsparse.v2.pipeline import Pipeline +from deepsparse.v2.text_generation import TextGenerationPipeline +from sparsezoo import Model +from tests.deepsparse.transformers.pipelines.integration_tests.helpers import ( + TorchGroundTruthSource, + parse_params, + validate_internal_kv_cache, +) + + +CONFIGS_DIRECTORY = "tests/deepsparse/v2/integration_tests/configs" + + +@pytest.fixture() +def max_new_tokens() -> int: + return 64 + + +@pytest.mark.parametrize("params_dict", parse_params(CONFIGS_DIRECTORY)) +@pytest.mark.parametrize( + "internal_kv_cache", + [True, False], +) +class TestsIntegrationLLMsPipelines: + """ + This test suite is meant to test the main scenarios of + the text generation pipeline. + """ + + def get_pipeline(self, **kwargs) -> Pipeline: + """ + If no kwargs provided, returns the cached "default" + pipeline that is used for most of the tests. 
+ Otherwise, returns a pipeline with the given kwargs + (the default pipeline kwargs are updated with the + user-provided kwargs) + + :param kwargs: the optional kwargs to be used to + create the pipeline (if not provided, the cached + "default" pipeline is returned) + :return: the appropriate pipeline + """ + if not kwargs: + if self.default_pipeline is None: + self.default_pipeline = TextGenerationPipeline( + **self.default_pipeline_kwargs + ) + return self.default_pipeline + + # return a pipeline with the updated default kwargs + updated_kwargs = self.default_pipeline_kwargs.copy() + updated_kwargs.update(kwargs) + return TextGenerationPipeline(**updated_kwargs) + + @pytest.fixture + def setup(self, params_dict, max_new_tokens, internal_kv_cache): + # set the params_dict as the class attributes + for key, value in params_dict.items(): + setattr(self, key, value) + # check whether the specified cache management type + # is supported for testing (skip if not supported) + self.internal_kv_cache: bool = validate_internal_kv_cache( + internal_kv_cache, self.internal_kv_cache + ) + # create torch ground source + torch_source = TorchGroundTruthSource( + num_tokens_to_generate=max_new_tokens + 1, + model_name=self.torch_model_name, + ) + # create torch ground truth + self.torch_ground_truth = torch_source(self.prompt) + + # specify the default pipeline kwargs + self.default_pipeline_kwargs = dict( + model_path=self.model_path, + internal_kv_cache=self.internal_kv_cache, + force_max_tokens=True, + ) + self.default_pipeline = None + self.max_new_tokens = max_new_tokens + + def test_ort_single_token_prefill(self, setup): + # Test the pipeline that uses ORT engine. The test covers the + # following scenario: + # 1. Prompt preprocessing is performed by single-token engine + # 2. The KV Cache is never filled up + # 3. KV Cache managed externally + + if self.internal_kv_cache: + pytest.skip( + "Cannot run ORT pipeline with the internal deepsparse cache enabled." + ) + + pipeline = self.get_pipeline( + prompt_sequence_length=1, + engine_type="onnxruntime", + ) + output = pipeline( + prompt=self.prompt, + include_prompt_logits=True, + generation_kwargs=dict( + max_new_tokens=self.max_new_tokens, + output_scores=True, + ), + ) + + self._test_output( + output=output, + torch_ground_truth=self.torch_ground_truth, + ) + + def test_ort_multi_token_prefill(self, setup): + # Test the pipeline that uses ORT engine. The test covers the + # following scenario: + # 1. Prompt preprocessing is performed by multi-token engine + # 2. The KV Cache is never filled up + # 3. KV Cache managed externally + + if self.internal_kv_cache: + pytest.skip( + "Cannot run ORT pipeline with the internal deepsparse cache enabled." + ) + pipeline = self.get_pipeline( + engine_type="onnxruntime", + ) + output = pipeline( + prompt=self.prompt, + include_prompt_logits=True, + generation_kwargs=dict( + max_new_tokens=self.max_new_tokens, output_scores=True + ), + ) + + self._test_output( + output=output, + torch_ground_truth=self.torch_ground_truth, + ) + + def test_deepsparse_single_token_prefill(self, setup): + # Test the pipeline that uses deepsparse engine. The test covers the + # following scenario: + # 1. Prompt preprocessing is performed by single-token engine + # 2. The KV Cache is never filled up + # 3. 
KV Cache managed externally or internally + + pipeline = self.get_pipeline( + prompt_sequence_length=1, + ) + + output = pipeline( + prompt=self.prompt, + include_prompt_logits=True, + generation_kwargs=dict( + max_new_tokens=self.max_new_tokens, output_scores=True + ), + ) + + self._test_output( + output=output, + torch_ground_truth=self.torch_ground_truth, + # disable kv cache validation if using internal kv cache + run_kv_cache_validation=not self.internal_kv_cache, + ) + + def test_deepsparse_multi_token_prefill(self, setup): + # Test the pipeline that uses deepsparse engine. The test covers the + # following scenario: + # 1. Prompt preprocessing is performed by multi-token engine + # 2. The KV Cache is never filled up + # 3. KV Cache managed internally or externally + + pipeline = self.get_pipeline() + output = pipeline( + prompt=self.prompt, + include_prompt_logits=True, + generation_kwargs=dict( + max_new_tokens=self.max_new_tokens, output_scores=True + ), + ) + + self._test_output( + output=output, + torch_ground_truth=self.torch_ground_truth, + # disable kv cache validation if using internal kv cache + run_kv_cache_validation=not self.internal_kv_cache, + ) + + @pytest.mark.skip( + "This test is skipped because we do " + "not have support for non-kv-cache models yet" + ) + def test_inference_no_kv_cache_deepsparse(self, setup): + self._test_inference_no_kv_cache(engine_type="deepsparse") + + @pytest.mark.skip( + "This test is skipped because we do " + "not have support for non-kv-cache models yet" + ) + def test_inference_no_kv_cache_ort(self, setup): + self._test_inference_no_kv_cache(engine_type="onnxruntime") + + def _test_inference_no_kv_cache(self, engine_type): + model_path_no_cache = self._get_model_path_no_cache() + pipeline = self.get_pipeline( + model_path=model_path_no_cache, engine_type=engine_type + ) + assert not pipeline.cache_support_enabled, ( + "This pipeline test inference using non-kv cache " + "model and thus should not support kv cache" + ) + + output = pipeline( + self.prompt, max_length=1, output_scores=True, include_prompt_logits=True + ) + prompt_length = self.torch_ground_truth[1].shape[1] + # prompt logits + one logit for the new generated token + logits = output.generations[0].score[-(prompt_length + 1) :, :] + # compute ground truth logits analogously + generated_logits, prompt_logits, *_ = self.torch_ground_truth + logits_gt = numpy.concatenate( + [prompt_logits[0], generated_logits[0, :1, :]], axis=0 + ) + assert numpy.allclose(logits, logits_gt, atol=self.precision) + + def _test_output( + self, + output: TextGenerationOutput, + torch_ground_truth: Tuple[numpy.ndarray, ...], + run_kv_cache_validation: bool = True, + ): + + ( + generated_logits, + prompt_logits, + prompt_kv_cache, + generated_text, + ) = torch_ground_truth + + # concatenate target prompt_logits and generated_logits + target_logits = numpy.concatenate([prompt_logits, generated_logits], axis=1) + # get the logits of the generated sequence + score = output.generations[0].score + + # we expect the logits to be exactly the same + # as the target logits; the generated sequence should + # also be the same as the target sequence + assert numpy.allclose(score, target_logits[0], atol=self.precision) + assert self.prompt + output.generations[0].text == generated_text + + if hasattr(output, "kv_cache_state") and run_kv_cache_validation: + # (if applicable) the kv cache should be the same as the + # target kv cache + expected_cache = list(output.kv_cache_state[0].values()) + 
total_num_processed_tokens = output.total_num_processed_tokens[0] + self._test_kv_cache_state( + expected_cache=expected_cache, + target_cache=prompt_kv_cache, + total_num_processed_tokens=total_num_processed_tokens, + ) + + def _test_kv_cache_state( + self, + expected_cache: List[numpy.ndarray], + target_cache: List[numpy.ndarray], + total_num_processed_tokens: int, + ): + for x, y in zip(expected_cache, target_cache): + start_index = total_num_processed_tokens + end_index = total_num_processed_tokens - y.shape[2] + # x is (in general) composed of three arrays: + # - padding cache entries (from 0 to -start_index) + # - prompt cache entries (from -start_index to -end_index) + # - generated cache entries (from -end_index to -1) + # as target_cache only pertains to prompt cache entries, we need to + # compare only the prompt cache entries in x with y + assert numpy.allclose( + x[:, :, -start_index:-end_index, :], y, atol=self.precision + ) + + def _get_model_path_no_cache(self): + if not self.model_path.startswith("zoo:"): + pytest.skip("For this test, for now only the zoo model is supported") + model = Model(self.model_path) + # fetch the necessary file names for pipeline creation + required_file_names = [ + os.path.basename(file.name) for file in model.deployment.files + ] + training_directory = model.training + onnx_model_name_no_cache = [ + os.path.basename(file.name) + for file in model.training.files + if file.name.endswith(".onnx") + ][0] + + # check if 'training' exists, + # if not, download the files + if "training" not in os.listdir(model._path): + for filename in required_file_names: + # download the files to a training directory + if filename.endswith(".data"): + # data files are typically stored in a deployment directory + # download them to training + file = model.deployment.get_file(filename) + assert ( + file is not None + ), f"Unable to find file {filename} in model {model}" + file.name = file.name.replace("deployment", "training") + file.download() + continue + + if filename.endswith(".onnx"): + # instead of `model.onnx` the onnx_model_name_no_cache + # should be downloaded + filename = filename.replace("model.onnx", onnx_model_name_no_cache) + + file = training_directory.get_file(filename) + assert ( + file is not None + ), f"Unable to find file {filename} in model {model}" + file.download() + # rename the model file to `model.onnx` + os.rename( + os.path.join(training_directory.path, onnx_model_name_no_cache), + os.path.join(training_directory.path, "model.onnx"), + ) + return training_directory._path From 9b441f5d314ce0fde92ec3dd9181bf2db3928b40 Mon Sep 17 00:00:00 2001 From: Damian Date: Tue, 21 Nov 2023 13:08:58 +0000 Subject: [PATCH 34/43] integration tests pass --- .../v2/text_generation/join_output.py | 7 +- .../v2/text_generation/nl_engine_operator.py | 8 +- src/deepsparse/v2/text_generation/pipeline.py | 2 +- .../v2/integration_tests/configs/codegen.yaml | 1 + .../v2/integration_tests/configs/gpt_neo.yaml | 1 + .../v2/integration_tests/configs/opt.yaml | 1 + .../v2/integration_tests/test_llms.py | 100 +++++------------- 7 files changed, 39 insertions(+), 81 deletions(-) diff --git a/src/deepsparse/v2/text_generation/join_output.py b/src/deepsparse/v2/text_generation/join_output.py index 5813702f46..56d9ac47b1 100644 --- a/src/deepsparse/v2/text_generation/join_output.py +++ b/src/deepsparse/v2/text_generation/join_output.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import List +from typing import Dict, List, Tuple import numpy @@ -32,9 +32,8 @@ class JoinOutput(Operator): def __init__(self, tokenizer): self.tokenizer = tokenizer - def run(self, inp: List[List[CompileGenerationsOutput]], **kwargs): - - if not isinstance(inp, list): + def run(self, inp: Tuple[List[CompileGenerationsOutput], Dict], **kwargs): + if not isinstance(inp, Tuple): # when running without KV Cache # this will be a single # CompileGenerationsOutput for now diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index fe28bdfe2c..3fa8653ea6 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -19,6 +19,7 @@ import numpy from pydantic import BaseModel, Field +from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs from deepsparse.utils.onnx import ( CACHE_INPUT_PREFIX, overwrite_onnx_model_inputs_for_kv_cache_models, @@ -218,7 +219,12 @@ class NlEngineOperatorNoCache(EngineOperator): input_schema = NlEngineInputNoCache output_schema = None - def __init__(self, **kwargs): + def __init__(self, sequence_length: int, **kwargs): + overwrite_transformer_onnx_model_inputs( + path=kwargs.get("model_path"), + batch_size=kwargs.get("batch_size", 1), + max_length=sequence_length, + ) super().__init__(**kwargs) def run(self, inp: NlEngineInputNoCache, **kwargs) -> Any: diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index d36dabab5d..7c270873fa 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -74,7 +74,7 @@ def __init__( sequence_length=sequence_length, tokenizer=self.tokenizer, ), - NlEngineOperatorNoCache(**engine_kwargs), + NlEngineOperatorNoCache(sequence_length=sequence_length, **engine_kwargs), PrepareGeneration( sequence_length=sequence_length, prompt_sequence_length=1, diff --git a/tests/deepsparse/v2/integration_tests/configs/codegen.yaml b/tests/deepsparse/v2/integration_tests/configs/codegen.yaml index 904358b55f..9ec212a6cc 100644 --- a/tests/deepsparse/v2/integration_tests/configs/codegen.yaml +++ b/tests/deepsparse/v2/integration_tests/configs/codegen.yaml @@ -1,6 +1,7 @@ cadence: "nightly" model_path: "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none" torch_model_name: "salesforce/codegen-350m-mono" +model_name_no_kv_cache: None prompt: "\ndef Fibonacci(n):\n # Check if input is 0 then it will\n # print incorrect input" precision: 0.0001 internal_kv_cache: [True, False] \ No newline at end of file diff --git a/tests/deepsparse/v2/integration_tests/configs/gpt_neo.yaml b/tests/deepsparse/v2/integration_tests/configs/gpt_neo.yaml index b422efc831..71c57e1f97 100644 --- a/tests/deepsparse/v2/integration_tests/configs/gpt_neo.yaml +++ b/tests/deepsparse/v2/integration_tests/configs/gpt_neo.yaml @@ -1,6 +1,7 @@ cadence: "commit" model_path: "hf:mgoin/TinyStories-1M-ds" torch_model_name: "roneneldan/TinyStories-1M" +model_name_no_kv_cache: "model-orig.onnx" prompt: "Didn't know what time it was, the lights were low\n I leaned back on my radio" precision: 0.001 internal_kv_cache: [True, False] \ No newline at end of file diff --git a/tests/deepsparse/v2/integration_tests/configs/opt.yaml b/tests/deepsparse/v2/integration_tests/configs/opt.yaml index ff2350dbe7..216d4c03ca 100644 --- 
a/tests/deepsparse/v2/integration_tests/configs/opt.yaml +++ b/tests/deepsparse/v2/integration_tests/configs/opt.yaml @@ -1,6 +1,7 @@ cadence: "nightly" model_path: "zoo:nlg/text_generation/opt-1.3b/pytorch/huggingface/opt_pretrain/base-none" torch_model_name: "facebook/opt-1.3b" +model_name_no_kv_cache: None prompt: "Didn't know what time it was, the lights were low\n I leaned back on my radio" precision: 0.0001 internal_kv_cache: [True, False] \ No newline at end of file diff --git a/tests/deepsparse/v2/integration_tests/test_llms.py b/tests/deepsparse/v2/integration_tests/test_llms.py index 34a8f7a258..321070f276 100644 --- a/tests/deepsparse/v2/integration_tests/test_llms.py +++ b/tests/deepsparse/v2/integration_tests/test_llms.py @@ -23,6 +23,8 @@ list of strings. model_path: The path to the model to be tested (sparsezoo stub/hf model path/local_path) + model_name_no_kv_cache: The name of the onnx model without + the KV cache support torch_model_name: The name of the torch model (to generate ground truth info) prompt: The prompt to use for testing @@ -33,7 +35,6 @@ values: [True], [False] or [True, False] (to test both external and internal KV cache management) """ -import os from typing import List, Tuple import numpy @@ -41,8 +42,10 @@ import pytest from deepsparse.transformers.pipelines.text_generation import TextGenerationOutput from deepsparse.v2.pipeline import Pipeline -from deepsparse.v2.text_generation import TextGenerationPipeline -from sparsezoo import Model +from deepsparse.v2.text_generation import ( + TextGenerationPipeline, + TextGenerationPipelineNoCache, +) from tests.deepsparse.transformers.pipelines.integration_tests.helpers import ( TorchGroundTruthSource, parse_params, @@ -69,7 +72,7 @@ class TestsIntegrationLLMsPipelines: the text generation pipeline. """ - def get_pipeline(self, **kwargs) -> Pipeline: + def get_pipeline(self, kv_cache_support=True, **kwargs) -> Pipeline: """ If no kwargs provided, returns the cached "default" pipeline that is used for most of the tests. @@ -82,9 +85,14 @@ def get_pipeline(self, **kwargs) -> Pipeline: "default" pipeline is returned) :return: the appropriate pipeline """ + text_generation_pipeline_class = ( + TextGenerationPipeline + if kv_cache_support + else TextGenerationPipelineNoCache + ) if not kwargs: if self.default_pipeline is None: - self.default_pipeline = TextGenerationPipeline( + self.default_pipeline = text_generation_pipeline_class( **self.default_pipeline_kwargs ) return self.default_pipeline @@ -92,7 +100,7 @@ def get_pipeline(self, **kwargs) -> Pipeline: # return a pipeline with the updated default kwargs updated_kwargs = self.default_pipeline_kwargs.copy() updated_kwargs.update(kwargs) - return TextGenerationPipeline(**updated_kwargs) + return text_generation_pipeline_class(**updated_kwargs) @pytest.fixture def setup(self, params_dict, max_new_tokens, internal_kv_cache): @@ -135,7 +143,7 @@ def test_ort_single_token_prefill(self, setup): pipeline = self.get_pipeline( prompt_sequence_length=1, - engine_type="onnxruntime", + engine_kwargs=dict(engine_type="onnxruntime"), ) output = pipeline( prompt=self.prompt, @@ -163,7 +171,7 @@ def test_ort_multi_token_prefill(self, setup): "Cannot run ORT pipeline with the internal deepsparse cache enabled." 
) pipeline = self.get_pipeline( - engine_type="onnxruntime", + engine_kwargs=dict(engine_type="onnxruntime"), ) output = pipeline( prompt=self.prompt, @@ -227,37 +235,27 @@ def test_deepsparse_multi_token_prefill(self, setup): run_kv_cache_validation=not self.internal_kv_cache, ) - @pytest.mark.skip( - "This test is skipped because we do " - "not have support for non-kv-cache models yet" - ) def test_inference_no_kv_cache_deepsparse(self, setup): self._test_inference_no_kv_cache(engine_type="deepsparse") - @pytest.mark.skip( - "This test is skipped because we do " - "not have support for non-kv-cache models yet" - ) def test_inference_no_kv_cache_ort(self, setup): self._test_inference_no_kv_cache(engine_type="onnxruntime") def _test_inference_no_kv_cache(self, engine_type): - model_path_no_cache = self._get_model_path_no_cache() pipeline = self.get_pipeline( - model_path=model_path_no_cache, engine_type=engine_type - ) - assert not pipeline.cache_support_enabled, ( - "This pipeline test inference using non-kv cache " - "model and thus should not support kv cache" + onnx_model_name=self.model_name_no_kv_cache, + kv_cache_support=False, + engine_kwargs=dict(engine_type=engine_type), ) output = pipeline( - self.prompt, max_length=1, output_scores=True, include_prompt_logits=True + prompt=self.prompt, + include_prompt_logits=True, + generation_kwargs=dict(output_scores=True), ) - prompt_length = self.torch_ground_truth[1].shape[1] - # prompt logits + one logit for the new generated token - logits = output.generations[0].score[-(prompt_length + 1) :, :] - # compute ground truth logits analogously + + logits = output.generations[0].score + # logits -> prompt logits + one logit for the new generated token generated_logits, prompt_logits, *_ = self.torch_ground_truth logits_gt = numpy.concatenate( [prompt_logits[0], generated_logits[0, :1, :]], axis=0 @@ -318,51 +316,3 @@ def _test_kv_cache_state( assert numpy.allclose( x[:, :, -start_index:-end_index, :], y, atol=self.precision ) - - def _get_model_path_no_cache(self): - if not self.model_path.startswith("zoo:"): - pytest.skip("For this test, for now only the zoo model is supported") - model = Model(self.model_path) - # fetch the necessary file names for pipeline creation - required_file_names = [ - os.path.basename(file.name) for file in model.deployment.files - ] - training_directory = model.training - onnx_model_name_no_cache = [ - os.path.basename(file.name) - for file in model.training.files - if file.name.endswith(".onnx") - ][0] - - # check if 'training' exists, - # if not, download the files - if "training" not in os.listdir(model._path): - for filename in required_file_names: - # download the files to a training directory - if filename.endswith(".data"): - # data files are typically stored in a deployment directory - # download them to training - file = model.deployment.get_file(filename) - assert ( - file is not None - ), f"Unable to find file {filename} in model {model}" - file.name = file.name.replace("deployment", "training") - file.download() - continue - - if filename.endswith(".onnx"): - # instead of `model.onnx` the onnx_model_name_no_cache - # should be downloaded - filename = filename.replace("model.onnx", onnx_model_name_no_cache) - - file = training_directory.get_file(filename) - assert ( - file is not None - ), f"Unable to find file {filename} in model {model}" - file.download() - # rename the model file to `model.onnx` - os.rename( - os.path.join(training_directory.path, onnx_model_name_no_cache), - 
os.path.join(training_directory.path, "model.onnx"), - ) - return training_directory._path From c858b1f603622881b330ef942d9bdfaca5bcb846 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 21 Nov 2023 10:39:02 -0500 Subject: [PATCH 35/43] [Pipeline Refactor][Text Generation][Continuous Batching] Integration (#1409) * update split/join * use map * update * run end-to-end * clean-up * fix bug with batch size, introduce SplitRoute dataclass * update tests to use new inputs/outputs * use the normal scheduler for internal kv_cache * add pipeline inpuits * clean-up * change engine type, update docstrings, update override function to be more generic * move subgraph functionality to its own function; clean-up cont batching in text gen pipeline * update linear pathway to also use subgraph execution * rebase fix * fix tests --- .../v2/operators/engine_operator.py | 12 +- src/deepsparse/v2/operators/operator.py | 5 +- src/deepsparse/v2/pipeline.py | 224 +++++++++++------- src/deepsparse/v2/routers/router.py | 2 - .../continuous_batching_scheduler.py | 20 +- .../utils/continuous_batching_executor.py | 2 +- .../compile_generated_tokens.py | 2 +- .../v2/text_generation/compile_logits.py | 14 +- .../v2/text_generation/generate_new_token.py | 12 +- .../v2/text_generation/nl_engine_operator.py | 184 +++++++++++--- src/deepsparse/v2/text_generation/pipeline.py | 48 +++- src/deepsparse/v2/utils/__init__.py | 4 + src/deepsparse/v2/utils/data.py | 39 +++ src/deepsparse/v2/utils/helpers.py | 37 +++ .../v2/integration_tests/test_llms.py | 6 +- .../v2/unit/text_generation/conftest.py | 11 +- .../v2/unit/text_generation/test_misc.py | 13 +- .../text_generation/test_process_inputs.py | 6 +- .../test_single_token_engine.py | 6 +- .../text_generation/test_token_generation.py | 10 +- 20 files changed, 486 insertions(+), 171 deletions(-) create mode 100644 src/deepsparse/v2/utils/data.py create mode 100644 src/deepsparse/v2/utils/helpers.py diff --git a/src/deepsparse/v2/operators/engine_operator.py b/src/deepsparse/v2/operators/engine_operator.py index 9ee8d734c5..630de2d5bd 100644 --- a/src/deepsparse/v2/operators/engine_operator.py +++ b/src/deepsparse/v2/operators/engine_operator.py @@ -20,7 +20,7 @@ from deepsparse import Context as EngineContext from deepsparse import Engine, MultiModelEngine, Scheduler from deepsparse.benchmark import ORTEngine -from deepsparse.utils import model_to_path +from deepsparse.utils import join_engine_outputs, model_to_path, split_engine_inputs from deepsparse.v2.operators import Operator @@ -29,12 +29,12 @@ SUPPORTED_PIPELINE_ENGINES = [DEEPSPARSE_ENGINE, ORT_ENGINE] -__all__ = ["EngineOperator"] +__all__ = ["EngineOperator", "EngineOperatorInputs", "EngineOperatorOutputs"] class EngineOperatorInputs(BaseModel): engine_inputs: List = Field(description="engine_inputs") - engine: Optional[Engine] = Field( + engine: Optional[Union[ORTEngine, Engine]] = Field( description="override the engine to run forward pass with", default=None, ) @@ -95,8 +95,8 @@ def __init__( engine_kwargs: Dict = None, ): self.model_path = model_to_path(model_path) - self._batch_size = 1 self.engine_context = engine_context + self._batch_size = 1 if self.engine_context is not None: num_cores = num_cores or self.engine_context.num_cores @@ -131,6 +131,7 @@ def batch_size(self) -> int: """ return self._batch_size + # TODO: maybe add a few args to make this less opaque? 
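+ # Note: callers may pass an overriding "model_path" through kwargs (popped in
+ # the body below); the continuous batching scheduler uses this to compile
+ # engines for additional batch sizes against an overridden ONNX file.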
def create_engine( self, **kwargs, @@ -142,7 +143,8 @@ def create_engine( constructor/compilation :return: inference engine """ - onnx_file_path = self.model_path + + onnx_file_path = kwargs.pop("model_path", self.model_path) engine_args = deepcopy(self._engine_args) engine_args.update(kwargs) engine_type = self._engine_type.lower() diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py index 5bb0be841a..2923862b12 100644 --- a/src/deepsparse/v2/operators/operator.py +++ b/src/deepsparse/v2/operators/operator.py @@ -17,7 +17,7 @@ from pydantic import BaseModel -from deepsparse.v2.utils import InferenceState, PipelineState +from deepsparse.v2.utils import InferenceState __all__ = ["Operator"] @@ -57,7 +57,6 @@ def __call__( self, *args, inference_state: InferenceState, - pipeline_state: PipelineState, **kwargs, ) -> Any: """ @@ -90,13 +89,11 @@ def __call__( run_output = self.run( inference_input, inference_state=inference_state, - pipeline_state=pipeline_state, ) else: run_output = self.run( *args, inference_state=inference_state, - pipeline_state=pipeline_state, **kwargs, ) if self.has_output_schema(): diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py index f56680d2b9..78d112a2b3 100644 --- a/src/deepsparse/v2/pipeline.py +++ b/src/deepsparse/v2/pipeline.py @@ -15,13 +15,18 @@ import copy from concurrent.futures import Future -from functools import partial -from typing import Any, Callable, Dict, List, Union +from typing import Any, Dict, List, Union -from deepsparse.v2.operators import Operator +from deepsparse.v2.operators import EngineOperator, Operator from deepsparse.v2.routers import Router -from deepsparse.v2.schedulers import OperatorScheduler, SchedulerGroup +from deepsparse.v2.schedulers import ( + ContinuousBatchingScheduler, + OperatorScheduler, + SchedulerGroup, +) from deepsparse.v2.utils import InferenceState, PipelineState +from deepsparse.v2.utils.data import SubGraph +from deepsparse.v2.utils.helpers import run_func __all__ = ["Pipeline"] @@ -50,6 +55,7 @@ def __init__( ops: Union[Dict[str, Operator], List[Operator]], router: Router, schedulers: List[OperatorScheduler], + continuous_batching_scheduler: ContinuousBatchingScheduler, pipeline_state: PipelineState = None, ): @@ -57,32 +63,92 @@ def __init__( self.router = router self.schedulers = schedulers self.pipeline_state = pipeline_state + self._continuous_batching_scheduler = continuous_batching_scheduler self.validate() self._scheduler_group = SchedulerGroup(self.schedulers) - def _run_sequential( + def _run_next( self, inp: Any, inference_state: InferenceState, - pipeline_state: PipelineState, - start: str, - end: str, + next_step: str, ): - next_step = start - while next_step != end: - outputs = self._run_next_step( - func=self.ops[next_step], - next_step=next_step, - input=inp, - pipeline_state=pipeline_state, - inference_state=inference_state, + if ( + isinstance(self.ops[next_step], EngineOperator) + and self._continuous_batching_scheduler + ): + func = self._continuous_batching_scheduler.submit + inp = self.ops[next_step].input_schema(**inp) + else: + func = self._scheduler_group.submit + + return run_func( + func=func, + operator=self.ops[next_step], + inp=inp, + pipeline_state=self.pipeline_state, + inference_state=inference_state, + ) + + def _run_sub_graphs( + self, sub_graph_inputs: List[Any], sub_graphs: List[SubGraph] + ) -> List[Any]: + """ + Run a list of sub_graphs asynchronously. 
Polls to identify the sub graph that is + still running but has completed its current step. Schedules the next step + subgraph step. This is repeated until all subgraphs have finished running and + have reached their end step (stored in the Subgraph.end attribute). + + :param sub_graph_inputs: A list of inputs that should be passed to each + subgraph. Each subgraph is given an element of the list as input to its + first node. + :param sub_graphs: A list of Subgraph objects. Each stores the relevant + execution information for the particular subgraph, such as its current step + in the sub graph, inference state, output, and end step. + + :returns: a list of outputs for all the completed Subgraph objects. Returned + in the same order that the subgraphs were passed to the function. + """ + for i in range(len(sub_graphs)): + sub_graphs[i].output = self._run_next( + sub_graph_inputs[i], sub_graphs[i].inf, sub_graphs[i].step ) - next_step, operator_output, state_update = outputs - if state_update: - inference_state.update_state(state_update) - inp = operator_output - return inp + + # Execute all sub graphs until all graphs have been completed. + while True: + for sub_graph in sub_graphs: + if isinstance(sub_graph.output, Future) and sub_graph.output.done(): + # get the result for the completed operator; resolve its output + operator_output = sub_graph.output.result() + operator_output = sub_graph.parse_output(operator_output) + + # determine the next step for the particular operator, using + # its previous output and previously stored step + next_step = self.router.next( + sub_graph.step, self.ops, operator_output + ) + # update the step + sub_graph.step = next_step + + # store the output for the next step. If the next step is + # end step, this particular route has completed. Simply + # update the output value + if next_step in sub_graph.end: + sub_graph.output = operator_output + else: + sub_graph.output = self._run_next( + inp=operator_output, + inference_state=sub_graph.inf, + next_step=next_step, + ) + break + + # keep running until all sub graphs have completed. + if not any(isinstance(x.output, Future) for x in sub_graphs): + break + + return [x.output for x in sub_graphs] def _apply_split(self, inp: Any, inference_state: InferenceState): """ @@ -93,59 +159,29 @@ def _apply_split(self, inp: Any, inference_state: InferenceState): """ batches, orig_batch_size = self.expand_inputs(inp, 1) - run_with_state = partial( - self._run_sequential, - pipeline_state=self.pipeline_state, - start=self.router.route[self.router.SPLIT_ROUTE], - end=self.router.JOIN_ROUTE, - ) - inference_state_list = [ - copy.deepcopy(inference_state) for x in range(len(batches)) - ] - futures = self._scheduler_group.map( - batches, - inference_state_list, - func=run_with_state, - ) - return self.condense_inputs([x.result() for x in futures]) - def _run_next_step( - self, - *args, - func: Callable, - next_step: Union[str, int], - input: Any = None, - **kwargs, - ): - """ - Generic function to run a given func, process the output and determine the next - step. - """ - if input: - operator_output = ( - func(*args, **kwargs, **input) - if isinstance(input, dict) - else func(input, *args, **kwargs) + # Create a list of SplitRoutes, per batch size 1 + # Each SplitRoute object holds information about the particular path it + # follows. All start at the same step defined by SPLIT_ROUTE and start + # with the same inference_state. 
+ split_graphs = [ + SubGraph( + inf=copy.deepcopy(inference_state), + step=self.router.route[self.router.SPLIT_ROUTE], + end=[self.router.JOIN_ROUTE], ) - else: - operator_output = func(*args, **kwargs) - - if isinstance(operator_output, Future): - operator_output = operator_output.result() - - state_update = None - if isinstance(operator_output, tuple): - state_update = operator_output[-1] - operator_output = operator_output[0] + for i in range(len(batches)) + ] - next_step = self.router.next(next_step, self.ops, operator_output) - return next_step, operator_output, state_update + outputs = self._run_sub_graphs( + sub_graph_inputs=batches, sub_graphs=split_graphs + ) + return self.condense_inputs(outputs) def run( self, *args, inference_state: InferenceState, - pipeline_state: PipelineState, **kwargs, ): """ @@ -158,36 +194,56 @@ def run( """ next_step = self.router.START_ROUTE operator_output = None - while next_step != self.router.END_ROUTE: + + # Split Grap Execution (i.e multiple subgraphs) # NOTE: split_route should only appear after the start route node if next_step == self.router.SPLIT_ROUTE: + if operator_output is None: + raise ValueError( + f"{self.router.SPLIT_ROUTE} should appear after " + f"{self.ROUTER.START_ROUTE}" + ) + operator_output = self._apply_split(operator_output, inference_state) next_step = self.router.route[self.router.JOIN_ROUTE] + if next_step == self.router.END_ROUTE: + return operator_output if next_step == self.router.START_ROUTE: - outputs = self._run_next_step( + operator_output = run_func( *args, - next_step=next_step, func=self._scheduler_group.submit, - inference_state=inference_state, operator=self.ops[next_step], - pipeline_state=pipeline_state, + inference_state=inference_state, + pipeline_state=self.pipeline_state, **kwargs, - ) + ).result() + + if isinstance(operator_output, tuple): + operator_output, state_update = ( + operator_output[0], + operator_output[-1], + ) + inference_state.update_state(state_update) + + next_step = self.router.next(next_step, self.ops, operator_output) + else: - outputs = self._run_next_step( - func=self._scheduler_group.submit, - input=operator_output, - next_step=next_step, - inference_state=inference_state, - operator=self.ops[next_step], - pipeline_state=pipeline_state, + # Single graph execution + graph = SubGraph( + inf=copy.deepcopy(inference_state), + step=next_step, + end=[self.router.SPLIT_ROUTE, self.router.END_ROUTE], ) - next_step, operator_output, state_update = outputs - if state_update: - inference_state.update_state(state_update) + operator_output = self._run_sub_graphs( + sub_graph_inputs=[operator_output], sub_graphs=[graph] + )[0] + + inference_state = graph.inf + next_step = graph.step + return operator_output def __call__(self, *args, **kwargs): @@ -204,11 +260,7 @@ def __call__(self, *args, **kwargs): inference_state = InferenceState() inference_state.create_state({}) - if "pipeline_state" in kwargs: - self.pipeline_state = kwargs.get("pipeline_state") - kwargs["inference_state"] = inference_state - kwargs["pipeline_state"] = self.pipeline_state return self.run(*args, **kwargs) diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py index 6b0d851aef..6740f706f1 100644 --- a/src/deepsparse/v2/routers/router.py +++ b/src/deepsparse/v2/routers/router.py @@ -83,8 +83,6 @@ class LinearRouter(Router): def __init__(self, end_route: int, start_route: int = 0): super().__init__(end_route=end_route, start_route=start_route) - self.SPLIT_ROUTE = None - self.JOIN_ROUTE = None 
_LOGGER.warn("SPLIT and JOIN are not yet supported for the LinearRouter.") def next( diff --git a/src/deepsparse/v2/schedulers/continuous_batching_scheduler.py b/src/deepsparse/v2/schedulers/continuous_batching_scheduler.py index 669c5922a0..cc74ac0996 100644 --- a/src/deepsparse/v2/schedulers/continuous_batching_scheduler.py +++ b/src/deepsparse/v2/schedulers/continuous_batching_scheduler.py @@ -50,7 +50,7 @@ def __init__(self): engine_operator = EngineOperator(...) ... continuous_batching_scheduler = ContinuousBatchingScheduler.get_instance() - continuous_batching_scheduler.add_engine_operator(engine_operator) + continuous_batching_scheduler.add_engine_operator(engine_operator, [1]) super.__init__(...) ``` @@ -58,6 +58,8 @@ def __init__(self): :param max_workers: maximum number of threads to execute at once, default 1 """ + # TODO: If the singleton always returns max_workers 1, should we remove this arg/not + # give the user a choice? def __init__(self, max_workers: int = 1): self._max_workers = max_workers @@ -82,6 +84,8 @@ def get_instance(cls) -> "ContinuousBatchingScheduler": does not exist yet, a scheduler with a single worker thread to schedule all jobs is created and started """ + global _GLOBAL_SCHEDULER + if _GLOBAL_SCHEDULER is not None: return _GLOBAL_SCHEDULER # noqa: F823 @@ -161,8 +165,18 @@ def add_engine_operator( for batch_size in batch_sizes: if batch_size == 1: continue # already added - operator_engines[batch_size] = operator_engines.create_engine( - batch_size=batch_size + + override_model_path = None + # text generation/NLEngineOperator specific; could add generic method + # for all engine_operators, if desired + if hasattr(engine_operator, "override_model_inputs"): + override_model_path = engine_operator.override_model_inputs( + model_path=engine_operator.model_path, batch_size=batch_size + ) + + # will break for internal kv_cache; needs additional argument + operator_engines[batch_size] = engine_operator.create_engine( + batch_size=batch_size, model_path=override_model_path ) self._operators_to_engines[engine_operator] = operator_engines diff --git a/src/deepsparse/v2/schedulers/utils/continuous_batching_executor.py b/src/deepsparse/v2/schedulers/utils/continuous_batching_executor.py index 86afdf309c..40ff00ca4f 100644 --- a/src/deepsparse/v2/schedulers/utils/continuous_batching_executor.py +++ b/src/deepsparse/v2/schedulers/utils/continuous_batching_executor.py @@ -71,7 +71,7 @@ def _working_loop(self): ] # run the engine operator with the given engine at the joined batch size - joined_outputs = engine_operator(joined_inputs) + joined_outputs = engine_operator(joined_inputs, inference_state=None) # split outputs and return the results to their respective futures split_outputs = joined_outputs.split() diff --git a/src/deepsparse/v2/text_generation/compile_generated_tokens.py b/src/deepsparse/v2/text_generation/compile_generated_tokens.py index c87436ab3a..630067f8c3 100644 --- a/src/deepsparse/v2/text_generation/compile_generated_tokens.py +++ b/src/deepsparse/v2/text_generation/compile_generated_tokens.py @@ -42,7 +42,7 @@ def run( if finish_reason is not None: in_generation = False - state_update = { # TODO: check if necessary + state_update = { "finished_reason": finished_reason, "generated_tokens": generated_tokens, "generated_logits": generated_logits, diff --git a/src/deepsparse/v2/text_generation/compile_logits.py b/src/deepsparse/v2/text_generation/compile_logits.py index 21bd50e03e..48a7158f66 100644 --- 
a/src/deepsparse/v2/text_generation/compile_logits.py +++ b/src/deepsparse/v2/text_generation/compile_logits.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any - from deepsparse.v2.operators import Operator +from deepsparse.v2.text_generation.nl_engine_operator import NLEngineOutputs from deepsparse.v2.utils import InferenceState @@ -28,12 +27,13 @@ class CompilePromptLogits(Operator): take prompt logits from each iteration run and update the inference state. """ - def can_operate(self, inp: Any): - if inp.get("in_generation") is None: + def can_operate(self, inp: NLEngineOutputs): + if inp.in_generation is None: return True return False - def run(self, logits, inference_state: InferenceState, **kwargs): + def run(self, inp: NLEngineOutputs, inference_state: InferenceState, **kwargs): + logits = inp.engine_outputs logit_type = "prompt_logits" if inference_state.current_state.get(logit_type) is not None: @@ -44,6 +44,6 @@ def run(self, logits, inference_state: InferenceState, **kwargs): state_update = {logit_type: current_logits} return { - "kv_cache": kwargs.get("kv_cache"), - "tokens": kwargs.get("tokens"), + "kv_cache": inp.kv_cache, + "tokens": inp.tokens, }, state_update diff --git a/src/deepsparse/v2/text_generation/generate_new_token.py b/src/deepsparse/v2/text_generation/generate_new_token.py index 33ab546e39..5bf48bbdbc 100644 --- a/src/deepsparse/v2/text_generation/generate_new_token.py +++ b/src/deepsparse/v2/text_generation/generate_new_token.py @@ -11,12 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Sequence, Union +from typing import Sequence, Union import transformers from deepsparse.transformers.pipelines.text_generation import FinishReason from deepsparse.v2.operators import Operator +from deepsparse.v2.text_generation.nl_engine_operator import NLEngineOutputs from deepsparse.v2.utils import InferenceState @@ -30,12 +31,15 @@ def __init__( self.force_max_tokens = force_max_tokens self.tokenizer = tokenizer - def can_operate(self, inp: Any): - if inp.get("in_generation"): + def can_operate(self, inp: NLEngineOutputs): + if inp.in_generation: return True return False - def run(self, logits, kv_cache, inference_state: InferenceState, **kwargs): + def run(self, inp: NLEngineOutputs, inference_state: InferenceState, **kwargs): + logits = inp.engine_outputs + kv_cache = inp.kv_cache + token_generator = inference_state.current_state.get("token_generator") token = token_generator.generate(logits=logits[0, -1, :]) finish_reason = None diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index 7549f986d9..d8c80bbaee 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -14,10 +14,13 @@ import copy import os -from typing import Any, List, Tuple +from pathlib import Path +from typing import Any, List, Optional, Tuple, Union +import numpy from pydantic import BaseModel, Field +from deepsparse.utils import join_engine_outputs, split_engine_inputs from deepsparse.utils.onnx import ( CACHE_INPUT_PREFIX, overwrite_onnx_model_inputs_for_kv_cache_models, @@ -29,14 +32,76 @@ ) -__all__ = ["NLEngineOperator", "NlEngineInput"] +__all__ = ["NLEngineOperator", "NLEngineInputs"] -class 
NlEngineInput(BaseModel): - engine_inputs: List = Field(description="engine inputs") +class NLEngineInputs(BaseModel): + engine_inputs: List = Field(description="engine_inputs") kv_cache: Any = Field(description="kv_cache object") tokens: List = Field(description="tokens") - in_generation: bool = Field(description="in_generation", default=None) + in_generation: Any = Field(description="in_generation", default=None) + engine: Optional[Any] = Field( + description="override the engine to run forward pass with", + default=None, + ) + + @classmethod + def join(cls, inputs: List["NLEngineInputs"]) -> "NLEngineInputs": + """ + :param inputs: list of separate EngineOperatorInputs, batch size must be 1 + :return: list of inputs joined into a single input with a multi batch size + """ + all_engine_inputs = [] + all_kv_cache = [] + all_tokens = [] + all_generation = [] + + for engine_input in inputs: + all_engine_inputs.append(engine_input.engine_inputs) + all_kv_cache.append(engine_input.kv_cache) + all_tokens.append(engine_input.tokens) + all_generation.append(engine_input.in_generation) + + for engine_inputs in all_engine_inputs: + if engine_inputs[0].shape[0] != 1: + raise RuntimeError( + "join requires all inputs to have batch size 1, found input with " + f"batch size {engine_inputs[0].shape[0]}" + ) + return cls( + engine_inputs=all_engine_inputs, + tokens=all_tokens, + in_generation=all_generation, + kv_cache=all_kv_cache, + ) + + class Config: + arbitrary_types_allowed = True + + +class NLEngineOutputs(BaseModel): + engine_outputs: Any = Field(description="engine_outputs") + kv_cache: Any = Field(description="kv_cache object") + tokens: List = Field(description="tokens") + in_generation: Any = Field(description="in_generation", default=None) + + def split(self) -> List["NLEngineOutputs"]: + """ + :return: list of the current outputs split to a batch size of 1 each + """ + split_outputs = [ + numpy.expand_dims(self.engine_outputs[i], 0) + for i in range(len(self.engine_outputs)) + ] + return [ + self.__class__( + engine_outputs=split_outputs[i], + kv_cache=self.kv_cache[i], + tokens=self.tokens[i], + in_generation=self.in_generation[i], + ) + for i in range(len(split_outputs)) + ] class NLEngineOperator(EngineOperator): @@ -48,8 +113,8 @@ class NLEngineOperator(EngineOperator): multi-token case. 
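Reviewer aside: `NLEngineInputs.join` / `NLEngineOutputs.split` above are the contract the continuous batching executor leans on -- requests arrive at batch size 1, get stacked for a single multi-batch forward pass, and the stacked outputs are handed back per request. A rough numpy-only illustration (placeholder shapes; the multiplication stands in for the engine call):

```python
import numpy

# two batch-size-1 requests, e.g. logits-shaped arrays
requests = [numpy.ones((1, 4)), 2 * numpy.ones((1, 4))]

# "join": stack along the batch dimension for one multi-batch forward pass
joined = numpy.concatenate(requests, axis=0)  # shape (2, 4)
outputs = joined * 10                         # stand-in for the engine

# "split": hand each caller back a batch-size-1 slice of its own result
split = [numpy.expand_dims(outputs[i], 0) for i in range(len(outputs))]
assert [o.shape for o in split] == [(1, 4), (1, 4)]
```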
""" - input_schema = NlEngineInput - output_schema = None + input_schema = NLEngineInputs + output_schema = NLEngineOutputs def __init__( self, @@ -59,17 +124,17 @@ def __init__( **kwargs, ): + self.sequence_length = sequence_length + self.input_ids_length = input_ids_length self.kv_cache_data_type = None - ( - onnx_file_path, - output_indices_to_be_cached, - kv_cache_data_type, - ) = overwrite_onnx_model_inputs_for_kv_cache_models( - onnx_file_path=kwargs.get("model_path"), - batch_size=kwargs.get("batch_size", 1), - sequence_length=sequence_length, - input_ids_length=input_ids_length, + self.internal_kv_cache = internal_kv_cache + self.model_path = kwargs.get("model_path") + (onnx_file_path, additional_outputs) = self.override_model_inputs( + self.model_path, batch_size=1, return_additional_outputs=True ) + output_indices_to_be_cached, kv_cache_data_type, = additional_outputs.get( + "output_indices_to_be_cached" + ), additional_outputs.get("kv_cache_data_type") engine_kwargs = kwargs.get("engine_kwargs", {}) if kwargs.get("engine_type", DEEPSPARSE_ENGINE) == DEEPSPARSE_ENGINE: @@ -86,43 +151,95 @@ def __init__( kwargs["engine_kwargs"] = engine_kwargs kwargs["model_path"] = onnx_file_path + super().__init__(**kwargs) - self.input_ids_length = input_ids_length + def override_model_inputs( + self, + model_path: Union[str, Path], + batch_size: int, + return_additional_outputs=False, + ): + """ + Override the model based on the provided batch_size, sequence_length, + and input_ids_length. + + :param model_path: Path to the model + :param batch_size: The batch size to be used for the model + :return: new overwritten model file path. Optionally returns additional outputs + specific to the NLDecoder engine + """ + ( + onnx_file_path, + output_indices_to_be_cached, + kv_cache_data_type, + ) = overwrite_onnx_model_inputs_for_kv_cache_models( + onnx_file_path=model_path, + batch_size=batch_size, + sequence_length=self.sequence_length, + input_ids_length=self.input_ids_length, + ) + if return_additional_outputs: + return onnx_file_path, { + "output_indices_to_be_cached": output_indices_to_be_cached, + "kv_cache_data_type": kv_cache_data_type, + } + return onnx_file_path - def run(self, inp: NlEngineInput, **kwargs) -> Any: + def run(self, inp: NLEngineInputs, **kwargs) -> NLEngineOutputs: engine_input = inp.engine_inputs kv_cache = inp.kv_cache - inputs = self._add_kv_cache_to_input(engine_input, kv_cache) - if bool(kv_cache.engine_internal_cache): + split = True + if not isinstance(kv_cache, list): + split = False + kv_cache = [kv_cache] + engine_input = [engine_input] + + inputs = list(map(self._add_kv_cache_to_input, engine_input, kv_cache)) + + if bool(kv_cache[0].engine_internal_cache): # conventionally, before dispatching # inputs to the engine, we validate them # if val_inp=True. However, in this case # we want to pass the empty kv cache inputs # (batch_size=0) to the engine. 
Therefore, # we skip the validation + + # Internal kv_cache works for batch_size of 1 atm out = self.engine._eng_net.execute_list_out( - inputs, kv_cache.engine_internal_cache + inputs[0], kv_cache[0].engine_internal_cache ) else: # run the engine without the LIB.kv_cache object + # stack multiple batch inputs along the batch dimension + inputs = join_engine_outputs(inputs, len(inputs)) out = ( super() - .run(EngineOperatorInputs(engine_inputs=inputs), **kwargs) + .run( + EngineOperatorInputs(engine_inputs=inputs, engine=inp.engine), + **kwargs, + ) .get("engine_outputs") ) + # logits should be stacked along batch dim + # kv_cache_state should be a list where each dim 0 is batch_size logits, *kv_cache_state = out - self._update_kv_cache( - kv_cache_state=kv_cache_state, - input_ids_len=self.input_ids_length, - kv_cache=kv_cache, - ) + kv_cache_state, _ = split_engine_inputs(kv_cache_state, 1) + + if len(kv_cache_state) > 0: + for i in range(len(kv_cache)): + self._update_kv_cache( + kv_cache_state=kv_cache_state[i], kv_cache=kv_cache[i] + ) + else: + # internal kv cache case + self._update_kv_cache(kv_cache=kv_cache[0]) output = { - "logits": logits, - "kv_cache": kv_cache, + "engine_outputs": logits, + "kv_cache": kv_cache if split else kv_cache[0], "tokens": inp.tokens, "in_generation": inp.in_generation, } @@ -137,9 +254,9 @@ def _add_kv_cache_to_input(self, engine_input, kv_cache): new_inp = [kv_cache_state[name] for name in self.engine.input_names] return new_inp - def _update_kv_cache(self, kv_cache_state, input_ids_len, kv_cache): + def _update_kv_cache(self, kv_cache, kv_cache_state=None): if bool(kv_cache.engine_internal_cache): - kv_cache.total_num_processed_tokens += input_ids_len + kv_cache.total_num_processed_tokens += self.input_ids_length return kv_cache_state = { @@ -147,10 +264,7 @@ def _update_kv_cache(self, kv_cache_state, input_ids_len, kv_cache): for name, array in zip(self.onnx_input_names_cached, kv_cache_state) } - kv_cache.update( - state=kv_cache_state, - input_ids_len=input_ids_len, - ) + kv_cache.update(state=kv_cache_state, input_ids_len=self.input_ids_length) @property def onnx_input_names_no_cache(self) -> List[str]: diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 5ab73f7a48..ae7334cffd 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -12,14 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Dict, Optional +import logging +from typing import Dict, List, Optional from deepsparse.transformers.helpers import setup_transformers_pipeline from deepsparse.transformers.utils.helpers import process_generation_config from deepsparse.utils import split_engine_inputs +from deepsparse.v2.operators import EngineOperator from deepsparse.v2.pipeline import Pipeline from deepsparse.v2.routers import GraphRouter -from deepsparse.v2.schedulers import OperatorScheduler +from deepsparse.v2.schedulers import ContinuousBatchingScheduler, OperatorScheduler from deepsparse.v2.text_generation import ( AutoRegressiveOperatorPreprocess, CompileGeneratedTokens, @@ -39,6 +41,9 @@ from deepsparse.v2.utils import PipelineState +_LOGGER = logging.getLogger(__name__) + + class TextGenerationPipeline(Pipeline): def __init__( self, @@ -48,6 +53,7 @@ def __init__( internal_kv_cache: bool = True, force_max_tokens: bool = False, generation_config=None, + continuous_batch_sizes: Optional[List[int]] = None, engine_kwargs: Optional[Dict] = None, ): ( @@ -133,6 +139,20 @@ def __init__( compile_generated_tokens = CompileGeneratedTokens() join_output = JoinOutput(tokenizer=self.tokenizer) + # TODO: do we want to support lists for different engines? + continuous_batching_scheduler = None + if continuous_batch_sizes: + if internal_kv_cache: + _LOGGER.warn( + "internal kv_cache is currently not supported with continuous ", + "batching", + ) + else: + continuous_batching_scheduler = self._get_continuous_batching_scheduler( + batch_sizes=continuous_batch_sizes, + engines=[single_engine_operator, multi_engine_operator], + ) + ops = { "process_input": process_inputs, "single_engine": single_engine_operator, @@ -183,7 +203,11 @@ def __init__( ) scheduler = [OperatorScheduler()] super().__init__( - ops=ops, router=router, schedulers=scheduler, pipeline_state=pipeline_state + ops=ops, + router=router, + schedulers=scheduler, + pipeline_state=pipeline_state, + continuous_batching_scheduler=continuous_batching_scheduler, ) def expand_inputs(self, items, batch_size): @@ -194,3 +218,21 @@ def expand_inputs(self, items, batch_size): def condense_inputs(self, *args, **kwargs): return args[0], kwargs + + def _get_continuous_batching_scheduler( + self, batch_sizes: List[int], engines: List[EngineOperator] + ) -> ContinuousBatchingScheduler: + """ + Fetch the continuous batching scheduler. Requires adding the EngineOperator + that will run through the scheduler. + + :param batch_sizes: List of batch sizes to be used by the models + :param engine: List of EngineOperators which should be scheduled using the + continuous batching scheduler + + :returns: ContinuousBatchingScheduler + """ + continuous_batching_scheduler = ContinuousBatchingScheduler.get_instance() + for op in engines: + continuous_batching_scheduler.add_engine_operator(op, batch_sizes) + return continuous_batching_scheduler diff --git a/src/deepsparse/v2/utils/__init__.py b/src/deepsparse/v2/utils/__init__.py index 358405d7af..75935a9729 100644 --- a/src/deepsparse/v2/utils/__init__.py +++ b/src/deepsparse/v2/utils/__init__.py @@ -13,5 +13,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
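Reviewer aside: putting the scheduler plumbing above together, the intended user-facing flow looks roughly like the sketch below. The model path and batch sizes are placeholders, and `internal_kv_cache` has to be disabled for now, per the warning emitted in the constructor:

```python
from deepsparse.v2.text_generation import TextGenerationPipeline

pipeline = TextGenerationPipeline(
    model_path="zoo:placeholder/text_generation/model",  # placeholder stub
    internal_kv_cache=False,        # continuous batching is skipped otherwise
    continuous_batch_sizes=[2, 4],  # extra engines are compiled at these sizes
)
generations = pipeline(prompt="def fib(n):")
```

Under the hood this calls `_get_continuous_batching_scheduler`, which registers both the single-token and multi-token `NLEngineOperator`s with the shared `ContinuousBatchingScheduler` singleton.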
+from .helpers import * from .state import * from .types import * + + +from .data import * # isort:skip diff --git a/src/deepsparse/v2/utils/data.py b/src/deepsparse/v2/utils/data.py new file mode 100644 index 0000000000..40402734cf --- /dev/null +++ b/src/deepsparse/v2/utils/data.py @@ -0,0 +1,39 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Any, List + +from deepsparse.v2.utils import InferenceState + + +__all__ = ["SubGraph"] + + +@dataclass +class SubGraph: + """ + Helper dataclass to store information about each running sub graph. + """ + + step: int + inf: InferenceState + end: List[str] + output: Any = None + + def parse_output(self, operator_output: Any): + if isinstance(operator_output, tuple): + operator_output, state_update = operator_output[0], operator_output[-1] + self.inf.update_state(state_update) + return operator_output diff --git a/src/deepsparse/v2/utils/helpers.py b/src/deepsparse/v2/utils/helpers.py new file mode 100644 index 0000000000..1f4bedc6c9 --- /dev/null +++ b/src/deepsparse/v2/utils/helpers.py @@ -0,0 +1,37 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Callable + + +__all__ = ["run_func"] + + +def run_func( + *args, + func: Callable, + inp: Any = None, + **kwargs, +): + """ + Generic function to run a given Callable. + """ + if inp: + output = ( + func(*args, **kwargs, **inp) + if isinstance(inp, dict) + else func(inp, *args, **kwargs) + ) + else: + output = func(*args, **kwargs) + return output diff --git a/tests/deepsparse/v2/integration_tests/test_llms.py b/tests/deepsparse/v2/integration_tests/test_llms.py index 34a8f7a258..c53899f30c 100644 --- a/tests/deepsparse/v2/integration_tests/test_llms.py +++ b/tests/deepsparse/v2/integration_tests/test_llms.py @@ -135,7 +135,7 @@ def test_ort_single_token_prefill(self, setup): pipeline = self.get_pipeline( prompt_sequence_length=1, - engine_type="onnxruntime", + engine_kwargs={"engine_type": "onnxruntime"}, ) output = pipeline( prompt=self.prompt, @@ -163,7 +163,7 @@ def test_ort_multi_token_prefill(self, setup): "Cannot run ORT pipeline with the internal deepsparse cache enabled." 
) pipeline = self.get_pipeline( - engine_type="onnxruntime", + engine_kwargs={"engine_type": "onnxruntime"}, ) output = pipeline( prompt=self.prompt, @@ -244,7 +244,7 @@ def test_inference_no_kv_cache_ort(self, setup): def _test_inference_no_kv_cache(self, engine_type): model_path_no_cache = self._get_model_path_no_cache() pipeline = self.get_pipeline( - model_path=model_path_no_cache, engine_type=engine_type + model_path=model_path_no_cache, engine_kwargs={"engine_type": engine_type} ) assert not pipeline.cache_support_enabled, ( "This pipeline test inference using non-kv cache " diff --git a/tests/deepsparse/v2/unit/text_generation/conftest.py b/tests/deepsparse/v2/unit/text_generation/conftest.py index 5d8483e5f6..3840a9bb0a 100644 --- a/tests/deepsparse/v2/unit/text_generation/conftest.py +++ b/tests/deepsparse/v2/unit/text_generation/conftest.py @@ -19,15 +19,14 @@ import pytest from deepsparse.transformers.helpers import get_deployment_path -from deepsparse.transformers.pipelines.text_generation import TextGenerationInput +from deepsparse.transformers.pipelines.text_generation import ( + GenerationDefaults, + TextGenerationInput, +) from deepsparse.transformers.utils import DecoderKVCache from deepsparse.transformers.utils.helpers import initialize_kv_cache_state from deepsparse.v2 import InferenceState, PipelineState -from deepsparse.v2.text_generation import ( - GenerationDefaults, - NLEngineOperator, - TokenGeneratorOperator, -) +from deepsparse.v2.text_generation import NLEngineOperator, TokenGeneratorOperator @pytest.fixture(scope="module") diff --git a/tests/deepsparse/v2/unit/text_generation/test_misc.py b/tests/deepsparse/v2/unit/text_generation/test_misc.py index caa0cc2efd..f215e2aedb 100644 --- a/tests/deepsparse/v2/unit/text_generation/test_misc.py +++ b/tests/deepsparse/v2/unit/text_generation/test_misc.py @@ -13,16 +13,23 @@ # limitations under the License. from deepsparse.v2.text_generation import CompilePromptLogits +from deepsparse.v2.text_generation.nl_engine_operator import NLEngineOutputs -def test_compile_logits(mock_logits, mock_inference_state): +def test_compile_logits(mock_logits, mock_inference_state, mock_tokens, mock_kv_cache): mock_inference_state.update_state({"prompt_logits": [mock_logits]}) compile_prompt_logits = CompilePromptLogits() # Can operate as long as we're not in generation but in prompt_inference. This # can_operate() will check for the `in_generation` flag in the input. - assert compile_prompt_logits.can_operate({}) + inp = NLEngineOutputs( + engine_outputs=mock_logits, + tokens=mock_tokens, + kv_cache=mock_kv_cache, + in_generation=None, + ) + assert compile_prompt_logits.can_operate(inp=inp) output, state = compile_prompt_logits.run( - logits=mock_logits, inference_state=mock_inference_state + inp=inp, inference_state=mock_inference_state ) # The CompilePromptLogits is responsible for updating a list of prompt logits # calculated at each step during prompt inference. After one step of running this diff --git a/tests/deepsparse/v2/unit/text_generation/test_process_inputs.py b/tests/deepsparse/v2/unit/text_generation/test_process_inputs.py index be59db7475..02f4540c44 100644 --- a/tests/deepsparse/v2/unit/text_generation/test_process_inputs.py +++ b/tests/deepsparse/v2/unit/text_generation/test_process_inputs.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from deepsparse.v2.text_generation import ( - GenerationDefaults, - ProcessInputsTextGeneration, -) +from deepsparse.transformers.pipelines.text_generation import GenerationDefaults +from deepsparse.v2.text_generation import ProcessInputsTextGeneration def test_process_inputs( diff --git a/tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py b/tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py index 335a28fbe3..19bb4d1c4a 100644 --- a/tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py +++ b/tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py @@ -16,7 +16,7 @@ from deepsparse.v2.text_generation import ( AutoRegressiveOperatorPreprocess, - NlEngineInput, + NLEngineInputs, ) @@ -89,10 +89,10 @@ def test_run_single_token_engine_once( numpy.array([[0]]), numpy.array([[[[0, 0, 0, 0, 1]]]]), ] - inputs = NlEngineInput( + inputs = NLEngineInputs( engine_inputs=mock_engine_inputs, kv_cache=mock_kv_cache_single_token_engine, tokens=mock_engine_inputs[0].tolist(), ) output = single_token_engine_no_internal_cache.run(inputs) - assert output.get("logits") is not None + assert output.get("engine_outputs") is not None diff --git a/tests/deepsparse/v2/unit/text_generation/test_token_generation.py b/tests/deepsparse/v2/unit/text_generation/test_token_generation.py index fbd9e06778..d04f863171 100644 --- a/tests/deepsparse/v2/unit/text_generation/test_token_generation.py +++ b/tests/deepsparse/v2/unit/text_generation/test_token_generation.py @@ -18,6 +18,7 @@ PrepareGeneration, TokenGeneratorOperator, ) +from deepsparse.v2.text_generation.nl_engine_operator import NLEngineOutputs def test_prep_for_generation( @@ -68,6 +69,7 @@ def test_generate_new_token( mock_kv_cache, mock_inference_state, mock_logits, + mock_tokens, ): """ This test is responsible for testing the GenerateNewTokenOperator, which generates @@ -84,8 +86,14 @@ def test_generate_new_token( "generated_tokens": [mock_token_generator.tokens], } ) + inp = NLEngineOutputs( + engine_outputs=mock_logits, + tokens=mock_tokens, + kv_cache=mock_kv_cache, + in_generation=True, + ) outputs, state = generate_new_token.run( - logits=mock_logits, kv_cache=mock_kv_cache, inference_state=mock_inference_state + inp=inp, inference_state=mock_inference_state ) # The new_token generated/returned by ths operator should match the last token in # token_generator From bb3ff413f77927020016a8f16ae38606a5750218 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 21 Nov 2023 15:41:41 -0500 Subject: [PATCH 36/43] [Pipeline Refactor] Operator Registry (#1420) * initial registry functionality * use sparsezoo mixin --- src/deepsparse/v2/__init__.py | 1 + src/deepsparse/v2/operators/__init__.py | 1 + src/deepsparse/v2/operators/operator.py | 15 ++ src/deepsparse/v2/operators/registry.py | 76 +++++++ src/deepsparse/v2/task.py | 204 ++++++++++++++++++ src/deepsparse/v2/text_generation/pipeline.py | 2 + 6 files changed, 299 insertions(+) create mode 100644 src/deepsparse/v2/operators/registry.py create mode 100644 src/deepsparse/v2/task.py diff --git a/src/deepsparse/v2/__init__.py b/src/deepsparse/v2/__init__.py index 29fcd4126c..5fd33a9503 100644 --- a/src/deepsparse/v2/__init__.py +++ b/src/deepsparse/v2/__init__.py @@ -18,4 +18,5 @@ from .pipeline import * from .routers import * from .schedulers import * +from .task import * from .utils import * diff --git a/src/deepsparse/v2/operators/__init__.py b/src/deepsparse/v2/operators/__init__.py index bf58018493..ae14f2a373 100644 --- 
a/src/deepsparse/v2/operators/__init__.py +++ b/src/deepsparse/v2/operators/__init__.py @@ -16,3 +16,4 @@ # limitations under the License. from .operator import * from .engine_operator import * +from .registry import * diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py index 2923862b12..377088e09e 100644 --- a/src/deepsparse/v2/operators/operator.py +++ b/src/deepsparse/v2/operators/operator.py @@ -18,6 +18,7 @@ from pydantic import BaseModel from deepsparse.v2.utils import InferenceState +from deepsparse.v2.operators.registry import OperatorRegistry __all__ = ["Operator"] @@ -100,6 +101,20 @@ def __call__( return self.output_schema(**run_output) return run_output + @staticmethod + def create( + task: str, + **kwargs, + ) -> "Operator": + """ + :param task: Operator task + :param kwargs: extra task specific kwargs to be passed to task Operator + implementation + :return: operator object initialized for the given task + """ + operator_constructor = OperatorRegistry.get_task_constructor(task) + return operator_constructor(**kwargs) + @abstractmethod def run(self, *args, **kwargs) -> Any: """ diff --git a/src/deepsparse/v2/operators/registry.py b/src/deepsparse/v2/operators/registry.py new file mode 100644 index 0000000000..1b83b20728 --- /dev/null +++ b/src/deepsparse/v2/operators/registry.py @@ -0,0 +1,76 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Type + +from deepsparse.v2.task import SupportedTasks, dynamic_import_task +from sparsezoo.utils.registry import ( + RegistryMixin, + get_from_registry, + register, + registered_names, +) + + +__all__ = ["OperatorRegistry"] + + +class OperatorRegistry(RegistryMixin): + """ + Register operators with given task name(s). Leverages the RegistryMixin + functionality. + """ + + @classmethod + def register_value(cls, operator, name): + from deepsparse.v2.operators import Operator + + if not isinstance(name, list): + name = [name] + + for task_name in name: + register(Operator, operator, task_name, require_subclass=True) + + return operator + + @classmethod + def get_task_constructor(cls, task: str) -> Type["Operator"]: # noqa: F821 + """ + This function retrieves the class previously registered via + `OperatorRegistry.register` for `task`. + + If `task` starts with "import:", it is treated as a module to be imported, + and retrieves the task via the `TASK` attribute of the imported module. + + If `task` starts with "custom", then it is mapped to the "custom" task. 
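Reviewer aside: end to end, the registry added in this commit is meant to be used roughly as below; the task name and operator are illustrative, not part of the PR:

```python
from typing import Any

from deepsparse.v2.operators import Operator
from deepsparse.v2.operators.registry import OperatorRegistry


@OperatorRegistry.register(name="uppercase")  # illustrative task name
class UppercaseOperator(Operator):
    def run(self, inp: str, **kwargs) -> Any:
        return {"text": inp.upper()}


# resolves "uppercase" through OperatorRegistry.get_task_constructor
op = Operator.create(task="uppercase")
```

Runtime-registered names like the one above are validated by `SupportedTasks.check_register_task` via its `extra_tasks` argument, so they do not need to appear in the built-in task list.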
+ + :param task: The task name to get the constructor for + :return: The class registered to `task` + :raises ValueError: if `task` was not registered via `OperatorRegistry.register` + """ + from deepsparse.v2.operators import Operator + + if task.startswith("import:"): + # dynamically import the task from a file + task = dynamic_import_task(module_or_path=task.replace("import:", "")) + elif task.startswith("custom"): + # support any task that has "custom" at the beginning via the "custom" task + task = "custom" + else: + task = task.lower().replace("-", "_") + + tasks = registered_names(Operator) + # step needed to import relevant files required to load the operator + SupportedTasks.check_register_task(task, tasks) + return get_from_registry(Operator, task) diff --git a/src/deepsparse/v2/task.py b/src/deepsparse/v2/task.py new file mode 100644 index 0000000000..f1f4fc6d66 --- /dev/null +++ b/src/deepsparse/v2/task.py @@ -0,0 +1,204 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Classes and implementations for supported tasks in the DeepSparse pipeline and system +""" + +import importlib +import logging +import os +import sys +from collections import namedtuple +from typing import Iterable, List, Optional, Tuple + + +_LOGGER = logging.getLogger(__name__) + +__all__ = ["SupportedTasks", "AliasedTask"] + + +class AliasedTask: + """ + A task that can have multiple aliases to match to. + For example, question_answering which can alias to qa as well + + :param name: the name of the task such as question_answering or text_classification + :param aliases: the aliases the task can go by in addition to the name such as + qa, glue, sentiment_analysis, etc + """ + + def __init__(self, name: str, aliases: List[str]): + self._name = name + self._aliases = aliases + + @property + def name(self) -> str: + """ + :return: the name of the task such as question_answering + """ + return self._name + + @property + def aliases(self) -> List[str]: + """ + :return: the aliases the task can go by such as qa, glue, sentiment_analysis + """ + return self._aliases + + def matches(self, task: str) -> bool: + """ + :param task: the name of the task to check whether the given instance matches. + Checks the current name as well as any aliases. + Everything is compared at lower case and "-" and whitespace + are replaced with "_". 
+ :return: True if task does match the current instance, False otherwise + """ + task = task.lower().replace("-", "_") + + # replace whitespace with "_" + task = "_".join(task.split()) + + return task == self.name or task in self.aliases + + +class SupportedTasks: + """ + The supported tasks in the DeepSparse pipeline and system + """ + + text_generation = namedtuple( + "text_generation", ["text_generation", "opt", "bloom"] + )( + text_generation=AliasedTask("text_generation", []), + opt=AliasedTask("opt", []), + bloom=AliasedTask("bloom", []), + ) + + all_task_categories = [text_generation] + + @classmethod + def check_register_task( + cls, task: str, extra_tasks: Optional[Iterable[str]] = None + ): + """ + :param task: task name to validate and import dependencies for + :param extra_tasks: valid task names that are not included in supported tasks. + i.e. tasks registered to Pipeline at runtime + """ + if cls.is_text_generation(task): + import deepsparse.v2.text_generation.pipeline # noqa: F401 + + all_tasks = set(cls.task_names() + (list(extra_tasks or []))) + if task not in all_tasks: + raise ValueError( + f"Unknown Pipeline task {task}. Currently supported tasks are " + f"{list(all_tasks)}" + ) + + @classmethod + def is_text_generation(cls, task: str) -> bool: + """ + :param task: the name of the task to check whether it is a text generation task + such as codegen + :return: True if it is a text generation task, False otherwise + """ + return any( + text_generation_task.matches(task) + for text_generation_task in cls.text_generation + ) + + @classmethod + def task_names(cls): + task_names = ["custom"] + for task_category in cls.all_task_categories: + for task in task_category: + unique_aliases = ( + alias for alias in task._aliases if alias != task._name + ) + task_names += (task._name, *unique_aliases) + return task_names + + +def dynamic_import_task(module_or_path: str) -> str: + """ + Dynamically imports `module` with importlib, and returns the `TASK` + attribute on the module (something like `importlib.import_module(module).TASK`). + + Example contents of `module`: + ```python + from deepsparse.pipeline import Pipeline + from deepsparse.transformers.pipelines.question_answering import ( + QuestionAnsweringPipeline, + ) + + TASK = "my_qa_task" + Pipeline.register(TASK)(QuestionAnsweringPipeline) + ``` + + NOTE: this modifies `sys.path`. + + :raises FileNotFoundError: if path does not exist + :raises RuntimeError: if the imported module does not contain `TASK` + :raises RuntimeError: if the module doesn't register the task + :return: The task from the imported module. + """ + parent_dir, module_name = _split_dir_and_name(module_or_path) + if not os.path.exists(os.path.join(parent_dir, module_name + ".py")): + raise FileNotFoundError( + f"Unable to find file for {module_or_path}. " + f"Looked for {module_name}.py under {parent_dir if parent_dir else '.'}" + ) + + # add parent_dir to sys.path so we can import the file as a module + sys.path.append(os.curdir) + if parent_dir: + _LOGGER.info(f"Adding {parent_dir} to sys.path") + sys.path.append(parent_dir) + + # do the import + _LOGGER.info(f"Importing '{module_name}'") + module_or_path = importlib.import_module(module_name) + + if not hasattr(module_or_path, "TASK"): + raise RuntimeError( + "When using --task import:, " + "module must set the `TASK` attribute." 
+ ) + + task = getattr(module_or_path, "TASK") + _LOGGER.info(f"Using task={repr(task)}") + + return task + + +def _split_dir_and_name(module_or_path: str) -> Tuple[str, str]: + """ + Examples: + - `a` -> `("", "a")` + - `a.b` -> `("a", "b")` + - `a.b.c` -> `("a/b", "c")` + + :return: module split into directory & name + """ + if module_or_path.endswith(".py"): + # assume path + split_char = os.sep + module_or_path = module_or_path.replace(".py", "") + else: + # assume module + split_char = "." + *dirs, module_name = module_or_path.split(split_char) + parent_dir = os.sep if dirs == [""] else os.sep.join(dirs) + return parent_dir, module_name diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index ae7334cffd..344980dc3f 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -19,6 +19,7 @@ from deepsparse.transformers.utils.helpers import process_generation_config from deepsparse.utils import split_engine_inputs from deepsparse.v2.operators import EngineOperator +from deepsparse.v2.operators.registry import OperatorRegistry from deepsparse.v2.pipeline import Pipeline from deepsparse.v2.routers import GraphRouter from deepsparse.v2.schedulers import ContinuousBatchingScheduler, OperatorScheduler @@ -44,6 +45,7 @@ _LOGGER = logging.getLogger(__name__) +@OperatorRegistry.register(name="text_generation") class TextGenerationPipeline(Pipeline): def __init__( self, From 90de2b352c47fff541113f5529a52a715c079885 Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 22 Nov 2023 11:23:57 +0000 Subject: [PATCH 37/43] fix tricky rebase --- src/deepsparse/v2/operators/operator.py | 2 +- src/deepsparse/v2/pipeline.py | 4 ++-- .../v2/text_generation/nl_engine_operator.py | 24 ++++++++++--------- src/deepsparse/v2/text_generation/pipeline.py | 14 +++++------ .../v2/unit/text_generation/conftest.py | 4 ++-- 5 files changed, 25 insertions(+), 23 deletions(-) diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py index 377088e09e..e775056f8f 100644 --- a/src/deepsparse/v2/operators/operator.py +++ b/src/deepsparse/v2/operators/operator.py @@ -17,8 +17,8 @@ from pydantic import BaseModel -from deepsparse.v2.utils import InferenceState from deepsparse.v2.operators.registry import OperatorRegistry +from deepsparse.v2.utils import InferenceState __all__ = ["Operator"] diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py index 78d112a2b3..59970b2820 100644 --- a/src/deepsparse/v2/pipeline.py +++ b/src/deepsparse/v2/pipeline.py @@ -15,7 +15,7 @@ import copy from concurrent.futures import Future -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional, Union from deepsparse.v2.operators import EngineOperator, Operator from deepsparse.v2.routers import Router @@ -55,7 +55,7 @@ def __init__( ops: Union[Dict[str, Operator], List[Operator]], router: Router, schedulers: List[OperatorScheduler], - continuous_batching_scheduler: ContinuousBatchingScheduler, + continuous_batching_scheduler: Optional[ContinuousBatchingScheduler] = None, pipeline_state: PipelineState = None, ): diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index 407415d00a..9bef8ceb87 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -33,10 +33,12 @@ ) -__all__ = ["NLEngineOperator", - 
"NlEngineOperatorNoCache", - "NlEngineInputNoCache", - "NLEngineInputs"] +__all__ = [ + "NLEngineOperator", + "NLEngineOperatorNoCache", + "NLEngineInputsNoCache", + "NLEngineInputs", +] class NLEngineInputs(BaseModel): @@ -108,12 +110,12 @@ def split(self) -> List["NLEngineOutputs"]: ] -class NlEngineInputsNoCache(BaseModel): +class NLEngineInputsNoCache(BaseModel): input_ids: Any attention_mask: Any -class NlEngineOperator(EngineOperator): +class NLEngineOperator(EngineOperator): """ Operator for the NL Decoder Engine. This Operator inherits from the EngineOperator. @@ -122,8 +124,8 @@ class NlEngineOperator(EngineOperator): multi-token case. """ - input_schema = NlEngineInputs - output_schema = NlEngineOutputs + input_schema = NLEngineInputs + output_schema = NLEngineOutputs def __init__( self, @@ -320,14 +322,14 @@ def output_names(self) -> List[str]: return self.engine.output_names -class NlEngineOperatorNoCache(EngineOperator): +class NLEngineOperatorNoCache(EngineOperator): """ Operator the Natural Language Engine, that operates without KV Cache. This means that this operator merely maps input_ids and attention_mask to logits """ - input_schema = NlEngineInputNoCache + input_schema = NLEngineInputsNoCache output_schema = None def __init__(self, sequence_length: int, **kwargs): @@ -338,7 +340,7 @@ def __init__(self, sequence_length: int, **kwargs): ) super().__init__(**kwargs) - def run(self, inp: NlEngineInputNoCache, **kwargs) -> Any: + def run(self, inp: NLEngineInputsNoCache, **kwargs) -> Any: engine_inputs = [inp.input_ids, inp.attention_mask] logits = ( super() diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 44e38399a5..f21f671676 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -23,8 +23,6 @@ from deepsparse.v2.operators.registry import OperatorRegistry from deepsparse.v2.pipeline import Pipeline from deepsparse.v2.routers import GraphRouter, LinearRouter -from deepsparse.v2.schedulers import OperatorScheduler -from deepsparse.v2.routers import GraphRouter from deepsparse.v2.schedulers import ContinuousBatchingScheduler, OperatorScheduler from deepsparse.v2.text_generation import ( AutoRegressiveOperatorPreprocess, @@ -35,8 +33,8 @@ JoinOutput, KVCacheCreator, MultiEnginePrefill, - NlEngineOperator, - NlEngineOperatorNoCache, + NLEngineOperator, + NLEngineOperatorNoCache, PrepareforPrefill, PrepareGeneration, ProcessInputsTextGeneration, @@ -48,6 +46,7 @@ _LOGGER = logging.getLogger(__name__) + class TextGenerationPipelineNoCache(Pipeline): def __init__( self, @@ -81,7 +80,7 @@ def __init__( sequence_length=sequence_length, tokenizer=self.tokenizer, ), - NlEngineOperatorNoCache(sequence_length=sequence_length, **engine_kwargs), + NLEngineOperatorNoCache(sequence_length=sequence_length, **engine_kwargs), PrepareGeneration( sequence_length=sequence_length, prompt_sequence_length=1, @@ -120,6 +119,7 @@ def verify_no_kv_cache_present(self) -> bool: ) return not is_kv_cache_present + @OperatorRegistry.register(name="text_generation") class TextGenerationPipeline(Pipeline): def __init__( @@ -148,14 +148,14 @@ def __init__( if internal_kv_cache and engine_kwargs.get("engine_type") == "onnxruntime": internal_kv_cache = False - single_engine_operator = NlEngineOperator( + single_engine_operator = NLEngineOperator( sequence_length=sequence_length, internal_kv_cache=internal_kv_cache, input_ids_length=1, **engine_kwargs, ) - multi_engine_operator = 
NlEngineOperator( + multi_engine_operator = NLEngineOperator( sequence_length=sequence_length, internal_kv_cache=internal_kv_cache, input_ids_length=prompt_sequence_length, diff --git a/tests/deepsparse/v2/unit/text_generation/conftest.py b/tests/deepsparse/v2/unit/text_generation/conftest.py index 96d36d57c2..3840a9bb0a 100644 --- a/tests/deepsparse/v2/unit/text_generation/conftest.py +++ b/tests/deepsparse/v2/unit/text_generation/conftest.py @@ -26,7 +26,7 @@ from deepsparse.transformers.utils import DecoderKVCache from deepsparse.transformers.utils.helpers import initialize_kv_cache_state from deepsparse.v2 import InferenceState, PipelineState -from deepsparse.v2.text_generation import NlEngineOperator, TokenGeneratorOperator +from deepsparse.v2.text_generation import NLEngineOperator, TokenGeneratorOperator @pytest.fixture(scope="module") @@ -60,7 +60,7 @@ def single_token_engine_no_internal_cache(text_generation_attributes, model_attr seq_length, _ = text_generation_attributes _, model_path = model_attributes - nl_engine_operator = NlEngineOperator( + nl_engine_operator = NLEngineOperator( sequence_length=seq_length, input_ids_length=1, model_path=model_path ) return nl_engine_operator From 66ca295b240aeb9e65926cb831c0040c125606a9 Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 22 Nov 2023 11:26:42 +0000 Subject: [PATCH 38/43] one more cleanup --- src/deepsparse/v2/operators/operator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py index e775056f8f..377088e09e 100644 --- a/src/deepsparse/v2/operators/operator.py +++ b/src/deepsparse/v2/operators/operator.py @@ -17,8 +17,8 @@ from pydantic import BaseModel -from deepsparse.v2.operators.registry import OperatorRegistry from deepsparse.v2.utils import InferenceState +from deepsparse.v2.operators.registry import OperatorRegistry __all__ = ["Operator"] From dcded1dc98f62ac42a76c6841d17796f6ee4c306 Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 22 Nov 2023 11:56:13 +0000 Subject: [PATCH 39/43] got tests to work after rebase. implementing SPLIT and JOIN in linearouter now --- src/deepsparse/v2/operators/operator.py | 2 +- .../v2/text_generation/generate_new_token.py | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py index 377088e09e..e775056f8f 100644 --- a/src/deepsparse/v2/operators/operator.py +++ b/src/deepsparse/v2/operators/operator.py @@ -17,8 +17,8 @@ from pydantic import BaseModel -from deepsparse.v2.utils import InferenceState from deepsparse.v2.operators.registry import OperatorRegistry +from deepsparse.v2.utils import InferenceState __all__ = ["Operator"] diff --git a/src/deepsparse/v2/text_generation/generate_new_token.py b/src/deepsparse/v2/text_generation/generate_new_token.py index 5bf48bbdbc..fd91b3412c 100644 --- a/src/deepsparse/v2/text_generation/generate_new_token.py +++ b/src/deepsparse/v2/text_generation/generate_new_token.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Sequence, Union +from typing import Any, Dict, Sequence, Union import transformers @@ -36,9 +36,14 @@ def can_operate(self, inp: NLEngineOutputs): return True return False - def run(self, inp: NLEngineOutputs, inference_state: InferenceState, **kwargs): - logits = inp.engine_outputs - kv_cache = inp.kv_cache + def run(self, *args, inference_state: InferenceState, **kwargs): + if args: + inp = args[0] + logits = inp.engine_outputs + kv_cache = inp.kv_cache + else: + logits = kwargs.get("logits") # inp.engine_outputs + kv_cache = kwargs.get("kv_cache") # inp.kv_cache token_generator = inference_state.current_state.get("token_generator") token = token_generator.generate(logits=logits[0, -1, :]) From 127aa00d5be96371aa59cf9ee91bf31e24fd71a0 Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 22 Nov 2023 13:52:21 +0000 Subject: [PATCH 40/43] pipeline working, with GraphRouter. Needs some more testing --- .../v2/text_generation/nl_engine_operator.py | 3 +- src/deepsparse/v2/text_generation/pipeline.py | 74 ++++++++++++++----- .../v2/text_generation/process_inputs.py | 4 +- .../v2/integration_tests/test_llms.py | 4 +- 4 files changed, 60 insertions(+), 25 deletions(-) diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index 9bef8ceb87..2843d3dd17 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -350,7 +350,8 @@ def run(self, inp: NLEngineInputsNoCache, **kwargs) -> Any: # By default, the engine outputs logits for all tokens in the sequence. # Let's filter out the logits for the padding tokens. - logits = numpy.compress(inp.attention_mask[0], logits[0], axis=1) + logits = numpy.compress(inp.attention_mask.flatten(), logits[0], axis=1) + print(logits.shape) return {"logits": [logits], "kv_cache": None, "tokens": None}, { "prompt_logits": [logits] } diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index f21f671676..fb736e7771 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -74,24 +74,51 @@ def __init__( token_generator = TokenGeneratorOperator() - ops = [ - ProcessInputsTextGeneration( - generation_config=process_generation_config(generation_config), - sequence_length=sequence_length, - tokenizer=self.tokenizer, - ), - NLEngineOperatorNoCache(sequence_length=sequence_length, **engine_kwargs), - PrepareGeneration( - sequence_length=sequence_length, - prompt_sequence_length=1, - token_generator=token_generator, - ), - GenerateNewTokenOperator(tokenizer=self.tokenizer, force_max_tokens=True), - CompileGenerations(), - JoinOutput(tokenizer=self.tokenizer), - ProcessOutputs(tokenizer=self.tokenizer), - ] - router = LinearRouter(end_route=len(ops)) + process_inputs = ProcessInputsTextGeneration( + generation_config=process_generation_config(generation_config), + sequence_length=sequence_length, + tokenizer=self.tokenizer, + ) + engine_operator = NLEngineOperatorNoCache( + sequence_length=sequence_length, + **engine_kwargs, + ) + prepare_generation = PrepareGeneration( + sequence_length=sequence_length, + prompt_sequence_length=1, + token_generator=token_generator, + ) + generate_new_token = GenerateNewTokenOperator( + tokenizer=self.tokenizer, force_max_tokens=True + ) + compile_generations = CompileGenerations() + join_output = JoinOutput(tokenizer=self.tokenizer) + process_outputs = 
ProcessOutputs(tokenizer=self.tokenizer) + + ops = { + "process_input": process_inputs, + "engine_operator": engine_operator, + "prepare_generation": prepare_generation, + "generate_new_token": generate_new_token, + "compile_generations": compile_generations, + "join_output": join_output, + "process_outputs": process_outputs, + } + routes = { + "process_input": "SPLIT", + "SPLIT": "engine_operator", + "engine_operator": "prepare_generation", + "prepare_generation": "generate_new_token", + "generate_new_token": "compile_generations", + "compile_generations": "JOIN", + "JOIN": "join_output", + "join_output": "process_outputs", + "process_outputs": "STOP", + } + + router = GraphRouter( + end_route="STOP", start_route="process_input", route=routes + ) scheduler = [OperatorScheduler()] super().__init__( ops=ops, @@ -102,9 +129,18 @@ def __init__( def run(self, *args, **kwargs): # we need to set the fixed_sequences_length flag to True # for the non-kv cache pipeline - kwargs.update(dict(fixed_sequences_length=True)) + kwargs.update(dict(fixed_sequences_length=True, max_new_tokens=1)) return super().run(*args, **kwargs) + def condense_inputs(self, *args, **kwargs): + return args[0], kwargs + + def expand_inputs(self, items, batch_size): + items = [items.get(key) for key in items.keys()] + out, orig_batch_size = split_engine_inputs(items, batch_size) + combined_batches = [{"input_ids": b[0], "attention_mask": b[1]} for b in out] + return combined_batches, orig_batch_size + def verify_no_kv_cache_present(self) -> bool: """ Verifies that the ONNX model does not have diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py index 0f9147f916..85956416a1 100644 --- a/src/deepsparse/v2/text_generation/process_inputs.py +++ b/src/deepsparse/v2/text_generation/process_inputs.py @@ -36,8 +36,8 @@ class ProcessInputsTextGeneration(Operator): """ Input processing operator. Responsible for tokenizing the input, handling the generation_config (if provided), updating the inference_state for later use, - and returning the tokens for prompt inferece. The expected input is defined by - the input_schema, which for this operator is TextGeneratioInput. + and returning the tokens for prompt inference. The expected input is defined by + the input_schema, which for this operator is TextGenerationInput. 
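Reviewer aside: the `routes` table in the no-cache pipeline above is just a step -> next-step mapping, with `"SPLIT"` / `"JOIN"` bracketing the per-prompt section that `_apply_split` fans out. A toy walk over that declared table (no operators are actually run, and the real `GraphRouter.next` may consult `can_operate` rather than following the dict blindly):

```python
routes = {
    "process_input": "SPLIT",
    "SPLIT": "engine_operator",
    "engine_operator": "prepare_generation",
    "prepare_generation": "generate_new_token",
    "generate_new_token": "compile_generations",
    "compile_generations": "JOIN",
    "JOIN": "join_output",
    "join_output": "process_outputs",
    "process_outputs": "STOP",
}

step, order = "process_input", []
while step != "STOP":
    order.append(step)
    step = routes[step]
print(" -> ".join(order))
# process_input -> SPLIT -> engine_operator -> ... -> join_output -> process_outputs
```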
""" input_schema = TextGenerationInput diff --git a/tests/deepsparse/v2/integration_tests/test_llms.py b/tests/deepsparse/v2/integration_tests/test_llms.py index 321070f276..350c77b3f8 100644 --- a/tests/deepsparse/v2/integration_tests/test_llms.py +++ b/tests/deepsparse/v2/integration_tests/test_llms.py @@ -223,9 +223,7 @@ def test_deepsparse_multi_token_prefill(self, setup): output = pipeline( prompt=self.prompt, include_prompt_logits=True, - generation_kwargs=dict( - max_new_tokens=self.max_new_tokens, output_scores=True - ), + generation_kwargs=dict(output_scores=True), ) self._test_output( From af576981bdedc193128f9c588e724fd107ca30e3 Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 27 Nov 2023 15:09:02 +0000 Subject: [PATCH 41/43] ready for review --- .../v2/text_generation/generate_new_token.py | 11 +++-------- src/deepsparse/v2/text_generation/join_output.py | 5 ----- .../v2/text_generation/nl_engine_operator.py | 12 ++++++------ src/deepsparse/v2/text_generation/pipeline.py | 8 +++++--- tests/deepsparse/v2/integration_tests/test_llms.py | 13 +++++++++---- .../unit/text_generation/test_token_generation.py | 4 +++- 6 files changed, 26 insertions(+), 27 deletions(-) diff --git a/src/deepsparse/v2/text_generation/generate_new_token.py b/src/deepsparse/v2/text_generation/generate_new_token.py index fd91b3412c..ba3fb445aa 100644 --- a/src/deepsparse/v2/text_generation/generate_new_token.py +++ b/src/deepsparse/v2/text_generation/generate_new_token.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Dict, Sequence, Union +from typing import Sequence, Union import transformers @@ -37,13 +37,8 @@ def can_operate(self, inp: NLEngineOutputs): return False def run(self, *args, inference_state: InferenceState, **kwargs): - if args: - inp = args[0] - logits = inp.engine_outputs - kv_cache = inp.kv_cache - else: - logits = kwargs.get("logits") # inp.engine_outputs - kv_cache = kwargs.get("kv_cache") # inp.kv_cache + logits = args[0].engine_outputs if args else kwargs.get("logits") + kv_cache = args[0].kv_cache if args else kwargs.get("kv_cache") token_generator = inference_state.current_state.get("token_generator") token = token_generator.generate(logits=logits[0, -1, :]) diff --git a/src/deepsparse/v2/text_generation/join_output.py b/src/deepsparse/v2/text_generation/join_output.py index 56d9ac47b1..7479ee7493 100644 --- a/src/deepsparse/v2/text_generation/join_output.py +++ b/src/deepsparse/v2/text_generation/join_output.py @@ -33,11 +33,6 @@ def __init__(self, tokenizer): self.tokenizer = tokenizer def run(self, inp: Tuple[List[CompileGenerationsOutput], Dict], **kwargs): - if not isinstance(inp, Tuple): - # when running without KV Cache - # this will be a single - # CompileGenerationsOutput for now - inp = [[inp]] batch_outputs = [x for x in inp[0]] generated_tokens = [x.generated_tokens for x in batch_outputs] diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index 2843d3dd17..aaa1899fd5 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -110,11 +110,6 @@ def split(self) -> List["NLEngineOutputs"]: ] -class NLEngineInputsNoCache(BaseModel): - input_ids: Any - attention_mask: Any - - class NLEngineOperator(EngineOperator): """ @@ -322,6 +317,11 @@ def 
output_names(self) -> List[str]: return self.engine.output_names +class NLEngineInputsNoCache(BaseModel): + input_ids: Any + attention_mask: Any + + class NLEngineOperatorNoCache(EngineOperator): """ Operator the Natural Language Engine, that operates without @@ -351,7 +351,7 @@ def run(self, inp: NLEngineInputsNoCache, **kwargs) -> Any: # By default, the engine outputs logits for all tokens in the sequence. # Let's filter out the logits for the padding tokens. logits = numpy.compress(inp.attention_mask.flatten(), logits[0], axis=1) - print(logits.shape) + return {"logits": [logits], "kv_cache": None, "tokens": None}, { "prompt_logits": [logits] } diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index fb736e7771..7c76b613e5 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -22,7 +22,7 @@ from deepsparse.v2.operators import EngineOperator from deepsparse.v2.operators.registry import OperatorRegistry from deepsparse.v2.pipeline import Pipeline -from deepsparse.v2.routers import GraphRouter, LinearRouter +from deepsparse.v2.routers import GraphRouter from deepsparse.v2.schedulers import ContinuousBatchingScheduler, OperatorScheduler from deepsparse.v2.text_generation import ( AutoRegressiveOperatorPreprocess, @@ -52,9 +52,9 @@ def __init__( self, model_path: str, sequence_length: int = 1024, - engine_kwargs: Optional[Dict] = None, onnx_model_name: Optional[str] = None, - generation_config=None, # TODO: Typing here + generation_config=None, + engine_kwargs: Optional[Dict] = None, **kwargs, ): @@ -116,6 +116,8 @@ def __init__( "process_outputs": "STOP", } + # TODO: Using the GraphRouter, but should use + # LinearRouter with appropriate split/join support router = GraphRouter( end_route="STOP", start_route="process_input", route=routes ) diff --git a/tests/deepsparse/v2/integration_tests/test_llms.py b/tests/deepsparse/v2/integration_tests/test_llms.py index 350c77b3f8..3485658dda 100644 --- a/tests/deepsparse/v2/integration_tests/test_llms.py +++ b/tests/deepsparse/v2/integration_tests/test_llms.py @@ -85,6 +85,9 @@ def get_pipeline(self, kv_cache_support=True, **kwargs) -> Pipeline: "default" pipeline is returned) :return: the appropriate pipeline """ + # TODO: This if statement should disappear once + # the TextGenerationPipeline contains the + # non-kv-cache version of the pipeline text_generation_pipeline_class = ( TextGenerationPipeline if kv_cache_support @@ -223,7 +226,9 @@ def test_deepsparse_multi_token_prefill(self, setup): output = pipeline( prompt=self.prompt, include_prompt_logits=True, - generation_kwargs=dict(output_scores=True), + generation_kwargs=dict( + max_new_tokens=self.max_new_tokens, output_scores=True + ), ) self._test_output( @@ -247,18 +252,18 @@ def _test_inference_no_kv_cache(self, engine_type): ) output = pipeline( - prompt=self.prompt, + prompt=[self.prompt, self.prompt], include_prompt_logits=True, generation_kwargs=dict(output_scores=True), ) - logits = output.generations[0].score # logits -> prompt logits + one logit for the new generated token generated_logits, prompt_logits, *_ = self.torch_ground_truth logits_gt = numpy.concatenate( [prompt_logits[0], generated_logits[0, :1, :]], axis=0 ) - assert numpy.allclose(logits, logits_gt, atol=self.precision) + for gen in output.generations: + assert numpy.allclose(gen.score, logits_gt, atol=self.precision) def _test_output( self, diff --git 
a/tests/deepsparse/v2/unit/text_generation/test_token_generation.py b/tests/deepsparse/v2/unit/text_generation/test_token_generation.py index d04f863171..219b1048fd 100644 --- a/tests/deepsparse/v2/unit/text_generation/test_token_generation.py +++ b/tests/deepsparse/v2/unit/text_generation/test_token_generation.py @@ -93,7 +93,9 @@ def test_generate_new_token( in_generation=True, ) outputs, state = generate_new_token.run( - inp=inp, inference_state=mock_inference_state + logits=inp.engine_outputs, + kv_cache=inp.kv_cache, + inference_state=mock_inference_state, ) # The new_token generated/returned by ths operator should match the last token in # token_generator From 4397c80c4eb50cf9e40e1be21e36d3604d141ee3 Mon Sep 17 00:00:00 2001 From: Damian Date: Tue, 28 Nov 2023 07:43:15 +0000 Subject: [PATCH 42/43] cleanup --- src/deepsparse/v2/text_generation/__init__.py | 2 + .../v2/text_generation/nl_engine_operator.py | 48 +----- .../nl_engine_operator_no_kv_cache.py | 67 ++++++++ src/deepsparse/v2/text_generation/pipeline.py | 113 ------------- .../text_generation/pipeline_no_kv_cache.py | 148 ++++++++++++++++++ .../test_pipeline_no_kv_cache.py | 43 +++++ 6 files changed, 261 insertions(+), 160 deletions(-) create mode 100644 src/deepsparse/v2/text_generation/nl_engine_operator_no_kv_cache.py create mode 100644 src/deepsparse/v2/text_generation/pipeline_no_kv_cache.py create mode 100644 tests/deepsparse/v2/unit/text_generation/test_pipeline_no_kv_cache.py diff --git a/src/deepsparse/v2/text_generation/__init__.py b/src/deepsparse/v2/text_generation/__init__.py index 08836b8bbe..6f1323de50 100644 --- a/src/deepsparse/v2/text_generation/__init__.py +++ b/src/deepsparse/v2/text_generation/__init__.py @@ -21,6 +21,7 @@ from .kv_cache_operator import * from .multi_engine_prefill_operator import * from .nl_engine_operator import * +from .nl_engine_operator_no_kv_cache import * from .prep_for_prefill import * from .process_inputs import * from .process_outputs import * @@ -30,3 +31,4 @@ from .prep_for_generation import * # isort:skip from .pipeline import * # isort:skip +from .pipeline_no_kv_cache import * # isort:skip diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index aaa1899fd5..c6583e37cf 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -20,7 +20,6 @@ import numpy from pydantic import BaseModel, Field -from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs from deepsparse.utils import join_engine_outputs, split_engine_inputs from deepsparse.utils.onnx import ( CACHE_INPUT_PREFIX, @@ -33,12 +32,7 @@ ) -__all__ = [ - "NLEngineOperator", - "NLEngineOperatorNoCache", - "NLEngineInputsNoCache", - "NLEngineInputs", -] +__all__ = ["NLEngineOperator", "NLEngineInputs", "NLEngineOutputs"] class NLEngineInputs(BaseModel): @@ -315,43 +309,3 @@ def output_names(self) -> List[str]: :return: The output names for the onnx model """ return self.engine.output_names - - -class NLEngineInputsNoCache(BaseModel): - input_ids: Any - attention_mask: Any - - -class NLEngineOperatorNoCache(EngineOperator): - """ - Operator the Natural Language Engine, that operates without - KV Cache. 
This means that this operator merely maps input_ids - and attention_mask to logits - """ - - input_schema = NLEngineInputsNoCache - output_schema = None - - def __init__(self, sequence_length: int, **kwargs): - overwrite_transformer_onnx_model_inputs( - path=kwargs.get("model_path"), - batch_size=kwargs.get("batch_size", 1), - max_length=sequence_length, - ) - super().__init__(**kwargs) - - def run(self, inp: NLEngineInputsNoCache, **kwargs) -> Any: - engine_inputs = [inp.input_ids, inp.attention_mask] - logits = ( - super() - .run(EngineOperatorInputs(engine_inputs=engine_inputs), **kwargs) - .get("engine_outputs") - ) - - # By default, the engine outputs logits for all tokens in the sequence. - # Let's filter out the logits for the padding tokens. - logits = numpy.compress(inp.attention_mask.flatten(), logits[0], axis=1) - - return {"logits": [logits], "kv_cache": None, "tokens": None}, { - "prompt_logits": [logits] - } diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator_no_kv_cache.py b/src/deepsparse/v2/text_generation/nl_engine_operator_no_kv_cache.py new file mode 100644 index 0000000000..746010560f --- /dev/null +++ b/src/deepsparse/v2/text_generation/nl_engine_operator_no_kv_cache.py @@ -0,0 +1,67 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any + +import numpy +from pydantic import BaseModel + +from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs +from deepsparse.v2.operators.engine_operator import EngineOperator, EngineOperatorInputs + + +__all__ = [ + "NLEngineOperatorNoCache", + "NLEngineInputsNoCache", +] + + +class NLEngineInputsNoCache(BaseModel): + input_ids: Any + attention_mask: Any + + +class NLEngineOperatorNoCache(EngineOperator): + """ + Operator the Natural Language Engine, that operates without + KV Cache. This means that this operator merely maps input_ids + and attention_mask to logits + """ + + input_schema = NLEngineInputsNoCache + output_schema = None + + def __init__(self, sequence_length: int, **kwargs): + overwrite_transformer_onnx_model_inputs( + path=kwargs.get("model_path"), + batch_size=kwargs.get("batch_size", 1), + max_length=sequence_length, + ) + super().__init__(**kwargs) + + def run(self, inp: NLEngineInputsNoCache, **kwargs) -> Any: + engine_inputs = [inp.input_ids, inp.attention_mask] + logits = ( + super() + .run(EngineOperatorInputs(engine_inputs=engine_inputs), **kwargs) + .get("engine_outputs") + ) + + # By default, the engine outputs logits for all tokens in the sequence. + # Let's filter out the logits for the padding tokens. 
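+        # numpy.compress keeps only the slices along axis=1 (the sequence axis)
+        # where the flattened attention mask is nonzero, so the padded positions
+        # are dropped before the prompt logits are returned.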
+ logits = numpy.compress(inp.attention_mask.flatten(), logits[0], axis=1) + + return {"logits": [logits], "kv_cache": None, "tokens": None}, { + "prompt_logits": [logits] + } diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 7c76b613e5..344980dc3f 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -18,7 +18,6 @@ from deepsparse.transformers.helpers import setup_transformers_pipeline from deepsparse.transformers.utils.helpers import process_generation_config from deepsparse.utils import split_engine_inputs -from deepsparse.utils.onnx import default_cached_outputs from deepsparse.v2.operators import EngineOperator from deepsparse.v2.operators.registry import OperatorRegistry from deepsparse.v2.pipeline import Pipeline @@ -34,7 +33,6 @@ KVCacheCreator, MultiEnginePrefill, NLEngineOperator, - NLEngineOperatorNoCache, PrepareforPrefill, PrepareGeneration, ProcessInputsTextGeneration, @@ -47,117 +45,6 @@ _LOGGER = logging.getLogger(__name__) -class TextGenerationPipelineNoCache(Pipeline): - def __init__( - self, - model_path: str, - sequence_length: int = 1024, - onnx_model_name: Optional[str] = None, - generation_config=None, - engine_kwargs: Optional[Dict] = None, - **kwargs, - ): - - ( - self.model_path, - self.config, - self.tokenizer, - engine_kwargs, - ) = setup_transformers_pipeline( - model_path, - sequence_length, - tokenizer_padding_side="right", - onnx_model_name=onnx_model_name, - engine_kwargs=engine_kwargs, - ) - self.verify_no_kv_cache_present() - - token_generator = TokenGeneratorOperator() - - process_inputs = ProcessInputsTextGeneration( - generation_config=process_generation_config(generation_config), - sequence_length=sequence_length, - tokenizer=self.tokenizer, - ) - engine_operator = NLEngineOperatorNoCache( - sequence_length=sequence_length, - **engine_kwargs, - ) - prepare_generation = PrepareGeneration( - sequence_length=sequence_length, - prompt_sequence_length=1, - token_generator=token_generator, - ) - generate_new_token = GenerateNewTokenOperator( - tokenizer=self.tokenizer, force_max_tokens=True - ) - compile_generations = CompileGenerations() - join_output = JoinOutput(tokenizer=self.tokenizer) - process_outputs = ProcessOutputs(tokenizer=self.tokenizer) - - ops = { - "process_input": process_inputs, - "engine_operator": engine_operator, - "prepare_generation": prepare_generation, - "generate_new_token": generate_new_token, - "compile_generations": compile_generations, - "join_output": join_output, - "process_outputs": process_outputs, - } - routes = { - "process_input": "SPLIT", - "SPLIT": "engine_operator", - "engine_operator": "prepare_generation", - "prepare_generation": "generate_new_token", - "generate_new_token": "compile_generations", - "compile_generations": "JOIN", - "JOIN": "join_output", - "join_output": "process_outputs", - "process_outputs": "STOP", - } - - # TODO: Using the GraphRouter, but should use - # LinearRouter with appropriate split/join support - router = GraphRouter( - end_route="STOP", start_route="process_input", route=routes - ) - scheduler = [OperatorScheduler()] - super().__init__( - ops=ops, - router=router, - schedulers=scheduler, - ) - - def run(self, *args, **kwargs): - # we need to set the fixed_sequences_length flag to True - # for the non-kv cache pipeline - kwargs.update(dict(fixed_sequences_length=True, max_new_tokens=1)) - return super().run(*args, **kwargs) - - def condense_inputs(self, *args, 
**kwargs): - return args[0], kwargs - - def expand_inputs(self, items, batch_size): - items = [items.get(key) for key in items.keys()] - out, orig_batch_size = split_engine_inputs(items, batch_size) - combined_batches = [{"input_ids": b[0], "attention_mask": b[1]} for b in out] - return combined_batches, orig_batch_size - - def verify_no_kv_cache_present(self) -> bool: - """ - Verifies that the ONNX model does not have - KV cache inputs/outputs present. - :return: True if compatible, False otherwise - """ - is_kv_cache_present = any(default_cached_outputs(self.model_path)) - if is_kv_cache_present: - raise ValueError( - f"The model: {self.model_path} has KV cache inputs/outputs present. " - "Please use the TextGenerationPipeline instead." - ) - return not is_kv_cache_present - - @OperatorRegistry.register(name="text_generation") class TextGenerationPipeline(Pipeline): def __init__( diff --git a/src/deepsparse/v2/text_generation/pipeline_no_kv_cache.py b/src/deepsparse/v2/text_generation/pipeline_no_kv_cache.py new file mode 100644 index 0000000000..a6ec2ae207 --- /dev/null +++ b/src/deepsparse/v2/text_generation/pipeline_no_kv_cache.py @@ -0,0 +1,148 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
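+
+# Illustrative usage sketch (placeholder model path and prompt; the call pattern
+# follows the integration tests for this pipeline):
+#
+#   pipeline = TextGenerationPipelineNoCache(model_path="path/to/deployment")
+#   output = pipeline(prompt="Hello!", include_prompt_logits=True)
+#
+# The operator graph assembled in __init__ below runs in order:
+#   process_input -> SPLIT -> engine_operator -> prepare_generation
+#   -> generate_new_token -> compile_generations -> JOIN -> join_output
+#   -> process_outputs -> STOP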
+ +import logging +from typing import Dict, Optional + +from deepsparse.transformers.helpers import setup_transformers_pipeline +from deepsparse.transformers.utils.helpers import process_generation_config +from deepsparse.utils import split_engine_inputs +from deepsparse.utils.onnx import default_cached_outputs +from deepsparse.v2.pipeline import Pipeline +from deepsparse.v2.routers import GraphRouter +from deepsparse.v2.schedulers import OperatorScheduler +from deepsparse.v2.text_generation import ( + CompileGenerations, + GenerateNewTokenOperator, + JoinOutput, + NLEngineOperatorNoCache, + PrepareGeneration, + ProcessInputsTextGeneration, + ProcessOutputs, + TokenGeneratorOperator, +) + + +_LOGGER = logging.getLogger(__name__) + + +class TextGenerationPipelineNoCache(Pipeline): + def __init__( + self, + model_path: str, + sequence_length: int = 1024, + onnx_model_name: Optional[str] = None, + generation_config=None, + engine_kwargs: Optional[Dict] = None, + **kwargs, + ): + + ( + self.model_path, + self.config, + self.tokenizer, + engine_kwargs, + ) = setup_transformers_pipeline( + model_path, + sequence_length, + tokenizer_padding_side="right", + onnx_model_name=onnx_model_name, + engine_kwargs=engine_kwargs, + ) + self.verify_no_kv_cache_present() + + token_generator = TokenGeneratorOperator() + + process_inputs = ProcessInputsTextGeneration( + generation_config=process_generation_config(generation_config), + sequence_length=sequence_length, + tokenizer=self.tokenizer, + ) + engine_operator = NLEngineOperatorNoCache( + sequence_length=sequence_length, + **engine_kwargs, + ) + prepare_generation = PrepareGeneration( + sequence_length=sequence_length, + prompt_sequence_length=1, + token_generator=token_generator, + ) + generate_new_token = GenerateNewTokenOperator( + tokenizer=self.tokenizer, force_max_tokens=True + ) + compile_generations = CompileGenerations() + join_output = JoinOutput(tokenizer=self.tokenizer) + process_outputs = ProcessOutputs(tokenizer=self.tokenizer) + + ops = { + "process_input": process_inputs, + "engine_operator": engine_operator, + "prepare_generation": prepare_generation, + "generate_new_token": generate_new_token, + "compile_generations": compile_generations, + "join_output": join_output, + "process_outputs": process_outputs, + } + routes = { + "process_input": "SPLIT", + "SPLIT": "engine_operator", + "engine_operator": "prepare_generation", + "prepare_generation": "generate_new_token", + "generate_new_token": "compile_generations", + "compile_generations": "JOIN", + "JOIN": "join_output", + "join_output": "process_outputs", + "process_outputs": "STOP", + } + + # TODO: Using the GraphRouter, but should use + # LinearRouter with appropriate split/join support + router = GraphRouter( + end_route="STOP", start_route="process_input", route=routes + ) + scheduler = [OperatorScheduler()] + super().__init__( + ops=ops, + router=router, + schedulers=scheduler, + ) + + def run(self, *args, **kwargs): + # we need to set the fixed_sequences_length flag to True + # for the non-kv cache pipeline + kwargs.update(dict(fixed_sequences_length=True, max_new_tokens=1)) + return super().run(*args, **kwargs) + + def condense_inputs(self, *args, **kwargs): + return args[0], kwargs + + def expand_inputs(self, items, batch_size): + items = [items.get(key) for key in items.keys()] + out, orig_batch_size = split_engine_inputs(items, batch_size) + combined_batches = [{"input_ids": b[0], "attention_mask": b[1]} for b in out] + return combined_batches, orig_batch_size + + def 
verify_no_kv_cache_present(self) -> bool: + """ + Verifies that the ONNX model does not have + KV cache inputs/outputs present. + :return: True if compatible, False otherwise + """ + is_kv_cache_present = any(default_cached_outputs(self.model_path)) + if is_kv_cache_present: + raise ValueError( + f"The model: {self.model_path} has KV cache inputs/outputs present. " + "Please use the TextGenerationPipeline instead." + ) + return not is_kv_cache_present diff --git a/tests/deepsparse/v2/unit/text_generation/test_pipeline_no_kv_cache.py b/tests/deepsparse/v2/unit/text_generation/test_pipeline_no_kv_cache.py new file mode 100644 index 0000000000..a6fbfc4d11 --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/test_pipeline_no_kv_cache.py @@ -0,0 +1,43 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pytest +from deepsparse.v2.text_generation import TextGenerationPipelineNoCache + + +@pytest.mark.parametrize( + "onnx_model_name, raise_error", + [("model.onnx", True), (None, True), ("model-orig.onnx", False)], +) +def test_verify_no_kv_cache_present(model_attributes, onnx_model_name, raise_error): + _, model_path = model_attributes + # model_path points to .../directory/model.onnx + # we need to go up one level to .../directory + model_path = os.path.dirname(model_path) + + if raise_error: + with pytest.raises(ValueError): + if onnx_model_name is None: + TextGenerationPipelineNoCache(model_path=model_path) + else: + TextGenerationPipelineNoCache( + model_path=model_path, onnx_model_name=onnx_model_name + ) + return + else: + TextGenerationPipelineNoCache( + model_path=model_path, onnx_model_name=onnx_model_name + ) From 6f1214e88ec9c1674ca59fc3487c0beee032d417 Mon Sep 17 00:00:00 2001 From: Damian Date: Tue, 28 Nov 2023 08:01:28 +0000 Subject: [PATCH 43/43] initial commit --- src/deepsparse/v2/routers/router.py | 56 +++++++++++-------- .../text_generation/pipeline_no_kv_cache.py | 28 +++++----- 2 files changed, 45 insertions(+), 39 deletions(-) diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py index 6740f706f1..5d0365fda9 100644 --- a/src/deepsparse/v2/routers/router.py +++ b/src/deepsparse/v2/routers/router.py @@ -27,7 +27,7 @@ class Router: """ - Routers dicate the next operator to run. Each Router must implement a next function, + Routers dictate the next operator to run. Each Router must implement a next function, which dictates the index or key of the next operator to run. :param start_route: the start index or key of the router @@ -77,11 +77,18 @@ def json(self): class LinearRouter(Router): """ - LinearRouterruns a list of Operators in sequential order. end_route should + LinearRouter runs a list of Operators in sequential order. end_route should be the length of the list and the start_route should be the start index. 
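+
+    Example (illustrative; operator names are placeholders):
+        LinearRouter(route=["op_a", "op_b", "op_c"])  # end_route inferred as len(route) == 3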
""" - def __init__(self, end_route: int, start_route: int = 0): + def __init__(self, route: Optional[List[str]] = None, end_route: Optional[int] = None, start_route: int = 0): + if end_route is None: + if route is None: + raise ValueError("To define the number of steps in the LinearRouter " + "either `route` or `end_route` must be provided" + ) + + end_route = len(route) super().__init__(end_route=end_route, start_route=start_route) _LOGGER.warn("SPLIT and JOIN are not yet supported for the LinearRouter.") @@ -98,30 +105,31 @@ def validate(operators: List[Operator]) -> bool: """ :param operators: operators that this Router could potentially run over :return: True if this Router can run this series of operators. Base Router - runs any series of operators that is non empty and whose input and output + runs any series of operators that is non-empty and whose input and output schemas align. If not valid, either False or an error string will be returned """ - if len(operators) < 1: - _LOGGER.info("No operators provided") - return False - - for idx in range(len(operators) - 1): - current_output_schema = operators[idx].output_schema - next_input_schema = operators[idx + 1].input_schema - - if current_output_schema is None or next_input_schema is None: - # if no input/output schema defined, assume operator can run - # without schema - continue - - if current_output_schema != next_input_schema: - _LOGGER.info( - f"Operator at idx {idx}: {type(operators[idx])} has invalid " - f"output schema {current_output_schema} for next operator " - f"{type(operators[idx + 1])} which requires {next_input_schema}" - ) - return False + # Commented out - operators are dicts not lists + # if len(operators) < 1: + # _LOGGER.info("No operators provided") + # return False + # + # for idx in range(len(operators) - 1): + # current_output_schema = operators[idx].output_schema + # next_input_schema = operators[idx + 1].input_schema + # + # if current_output_schema is None or next_input_schema is None: + # # if no input/output schema defined, assume operator can run + # # without schema + # continue + # + # if current_output_schema != next_input_schema: + # _LOGGER.info( + # f"Operator at idx {idx}: {type(operators[idx])} has invalid " + # f"output schema {current_output_schema} for next operator " + # f"{type(operators[idx + 1])} which requires {next_input_schema}" + # ) + # return False return True diff --git a/src/deepsparse/v2/text_generation/pipeline_no_kv_cache.py b/src/deepsparse/v2/text_generation/pipeline_no_kv_cache.py index a6ec2ae207..ffb149ff27 100644 --- a/src/deepsparse/v2/text_generation/pipeline_no_kv_cache.py +++ b/src/deepsparse/v2/text_generation/pipeline_no_kv_cache.py @@ -20,7 +20,7 @@ from deepsparse.utils import split_engine_inputs from deepsparse.utils.onnx import default_cached_outputs from deepsparse.v2.pipeline import Pipeline -from deepsparse.v2.routers import GraphRouter +from deepsparse.v2.routers import LinearRouter from deepsparse.v2.schedulers import OperatorScheduler from deepsparse.v2.text_generation import ( CompileGenerations, @@ -94,23 +94,21 @@ def __init__( "join_output": join_output, "process_outputs": process_outputs, } - routes = { - "process_input": "SPLIT", - "SPLIT": "engine_operator", - "engine_operator": "prepare_generation", - "prepare_generation": "generate_new_token", - "generate_new_token": "compile_generations", - "compile_generations": "JOIN", - "JOIN": "join_output", - "join_output": "process_outputs", - "process_outputs": "STOP", - } + route = [ + "process_input", + 
"SPLIT", + "engine_operator", + "prepare_generation", + "generate_new_token", + "compile_generations", + "JOIN", + "join_output", + "process_outputs" + ] # TODO: Using the GraphRouter, but should use # LinearRouter with appropriate split/join support - router = GraphRouter( - end_route="STOP", start_route="process_input", route=routes - ) + router = LinearRouter(route=route) scheduler = [OperatorScheduler()] super().__init__( ops=ops,