diff --git a/tests/deepsparse/pipelines/test_clip.py b/tests/deepsparse/pipelines/test_clip.py
index cb8bfeb97b..2858be7dca 100644
--- a/tests/deepsparse/pipelines/test_clip.py
+++ b/tests/deepsparse/pipelines/test_clip.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import numpy as np
+
 import pytest
 from deepsparse.clip import (
     CLIPCaptionInput,
@@ -30,24 +32,48 @@
 from tests.utils import mock_engine
 
 
+def custom_process_inputs(self, inputs):
+    if not isinstance(inputs.text, list):
+        inputs.text = [inputs.text]
+    if not isinstance(inputs.text[0], str):
+        return inputs.text
+    tokens = [np.array(t).astype(np.int32) for t in self.tokenizer(inputs.text)]
+    tokens = np.stack(tokens, axis=0)
+    tokens_lengths = np.array(tokens.shape[0] * [tokens.shape[1] - 1])
+    return [tokens, tokens_lengths]
+
+
+# This overrides the process_inputs function globally for all CLIPTextPipeline classes.
+# This is needed for CLIP-ViT-B-32-256x256-DataComp-s34B-b86K as it has a second input
+# that specifies how many tokens are present.
+CLIPTextPipeline.process_inputs = custom_process_inputs
+
+
 @pytest.fixture
-def visual_input():
+def model_folder():
+    from huggingface_hub import snapshot_download
+
+    model_id = "neuralmagic/CLIP-ViT-B-32-256x256-DataComp-s34B-b86K-quant-ds"
+    return snapshot_download(repo_id=model_id)
+
+
+@pytest.fixture
+def visual_input(model_folder):
+    model_path = model_folder + "/visual.onnx"
     images = computer_vision(batch_size=2)
-    model_path = None
     return CLIPVisualInput(images=images.get("images")), model_path
 
 
 @pytest.fixture
-def text_input():
-    model_path = None
+def text_input(model_folder):
+    model_path = model_folder + "/textual.onnx"
     text = ["a building", "a dog", "a cat"]
     return CLIPTextInput(text=text), model_path
 
 
-@pytest.mark.skip(reason="No CLIP models currently available to run tests")
 @mock_engine(rng_seed=0)
 def test_visual_clip(engine, visual_input):
-    from deepsparse import Pipeline
+    from deepsparse.legacy import Pipeline
 
     model_path = visual_input[-1]
     pipeline = Pipeline.create(task="clip_visual", model_path=model_path)
@@ -57,10 +83,9 @@ def test_visual_clip(engine, visual_input):
     assert len(output.image_embeddings) == 1
 
 
-@pytest.mark.skip(reason="No CLIP models curently available to run tests")
 @mock_engine(rng_seed=0)
 def test_text_clip(engine, text_input):
-    from deepsparse import Pipeline
+    from deepsparse.legacy import Pipeline
 
     model_path = text_input[-1]
     pipeline = Pipeline.create(task="clip_text", model_path=model_path)
@@ -70,10 +95,9 @@ def test_text_clip(engine, text_input):
     assert len(output.text_embeddings) == 1
 
 
-@pytest.mark.skip(reason="No CLIP models currently available to run tests")
 @mock_engine(rng_seed=0)
 def test_zero_shot(engine, visual_input, text_input):
-    from deepsparse.legacy import BasePipeline
+    from deepsparse.legacy import Pipeline
 
     model_path_text = text_input[-1]
     model_path_visual = visual_input[-1]
@@ -81,7 +105,7 @@ def test_zero_shot(engine, visual_input, text_input):
         "visual_model_path": model_path_visual,
         "text_model_path": model_path_text,
     }
-    pipeline = BasePipeline.create(task="clip_zeroshot", **kwargs)
+    pipeline = Pipeline.create(task="clip_zeroshot", **kwargs)
     assert isinstance(pipeline, CLIPZeroShotPipeline)
     pipeline_input = CLIPZeroShotInput(
         image=CLIPVisualInput(images=visual_input[0].images[-1]), text=text_input[0]
@@ -90,12 +114,12 @@
     assert isinstance(output, CLIPZeroShotOutput)
 
 
-@pytest.mark.skip(reason="No CLIP models currently available to run tests")
+@pytest.mark.skip(reason="No CLIP decoder models currently available to run tests")
 @mock_engine(rng_seed=0)
 def test_caption(engine, visual_input, text_input):
-    from deepsparse.legacy import BasePipeline
+    from deepsparse.legacy import Pipeline
 
-    model_path_visual = text_input[-1]
+    model_path_visual = visual_input[-1]
     model_path_text = text_input[-1]
     model_path_decoder = None
     pipeline_input = CLIPCaptionInput(
@@ -106,6 +130,6 @@
         "text_model_path": model_path_text,
         "decoder_model_path": model_path_decoder,
     }
-    pipeline = BasePipeline.create(task="clip_caption", **kwargs)
+    pipeline = Pipeline.create(task="clip_caption", **kwargs)
     assert isinstance(pipeline, CLIPCaptionPipeline)
     assert isinstance(pipeline_input, CLIPCaptionInput)
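For context on the `process_inputs` override in the file above: the quantized text model takes two inputs rather than one, the stacked `int32` token ids plus a per-sequence length array. A minimal sketch of the arithmetic, using a hypothetical stub tokenizer in place of the pipeline's real one (the fixed 77-id output is an assumption mirroring CLIP's usual context length):

```python
import numpy as np


class StubTokenizer:
    # Hypothetical stand-in for self.tokenizer: one fixed-length
    # id sequence per input caption.
    def __call__(self, texts):
        return [[0] * 77 for _ in texts]


tokenizer = StubTokenizer()
texts = ["a building", "a dog", "a cat"]

# Same steps as custom_process_inputs above.
tokens = [np.array(t).astype(np.int32) for t in tokenizer(texts)]
tokens = np.stack(tokens, axis=0)  # shape (3, 77)
tokens_lengths = np.array(tokens.shape[0] * [tokens.shape[1] - 1])

assert tokens.shape == (3, 77) and tokens.dtype == np.int32
assert tokens_lengths.tolist() == [76, 76, 76]
```

Note that the length input is `tokens.shape[1] - 1` repeated for every row, i.e. one shared value derived from the padded sequence length rather than a true per-caption token count.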
diff --git a/tests/utils/engine_mocking.py b/tests/utils/engine_mocking.py
index cef0b60164..cfdcbd76ae 100644
--- a/tests/utils/engine_mocking.py
+++ b/tests/utils/engine_mocking.py
@@ -135,10 +135,17 @@ def execute_list_out(self, inputs: List[numpy.ndarray]) -> List[numpy.ndarray]:
 
 
 def _to_descriptor(node: ort.NodeArg) -> "_NumpyDescriptor":
     to_numpy_dtype = {
-        "tensor(float)": numpy.float32,
         "tensor(double)": numpy.float64,
-        "tensor(uint8)": numpy.uint8,
+        "tensor(float)": numpy.float32,
+        "tensor(float16)": numpy.float16,
         "tensor(int64)": numpy.int64,
+        "tensor(int32)": numpy.int32,
+        "tensor(int16)": numpy.int16,
+        "tensor(int8)": numpy.int8,
+        "tensor(uint64)": numpy.uint64,
+        "tensor(uint32)": numpy.uint32,
+        "tensor(uint16)": numpy.uint16,
+        "tensor(uint8)": numpy.uint8,
     }
     return _NumpyDescriptor(shape=node.shape, dtype=to_numpy_dtype[node.type])
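On the `engine_mocking` change: `_to_descriptor` translates each ONNX node's declared type string into a numpy dtype so the mocked engine can fabricate tensors of the right kind, and the quantized CLIP export presumably surfaces the integer and float16 types added here. A rough sketch of that lookup in use, under those assumptions (the model path is a placeholder):

```python
import numpy
import onnxruntime as ort

TO_NUMPY_DTYPE = {
    "tensor(double)": numpy.float64,
    "tensor(float)": numpy.float32,
    "tensor(float16)": numpy.float16,
    "tensor(int64)": numpy.int64,
    "tensor(int32)": numpy.int32,
    "tensor(int16)": numpy.int16,
    "tensor(int8)": numpy.int8,
    "tensor(uint64)": numpy.uint64,
    "tensor(uint32)": numpy.uint32,
    "tensor(uint16)": numpy.uint16,
    "tensor(uint8)": numpy.uint8,
}

session = ort.InferenceSession("textual.onnx")  # placeholder path
rng = numpy.random.default_rng(0)
for node in session.get_outputs():
    # Dynamic dims come back as names or None; pin them to 1 for a fake batch.
    shape = [dim if isinstance(dim, int) else 1 for dim in node.shape]
    fake = (rng.random(shape) * 10).astype(TO_NUMPY_DTYPE[node.type])
    print(node.name, fake.shape, fake.dtype)
```

A missing entry surfaces as a `KeyError` the first time a model declaring that tensor type is mocked, which is why the table errs on the side of listing every common ONNX dtype.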