diff --git a/distributions/dependencies.json b/distributions/dependencies.json
index a2393cdea0..7a974b9177 100644
--- a/distributions/dependencies.json
+++ b/distributions/dependencies.json
@@ -249,6 +249,7 @@
     "redis",
     "scikit-learn",
     "scipy",
+    "sentence-transformers",
     "sentencepiece",
     "torch",
     "torchvision",
@@ -287,6 +288,7 @@
     "redis",
     "scikit-learn",
     "scipy",
+    "sentence-transformers",
     "sentencepiece",
     "torch",
     "torchao==0.5.0",
diff --git a/llama_stack/apis/models/models.py b/llama_stack/apis/models/models.py
index 71101ec8b9..0ee23ecc17 100644
--- a/llama_stack/apis/models/models.py
+++ b/llama_stack/apis/models/models.py
@@ -21,9 +21,10 @@ class CommonModelFields(BaseModel):
     )


-class ModelType(Enum):
+@json_schema_type
+class ModelType(str, Enum):
     llm = "llm"
-    embedding_model = "embedding"
+    embedding = "embedding"


 @json_schema_type
diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py
index 51be318cb3..16ae353574 100644
--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/routers.py
@@ -109,7 +109,7 @@ async def chat_completion(
         model = await self.routing_table.get_model(model_id)
         if model is None:
             raise ValueError(f"Model '{model_id}' not found")
-        if model.model_type == ModelType.embedding_model:
+        if model.model_type == ModelType.embedding:
             raise ValueError(
                 f"Model '{model_id}' is an embedding model and does not support chat completions"
             )
@@ -142,7 +142,7 @@ async def completion(
         model = await self.routing_table.get_model(model_id)
         if model is None:
             raise ValueError(f"Model '{model_id}' not found")
-        if model.model_type == ModelType.embedding_model:
+        if model.model_type == ModelType.embedding:
             raise ValueError(
                 f"Model '{model_id}' is an embedding model and does not support chat completions"
             )
diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py
index bc3de8be08..01edf4e5ac 100644
--- a/llama_stack/distribution/routers/routing_tables.py
+++ b/llama_stack/distribution/routers/routing_tables.py
@@ -225,10 +225,7 @@ async def register_model(
             metadata = {}
         if model_type is None:
             model_type = ModelType.llm
-        if (
-            "embedding_dimension" not in metadata
-            and model_type == ModelType.embedding_model
-        ):
+        if "embedding_dimension" not in metadata and model_type == ModelType.embedding:
             raise ValueError(
                 "Embedding model must have an embedding dimension in its metadata"
             )
@@ -311,8 +308,15 @@ async def register_memory_bank(
         )
         model = await self.get_object_by_identifier("model", params.embedding_model)
         if model is None:
-            raise ValueError(f"Model {params.embedding_model} not found")
-        if model.model_type != ModelType.embedding_model:
+            if params.embedding_model == "all-MiniLM-L6-v2":
+                raise ValueError(
+                    "Embeddings are now served via Inference providers. "
+                    "Please upgrade your run.yaml to include inline::sentence-transformers as an additional inference provider. "
+                    "See https://github.com/meta-llama/llama-stack/blob/main/llama_stack/templates/together/run.yaml for an example."
+                )
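The `ModelType` change above is the pivot of the whole diff: the enum gains a `str` mixin and the member is renamed from `embedding_model` to `embedding`, matching its serialized value. Every hunk below is either that rename or the new sentence-transformers plumbing. A minimal standard-library sketch of the behavior the `str` mixin buys (illustrative, not llama-stack code):

```python
import json
from enum import Enum


class ModelType(str, Enum):
    llm = "llm"
    embedding = "embedding"


# A (str, Enum) member compares equal to its raw string value...
assert ModelType.embedding == "embedding"

# ...and serializes as a bare string, which is what lets generated
# run.yaml files carry plain `model_type: llm` / `model_type: embedding`.
print(json.dumps({"model_type": ModelType.embedding}))  # {"model_type": "embedding"}
```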
+            else:
+                raise ValueError(f"Model {params.embedding_model} not found")
+        if model.model_type != ModelType.embedding:
             raise ValueError(
                 f"Model {params.embedding_model} is not an embedding model"
             )
diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py
index e7abde2273..821746640b 100644
--- a/llama_stack/providers/inline/inference/meta_reference/inference.py
+++ b/llama_stack/providers/inline/inference/meta_reference/inference.py
@@ -83,7 +83,7 @@ async def unregister_model(self, model_id: str) -> None:

     async def register_model(self, model: Model) -> Model:
         model = await self.model_registry_helper.register_model(model)
-        if model.model_type == ModelType.embedding_model:
+        if model.model_type == ModelType.embedding:
             self._load_sentence_transformer_model(model.provider_resource_id)
         return model

diff --git a/llama_stack/providers/inline/inference/sentence_transformers/config.py b/llama_stack/providers/inline/inference/sentence_transformers/config.py
index aec6d56d81..53f17cfd54 100644
--- a/llama_stack/providers/inline/inference/sentence_transformers/config.py
+++ b/llama_stack/providers/inline/inference/sentence_transformers/config.py
@@ -4,7 +4,13 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from typing import Any, Dict
+
 from pydantic import BaseModel


-class SentenceTransformersInferenceConfig(BaseModel): ...
+class SentenceTransformersInferenceConfig(BaseModel):
+
+    @classmethod
+    def sample_run_config(cls) -> Dict[str, Any]:
+        return {}
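With `SentenceTransformersInferenceConfig.sample_run_config()` returning `{}`, the inline provider needs no configuration; an embedding model just has to be registered against it with the `embedding_dimension` metadata that `register_model` now enforces. A client-side sketch (the exact call shape follows llama-stack-client conventions and should be treated as illustrative):

```python
from llama_stack_client import LlamaStackClient  # assumes the client package

client = LlamaStackClient(base_url="http://localhost:5000")

# Register an embedding model served by the inline sentence-transformers
# provider. Omitting "embedding_dimension" now raises a ValueError.
client.models.register(
    model_id="all-MiniLM-L6-v2",
    provider_id="sentence-transformers",
    model_type="embedding",  # ModelType is a str enum, so the raw string works
    metadata={"embedding_dimension": 384},
)
```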
diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index 1ba4ad5994..acd5b62bc5 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -337,7 +337,7 @@ async def embeddings(

     async def register_model(self, model: Model) -> Model:
         # ollama does not have embedding models running. Check if the model is in list of available models.
-        if model.model_type == ModelType.embedding_model:
+        if model.model_type == ModelType.embedding:
             response = await self.client.list()
             available_models = [m["model"] for m in response["models"]]
             if model.provider_resource_id not in available_models:
diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index 7ad5cef0f1..890b547de2 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -207,7 +207,7 @@ async def embeddings(
         model = await self.model_store.get_model(model_id)

         kwargs = {}
-        assert model.model_type == ModelType.embedding_model
+        assert model.model_type == ModelType.embedding
         assert model.metadata.get("embedding_dimensions")
         kwargs["dimensions"] = model.metadata.get("embedding_dimensions")
         assert all(
diff --git a/llama_stack/providers/tests/inference/fixtures.py b/llama_stack/providers/tests/inference/fixtures.py
index ed0b0302d9..d9c0cb1889 100644
--- a/llama_stack/providers/tests/inference/fixtures.py
+++ b/llama_stack/providers/tests/inference/fixtures.py
@@ -238,7 +238,7 @@ async def inference_stack(request, inference_model):
     model_type = ModelType.llm
     metadata = {}
     if os.getenv("EMBEDDING_DIMENSION"):
-        model_type = ModelType.embedding_model
+        model_type = ModelType.embedding
         metadata["embedding_dimension"] = get_env_or_fail("EMBEDDING_DIMENSION")

     test_stack = await construct_stack_for_test(
diff --git a/llama_stack/providers/tests/inference/test_embeddings.py b/llama_stack/providers/tests/inference/test_embeddings.py
index 3502c6b20b..bf09896c14 100644
--- a/llama_stack/providers/tests/inference/test_embeddings.py
+++ b/llama_stack/providers/tests/inference/test_embeddings.py
@@ -18,7 +18,7 @@ async def test_embeddings(self, inference_model, inference_stack):
         inference_impl, models_impl = inference_stack
         model = await models_impl.get_model(inference_model)

-        if model.model_type != ModelType.embedding_model:
+        if model.model_type != ModelType.embedding:
             pytest.skip("This test is only applicable for embedding models")

         response = await inference_impl.embeddings(
@@ -39,7 +39,7 @@ async def test_batch_embeddings(self, inference_model, inference_stack):
         inference_impl, models_impl = inference_stack
         model = await models_impl.get_model(inference_model)

-        if model.model_type != ModelType.embedding_model:
+        if model.model_type != ModelType.embedding:
             pytest.skip("This test is only applicable for embedding models")

         texts = ["Hello, world!", "This is a test", "Testing embeddings"]
diff --git a/llama_stack/providers/tests/memory/fixtures.py b/llama_stack/providers/tests/memory/fixtures.py
index 92fd1720e9..8eebfbefc1 100644
--- a/llama_stack/providers/tests/memory/fixtures.py
+++ b/llama_stack/providers/tests/memory/fixtures.py
@@ -125,7 +125,7 @@ async def memory_stack(inference_model, request):
         models=[
             ModelInput(
                 model_id=inference_model,
-                model_type=ModelType.embedding_model,
+                model_type=ModelType.embedding,
                 metadata={
                     "embedding_dimension": get_env_or_fail("EMBEDDING_DIMENSION"),
                 },
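The updated tests exercise the embeddings path end to end; the same call from user code looks roughly like this (a sketch assuming an `inference_impl` handle obtained from the stack, as in the fixtures above):

```python
# Sketch: embedding generation now goes through the inference API.
response = await inference_impl.embeddings(
    model_id="all-MiniLM-L6-v2",
    contents=["Hello, world!", "This is a test"],
)

# One fixed-size vector per input; 384 matches all-MiniLM-L6-v2's
# registered embedding_dimension.
assert len(response.embeddings) == 2
assert all(len(vector) == 384 for vector in response.embeddings)
```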
diff --git a/llama_stack/providers/utils/inference/model_registry.py b/llama_stack/providers/utils/inference/model_registry.py
index be2642cdb2..71eb585044 100644
--- a/llama_stack/providers/utils/inference/model_registry.py
+++ b/llama_stack/providers/utils/inference/model_registry.py
@@ -78,7 +78,7 @@ def get_llama_model(self, provider_model_id: str) -> str:
             return None

     async def register_model(self, model: Model) -> Model:
-        if model.model_type == ModelType.embedding_model:
+        if model.model_type == ModelType.embedding:
             # embedding models are always registered by their provider model id and does not need to be mapped to a llama model
             provider_resource_id = model.provider_resource_id
         else:
diff --git a/llama_stack/templates/cerebras/cerebras.py b/llama_stack/templates/cerebras/cerebras.py
index 58e05adf82..9acb244bdd 100644
--- a/llama_stack/templates/cerebras/cerebras.py
+++ b/llama_stack/templates/cerebras/cerebras.py
@@ -8,10 +8,14 @@

 from llama_models.sku_list import all_registered_models

+from llama_stack.apis.models.models import ModelType
+
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.remote.inference.cerebras import CerebrasImplConfig
 from llama_stack.providers.remote.inference.cerebras.cerebras import model_aliases
-
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -29,6 +33,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="remote::cerebras",
         config=CerebrasImplConfig.sample_run_config(),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )

     core_model_to_hf_repo = {
         m.descriptor(): m.huggingface_repo for m in all_registered_models()
@@ -37,9 +46,18 @@ def get_distribution_template() -> DistributionTemplate:
         ModelInput(
             model_id=core_model_to_hf_repo[m.llama_model],
             provider_model_id=m.provider_model_id,
+            provider_id="cerebras",
         )
         for m in model_aliases
     ]
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )

     return DistributionTemplate(
         name="cerebras",
@@ -52,9 +70,9 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                 },
-                default_models=default_models,
+                default_models=default_models + [embedding_model],
                 default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
             ),
         },
diff --git a/llama_stack/templates/cerebras/run.yaml b/llama_stack/templates/cerebras/run.yaml
index 451e2b0760..b7c2d316e4 100644
--- a/llama_stack/templates/cerebras/run.yaml
+++ b/llama_stack/templates/cerebras/run.yaml
@@ -15,6 +15,9 @@ providers:
       config:
         base_url: https://api.cerebras.ai
         api_key: ${env.CEREBRAS_API_KEY}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
@@ -49,12 +52,20 @@ metadata_store:
 models:
 - metadata: {}
   model_id: meta-llama/Llama-3.1-8B-Instruct
-  provider_id: null
+  provider_id: cerebras
   provider_model_id: llama3.1-8b
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-70B-Instruct
-  provider_id: null
+  provider_id: cerebras
   provider_model_id: llama3.1-70b
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: meta-llama/Llama-Guard-3-8B
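Note the `provider_id="cerebras"` now pinned on every generated `ModelInput`: once a distribution ships two inference providers, a model entry with `provider_id: null` can no longer be routed unambiguously. A rough paraphrase of the disambiguation rule (not the actual routing-table code):

```python
# Rough paraphrase only; the real logic lives in the routing table.
def resolve_provider_id(explicit_id: str | None, provider_ids: list[str]) -> str:
    if explicit_id is not None:
        return explicit_id  # templates now always pin this
    if len(provider_ids) == 1:
        return provider_ids[0]  # a lone provider is an unambiguous default
    raise ValueError(f"provider_id is required; candidates: {provider_ids}")
```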
diff --git a/llama_stack/templates/fireworks/fireworks.py b/llama_stack/templates/fireworks/fireworks.py
index 64387e4b7b..cbcac0f929 100644
--- a/llama_stack/templates/fireworks/fireworks.py
+++ b/llama_stack/templates/fireworks/fireworks.py
@@ -8,11 +8,15 @@

 from llama_models.sku_list import all_registered_models

+from llama_stack.apis.models.models import ModelType
+
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.fireworks import FireworksImplConfig
 from llama_stack.providers.remote.inference.fireworks.fireworks import MODEL_ALIASES
-
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -35,6 +39,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="remote::fireworks",
         config=FireworksImplConfig.sample_run_config(),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -48,9 +57,18 @@ def get_distribution_template() -> DistributionTemplate:
         ModelInput(
             model_id=core_model_to_hf_repo[m.llama_model],
             provider_model_id=m.provider_model_id,
+            provider_id="fireworks",
         )
         for m in MODEL_ALIASES
     ]
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )

     return DistributionTemplate(
         name=name,
@@ -63,10 +81,10 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=default_models,
+                default_models=default_models + [embedding_model],
                 default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
             ),
         },
diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml
index 70e2c1e5c0..cb31b46788 100644
--- a/llama_stack/templates/fireworks/run.yaml
+++ b/llama_stack/templates/fireworks/run.yaml
@@ -16,8 +16,11 @@ providers:
   - provider_id: fireworks
     provider_type: remote::fireworks
     config:
-      url: https://api.fireworks.ai/inference
+      url: https://api.fireworks.ai/inference/v1
       api_key: ${env.FIREWORKS_API_KEY}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -74,40 +77,55 @@ metadata_store:
 models:
 - metadata: {}
   model_id: meta-llama/Llama-3.1-8B-Instruct
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-v3p1-8b-instruct
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-70B-Instruct
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-v3p1-70b-instruct
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-405B-Instruct-FP8
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-v3p1-405b-instruct
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-1B-Instruct
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-v3p2-1b-instruct
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-3B-Instruct
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-v3p2-3b-instruct
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-v3p2-11b-vision-instruct
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-90B-Vision-Instruct
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-v3p2-90b-vision-instruct
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-Guard-3-8B
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-guard-3-8b
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-Guard-3-11B-Vision
-  provider_id: null
+  provider_id: fireworks
   provider_model_id: fireworks/llama-guard-3-11b-vision
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: meta-llama/Llama-Guard-3-8B
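Besides the provider and model additions, note the Fireworks base URL quietly gaining a `/v1` suffix. Fireworks serves an OpenAI-compatible API under that path, so a quick sanity check can go through the `openai` client (assumes that package and a `FIREWORKS_API_KEY`; unrelated to llama-stack itself):

```python
import os

from openai import OpenAI

# Fireworks exposes an OpenAI-compatible API at the new base URL.
client = OpenAI(
    base_url="https://api.fireworks.ai/inference/v1",
    api_key=os.environ["FIREWORKS_API_KEY"],
)
resp = client.completions.create(
    model="accounts/fireworks/models/llama-v3p1-8b-instruct",
    prompt="Say hello.",
    max_tokens=16,
)
print(resp.choices[0].text)
```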
diff --git a/llama_stack/templates/hf-endpoint/hf_endpoint.py b/llama_stack/templates/hf-endpoint/hf_endpoint.py
index 297fdae512..404440be6a 100644
--- a/llama_stack/templates/hf-endpoint/hf_endpoint.py
+++ b/llama_stack/templates/hf-endpoint/hf_endpoint.py
@@ -4,7 +4,11 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.tgi import InferenceEndpointImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -27,6 +31,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="remote::hf::endpoint",
         config=InferenceEndpointImplConfig.sample_run_config(),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -41,6 +50,14 @@ def get_distribution_template() -> DistributionTemplate:
         model_id="${env.SAFETY_MODEL}",
         provider_id="hf-endpoint-safety",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )

     return DistributionTemplate(
         name=name,
@@ -53,15 +70,16 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
             "run-with-safety.yaml": RunConfigSettings(
                 provider_overrides={
                     "inference": [
                         inference_provider,
+                        embedding_provider,
                         Provider(
                             provider_id="hf-endpoint-safety",
                             provider_type="remote::hf::endpoint",
@@ -75,6 +93,7 @@ def get_distribution_template() -> DistributionTemplate:
                 default_models=[
                     inference_model,
                     safety_model,
+                    embedding_model,
                 ],
                 default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
             ),
diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml
index 845abf0dc3..8e566de9a0 100644
--- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml
+++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml
@@ -18,6 +18,9 @@ providers:
     config:
       endpoint_name: ${env.INFERENCE_ENDPOINT_NAME}
      api_token: ${env.HF_API_TOKEN}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   - provider_id: hf-endpoint-safety
     provider_type: remote::hf::endpoint
     config:
@@ -81,10 +84,18 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: hf-endpoint
   provider_model_id: null
+  model_type: llm
 - metadata: {}
   model_id: ${env.SAFETY_MODEL}
   provider_id: hf-endpoint-safety
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: ${env.SAFETY_MODEL}
diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml
index 815ee7f03f..c1b3a64d00 100644
--- a/llama_stack/templates/hf-endpoint/run.yaml
+++ b/llama_stack/templates/hf-endpoint/run.yaml
@@ -18,6 +18,9 @@ providers:
     config:
       endpoint_name: ${env.INFERENCE_ENDPOINT_NAME}
       api_token: ${env.HF_API_TOKEN}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -76,6 +79,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: hf-endpoint
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
diff --git a/llama_stack/templates/hf-serverless/hf_serverless.py b/llama_stack/templates/hf-serverless/hf_serverless.py
index 835495bb93..63b423412f 100644
--- a/llama_stack/templates/hf-serverless/hf_serverless.py
+++ b/llama_stack/templates/hf-serverless/hf_serverless.py
@@ -4,7 +4,11 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.tgi import InferenceAPIImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -28,6 +32,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="remote::hf::serverless",
         config=InferenceAPIImplConfig.sample_run_config(),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -42,6 +51,14 @@ def get_distribution_template() -> DistributionTemplate:
         model_id="${env.SAFETY_MODEL}",
         provider_id="hf-serverless-safety",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )

     return DistributionTemplate(
         name=name,
@@ -54,15 +71,16 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
             "run-with-safety.yaml": RunConfigSettings(
                 provider_overrides={
                     "inference": [
                         inference_provider,
+                        embedding_provider,
                         Provider(
                             provider_id="hf-serverless-safety",
                             provider_type="remote::hf::serverless",
@@ -76,6 +94,7 @@ def get_distribution_template() -> DistributionTemplate:
                 default_models=[
                     inference_model,
                     safety_model,
+                    embedding_model,
                 ],
                 default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
             ),
diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml
index 82276ca8f4..2b24ab0747 100644
--- a/llama_stack/templates/hf-serverless/run-with-safety.yaml
+++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml
@@ -18,6 +18,9 @@ providers:
     config:
       huggingface_repo: ${env.INFERENCE_MODEL}
       api_token: ${env.HF_API_TOKEN}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   - provider_id: hf-serverless-safety
     provider_type: remote::hf::serverless
     config:
@@ -81,10 +84,18 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: hf-serverless
   provider_model_id: null
+  model_type: llm
 - metadata: {}
   model_id: ${env.SAFETY_MODEL}
   provider_id: hf-serverless-safety
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: ${env.SAFETY_MODEL}
diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml
index 6f87c04e27..394d689daa 100644
--- a/llama_stack/templates/hf-serverless/run.yaml
+++ b/llama_stack/templates/hf-serverless/run.yaml
@@ -18,6 +18,9 @@ providers:
     config:
       huggingface_repo: ${env.INFERENCE_MODEL}
       api_token: ${env.HF_API_TOKEN}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -76,6 +79,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: hf-serverless
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
diff --git a/llama_stack/templates/meta-reference-gpu/meta_reference.py b/llama_stack/templates/meta-reference-gpu/meta_reference.py
index 0aff9f39c8..461d89a4a5 100644
--- a/llama_stack/templates/meta-reference-gpu/meta_reference.py
+++ b/llama_stack/templates/meta-reference-gpu/meta_reference.py
@@ -6,10 +6,15 @@

 from pathlib import Path

+from llama_stack.apis.models.models import ModelType
+
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
 from llama_stack.providers.inline.inference.meta_reference import (
     MetaReferenceInferenceConfig,
 )
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -34,6 +39,11 @@ def get_distribution_template() -> DistributionTemplate:
             checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}",
         ),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -44,6 +54,14 @@
         model_id="${env.INFERENCE_MODEL}",
         provider_id="meta-reference-inference",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )
     safety_model = ModelInput(
         model_id="${env.SAFETY_MODEL}",
         provider_id="meta-reference-safety",
@@ -59,15 +77,16 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
             "run-with-safety.yaml": RunConfigSettings(
                 provider_overrides={
                     "inference": [
                         inference_provider,
+                        embedding_provider,
                         Provider(
                             provider_id="meta-reference-safety",
                             provider_type="inline::meta-reference",
@@ -82,6 +101,7 @@ def get_distribution_template() -> DistributionTemplate:
                 default_models=[
                     inference_model,
                     safety_model,
+                    embedding_model,
                 ],
                 default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
             ),
diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
index 044c1e7fde..deb6c4a912 100644
--- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
+++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
@@ -19,6 +19,9 @@ providers:
       model: ${env.INFERENCE_MODEL}
       max_seq_len: 4096
       checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:null}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   - provider_id: meta-reference-safety
     provider_type: inline::meta-reference
     config:
@@ -83,10 +86,18 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: meta-reference-inference
   provider_model_id: null
+  model_type: llm
 - metadata: {}
   model_id: ${env.SAFETY_MODEL}
   provider_id: meta-reference-safety
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: ${env.SAFETY_MODEL}
diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml
index e8fdb10c2f..c190666644 100644
--- a/llama_stack/templates/meta-reference-gpu/run.yaml
+++ b/llama_stack/templates/meta-reference-gpu/run.yaml
@@ -19,6 +19,9 @@ providers:
       model: ${env.INFERENCE_MODEL}
       max_seq_len: 4096
      checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:null}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -77,6 +80,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: meta-reference-inference
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
diff --git a/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py b/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py
index 1d611ae5f4..c460860c56 100644
--- a/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py
+++ b/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py
@@ -6,10 +6,15 @@

 from pathlib import Path

+from llama_stack.apis.models.models import ModelType
+
 from llama_stack.distribution.datatypes import ModelInput, Provider
 from llama_stack.providers.inline.inference.meta_reference import (
     MetaReferenceQuantizedInferenceConfig,
 )
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -34,6 +39,11 @@ def get_distribution_template() -> DistributionTemplate:
             checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}",
         ),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -44,6 +54,14 @@
         model_id="${env.INFERENCE_MODEL}",
         provider_id="meta-reference-inference",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )
     return DistributionTemplate(
         name=name,
         distro_type="self_hosted",
@@ -54,10 +72,10 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
         },
         run_config_env_vars={
diff --git a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml
index 0232ec51c8..550170a00d 100644
--- a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml
+++ b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml
@@ -21,6 +21,9 @@ providers:
       checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:null}
       quantization:
         type: fp8
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -79,6 +82,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: meta-reference-inference
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
diff --git a/llama_stack/templates/ollama/ollama.py b/llama_stack/templates/ollama/ollama.py
index c24dfa6e90..1e3180a775 100644
--- a/llama_stack/templates/ollama/ollama.py
+++ b/llama_stack/templates/ollama/ollama.py
@@ -6,7 +6,12 @@

 from pathlib import Path

+from llama_stack.apis.models.models import ModelType
+
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.ollama import OllamaImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -29,6 +34,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="remote::ollama",
         config=OllamaImplConfig.sample_run_config(),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -43,6 +53,14 @@ def get_distribution_template() -> DistributionTemplate:
         model_id="${env.SAFETY_MODEL}",
         provider_id="ollama",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )

     return DistributionTemplate(
         name=name,
@@ -55,21 +73,23 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
             "run-with-safety.yaml": RunConfigSettings(
                 provider_overrides={
                     "inference": [
                         inference_provider,
+                        embedding_provider,
                     ],
                     "memory": [memory_provider],
                 },
                 default_models=[
                     inference_model,
                     safety_model,
+                    embedding_model,
                 ],
                 default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
             ),
diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml
index fcb1b2dbac..100886c958 100644
--- a/llama_stack/templates/ollama/run-with-safety.yaml
+++ b/llama_stack/templates/ollama/run-with-safety.yaml
@@ -17,6 +17,9 @@ providers:
     provider_type: remote::ollama
     config:
       url: ${env.OLLAMA_URL:http://localhost:11434}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -75,10 +78,18 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: ollama
   provider_model_id: null
+  model_type: llm
 - metadata: {}
   model_id: ${env.SAFETY_MODEL}
   provider_id: ollama
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: ${env.SAFETY_MODEL}
diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml
index 2e739aac24..bcbed3e6ef 100644
--- a/llama_stack/templates/ollama/run.yaml
+++ b/llama_stack/templates/ollama/run.yaml
@@ -17,6 +17,9 @@ providers:
     provider_type: remote::ollama
     config:
       url: ${env.OLLAMA_URL:http://localhost:11434}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -75,6 +78,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: ollama
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml
index ac8cf6f4a7..7097bc6496 100644
--- a/llama_stack/templates/remote-vllm/run-with-safety.yaml
+++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml
@@ -22,6 +22,9 @@ providers:
       url: ${env.SAFETY_VLLM_URL}
       max_tokens: ${env.VLLM_MAX_TOKENS:4096}
       api_token: ${env.VLLM_API_TOKEN:fake}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -58,10 +61,18 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: vllm-inference
   provider_model_id: null
+  model_type: llm
 - metadata: {}
   model_id: ${env.SAFETY_MODEL}
   provider_id: vllm-safety
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: ${env.SAFETY_MODEL}
diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml
index 27c5df53c4..c957b05d08 100644
--- a/llama_stack/templates/remote-vllm/run.yaml
+++ b/llama_stack/templates/remote-vllm/run.yaml
@@ -16,6 +16,9 @@ providers:
       url: ${env.VLLM_URL}
       max_tokens: ${env.VLLM_MAX_TOKENS:4096}
       api_token: ${env.VLLM_API_TOKEN:fake}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -52,6 +55,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: vllm-inference
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
diff --git a/llama_stack/templates/remote-vllm/vllm.py b/llama_stack/templates/remote-vllm/vllm.py
index f5ccfcf16e..e4c948fbfa 100644
--- a/llama_stack/templates/remote-vllm/vllm.py
+++ b/llama_stack/templates/remote-vllm/vllm.py
@@ -6,7 +6,12 @@

 from pathlib import Path

+from llama_stack.apis.models.models import ModelType
+
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.vllm import VLLMInferenceAdapterConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -28,6 +33,11 @@ def get_distribution_template() -> DistributionTemplate:
             url="${env.VLLM_URL}",
         ),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -42,6 +52,14 @@ def get_distribution_template() -> DistributionTemplate:
         model_id="${env.SAFETY_MODEL}",
         provider_id="vllm-safety",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )

     return DistributionTemplate(
         name=name,
@@ -53,10 +71,10 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
             "run-with-safety.yaml": RunConfigSettings(
                 provider_overrides={
@@ -69,12 +87,14 @@ def get_distribution_template() -> DistributionTemplate:
                                 url="${env.SAFETY_VLLM_URL}",
                             ),
                         ),
+                        embedding_provider,
                     ],
                     "memory": [memory_provider],
                 },
                 default_models=[
                     inference_model,
                     safety_model,
+                    embedding_model,
                 ],
                 default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
             ),
diff --git a/llama_stack/templates/template.py b/llama_stack/templates/template.py
index e82be63947..0ec8c1f09d 100644
--- a/llama_stack/templates/template.py
+++ b/llama_stack/templates/template.py
@@ -11,6 +11,7 @@
 import yaml
 from pydantic import BaseModel, Field

+from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import (
     Api,
     BuildConfig,
@@ -146,6 +147,13 @@ def generate_markdown_docs(self) -> str:
         )

     def save_distribution(self, yaml_output_dir: Path, doc_output_dir: Path) -> None:
+        def enum_representer(dumper, data):
+            return dumper.represent_scalar("tag:yaml.org,2002:str", data.value)
+
+        # Register YAML representer for ModelType
+        yaml.add_representer(ModelType, enum_representer)
+        yaml.SafeDumper.add_representer(ModelType, enum_representer)
+
         for output_dir in [yaml_output_dir, doc_output_dir]:
             output_dir.mkdir(parents=True, exist_ok=True)
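The representer registered in `save_distribution` above is what keeps the new `model_type` fields readable: without it, PyYAML serializes a `(str, Enum)` member as a tagged Python object rather than a plain scalar. A standalone sketch of the problem and the fix:

```python
from enum import Enum

import yaml


class ModelType(str, Enum):
    llm = "llm"
    embedding = "embedding"


# Without a representer, yaml.dump tags the member as a Python object,
# leaving "!!python/object/apply:..." noise in the generated run.yaml.
def enum_representer(dumper, data):
    return dumper.represent_scalar("tag:yaml.org,2002:str", data.value)


yaml.add_representer(ModelType, enum_representer)
yaml.SafeDumper.add_representer(ModelType, enum_representer)

print(yaml.dump({"model_type": ModelType.embedding}))  # model_type: embedding
```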
diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml
index a7375a90ff..ef8344a7ad 100644
--- a/llama_stack/templates/tgi/run-with-safety.yaml
+++ b/llama_stack/templates/tgi/run-with-safety.yaml
@@ -79,10 +79,12 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: tgi-inference
   provider_model_id: null
+  model_type: llm
 - metadata: {}
   model_id: ${env.SAFETY_MODEL}
   provider_id: tgi-safety
   provider_model_id: null
+  model_type: llm
 shields:
 - params: null
   shield_id: ${env.SAFETY_MODEL}
diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml
index a3e21075ff..22c08d1d3a 100644
--- a/llama_stack/templates/tgi/run.yaml
+++ b/llama_stack/templates/tgi/run.yaml
@@ -17,6 +17,9 @@ providers:
     provider_type: remote::tgi
     config:
       url: ${env.TGI_URL}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -75,6 +78,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: tgi-inference
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
diff --git a/llama_stack/templates/tgi/tgi.py b/llama_stack/templates/tgi/tgi.py
index 83818a598f..c84f5b5feb 100644
--- a/llama_stack/templates/tgi/tgi.py
+++ b/llama_stack/templates/tgi/tgi.py
@@ -6,7 +6,12 @@

 from pathlib import Path

+from llama_stack.apis.models.models import ModelType
+
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.tgi import TGIImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -31,6 +36,11 @@ def get_distribution_template() -> DistributionTemplate:
             url="${env.TGI_URL}",
         ),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     memory_provider = Provider(
         provider_id="faiss",
         provider_type="inline::faiss",
@@ -41,6 +51,14 @@ def get_distribution_template() -> DistributionTemplate:
         model_id="${env.INFERENCE_MODEL}",
         provider_id="tgi-inference",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )
     safety_model = ModelInput(
         model_id="${env.SAFETY_MODEL}",
         provider_id="tgi-safety",
@@ -57,10 +75,10 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
             "run-with-safety.yaml": RunConfigSettings(
                 provider_overrides={
diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml
index 529bf78730..9f02d8b549 100644
--- a/llama_stack/templates/together/run.yaml
+++ b/llama_stack/templates/together/run.yaml
@@ -18,6 +18,9 @@ providers:
     config:
       url: https://api.together.xyz/v1
       api_key: ${env.TOGETHER_API_KEY}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -74,36 +77,50 @@ metadata_store:
 models:
 - metadata: {}
   model_id: meta-llama/Llama-3.1-8B-Instruct
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-70B-Instruct
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-405B-Instruct-FP8
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-3B-Instruct
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-90B-Vision-Instruct
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-Guard-3-8B
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Meta-Llama-Guard-3-8B
+  model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-Guard-3-11B-Vision
-  provider_id: null
+  provider_id: together
   provider_model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields:
 - params: null
   shield_id: meta-llama/Llama-Guard-3-8B
diff --git a/llama_stack/templates/together/together.py b/llama_stack/templates/together/together.py
index 6656cfe440..994cf55498 100644
--- a/llama_stack/templates/together/together.py
+++ b/llama_stack/templates/together/together.py
@@ -8,11 +8,15 @@

 from llama_models.sku_list import all_registered_models

+from llama_stack.apis.models.models import ModelType
+
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.together import TogetherImplConfig
 from llama_stack.providers.remote.inference.together.together import MODEL_ALIASES
-
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -38,6 +42,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="inline::faiss",
         config=FaissImplConfig.sample_run_config(f"distributions/{name}"),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )

     core_model_to_hf_repo = {
         m.descriptor(): m.huggingface_repo for m in all_registered_models()
@@ -46,9 +55,18 @@ def get_distribution_template() -> DistributionTemplate:
         ModelInput(
             model_id=core_model_to_hf_repo[m.llama_model],
             provider_model_id=m.provider_model_id,
+            provider_id="together",
         )
         for m in MODEL_ALIASES
     ]
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )

     return DistributionTemplate(
         name=name,
@@ -61,10 +79,10 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=default_models,
+                default_models=default_models + [embedding_model],
                 default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
             ),
         },
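The together template is the run.yaml the new routing-table error message points users at; once it serves `all-MiniLM-L6-v2` through the inline provider, memory banks can reference that model again. A hypothetical client-side sketch (the parameter names mirror `register_memory_bank` above, but the exact call shape is an assumption):

```python
# Hypothetical sketch: registering a vector memory bank against the
# inference-served embedding model; the call shape is an assumption.
client.memory_banks.register(
    memory_bank_id="my-docs",
    params={
        "memory_bank_type": "vector",
        "embedding_model": "all-MiniLM-L6-v2",  # must be a registered embedding model
        "chunk_size_in_tokens": 512,
    },
)
```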
diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml
index 8353dbd513..171f25d632 100644
--- a/llama_stack/templates/vllm-gpu/run.yaml
+++ b/llama_stack/templates/vllm-gpu/run.yaml
@@ -21,6 +21,9 @@ providers:
       max_tokens: ${env.MAX_TOKENS:4096}
       enforce_eager: ${env.ENFORCE_EAGER:False}
       gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:0.7}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -79,6 +82,13 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: vllm
   provider_model_id: null
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  provider_model_id: null
+  model_type: embedding
 shields: []
 memory_banks: []
 datasets: []
diff --git a/llama_stack/templates/vllm-gpu/vllm.py b/llama_stack/templates/vllm-gpu/vllm.py
index 10b448b5cb..fe6fb7186a 100644
--- a/llama_stack/templates/vllm-gpu/vllm.py
+++ b/llama_stack/templates/vllm-gpu/vllm.py
@@ -4,7 +4,11 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import ModelInput, Provider
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.inference.vllm import VLLMConfig
 from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -32,11 +36,24 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="inline::faiss",
         config=FaissImplConfig.sample_run_config(f"distributions/{name}"),
     )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
     inference_model = ModelInput(
         model_id="${env.INFERENCE_MODEL}",
         provider_id="vllm",
     )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )

     return DistributionTemplate(
         name=name,
@@ -49,10 +66,10 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "memory": [memory_provider],
                 },
-                default_models=[inference_model],
+                default_models=[inference_model, embedding_model],
             ),
         },
         run_config_env_vars={