diff --git a/distributions/inline-nvidia/build.yaml b/distributions/inline-nvidia/build.yaml
new file mode 120000
index 0000000000..8903d2e572
--- /dev/null
+++ b/distributions/inline-nvidia/build.yaml
@@ -0,0 +1 @@
+../../llama_stack/templates/nvidia/build.yaml
\ No newline at end of file
diff --git a/distributions/inline-nvidia/compose.yaml b/distributions/inline-nvidia/compose.yaml
new file mode 100644
index 0000000000..f7320b9686
--- /dev/null
+++ b/distributions/inline-nvidia/compose.yaml
@@ -0,0 +1,58 @@
+services:
+  nim:
+    image: nvcr.io/nim/meta/llama-3.1-8b-instruct:latest
+    network_mode: "host"
+    volumes:
+      - nim-llm-cache:/opt/nim/.cache
+    ports:
+      - "8000:8000"
+    shm_size: 16G
+    environment:
+      - CUDA_VISIBLE_DEVICES=0
+      - NIM_HTTP_API_PORT=8000
+      - NIM_TRITON_LOG_VERBOSE=1
+      - NGC_API_KEY=${NIM_NGC_API_KEY:-${NGC_API_KEY:-ngcapikey}}
+    command: []
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              # that's the closest analogue to --gpus; provide
+              # an integer amount of devices or 'all'
+              count: 1
+              # Devices are reserved using a list of capabilities, making
+              # capabilities the only required field. A device MUST
+              # satisfy all the requested capabilities for a successful
+              # reservation.
+              capabilities: [gpu]
+    runtime: nvidia
+    healthcheck:
+      test: ["CMD", "curl", "http://localhost:8000/v1/health/ready"]
+      interval: 5s
+      timeout: 5s
+      retries: 30
+      start_period: 120s
+  llamastack:
+    depends_on:
+      - nim
+    image: distribution-nvidia:dev
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      - ./run.yaml:/root/llamastack-run-nvidia.yaml
+    ports:
+      - "5000:5000"
+    environment:
+      - INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
+      - NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
+    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml-config /root/llamastack-run-nvidia.yaml"
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
+volumes:
+  nim-llm-cache:
+    driver: local
\ No newline at end of file
diff --git a/distributions/inline-nvidia/run.yaml b/distributions/inline-nvidia/run.yaml
new file mode 100644
index 0000000000..81e9e7f1c0
--- /dev/null
+++ b/distributions/inline-nvidia/run.yaml
@@ -0,0 +1,56 @@
+version: '2'
+image_name: nvidia
+docker_image: null
+conda_env: nvidia
+apis:
+- agents
+- inference
+- memory
+- safety
+- telemetry
+providers:
+  inference:
+  - provider_id: nvidia
+    provider_type: remote::nvidia
+    config:
+      url: http://localhost:8000
+      api_key: ${env.NVIDIA_API_KEY}
+  memory:
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/faiss_store.db
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config: {}
+  agents:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/agents_store.db
+  telemetry:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config: {}
+metadata_store:
+  namespace: null
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/registry.db
+models:
+- metadata: {}
+  model_id: ${env.INFERENCE_MODEL}
+  provider_id: nvidia
+  provider_model_id: null
+shields: []
+memory_banks: []
+datasets: []
+scoring_fns: []
+eval_tasks: []
+
diff --git a/distributions/remote-nvidia/build.yaml b/distributions/remote-nvidia/build.yaml
new file mode 120000
index 0000000000..8903d2e572
--- /dev/null
+++ b/distributions/remote-nvidia/build.yaml
@@ -0,0 +1 @@
+../../llama_stack/templates/nvidia/build.yaml
\ No newline at end of file
diff --git a/distributions/remote-nvidia/compose.yaml b/distributions/remote-nvidia/compose.yaml
new file mode 100644
index 0000000000..04b12d0da2
--- /dev/null
+++ b/distributions/remote-nvidia/compose.yaml
@@ -0,0 +1,19 @@
+services:
+  llamastack:
+    image: distribution-nvidia:dev
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      - ./run.yaml:/root/llamastack-run-nvidia.yaml
+    ports:
+      - "5000:5000"
+    environment:
+      - INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
+      - NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
+    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml-config /root/llamastack-run-nvidia.yaml"
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
diff --git a/distributions/remote-nvidia/run.yaml b/distributions/remote-nvidia/run.yaml
new file mode 120000
index 0000000000..85da3e26bd
--- /dev/null
+++ b/distributions/remote-nvidia/run.yaml
@@ -0,0 +1 @@
+../../llama_stack/templates/nvidia/run.yaml
\ No newline at end of file
diff --git a/docs/source/distributions/index.md b/docs/source/distributions/index.md
index b61e9b28f5..e88960a848 100644
--- a/docs/source/distributions/index.md
+++ b/docs/source/distributions/index.md
@@ -24,6 +24,7 @@ If so, we suggest:
   - {dockerhub}`distribution-remote-vllm` ([Guide](self_hosted_distro/remote-vllm))
   - {dockerhub}`distribution-meta-reference-gpu` ([Guide](self_hosted_distro/meta-reference-gpu))
   - {dockerhub}`distribution-tgi` ([Guide](self_hosted_distro/tgi))
+  - {dockerhub}`distribution-nvidia` ([Guide](self_hosted_distro/nvidia))
 
 - **Are you running on a "regular" desktop machine?**
   If so, we suggest:
diff --git a/docs/source/distributions/self_hosted_distro/nvidia.md b/docs/source/distributions/self_hosted_distro/nvidia.md
new file mode 100644
index 0000000000..2f34997166
--- /dev/null
+++ b/docs/source/distributions/self_hosted_distro/nvidia.md
@@ -0,0 +1,60 @@
+# NVIDIA Distribution
+
+The `llamastack/distribution-nvidia` distribution consists of the following provider configurations.
+
+| API | Provider(s) |
+|-----|-------------|
+| agents | `inline::meta-reference` |
+| inference | `remote::nvidia` |
+| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
+| safety | `inline::llama-guard` |
+| telemetry | `inline::meta-reference` |
+
+
+### Environment Variables
+
+The following environment variables can be configured:
+
+- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
+
+### Models
+
+The following models are available by default:
+
+- `${env.INFERENCE_MODEL} (None)`
+
+
+### Prerequisite: API Keys
+
+Make sure you have access to an NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).
+
+
+## Running Llama Stack with NVIDIA
+
+You can do this via Conda (build code) or Docker, which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+LLAMA_STACK_PORT=5001
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ./run.yaml:/root/my-run.yaml \
+  llamastack/distribution-nvidia \
+  --yaml-config /root/my-run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
+```
+
+### Via Conda
+
+```bash
+llama stack build --template nvidia --image-type conda
+llama stack run ./run.yaml \
+  --port 5001 \
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
+```
\ No newline at end of file
diff --git a/llama_stack/templates/nvidia/__init__.py b/llama_stack/templates/nvidia/__init__.py
new file mode 100644
index 0000000000..24e2fbd216
--- /dev/null
+++ b/llama_stack/templates/nvidia/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .nvidia import get_distribution_template  # noqa: F401
diff --git a/llama_stack/templates/nvidia/build.yaml b/llama_stack/templates/nvidia/build.yaml
new file mode 100644
index 0000000000..9a735c2203
--- /dev/null
+++ b/llama_stack/templates/nvidia/build.yaml
@@ -0,0 +1,19 @@
+version: '2'
+name: nvidia
+distribution_spec:
+  description: Use NVIDIA NIM for running LLM inference
+  docker_image: null
+  providers:
+    inference:
+    - remote::nvidia
+    memory:
+    - inline::faiss
+    - remote::chromadb
+    - remote::pgvector
+    safety:
+    - inline::llama-guard
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+image_type: conda
diff --git a/llama_stack/templates/nvidia/doc_template.md b/llama_stack/templates/nvidia/doc_template.md
new file mode 100644
index 0000000000..949018f8d6
--- /dev/null
+++ b/llama_stack/templates/nvidia/doc_template.md
@@ -0,0 +1,60 @@
+# NVIDIA Distribution
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+{{ providers_table }}
+
+{% if run_config_env_vars %}
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in run_config_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+{% if default_models %}
+### Models
+
+The following models are available by default:
+
+{% for model in default_models %}
+- `{{ model.model_id }} ({{ model.provider_model_id }})`
+{% endfor %}
+{% endif %}
+
+
+### Prerequisite: API Keys
+
+Make sure you have access to an NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).
+
+
+## Running Llama Stack with NVIDIA
+
+You can do this via Conda (build code) or Docker, which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+LLAMA_STACK_PORT=5001
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ./run.yaml:/root/my-run.yaml \
+  llamastack/distribution-{{ name }} \
+  --yaml-config /root/my-run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
+```
+
+### Via Conda
+
+```bash
+llama stack build --template nvidia --image-type conda
+llama stack run ./run.yaml \
+  --port 5001 \
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
+```
diff --git a/llama_stack/templates/nvidia/nvidia.py b/llama_stack/templates/nvidia/nvidia.py
new file mode 100644
index 0000000000..22aa1f4b08
--- /dev/null
+++ b/llama_stack/templates/nvidia/nvidia.py
@@ -0,0 +1,62 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from pathlib import Path
+
+from llama_stack.distribution.datatypes import ModelInput, Provider
+from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig
+from llama_stack.providers.remote.inference.nvidia.nvidia import _MODEL_ALIASES
+
+from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
+
+
+def get_distribution_template() -> DistributionTemplate:
+    providers = {
+        "inference": ["remote::nvidia"],
+        "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
+        "safety": ["inline::llama-guard"],
+        "agents": ["inline::meta-reference"],
+        "telemetry": ["inline::meta-reference"],
+    }
+
+    inference_provider = Provider(
+        provider_id="nvidia",
+        provider_type="remote::nvidia",
+        config=NVIDIAConfig.sample_run_config(),
+    )
+
+    inference_model = ModelInput(
+        model_id="${env.INFERENCE_MODEL}",
+        provider_id="nvidia",
+    )
+
+    return DistributionTemplate(
+        name="nvidia",
+        distro_type="remote_hosted",
+        description="Use NVIDIA NIM for running LLM inference",
+        docker_image=None,
+        template_path=Path(__file__).parent / "doc_template.md",
+        providers=providers,
+        default_models=[inference_model],
+        run_configs={
+            "run.yaml": RunConfigSettings(
+                provider_overrides={
+                    "inference": [inference_provider],
+                },
+                default_models=[inference_model],
+            ),
+        },
+        run_config_env_vars={
+            "LLAMASTACK_PORT": (
+                "5001",
+                "Port for the Llama Stack distribution server",
+            ),
+            "NVIDIA_API_KEY": (
+                "",
+                "NVIDIA API Key",
+            ),
+        },
+    )
diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml
new file mode 100644
index 0000000000..f4953852f5
--- /dev/null
+++ b/llama_stack/templates/nvidia/run.yaml
@@ -0,0 +1,55 @@
+version: '2'
+image_name: nvidia
+docker_image: null
+conda_env: nvidia
+apis:
+- agents
+- inference
+- memory
+- safety
+- telemetry
+providers:
+  inference:
+  - provider_id: nvidia
+    provider_type: remote::nvidia
+    config:
+      url: https://integrate.api.nvidia.com
+      api_key: ${env.NVIDIA_API_KEY}
+  memory:
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/faiss_store.db
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config: {}
+  agents:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/agents_store.db
+  telemetry:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config: {}
+metadata_store:
+  namespace: null
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/registry.db
+models:
+- metadata: {}
+  model_id: ${env.INFERENCE_MODEL}
+  provider_id: nvidia
+  provider_model_id: null
+shields: []
+memory_banks: []
+datasets: []
+scoring_fns: []
+eval_tasks: []
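
A minimal smoke test for the inline-nvidia compose stack, assuming the files above are checked out under `distributions/inline-nvidia/` and a real NGC API key is available (without one, the compose file falls back to the placeholder `ngcapikey` and the NIM container will not start serving):

```bash
# Run from distributions/inline-nvidia/
export NGC_API_KEY=<your-ngc-api-key>   # placeholder; needed by the nim service
docker compose up -d

# Poll the same readiness URL the compose healthcheck uses for the NIM service
curl http://localhost:8000/v1/health/ready
```

Once the NIM healthcheck passes, the `llamastack` service starts with the bundled `run.yaml` (which points inference at `http://localhost:8000`) and listens on port 5000 per the compose ports mapping.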