Inference to use provider resource id to register and validate #428

Merged · 13 commits · Nov 13, 2024
50 changes: 25 additions & 25 deletions docs/resources/llama-stack-spec.html
@@ -21,7 +21,7 @@
 "info": {
 "title": "[DRAFT] Llama Stack Specification",
 "version": "0.0.1",
-"description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-11-12 11:39:48.665782"
+"description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-11-12 15:47:15.607543"
 },
 "servers": [
 {
@@ -2856,7 +2856,7 @@
 "ChatCompletionRequest": {
 "type": "object",
 "properties": {
-"model": {
+"model_id": {
 "type": "string"
 },
 "messages": {
@@ -2993,7 +2993,7 @@
 },
 "additionalProperties": false,
 "required": [
-"model",
+"model_id",
 "messages"
 ]
 },
@@ -3120,7 +3120,7 @@
 "CompletionRequest": {
 "type": "object",
 "properties": {
-"model": {
+"model_id": {
 "type": "string"
 },
 "content": {
@@ -3249,7 +3249,7 @@
 },
 "additionalProperties": false,
 "required": [
-"model",
+"model_id",
 "content"
 ]
 },
@@ -4552,7 +4552,7 @@
 "EmbeddingsRequest": {
 "type": "object",
 "properties": {
-"model": {
+"model_id": {
 "type": "string"
 },
 "contents": {
@@ -4584,7 +4584,7 @@
 },
 "additionalProperties": false,
 "required": [
-"model",
+"model_id",
 "contents"
 ]
 },
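Taken together, ChatCompletionRequest, CompletionRequest, and EmbeddingsRequest now all require model_id in place of model. As a minimal sketch, request bodies that validate against the updated schemas look like this (the model identifiers are examples, not values from this PR):

```python
# Hypothetical payloads illustrating the renamed required field.
chat_completion_request = {
    "model_id": "Llama3.1-8B-Instruct",  # previously "model"
    "messages": [{"role": "user", "content": "Hello"}],
}

completion_request = {
    "model_id": "Llama3.1-8B-Instruct",  # previously "model"
    "content": "The moon is",
}

embeddings_request = {
    "model_id": "Llama3.1-8B-Instruct",  # previously "model"
    "contents": ["The moon is bright tonight."],
}
```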
@@ -7837,58 +7837,58 @@
 ],
 "tags": [
 {
-"name": "MemoryBanks"
+"name": "Safety"
 },
 {
-"name": "BatchInference"
+"name": "EvalTasks"
 },
 {
-"name": "Agents"
+"name": "Shields"
 },
 {
-"name": "Inference"
+"name": "Telemetry"
 },
 {
-"name": "DatasetIO"
+"name": "Memory"
 },
 {
-"name": "Eval"
+"name": "Scoring"
 },
 {
-"name": "Models"
+"name": "ScoringFunctions"
 },
 {
-"name": "PostTraining"
+"name": "SyntheticDataGeneration"
 },
 {
-"name": "ScoringFunctions"
+"name": "Models"
 },
 {
-"name": "Datasets"
+"name": "Agents"
 },
 {
-"name": "Shields"
+"name": "MemoryBanks"
 },
 {
-"name": "Telemetry"
+"name": "DatasetIO"
 },
 {
-"name": "Inspect"
+"name": "Inference"
 },
 {
-"name": "Safety"
+"name": "Datasets"
 },
 {
-"name": "SyntheticDataGeneration"
+"name": "PostTraining"
 },
 {
-"name": "Memory"
+"name": "BatchInference"
 },
 {
-"name": "Scoring"
+"name": "Eval"
 },
 {
-"name": "EvalTasks"
+"name": "Inspect"
 },
 {
 "name": "BuiltinTool",
42 changes: 21 additions & 21 deletions docs/resources/llama-stack-spec.yaml
@@ -396,7 +396,7 @@ components:
 - $ref: '#/components/schemas/ToolResponseMessage'
 - $ref: '#/components/schemas/CompletionMessage'
 type: array
-model:
+model_id:
 type: string
 response_format:
 oneOf:
@@ -453,7 +453,7 @@
 $ref: '#/components/schemas/ToolDefinition'
 type: array
 required:
-- model
+- model_id
 - messages
 type: object
 ChatCompletionResponse:
@@ -577,7 +577,7 @@
 default: 0
 type: integer
 type: object
-model:
+model_id:
 type: string
 response_format:
 oneOf:
@@ -626,7 +626,7 @@
 stream:
 type: boolean
 required:
-- model
+- model_id
 - content
 type: object
 CompletionResponse:
@@ -903,10 +903,10 @@
 - $ref: '#/components/schemas/ImageMedia'
 type: array
 type: array
-model:
+model_id:
 type: string
 required:
-- model
+- model_id
 - contents
 type: object
 EmbeddingsResponse:
@@ -3384,7 +3384,7 @@ info:
 description: "This is the specification of the llama stack that provides\n \
 \ a set of endpoints and their corresponding interfaces that are tailored\
 \ to\n best leverage Llama Models. The specification is still in
-\ draft and subject to change.\n Generated at 2024-11-12 11:39:48.665782"
+\ draft and subject to change.\n Generated at 2024-11-12 15:47:15.607543"
 title: '[DRAFT] Llama Stack Specification'
 version: 0.0.1
 jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema
@@ -4748,24 +4748,24 @@ security:
 servers:
 - url: http://any-hosted-llama-stack.com
 tags:
-- name: MemoryBanks
-- name: BatchInference
-- name: Agents
-- name: Inference
-- name: DatasetIO
-- name: Eval
-- name: Models
-- name: PostTraining
-- name: ScoringFunctions
-- name: Datasets
+- name: Safety
+- name: EvalTasks
 - name: Shields
 - name: Telemetry
-- name: Inspect
-- name: Safety
-- name: SyntheticDataGeneration
 - name: Memory
 - name: Scoring
-- name: EvalTasks
+- name: ScoringFunctions
+- name: SyntheticDataGeneration
+- name: Models
+- name: Agents
+- name: MemoryBanks
+- name: DatasetIO
+- name: Inference
+- name: Datasets
+- name: PostTraining
+- name: BatchInference
+- name: Eval
+- name: Inspect
 - description: <SchemaDefinition schemaRef="#/components/schemas/BuiltinTool" />
   name: BuiltinTool
 - description: <SchemaDefinition schemaRef="#/components/schemas/CompletionMessage"
2 changes: 1 addition & 1 deletion docs/source/getting_started/index.md
@@ -538,7 +538,7 @@ Once the server is set up, we can test it with a client to verify it's working correctly.
 $ curl http://localhost:5000/inference/chat_completion \
 -H "Content-Type: application/json" \
 -d '{
-  "model": "Llama3.1-8B-Instruct",
+  "model_id": "Llama3.1-8B-Instruct",
   "messages": [
     {"role": "system", "content": "You are a helpful assistant."},
     {"role": "user", "content": "Write me a 2 sentence poem about the moon"}
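The same request from Python, as a small sketch using the requests package (same local server and model as the curl example above; not part of the diff):

```python
import requests

# Mirrors the curl call above after the rename: the body key is "model_id".
response = requests.post(
    "http://localhost:5000/inference/chat_completion",
    headers={"Content-Type": "application/json"},
    json={
        "model_id": "Llama3.1-8B-Instruct",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Write me a 2 sentence poem about the moon"},
        ],
    },
)
print(response.text)
```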
6 changes: 3 additions & 3 deletions llama_stack/apis/inference/inference.py
@@ -226,7 +226,7 @@ class Inference(Protocol):
     @webmethod(route="/inference/completion")
     async def completion(
         self,
-        model: str,
+        model_id: str,
         content: InterleavedTextMedia,
         sampling_params: Optional[SamplingParams] = SamplingParams(),
         response_format: Optional[ResponseFormat] = None,
@@ -237,7 +237,7 @@ async def completion(
     @webmethod(route="/inference/chat_completion")
     async def chat_completion(
         self,
-        model: str,
+        model_id: str,
Inline review thread on this line (yanxi0830, Contributor, Nov 13, 2024):

Contributor: llama-stack-apps and llama-stack-client-python also need to be updated to reflect the model -> model_id change.
https://github.com/meta-llama/llama-stack-apps/blob/0dc9c42fb42bf21d35e6d231afc4e0360a9eac61/examples/inference/client.py#L46-L49

Author: done.

         messages: List[Message],
         sampling_params: Optional[SamplingParams] = SamplingParams(),
         # zero-shot tool definitions as input to the model
@@ -254,6 +254,6 @@ async def chat_completion(
     @webmethod(route="/inference/embeddings")
     async def embeddings(
         self,
-        model: str,
+        model_id: str,
         contents: List[InterleavedTextMedia],
     ) -> EmbeddingsResponse: ...
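For callers, the visible effect of this change is the keyword: every Inference entry point now takes model_id. A sketch of driving the updated protocol (it assumes impl is any object satisfying the protocol; the import path follows the file above, and the model identifier is illustrative):

```python
from llama_stack.apis.inference import Inference


async def smoke_test(impl: Inference) -> None:
    # Both calls now pass `model_id` where `model` was accepted before.
    completion = await impl.completion(
        model_id="Llama3.1-8B-Instruct",
        content="The moon is",
    )
    embeddings = await impl.embeddings(
        model_id="Llama3.1-8B-Instruct",
        contents=["The moon is bright tonight."],
    )
    print(completion, embeddings)
```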
18 changes: 9 additions & 9 deletions llama_stack/distribution/routers/routers.py
@@ -95,7 +95,7 @@ async def register_model(

     async def chat_completion(
         self,
-        model: str,
+        model_id: str,
         messages: List[Message],
         sampling_params: Optional[SamplingParams] = SamplingParams(),
         response_format: Optional[ResponseFormat] = None,
@@ -106,7 +106,7 @@ async def chat_completion(
         logprobs: Optional[LogProbConfig] = None,
     ) -> AsyncGenerator:
         params = dict(
-            model=model,
+            model_id=model_id,
             messages=messages,
             sampling_params=sampling_params,
             tools=tools or [],
@@ -116,24 +116,24 @@ async def chat_completion(
             stream=stream,
             logprobs=logprobs,
         )
-        provider = self.routing_table.get_provider_impl(model)
+        provider = self.routing_table.get_provider_impl(model_id)
         if stream:
             return (chunk async for chunk in await provider.chat_completion(**params))
         else:
             return await provider.chat_completion(**params)

     async def completion(
         self,
-        model: str,
+        model_id: str,
         content: InterleavedTextMedia,
         sampling_params: Optional[SamplingParams] = SamplingParams(),
         response_format: Optional[ResponseFormat] = None,
         stream: Optional[bool] = False,
         logprobs: Optional[LogProbConfig] = None,
     ) -> AsyncGenerator:
-        provider = self.routing_table.get_provider_impl(model)
+        provider = self.routing_table.get_provider_impl(model_id)
         params = dict(
-            model=model,
+            model_id=model_id,
             content=content,
             sampling_params=sampling_params,
             response_format=response_format,
@@ -147,11 +147,11 @@ async def completion(

     async def embeddings(
         self,
-        model: str,
+        model_id: str,
         contents: List[InterleavedTextMedia],
     ) -> EmbeddingsResponse:
-        return await self.routing_table.get_provider_impl(model).embeddings(
-            model=model,
+        return await self.routing_table.get_provider_impl(model_id).embeddings(
+            model_id=model_id,
             contents=contents,
         )

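The router treats model_id purely as a routing key: it resolves the provider that registered that id and forwards the call. A self-contained sketch of the contract get_provider_impl relies on (a hypothetical stand-in, not the actual RoutingTable implementation):

```python
from typing import Any, Dict


class ToyRoutingTable:
    """Hypothetical stand-in illustrating get_provider_impl(model_id)."""

    def __init__(self) -> None:
        self._impls: Dict[str, Any] = {}

    def register_model(self, model_id: str, provider_impl: Any) -> None:
        # The provider resource id is the key used for routing.
        self._impls[model_id] = provider_impl

    def get_provider_impl(self, model_id: str) -> Any:
        if model_id not in self._impls:
            raise ValueError(f"No provider registered for model_id={model_id!r}")
        return self._impls[model_id]
```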