feat: Add model version #297

Closed
wants to merge 23 commits
6 changes: 6 additions & 0 deletions .github/workflows/e2e-preset-test.yml
@@ -48,6 +48,7 @@ jobs:
run: |
PR_BRANCH=${{ env.BRANCH_NAME }} \
FORCE_RUN_ALL=${{ env.FORCE_RUN_ALL }} \
PR_REPO_URL=${{ github.event.pull_request.head.repo.clone_url }} \
python3 .github/workflows/kind-cluster/determine_models.py

- name: Print Determined Models
@@ -274,6 +275,11 @@ jobs:
if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true')
run: |
curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/healthz

- name: Test version endpoint
if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true')
run: |
curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/version

- name: Test inference endpoint
if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && (steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' || env.FORCE_RUN_ALL == 'true')
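Note: the /version endpoint exercised here just serves the model_info.json that the updated Dockerfiles bake into each image (see the Dockerfile changes below), so the curl call should return a small JSON document. An illustrative response for the falcon-7b image, using the tag and commit URL recorded in supported_models.yaml and a placeholder service IP:

curl http://<SERVICE_IP>:80/version
{"Model Type": "text-generation", "Version": "0.0.5", "Image Name": "falcon-7b", "Model Version URL": "https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36", "REVISION_ID": "898df1396f35e447d5fe44e0a3ccaaaa69f30d36"}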
14 changes: 11 additions & 3 deletions .github/workflows/kind-cluster/determine_models.py
@@ -90,7 +90,7 @@ def models_to_build(files_changed):
seen_model_types.add(model_info["type"])
return list(models)

def check_modified_models(pr_branch):
def check_modified_models(pr_branch, pr_repo_url):
"""Check for modified models in the repository."""
repo_dir = Path.cwd() / "repo"

@@ -102,7 +102,14 @@ def check_modified_models(pr_branch):

run_command("git checkout --detach")
run_command("git fetch origin main:main")
run_command(f"git fetch origin {pr_branch}:{pr_branch}")

fetch_command = f"git fetch origin {pr_branch}:{pr_branch}"
if pr_repo_url != KAITO_REPO_URL:
# Add the PR's repo as a new remote only if it's different from the main repo
run_command("git remote add pr_repo {}".format(pr_repo_url))
fetch_command = f"git fetch pr_repo {pr_branch}"

run_command(fetch_command)
run_command(f"git checkout {pr_branch}")

files = run_command("git diff --name-only origin/main") # Returns each file on newline
@@ -118,14 +125,15 @@
def main():
pr_branch = os.environ.get("PR_BRANCH", "main") # If not specified default to 'main'
force_run_all = os.environ.get("FORCE_RUN_ALL", "false") # If not specified default to False
pr_repo_url = os.environ.get("PR_REPO_URL", KAITO_REPO_URL)

affected_models = []
if force_run_all != "false":
affected_models = [model['name'] for model in YAML_PR['models']]
else:
# Logic to determine affected models
# Example: affected_models = ['model1', 'model2', 'model3']
affected_models = check_modified_models(pr_branch)
affected_models = check_modified_models(pr_branch, pr_repo_url)

# Convert the list of models into JSON matrix format
matrix = create_matrix(affected_models)
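For context on the remote handling above: a PR branch opened from a fork does not exist on origin, so the old `git fetch origin {pr_branch}:{pr_branch}` fails for fork PRs. With this change, a fork PR (hypothetical fork URL and branch name below) produces roughly this git sequence, while same-repo PRs keep the original fetch path:

git checkout --detach
git fetch origin main:main
git remote add pr_repo https://github.com/some-user/kaito.git  # PR_REPO_URL from the fork
git fetch pr_repo feature-branch
git checkout feature-branch
git diff --name-only origin/main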
2 changes: 2 additions & 0 deletions .github/workflows/kind-cluster/docker-job-template.yaml
@@ -43,6 +43,8 @@ spec:
--build-arg WEIGHTS_PATH=/weights \
--build-arg VERSION={{VERSION}} \
--build-arg MODEL_TYPE={{MODEL_TYPE}} \
--build-arg IMAGE_NAME={{IMAGE_NAME}} \
--build-arg MODEL_VERSION={{MODEL_VERSION}} \
-f $DOCKERFILE_PATH /
docker push $ACR_NAME.azurecr.io/{{IMAGE_NAME}}:$VERSION
env:
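To make the templating concrete: with the falcon-7b values from supported_models.yaml substituted for the {{...}} placeholders (registry and Dockerfile path left as the job's own variables), the rendered commands would look roughly like:

docker build \
  --build-arg WEIGHTS_PATH=/weights \
  --build-arg VERSION=0.0.5 \
  --build-arg MODEL_TYPE=text-generation \
  --build-arg IMAGE_NAME=falcon-7b \
  --build-arg MODEL_VERSION=https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36 \
  -f $DOCKERFILE_PATH /
docker push $ACR_NAME.azurecr.io/falcon-7b:0.0.5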
1 change: 1 addition & 0 deletions .github/workflows/preset-image-build.yml
@@ -55,6 +55,7 @@ jobs:
run: |
PR_BRANCH=${{ env.BRANCH_NAME }} \
FORCE_RUN_ALL=${{ env.FORCE_RUN_ALL }} \
PR_REPO_URL=${{ github.event.pull_request.head.repo.clone_url }} \
python3 .github/workflows/kind-cluster/determine_models.py

- name: Print Determined Models
10 changes: 8 additions & 2 deletions docker/presets/inference/llama-2/Dockerfile
@@ -3,6 +3,8 @@
# --build-arg WEIGHTS_PATH=/weights \
# --build-arg VERSION={{VERSION}} \
# --build-arg MODEL_TYPE={{MODEL_TYPE}} \
# --build-arg IMAGE_NAME={{IMAGE_NAME}} \
# --build-arg MODEL_VERSION={{MODEL_VERSION}} \

FROM python:3.8-slim
WORKDIR /workspace
@@ -26,8 +28,12 @@ RUN pip install 'uvicorn[standard]'
ARG WEIGHTS_PATH
ARG MODEL_TYPE
ARG VERSION
# Write the version to a file
RUN echo $VERSION > /workspace/llama/version.txt
ARG IMAGE_NAME
ARG MODEL_VERSION

# Write metadata to model_info.json file
RUN MODEL_VERSION_HASH="${MODEL_VERSION##*/}" && \
echo "{\"Model Type\": \"$MODEL_TYPE\", \"Version\": \"$VERSION\", \"Image Name\": \"$IMAGE_NAME\", \"Model Version URL\": \"$MODEL_VERSION\", \"REVISION_ID\": \"$MODEL_VERSION_HASH\"}" > /workspace/llama/model_info.json

ADD ${WEIGHTS_PATH} /workspace/llama/llama-2/weights
ADD kaito/presets/inference/${MODEL_TYPE} /workspace/llama/llama-2
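The `${MODEL_VERSION##*/}` expansion strips everything up to and including the last `/`, so when MODEL_VERSION is a Hugging Face commit URL the recorded REVISION_ID is just the commit SHA:

MODEL_VERSION=https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36
echo "${MODEL_VERSION##*/}"  # prints 898df1396f35e447d5fe44e0a3ccaaaa69f30d36

(The llama entries in supported_models.yaml carry no version URL, so for those images MODEL_VERSION, and hence the hash, may be empty.)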
7 changes: 5 additions & 2 deletions docker/presets/inference/tfs-onnx/Dockerfile
@@ -4,12 +4,15 @@ FROM mcr.microsoft.com/aifx/acpt/stable-ubuntu2004-cu118-py38-torch211
ARG WEIGHTS_PATH
ARG MODEL_TYPE
ARG VERSION
ARG IMAGE_NAME
ARG MODEL_VERSION

# Set the working directory
WORKDIR /workspace/tfs

# Write the version to a file
RUN echo $VERSION > /workspace/tfs/version.txt
# Write metadata to model_info.json file
RUN MODEL_VERSION_HASH="${MODEL_VERSION##*/}" && \
echo "{\"Model Type\": \"$MODEL_TYPE\", \"Version\": \"$VERSION\", \"Image Name\": \"$IMAGE_NAME\", \"Model Version URL\": \"$MODEL_VERSION\", \"REVISION_ID\": \"$MODEL_VERSION_HASH\"}" > /workspace/tfs/model_info.json

# First, copy just the requirements.txt file and install dependencies
# This is done before copying the code to utilize Docker's layer caching and
7 changes: 5 additions & 2 deletions docker/presets/inference/tfs/Dockerfile
@@ -3,12 +3,15 @@ FROM python:3.10-slim
ARG WEIGHTS_PATH
ARG MODEL_TYPE
ARG VERSION
ARG IMAGE_NAME
ARG MODEL_VERSION

# Set the working directory
WORKDIR /workspace/tfs

# Write the version to a file
RUN echo $VERSION > /workspace/tfs/version.txt
# Write metadata to model_info.json file
RUN MODEL_VERSION_HASH="${MODEL_VERSION##*/}" && \
echo "{\"Model Type\": \"$MODEL_TYPE\", \"Version\": \"$VERSION\", \"Image Name\": \"$IMAGE_NAME\", \"Model Version URL\": \"$MODEL_VERSION\", \"REVISION_ID\": \"$MODEL_VERSION_HASH\"}" > /workspace/tfs/model_info.json

# First, copy just the preset files and install dependencies
# This is done before copying the code to utilize Docker's layer caching and
11 changes: 11 additions & 0 deletions presets/inference/llama2-chat/inference_api.py
@@ -8,6 +8,7 @@
import signal
import sys
import threading
import json
from typing import Optional

import GPUtil
@@ -18,6 +19,9 @@
from llama import Llama
from pydantic import BaseModel

# Constants
MODEL_INFO = "model_info.json"

# Setup argparse
parser = argparse.ArgumentParser(description="Llama API server.")
parser.add_argument("--ckpt_dir", default="weights/", help="Checkpoint directory.")
@@ -191,6 +195,13 @@ def get_metrics():
except Exception as e:
return {"error": str(e)}

@app_main.get("/version")
def get_version():
with open(f"/workspace/llama/{MODEL_INFO}", "r") as f:
model_info = json.load(f)

return model_info

def setup_worker_routes():
@app_worker.get("/healthz")
def health_check():
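A minimal local check of the /version route added above, assuming FastAPI's bundled test client (which requires the httpx dependency) and a model_info.json present at the path used in the handler, might look like:

from fastapi.testclient import TestClient

client = TestClient(app_main)  # app_main is the main FastAPI app defined in this file
resp = client.get("/version")
assert resp.status_code == 200
print(resp.json())  # e.g. {"Model Type": "llama2-chat", "Version": "0.0.4", ...}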
11 changes: 11 additions & 0 deletions presets/inference/llama2-completion/inference_api.py
@@ -8,6 +8,7 @@
import signal
import sys
import threading
import json
from typing import Optional

import GPUtil
@@ -18,6 +19,9 @@
from llama import Llama
from pydantic import BaseModel

# Constants
MODEL_INFO = "model_info.json"

# Setup argparse
parser = argparse.ArgumentParser(description="Llama API server.")
parser.add_argument("--ckpt_dir", default="weights/", help="Checkpoint directory.")
@@ -180,6 +184,13 @@ def get_metrics():
except Exception as e:
return {"error": str(e)}

@app_main.get("/version")
def get_version():
with open(f"/workspace/tfs/{MODEL_INFO}", "r") as f:
model_info = json.load(f)

return model_info

def setup_worker_routes():
@app_worker.get("/healthz")
def health_check():
10 changes: 10 additions & 0 deletions presets/inference/text-generation/inference_api.py
@@ -2,6 +2,7 @@
# Licensed under the MIT license.
import os
from dataclasses import asdict, dataclass, field
import json
from typing import Annotated, Any, Dict, List, Optional

import GPUtil
@@ -15,6 +16,8 @@
from transformers import (AutoModelForCausalLM, AutoTokenizer,
GenerationConfig, HfArgumentParser)

# Constants
MODEL_INFO = "model_info.json"

@dataclass
class ModelConfig:
@@ -428,6 +431,13 @@ def get_metrics():
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))

@app.get("/version")
def get_version():
with open(f"/workspace/tfs/{MODEL_INFO}", "r") as f:
model_info = json.load(f)

return model_info

if __name__ == "__main__":
local_rank = int(os.environ.get("LOCAL_RANK", 0)) # Default to 0 if not set
port = 5000 + local_rank # Adjust port based on local rank
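One caveat worth noting: if model_info.json is absent (for example, in an image built before this change), the handler above raises an unhandled exception and FastAPI returns a bare 500. A slightly more defensive variant, sketched here rather than taken from the PR, would reuse the HTTPException pattern already used by /metrics in this file:

@app.get("/version")
def get_version():
    try:
        with open(f"/workspace/tfs/{MODEL_INFO}", "r") as f:
            return json.load(f)
    except FileNotFoundError:
        # Older images predate model_info.json
        raise HTTPException(status_code=404, detail="model_info.json not found")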
32 changes: 19 additions & 13 deletions presets/models/supported_models.yaml
@@ -3,28 +3,29 @@ models:
- name: llama-2-7b
type: llama2-completion
runtime: llama-2
tag: 0.0.3
tag: 0.0.4
- name: llama-2-7b-chat
type: llama2-chat
runtime: llama-2
tag: 0.0.3
tag: 0.0.4
- name: llama-2-13b
type: llama2-completion
runtime: llama-2
tag: 0.0.3
tag: 0.0.4
- name: llama-2-13b-chat
type: llama2-chat
runtime: llama-2
tag: 0.0.3
tag: 0.0.4
- name: llama-2-70b
type: llama2-completion
runtime: llama-2
tag: 0.0.3
tag: 0.0.4
- name: llama-2-70b-chat
type: llama2-chat
runtime: llama-2
tag: 0.0.3
tag: 0.0.4
# Tag history:
# 0.0.4 - Version endpoint (#297)
# 0.0.3 - Inference API Cleanup (#233)
# 0.0.2 - Eliminate Unnecessary Process Group Creation in Worker Initialization (#244)
# 0.0.1 - Initial Release
@@ -34,28 +35,31 @@
type: text-generation
version: https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36
runtime: tfs
tag: 0.0.4
tag: 0.0.5
- name: falcon-7b-instruct
type: text-generation
version: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99
runtime: tfs
tag: 0.0.4
tag: 0.0.5
# Tag history:
# 0.0.5 - Version endpoint (#297)
# 0.0.4 - Adjust default model params (#310)
# 0.0.3 - Update Default Params (#294)
# 0.0.2 - Inference API Cleanup (#233)
# 0.0.1 - Initial Release

- name: falcon-40b
type: text-generation
version: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146
runtime: tfs
tag: 0.0.5
tag: 0.0.6
- name: falcon-40b-instruct
type: text-generation
version: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f
runtime: tfs
tag: 0.0.5
tag: 0.0.6
# Tag history for 40b models:
# 0.0.6 - Version endpoint (#297)
# 0.0.5 - Adjust default model params (#310)
# 0.0.4 - Skipped due to incomplete upload issue
# 0.0.3 - Update Default Params (#294)
@@ -67,13 +71,14 @@
type: text-generation
version: https://huggingface.co/mistralai/Mistral-7B-v0.1/commit/26bca36bde8333b5d7f72e9ed20ccda6a618af24
runtime: tfs
tag: 0.0.4
tag: 0.0.5
- name: mistral-7b-instruct
type: text-generation
version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/commit/b70aa86578567ba3301b21c8a27bea4e8f6d6d61
runtime: tfs
tag: 0.0.4
tag: 0.0.5
# Tag history:
# 0.0.5 - Version endpoint (#297)
# 0.0.4 - Adjust default model params (#310)
# 0.0.3 - Update Default Params (#294)
# 0.0.2 - Inference API Cleanup (#233)
@@ -84,8 +89,9 @@
type: text-generation
version: https://huggingface.co/microsoft/phi-2/commit/b10c3eba545ad279e7208ee3a5d644566f001670
runtime: tfs
tag: 0.0.3
tag: 0.0.4
# Tag history:
# 0.0.4 - Version endpoint (#297)
# 0.0.3 - Adjust default model params (#310)
# 0.0.2 - Update Default Params (#294)
# 0.0.1 - Initial Release
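For reference, the commit URLs recorded under version double as pinning information: the trailing SHA is the REVISION_ID written into model_info.json, and it is the same value transformers accepts as a revision. A hypothetical loader pinning falcon-7b to the recorded commit:

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-7b",
    revision="898df1396f35e447d5fe44e0a3ccaaaa69f30d36",  # SHA from the version URL above
)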