Merge branch 'main' into rag-service
bangqipropel authored Dec 23, 2024
2 parents b635b2f + a994f4b commit 25a8acf
Showing 41 changed files with 724 additions and 559 deletions.
9 changes: 9 additions & 0 deletions .github/e2e-preset-configs.json
@@ -98,6 +98,15 @@
"OSS": true,
"loads_adapter": false
},
{
"name": "qwen2.5-coder-7b-instruct",
"workload": "qwen2-5-coder-7b-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC12s_v3",
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
},
{
"name": "llama-2-7b",
"node-count": 1,
54 changes: 31 additions & 23 deletions .github/workflows/e2e-preset-test.yml
@@ -170,6 +170,7 @@ jobs:
run: |
NAME_SUFFIX=${{ matrix.model.name }}
NAME_SUFFIX_WITHOUT_DASHES=${NAME_SUFFIX//-/} # Removing all '-' symbols
NAME_SUFFIX_WITHOUT_DASHES=${NAME_SUFFIX_WITHOUT_DASHES//./} # Removing all '.' symbols
if [ ${#NAME_SUFFIX_WITHOUT_DASHES} -gt 12 ]; then
TRUNCATED_NAME_SUFFIX=${NAME_SUFFIX_WITHOUT_DASHES:0:12}
@@ -213,16 +214,21 @@ jobs:
fi
fi
- name: Get testing workload
id: workload
run: |
WORKLOAD_NAME=${{ matrix.model.workload || matrix.model.name }}
echo "WORKLOAD_NAME=$WORKLOAD_NAME" >> $GITHUB_OUTPUT
echo "WORKLOAD_FILE_PREFIX=presets/workspace/test/manifests/$WORKLOAD_NAME/$WORKLOAD_NAME" >> $GITHUB_OUTPUT
- name: Create Service
run: kubectl apply -f presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}-service.yaml
run: |
kubectl apply -f ${{steps.workload.outputs.WORKLOAD_FILE_PREFIX}}-service.yaml
- name: Retrieve External Service IP
id: get_ip
run: |
while [[ -z $SERVICE_IP ]]; do
SERVICE_IP=$(kubectl get svc ${{ matrix.model.name }} -o=jsonpath='{.status.loadBalancer.ingress[0].ip}')
sleep 5
done
SERVICE_IP=$(kubectl get svc ${{steps.workload.outputs.WORKLOAD_NAME}} -o=jsonpath='{.spec.clusterIP}')
echo "Service IP is $SERVICE_IP"
echo "SERVICE_IP=$SERVICE_IP" >> $GITHUB_OUTPUT
@@ -235,36 +241,38 @@ jobs:
- name: Replace IP and Deploy Resource to K8s
run: |
POSTFIX=$(echo "${{ matrix.model.name }}" | grep -q "llama" && echo "" || echo "_${{ env.RUNTIME }}")
sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
kubectl apply -f presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
WORKLOAD_FILE=${{steps.workload.outputs.WORKLOAD_FILE_PREFIX}}$POSTFIX.yaml
sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" $WORKLOAD_FILE
sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" $WORKLOAD_FILE
sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" $WORKLOAD_FILE
kubectl apply -f $WORKLOAD_FILE
- name: Wait for Resource to be ready
run: |
kubectl rollout status ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} --timeout=1800s
kubectl rollout status ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} --timeout=1800s
- name: Check Adapter Loading from Logs
if: matrix.model.loads_adapter == true
run: |
POD_NAME=$(kubectl get pods -l app=${{ matrix.model.name }} -o jsonpath="{.items[0].metadata.name}")
POD_NAME=$(kubectl get pods -l app=${{steps.workload.outputs.WORKLOAD_NAME}} -o jsonpath="{.items[0].metadata.name}")
kubectl logs $POD_NAME | grep "Adapter added:" | grep "${{ matrix.model.expected_adapter }}" || (echo "Adapter not loaded or incorrect adapter loaded" && exit 1)
- name: Install testing commands
run: |
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- apt-get update
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- apt-get install -y curl
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- apt-get update
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- apt-get install -y curl
- name: Test healthz endpoint
run: |
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
curl -s http://localhost:5000/health
- name: Test inference endpoint
run: |
echo "Testing inference for ${{ matrix.model.name }}"
if [[ "${{ matrix.model.name }}" == *"llama"* && "${{ matrix.model.name }}" == *"-chat"* ]]; then
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
curl -s -X POST \
-H "Content-Type: application/json" \
-d '{
@@ -285,7 +293,7 @@ jobs:
}' \
http://localhost:5000/chat
elif [[ "${{ matrix.model.name }}" == *"llama"* ]]; then
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
curl -s -X POST \
-H "Content-Type: application/json" \
-d '{
@@ -301,7 +309,7 @@ jobs:
}' \
http://localhost:5000/generate
elif [[ "${{ env.RUNTIME }}" == *"vllm"* ]]; then
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
curl -s -X POST \
-H "accept: application/json" \
-H "Content-Type: application/json" \
@@ -320,7 +328,7 @@ jobs:
}' \
http://localhost:5000/v1/chat/completions
else
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
curl -s -X POST \
-H "accept: application/json" \
-H "Content-Type: application/json" \
@@ -367,15 +375,15 @@ jobs:
RESOURCE_TYPE=${{ steps.resource.outputs.RESOURCE_TYPE }}
# Check and Delete K8s Resource (Deployment or StatefulSet)
if kubectl get $RESOURCE_TYPE ${{ matrix.model.name }} > /dev/null 2>&1; then
kubectl logs $RESOURCE_TYPE/${{ matrix.model.name }}
kubectl delete $RESOURCE_TYPE ${{ matrix.model.name }}
if kubectl get $RESOURCE_TYPE ${{steps.workload.outputs.WORKLOAD_NAME}} > /dev/null 2>&1; then
kubectl logs $RESOURCE_TYPE/${{steps.workload.outputs.WORKLOAD_NAME}}
kubectl delete $RESOURCE_TYPE ${{steps.workload.outputs.WORKLOAD_NAME}}
fi
fi
# Check and Delete K8s Service if it exists
if kubectl get svc ${{ matrix.model.name }} > /dev/null 2>&1; then
kubectl delete svc ${{ matrix.model.name }}
if kubectl get svc ${{steps.workload.outputs.WORKLOAD_NAME}} > /dev/null 2>&1; then
kubectl delete svc ${{steps.workload.outputs.WORKLOAD_NAME}}
fi
# Check and Delete AKS Nodepool if it exists
35 changes: 18 additions & 17 deletions .github/workflows/kind-cluster/determine_models.py
@@ -22,9 +22,14 @@ def read_yaml(file_path):
# Format: {falcon-7b : {model_name:falcon-7b, type:text-generation, version: #, tag: #}}
MODELS = {model['name']: model for model in YAML_PR['models']}
KAITO_REPO_URL = "https://github.com/kaito-project/kaito.git"
GITREMOTE_TARGET = "_ciupstream"

def set_multiline_output(name, value):
with open(os.environ['GITHUB_OUTPUT'], 'a') as fh:
if not os.getenv('GITHUB_OUTPUT'):
print(f"Not in github env, skip writing to $GITHUB_OUTPUT .")
return

with open(os.getenv('GITHUB_OUTPUT'), 'a') as fh:
delimiter = uuid.uuid1()
print(f'{name}<<{delimiter}', file=fh)
print(value, file=fh)
@@ -51,9 +56,11 @@ def run_command(command):

def get_yaml_from_branch(branch, file_path):
"""Read YAML from a branch"""
subprocess.run(['git', 'fetch', 'origin', branch], check=True)
subprocess.run(['git', 'checkout', 'origin/' + branch], check=True)
return read_yaml(file_path)
subprocess.run(['git', 'fetch', GITREMOTE_TARGET, branch], check=True)
subprocess.run(['git', 'checkout', f"{GITREMOTE_TARGET}/" + branch], check=True)
content = read_yaml(file_path)
subprocess.run(['git', 'checkout', '-'], check=True)
return content

def detect_changes_in_yaml(yaml_main, yaml_pr):
"""Detecting relevant changes in support_models.yaml"""
@@ -90,33 +97,27 @@ def models_to_build(files_changed):
seen_model_types.add(model_info["type"])
return list(models)

def check_modified_models(pr_branch):
def check_modified_models():
"""Check for modified models in the repository."""
repo_dir = Path.cwd() / "repo"

if repo_dir.exists():
shutil.rmtree(repo_dir)

run_command(f"git clone {KAITO_REPO_URL} {repo_dir}")
os.chdir(repo_dir)

run_command("git checkout --detach")
run_command("git fetch origin main:main")
run_command(f"git fetch origin {pr_branch}:{pr_branch}")
run_command(f"git checkout {pr_branch}")
run_command(f"git remote add {GITREMOTE_TARGET} {KAITO_REPO_URL}")
run_command(f"git fetch {GITREMOTE_TARGET}")

files = run_command("git diff --name-only origin/main") # Returns each file on newline
files = run_command(f"git diff --name-only {GITREMOTE_TARGET}/main") # Returns each file on newline
files = files.split("\n")
os.chdir(Path.cwd().parent)
print("Files Changed: ", files)

modified_models = models_to_build(files)

print("Modified Models (Images to build): ", modified_models)

return modified_models

def main():
pr_branch = os.environ.get("PR_BRANCH", "main") # If not specified default to 'main'
force_run_all = os.environ.get("FORCE_RUN_ALL", "false") # If not specified default to False
force_run_all_phi = os.environ.get("FORCE_RUN_ALL_PHI", "false") # If not specified default to False
force_run_all_public = os.environ.get("FORCE_RUN_ALL_PUBLIC", "false") # If not specified default to False
@@ -131,7 +132,7 @@ def main():
else:
# Logic to determine affected models
# Example: affected_models = ['model1', 'model2', 'model3']
affected_models = check_modified_models(pr_branch)
affected_models = check_modified_models()

# Convert the list of models into JSON matrix format
matrix = create_matrix(affected_models)
19 changes: 19 additions & 0 deletions charts/kaito/workspace/templates/inference-params.yaml
@@ -0,0 +1,19 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: inference-params-template
namespace: {{ .Release.Namespace }}
data:
inference_config.yaml: |
# Maximum number of steps to find the max available seq len fitting in the GPU memory.
max_probe_steps: 6
vllm:
cpu-offload-gb: 0
gpu-memory-utilization: 0.95
swap-space: 4
# max-seq-len-to-capture: 8192
# num-scheduler-steps: 1
# enable-chunked-prefill: false
# see https://docs.vllm.ai/en/stable/models/engine_args.html for more options.
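
A quick way to sanity-check these rendered defaults after the chart is installed is to read the ConfigMap back (a sketch; substitute the namespace the chart was released into):
```
kubectl get configmap inference-params-template -n <release-namespace> -o yaml
```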
1 change: 1 addition & 0 deletions cmd/workspace/models.go
@@ -9,4 +9,5 @@ import (
_ "github.com/kaito-project/kaito/presets/workspace/models/mistral"
_ "github.com/kaito-project/kaito/presets/workspace/models/phi2"
_ "github.com/kaito-project/kaito/presets/workspace/models/phi3"
_ "github.com/kaito-project/kaito/presets/workspace/models/qwen"
)
83 changes: 83 additions & 0 deletions docs/inference/README.md
@@ -96,6 +96,89 @@ For detailed `InferenceSpec` API definitions, refer to the [documentation](https

The OpenAPI specification for the inference API is available at [vLLM API](../../presets/workspace/inference/vllm/api_spec.json), [transformers API](../../presets/workspace/inference/text-generation/api_spec.json).

#### vLLM inference API

vLLM supports OpenAI-compatible inference APIs. Check [here](https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html) for more details.
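
As a rough sketch (the service address, port, and model name below are placeholders, not values taken from this repository), an OpenAI-style chat completion request against the vLLM runtime looks like:
```
curl -X POST "http://<SERVICE>:80/v1/chat/completions" \
  -H "accept: application/json" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "<MODEL_NAME_HERE>",
    "messages": [
      {"role": "user", "content": "YOUR_PROMPT_HERE"}
    ],
    "max_tokens": 200,
    "temperature": 0.7
  }'
```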

#### Transformers inference API

The inference service endpoint is `/chat`.

**basic example**
```
curl -X POST "http://<SERVICE>:80/chat" -H "accept: application/json" -H "Content-Type: application/json" -d '{"prompt":"YOUR_PROMPT_HERE"}'
```
**example with full configurable parameters**
```
curl -X POST \
-H "accept: application/json" \
-H "Content-Type: application/json" \
-d '{
"prompt":"YOUR_PROMPT_HERE",
"return_full_text": false,
"clean_up_tokenization_spaces": false,
"prefix": null,
"handle_long_generation": null,
"generate_kwargs": {
"max_length":200,
"min_length":0,
"do_sample":true,
"early_stopping":false,
"num_beams":1,
"num_beam_groups":1,
"diversity_penalty":0.0,
"temperature":1.0,
"top_k":10,
"top_p":1,
"typical_p":1,
"repetition_penalty":1,
"length_penalty":1,
"no_repeat_ngram_size":0,
"encoder_no_repeat_ngram_size":0,
"bad_words_ids":null,
"num_return_sequences":1,
"output_scores":false,
"return_dict_in_generate":false,
"forced_bos_token_id":null,
"forced_eos_token_id":null,
"remove_invalid_values":null
}
}' \
"http://<SERVICE>:80/chat"
```
**parameters**
- `prompt`: The initial text provided by the user, from which the model will continue generating text.
- `return_full_text`: If False, only the generated text is returned; otherwise the prompt plus the generated text is returned.
- `clean_up_tokenization_spaces`: True/False, determines whether to remove potential extra spaces in the text output.
- `prefix`: Prefix added to the prompt.
- `handle_long_generation`: Provides strategies to address generations beyond the model's maximum length capacity.
- `max_length`: The maximum total number of tokens in the generated text.
- `min_length`: The minimum total number of tokens that should be generated.
- `do_sample`: If True, sampling methods will be used for text generation, which can introduce randomness and variation.
- `early_stopping`: If True, the generation will stop early if certain conditions are met, for example, when a satisfactory number of candidates have been found in beam search.
- `num_beams`: The number of beams to be used in beam search. More beams can lead to better results but are more computationally expensive.
- `num_beam_groups`: Divides the number of beams into groups to promote diversity in the generated results.
- `diversity_penalty`: Penalizes the score of tokens that make the current generation too similar to other groups, encouraging diverse outputs.
- `temperature`: Controls the randomness of the output by scaling the logits before sampling.
- `top_k`: Restricts sampling to the k most likely next tokens.
- `top_p`: Uses nucleus sampling to restrict the sampling pool to tokens comprising the top p probability mass.
- `typical_p`: Adjusts the probability distribution to favor tokens that are "typically" likely, given the context.
- `repetition_penalty`: Penalizes tokens that have been generated previously, aiming to reduce repetition.
- `length_penalty`: Modifies scores based on sequence length to encourage shorter or longer outputs.
- `no_repeat_ngram_size`: Prevents the generation of any n-gram more than once.
- `encoder_no_repeat_ngram_size`: Similar to `no_repeat_ngram_size` but applies to the encoder part of encoder-decoder models.
- `bad_words_ids`: A list of token ids that should not be generated.
- `num_return_sequences`: The number of different sequences to generate.
- `output_scores`: Whether to output the prediction scores.
- `return_dict_in_generate`: If True, the method will return a dictionary containing additional information.
- `pad_token_id`: The token ID used for padding sequences to the same length.
- `eos_token_id`: The token ID that signifies the end of a sequence.
- `forced_bos_token_id`: The token ID that is forcibly used as the beginning of a sequence token.
- `forced_eos_token_id`: The token ID that is forcibly used as the end of a sequence when max_length is reached.
- `remove_invalid_values`: If True, filters out invalid values like NaNs or infs from model outputs to prevent crashes.
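
For day-to-day use, most of the parameters above can be left at their defaults. A trimmed-down request that overrides only a few commonly tuned knobs might look like the sketch below (values are illustrative, not recommendations):
```
curl -X POST \
  -H "accept: application/json" \
  -H "Content-Type: application/json" \
  -d '{
    "prompt": "YOUR_PROMPT_HERE",
    "return_full_text": false,
    "generate_kwargs": {
      "max_length": 200,
      "do_sample": true,
      "temperature": 0.7,
      "top_p": 0.9
    }
  }' \
  "http://<SERVICE>:80/chat"
```
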
# Inference workload
Depending on whether the specified model supports distributed inference, the Kaito controller uses either a Kubernetes **apps.deployment** workload (the default) or a Kubernetes **apps.statefulset** workload (when the model supports distributed inference) to manage the inference service, which is exposed through a Cluster-IP Kubernetes `service`.
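
A quick way to see which workload type the controller created for a given workspace is to query both kinds by name (a sketch; it assumes the workload and service share the workspace's name, which may not hold for every preset):
```
kubectl get deployment <workspace-name> || kubectl get statefulset <workspace-name>
kubectl get svc <workspace-name> -o jsonpath='{.spec.clusterIP}'
```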
12 changes: 12 additions & 0 deletions examples/inference/kaito_workspace_qwen_2.5_coder_7b-instruct.yaml
@@ -0,0 +1,12 @@
apiVersion: kaito.sh/v1alpha1
kind: Workspace
metadata:
name: workspace-qwen-2.5-coder-7b-instruct
resource:
instanceType: "Standard_NC24ads_A100_v4"
labelSelector:
matchLabels:
apps: qwen-2.5-coder
inference:
preset:
name: qwen2.5-coder-7b-instruct
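
As a sketch of how this new example might be exercised (assuming the `workspace` short name is registered by the Kaito CRDs in your cluster):
```
kubectl apply -f examples/inference/kaito_workspace_qwen_2.5_coder_7b-instruct.yaml
kubectl get workspace workspace-qwen-2.5-coder-7b-instruct -w
```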
16 changes: 8 additions & 8 deletions go.mod
@@ -9,7 +9,7 @@ require (
github.com/aws/karpenter-core v0.29.2
github.com/aws/karpenter-provider-aws v0.36.2
github.com/go-logr/logr v1.4.2
github.com/onsi/ginkgo/v2 v2.20.2
github.com/onsi/ginkgo/v2 v2.22.0
github.com/onsi/gomega v1.34.2
github.com/samber/lo v1.47.0
github.com/stretchr/testify v1.9.0
@@ -55,7 +55,7 @@ require (
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/go-cmp v0.6.0 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/pprof v0.0.0-20240827171923-fa2c70bbbfe5 // indirect
github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect
github.com/hashicorp/golang-lru v1.0.2 // indirect
@@ -87,14 +87,14 @@ require (
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.27.0 // indirect
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect
golang.org/x/net v0.28.0 // indirect
golang.org/x/net v0.33.0 // indirect
golang.org/x/oauth2 v0.21.0 // indirect
golang.org/x/sync v0.8.0 // indirect
golang.org/x/sys v0.25.0 // indirect
golang.org/x/term v0.24.0 // indirect
golang.org/x/text v0.18.0 // indirect
golang.org/x/sync v0.10.0 // indirect
golang.org/x/sys v0.28.0 // indirect
golang.org/x/term v0.27.0 // indirect
golang.org/x/text v0.21.0 // indirect
golang.org/x/time v0.5.0 // indirect
golang.org/x/tools v0.24.0 // indirect
golang.org/x/tools v0.26.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
google.golang.org/api v0.180.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157 // indirect