Merge branch 'main' into rag-service
bangqipropel authored Dec 23, 2024
2 parents b635b2f + a994f4b commit 25a8acf
Showing 41 changed files with 724 additions and 559 deletions.
9 changes: 9 additions & 0 deletions .github/e2e-preset-configs.json
@@ -98,6 +98,15 @@
"OSS": true,
"loads_adapter": false
},
{
"name": "qwen2.5-coder-7b-instruct",
"workload": "qwen2-5-coder-7b-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC12s_v3",
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
},
{
"name": "llama-2-7b",
"node-count": 1,
54 changes: 31 additions & 23 deletions .github/workflows/e2e-preset-test.yml
@@ -170,6 +170,7 @@ jobs:
run: |
NAME_SUFFIX=${{ matrix.model.name }}
NAME_SUFFIX_WITHOUT_DASHES=${NAME_SUFFIX//-/} # Removing all '-' symbols
NAME_SUFFIX_WITHOUT_DASHES=${NAME_SUFFIX_WITHOUT_DASHES//./} # Removing all '.' symbols
if [ ${#NAME_SUFFIX_WITHOUT_DASHES} -gt 12 ]; then
TRUNCATED_NAME_SUFFIX=${NAME_SUFFIX_WITHOUT_DASHES:0:12}
@@ -213,16 +214,21 @@ jobs:
fi
fi
- name: Get testing workload
id: workload
run: |
WORKLOAD_NAME=${{ matrix.model.workload || matrix.model.name }}
echo "WORKLOAD_NAME=$WORKLOAD_NAME" >> $GITHUB_OUTPUT
echo "WORKLOAD_FILE_PREFIX=presets/workspace/test/manifests/$WORKLOAD_NAME/$WORKLOAD_NAME" >> $GITHUB_OUTPUT
- name: Create Service
run: kubectl apply -f presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}-service.yaml
run: |
kubectl apply -f ${{steps.workload.outputs.WORKLOAD_FILE_PREFIX}}-service.yaml
- name: Retrieve External Service IP
id: get_ip
run: |
while [[ -z $SERVICE_IP ]]; do
SERVICE_IP=$(kubectl get svc ${{ matrix.model.name }} -o=jsonpath='{.status.loadBalancer.ingress[0].ip}')
sleep 5
done
SERVICE_IP=$(kubectl get svc ${{steps.workload.outputs.WORKLOAD_NAME}} -o=jsonpath='{.spec.clusterIP}')
echo "Service IP is $SERVICE_IP"
echo "SERVICE_IP=$SERVICE_IP" >> $GITHUB_OUTPUT
@@ -235,36 +241,38 @@ jobs:
- name: Replace IP and Deploy Resource to K8s
run: |
POSTFIX=$(echo "${{ matrix.model.name }}" | grep -q "llama" && echo "" || echo "_${{ env.RUNTIME }}")
sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
kubectl apply -f presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
WORKLOAD_FILE=${{steps.workload.outputs.WORKLOAD_FILE_PREFIX}}$POSTFIX.yaml
sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" $WORKLOAD_FILE
sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" $WORKLOAD_FILE
sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" $WORKLOAD_FILE
kubectl apply -f $WORKLOAD_FILE
- name: Wait for Resource to be ready
run: |
kubectl rollout status ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} --timeout=1800s
kubectl rollout status ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} --timeout=1800s
- name: Check Adapter Loading from Logs
if: matrix.model.loads_adapter == true
run: |
POD_NAME=$(kubectl get pods -l app=${{ matrix.model.name }} -o jsonpath="{.items[0].metadata.name}")
POD_NAME=$(kubectl get pods -l app=${{steps.workload.outputs.WORKLOAD_NAME}} -o jsonpath="{.items[0].metadata.name}")
kubectl logs $POD_NAME | grep "Adapter added:" | grep "${{ matrix.model.expected_adapter }}" || (echo "Adapter not loaded or incorrect adapter loaded" && exit 1)
- name: Install testing commands
run: |
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- apt-get update
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- apt-get install -y curl
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- apt-get update
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- apt-get install -y curl
- name: Test healthz endpoint
run: |
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
curl -s http://localhost:5000/health
- name: Test inference endpoint
run: |
echo "Testing inference for ${{ matrix.model.name }}"
if [[ "${{ matrix.model.name }}" == *"llama"* && "${{ matrix.model.name }}" == *"-chat"* ]]; then
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
curl -s -X POST \
-H "Content-Type: application/json" \
-d '{
@@ -285,7 +293,7 @@ jobs:
}' \
http://localhost:5000/chat
elif [[ "${{ matrix.model.name }}" == *"llama"* ]]; then
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
curl -s -X POST \
-H "Content-Type: application/json" \
-d '{
@@ -301,7 +309,7 @@ jobs:
}' \
http://localhost:5000/generate
elif [[ "${{ env.RUNTIME }}" == *"vllm"* ]]; then
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
curl -s -X POST \
-H "accept: application/json" \
-H "Content-Type: application/json" \
@@ -320,7 +328,7 @@ jobs:
}' \
http://localhost:5000/v1/chat/completions
else
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
curl -s -X POST \
-H "accept: application/json" \
-H "Content-Type: application/json" \
@@ -367,15 +375,15 @@ jobs:
RESOURCE_TYPE=${{ steps.resource.outputs.RESOURCE_TYPE }}
# Check and Delete K8s Resource (Deployment or StatefulSet)
if kubectl get $RESOURCE_TYPE ${{ matrix.model.name }} > /dev/null 2>&1; then
kubectl logs $RESOURCE_TYPE/${{ matrix.model.name }}
kubectl delete $RESOURCE_TYPE ${{ matrix.model.name }}
if kubectl get $RESOURCE_TYPE ${{steps.workload.outputs.WORKLOAD_NAME}} > /dev/null 2>&1; then
kubectl logs $RESOURCE_TYPE/${{steps.workload.outputs.WORKLOAD_NAME}}
kubectl delete $RESOURCE_TYPE ${{steps.workload.outputs.WORKLOAD_NAME}}
fi
fi
# Check and Delete K8s Service if it exists
if kubectl get svc ${{ matrix.model.name }} > /dev/null 2>&1; then
kubectl delete svc ${{ matrix.model.name }}
if kubectl get svc ${{steps.workload.outputs.WORKLOAD_NAME}} > /dev/null 2>&1; then
kubectl delete svc ${{steps.workload.outputs.WORKLOAD_NAME}}
fi
# Check and Delete AKS Nodepool if it exists
35 changes: 18 additions & 17 deletions .github/workflows/kind-cluster/determine_models.py
@@ -22,9 +22,14 @@ def read_yaml(file_path):
# Format: {falcon-7b : {model_name:falcon-7b, type:text-generation, version: #, tag: #}}
MODELS = {model['name']: model for model in YAML_PR['models']}
KAITO_REPO_URL = "https://github.com/kaito-project/kaito.git"
GITREMOTE_TARGET = "_ciupstream"

def set_multiline_output(name, value):
with open(os.environ['GITHUB_OUTPUT'], 'a') as fh:
if not os.getenv('GITHUB_OUTPUT'):
print(f"Not in github env, skip writing to $GITHUB_OUTPUT .")
return

with open(os.getenv('GITHUB_OUTPUT'), 'a') as fh:
delimiter = uuid.uuid1()
print(f'{name}<<{delimiter}', file=fh)
print(value, file=fh)
@@ -51,9 +56,11 @@ def run_command(command):

def get_yaml_from_branch(branch, file_path):
"""Read YAML from a branch"""
subprocess.run(['git', 'fetch', 'origin', branch], check=True)
subprocess.run(['git', 'checkout', 'origin/' + branch], check=True)
return read_yaml(file_path)
subprocess.run(['git', 'fetch', GITREMOTE_TARGET, branch], check=True)
subprocess.run(['git', 'checkout', f"{GITREMOTE_TARGET}/" + branch], check=True)
content = read_yaml(file_path)
subprocess.run(['git', 'checkout', '-'], check=True)
return content

def detect_changes_in_yaml(yaml_main, yaml_pr):
"""Detecting relevant changes in support_models.yaml"""
@@ -90,33 +97,27 @@ def models_to_build(files_changed):
seen_model_types.add(model_info["type"])
return list(models)

def check_modified_models(pr_branch):
def check_modified_models():
"""Check for modified models in the repository."""
repo_dir = Path.cwd() / "repo"

if repo_dir.exists():
shutil.rmtree(repo_dir)

run_command(f"git clone {KAITO_REPO_URL} {repo_dir}")
os.chdir(repo_dir)

run_command("git checkout --detach")
run_command("git fetch origin main:main")
run_command(f"git fetch origin {pr_branch}:{pr_branch}")
run_command(f"git checkout {pr_branch}")
run_command(f"git remote add {GITREMOTE_TARGET} {KAITO_REPO_URL}")
run_command(f"git fetch {GITREMOTE_TARGET}")

files = run_command("git diff --name-only origin/main") # Returns each file on newline
files = run_command(f"git diff --name-only {GITREMOTE_TARGET}/main") # Returns each file on newline
files = files.split("\n")
os.chdir(Path.cwd().parent)
print("Files Changed: ", files)

modified_models = models_to_build(files)

print("Modified Models (Images to build): ", modified_models)

return modified_models

def main():
pr_branch = os.environ.get("PR_BRANCH", "main") # If not specified default to 'main'
force_run_all = os.environ.get("FORCE_RUN_ALL", "false") # If not specified default to False
force_run_all_phi = os.environ.get("FORCE_RUN_ALL_PHI", "false") # If not specified default to False
force_run_all_public = os.environ.get("FORCE_RUN_ALL_PUBLIC", "false") # If not specified default to False
@@ -131,7 +132,7 @@ def main():
else:
# Logic to determine affected models
# Example: affected_models = ['model1', 'model2', 'model3']
affected_models = check_modified_models(pr_branch)
affected_models = check_modified_models()

# Convert the list of models into JSON matrix format
matrix = create_matrix(affected_models)
19 changes: 19 additions & 0 deletions charts/kaito/workspace/templates/inference-params.yaml
@@ -0,0 +1,19 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: inference-params-template
namespace: {{ .Release.Namespace }}
data:
inference_config.yaml: |
# Maximum number of steps to find the max available seq len fitting in the GPU memory.
max_probe_steps: 6
vllm:
cpu-offload-gb: 0
gpu-memory-utilization: 0.95
swap-space: 4
# max-seq-len-to-capture: 8192
# num-scheduler-steps: 1
# enable-chunked-prefill: false
# see https://docs.vllm.ai/en/stable/models/engine_args.html for more options.
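
A quick way to sanity-check these rendered defaults after the chart is installed is to read the ConfigMap back (a sketch; substitute the namespace the chart was released into):
```
kubectl get configmap inference-params-template -n <release-namespace> -o yaml
```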
1 change: 1 addition & 0 deletions cmd/workspace/models.go
@@ -9,4 +9,5 @@ import (
_ "github.com/kaito-project/kaito/presets/workspace/models/mistral"
_ "github.com/kaito-project/kaito/presets/workspace/models/phi2"
_ "github.com/kaito-project/kaito/presets/workspace/models/phi3"
_ "github.com/kaito-project/kaito/presets/workspace/models/qwen"
)
83 changes: 83 additions & 0 deletions docs/inference/README.md
@@ -96,6 +96,89 @@ For detailed `InferenceSpec` API definitions, refer to the [documentation](https

The OpenAPI specification for the inference API is available at [vLLM API](../../presets/workspace/inference/vllm/api_spec.json), [transformers API](../../presets/workspace/inference/text-generation/api_spec.json).

#### vLLM inference API

vLLM supports OpenAI-compatible inference APIs. Check [here](https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html) for more details.
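
As a rough sketch (the service address, port, and model name below are placeholders, not values taken from this repository), an OpenAI-style chat completion request against the vLLM runtime looks like:
```
curl -X POST "http://<SERVICE>:80/v1/chat/completions" \
  -H "accept: application/json" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "<MODEL_NAME_HERE>",
    "messages": [
      {"role": "user", "content": "YOUR_PROMPT_HERE"}
    ],
    "max_tokens": 200,
    "temperature": 0.7
  }'
```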

#### Transformers inference API

The inference service endpoint is `/chat`.

**basic example**
```
curl -X POST "http://<SERVICE>:80/chat" -H "accept: application/json" -H "Content-Type: application/json" -d '{"prompt":"YOUR_PROMPT_HERE"}'
```
**example with full configurable parameters**
```
curl -X POST \
-H "accept: application/json" \
-H "Content-Type: application/json" \
-d '{
"prompt":"YOUR_PROMPT_HERE",
"return_full_text": false,
"clean_up_tokenization_spaces": false,
"prefix": null,
"handle_long_generation": null,
"generate_kwargs": {
"max_length":200,
"min_length":0,
"do_sample":true,
"early_stopping":false,
"num_beams":1,
"num_beam_groups":1,
"diversity_penalty":0.0,
"temperature":1.0,
"top_k":10,
"top_p":1,
"typical_p":1,
"repetition_penalty":1,
"length_penalty":1,
"no_repeat_ngram_size":0,
"encoder_no_repeat_ngram_size":0,
"bad_words_ids":null,
"num_return_sequences":1,
"output_scores":false,
"return_dict_in_generate":false,
"forced_bos_token_id":null,
"forced_eos_token_id":null,
"remove_invalid_values":null
}
}' \
"http://<SERVICE>:80/chat"
```
**parameters**
- `prompt`: The initial text provided by the user, from which the model will continue generating text.
- `return_full_text`: If False, only the generated text is returned; otherwise the prompt plus the generated text is returned.
- `clean_up_tokenization_spaces`: True/False, determines whether to remove potential extra spaces in the text output.
- `prefix`: Prefix added to the prompt.
- `handle_long_generation`: Provides strategies to address generations beyond the model's maximum length capacity.
- `max_length`: The maximum total number of tokens in the generated text.
- `min_length`: The minimum total number of tokens that should be generated.
- `do_sample`: If True, sampling methods will be used for text generation, which can introduce randomness and variation.
- `early_stopping`: If True, the generation will stop early if certain conditions are met, for example, when a satisfactory number of candidates have been found in beam search.
- `num_beams`: The number of beams to be used in beam search. More beams can lead to better results but are more computationally expensive.
- `num_beam_groups`: Divides the number of beams into groups to promote diversity in the generated results.
- `diversity_penalty`: Penalizes the score of tokens that make the current generation too similar to other groups, encouraging diverse outputs.
- `temperature`: Controls the randomness of the output by scaling the logits before sampling.
- `top_k`: Restricts sampling to the k most likely next tokens.
- `top_p`: Uses nucleus sampling to restrict the sampling pool to tokens comprising the top p probability mass.
- `typical_p`: Adjusts the probability distribution to favor tokens that are "typically" likely, given the context.
- `repetition_penalty`: Penalizes tokens that have been generated previously, aiming to reduce repetition.
- `length_penalty`: Modifies scores based on sequence length to encourage shorter or longer outputs.
- `no_repeat_ngram_size`: Prevents the generation of any n-gram more than once.
- `encoder_no_repeat_ngram_size`: Similar to `no_repeat_ngram_size` but applies to the encoder part of encoder-decoder models.
- `bad_words_ids`: A list of token ids that should not be generated.
- `num_return_sequences`: The number of different sequences to generate.
- `output_scores`: Whether to output the prediction scores.
- `return_dict_in_generate`: If True, the method will return a dictionary containing additional information.
- `pad_token_id`: The token ID used for padding sequences to the same length.
- `eos_token_id`: The token ID that signifies the end of a sequence.
- `forced_bos_token_id`: The token ID that is forcibly used as the beginning of a sequence token.
- `forced_eos_token_id`: The token ID that is forcibly used as the end of a sequence when max_length is reached.
- `remove_invalid_values`: If True, filters out invalid values like NaNs or infs from model outputs to prevent crashes.
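
For day-to-day use, most of the parameters above can be left at their defaults. A trimmed-down request that overrides only a few commonly tuned knobs might look like the sketch below (values are illustrative, not recommendations):
```
curl -X POST \
  -H "accept: application/json" \
  -H "Content-Type: application/json" \
  -d '{
    "prompt": "YOUR_PROMPT_HERE",
    "return_full_text": false,
    "generate_kwargs": {
      "max_length": 200,
      "do_sample": true,
      "temperature": 0.7,
      "top_p": 0.9
    }
  }' \
  "http://<SERVICE>:80/chat"
```
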
# Inference workload
Depending on whether the specified model supports distributed inference, the Kaito controller uses either a Kubernetes **apps.deployment** workload (the default) or a Kubernetes **apps.statefulset** workload (when the model supports distributed inference) to manage the inference service, which is exposed through a Cluster-IP Kubernetes `service`.
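
A quick way to see which workload type the controller created for a given workspace is to query both kinds by name (a sketch; it assumes the workload and service share the workspace's name, which may not hold for every preset):
```
kubectl get deployment <workspace-name> || kubectl get statefulset <workspace-name>
kubectl get svc <workspace-name> -o jsonpath='{.spec.clusterIP}'
```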
12 changes: 12 additions & 0 deletions examples/inference/kaito_workspace_qwen_2.5_coder_7b-instruct.yaml
@@ -0,0 +1,12 @@
apiVersion: kaito.sh/v1alpha1
kind: Workspace
metadata:
name: workspace-qwen-2.5-coder-7b-instruct
resource:
instanceType: "Standard_NC24ads_A100_v4"
labelSelector:
matchLabels:
apps: qwen-2.5-coder
inference:
preset:
name: qwen2.5-coder-7b-instruct
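
As a sketch of how this new example might be exercised (assuming the `workspace` short name is registered by the Kaito CRDs in your cluster):
```
kubectl apply -f examples/inference/kaito_workspace_qwen_2.5_coder_7b-instruct.yaml
kubectl get workspace workspace-qwen-2.5-coder-7b-instruct -w
```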
16 changes: 8 additions & 8 deletions go.mod
@@ -9,7 +9,7 @@ require (
github.com/aws/karpenter-core v0.29.2
github.com/aws/karpenter-provider-aws v0.36.2
github.com/go-logr/logr v1.4.2
github.com/onsi/ginkgo/v2 v2.20.2
github.com/onsi/ginkgo/v2 v2.22.0
github.com/onsi/gomega v1.34.2
github.com/samber/lo v1.47.0
github.com/stretchr/testify v1.9.0
@@ -55,7 +55,7 @@ require (
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/go-cmp v0.6.0 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/pprof v0.0.0-20240827171923-fa2c70bbbfe5 // indirect
github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect
github.com/hashicorp/golang-lru v1.0.2 // indirect
@@ -87,14 +87,14 @@ require (
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.27.0 // indirect
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect
golang.org/x/net v0.28.0 // indirect
golang.org/x/net v0.33.0 // indirect
golang.org/x/oauth2 v0.21.0 // indirect
golang.org/x/sync v0.8.0 // indirect
golang.org/x/sys v0.25.0 // indirect
golang.org/x/term v0.24.0 // indirect
golang.org/x/text v0.18.0 // indirect
golang.org/x/sync v0.10.0 // indirect
golang.org/x/sys v0.28.0 // indirect
golang.org/x/term v0.27.0 // indirect
golang.org/x/text v0.21.0 // indirect
golang.org/x/time v0.5.0 // indirect
golang.org/x/tools v0.24.0 // indirect
golang.org/x/tools v0.26.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
google.golang.org/api v0.180.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157 // indirect