diff --git a/.github/actions/server-testing/action.yml b/.github/actions/server-testing/action.yml index e5a6dcf..fdb2b47 100644 --- a/.github/actions/server-testing/action.yml +++ b/.github/actions/server-testing/action.yml @@ -15,6 +15,10 @@ inputs: required: false default: "" description: "Location of the OGA for RyzenAI NPU install directory on disk" + amd_oga_hybrid: + required: false + default: "" + description: "Location of the OGA for RyzenAI Hybrid install directory on disk" hf_token: required: false default: "" diff --git a/docs/ort_genai_hybrid.md b/docs/ort_genai_hybrid.md new file mode 100644 index 0000000..5919d2c --- /dev/null +++ b/docs/ort_genai_hybrid.md @@ -0,0 +1,109 @@ +# Introduction + +[onnxruntime-genai (aka OGA)](https://github.com/microsoft/onnxruntime-genai/tree/main?tab=readme-ov-file) is a new framework created by Microsoft for running ONNX LLMs. + +## Hybrid instructions + +### Warnings + + - The OGA wheels need to be installed in a specific order or you will end up with the wrong packages in your environment. If you see pip dependency errors, please delete your conda env and start over with a fresh environment. + +### Requirements + - [NPU Drivers (version .237)](https://ryzenai.docs.amd.com/en/latest/inst.html#install-npu-drivers) + - [Hybrid LLM artifacts package](https://github.com/aigdat/ryzenai-sw-ea/blob/main/ryzen_ai_13_ga/hybrid-llm-artifacts_1.3.0.zip) + +### Installation + +1. NOTE: ⚠️ DO THESE STEPS IN EXACTLY THIS ORDER ⚠️ +1. Install `lemonade`: + 1. Create a conda environment: `conda create -n oga-hybrid python=3.10` (Python 3.10 is required) + 1. Activate: `conda activate oga-hybrid` + 1. `cd REPO_ROOT` + 1. `pip install -e .[llm-oga-hybrid]` +1. Download required OGA packages + 1. 
Access the [Hybrid LLM artifacts package](https://account.amd.com/en/member/ryzenai-sw-ea.html#tabs-a5e122f973-item-4757898120-tab) and download `hybrid-llm-artifacts_1.3.0.zip` and `onnxruntime_vitisai-1.19.0.dev20241217-cp310-cp310-win_amd64.whl`. + 1. Copy the `onnxruntime_vitisai-1.19.0.dev20241217-cp310-cp310-win_amd64.whl` file to the `hybrid-llm-artifacts_1.3.0\hybrid-llm-artifacts\onnxruntime_genai\wheel` folder. + 1. Unzip `hybrid-llm-artifacts_1.3.0.zip` + 1. Create the system environment variable `AMD_OGA_HYBRID` and set it to the path of the `hybrid-llm-artifacts_1.3.0` folder. + 1. Restart your terminal +1. Install the wheels: + 1. `cd hybrid-llm-artifacts_1.3.0\hybrid-llm-artifacts\onnxruntime_genai\wheel` + 1. `pip install onnxruntime_genai_directml-0.4.0.dev0-cp310-cp310-win_amd64.whl` + 1. `pip install onnxruntime_vitisai-1.19.0.dev20241217-cp310-cp310-win_amd64.whl` +1. Install driver + 1. Download NPU driver from [NPU Drivers (version .237)](https://ryzenai.docs.amd.com/en/latest/inst.html#install-npu-drivers) + 1. Unzip `NPU_RAI1.3.zip` + 1. Right click `kipudrv.inf` and select `Install` + 1. Check under `Device Manager` to ensure that `NPU Compute Accelerator` is using version `32.0.203.237`. + +### Runtime + +To test basic functionality, point lemonade to any of the models under [quark-quantized-onnx-hybrid-llms-for-ryzen-ai-1.3](https://huggingface.co/collections/amd/quark-awq-g128-int4-asym-fp16-onnx-hybrid-13-674b307d2ffa21dd68fa41d5): + +``` +lemonade -i amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid oga-load --device hybrid --dtype int4 llm-prompt -p "hello whats your name?" 
--max-new-tokens 15 +``` + +``` +Building "amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid" + ✓ Loading OnnxRuntime-GenAI model + ✓ Prompting LLM + +amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid: + (executed 1x) + Build dir: C:\Users\ramkr\.cache\lemonade\amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid + Status: Successful build! + Dtype: int4 + Device: hybrid + Response: hello whats your name? i'm a robot, and i'm here to help you with any questions + + + +Woohoo! Saved to ~\.cache\lemonade\amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid +``` + +To test/use the websocket server: + +``` +lemonade -i amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid oga-load --device hybrid --dtype int4 serve --max-new-tokens 50 +``` + +Then open the address (http://localhost:8000) in a browser and chat with it. + +``` +Building "amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid" + ✓ Loading OnnxRuntime-GenAI model + Launching LLM Server + +INFO: Started server process [8704] +INFO: Waiting for application startup. +INFO: Application startup complete. +INFO: Uvicorn running on http://localhost:8000 (Press CTRL+C to quit) +INFO: ::1:57038 - "GET / HTTP/1.1" 200 OK +INFO: ('::1', 57042) - "WebSocket /ws" [accepted] +INFO: connection open +``` + +To run a single MMLU test: + +``` +lemonade -i amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid oga-load --device hybrid --dtype int4 accuracy-mmlu --tests management +``` + +``` +Building "amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid" + ✓ Loading OnnxRuntime-GenAI model + ✓ Measuring accuracy with MMLU + +amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid: + (executed 1x) + Build dir: C:\Users\ramkr\.cache\lemonade\amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid + Status: Successful build! + Dtype: int4 + Device: hybrid + Mmlu Management Accuracy: 49.515 % + + + +Woohoo! 
Saved to ~\.cache\lemonade\amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid +``` diff --git a/docs/ort_genai_igpu.md b/docs/ort_genai_igpu.md index 0514d69..affb8c8 100644 --- a/docs/ort_genai_igpu.md +++ b/docs/ort_genai_igpu.md @@ -1,6 +1,6 @@ # OnnxRuntime GenAI (OGA) for iGPU and CPU -onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running ONNX LLMs: https://github.com/microsoft/onnxruntime-genai/tree/main?tab=readme-ov-file +[onnxruntime-genai (aka OGA)](https://github.com/microsoft/onnxruntime-genai/tree/main?tab=readme-ov-file) is a new framework created by Microsoft for running ONNX LLMs ## Installation diff --git a/docs/ort_genai_npu.md b/docs/ort_genai_npu.md index 70c322e..a4e1c8d 100644 --- a/docs/ort_genai_npu.md +++ b/docs/ort_genai_npu.md @@ -1,6 +1,6 @@ # Introduction -onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running ONNX LLMs: https://github.com/microsoft/onnxruntime-genai/tree/main?tab=readme-ov-file +[onnxruntime-genai (aka OGA)](https://github.com/microsoft/onnxruntime-genai/tree/main?tab=readme-ov-file) is a new framework created by Microsoft for running ONNX LLMs ## NPU instructions @@ -15,10 +15,10 @@ onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running 1. Create a conda environment: `conda create -n oga-npu python=3.10` (Python 3.10 is required) 1. Activate: `conda activate oga-npu` 1. `cd REPO_ROOT` - 1. `pip install -e .[oga-npu]` + 1. `pip install -e .[llm-oga-npu]` 1. Download required OGA packages - 1. Access the [AMD RyzenAI EA Lounge](https://account.amd.com/en/member/ryzenai-sw-ea.html#tabs-a5e122f973-item-4757898120-tab) and download `amd_oga_Oct4_2024.zip` from `Ryzen AI 1.3 EA Release`. - 1. Unzip `amd_oga_Oct4_2024.zip` + 1. Access the [AMD RyzenAI EA Lounge](https://account.amd.com/en/member/ryzenai-sw-ea.html#tabs-a5e122f973-item-4757898120-tab) and download `npu-llm-artifacts_1.3.0.zip` from `Ryzen AI 1.3 Model Release`. + 1. 
Unzip `npu-llm-artifacts_1.3.0.zip` 1. Setup your folder structure: 1. Copy the `amd_oga` folder from the above zip file, if desired 1. Create the system environment variable `AMD_OGA` and set it to the path to the `amd_oga` folder @@ -28,79 +28,80 @@ onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running 1. `pip install onnxruntime_vitisai-1.20.0-cp310-cp310-win_amd64.whl` 1. `pip install voe-1.2.0-cp310-cp310-win_amd64.whl` 1. Install driver - 1. Access the [AMD RyzenAI EA Lounge](https://account.amd.com/en/member/ryzenai-sw-ea.html#tabs-a5e122f973-item-4757898120-tab) and download `Win24AIDriver.zip` from `Ryzen AI 1.3 Preview Release`. - 1. Unzip `Win24AIDriver.zip` + 1. Download NPU driver from [NPU Drivers (version .237)](https://ryzenai.docs.amd.com/en/latest/inst.html#install-npu-drivers) + 1. Unzip `NPU_RAI1.3.zip` 1. Right click `kipudrv.inf` and select `Install` - 1. Check under `Device Manager` to ensure that `NPU Compute Accelerator` is using version `32.0.203.219`. + 1. Check under `Device Manager` to ensure that `NPU Compute Accelerator` is using version `32.0.203.237`. ### Runtime -To test basic functionality, point lemonade to any of the models under [quark-quantized-onnx-llms-for-ryzen-ai-1.3-ea](https://huggingface.co/collections/amd/quark-quantized-onnx-llms-for-ryzen-ai-13-ea-66fc8e24927ec45504381902): +To test basic functionality, point lemonade to any of the models under [quark_awq_g128_int4_asym_bf16_onnx_npu 1.3](https://huggingface.co/collections/amd/quark-awq-g128-int4-asym-bf16-onnx-npu-13-6759f510b8132db53e044aaf) ``` -lemonade -i amd/Llama-2-7b-hf-awq-g128-int4-asym-fp32-onnx-ryzen-strix oga-load --device npu --dtype int4 llm-prompt -p "hello whats your name?" --max-new-tokens 15 +lemonade -i amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix oga-load --device npu --dtype int4 llm-prompt -p "hello whats your name?"
--max-new-tokens 15 ``` ``` -Building "amd_Llama-2-7b-hf-awq-g128-int4-asym-fp32-onnx-ryzen-strix" -[Vitis AI EP] No. of Operators : CPU 73 MATMULNBITS 99 -[Vitis AI EP] No. of Subgraphs :MATMULNBITS 33 +Building "amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix" ✓ Loading OnnxRuntime-GenAI model ✓ Prompting LLM -amd/Llama-2-7b-hf-awq-g128-int4-asym-fp32-onnx-ryzen-strix: +amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix: (executed 1x) - Build dir: C:\Users\danie/.cache/lemonade\amd_Llama-2-7b-hf-awq-g128-int4-asym-fp32-onnx-ryzen-strix + Build dir: C:\Users\ramkr\.cache\lemonade\amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix Status: Successful build! Dtype: int4 Device: npu - Response: hello whats your name? -Hi, I'm a 21 year old male from the + Response: hello whats your name? i'm a robot, and i'm here to help you with any questions + + + +Woohoo! Saved to ~\.cache\lemonade\amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix ``` To test/use the websocket server: ``` -lemonade -i amd/Llama-2-7b-hf-awq-g128-int4-asym-fp32-onnx-ryzen-strix oga-load --device npu --dtype int4 serve --max-new-tokens 50 +lemonade -i amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix oga-load --device npu --dtype int4 serve --max-new-tokens 50 ``` Then open the address (http://localhost:8000) in a browser and chat with it. ``` -Building "amd_Llama-2-7b-hf-awq-g128-int4-asym-fp32-onnx-ryzen-strix" -[Vitis AI EP] No. of Operators : CPU 73 MATMULNBITS 99 -[Vitis AI EP] No. of Subgraphs :MATMULNBITS 33 +Building "amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix" ✓ Loading OnnxRuntime-GenAI model + Launching LLM Server - -INFO: Started server process [27752] +INFO: Started server process [8704] INFO: Waiting for application startup. INFO: Application startup complete.
INFO: Uvicorn running on http://localhost:8000 (Press CTRL+C to quit) -INFO: ::1:54973 - "GET / HTTP/1.1" 200 OK -INFO: ('::1', 54975) - "WebSocket /ws" [accepted] +INFO: ::1:57038 - "GET / HTTP/1.1" 200 OK +INFO: ('::1', 57042) - "WebSocket /ws" [accepted] INFO: connection open -I'm a newbie here. I'm looking for a good place to buy a domain name. I've been looking around and i've found a few good places. ``` To run a single MMLU test: ``` -lemonade -i amd/Llama-2-7b-hf-awq-g128-int4-asym-fp32-onnx-ryzen-strix oga-load --device npu --dtype int4 accuracy-mmlu --tests management +lemonade -i amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix oga-load --device npu --dtype int4 accuracy-mmlu --tests management ``` ``` -Building "amd_Llama-2-7b-hf-awq-g128-int4-asym-fp32-onnx-ryzen-strix" -[Vitis AI EP] No. of Operators : CPU 73 MATMULNBITS 99 -[Vitis AI EP] No. of Subgraphs :MATMULNBITS 33 - ✓ Loading OnnxRuntime-GenAI model - ✓ Measuring accuracy with MMLU +Building "amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix" + ✓ Loading OnnxRuntime-GenAI model + ✓ Measuring accuracy with MMLU -amd/Llama-2-7b-hf-awq-g128-int4-asym-fp32-onnx-ryzen-strix: +amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix: (executed 1x) - Build dir: C:\Users\danie/.cache/lemonade\amd_Llama-2-7b-hf-awq-g128-int4-asym-fp32-onnx-ryzen-strix + Build dir: C:\Users\ramkr\.cache\lemonade\amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix Status: Successful build! - Dtype: int4 - Device: npu - Mmlu Management Accuracy: 56.31 % + Dtype: int4 + Device: npu + Mmlu Management Accuracy: 49.515 % + + + +Woohoo! 
Saved to ~\.cache\lemonade\amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix + ``` diff --git a/src/turnkeyml/llm/tools/ort_genai/oga.py b/src/turnkeyml/llm/tools/ort_genai/oga.py index acd3c49..cf176c5 100644 --- a/src/turnkeyml/llm/tools/ort_genai/oga.py +++ b/src/turnkeyml/llm/tools/ort_genai/oga.py @@ -12,6 +12,7 @@ import os import time import json +import shutil from fnmatch import fnmatch from queue import Queue from huggingface_hub import snapshot_download @@ -35,7 +36,13 @@ oga_model_builder_cache_path = "model_builder" # Mapping from processor to executiion provider, used in pathnames and by model_builder -execution_providers = {"cpu": "cpu", "npu": "npu", "igpu": "dml", "cuda": "cuda"} +execution_providers = { + "cpu": "cpu", + "npu": "npu", + "igpu": "dml", + "hybrid": "hybrid", + "cuda": "cuda", +} class OrtGenaiTokenizer(TokenizerAdapter): @@ -248,7 +255,7 @@ def parser(add_help: bool = True) -> argparse.ArgumentParser: parser.add_argument( "-d", "--device", - choices=["igpu", "npu", "cpu", "cuda"], + choices=["igpu", "npu", "cpu", "hybrid", "cuda"], default="igpu", help="Which device to load the model on to (default: igpu)", ) @@ -312,8 +319,10 @@ def run( "cpu": {"int4": "*/*", "fp32": "*/*"}, "igpu": {"int4": "*/*", "fp16": "*/*"}, "npu": {"int4": "amd/**-onnx-ryzen-strix"}, + "hybrid": {"int4": "amd/**-hybrid"}, "cuda": {"int4": "*/*", "fp16": "*/*"}, } + hf_supported = ( device in hf_supported_models and dtype in hf_supported_models[device] @@ -358,7 +367,7 @@ def run( ) # Download the model from HF - if device == "npu": + if device == "npu" or device == "hybrid": # NPU models on HF are ready to go and HF does its own caching full_model_path = snapshot_download( @@ -367,6 +376,67 @@ def run( ) oga_models_subfolder = None + if device == "hybrid": + # Locate the hybrid-llm-artifacts_1.3.0 directory via the AMD_OGA_HYBRID environment variable + hybrid_artifacts_path = None + hybrid_artifacts_path = os.environ.get("AMD_OGA_HYBRID") + + if
hybrid_artifacts_path is None: + raise RuntimeError( + "Could not find hybrid-llm-artifacts_1.3.0 in system PATH. " + "Please ensure it is added to your PATH environment variable." + ) + + if hybrid_artifacts_path: + # Construct the path to onnx_custom_ops.dll + custom_ops_path = os.path.join( + hybrid_artifacts_path, + "hybrid-llm-artifacts", + "onnx_utils", + "bin", + "onnx_custom_ops.dll", + ) + + config_path = os.path.join(full_model_path, "genai_config.json") + + # Check if the config file exists + if os.path.exists(config_path): + with open(config_path, "r", encoding="utf-8") as f: + config = json.load(f) + + # Modify the custom_ops_library under decoder -> session_options + if ( + "model" in config + and "decoder" in config["model"] + and "session_options" in config["model"]["decoder"] + ): + config["model"]["decoder"]["session_options"][ + "custom_ops_library" + ] = custom_ops_path + + # Write the changes back to the file + with open(config_path, "w", encoding="utf-8") as f: + json.dump(config, f, indent=4) + + # Copy DirectML.dll from lib to bin folder + src_dll = os.path.join( + hybrid_artifacts_path, + "hybrid-llm-artifacts", + "onnxruntime_genai", + "lib", + "DirectML.dll", + ) + dst_dll = os.path.join( + hybrid_artifacts_path, + "hybrid-llm-artifacts", + "onnx_utils", + "bin", + "DirectML.dll", + ) + + # Create destination directory if it doesn't exist + os.makedirs(os.path.dirname(dst_dll), exist_ok=True) + shutil.copy2(src_dll, dst_dll) else: # device is 'cpu' or 'igpu'