From ee84439436da9ea6165fa23113c25395330fb103 Mon Sep 17 00:00:00 2001 From: Jeremy Fowers <80718789+jeremyfowers@users.noreply.github.com> Date: Tue, 26 Nov 2024 16:55:16 -0500 Subject: [PATCH] Release TKML v4.0.6 (#244) Co-authored-by: amd-pworfolk <166068376+amd-pworfolk@users.noreply.github.com> --- .github/actions/server-testing/action.yml | 164 +++++++++ .github/workflows/test_lemonade.yml | 14 +- .github/workflows/test_turnkey.yml | 4 +- {src/turnkeyml/llm/docs => docs}/llamacpp.md | 4 +- docs/ort_genai_igpu.md | 50 +++ .../llm/docs => docs}/ort_genai_npu.md | 15 +- setup.py | 4 +- src/turnkeyml/llm/README.md | 26 +- src/turnkeyml/llm/cache.py | 1 + src/turnkeyml/llm/cli.py | 2 +- src/turnkeyml/llm/leap.py | 1 + src/turnkeyml/llm/tools/chat.py | 6 + .../llm/tools/ort_genai/models/README.md | 1 - src/turnkeyml/llm/tools/ort_genai/oga.py | 316 +++++++++++------- src/turnkeyml/version.py | 2 +- test/llm_api.py | 15 +- 16 files changed, 475 insertions(+), 150 deletions(-) create mode 100644 .github/actions/server-testing/action.yml rename {src/turnkeyml/llm/docs => docs}/llamacpp.md (94%) create mode 100644 docs/ort_genai_igpu.md rename {src/turnkeyml/llm/docs => docs}/ort_genai_npu.md (77%) delete mode 100644 src/turnkeyml/llm/tools/ort_genai/models/README.md diff --git a/.github/actions/server-testing/action.yml b/.github/actions/server-testing/action.yml new file mode 100644 index 00000000..467e99ec --- /dev/null +++ b/.github/actions/server-testing/action.yml @@ -0,0 +1,164 @@ +name: "Test Lemonade Server" +description: Launch Lemonade Server and test the endpoints +inputs: + conda_env: + required: true + load_command: + required: true + amd_oga: + required: false + default: "" + description: "Location of the OGA for RyzenAI NPU install directory on disk" +runs: + using: "composite" + steps: + - name: Ensure the Lemonade serer works properly + shell: PowerShell + run: | + $Env:AMD_OGA = "${{ inputs.amd_oga }}" + + $outputFile = "output.log" + $errorFile = "error.log" + $serverProcess = Start-Process -FilePath "conda" -ArgumentList "run ${{ inputs.conda_env }} lemonade -d .\ci-cache ${{ inputs.load_command }} serve --max-new-tokens 10" -RedirectStandardOutput $outputFile -RedirectStandardError $errorFile -PassThru -NoNewWindow + + Write-Host "Wait for 30 seconds to let the server come up" + Start-Sleep -Seconds 30 + + Write-Host "Check if server process successfully launched" + $serverRunning = Get-Process -Id $serverProcess.Id -ErrorAction SilentlyContinue + if (-not $serverRunning) { + Write-Host "Error: Server process isn't running, even though we just tried to start it!" + Write-Host "Standard Output:" + Get-Content $outputFile + + Write-Host "Standard Error:" + Get-Content $errorFile + exit 1 + } else { + Write-Host "Server process is alive." + } + + Write-Host "Wait for the server port to come up" + while ($true) { + + $llmPortCheck = Test-NetConnection -ComputerName 127.0.0.1 -Port 8000 + if (-not $llmPortCheck.TcpTestSucceeded) { + Write-Host "LLM server is not yet running on port 8000!" + Write-Host "Standard Output:" + Get-Content $outputFile + + Write-Host "Standard Error:" + Get-Content $errorFile + } else { + Write-Host "LLM server is running on port 8000." 
+ break + } + + Start-Sleep -Seconds 30 + } + + Write-Host "Checking the /health endpoint" + $response = Invoke-WebRequest -Uri http://127.0.0.1:8000/health -UseBasicParsing + + if ($response.StatusCode -eq 200) { + Write-Output "Good: /health status code is 200" + } else { + Write-Output "Error: /health status code is not 200" + Write-Host "Standard Output:" + Get-Content $outputFile + + Write-Host "Standard Error:" + Get-Content $errorFile + exit 1 + } + + $jsonContent = $response.Content | ConvertFrom-Json + if ($jsonContent) { + Write-Output "Good: /health JSON content is not empty: $jsonContent" + } else { + Write-Output "Error: /health JSON content is empty" + Write-Host "Standard Output:" + Get-Content $outputFile + + Write-Host "Standard Error:" + Get-Content $errorFile + exit 1 + } + + Write-Host "Checking the /ws (streaming generation) endpoint" + + # Define the WebSocket URI + $uri = [System.Uri]::new("ws://127.0.0.1:8000/ws") + + # Create a new ClientWebSocket instance + $webSocket = [System.Net.WebSockets.ClientWebSocket]::new() + + # Connect to the WebSocket server + $webSocket.ConnectAsync($uri, [System.Threading.CancellationToken]::None).Wait() + + # Define the message to send + $message = "Hello, WebSocket!" + $buffer = [System.Text.Encoding]::UTF8.GetBytes($message) + $segment = [System.ArraySegment[byte]]::new($buffer) + + # Send the message + $webSocket.SendAsync($segment, [System.Net.WebSockets.WebSocketMessageType]::Text, $true, [System.Threading.CancellationToken]::None).Wait() + + # Buffer to store the response + $responseBuffer = New-Object byte[] 1024 + $responseSegment = [System.ArraySegment[byte]]::new($responseBuffer) + + # Variable to store the complete response + $response = "" + + # Receive the streaming response until the end-of-stream marker arrives + do { + $result = $webSocket.ReceiveAsync($responseSegment, [System.Threading.CancellationToken]::None).Result + $response += [System.Text.Encoding]::UTF8.GetString($responseBuffer, 0, $result.Count) + } while ($response -notlike "*</s>*") + + # Close the WebSocket connection + $webSocket.CloseAsync([System.Net.WebSockets.WebSocketCloseStatus]::NormalClosure, "Closing", [System.Threading.CancellationToken]::None).Wait() + + # Check if the response is not empty + if ($response -and $response -notlike "</s>") { + Write-Output "Response is not empty: $response" + } else { + Write-Output "Response is empty or only contains the end marker: $response" + Write-Host "Standard Output:" + Get-Content $outputFile + + Write-Host "Standard Error:" + Get-Content $errorFile + exit 1 + } + + Write-Host "Checking the /stats endpoint" + $response = Invoke-WebRequest -Uri http://127.0.0.1:8000/stats -UseBasicParsing + if ($response.StatusCode -eq 200) { + Write-Output "Good: /stats status code is 200" + } else { + Write-Output "Error: /stats status code is not 200" + Write-Host "Standard Output:" + Get-Content $outputFile + + Write-Host "Standard Error:" + Get-Content $errorFile + exit 1 + } + + $jsonContent = $response.Content | ConvertFrom-Json + if ($jsonContent) { + Write-Output "Good: /stats JSON content is not empty: $jsonContent" + } else { + Write-Output "Error: /stats JSON content is empty" + Write-Host "Standard Output:" + Get-Content $outputFile + + Write-Host "Standard Error:" + Get-Content $errorFile + exit 1 + } + + Write-Host "Close the server process" + Stop-Process -Id $serverProcess.Id \ No newline at end of file
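The composite action above exercises the /health, /ws, and /stats endpoints from PowerShell. For local debugging, a minimal Python sketch of the same REST checks may be handy; this is an illustration only, assuming a lemonade server is already listening on port 8000 (for example, started with `lemonade -i facebook/opt-125m huggingface-load serve --max-new-tokens 10`) and reusing urllib3, which the test suite later in this patch also imports:

```python
# Hypothetical local sanity check for the lemonade server's REST endpoints.
import json
import urllib3

http = urllib3.PoolManager()
for endpoint in ("health", "stats"):
    resp = http.request("GET", f"http://127.0.0.1:8000/{endpoint}")
    assert resp.status == 200, f"/{endpoint} returned status {resp.status}"
    payload = json.loads(resp.data.decode("utf-8"))
    assert payload, f"/{endpoint} returned an empty JSON body"
    print(f"/{endpoint}: {payload}")
```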
diff --git a/.github/workflows/test_lemonade.yml b/.github/workflows/test_lemonade.yml index de79d40c..4f18fb66 100644 --- a/.github/workflows/test_lemonade.yml +++ b/.github/workflows/test_lemonade.yml @@ -16,7 +16,10 @@ jobs: make-lemonade: env: LEMONADE_CI_MODE: "True" - runs-on: ubuntu-latest + strategy: + matrix: + os: [ubuntu-latest, windows-latest] + runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v3 - name: Set up Miniconda with 64-bit Python @@ -25,6 +28,7 @@ jobs: miniconda-version: "latest" activate-environment: lemon python-version: "3.10" + run-post: "false" - name: Install dependencies shell: bash -el {0} run: | @@ -41,11 +45,17 @@ jobs: shell: bash -el {0} run: | pylint src/turnkeyml/llm --rcfile .pylintrc --disable E0401 + - name: Test HF+CPU server + if: runner.os == 'Windows' + timeout-minutes: 10 + uses: ./.github/actions/server-testing + with: + conda_env: -n lemon + load_command: -i facebook/opt-125m huggingface-load - name: Run lemonade tests shell: bash -el {0} run: | lemonade -i facebook/opt-125m huggingface-load llm-prompt -p "hi" --max-new-tokens 10 - python test/llm_api.py diff --git a/.github/workflows/test_turnkey.yml b/.github/workflows/test_turnkey.yml index b4118ef2..52298a6a 100644 --- a/.github/workflows/test_turnkey.yml +++ b/.github/workflows/test_turnkey.yml @@ -36,8 +36,10 @@ jobs: conda install pylint=3.2.7 pip install pytest pip install -e plugins/devices - pip install transformers timm pip install -e . # Required to test current tkml package instead of pypi version + # tokenizers 0.20.4 seems to have an install bug, which we must avoid by limiting + # the transformers version + pip install "transformers<4.46.3" "tokenizers<0.20.4" timm python -m pip check - name: Lint with PyLint shell: bash -el {0} diff --git a/src/turnkeyml/llm/docs/llamacpp.md b/docs/llamacpp.md similarity index 94% rename from src/turnkeyml/llm/docs/llamacpp.md rename to docs/llamacpp.md index cad21872..137e2ffa 100644 --- a/src/turnkeyml/llm/docs/llamacpp.md +++ b/docs/llamacpp.md @@ -8,7 +8,7 @@ This flow has been verified with a generic Llama.cpp model. These instructions are only for Linux or Windows with WSL. It may be necessary to run WSL in an Administrator command prompt. -These instructions also assume that TurnkeyML's llm extensions have been installed (for example with "pip install -e .[llm]") +These instructions also assume that lemonade has been installed. ### Set up Environment (Assumes TurnkeyML is already installed) @@ -45,4 +45,4 @@ lemonade --input ~/llama.cpp/models/dolphin-llama2-7b.Q5_K_M.gguf load-llama-cpp On Windows, the llama.cpp binary might be in a different location (such as llama.cpp\build\bin\Release\), in which case the command might be something like: ```bash lemonade --input ~\llama.cpp\models\dolphin-llama2-7b.Q5_K_M.gguf load-llama-cpp --executable ~\llama.cpp\build\bin\Release\llama-cli accuracy-mmlu --ntrain 5 -``` \ No newline at end of file +``` diff --git a/docs/ort_genai_igpu.md b/docs/ort_genai_igpu.md new file mode 100644 index 00000000..0514d693 --- /dev/null +++ b/docs/ort_genai_igpu.md @@ -0,0 +1,50 @@ +# OnnxRuntime GenAI (OGA) for iGPU and CPU + +onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running ONNX LLMs: https://github.com/microsoft/onnxruntime-genai/tree/main?tab=readme-ov-file + +## Installation + +To install: + +1. `conda create -n oga-igpu python=3.9` +1. `conda activate oga-igpu` +1. `pip install -e .[llm-oga-igpu]` + - Note: don't forget the `[llm-oga-igpu]` at the end; this is what installs ort-genai +1.
Get models: + - The oga-load tool can download models from Hugging Face and build ONNX files using oga model_builder. Models can be quantized and optimized for both igpu and cpu. + - Download and build ONNX model files: + - `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4` + - `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device cpu --dtype int4` + - The ONNX model files will be stored in the respective subfolder of the lemonade cache folder and will be reused in future oga-load calls: + - `oga_models\microsoft_phi-3-mini-4k-instruct\dml-int4` + - `oga_models\microsoft_phi-3-mini-4k-instruct\cpu-int4` + - The ONNX model build process can be forced to run again, overwriting the above cache, by using the --force flag: + `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 --force` + - Transformer model architectures supported by the model_builder tool include many popular state-of-the-art models: + - Gemma + - LLaMa + - Mistral + - Phi + - Qwen + - Nemotron + - For the full list of supported models, please see the + [model_builder documentation](https://github.com/microsoft/onnxruntime-genai/blob/main/src/python/py/models/README.md). + - The following quantizations are supported for automatically building ONNXRuntime GenAI model files from the Hugging Face repository: + - cpu: fp32, int4 + - igpu: fp16, int4 +1. Directory structure: + - The model_builder tool caches Hugging Face files and temporary ONNX external data files in `<lemonade_cache_dir>\model_builder` + - The output from model_builder is stored in `<lemonade_cache_dir>\oga_models\<MODELNAME>\<SUBFOLDER>` + - `MODELNAME` is the Hugging Face checkpoint name where any '/' is mapped to an '_' and everything is lower case + - `SUBFOLDER` is `<EP>-<DTYPE>`, where `EP` is the execution provider (`dml` for igpu, `cpu` for cpu, and `npu` for npu) and `DTYPE` is the datatype + - If the --int4-block-size flag is used then `SUBFOLDER` is `<EP>-<DTYPE>-block-<SIZE>` where `SIZE` is the specified block size + - Other ONNX models in the format required by onnxruntime-genai can be loaded in lemonade if placed in the `<lemonade_cache_dir>\oga_models` folder. + Use the -i and --subfolder flags to specify the folder and subfolder: + `lemonade -i my_model_name --subfolder my_subfolder --device igpu --dtype int4 oga-load` + Lemonade will expect the ONNX model files to be located in `<lemonade_cache_dir>\oga_models\my_model_name\my_subfolder` + +## Usage + +Prompt: `lemonade -i meta-llama/Llama-3.2-1B-Instruct oga-load --device igpu --dtype int4 llm-prompt -p "My thoughts are" --max-new-tokens 50` + +Serving: `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --dtype int4 --device igpu serve --max-new-tokens 100` \ No newline at end of file diff --git a/src/turnkeyml/llm/docs/ort_genai_npu.md b/docs/ort_genai_npu.md similarity index 77% rename from src/turnkeyml/llm/docs/ort_genai_npu.md rename to docs/ort_genai_npu.md index 2ce7c9f9..70c322e0 100644 --- a/src/turnkeyml/llm/docs/ort_genai_npu.md +++ b/docs/ort_genai_npu.md @@ -6,7 +6,6 @@ onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running ### Warnings - - Users have experienced inconsistent results across models and machines. If one model isn't working well on your laptop, try one of the other models. - The OGA wheels need to be installed in a specific order or you will end up with the wrong packages in your environment. If you see pip dependency errors, please delete your conda env and start over with a fresh environment.
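Before rebuilding the conda environment, it can help to confirm exactly which OGA-related wheels are installed; `python -m pip check` (already used by the CI workflows in this patch) surfaces dependency conflicts, and a short script like the hedged sketch below lists versions. The distribution names here are assumptions based on the wheel files in the Installation steps that follow:

```python
# Hypothetical environment check; distribution names are assumptions based on
# the wheels installed below (onnxruntime_genai, onnxruntime_vitisai, voe).
from importlib import metadata

for dist in ("onnxruntime-genai", "onnxruntime-vitisai", "voe", "transformers"):
    try:
        print(f"{dist}=={metadata.version(dist)}")
    except metadata.PackageNotFoundError:
        print(f"{dist}: not installed")
```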
### Installation @@ -18,20 +17,16 @@ onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running 1. `cd REPO_ROOT` 1. `pip install -e .[oga-npu]` 1. Download required OGA packages - 1. Access the [AMD RyzenAI EA Lounge](https://account.amd.com/en/member/ryzenai-sw-ea.html#tabs-a5e122f973-item-4757898120-tab) and download `amd_oga_Oct4_2024.zip` from `Ryzen AI 1.3 Preview Release`. + 1. Access the [AMD RyzenAI EA Lounge](https://account.amd.com/en/member/ryzenai-sw-ea.html#tabs-a5e122f973-item-4757898120-tab) and download `amd_oga_Oct4_2024.zip` from `Ryzen AI 1.3 EA Release`. 1. Unzip `amd_oga_Oct4_2024.zip` 1. Setup your folder structure: - 1. Copy all of the content inside `amd_oga` to lemonade's `REPO_ROOT\src\lemonade\tools\ort_genai\models\` - 1. Move all dlls from `REPO_ROOT\src\lemonade\tools\ort_genai\models\libs` to `REPO_ROOT\src\lemonade\tools\ort_genai\models\` + 1. Copy the `amd_oga` folder from the above zip file, if desired + 1. Create the system environment variable `AMD_OGA` and set it to the path to the `amd_oga` folder 1. Install the wheels: - 1. `cd amd_oga\wheels` + 1. `cd %AMD_OGA%\wheels` 1. `pip install onnxruntime_genai-0.5.0.dev0-cp310-cp310-win_amd64.whl` 1. `pip install onnxruntime_vitisai-1.20.0-cp310-cp310-win_amd64.whl` 1. `pip install voe-1.2.0-cp310-cp310-win_amd64.whl` -1. Ensure you have access to the models on Hungging Face: - 1. Ensure you can access the models under [quark-quantized-onnx-llms-for-ryzen-ai-13-ea](https://huggingface.co/collections/amd/quark-quantized-onnx-llms-for-ryzen-ai-13-ea-66fc8e24927ec45504381902) on Hugging Face. Models are gated and you may have to request access. - 1. Create a Hugging Face Access Token [here](https://huggingface.co/settings/tokens). Ensure you select `Read access to contents of all public gated repos you can access` if creating a finegrained token. - 1. Set your Hugging Face token as an environment variable: `set HF_TOKEN=` 1. Install driver 1. Access the [AMD RyzenAI EA Lounge](https://account.amd.com/en/member/ryzenai-sw-ea.html#tabs-a5e122f973-item-4757898120-tab) and download `Win24AIDriver.zip` from `Ryzen AI 1.3 Preview Release`. 1. Unzip `Win24AIDriver.zip` @@ -40,7 +35,7 @@ onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running ### Runtime -To test basic functionality, point lemonade to any of the models under under [quark-quantized-onnx-llms-for-ryzen-ai-13-ea](https://huggingface.co/collections/amd/quark-quantized-onnx-llms-for-ryzen-ai-13-ea-66fc8e24927ec45504381902): +To test basic functionality, point lemonade to any of the models under [quark-quantized-onnx-llms-for-ryzen-ai-1.3-ea](https://huggingface.co/collections/amd/quark-quantized-onnx-llms-for-ryzen-ai-13-ea-66fc8e24927ec45504381902): ``` lemonade -i amd/Llama-2-7b-hf-awq-g128-int4-asym-fp32-onnx-ryzen-strix oga-load --device npu --dtype int4 llm-prompt -p "hello whats your name?" --max-new-tokens 15 diff --git a/setup.py b/setup.py index 0862f873..1d233fae 100644 --- a/setup.py +++ b/setup.py @@ -62,8 +62,8 @@ "llm-oga-dml": [ "onnxruntime-genai-directml==0.4.0", "tqdm", - "torch>=2.0.0", - "transformers", + "torch>=2.0.0,<2.4", + "transformers<4.45.0", "accelerate", "py-cpuinfo", "sentencepiece", diff --git a/src/turnkeyml/llm/README.md b/src/turnkeyml/llm/README.md index a469ff62..88087e09 100644 --- a/src/turnkeyml/llm/README.md +++ b/src/turnkeyml/llm/README.md @@ -5,6 +5,8 @@ Contents: 1. [Getting Started](#getting-started) 1. 
[Install Specialized Tools](#install-specialized-tools) + - [OnnxRuntime GenAI](#install-onnxruntime-genai) + - [RyzenAI NPU for PyTorch](#install-ryzenai-npu-for-pytorch) 1. [Code Organization](#code-organization) 1. [Contributing](#contributing) @@ -85,29 +87,21 @@ Lemonade supports specialized tools that each require their own setup steps. **N ## Install OnnxRuntime-GenAI -To install support for [onnxruntime-genai](https://github.com/microsoft/onnxruntime-genai) (e.g., the `oga-load` Tool), use `pip install -e .[llm-oga-dml]` instead of the default installation command. +To install support for [onnxruntime-genai](https://github.com/microsoft/onnxruntime-genai), use `pip install -e .[llm-oga-dml]` instead of the default installation command. -Next, you need to get an OGA model. Per the OGA instructions, we suggest Phi-3-Mini. Use the following command to download it from Hugging Face, and make sure to set your `--local-dir` to the `REPO_ROOT/src/turnkeyml/llm/ort_genai/models` directory. +You can then load supported OGA models on to CPU or iGPU with the `oga-load` tool, for example: -`huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx --include directml/directml-int4-awq-block-128* --local-dir REPO_ROOT/src/turnkeyml/llm/tools/ort_genai/models/phi-3-mini-4k-instruct` +`lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are"` -You can try it out with: `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are"` +You can also launch a server process with: -You can also try Phi-3-Mini-128k-Instruct with the following commands: +`lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 serve` -`huggingface-cli download microsoft/Phi-3-mini-128k-instruct-onnx --include directml/directml-int4-awq-block-128* --local-dir REPO_ROOT/src/turnkeyml/llm/tools/ort_genai/models/phi-3-mini-128k-instruct` +You can learn more about the CPU and iGPU support in our [OGA documentation](https://github.com/onnx/turnkeyml/blob/main/docs/ort_genai_igpu.md). -`lemonade -i microsoft/Phi-3-mini-128k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are"` +> Note: early access to AMD's RyzenAI NPU is also available. See the [RyzenAI NPU OGA documentation](https://github.com/onnx/turnkeyml/blob/main/docs/ort_genai_npu.md) for more information. -You can also try out the CPU with: - -`huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx --include cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/* --local-dir REPO_ROOT/src/turnkeyml/llm/tools/ort_genai/models/phi-3-mini-128k-instruct` - -`lemonade -i microsoft/Phi-3-mini-128k-instruct oga-load --device cpu --dtype int4 llm-prompt -p "Hello, my thoughts are"` - -> Note: no other models or devices are officially supported by `lemonade` on OGA at this time. Contributions appreciated! It only takes a few minutes to add a new model, we just need to add a path to the downloaded model folder to the supported models dictionary in [oga.py](https://github.com/onnx/turnkeyml/blob/v4.0.2/src/turnkeyml/llm/tools/ort_genai/oga.py). - -## Install RyzenAI NPU +## Install RyzenAI NPU for PyTorch To run your LLMs on RyzenAI NPU, first install and set up the `ryzenai-transformers` conda environment (see instructions [here](https://github.com/amd/RyzenAI-SW/blob/main/example/transformers/models/llm/docs/README.md)). Then, install `lemonade` into `ryzenai-transformers`. 
The `ryzenai-npu-load` Tool will become available in that environment. diff --git a/src/turnkeyml/llm/cache.py b/src/turnkeyml/llm/cache.py index 5c0241f8..6bf90bc8 100644 --- a/src/turnkeyml/llm/cache.py +++ b/src/turnkeyml/llm/cache.py @@ -30,3 +30,4 @@ class Keys: PROMPT_TOKENS = "prompt_tokens" CACHE_DIR = "cache_dir" DEVICE = "device" + OGA_MODELS_SUBFOLDER = "oga_models_subfolder" diff --git a/src/turnkeyml/llm/cli.py b/src/turnkeyml/llm/cli.py index 3ab89c12..e396244a 100644 --- a/src/turnkeyml/llm/cli.py +++ b/src/turnkeyml/llm/cli.py @@ -103,7 +103,7 @@ def main(): first_tool_args.append(global_args["input"]) state = State( - cache_dir=global_args["cache_dir"], + cache_dir=os.path.abspath(global_args["cache_dir"]), build_name=global_args["input"].replace("/", "_"), sequence_info=sequence.info, ) diff --git a/src/turnkeyml/llm/leap.py b/src/turnkeyml/llm/leap.py index 9ae6f548..75475dc1 100644 --- a/src/turnkeyml/llm/leap.py +++ b/src/turnkeyml/llm/leap.py @@ -117,6 +117,7 @@ def from_pretrained( state = oga.OgaLoad().run( state, + input=checkpoint, device="igpu", dtype="int4", ) diff --git a/src/turnkeyml/llm/tools/chat.py b/src/turnkeyml/llm/tools/chat.py index 8c8ee94f..8daec102 100644 --- a/src/turnkeyml/llm/tools/chat.py +++ b/src/turnkeyml/llm/tools/chat.py @@ -22,6 +22,8 @@ DEFAULT_SERVER_PORT = 8000 +END_OF_STREAM = "</s>" + class LLMPrompt(Tool): """ @@ -338,6 +340,7 @@ async def stream_response(websocket: WebSocket): thread.start() # Generate the response using streaming + new_text = "" for new_text in streamer: # Capture performance stats about this token @@ -365,6 +368,9 @@ async def stream_response(websocket: WebSocket): print("Stopping generation early.") break + if new_text != END_OF_STREAM: + await websocket.send_text(END_OF_STREAM) + self.tokens_per_second = 1 / statistics.mean(self.decode_token_times) print("\n") thread.join() diff --git a/src/turnkeyml/llm/tools/ort_genai/models/README.md b/src/turnkeyml/llm/tools/ort_genai/models/README.md deleted file mode 100644 index e1f24b24..00000000 --- a/src/turnkeyml/llm/tools/ort_genai/models/README.md +++ /dev/null @@ -1 +0,0 @@ -This directory is where your OGA model folders go. \ No newline at end of file
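The chat.py change above makes the /ws endpoint send an explicit END_OF_STREAM marker once generation finishes. As an illustration of how a client might consume that stream, here is a minimal sketch; it assumes the third-party `websockets` package, a lemonade server on port 8000, and that the end marker is the `</s>` string used by the server-testing action earlier in this patch:

```python
# Hypothetical streaming client for the lemonade /ws endpoint.
# Assumes: `pip install websockets`, a server at 127.0.0.1:8000, and that the
# stream is terminated by the END_OF_STREAM marker ("</s>").
import asyncio
import websockets


async def stream(prompt: str) -> None:
    async with websockets.connect("ws://127.0.0.1:8000/ws") as ws:
        await ws.send(prompt)
        while True:
            token = await ws.recv()
            if "</s>" in token:
                break
            print(token, end="", flush=True)
    print()


asyncio.run(stream("Hello, my thoughts are"))
```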
diff --git a/src/turnkeyml/llm/tools/ort_genai/oga.py b/src/turnkeyml/llm/tools/ort_genai/oga.py index 510dfb85..de5a14a3 100644 --- a/src/turnkeyml/llm/tools/ort_genai/oga.py +++ b/src/turnkeyml/llm/tools/ort_genai/oga.py @@ -1,6 +1,12 @@ # onnxruntime_genai is not lint-friendly yet and PyLint can't # find any of the class methods # pylint: disable=no-member +# +# Model builder constraints: +# 11/10/24 Need transformers <4.45.0 OR onnxruntime-genai 0.5.0 (which must be built from source) +# (transformers v4.45 changes the format of the tokenizer.json file which will be supported in +# onnxruntime-genai 0.5) +# import argparse import os @@ -8,11 +14,13 @@ import json from fnmatch import fnmatch from queue import Queue -from huggingface_hub import snapshot_download, login +from huggingface_hub import snapshot_download import onnxruntime_genai as og +import onnxruntime_genai.models.builder as model_builder from turnkeyml.state import State from turnkeyml.tools import FirstTool import turnkeyml.common.status as status +import turnkeyml.common.printing as printing from turnkeyml.llm.tools.adapter import ( ModelAdapter, TokenizerAdapter, @@ -20,6 +28,15 @@ ) from turnkeyml.llm.cache import Keys +# ONNX Runtime GenAI models will be cached in this subfolder of the lemonade cache folder +oga_models_path = "oga_models" + +# ONNX Runtime GenAI model builder tool uses this subfolder of the lemonade cache as its cache +oga_model_builder_cache_path = "model_builder" + +# Mapping from processor to execution provider, used in pathnames and by model_builder +execution_providers = {"cpu": "cpu", "npu": "npu", "igpu": "dml"} + class OrtGenaiTokenizer(TokenizerAdapter): def __init__(self, model: og.Model): @@ -182,34 +199,33 @@ def generate( if stopping_criteria[0].stop_event.is_set(): stop_early = True - streamer.add_text("</s>") streamer.done() -# Short names for checkpoints -# So that we don't violate pylint line lengths :) -llama_3 = "meta-llama/Meta-Llama-3-8B" -llama_2 = "meta-llama/Llama-2-7b-chat-hf" -phi_3_mini_4k = "microsoft/Phi-3-mini-4k-instruct" -phi_3_mini_128k = "microsoft/Phi-3-mini-128k-instruct" -qwen_1dot5 = "Qwen/Qwen1.5-7B" - - class OgaLoad(FirstTool): """ - Tool that loads an LLM in OnnxRuntime-GenAI for use with DirectML. + Tool that loads an LLM in OnnxRuntime-GenAI for use with CPU or DirectML execution providers. + + Input: path to a checkpoint. + Supported choices for cpu and igpu from HF model repository: + LLM models on Huggingface supported by model_builder. See documentation + (https://github.com/aigdat/genai/blob/main/docs/ort_genai_igpu.md) for supported models. + Supported choices for npu from HF model repository: + Models on Hugging Face that follow the "amd/**-onnx-ryzen-strix" pattern + Local models for cpu, igpu, or npu: + The specified checkpoint is converted to a local path, via mapping to lower case + and replacing '/' with '_'. If this model already exists in the 'models' folder + of the lemonade cache and if it has a subfolder <EP>-<dtype>, then this model + will be used. If the --force flag is used and the model is built with model_builder, + then it will be rebuilt. + - Input: path to a checkpoint.
Supported choices: - llama_3 = "meta-llama/Meta-Llama-3-8B" - llama_2 = "meta-llama/Llama-2-7b-chat-hf" - phi_3_mini_4k = "microsoft/Phi-3-mini-4k-instruct" - phi_3_mini_128k = "microsoft/Phi-3-mini-128k-instruct" - And models on Hugging Face that follow the "amd/**-onnx-ryzen-strix" pattern Output: state.model: handle to a Huggingface-style LLM loaded on DirectML device state.tokenizer = Huggingface-style LLM tokenizer instance state.dtype = data type of the model on DirectML device + state.checkpoint = name of the checkpoint used to load state.model Note: This tool expects the onnxruntime-genai-directml library to be pre-installed. If that library is not installed, this tool will not load. @@ -220,7 +236,7 @@ class OgaLoad(FirstTool): def __init__(self): super().__init__(monitor_message="Loading OnnxRuntime-GenAI model") - self.status_stats = [Keys.DTYPE, Keys.DEVICE] + self.status_stats = [Keys.DTYPE, Keys.DEVICE, Keys.OGA_MODELS_SUBFOLDER] @staticmethod def parser(add_help: bool = True) -> argparse.ArgumentParser: @@ -239,125 +255,199 @@ def parser(add_help: bool = True) -> argparse.ArgumentParser: parser.add_argument( "--dtype", - choices=["int4"], + choices=["int4", "fp16", "fp32"], required=True, help="Data type to load the model in", ) + parser.add_argument( + "--int4-block-size", + default=None, + help="Specify the block_size for int4 quantization.", + choices=[16, 32, 64, 128, 256], + type=int, + ) + + parser.add_argument( + "--force", + action="store_true", + help="Forces downloading of Hugging-Face model again (if changed). Additionally for" + " cpu and igpu devices only, forces model_builder to run again on the HF model" + " (changed or not).", + ) + + parser.add_argument( + "--download", + action="store_true", + help="Download the model if needed, but don't load it", + ) + + parser.add_argument( + "--subfolder", + default=None, + help="Subfolder where model is located <lemonade_cache_dir>/oga_models/<model_name>" + "/<subfolder>, default is <EP>-<dtype>.
The EPs are: " + f'{", ".join([value + " for " + key for key, value in execution_providers.items()])}.', + ) + return parser def run( self, state: State, - input: str = phi_3_mini_128k, + input: str, device: str = "igpu", dtype: str = "int4", + int4_block_size: int = None, + force: bool = False, + download: bool = False, + subfolder: str = None, ) -> State: checkpoint = input + state.checkpoint = checkpoint - # Map of models[device][dtype][checkpoint] to the name of the model folder on disk - local_supported_models = { - "igpu": { - "int4": { - phi_3_mini_128k: os.path.join( - "phi-3-mini-128k-instruct", - "directml", - "directml-int4-awq-block-128", - ), - phi_3_mini_4k: os.path.join( - "phi-3-mini-4k-instruct", - "directml", - "directml-int4-awq-block-128", - ), - }, - }, - "npu": { - "int4": { - # Legacy RyzenAI 1.2 models for NPU - llama_2: "llama2-7b-int4", - llama_3: "llama3-8b-int4", - qwen_1dot5: "qwen1.5-7b-int4", - } - }, - "cpu": { - "int4": { - phi_3_mini_4k: os.path.join( - "phi-3-mini-4k-instruct", - "cpu_and_mobile", - "cpu-int4-rtn-block-32-acc-level-4", - ), - } - }, + # See whether the device;dtype;checkpoint combination is supported for download from HF + hf_supported_models = { + "cpu": {"int4": "*/*", "fp32": "*/*"}, + "igpu": {"int4": "*/*", "fp16": "*/*"}, + "npu": {"int4": "amd/**-onnx-ryzen-strix"}, } + hf_supported = ( + device in hf_supported_models + and dtype in hf_supported_models[device] + and fnmatch(checkpoint, hf_supported_models[device][dtype]) + ) - hf_supported_models = {"npu": {"int4": "amd/**-onnx-ryzen-strix"}} - - supported_locally = True - try: - dir_name = local_supported_models[device][dtype][checkpoint] - except KeyError as e: - supported_locally = False - hf_supported = ( - device in hf_supported_models - and dtype in hf_supported_models[device] - and fnmatch(checkpoint, hf_supported_models[device][dtype]) + # Check to see if the model already exists locally + if subfolder is None: + subfolder = f"{execution_providers[device]}-{dtype}" + subfolder += ( + f"-block-{int4_block_size}" + if dtype == "int4" and int4_block_size is not None + else "" ) + oga_models_subfolder = os.path.join( + checkpoint.replace("/", "_").lower(), subfolder + ) + full_model_path = os.path.join( + state.cache_dir, oga_models_path, oga_models_subfolder + ) + model_exists_locally = os.path.isdir(full_model_path) and os.listdir( + full_model_path + ) + + # Check if model needs to be downloaded and/or built or rebuilt + if not model_exists_locally or force: + if not hf_supported: + # Download/build can't be done raise ValueError( - "The device;dtype;checkpoint combination is not supported: " - f"{device};{dtype};{checkpoint}. The supported combinations " - f"are: {local_supported_models} for local models and {hf_supported_models}" - " for models on Hugging Face." - ) from e - - # Create models dir if it doesn't exist - models_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "models") - if not os.path.exists(models_dir): - os.makedirs(models_dir) - - # If the model is supported though Hugging Face, download it - if not supported_locally: - hf_model_name = checkpoint.split("amd/")[1] - dir_name = "_".join(hf_model_name.split("-")[:6]).lower() - api_key = os.getenv("HF_TOKEN") - login(api_key) - snapshot_download( - repo_id=checkpoint, - local_dir=os.path.join(models_dir, dir_name), - ignore_patterns=["*.md", "*.txt"], - ) + "The (device, dtype, checkpoint) combination is not supported: " + f"({device}, {dtype}, {checkpoint}). 
The supported combinations " + f"for Hugging Face models are " + + ", ".join( + [ + f"({dev}, {dt}, {hf_supported_models[dev][dt]})" + for dev in hf_supported_models.keys() + for dt in hf_supported_models[dev] + ] + ) + + "." + ) - current_cwd = os.getcwd() - if device == "npu": - # Change to the models directory - os.chdir(models_dir) + # Download the model from HF + if device == "npu": - # Common environment variables for all NPU models - os.environ["DD_ROOT"] = ".\\bins" - os.environ["DEVICE"] = "stx" - os.environ["XLNX_ENABLE_CACHE"] = "0" + # NPU models on HF are ready to go and HF does its own caching + full_model_path = snapshot_download( + repo_id=checkpoint, + ignore_patterns=["*.md", "*.txt"], + ) + oga_models_subfolder = None - # Phi models require USE_AIE_RoPE=0 - if "phi-" in checkpoint.lower(): - os.environ["USE_AIE_RoPE"] = "0" else: - os.environ["USE_AIE_RoPE"] = "1" - - model_dir = os.path.join(models_dir, dir_name) - state.model = OrtGenaiModel(model_dir) - state.tokenizer = OrtGenaiTokenizer(state.model.model) - state.dtype = dtype - - state.save_stat(Keys.CHECKPOINT, checkpoint) - state.save_stat(Keys.DTYPE, dtype) - state.save_stat(Keys.DEVICE, device) + # device is 'cpu' or 'igpu' + + # Use model_builder to download model and convert to ONNX + printing.log_info(f"Building {checkpoint} for {device} using {dtype}") + extra_options = {} + if int4_block_size is not None: + extra_options["int4-block-size"] = int4_block_size + try: + model_builder.create_model( + checkpoint, # model_name + "", # input_path + full_model_path, # output_path + dtype, # precision + execution_providers[device], # execution_provider + os.path.join( + state.cache_dir, oga_model_builder_cache_path + ), # cache_dir + **extra_options, + ) + except NotImplementedError as e: + # Model architecture is not supported by model builder + raise NotImplementedError("[Model builder] " + str(e)) from e + except OSError as e: + # Model is not found either locally nor in HF repository + raise ValueError("[Model builder] " + str(e)) from e + + if not download: + # The download only flag is not set, so load model + if device == "npu": + if "AMD_OGA" not in os.environ: + raise RuntimeError( + "Please set environment variable AMD_OGA to the path of the amd_oga files" + ) - # Create a UniqueInvocationInfo and ModelInfo so that we can display status - # at the end of the sequence - status.add_to_state(state=state, name=input, model=input) + # Check AMD_OGA points to oga library files + oga_path = os.environ["AMD_OGA"] + if not os.path.exists( + os.path.join(oga_path, "libs", "onnxruntime.dll") + ): + raise RuntimeError( + f"Cannot find libs/onnxruntime.dll in AMD_OGA folder: {oga_path}" + ) - # Put the CWD back to its original value - os.chdir(current_cwd) + # Save current directory and PATH + saved_cwd = os.getcwd() + saved_path = os.environ["PATH"] + + # Change to the AMD_OGA distribution directory + os.chdir(oga_path) + os.environ["PATH"] += os.pathsep + os.path.join( + os.environ["AMD_OGA"], "libs" + ) + + # Common environment variables for all NPU models + os.environ["DD_ROOT"] = ".\\bins" + os.environ["DEVICE"] = "stx" + os.environ["XLNX_ENABLE_CACHE"] = "0" + + # Phi models require USE_AIE_RoPE=0 + if "phi-" in checkpoint.lower(): + os.environ["USE_AIE_RoPE"] = "0" + else: + os.environ["USE_AIE_RoPE"] = "1" + + state.model = OrtGenaiModel(full_model_path) + state.tokenizer = OrtGenaiTokenizer(state.model.model) + state.dtype = dtype + + state.save_stat(Keys.CHECKPOINT, checkpoint) + state.save_stat(Keys.DTYPE, 
dtype) + state.save_stat(Keys.DEVICE, device) + if oga_models_subfolder is not None: + state.save_stat(Keys.OGA_MODELS_SUBFOLDER, oga_models_subfolder) + + # Create a UniqueInvocationInfo and ModelInfo so that we can display status + # at the end of the sequence + status.add_to_state(state=state, name=input, model=input) + + if device == "npu": + # Restore cwd and PATH + os.chdir(saved_cwd) + os.environ["PATH"] = saved_path return state diff --git a/src/turnkeyml/version.py b/src/turnkeyml/version.py index e2a8e2c0..d15f23d8 100644 --- a/src/turnkeyml/version.py +++ b/src/turnkeyml/version.py @@ -1 +1 @@ -__version__ = "4.0.5" +__version__ = "4.0.6" diff --git a/test/llm_api.py b/test/llm_api.py index 28ed5bbe..3977241d 100644 --- a/test/llm_api.py +++ b/test/llm_api.py @@ -1,6 +1,7 @@ import unittest import shutil import os +import urllib3 from turnkeyml.state import State import turnkeyml.common.filesystem as fs import turnkeyml.common.test_helpers as common @@ -10,6 +11,17 @@ ci_mode = os.getenv("LEMONADE_CI_MODE", False) +try: + url = "https://people.eecs.berkeley.edu/~hendrycks/data.tar" + resp = urllib3.request("GET", url, preload_content=False) + if 200 <= resp.status < 400: + eecs_berkeley_edu_cannot_be_reached = False + else: + eecs_berkeley_edu_cannot_be_reached = True + resp.release_conn() +except urllib3.exceptions.HTTPError: + eecs_berkeley_edu_cannot_be_reached = True + class Testing(unittest.TestCase): def setUp(self) -> None: @@ -32,7 +44,8 @@ def test_001_prompt(self): state = LLMPrompt().run(state, prompt=prompt, max_new_tokens=15) assert len(state.response) > len(prompt), state.response - + + @unittest.skipIf(eecs_berkeley_edu_cannot_be_reached, "eecs.berkeley.edu cannot be reached for dataset download") def test_002_accuracy_mmlu(self): # Test MMLU benchmarking with known model checkpoint = "facebook/opt-125m"