From 3e8c8a4ff1c7bd10aa20bc0d9cf338a0de0fb370 Mon Sep 17 00:00:00 2001 From: Jeremy Fowers Date: Tue, 26 Nov 2024 12:11:00 -0500 Subject: [PATCH 1/9] Release TKML v4.0.6 --- .github/workflows/test_lemonade.yml | 13 +- {src/turnkeyml/llm/docs => docs}/llamacpp.md | 4 +- docs/ort_genai_igpu.md | 50 +++ .../llm/docs => docs}/ort_genai_npu.md | 13 +- src/turnkeyml/llm/README.md | 26 +- src/turnkeyml/llm/cache.py | 1 + src/turnkeyml/llm/cli.py | 6 +- src/turnkeyml/llm/leap.py | 1 + src/turnkeyml/llm/tools/chat.py | 6 + .../llm/tools/ort_genai/models/README.md | 1 - src/turnkeyml/llm/tools/ort_genai/oga.py | 316 +++++++++++------- src/turnkeyml/version.py | 2 +- test/llm_api.py | 15 +- 13 files changed, 306 insertions(+), 148 deletions(-) rename {src/turnkeyml/llm/docs => docs}/llamacpp.md (94%) create mode 100644 docs/ort_genai_igpu.md rename {src/turnkeyml/llm/docs => docs}/ort_genai_npu.md (78%) delete mode 100644 src/turnkeyml/llm/tools/ort_genai/models/README.md diff --git a/.github/workflows/test_lemonade.yml b/.github/workflows/test_lemonade.yml index de79d40c..48c93cca 100644 --- a/.github/workflows/test_lemonade.yml +++ b/.github/workflows/test_lemonade.yml @@ -16,7 +16,10 @@ jobs: make-lemonade: env: LEMONADE_CI_MODE: "True" - runs-on: ubuntu-latest + strategy: + matrix: + os: [ubuntu-latest, windows-latest] + runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v3 - name: Set up Miniconda with 64-bit Python @@ -41,11 +44,17 @@ jobs: shell: bash -el {0} run: | pylint src/turnkeyml/llm --rcfile .pylintrc --disable E0401 + - name: Test HF+CPU server + if: runner.os == 'Windows' + timeout-minutes: 10 + uses: ./.github/actions/server-testing + with: + conda_env: -n lemon + load_command: -i facebook/opt-125m huggingface-load - name: Run lemonade tests shell: bash -el {0} run: | lemonade -i facebook/opt-125m huggingface-load llm-prompt -p "hi" --max-new-tokens 10 - python test/llm_api.py diff --git a/src/turnkeyml/llm/docs/llamacpp.md b/docs/llamacpp.md similarity index 94% rename from src/turnkeyml/llm/docs/llamacpp.md rename to docs/llamacpp.md index cad21872..137e2ffa 100644 --- a/src/turnkeyml/llm/docs/llamacpp.md +++ b/docs/llamacpp.md @@ -8,7 +8,7 @@ This flow has been verified with a generic Llama.cpp model. These instructions are only for linux or Windows with wsl. It may be necessary to be running WSL in an Administrator command prompt. -These instructions also assume that TurnkeyML's llm extensions have been installed (for example with "pip install -e .[llm]") +These instructions also assumes that lemonade has been installed. ### Set up Environment (Assumes TurnkeyML is already installed) @@ -45,4 +45,4 @@ lemonade --input ~/llama.cpp/models/dolphin-llama2-7b.Q5_K_M.gguf load-llama-cpp On windows, the llama.cpp binary might be in a different location (such as llama.cpp\build\bin\Release\), in which case the command mgiht be something like: ```bash lemonade --input ~\llama.cpp\models\dolphin-llama2-7b.Q5_K_M.gguf load-llama-cpp --executable ~\llama.cpp\build\bin\Release\llama-cli accuracy-mmlu --ntrain 5 -``` \ No newline at end of file +``` diff --git a/docs/ort_genai_igpu.md b/docs/ort_genai_igpu.md new file mode 100644 index 00000000..bf75f718 --- /dev/null +++ b/docs/ort_genai_igpu.md @@ -0,0 +1,50 @@ +# OnnxRuntime GenAI (OGA) for iGPU and CPU + +onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running ONNX LLMs: https://github.com/microsoft/onnxruntime-genai/tree/main?tab=readme-ov-file + +## Installation + +To install: + +1. 
`conda create -n oga-igpu python=3.9` +1. `conda activate oga-igpu` +1. `pip install -e path/to/genai[oga-igpu]` + - Note: don't forget the `[oga-igpu]` at the end, this is what installs ort-genai +1. Get models: + - The oga-load tool can download models from Hugging Face and build ONNX files using oga model_builder. Models can be quantized and optimized for both igpu and cpu. + - Download and build ONNX model files: + - `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4` + - `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device cpu --dtype int4` + - The ONNX model files will be stored in the respective subfolder of the lemonade cache folder and will be reused in future oga-load calls: + - `oga_models\microsoft_phi-3-mini-4k-instruct\dml-int4` + - `oga_models\microsoft_phi-3-mini-4k-instruct\cpu-int4` + - The ONNX model build process can be forced to run again, overwriting the above cache, by using the --force flag: + `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 --force` + - Transformer model architectures supported by the model_builder tool include many popular state-of-the-art models: + - Gemma + - LLaMa + - Mistral + - Phi + - Qwen + - Nemotron + - For the full list of supported models, please see the + [model_builder documentation](https://github.com/microsoft/onnxruntime-genai/blob/main/src/python/py/models/README.md). + - The following quantizations are supported for automatically building ONNXRuntime GenAI model files from the Hugging Face repository: + - cpu: fp32, int4 + - igpu: fp16, int4 +1. Directory structure: + - The model_builder tool caches Hugging Face files and temporary ONNX external data files in `\model_builder` + - The output from model_builder is stored in `\oga_models\\` + - `MODELNAME` is the Hugging Face checkpoint name where any '/' is mapped to an '_' and everything is lower case + - `SUBFOLDER` is `-`, where `EP` is the execution provider (`dml` for igpu, `cpu` for cpu, and `npu` for npu) and `DTYPE` is the datatype + - If the --int4-block-size flag is used then `SUBFOLDER` is` --block-` where `SIZE` is the specified block size + - Other ONNX models in the format required by onnxruntime-genai can be loaded in lemonade if placed in the `\oga_models` folder. + Use the -i and --subfolder flags to specify the folder and subfolder: + `lemonade -i my_model_name --subfolder my_subfolder --device igpu --dtype int4 oga-load` + Lemonade will expect the ONNX model files to be located in `\oga_models\my_model_name\my_subfolder` + +## Usage + +Prompt: `lemonade -i meta-llama/Llama-3.2-1B-Instruct oga-load --device igpu --dtype int4 llm-prompt -p "My thoughts are" --max-new-tokens 50` + +Serving: `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --dtype int4 --device igpu serve --max-new-tokens 100` \ No newline at end of file diff --git a/src/turnkeyml/llm/docs/ort_genai_npu.md b/docs/ort_genai_npu.md similarity index 78% rename from src/turnkeyml/llm/docs/ort_genai_npu.md rename to docs/ort_genai_npu.md index 2ce7c9f9..02a60a32 100644 --- a/src/turnkeyml/llm/docs/ort_genai_npu.md +++ b/docs/ort_genai_npu.md @@ -6,7 +6,6 @@ onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running ### Warnings - - Users have experienced inconsistent results across models and machines. If one model isn't working well on your laptop, try one of the other models. - The OGA wheels need to be installed in a specific order or you will end up with the wrong packages in your environment. 
If you see pip dependency errors, please delete your conda env and start over with a fresh environment. ### Installation @@ -21,17 +20,13 @@ onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running 1. Access the [AMD RyzenAI EA Lounge](https://account.amd.com/en/member/ryzenai-sw-ea.html#tabs-a5e122f973-item-4757898120-tab) and download `amd_oga_Oct4_2024.zip` from `Ryzen AI 1.3 Preview Release`. 1. Unzip `amd_oga_Oct4_2024.zip` 1. Setup your folder structure: - 1. Copy all of the content inside `amd_oga` to lemonade's `REPO_ROOT\src\lemonade\tools\ort_genai\models\` - 1. Move all dlls from `REPO_ROOT\src\lemonade\tools\ort_genai\models\libs` to `REPO_ROOT\src\lemonade\tools\ort_genai\models\` + 1. Copy the `amd_oga` folder from the above zip file, if desired + 1. Create the system environment variable `AMD_OGA` and set it to the path to the `amd_oga` folder 1. Install the wheels: - 1. `cd amd_oga\wheels` + 1. `cd %AMD_OGA%\wheels` 1. `pip install onnxruntime_genai-0.5.0.dev0-cp310-cp310-win_amd64.whl` 1. `pip install onnxruntime_vitisai-1.20.0-cp310-cp310-win_amd64.whl` 1. `pip install voe-1.2.0-cp310-cp310-win_amd64.whl` -1. Ensure you have access to the models on Hungging Face: - 1. Ensure you can access the models under [quark-quantized-onnx-llms-for-ryzen-ai-13-ea](https://huggingface.co/collections/amd/quark-quantized-onnx-llms-for-ryzen-ai-13-ea-66fc8e24927ec45504381902) on Hugging Face. Models are gated and you may have to request access. - 1. Create a Hugging Face Access Token [here](https://huggingface.co/settings/tokens). Ensure you select `Read access to contents of all public gated repos you can access` if creating a finegrained token. - 1. Set your Hugging Face token as an environment variable: `set HF_TOKEN=` 1. Install driver 1. Access the [AMD RyzenAI EA Lounge](https://account.amd.com/en/member/ryzenai-sw-ea.html#tabs-a5e122f973-item-4757898120-tab) and download `Win24AIDriver.zip` from `Ryzen AI 1.3 Preview Release`. 1. Unzip `Win24AIDriver.zip` @@ -40,7 +35,7 @@ onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running ### Runtime -To test basic functionality, point lemonade to any of the models under under [quark-quantized-onnx-llms-for-ryzen-ai-13-ea](https://huggingface.co/collections/amd/quark-quantized-onnx-llms-for-ryzen-ai-13-ea-66fc8e24927ec45504381902): +To test basic functionality, point lemonade to any of the models under [quark-quantized-onnx-llms-for-ryzen-ai-13-ea](https://huggingface.co/collections/amd/quark-quantized-onnx-llms-for-ryzen-ai-13-ea-66fc8e24927ec45504381902): ``` lemonade -i amd/Llama-2-7b-hf-awq-g128-int4-asym-fp32-onnx-ryzen-strix oga-load --device npu --dtype int4 llm-prompt -p "hello whats your name?" --max-new-tokens 15 diff --git a/src/turnkeyml/llm/README.md b/src/turnkeyml/llm/README.md index a469ff62..5def1987 100644 --- a/src/turnkeyml/llm/README.md +++ b/src/turnkeyml/llm/README.md @@ -5,6 +5,8 @@ Contents: 1. [Getting Started](#getting-started) 1. [Install Specialized Tools](#install-specialized-tools) + - [OnnxRuntime GenAI](#install-onnxruntime-genai) + - [RyzenAI NPU for PyTorch](#install-ryzenai-npu-for-pytorch) 1. [Code Organization](#code-organization) 1. [Contributing](#contributing) @@ -85,29 +87,17 @@ Lemonade supports specialized tools that each require their own setup steps. 
**N ## Install OnnxRuntime-GenAI -To install support for [onnxruntime-genai](https://github.com/microsoft/onnxruntime-genai) (e.g., the `oga-load` Tool), use `pip install -e .[llm-oga-dml]` instead of the default installation command. +To install support for [onnxruntime-genai](https://github.com/microsoft/onnxruntime-genai), use `pip install -e .[llm-oga-dml]` instead of the default installation command. -Next, you need to get an OGA model. Per the OGA instructions, we suggest Phi-3-Mini. Use the following command to download it from Hugging Face, and make sure to set your `--local-dir` to the `REPO_ROOT/src/turnkeyml/llm/ort_genai/models` directory. +You can then load supported OGA models on to CPU or iGPU with the `oga-load` tool, for example: -`huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx --include directml/directml-int4-awq-block-128* --local-dir REPO_ROOT/src/turnkeyml/llm/tools/ort_genai/models/phi-3-mini-4k-instruct` +`lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are"` -You can try it out with: `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are"` +You can learn more about the CPU and iGPU support in our [OGA documentation](https://github.com/onnx/turnkeyml/blob/main/docs/ort_genai_igpu.md). -You can also try Phi-3-Mini-128k-Instruct with the following commands: +> Note: early access to AMD's RyzenAI NPU is also available. See the [RyzenAI NPU OGA documentation](https://github.com/onnx/turnkeyml/blob/main/docs/ort_genai_npu.md) for more information. -`huggingface-cli download microsoft/Phi-3-mini-128k-instruct-onnx --include directml/directml-int4-awq-block-128* --local-dir REPO_ROOT/src/turnkeyml/llm/tools/ort_genai/models/phi-3-mini-128k-instruct` - -`lemonade -i microsoft/Phi-3-mini-128k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are"` - -You can also try out the CPU with: - -`huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx --include cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/* --local-dir REPO_ROOT/src/turnkeyml/llm/tools/ort_genai/models/phi-3-mini-128k-instruct` - -`lemonade -i microsoft/Phi-3-mini-128k-instruct oga-load --device cpu --dtype int4 llm-prompt -p "Hello, my thoughts are"` - -> Note: no other models or devices are officially supported by `lemonade` on OGA at this time. Contributions appreciated! It only takes a few minutes to add a new model, we just need to add a path to the downloaded model folder to the supported models dictionary in [oga.py](https://github.com/onnx/turnkeyml/blob/v4.0.2/src/turnkeyml/llm/tools/ort_genai/oga.py). - -## Install RyzenAI NPU +## Install RyzenAI NPU for PyTorch To run your LLMs on RyzenAI NPU, first install and set up the `ryzenai-transformers` conda environment (see instructions [here](https://github.com/amd/RyzenAI-SW/blob/main/example/transformers/models/llm/docs/README.md)). Then, install `lemonade` into `ryzenai-transformers`. The `ryzenai-npu-load` Tool will become available in that environment. 
diff --git a/src/turnkeyml/llm/cache.py b/src/turnkeyml/llm/cache.py index 5c0241f8..6bf90bc8 100644 --- a/src/turnkeyml/llm/cache.py +++ b/src/turnkeyml/llm/cache.py @@ -30,3 +30,4 @@ class Keys: PROMPT_TOKENS = "prompt_tokens" CACHE_DIR = "cache_dir" DEVICE = "device" + OGA_MODELS_SUBFOLDER = "oga_models_subfolder" diff --git a/src/turnkeyml/llm/cli.py b/src/turnkeyml/llm/cli.py index 3ab89c12..bb8e5403 100644 --- a/src/turnkeyml/llm/cli.py +++ b/src/turnkeyml/llm/cli.py @@ -54,6 +54,10 @@ def main(): except ModuleNotFoundError: pass + + + + # Define the argument parser parser = cli.CustomArgumentParser( description="Turnkey analysis and benchmarking of GenAI models. " @@ -103,7 +107,7 @@ def main(): first_tool_args.append(global_args["input"]) state = State( - cache_dir=global_args["cache_dir"], + cache_dir=os.path.abspath(global_args["cache_dir"]), build_name=global_args["input"].replace("/", "_"), sequence_info=sequence.info, ) diff --git a/src/turnkeyml/llm/leap.py b/src/turnkeyml/llm/leap.py index 9ae6f548..75475dc1 100644 --- a/src/turnkeyml/llm/leap.py +++ b/src/turnkeyml/llm/leap.py @@ -117,6 +117,7 @@ def from_pretrained( state = oga.OgaLoad().run( state, + input=checkpoint, device="igpu", dtype="int4", ) diff --git a/src/turnkeyml/llm/tools/chat.py b/src/turnkeyml/llm/tools/chat.py index 8c8ee94f..8daec102 100644 --- a/src/turnkeyml/llm/tools/chat.py +++ b/src/turnkeyml/llm/tools/chat.py @@ -22,6 +22,8 @@ DEFAULT_SERVER_PORT = 8000 +END_OF_STREAM = "" + class LLMPrompt(Tool): """ @@ -338,6 +340,7 @@ async def stream_response(websocket: WebSocket): thread.start() # Generate the response using streaming + new_text = "" for new_text in streamer: # Capture performance stats about this token @@ -365,6 +368,9 @@ async def stream_response(websocket: WebSocket): print("Stopping generation early.") break + if new_text != END_OF_STREAM: + await websocket.send_text(END_OF_STREAM) + self.tokens_per_second = 1 / statistics.mean(self.decode_token_times) print("\n") thread.join() diff --git a/src/turnkeyml/llm/tools/ort_genai/models/README.md b/src/turnkeyml/llm/tools/ort_genai/models/README.md deleted file mode 100644 index e1f24b24..00000000 --- a/src/turnkeyml/llm/tools/ort_genai/models/README.md +++ /dev/null @@ -1 +0,0 @@ -This directory is where your OGA model folders go. 
\ No newline at end of file diff --git a/src/turnkeyml/llm/tools/ort_genai/oga.py b/src/turnkeyml/llm/tools/ort_genai/oga.py index 510dfb85..de5a14a3 100644 --- a/src/turnkeyml/llm/tools/ort_genai/oga.py +++ b/src/turnkeyml/llm/tools/ort_genai/oga.py @@ -1,6 +1,12 @@ # onnxruntime_genai is not lint-friendly yet and PyLint can't # find any of the class methods # pylint: disable=no-member +# +# Model builder constraints: +# 11/10/24 Need transformers <4.45.0 OR onnxruntime-genai 0.5.0 (which must be built from source) +# (transformers v4.45 changes the format of the tokenizer.json file which will be supported in +# onnxruntime-genai 0.5) +# import argparse import os @@ -8,11 +14,13 @@ import json from fnmatch import fnmatch from queue import Queue -from huggingface_hub import snapshot_download, login +from huggingface_hub import snapshot_download import onnxruntime_genai as og +import onnxruntime_genai.models.builder as model_builder from turnkeyml.state import State from turnkeyml.tools import FirstTool import turnkeyml.common.status as status +import turnkeyml.common.printing as printing from turnkeyml.llm.tools.adapter import ( ModelAdapter, TokenizerAdapter, @@ -20,6 +28,15 @@ ) from turnkeyml.llm.cache import Keys +# ONNX Runtime GenAI models will be cached in this subfolder of the lemonade cache folder +oga_models_path = "oga_models" + +# ONNX Runtime GenAI model builder tool uses this subfolder of the lemonade cache as its cache +oga_model_builder_cache_path = "model_builder" + +# Mapping from processor to executiion provider, used in pathnames and by model_builder +execution_providers = {"cpu": "cpu", "npu": "npu", "igpu": "dml"} + class OrtGenaiTokenizer(TokenizerAdapter): def __init__(self, model: og.Model): @@ -182,34 +199,33 @@ def generate( if stopping_criteria[0].stop_event.is_set(): stop_early = True - streamer.add_text("") streamer.done() -# Short names for checkpoints -# So that we don't violate pylint line lengths :) -llama_3 = "meta-llama/Meta-Llama-3-8B" -llama_2 = "meta-llama/Llama-2-7b-chat-hf" -phi_3_mini_4k = "microsoft/Phi-3-mini-4k-instruct" -phi_3_mini_128k = "microsoft/Phi-3-mini-128k-instruct" -qwen_1dot5 = "Qwen/Qwen1.5-7B" - - class OgaLoad(FirstTool): """ - Tool that loads an LLM in OnnxRuntime-GenAI for use with DirectML. + Tool that loads an LLM in OnnxRuntime-GenAI for use with CPU or DirectML execution providers. + + Input: path to a checkpoint. + Supported choices for cpu and igpu from HF model repository: + LLM models on Huggingface supported by model_builder. See documentation + (https://github.com/aigdat/genai/blob/main/docs/ort_genai_igpu.md) for supported models. + Supported choices for npu from HF model repository: + Models on Hugging Face that follow the "amd/**-onnx-ryzen-strix" pattern + Local models for cpu, igpu, or npu: + The specified checkpoint is converted to a local path, via mapping to lower case + and replacing '/' with '_'. If this model already exists in the 'models' folderr + of the lemonade cache and if it has a subfolder -, then this model + will be used. If the --force flag is used and the model is built with model_builder, + then it will be rebuilt. + - Input: path to a checkpoint. 
Supported choices: - llama_3 = "meta-llama/Meta-Llama-3-8B" - llama_2 = "meta-llama/Llama-2-7b-chat-hf" - phi_3_mini_4k = "microsoft/Phi-3-mini-4k-instruct" - phi_3_mini_128k = "microsoft/Phi-3-mini-128k-instruct" - And models on Hugging Face that follow the "amd/**-onnx-ryzen-strix" pattern Output: state.model: handle to a Huggingface-style LLM loaded on DirectML device state.tokenizer = Huggingface-style LLM tokenizer instance state.dtype = data type of the model on DirectML device + state.checkpoint = name of the checkpoint used to load state.model Note: This tool expects the onnxruntime-genai-directml library to be pre-installed. If that library is not installed, this tool will not load. @@ -220,7 +236,7 @@ class OgaLoad(FirstTool): def __init__(self): super().__init__(monitor_message="Loading OnnxRuntime-GenAI model") - self.status_stats = [Keys.DTYPE, Keys.DEVICE] + self.status_stats = [Keys.DTYPE, Keys.DEVICE, Keys.OGA_MODELS_SUBFOLDER] @staticmethod def parser(add_help: bool = True) -> argparse.ArgumentParser: @@ -239,125 +255,199 @@ def parser(add_help: bool = True) -> argparse.ArgumentParser: parser.add_argument( "--dtype", - choices=["int4"], + choices=["int4", "fp16", "fp32"], required=True, help="Data type to load the model in", ) + parser.add_argument( + "--int4-block-size", + default=None, + help="Specify the block_size for int4 quantization.", + choices=[16, 32, 64, 128, 256], + type=int, + ) + + parser.add_argument( + "--force", + action="store_true", + help="Forces downloading of Hugging-Face model again (if changed). Additionally for" + " cpu and igpu devices only, forces model_builder to run again on the HF model" + " (changed or not).", + ) + + parser.add_argument( + "--download", + action="store_true", + help="Download the model if needed, but don't load it", + ) + + parser.add_argument( + "--subfolder", + default=None, + help="Subfolder where model is located /oga_models/" + "/, default is -. 
The EPs are: " + f'{", ".join([value + " for " + key for key, value in execution_providers.items()])}.', + ) + return parser def run( self, state: State, - input: str = phi_3_mini_128k, + input: str, device: str = "igpu", dtype: str = "int4", + int4_block_size: int = None, + force: bool = False, + download: bool = False, + subfolder: str = None, ) -> State: checkpoint = input + state.checkpoint = checkpoint - # Map of models[device][dtype][checkpoint] to the name of the model folder on disk - local_supported_models = { - "igpu": { - "int4": { - phi_3_mini_128k: os.path.join( - "phi-3-mini-128k-instruct", - "directml", - "directml-int4-awq-block-128", - ), - phi_3_mini_4k: os.path.join( - "phi-3-mini-4k-instruct", - "directml", - "directml-int4-awq-block-128", - ), - }, - }, - "npu": { - "int4": { - # Legacy RyzenAI 1.2 models for NPU - llama_2: "llama2-7b-int4", - llama_3: "llama3-8b-int4", - qwen_1dot5: "qwen1.5-7b-int4", - } - }, - "cpu": { - "int4": { - phi_3_mini_4k: os.path.join( - "phi-3-mini-4k-instruct", - "cpu_and_mobile", - "cpu-int4-rtn-block-32-acc-level-4", - ), - } - }, + # See whether the device;dtype;checkpoint combination is supported for download from HF + hf_supported_models = { + "cpu": {"int4": "*/*", "fp32": "*/*"}, + "igpu": {"int4": "*/*", "fp16": "*/*"}, + "npu": {"int4": "amd/**-onnx-ryzen-strix"}, } + hf_supported = ( + device in hf_supported_models + and dtype in hf_supported_models[device] + and fnmatch(checkpoint, hf_supported_models[device][dtype]) + ) - hf_supported_models = {"npu": {"int4": "amd/**-onnx-ryzen-strix"}} - - supported_locally = True - try: - dir_name = local_supported_models[device][dtype][checkpoint] - except KeyError as e: - supported_locally = False - hf_supported = ( - device in hf_supported_models - and dtype in hf_supported_models[device] - and fnmatch(checkpoint, hf_supported_models[device][dtype]) + # Check to see if the model already exists locally + if subfolder is None: + subfolder = f"{execution_providers[device]}-{dtype}" + subfolder += ( + f"-block-{int4_block_size}" + if dtype == "int4" and int4_block_size is not None + else "" ) + oga_models_subfolder = os.path.join( + checkpoint.replace("/", "_").lower(), subfolder + ) + full_model_path = os.path.join( + state.cache_dir, oga_models_path, oga_models_subfolder + ) + model_exists_locally = os.path.isdir(full_model_path) and os.listdir( + full_model_path + ) + + # Check if model needs to be downloaded and/or built or rebuilt + if not model_exists_locally or force: + if not hf_supported: + # Download/build can't be done raise ValueError( - "The device;dtype;checkpoint combination is not supported: " - f"{device};{dtype};{checkpoint}. The supported combinations " - f"are: {local_supported_models} for local models and {hf_supported_models}" - " for models on Hugging Face." - ) from e - - # Create models dir if it doesn't exist - models_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "models") - if not os.path.exists(models_dir): - os.makedirs(models_dir) - - # If the model is supported though Hugging Face, download it - if not supported_locally: - hf_model_name = checkpoint.split("amd/")[1] - dir_name = "_".join(hf_model_name.split("-")[:6]).lower() - api_key = os.getenv("HF_TOKEN") - login(api_key) - snapshot_download( - repo_id=checkpoint, - local_dir=os.path.join(models_dir, dir_name), - ignore_patterns=["*.md", "*.txt"], - ) + "The (device, dtype, checkpoint) combination is not supported: " + f"({device}, {dtype}, {checkpoint}). 
The supported combinations " + f"for Hugging Face models are " + + ", ".join( + [ + f"({dev}, {dt}, {hf_supported_models[dev][dt]})" + for dev in hf_supported_models.keys() + for dt in hf_supported_models[dev] + ] + ) + + "." + ) - current_cwd = os.getcwd() - if device == "npu": - # Change to the models directory - os.chdir(models_dir) + # Download the model from HF + if device == "npu": - # Common environment variables for all NPU models - os.environ["DD_ROOT"] = ".\\bins" - os.environ["DEVICE"] = "stx" - os.environ["XLNX_ENABLE_CACHE"] = "0" + # NPU models on HF are ready to go and HF does its own caching + full_model_path = snapshot_download( + repo_id=checkpoint, + ignore_patterns=["*.md", "*.txt"], + ) + oga_models_subfolder = None - # Phi models require USE_AIE_RoPE=0 - if "phi-" in checkpoint.lower(): - os.environ["USE_AIE_RoPE"] = "0" else: - os.environ["USE_AIE_RoPE"] = "1" - - model_dir = os.path.join(models_dir, dir_name) - state.model = OrtGenaiModel(model_dir) - state.tokenizer = OrtGenaiTokenizer(state.model.model) - state.dtype = dtype - - state.save_stat(Keys.CHECKPOINT, checkpoint) - state.save_stat(Keys.DTYPE, dtype) - state.save_stat(Keys.DEVICE, device) + # device is 'cpu' or 'igpu' + + # Use model_builder to download model and convert to ONNX + printing.log_info(f"Building {checkpoint} for {device} using {dtype}") + extra_options = {} + if int4_block_size is not None: + extra_options["int4-block-size"] = int4_block_size + try: + model_builder.create_model( + checkpoint, # model_name + "", # input_path + full_model_path, # output_path + dtype, # precision + execution_providers[device], # execution_provider + os.path.join( + state.cache_dir, oga_model_builder_cache_path + ), # cache_dir + **extra_options, + ) + except NotImplementedError as e: + # Model architecture is not supported by model builder + raise NotImplementedError("[Model builder] " + str(e)) from e + except OSError as e: + # Model is not found either locally nor in HF repository + raise ValueError("[Model builder] " + str(e)) from e + + if not download: + # The download only flag is not set, so load model + if device == "npu": + if "AMD_OGA" not in os.environ: + raise RuntimeError( + "Please set environment variable AMD_OGA to the path of the amd_oga files" + ) - # Create a UniqueInvocationInfo and ModelInfo so that we can display status - # at the end of the sequence - status.add_to_state(state=state, name=input, model=input) + # Check AMD_OGA points to oga library files + oga_path = os.environ["AMD_OGA"] + if not os.path.exists( + os.path.join(oga_path, "libs", "onnxruntime.dll") + ): + raise RuntimeError( + f"Cannot find libs/onnxruntime.dll in AMD_OGA folder: {oga_path}" + ) - # Put the CWD back to its original value - os.chdir(current_cwd) + # Save current directory and PATH + saved_cwd = os.getcwd() + saved_path = os.environ["PATH"] + + # Change to the AMD_OGA distribution directory + os.chdir(oga_path) + os.environ["PATH"] += os.pathsep + os.path.join( + os.environ["AMD_OGA"], "libs" + ) + + # Common environment variables for all NPU models + os.environ["DD_ROOT"] = ".\\bins" + os.environ["DEVICE"] = "stx" + os.environ["XLNX_ENABLE_CACHE"] = "0" + + # Phi models require USE_AIE_RoPE=0 + if "phi-" in checkpoint.lower(): + os.environ["USE_AIE_RoPE"] = "0" + else: + os.environ["USE_AIE_RoPE"] = "1" + + state.model = OrtGenaiModel(full_model_path) + state.tokenizer = OrtGenaiTokenizer(state.model.model) + state.dtype = dtype + + state.save_stat(Keys.CHECKPOINT, checkpoint) + state.save_stat(Keys.DTYPE, 
dtype) + state.save_stat(Keys.DEVICE, device) + if oga_models_subfolder is not None: + state.save_stat(Keys.OGA_MODELS_SUBFOLDER, oga_models_subfolder) + + # Create a UniqueInvocationInfo and ModelInfo so that we can display status + # at the end of the sequence + status.add_to_state(state=state, name=input, model=input) + + if device == "npu": + # Restore cwd and PATH + os.chdir(saved_cwd) + os.environ["PATH"] = saved_path return state diff --git a/src/turnkeyml/version.py b/src/turnkeyml/version.py index e2a8e2c0..d15f23d8 100644 --- a/src/turnkeyml/version.py +++ b/src/turnkeyml/version.py @@ -1 +1 @@ -__version__ = "4.0.5" +__version__ = "4.0.6" diff --git a/test/llm_api.py b/test/llm_api.py index 28ed5bbe..3977241d 100644 --- a/test/llm_api.py +++ b/test/llm_api.py @@ -1,6 +1,7 @@ import unittest import shutil import os +import urllib3 from turnkeyml.state import State import turnkeyml.common.filesystem as fs import turnkeyml.common.test_helpers as common @@ -10,6 +11,17 @@ ci_mode = os.getenv("LEMONADE_CI_MODE", False) +try: + url = "https://people.eecs.berkeley.edu/~hendrycks/data.tar" + resp = urllib3.request("GET", url, preload_content=False) + if 200 <= resp.status < 400: + eecs_berkeley_edu_cannot_be_reached = False + else: + eecs_berkeley_edu_cannot_be_reached = True + resp.release_conn() +except urllib3.exceptions.HTTPError: + eecs_berkeley_edu_cannot_be_reached = True + class Testing(unittest.TestCase): def setUp(self) -> None: @@ -32,7 +44,8 @@ def test_001_prompt(self): state = LLMPrompt().run(state, prompt=prompt, max_new_tokens=15) assert len(state.response) > len(prompt), state.response - + + @unittest.skipIf(eecs_berkeley_edu_cannot_be_reached, "eecs.berkeley.edu cannot be reached for dataset download") def test_002_accuracy_mmlu(self): # Test MMLU benchmarking with known model checkpoint = "facebook/opt-125m" From 764110aae06ef0b8c263a5ec9982343117c0fb61 Mon Sep 17 00:00:00 2001 From: Jeremy Fowers Date: Tue, 26 Nov 2024 12:32:02 -0500 Subject: [PATCH 2/9] add serve example to OGA readme --- src/turnkeyml/llm/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/turnkeyml/llm/README.md b/src/turnkeyml/llm/README.md index 5def1987..88087e09 100644 --- a/src/turnkeyml/llm/README.md +++ b/src/turnkeyml/llm/README.md @@ -93,6 +93,10 @@ You can then load supported OGA models on to CPU or iGPU with the `oga-load` too `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are"` +You can also launch a server process with: + +`lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 serve` + You can learn more about the CPU and iGPU support in our [OGA documentation](https://github.com/onnx/turnkeyml/blob/main/docs/ort_genai_igpu.md). > Note: early access to AMD's RyzenAI NPU is also available. See the [RyzenAI NPU OGA documentation](https://github.com/onnx/turnkeyml/blob/main/docs/ort_genai_npu.md) for more information. 
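The `serve` example above starts a local HTTP/websocket server on port 8000; the PowerShell action added later in this series exercises it by hitting `/health`, streaming a prompt over `/ws`, and reading `/stats`. The sketch below is a minimal Python client for the same checks, handy for trying the server by hand. It is an illustration rather than part of the patches: it assumes the server is already running locally, that the `urllib3` and `websockets` client packages are installed, and that the end-of-stream marker matches `END_OF_STREAM` in `tools/chat.py` (its literal value is not visible here, so `"</s>"` is a guess).

```python
# Minimal client sketch for the lemonade server started with:
#   lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 serve
#
# Assumptions: server already listening on 127.0.0.1:8000; `urllib3` and
# `websockets` installed; END_OF_STREAM below is a guess -- check
# src/turnkeyml/llm/tools/chat.py for the real marker value.

import asyncio

import urllib3
import websockets

BASE_URL = "http://127.0.0.1:8000"
WS_URL = "ws://127.0.0.1:8000/ws"
END_OF_STREAM = "</s>"  # assumed marker; the server sends this when generation ends


def check_endpoint(path: str) -> None:
    """GET a JSON endpoint such as /health or /stats and print the payload."""
    resp = urllib3.request("GET", BASE_URL + path)
    print(path, resp.status, resp.data.decode())


async def stream_prompt(prompt: str) -> str:
    """Send a prompt over /ws and collect streamed tokens until the end marker."""
    text = ""
    async with websockets.connect(WS_URL) as ws:
        await ws.send(prompt)
        while END_OF_STREAM not in text:
            text += await ws.recv()
    return text.replace(END_OF_STREAM, "")


if __name__ == "__main__":
    check_endpoint("/health")
    print(asyncio.run(stream_prompt("Hello, my thoughts are")))
    check_endpoint("/stats")
```

This mirrors the CI flow (health check, one streamed generation, stats check) without the 30-second polling that the PowerShell action needs, since here the server is assumed to be up before the script runs.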
From fa9e5eb307d626baa7dfa99a786be7f2d8ae3f52 Mon Sep 17 00:00:00 2001 From: Jeremy Fowers Date: Tue, 26 Nov 2024 12:40:40 -0500 Subject: [PATCH 3/9] lint --- src/turnkeyml/llm/cli.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/turnkeyml/llm/cli.py b/src/turnkeyml/llm/cli.py index bb8e5403..e396244a 100644 --- a/src/turnkeyml/llm/cli.py +++ b/src/turnkeyml/llm/cli.py @@ -54,10 +54,6 @@ def main(): except ModuleNotFoundError: pass - - - - # Define the argument parser parser = cli.CustomArgumentParser( description="Turnkey analysis and benchmarking of GenAI models. " From 7d9d713c19f533e474654a859deffd5b1fe1541a Mon Sep 17 00:00:00 2001 From: Jeremy Fowers Date: Tue, 26 Nov 2024 13:55:28 -0500 Subject: [PATCH 4/9] release action, get past errors? --- .github/actions/server-testing/action.yml | 164 ++++++++++++++++++++++ .github/workflows/test_turnkey.yml | 2 +- setup.py | 4 +- 3 files changed, 167 insertions(+), 3 deletions(-) create mode 100644 .github/actions/server-testing/action.yml diff --git a/.github/actions/server-testing/action.yml b/.github/actions/server-testing/action.yml new file mode 100644 index 00000000..467e99ec --- /dev/null +++ b/.github/actions/server-testing/action.yml @@ -0,0 +1,164 @@ +name: "Test Lemonade Server" +description: Launch Lemonade Server and test the endpoints +inputs: + conda_env: + required: true + load_command: + required: true + amd_oga: + required: false + default: "" + description: "Location of the OGA for RyzenAI NPU install directory on disk" +runs: + using: "composite" + steps: + - name: Ensure the Lemonade serer works properly + shell: PowerShell + run: | + $Env:AMD_OGA = "${{ inputs.amd_oga }}" + + $outputFile = "output.log" + $errorFile = "error.log" + $serverProcess = Start-Process -FilePath "conda" -ArgumentList "run ${{ inputs.conda_env }} lemonade -d .\ci-cache ${{ inputs.load_command }} serve --max-new-tokens 10" -RedirectStandardOutput $outputFile -RedirectStandardError $errorFile -PassThru -NoNewWindow + + Write-Host "Wait for 30 seconds to let the server come up" + Start-Sleep -Seconds 30 + + Write-Host "Check if server process successfully launched" + $serverRunning = Get-Process -Id $serverProcess.Id -ErrorAction SilentlyContinue + if (-not $serverRunning) { + Write-Host "Error: Server process isn't running, even though we just tried to start it!" + Write-Host "Standard Output:" + Get-Content $outputFile + + Write-Host "Standard Error:" + Get-Content $errorFile + exit 1 + } else { + Write-Host "Server process is alive." + } + + Write-Host "Wait for the server port to come up" + while ($true) { + + $llmPortCheck = Test-NetConnection -ComputerName 127.0.0.1 -Port 8000 + if (-not $llmPortCheck.TcpTestSucceeded) { + Write-Host "LLM server is not yet running on port 8000!" + Write-Host "Standard Output:" + Get-Content $outputFile + + Write-Host "Standard Error:" + Get-Content $errorFile + } else { + Write-Host "LLM server is running on port 8000." 
+ break + } + + Start-Sleep -Seconds 30 + } + + Write-Host "Checking the /health endpoint" + $response = Invoke-WebRequest -Uri http://127.0.0.1:8000/health -UseBasicParsing + + if ($response.StatusCode -eq 200) { + Write-Output "Good: /health status code is 200" + } else { + Write-Output "Error: /health status code is not 200" + Write-Host "Standard Output:" + Get-Content $outputFile + + Write-Host "Standard Error:" + Get-Content $errorFile + exit 1 + } + + $jsonContent = $response.Content | ConvertFrom-Json + if ($jsonContent) { + Write-Output "Good: /health JSON content is not empty: $jsonContent" + } else { + Write-Output "Error: /health JSON content is empty" + Write-Host "Standard Output:" + Get-Content $outputFile + + Write-Host "Standard Error:" + Get-Content $errorFile + exit 1 + } + + Write-Host "Checking the /ws (streaming generation) endpoint" + + # Define the WebSocket URI + $uri = [System.Uri]::new("ws://127.0.0.1:8000/ws") + + # Create a new ClientWebSocket instance + $webSocket = [System.Net.WebSockets.ClientWebSocket]::new() + + # Connect to the WebSocket server + $webSocket.ConnectAsync($uri, [System.Threading.CancellationToken]::None).Wait() + + # Define the message to send + $message = "Hello, WebSocket!" + $buffer = [System.Text.Encoding]::UTF8.GetBytes($message) + $segment = [System.ArraySegment[byte]]::new($buffer) + + # Send the message + $webSocket.SendAsync($segment, [System.Net.WebSockets.WebSocketMessageType]::Text, $true, [System.Threading.CancellationToken]::None).Wait() + + # Buffer to store the response + $responseBuffer = New-Object byte[] 1024 + $responseSegment = [System.ArraySegment[byte]]::new($responseBuffer) + + # Variable to store the complete response + $response = "" + + # Receive the streaming response + do { + $result = $webSocket.ReceiveAsync($responseSegment, [System.Threading.CancellationToken]::None).Result + $response += [System.Text.Encoding]::UTF8.GetString($responseBuffer, 0, $result.Count) + } while ($response -notlike "**") + + # Close the WebSocket connection + $webSocket.CloseAsync([System.Net.WebSockets.WebSocketCloseStatus]::NormalClosure, "Closing", [System.Threading.CancellationToken]::None).Wait() + + # Check if the response is not empty + if ($response -and $response -notlike "") { + Write-Output "Response is not empty: $response" + } else { + Write-Output "Response is empty or only contains the end marker: $response" + Write-Host "Standard Output:" + Get-Content $outputFile + + Write-Host "Standard Error:" + Get-Content $errorFile + exit 1 + } + + Write-Host "Checking the /stats endpoint" + $response = Invoke-WebRequest -Uri http://127.0.0.1:8000/stats -UseBasicParsing + if ($response.StatusCode -eq 200) { + Write-Output "Good: /stats status code is 200" + } else { + Write-Output "Error: /stats status code is not 200" + Write-Host "Standard Output:" + Get-Content $outputFile + + Write-Host "Standard Error:" + Get-Content $errorFile + exit 1 + } + + $jsonContent = $response.Content | ConvertFrom-Json + if ($jsonContent) { + Write-Output "Good: /stats JSON content is not empty: $jsonContent" + } else { + Write-Output "Error: /stats JSON content is empty" + Write-Host "Standard Output:" + Get-Content $outputFile + + Write-Host "Standard Error:" + Get-Content $errorFile + exit 1 + } + + Write-Host "Close the server process" + Stop-Process -Id $serverProcess.Id \ No newline at end of file diff --git a/.github/workflows/test_turnkey.yml b/.github/workflows/test_turnkey.yml index b4118ef2..3ac2759b 100644 --- 
a/.github/workflows/test_turnkey.yml +++ b/.github/workflows/test_turnkey.yml @@ -36,8 +36,8 @@ jobs: conda install pylint=3.2.7 pip install pytest pip install -e plugins/devices - pip install transformers timm pip install -e . # Required to test current tkml package instead of pypi version + pip install transformers timm python -m pip check - name: Lint with PyLint shell: bash -el {0} diff --git a/setup.py b/setup.py index 0862f873..1d233fae 100644 --- a/setup.py +++ b/setup.py @@ -62,8 +62,8 @@ "llm-oga-dml": [ "onnxruntime-genai-directml==0.4.0", "tqdm", - "torch>=2.0.0", - "transformers", + "torch>=2.0.0,<2.4", + "transformers<4.45.0", "accelerate", "py-cpuinfo", "sentencepiece", From 0a1d70ba64cbb984bdea25a5f205df0b3a2522b2 Mon Sep 17 00:00:00 2001 From: Jeremy Fowers Date: Tue, 26 Nov 2024 14:01:33 -0500 Subject: [PATCH 5/9] this? --- .github/workflows/test_turnkey.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test_turnkey.yml b/.github/workflows/test_turnkey.yml index 3ac2759b..47b5a4a4 100644 --- a/.github/workflows/test_turnkey.yml +++ b/.github/workflows/test_turnkey.yml @@ -37,6 +37,8 @@ jobs: pip install pytest pip install -e plugins/devices pip install -e . # Required to test current tkml package instead of pypi version + # tokenizers 0.20.4 seems to have a bug + pip install tokenizers<0.20.4 pip install transformers timm python -m pip check - name: Lint with PyLint From 86f3e3e91666b13999a57ae429b01f42cfac7cd3 Mon Sep 17 00:00:00 2001 From: Jeremy Fowers Date: Tue, 26 Nov 2024 14:54:21 -0500 Subject: [PATCH 6/9] this?? --- .github/workflows/test_turnkey.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_turnkey.yml b/.github/workflows/test_turnkey.yml index 47b5a4a4..5dd0e053 100644 --- a/.github/workflows/test_turnkey.yml +++ b/.github/workflows/test_turnkey.yml @@ -37,9 +37,9 @@ jobs: pip install pytest pip install -e plugins/devices pip install -e . # Required to test current tkml package instead of pypi version - # tokenizers 0.20.4 seems to have a bug - pip install tokenizers<0.20.4 - pip install transformers timm + # tokenizers 0.20.4 seems to have an install bug, which we must avoid by limiting + # the transformers version + pip install transformers<4.46.3 tokenizers<0.20.4 timm python -m pip check - name: Lint with PyLint shell: bash -el {0} From 0ef8f3becd9c2bef8fafa7bf6e47d6f4fb164a18 Mon Sep 17 00:00:00 2001 From: Jeremy Fowers Date: Tue, 26 Nov 2024 15:11:02 -0500 Subject: [PATCH 7/9] syntax? --- .github/workflows/test_turnkey.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_turnkey.yml b/.github/workflows/test_turnkey.yml index 5dd0e053..52298a6a 100644 --- a/.github/workflows/test_turnkey.yml +++ b/.github/workflows/test_turnkey.yml @@ -39,7 +39,7 @@ jobs: pip install -e . 
# Required to test current tkml package instead of pypi version # tokenizers 0.20.4 seems to have an install bug, which we must avoid by limiting # the transformers version - pip install transformers<4.46.3 tokenizers<0.20.4 timm + pip install "transformers<4.46.3" "tokenizers<0.20.4" timm python -m pip check - name: Lint with PyLint shell: bash -el {0} From 2625b7820c10beebc4867beaf959c1a115e61ea9 Mon Sep 17 00:00:00 2001 From: Jeremy Fowers Date: Tue, 26 Nov 2024 15:44:18 -0500 Subject: [PATCH 8/9] fix --- .github/workflows/test_lemonade.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test_lemonade.yml b/.github/workflows/test_lemonade.yml index 48c93cca..4f18fb66 100644 --- a/.github/workflows/test_lemonade.yml +++ b/.github/workflows/test_lemonade.yml @@ -28,6 +28,7 @@ jobs: miniconda-version: "latest" activate-environment: lemon python-version: "3.10" + run-post: "false" - name: Install dependencies shell: bash -el {0} run: | From f50505a733fb2305890e10df9c2dd72888b0c2c7 Mon Sep 17 00:00:00 2001 From: Jeremy Fowers Date: Tue, 26 Nov 2024 16:21:14 -0500 Subject: [PATCH 9/9] Docs cleanup --- docs/ort_genai_igpu.md | 4 ++-- docs/ort_genai_npu.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/ort_genai_igpu.md b/docs/ort_genai_igpu.md index bf75f718..0514d693 100644 --- a/docs/ort_genai_igpu.md +++ b/docs/ort_genai_igpu.md @@ -8,8 +8,8 @@ To install: 1. `conda create -n oga-igpu python=3.9` 1. `conda activate oga-igpu` -1. `pip install -e path/to/genai[oga-igpu]` - - Note: don't forget the `[oga-igpu]` at the end, this is what installs ort-genai +1. `pip install -e .[llm-oga-igpu]` + - Note: don't forget the `[llm-oga-igpu]` at the end, this is what installs ort-genai 1. Get models: - The oga-load tool can download models from Hugging Face and build ONNX files using oga model_builder. Models can be quantized and optimized for both igpu and cpu. - Download and build ONNX model files: diff --git a/docs/ort_genai_npu.md b/docs/ort_genai_npu.md index 02a60a32..70c322e0 100644 --- a/docs/ort_genai_npu.md +++ b/docs/ort_genai_npu.md @@ -17,7 +17,7 @@ onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running 1. `cd REPO_ROOT` 1. `pip install -e .[oga-npu]` 1. Download required OGA packages - 1. Access the [AMD RyzenAI EA Lounge](https://account.amd.com/en/member/ryzenai-sw-ea.html#tabs-a5e122f973-item-4757898120-tab) and download `amd_oga_Oct4_2024.zip` from `Ryzen AI 1.3 Preview Release`. + 1. Access the [AMD RyzenAI EA Lounge](https://account.amd.com/en/member/ryzenai-sw-ea.html#tabs-a5e122f973-item-4757898120-tab) and download `amd_oga_Oct4_2024.zip` from `Ryzen AI 1.3 EA Release`. 1. Unzip `amd_oga_Oct4_2024.zip` 1. Setup your folder structure: 1. Copy the `amd_oga` folder from the above zip file, if desired @@ -35,7 +35,7 @@ onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running ### Runtime -To test basic functionality, point lemonade to any of the models under [quark-quantized-onnx-llms-for-ryzen-ai-13-ea](https://huggingface.co/collections/amd/quark-quantized-onnx-llms-for-ryzen-ai-13-ea-66fc8e24927ec45504381902): +To test basic functionality, point lemonade to any of the models under [quark-quantized-onnx-llms-for-ryzen-ai-1.3-ea](https://huggingface.co/collections/amd/quark-quantized-onnx-llms-for-ryzen-ai-13-ea-66fc8e24927ec45504381902): ``` lemonade -i amd/Llama-2-7b-hf-awq-g128-int4-asym-fp32-onnx-ryzen-strix oga-load --device npu --dtype int4 llm-prompt -p "hello whats your name?" 
--max-new-tokens 15