Add CPU oga support and update hyperparameters
ramkrishna2910 committed Oct 11, 2024
1 parent dbbea14 commit 394cf9d
Showing 2 changed files with 53 additions and 10 deletions.
7 changes: 6 additions & 1 deletion src/turnkeyml/llm/README.md
@@ -99,8 +99,13 @@ You can also try Phi-3-Mini-128k-Instruct with the following commands:
 
 `lemonade -i microsoft/Phi-3-mini-128k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are"`
 
+You can also try out the CPU with:
 
-> Note: no other models or devices are officially supported by `lemonade` on OGA at this time. Contributions appreciated!
+`huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx --include cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/* --local-dir REPO_ROOT/src/turnkeyml/llm/tools/ort_genai/models/phi-3-mini-128k-instruct`
+
+`lemonade -i microsoft/Phi-3-mini-128k-instruct oga-load --device cpu --dtype int4 llm-prompt -p "Hello, my thoughts are"`
+
+> Note: no other models or devices are officially supported by `lemonade` on OGA at this time. Contributions appreciated! It only takes a few minutes to add a new model, we just need to add a path to the downloaded model folder to the supported models dictionary in [oga.py](https://github.com/onnx/turnkeyml/blob/v4.0.2/src/turnkeyml/llm/tools/ort_genai/oga.py).
 ## Install RyzenAI NPU

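The note above points at the supported models dictionary in `oga.py` as the place to register new models. A minimal sketch of what such an entry might look like, assuming a made-up checkpoint name and folder layout (the real dictionary, including the new CPU entry, appears in the `oga.py` diff below):

```python
import os

# Hypothetical entry only: "my-org/My-Model-7B" and the folder names are
# placeholders, not models supported by lemonade. The real dictionary lives
# in src/turnkeyml/llm/tools/ort_genai/oga.py.
supported_models = {
    "cpu": {
        "int4": {
            "my-org/My-Model-7B": os.path.join(
                "my-model-7b",
                "cpu_and_mobile",
                "cpu-int4-rtn-block-32-acc-level-4",
            ),
        }
    },
}
```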
56 changes: 47 additions & 9 deletions src/turnkeyml/llm/tools/ort_genai/oga.py
@@ -5,6 +5,7 @@
 import argparse
 import os
 import time
+import json
 from queue import Queue
 import onnxruntime_genai as og
 from turnkeyml.state import State
@@ -70,6 +71,14 @@ def __init__(self, input_folder):
         super().__init__()
         self.model = og.Model(input_folder)
         self.type = "ort-genai"
+        self.config = self.load_config(input_folder)
+
+    def load_config(self, input_folder):
+        config_path = os.path.join(input_folder, 'genai_config.json')
+        if os.path.exists(config_path):
+            with open(config_path, 'r', encoding='utf-8') as f:
+                return json.load(f)
+        return None
 
     def generate(
         self,
@@ -90,14 +99,34 @@ def generate(
         max_length = len(input_ids) + max_new_tokens
 
         params.input_ids = input_ids
-        params.set_search_options(
-            do_sample=do_sample,
-            top_k=top_k,
-            top_p=top_p,
-            temperature=temperature,
-            max_length=max_length,
-            min_length=max_length,
-        )
+        if self.config and 'search' in self.config:
+            search_config = self.config['search']
+            params.set_search_options(
+                do_sample=search_config.get('do_sample', do_sample),
+                top_k=search_config.get('top_k', top_k),
+                top_p=search_config.get('top_p', top_p),
+                temperature=search_config.get('temperature', temperature),
+                max_length=max_length,
+                min_length=0,
+                early_stopping=search_config.get('early_stopping', False),
+                length_penalty=search_config.get('length_penalty', 1.0),
+                num_beams=search_config.get('num_beams', 1),
+                num_return_sequences=search_config.get('num_return_sequences', 1),
+                repetition_penalty=search_config.get('repetition_penalty', 1.0),
+                past_present_share_buffer=search_config.get('past_present_share_buffer', True),
+                # Not currently supported by OGA
+                # diversity_penalty=search_config.get('diversity_penalty', 0.0),
+                # no_repeat_ngram_size=search_config.get('no_repeat_ngram_size', 0),
+            )
+        else:
+            params.set_search_options(
+                do_sample=do_sample,
+                top_k=top_k,
+                top_p=top_p,
+                temperature=temperature,
+                max_length=max_length,
+                min_length=max_length,
+            )
         params.try_graph_capture_with_max_batch_size(1)
 
         generator = og.Generator(self.model, params)
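The new branch above gives file-provided search options precedence over the arguments passed to `generate()`, falling back via `dict.get` when a key is absent. A small self-contained sketch of that precedence rule, with illustrative values rather than any real model's `genai_config.json`:

```python
# Illustrative only: mimics the .get() fallback pattern used above.
search_config = {"do_sample": True, "temperature": 0.7}  # pretend file contents

# Stand-ins for the arguments passed into generate()
do_sample, temperature, top_k = False, 1.0, 50

print(search_config.get("do_sample", do_sample))      # True (from config)
print(search_config.get("temperature", temperature))  # 0.7  (from config)
print(search_config.get("top_k", top_k))              # 50   (argument fallback)
```

Note also that the config-driven path sets `min_length=0`, while the fallback path keeps the previous behavior of pinning `min_length` to `max_length`, which forces generation of exactly `max_new_tokens` tokens.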
@@ -190,7 +219,7 @@ def parser(add_help: bool = True) -> argparse.ArgumentParser:
     parser.add_argument(
         "-d",
         "--device",
-        choices=["igpu", "npu"],
+        choices=["igpu", "npu", "cpu"],
         default="igpu",
         help="Which device to load the model on to (default: igpu)",
     )
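This hunk simply widens the CLI surface: `cpu` becomes a valid `--device` value. A runnable fragment mirroring the option above (a standalone argparse sketch, not the full lemonade parser):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "-d",
    "--device",
    choices=["igpu", "npu", "cpu"],  # "cpu" is the newly added choice
    default="igpu",
    help="Which device to load the model on to (default: igpu)",
)

print(parser.parse_args(["--device", "cpu"]).device)  # cpu
```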
@@ -237,6 +266,15 @@ def run(
                 qwen_1dot5: "qwen1.5-7b-int4",
             }
         },
+        "cpu": {
+            "int4": {
+                phi_3_mini_4k: os.path.join(
+                    "phi-3-mini-4k-instruct",
+                    "cpu_and_mobile",
+                    "cpu-int4-rtn-block-32-acc-level-4",
+                ),
+            }
+        },
     }
 
     try:
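Taken together, the changes wire up the CPU path end to end: `--device cpu --dtype int4` selects the new dictionary entry, and the model folder it points at is loaded through `og.Model`, whose `genai_config.json` then drives the search options. A hedged sketch of that flow; the lookup code itself is elided from this diff, so the resolution shown here is an assumption:

```python
import os
import onnxruntime_genai as og  # same OGA import used by oga.py

# Assumed lookup: device/dtype select the folder registered in the
# supported models dictionary shown above.
rel_path = os.path.join(
    "phi-3-mini-4k-instruct", "cpu_and_mobile", "cpu-int4-rtn-block-32-acc-level-4"
)
models_root = os.path.join("src", "turnkeyml", "llm", "tools", "ort_genai", "models")

# Same call OGAModel.__init__ makes; genai_config.json is read from this folder.
model = og.Model(os.path.join(models_root, rel_path))
```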
