From cb2141a79bc5c7a7d4c83fa2cb83beae69948f62 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Tue, 20 Feb 2024 16:54:18 +0100 Subject: [PATCH 01/27] Removing the legacy docs for open models that were broken --- docs/open_models.md | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/docs/open_models.md b/docs/open_models.md index 07c423b4fe..970cfeba22 100644 --- a/docs/open_models.md +++ b/docs/open_models.md @@ -1,23 +1,17 @@ Using with open/local models ============================ -You can integrate `gpt-engineer` with open-source models by leveraging an OpenAI-compatible API. One such API is provided by the [text-generator-ui _extension_ openai](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/openai/README.md). +You can integrate `gpt-engineer` with open-source models by leveraging an OpenAI-compatible API. Setup ----- -To get started, first set up the API with the Runpod template, as per the [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/openai/README.md). - Running the Example ------------------- -Once the API is set up, you can find the host and the exposed TCP port by checking your Runpod dashboard. - -Then, you can use the port and host to run the following example using WizardCoder-Python-34B hosted on Runpod: +On other inference libraries +------------------- -``` - OPENAI_API_BASE=http://:/v1 python -m gpt_engineer.cli.main benchmark/pomodoro_timer --steps benchmark TheBloke_WizardCoder-Python-34B-V1.0-GPTQ -``` Using Azure models ================== From 9f0d79b13f1b38f5c50337f75c361c1be6c30aa4 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Tue, 20 Feb 2024 16:59:07 +0100 Subject: [PATCH 02/27] Updating the chapter structure --- docs/open_models.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/open_models.md b/docs/open_models.md index 970cfeba22..d7defbadbc 100644 --- a/docs/open_models.md +++ b/docs/open_models.md @@ -3,6 +3,8 @@ Using with open/local models You can integrate `gpt-engineer` with open-source models by leveraging an OpenAI-compatible API. +We provide the minimal and cleanest solution below. It's not the only way to use open/local models but the one we recommend and tested. + Setup ----- @@ -12,6 +14,18 @@ Running the Example On other inference libraries ------------------- +Which open model to use +================== + +Your best choice would be: + +- CodeLlama +- Mixtral 8x7B + +On number of parameters +------------------- + +Use the largest model possible that your hardware allows you to run. Sure the responses might be slower but code quality higher. 
Using Azure models ================== From 7bb1da414eee2500c4053d8e597885c3c925a29b Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Fri, 23 Feb 2024 18:28:26 +0100 Subject: [PATCH 03/27] Adding documentation for running a specific open LLM model --- docs/examples/CodeLlama2.py | 0 docs/examples/test_llm_running.py | 13 +++++++++ docs/examples/test_open_llm/README.md | 33 ++++++++++++++++++++++ docs/open_models.md | 40 +++++++++++++++++++++++---- 4 files changed, 81 insertions(+), 5 deletions(-) create mode 100644 docs/examples/CodeLlama2.py create mode 100644 docs/examples/test_llm_running.py create mode 100644 docs/examples/test_open_llm/README.md diff --git a/docs/examples/CodeLlama2.py b/docs/examples/CodeLlama2.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/examples/test_llm_running.py b/docs/examples/test_llm_running.py new file mode 100644 index 0000000000..4e1c534710 --- /dev/null +++ b/docs/examples/test_llm_running.py @@ -0,0 +1,13 @@ +from openai import OpenAI + +client = OpenAI(base_url="http://localhost:8000/v1", api_key="sk-xxx") + +response = client.chat.completions.create( + model="llama2", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the meaning of life?"}, + ], +) + +print(response) diff --git a/docs/examples/test_open_llm/README.md b/docs/examples/test_open_llm/README.md new file mode 100644 index 0000000000..cab7ba6529 --- /dev/null +++ b/docs/examples/test_open_llm/README.md @@ -0,0 +1,33 @@ +# Test that the Open LLM is running + +First start the server by using only CPU: + +```bash +export model_path="models/llama-2-7b.Q2_K.gguf" +python -m llama_cpp.server --model $model_path +``` + +Or with GPU support (recommended): + +```bash +python -m llama_cpp.server --model models/llama-2-7b.Q2_K.gguf --n_gpu_layers 1 +``` + +If you have more `GPU` layers available set `--n_gpu_layers` to the higher number. + +## Test API call + +Then ping it via `python` using `OpenAI` API: + +```bash +python examples/test_open_llm/test_open_llm.py +``` + +Or via `curl`: + +```bash +curl --request POST \ + --url http://localhost:8000/v1/chat/completions \ + --header "Content-Type: application/json" \ + --data '{ "model": "llama", "prompt": "Who are you?", "max_tokens": 60}' +``` \ No newline at end of file diff --git a/docs/open_models.md b/docs/open_models.md index d7defbadbc..a6a732769c 100644 --- a/docs/open_models.md +++ b/docs/open_models.md @@ -8,25 +8,55 @@ We provide the minimal and cleanest solution below. It's not the only way to use Setup ----- -Running the Example -------------------- +For inference engine we recommend to the users to use [llama.cpp](https://github.com/ggerganov/llama.cpp) with its `python` bindings `llama-cpp-python`. We choose `llama.cpp` because it supports the largest amount of hardware acceleration backends. -On other inference libraries -------------------- +To install `llama-cpp-python` follow the official [installation docs](https://llama-cpp-python.readthedocs.io/en/latest/) and for [MacOS with Metal support](https://llama-cpp-python.readthedocs.io/en/latest/install/macos/). 
+ +If you want to have benefit from proper hardware acceleration on your machine make sure to set up the proper compile flags: + +- `linux`: `CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"` +- `macos` with Metal support: `CMAKE_ARGS="-DLLAMA_METAL=on"` +- `windows`: `$env:CMAKE_ARGS = "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"` + +Before running: + +```bash +pip install llama-cpp-python +``` + +For the use of `API` we also need to set up the web server: + +```bash +pip install 'llama-cpp-python[server]' +``` + +For detailed use consult the [`llama-cpp-python` docs](https://llama-cpp-python.readthedocs.io/en/latest/server/). + +Before we proceed we need to obtain the model weights in the `gguf` format. In case you have weights in other formats check the `llama-cpp-python` docs for conversion to `gguf` format. Which open model to use ================== Your best choice would be: -- CodeLlama +- [CodeLlama](examples/CodeLlama2.py) - Mixtral 8x7B +But to first test the setup go and download weights [CodeLlama-7B-GGUF by the `TheBloke`](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF). Once that works feel free to try out larger models on your hardware and see what happens. + On number of parameters ------------------- Use the largest model possible that your hardware allows you to run. Sure the responses might be slower but code quality higher. +Running the Example +================== + +To see that your setup works see [test open LLM](examples/test_open_llm/README.md). + +On other inference libraries +------------------- + Using Azure models ================== From e8c34d263cba734819f27c8616080e4db3807504 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 29 Feb 2024 18:28:11 +0100 Subject: [PATCH 04/27] Adding an explanation why to use open LLL's --- docs/open_models.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/open_models.md b/docs/open_models.md index a6a732769c..526a903574 100644 --- a/docs/open_models.md +++ b/docs/open_models.md @@ -1,6 +1,8 @@ Using with open/local models ============================ +At the moment the best option for coding is still the use of `gpt-4` models provided by OpenAI. But open models are catching up and are a good free and privacy-oriented alternative if you possess the proper hardware. + You can integrate `gpt-engineer` with open-source models by leveraging an OpenAI-compatible API. We provide the minimal and cleanest solution below. It's not the only way to use open/local models but the one we recommend and tested. 
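The two patches above describe the compile flags, the `llama-cpp-python` install, and the need for `gguf` weights as separate steps. A minimal sketch of how those steps combine on one machine is shown below; the `huggingface-cli` call and the exact weight filename are illustrative assumptions (pick whichever quantisation the model repository actually lists), not commands taken from the patched docs.

```bash
# Install llama-cpp-python plus its server extra with hardware acceleration enabled
# (macOS/Metal shown; swap in the Linux or Windows CMAKE_ARGS from the docs above).
CMAKE_ARGS="-DLLAMA_METAL=on" pip install 'llama-cpp-python[server]'

# Fetch GGUF weights for the small test model the docs suggest starting with.
# The repository is the one referenced above; the filename is an assumed example.
pip install huggingface_hub
huggingface-cli download TheBloke/CodeLlama-7B-GGUF codellama-7b.Q4_K_M.gguf --local-dir models
```

Once the weights are on disk, the path to that single `.gguf` file is what the later patches pass to `llama_cpp.server` via `--model`.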
From db6f5bbea065f29ee15d915d7109ffa4b6782890 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 29 Feb 2024 18:54:39 +0100 Subject: [PATCH 05/27] Updating the example call of the open model --- docs/examples/test_llm_running.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/examples/test_llm_running.py b/docs/examples/test_llm_running.py index 4e1c534710..74b31acd36 100644 --- a/docs/examples/test_llm_running.py +++ b/docs/examples/test_llm_running.py @@ -5,9 +5,10 @@ response = client.chat.completions.create( model="llama2", messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "What is the meaning of life?"}, + {"role": "user", "content": "Provide me with the code for a simple HTML web site."}, ], + temperature=0.7, + max_tokens=200, ) -print(response) +print(response.choices[0].message.content) From bf7a802cfdd61bdd2bd16346c6bc9f576a8399c4 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 29 Feb 2024 19:04:15 +0100 Subject: [PATCH 06/27] Adding the option to load local model URL --- gpt_engineer/applications/cli/main.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/gpt_engineer/applications/cli/main.py b/gpt_engineer/applications/cli/main.py index 81970600db..f0740fd0f3 100644 --- a/gpt_engineer/applications/cli/main.py +++ b/gpt_engineer/applications/cli/main.py @@ -58,7 +58,15 @@ def load_env_if_needed(): if os.getenv("OPENAI_API_KEY") is None: # if there is no .env file, try to load from the current working directory load_dotenv(dotenv_path=os.path.join(os.getcwd(), ".env")) - openai.api_key = os.getenv("OPENAI_API_KEY") + + openai.api_key = os.getenv("OPENAI_API_KEY", default=None) + + local_server_url = os.getenv("OPENAI_API_BASE") + + + if local_server_url: + openai.api_base = local_server_url + openai.api_key = "sk-xxx" def load_prompt(input_repo: DiskMemory, improve_mode): From 5d481f151d42792a94617b91018b23dd7ea91199 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 29 Feb 2024 19:08:24 +0100 Subject: [PATCH 07/27] Explaining how to run the open LLM model with gpte --- docs/open_models.md | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/docs/open_models.md b/docs/open_models.md index 526a903574..48a715a7ca 100644 --- a/docs/open_models.md +++ b/docs/open_models.md @@ -54,7 +54,21 @@ Use the largest model possible that your hardware allows you to run. Sure the re Running the Example ================== -To see that your setup works see [test open LLM](examples/test_open_llm/README.md). +To see that your setup works see [test open LLM](examples/test_open_llm/README.md). 
+ +If the tests work, run the LLM in separate terminal: + +```bash +python -m llama_cpp.server --model $model_path +``` + +Then run `gpt-engineer` with the following environment variables: + +```bash +export OPENAI_API_BASE="http://localhost:8000/v1" +export OPENAI_API_KEY="sk-xxx" +gpte +``` On other inference libraries ------------------- From d12e8b4015ce972030703f171a2d77e3aba1bf0c Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 29 Feb 2024 19:08:34 +0100 Subject: [PATCH 08/27] Formating --- gpt_engineer/applications/cli/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpt_engineer/applications/cli/main.py b/gpt_engineer/applications/cli/main.py index f0740fd0f3..fbaaa70382 100644 --- a/gpt_engineer/applications/cli/main.py +++ b/gpt_engineer/applications/cli/main.py @@ -62,7 +62,7 @@ def load_env_if_needed(): openai.api_key = os.getenv("OPENAI_API_KEY", default=None) local_server_url = os.getenv("OPENAI_API_BASE") - + if local_server_url: openai.api_base = local_server_url From c49eb28e4d4e9e970f1aff37147a2903b26a6e11 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Sun, 17 Mar 2024 12:44:56 +0100 Subject: [PATCH 09/27] Adding the necceseary scripts for testing that openLLM works --- docs/examples/CodeLlama2.py | 0 docs/examples/test_open_llm/test_langchain.py | 16 ++++++++++++++++ .../test_openai_api.py} | 0 docs/open_models.md | 4 +++- 4 files changed, 19 insertions(+), 1 deletion(-) delete mode 100644 docs/examples/CodeLlama2.py create mode 100644 docs/examples/test_open_llm/test_langchain.py rename docs/examples/{test_llm_running.py => test_open_llm/test_openai_api.py} (100%) diff --git a/docs/examples/CodeLlama2.py b/docs/examples/CodeLlama2.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/examples/test_open_llm/test_langchain.py b/docs/examples/test_open_llm/test_langchain.py new file mode 100644 index 0000000000..19481070ce --- /dev/null +++ b/docs/examples/test_open_llm/test_langchain.py @@ -0,0 +1,16 @@ +from langchain_openai import ChatOpenAI +from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler + +temperature = 0.1 +model_name = "CodeLlama" + +model = ChatOpenAI( + model=model_name, + temperature=temperature, + callbacks=[StreamingStdOutCallbackHandler()], + streaming=True +) + +prompt = "Provide me with only the code for a simple python function that sums two numbers." + +model.invoke(prompt) diff --git a/docs/examples/test_llm_running.py b/docs/examples/test_open_llm/test_openai_api.py similarity index 100% rename from docs/examples/test_llm_running.py rename to docs/examples/test_open_llm/test_openai_api.py diff --git a/docs/open_models.md b/docs/open_models.md index 48a715a7ca..a47927e1f0 100644 --- a/docs/open_models.md +++ b/docs/open_models.md @@ -67,7 +67,9 @@ Then run `gpt-engineer` with the following environment variables: ```bash export OPENAI_API_BASE="http://localhost:8000/v1" export OPENAI_API_KEY="sk-xxx" -gpte +export model_name="llama2" + +gpte $model_name ``` On other inference libraries From f717498dc6f275a18997add0068f40daa268b200 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Sun, 17 Mar 2024 12:46:56 +0100 Subject: [PATCH 10/27] In the api cost estimation step we don't pay for running our local LLM. 
--- gpt_engineer/applications/cli/main.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gpt_engineer/applications/cli/main.py b/gpt_engineer/applications/cli/main.py index fbaaa70382..921d6d995d 100644 --- a/gpt_engineer/applications/cli/main.py +++ b/gpt_engineer/applications/cli/main.py @@ -267,7 +267,10 @@ def main( store.upload(files_dict) - print("Total api cost: $ ", ai.token_usage_log.usage_cost()) + if openai.api_key == "sk-xxx": + print("Total api cost: $ 0.0 since we are using local LLM.") + else: + print("Total api cost: $ ", ai.token_usage_log.usage_cost()) if __name__ == "__main__": From 29ec1b90bd4aab650c9f81ed6ad2ec2fb5da5238 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 17:46:09 +0100 Subject: [PATCH 11/27] Updating the commands for running the gpte with open models --- docs/open_models.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/open_models.md b/docs/open_models.md index a47927e1f0..c03c7a4f2c 100644 --- a/docs/open_models.md +++ b/docs/open_models.md @@ -59,7 +59,7 @@ To see that your setup works see [test open LLM](examples/test_open_llm/README.m If the tests work, run the LLM in separate terminal: ```bash -python -m llama_cpp.server --model $model_path +python -m llama_cpp.server --model $model_path --n_batch 256 --n_gpu_layers 30 ``` Then run `gpt-engineer` with the following environment variables: @@ -69,7 +69,7 @@ export OPENAI_API_BASE="http://localhost:8000/v1" export OPENAI_API_KEY="sk-xxx" export model_name="llama2" -gpte $model_name +gpte $model_name --lite --temperature 0.1 ``` On other inference libraries From 130da6fe845a38b12e275b734d909fc9e1c8994f Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 18:13:15 +0100 Subject: [PATCH 12/27] Simplifying the test library examples --- docs/examples/test_open_llm/test_langchain.py | 7 ++----- docs/examples/test_open_llm/test_openai_api.py | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/docs/examples/test_open_llm/test_langchain.py b/docs/examples/test_open_llm/test_langchain.py index 19481070ce..d906ab5f20 100644 --- a/docs/examples/test_open_llm/test_langchain.py +++ b/docs/examples/test_open_llm/test_langchain.py @@ -1,12 +1,9 @@ from langchain_openai import ChatOpenAI from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler -temperature = 0.1 -model_name = "CodeLlama" - model = ChatOpenAI( - model=model_name, - temperature=temperature, + model="CodeLlama", + temperature=0.1, callbacks=[StreamingStdOutCallbackHandler()], streaming=True ) diff --git a/docs/examples/test_open_llm/test_openai_api.py b/docs/examples/test_open_llm/test_openai_api.py index 74b31acd36..75eab5b35a 100644 --- a/docs/examples/test_open_llm/test_openai_api.py +++ b/docs/examples/test_open_llm/test_openai_api.py @@ -3,7 +3,7 @@ client = OpenAI(base_url="http://localhost:8000/v1", api_key="sk-xxx") response = client.chat.completions.create( - model="llama2", + model="CodeLlama", messages=[ {"role": "user", "content": "Provide me with the code for a simple HTML web site."}, ], From 5b8c2fd22e65ce9fb5ebef7664749cbda3acd678 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 18:14:11 +0100 Subject: [PATCH 13/27] Fixing unclear parts in the test docs --- docs/examples/test_open_llm/README.md | 35 +++++++++++++++++++++------ 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/docs/examples/test_open_llm/README.md b/docs/examples/test_open_llm/README.md index cab7ba6529..2d9abc47b9 100644 
--- a/docs/examples/test_open_llm/README.md +++ b/docs/examples/test_open_llm/README.md @@ -3,31 +3,52 @@ First start the server by using only CPU: ```bash -export model_path="models/llama-2-7b.Q2_K.gguf" +export model_path="TheBloke/CodeLlama-13B-GGUF/codellama-13b.Q8_0.gguf" python -m llama_cpp.server --model $model_path ``` Or with GPU support (recommended): ```bash -python -m llama_cpp.server --model models/llama-2-7b.Q2_K.gguf --n_gpu_layers 1 +python -m llama_cpp.server --model TheBloke/CodeLlama-13B-GGUF/codellama-13b.Q8_0.gguf --n_gpu_layers 1 ``` -If you have more `GPU` layers available set `--n_gpu_layers` to the higher number. +If you have more `GPU` layers available set `--n_gpu_layers` to the higher number. To find the amount of avalibale run the above command and look for `llm_load_tensors: offloaded 1/41 layers to GPU` in the output. ## Test API call -Then ping it via `python` using `OpenAI` API: +Set the environment variables: ```bash -python examples/test_open_llm/test_open_llm.py +export OPENAI_API_BASE="http://localhost:8000/v1" +export OPENAI_API_KEY="sk-xxx" +```` + +Then ping the model via `python` using `OpenAI` API: + +```bash +python examples/test_open_llm/test_openai_api.py ``` -Or via `curl`: +If you're not using `CodeLLama` make sure to change the `model` parameter in the test script. + +Or using `curl`: ```bash curl --request POST \ --url http://localhost:8000/v1/chat/completions \ --header "Content-Type: application/json" \ --data '{ "model": "llama", "prompt": "Who are you?", "max_tokens": 60}' -``` \ No newline at end of file +``` + +If this works also make sure that `langchain` interface works since that's how `gpte` interacts with LLMs. + +## Langchain test + +```bash +python examples/test_open_llm/test_langchain.py +``` + +If you're not using `CodeLLama` make sure to change the `model` parameter in the test script. + +That's it 🤓 time to give `gpte` a try. \ No newline at end of file From 31c0dc06ccf20b1677b49d52aa7ba9a2fcb7496b Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 18:22:13 +0100 Subject: [PATCH 14/27] Cleaning up the explanations for open model use --- docs/open_models.md | 64 +++++++++++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 17 deletions(-) diff --git a/docs/open_models.md b/docs/open_models.md index c03c7a4f2c..29ff205d0a 100644 --- a/docs/open_models.md +++ b/docs/open_models.md @@ -1,32 +1,43 @@ Using with open/local models ============================ +**Use `gpte` first with OpenAI models to get a feel for the `gpte` tool. Then go play with experimental Open LLMs 🐉 support and try not to get 🔥!!** + At the moment the best option for coding is still the use of `gpt-4` models provided by OpenAI. But open models are catching up and are a good free and privacy-oriented alternative if you possess the proper hardware. You can integrate `gpt-engineer` with open-source models by leveraging an OpenAI-compatible API. -We provide the minimal and cleanest solution below. It's not the only way to use open/local models but the one we recommend and tested. +We provide the minimal and cleanest solution below. What is described is not the only way to use open/local models but the one we tested and would recommend to most users. + +More details on why the solution below is recommended in [this blog post](https://zigabrencic.com/blog/2024-02-21). 
Setup ----- -For inference engine we recommend to the users to use [llama.cpp](https://github.com/ggerganov/llama.cpp) with its `python` bindings `llama-cpp-python`. We choose `llama.cpp` because it supports the largest amount of hardware acceleration backends. +For inference engine we recommend for the users to use [llama.cpp](https://github.com/ggerganov/llama.cpp) with its `python` bindings `llama-cpp-python`. + +We choose `llama.cpp` because: + +- 1.) It supports the largest amount of hardware acceleration backends. +- 2.) Diverse set of open LLM. +- 3.) Is written in `python` and directly on top of `llama.cpp` inference engine. +- 4.) Supports the `openAI` API and `langchain` interface. To install `llama-cpp-python` follow the official [installation docs](https://llama-cpp-python.readthedocs.io/en/latest/) and for [MacOS with Metal support](https://llama-cpp-python.readthedocs.io/en/latest/install/macos/). -If you want to have benefit from proper hardware acceleration on your machine make sure to set up the proper compile flags: +If you want to benefit from proper hardware acceleration on your machine make sure to set up the proper compiler flags before installing your package. - `linux`: `CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"` - `macos` with Metal support: `CMAKE_ARGS="-DLLAMA_METAL=on"` - `windows`: `$env:CMAKE_ARGS = "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"` -Before running: +Then run: ```bash pip install llama-cpp-python ``` -For the use of `API` we also need to set up the web server: +For the use of `API` we also need to set up the web server that `llama-cpp-python` library provides. To install: ```bash pip install 'llama-cpp-python[server]' @@ -34,46 +45,65 @@ pip install 'llama-cpp-python[server]' For detailed use consult the [`llama-cpp-python` docs](https://llama-cpp-python.readthedocs.io/en/latest/server/). -Before we proceed we need to obtain the model weights in the `gguf` format. In case you have weights in other formats check the `llama-cpp-python` docs for conversion to `gguf` format. +Before we proceed we need to obtain the model weights in the `gguf` format. That should be a single file on your disk. + +In case you have weights in other formats check the `llama-cpp-python` docs for conversion to `gguf` format. Model in other formats `ggml`, `.safetensors`, etc. won't work without prior conversion to `gguf` file format! Which open model to use ================== Your best choice would be: -- [CodeLlama](examples/CodeLlama2.py) +- CodeLlama 70B - Mixtral 8x7B -But to first test the setup go and download weights [CodeLlama-7B-GGUF by the `TheBloke`](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF). Once that works feel free to try out larger models on your hardware and see what happens. +We are still testing this part, but the larger the model you can run the better. Sure the responses might be slower in terms of (token/s), but code quality will be higher. -On number of parameters -------------------- +For testing that the open LLM `gpte` setup works we recommend starting with a smaller model. You can download weights of [CodeLlama-7B-GGUF by the `TheBloke`](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF). -Use the largest model possible that your hardware allows you to run. Sure the responses might be slower but code quality higher. +Feel free to try out larger models on your hardware and see what happens. Running the Example ================== -To see that your setup works see [test open LLM](examples/test_open_llm/README.md). 
+To see that your setup works check [test open LLM](examples/test_open_llm/README.md). In case below isn't clear enough do the same 😉 + +If above tests work proceed. + +For checking that `gpte` works with the `CodeLLama` we recommend for you to create a project with `prompt` file content: + +``` +Write a python script that sums up two numbers. Provide only the `sum_two_numbers` function and nothing else. + +Provide two tests: + +assert(sum_two_numbers(100, 10) == 110) +assert(sum_two_numbers(10.1, 10) == 20.1) +``` -If the tests work, run the LLM in separate terminal: +Now run the LLM in separate terminal: ```bash python -m llama_cpp.server --model $model_path --n_batch 256 --n_gpu_layers 30 ``` -Then run `gpt-engineer` with the following environment variables: +Then in another terminal window set following environment variables: ```bash export OPENAI_API_BASE="http://localhost:8000/v1" export OPENAI_API_KEY="sk-xxx" -export model_name="llama2" +export model_name="CodeLLama" + +And run `gpt-engineer` with the following command: + +```bash gpte $model_name --lite --temperature 0.1 ``` -On other inference libraries -------------------- +The `--lite` mode is needed for now since open models for some reason behave worse with too many instructions at the moment. Temperature is set to `0.1` to get consistent best possible results. + +*That's it. If sth. doesn't work as expected or you figure out how to improve the open LLM support please let us know.* Using Azure models ================== From c2bb6bbd17445e6be1c3a2a2529efb214b1c2a01 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 18:30:07 +0100 Subject: [PATCH 15/27] Readablity changes --- docs/open_models.md | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/docs/open_models.md b/docs/open_models.md index 29ff205d0a..73b89a1b17 100644 --- a/docs/open_models.md +++ b/docs/open_models.md @@ -1,13 +1,15 @@ Using with open/local models ============================ -**Use `gpte` first with OpenAI models to get a feel for the `gpte` tool. Then go play with experimental Open LLMs 🐉 support and try not to get 🔥!!** +**Use `gpte` first with OpenAI models to get a feel for the `gpte` tool.** + +**Then go play with experimental Open LLMs 🐉 support and try not to get 🔥!!** At the moment the best option for coding is still the use of `gpt-4` models provided by OpenAI. But open models are catching up and are a good free and privacy-oriented alternative if you possess the proper hardware. You can integrate `gpt-engineer` with open-source models by leveraging an OpenAI-compatible API. -We provide the minimal and cleanest solution below. What is described is not the only way to use open/local models but the one we tested and would recommend to most users. +We provide the minimal and cleanest solution below. What is described is not the only way to use open/local models, but the one we tested and would recommend to most users. More details on why the solution below is recommended in [this blog post](https://zigabrencic.com/blog/2024-02-21). @@ -19,11 +21,11 @@ For inference engine we recommend for the users to use [llama.cpp](https://githu We choose `llama.cpp` because: - 1.) It supports the largest amount of hardware acceleration backends. -- 2.) Diverse set of open LLM. +- 2.) It supports the diverse set of open LLMs. - 3.) Is written in `python` and directly on top of `llama.cpp` inference engine. - 4.) Supports the `openAI` API and `langchain` interface. 
-To install `llama-cpp-python` follow the official [installation docs](https://llama-cpp-python.readthedocs.io/en/latest/) and for [MacOS with Metal support](https://llama-cpp-python.readthedocs.io/en/latest/install/macos/). +To install `llama-cpp-python` follow the official [installation docs](https://llama-cpp-python.readthedocs.io/en/latest/) and [those docs](https://llama-cpp-python.readthedocs.io/en/latest/install/macos/) for MacOS with Metal support. If you want to benefit from proper hardware acceleration on your machine make sure to set up the proper compiler flags before installing your package. @@ -31,13 +33,15 @@ If you want to benefit from proper hardware acceleration on your machine make su - `macos` with Metal support: `CMAKE_ARGS="-DLLAMA_METAL=on"` - `windows`: `$env:CMAKE_ARGS = "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"` +This will enable the `pip` installer to compile the `llama.cpp` with the proper hardware acceleration backend. + Then run: ```bash pip install llama-cpp-python ``` -For the use of `API` we also need to set up the web server that `llama-cpp-python` library provides. To install: +For our use case we also need to set up the web server that `llama-cpp-python` library provides. To install: ```bash pip install 'llama-cpp-python[server]' @@ -47,9 +51,11 @@ For detailed use consult the [`llama-cpp-python` docs](https://llama-cpp-python. Before we proceed we need to obtain the model weights in the `gguf` format. That should be a single file on your disk. -In case you have weights in other formats check the `llama-cpp-python` docs for conversion to `gguf` format. Model in other formats `ggml`, `.safetensors`, etc. won't work without prior conversion to `gguf` file format! +In case you have weights in other formats check the `llama-cpp-python` docs for conversion to `gguf` format. + +Models in other formats `ggml`, `.safetensors`, etc. won't work without prior conversion to `gguf` file format with the solution described below! -Which open model to use +Which open model to use? ================== Your best choice would be: @@ -59,16 +65,16 @@ Your best choice would be: We are still testing this part, but the larger the model you can run the better. Sure the responses might be slower in terms of (token/s), but code quality will be higher. -For testing that the open LLM `gpte` setup works we recommend starting with a smaller model. You can download weights of [CodeLlama-7B-GGUF by the `TheBloke`](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF). +For testing that the open LLM `gpte` setup works we recommend starting with a smaller model. You can download weights of [CodeLlama-13B-GGUF by the `TheBloke`](https://huggingface.co/TheBloke/CodeLlama-13B-GGUF) choose the largest model version you can run (for example `Q6_K`), since quantisation will degrade LLM performance. Feel free to try out larger models on your hardware and see what happens. Running the Example ================== -To see that your setup works check [test open LLM](examples/test_open_llm/README.md). In case below isn't clear enough do the same 😉 +To see that your setup works check [test open LLM setup](examples/test_open_llm/README.md). -If above tests work proceed. 
+If above tests work proceed 😉 For checking that `gpte` works with the `CodeLLama` we recommend for you to create a project with `prompt` file content: @@ -87,13 +93,13 @@ Now run the LLM in separate terminal: python -m llama_cpp.server --model $model_path --n_batch 256 --n_gpu_layers 30 ``` -Then in another terminal window set following environment variables: +Then in another terminal window set the following environment variables: ```bash export OPENAI_API_BASE="http://localhost:8000/v1" export OPENAI_API_KEY="sk-xxx" export model_name="CodeLLama" - +``` And run `gpt-engineer` with the following command: @@ -103,7 +109,9 @@ gpte $model_name --lite --temperature 0.1 The `--lite` mode is needed for now since open models for some reason behave worse with too many instructions at the moment. Temperature is set to `0.1` to get consistent best possible results. -*That's it. If sth. doesn't work as expected or you figure out how to improve the open LLM support please let us know.* +That's it. + +*If sth. doesn't work as expected, or you figure out how to improve the open LLM support please let us know.* Using Azure models ================== From 6b192a99ed2e627140009278d869abbd00cf1c2a Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 18:32:03 +0100 Subject: [PATCH 16/27] Readablity changes --- docs/examples/test_open_llm/README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/examples/test_open_llm/README.md b/docs/examples/test_open_llm/README.md index 2d9abc47b9..ed19b1ef09 100644 --- a/docs/examples/test_open_llm/README.md +++ b/docs/examples/test_open_llm/README.md @@ -13,7 +13,9 @@ Or with GPU support (recommended): python -m llama_cpp.server --model TheBloke/CodeLlama-13B-GGUF/codellama-13b.Q8_0.gguf --n_gpu_layers 1 ``` -If you have more `GPU` layers available set `--n_gpu_layers` to the higher number. To find the amount of avalibale run the above command and look for `llm_load_tensors: offloaded 1/41 layers to GPU` in the output. +If you have more `GPU` layers available set `--n_gpu_layers` to the higher number. + +To find the amount of available run the above command and look for `llm_load_tensors: offloaded 1/41 layers to GPU` in the output. ## Test API call @@ -38,7 +40,7 @@ Or using `curl`: curl --request POST \ --url http://localhost:8000/v1/chat/completions \ --header "Content-Type: application/json" \ - --data '{ "model": "llama", "prompt": "Who are you?", "max_tokens": 60}' + --data '{ "model": "CodeLlama", "prompt": "Who are you?", "max_tokens": 60}' ``` If this works also make sure that `langchain` interface works since that's how `gpte` interacts with LLMs. @@ -51,4 +53,4 @@ python examples/test_open_llm/test_langchain.py If you're not using `CodeLLama` make sure to change the `model` parameter in the test script. -That's it 🤓 time to give `gpte` a try. \ No newline at end of file +That's it 🤓 time to go back [to](/docs/open_models.md) and give `gpte` a try. 
\ No newline at end of file From 906fe0fcd2725c014ef0f3fe483e3c057dd87270 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 18:32:35 +0100 Subject: [PATCH 17/27] Readablity changes --- docs/examples/test_open_llm/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/examples/test_open_llm/README.md b/docs/examples/test_open_llm/README.md index ed19b1ef09..3a573436bf 100644 --- a/docs/examples/test_open_llm/README.md +++ b/docs/examples/test_open_llm/README.md @@ -53,4 +53,4 @@ python examples/test_open_llm/test_langchain.py If you're not using `CodeLLama` make sure to change the `model` parameter in the test script. -That's it 🤓 time to go back [to](/docs/open_models.md) and give `gpte` a try. \ No newline at end of file +That's it 🤓 time to go back [to](/docs/open_models.md#running-the-example) and give `gpte` a try. \ No newline at end of file From 04822f8c7473f1cf7f4a51c6ddce4c2953194553 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 18:44:33 +0100 Subject: [PATCH 18/27] Removing redundant if/else lines --- gpt_engineer/applications/cli/main.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/gpt_engineer/applications/cli/main.py b/gpt_engineer/applications/cli/main.py index 336fda99c1..a3c8178a4f 100644 --- a/gpt_engineer/applications/cli/main.py +++ b/gpt_engineer/applications/cli/main.py @@ -79,12 +79,8 @@ def load_env_if_needed(): openai.api_key = os.getenv("OPENAI_API_KEY", default=None) - local_server_url = os.getenv("OPENAI_API_BASE") - - - if local_server_url: - openai.api_base = local_server_url - openai.api_key = "sk-xxx" + if openai.api_key == "sk-xxx": + openai.api_base = os.getenv("OPENAI_API_BASE") if os.getenv("ANTHROPIC_API_KEY") is None: load_dotenv() From 276d3a63c1796332eaa9ece8bf5935ebbea1c4b2 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 18:45:41 +0100 Subject: [PATCH 19/27] Removing redundant if/else lines --- gpt_engineer/applications/cli/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpt_engineer/applications/cli/main.py b/gpt_engineer/applications/cli/main.py index a3c8178a4f..223410f9dd 100644 --- a/gpt_engineer/applications/cli/main.py +++ b/gpt_engineer/applications/cli/main.py @@ -77,7 +77,7 @@ def load_env_if_needed(): if os.getenv("OPENAI_API_KEY") is None: load_dotenv(dotenv_path=os.path.join(os.getcwd(), ".env")) - openai.api_key = os.getenv("OPENAI_API_KEY", default=None) + openai.api_key = os.getenv("OPENAI_API_KEY") if openai.api_key == "sk-xxx": openai.api_base = os.getenv("OPENAI_API_BASE") From 99e7dd9c21cd94e77a96b834900c4fc749704938 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 18:51:59 +0100 Subject: [PATCH 20/27] Ruff fixes --- docs/examples/test_open_llm/test_langchain.py | 8 +++++--- docs/examples/test_open_llm/test_openai_api.py | 5 ++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/docs/examples/test_open_llm/test_langchain.py b/docs/examples/test_open_llm/test_langchain.py index d906ab5f20..3ffbf9a859 100644 --- a/docs/examples/test_open_llm/test_langchain.py +++ b/docs/examples/test_open_llm/test_langchain.py @@ -1,13 +1,15 @@ -from langchain_openai import ChatOpenAI from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler +from langchain_openai import ChatOpenAI model = ChatOpenAI( model="CodeLlama", temperature=0.1, callbacks=[StreamingStdOutCallbackHandler()], - streaming=True + streaming=True, ) -prompt = "Provide me with only the code for 
a simple python function that sums two numbers." +prompt = ( + "Provide me with only the code for a simple python function that sums two numbers." +) model.invoke(prompt) diff --git a/docs/examples/test_open_llm/test_openai_api.py b/docs/examples/test_open_llm/test_openai_api.py index 75eab5b35a..cc4bc23984 100644 --- a/docs/examples/test_open_llm/test_openai_api.py +++ b/docs/examples/test_open_llm/test_openai_api.py @@ -5,7 +5,10 @@ response = client.chat.completions.create( model="CodeLlama", messages=[ - {"role": "user", "content": "Provide me with the code for a simple HTML web site."}, + { + "role": "user", + "content": "Provide me with the code for a simple HTML web site.", + }, ], temperature=0.7, max_tokens=200, From 278b3609c6ccab5d211514146ddb3522550ce381 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 19:10:06 +0100 Subject: [PATCH 21/27] Fix: trailing whitespace trim --- docs/examples/test_open_llm/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/examples/test_open_llm/README.md b/docs/examples/test_open_llm/README.md index 3a573436bf..953ddd53e3 100644 --- a/docs/examples/test_open_llm/README.md +++ b/docs/examples/test_open_llm/README.md @@ -13,7 +13,7 @@ Or with GPU support (recommended): python -m llama_cpp.server --model TheBloke/CodeLlama-13B-GGUF/codellama-13b.Q8_0.gguf --n_gpu_layers 1 ``` -If you have more `GPU` layers available set `--n_gpu_layers` to the higher number. +If you have more `GPU` layers available set `--n_gpu_layers` to the higher number. To find the amount of available run the above command and look for `llm_load_tensors: offloaded 1/41 layers to GPU` in the output. @@ -53,4 +53,4 @@ python examples/test_open_llm/test_langchain.py If you're not using `CodeLLama` make sure to change the `model` parameter in the test script. -That's it 🤓 time to go back [to](/docs/open_models.md#running-the-example) and give `gpte` a try. \ No newline at end of file +That's it 🤓 time to go back [to](/docs/open_models.md#running-the-example) and give `gpte` a try. From 28834284f60157af5221a92180a010f7930e7547 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 19:10:34 +0100 Subject: [PATCH 22/27] Fixing style changes --- docs/open_models.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/open_models.md b/docs/open_models.md index 73b89a1b17..1faf55eaae 100644 --- a/docs/open_models.md +++ b/docs/open_models.md @@ -16,7 +16,7 @@ More details on why the solution below is recommended in [this blog post](https: Setup ----- -For inference engine we recommend for the users to use [llama.cpp](https://github.com/ggerganov/llama.cpp) with its `python` bindings `llama-cpp-python`. +For inference engine we recommend for the users to use [llama.cpp](https://github.com/ggerganov/llama.cpp) with its `python` bindings `llama-cpp-python`. We choose `llama.cpp` because: @@ -47,11 +47,11 @@ For our use case we also need to set up the web server that `llama-cpp-python` l pip install 'llama-cpp-python[server]' ``` -For detailed use consult the [`llama-cpp-python` docs](https://llama-cpp-python.readthedocs.io/en/latest/server/). +For detailed use consult the [`llama-cpp-python` docs](https://llama-cpp-python.readthedocs.io/en/latest/server/). Before we proceed we need to obtain the model weights in the `gguf` format. That should be a single file on your disk. -In case you have weights in other formats check the `llama-cpp-python` docs for conversion to `gguf` format. 
+In case you have weights in other formats check the `llama-cpp-python` docs for conversion to `gguf` format. Models in other formats `ggml`, `.safetensors`, etc. won't work without prior conversion to `gguf` file format with the solution described below! @@ -65,14 +65,14 @@ Your best choice would be: We are still testing this part, but the larger the model you can run the better. Sure the responses might be slower in terms of (token/s), but code quality will be higher. -For testing that the open LLM `gpte` setup works we recommend starting with a smaller model. You can download weights of [CodeLlama-13B-GGUF by the `TheBloke`](https://huggingface.co/TheBloke/CodeLlama-13B-GGUF) choose the largest model version you can run (for example `Q6_K`), since quantisation will degrade LLM performance. +For testing that the open LLM `gpte` setup works we recommend starting with a smaller model. You can download weights of [CodeLlama-13B-GGUF by the `TheBloke`](https://huggingface.co/TheBloke/CodeLlama-13B-GGUF) choose the largest model version you can run (for example `Q6_K`), since quantisation will degrade LLM performance. Feel free to try out larger models on your hardware and see what happens. Running the Example ================== -To see that your setup works check [test open LLM setup](examples/test_open_llm/README.md). +To see that your setup works check [test open LLM setup](examples/test_open_llm/README.md). If above tests work proceed 😉 @@ -111,7 +111,7 @@ The `--lite` mode is needed for now since open models for some reason behave wor That's it. -*If sth. doesn't work as expected, or you figure out how to improve the open LLM support please let us know.* +*If sth. doesn't work as expected, or you figure out how to improve the open LLM support please let us know.* Using Azure models ================== From 704bf031826eb70b2d169e490cb00195bb86768f Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Fri, 22 Mar 2024 09:52:43 +0100 Subject: [PATCH 23/27] Removing the test_ from example script names so that pytest doesn't consider them as tests. Adding the shell parameter MODEL_NAME to make example clearer --- docs/examples/{test_open_llm => open_llms}/README.md | 10 +++++----- .../langchain_interface.py} | 3 ++- .../openai_api_interface.py} | 8 ++++++-- 3 files changed, 13 insertions(+), 8 deletions(-) rename docs/examples/{test_open_llm => open_llms}/README.md (81%) rename docs/examples/{test_open_llm/test_langchain.py => open_llms/langchain_interface.py} (88%) rename docs/examples/{test_open_llm/test_openai_api.py => open_llms/openai_api_interface.py} (67%) diff --git a/docs/examples/test_open_llm/README.md b/docs/examples/open_llms/README.md similarity index 81% rename from docs/examples/test_open_llm/README.md rename to docs/examples/open_llms/README.md index 953ddd53e3..93e8f3300f 100644 --- a/docs/examples/test_open_llm/README.md +++ b/docs/examples/open_llms/README.md @@ -24,15 +24,16 @@ Set the environment variables: ```bash export OPENAI_API_BASE="http://localhost:8000/v1" export OPENAI_API_KEY="sk-xxx" +export MODEL_NAME="CodeLlama" ```` Then ping the model via `python` using `OpenAI` API: ```bash -python examples/test_open_llm/test_openai_api.py +python examples/open_llms/openai_api_interface.py ``` -If you're not using `CodeLLama` make sure to change the `model` parameter in the test script. +If you're not using `CodeLLama` make sure to change the `MODEL_NAME` parameter. 
Or using `curl`: @@ -48,9 +49,8 @@ If this works also make sure that `langchain` interface works since that's how ` ## Langchain test ```bash -python examples/test_open_llm/test_langchain.py +export MODEL_NAME="CodeLlama" +python examples/open_llms/langchain_interface.py ``` -If you're not using `CodeLLama` make sure to change the `model` parameter in the test script. - That's it 🤓 time to go back [to](/docs/open_models.md#running-the-example) and give `gpte` a try. diff --git a/docs/examples/test_open_llm/test_langchain.py b/docs/examples/open_llms/langchain_interface.py similarity index 88% rename from docs/examples/test_open_llm/test_langchain.py rename to docs/examples/open_llms/langchain_interface.py index 3ffbf9a859..b039700182 100644 --- a/docs/examples/test_open_llm/test_langchain.py +++ b/docs/examples/open_llms/langchain_interface.py @@ -1,8 +1,9 @@ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler from langchain_openai import ChatOpenAI +import os model = ChatOpenAI( - model="CodeLlama", + model=os.getenv("MODEL_NAME"), temperature=0.1, callbacks=[StreamingStdOutCallbackHandler()], streaming=True, diff --git a/docs/examples/test_open_llm/test_openai_api.py b/docs/examples/open_llms/openai_api_interface.py similarity index 67% rename from docs/examples/test_open_llm/test_openai_api.py rename to docs/examples/open_llms/openai_api_interface.py index cc4bc23984..19828df02e 100644 --- a/docs/examples/test_open_llm/test_openai_api.py +++ b/docs/examples/open_llms/openai_api_interface.py @@ -1,9 +1,13 @@ from openai import OpenAI +import os -client = OpenAI(base_url="http://localhost:8000/v1", api_key="sk-xxx") +client = OpenAI( + base_url=os.getenv("OPENAI_API_BASE"), + api_key=os.getenv("OPENAI_API_KEY") +) response = client.chat.completions.create( - model="CodeLlama", + model=os.getenv("MODEL_NAME"), messages=[ { "role": "user", From 87fa9da628f932e075f386901ee2642a12132590 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Fri, 22 Mar 2024 09:52:59 +0100 Subject: [PATCH 24/27] Fixing style changes --- docs/examples/open_llms/langchain_interface.py | 3 ++- docs/examples/open_llms/openai_api_interface.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/examples/open_llms/langchain_interface.py b/docs/examples/open_llms/langchain_interface.py index b039700182..05a7c2c269 100644 --- a/docs/examples/open_llms/langchain_interface.py +++ b/docs/examples/open_llms/langchain_interface.py @@ -1,6 +1,7 @@ +import os + from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler from langchain_openai import ChatOpenAI -import os model = ChatOpenAI( model=os.getenv("MODEL_NAME"), diff --git a/docs/examples/open_llms/openai_api_interface.py b/docs/examples/open_llms/openai_api_interface.py index 19828df02e..c940840e5c 100644 --- a/docs/examples/open_llms/openai_api_interface.py +++ b/docs/examples/open_llms/openai_api_interface.py @@ -1,9 +1,9 @@ -from openai import OpenAI import os +from openai import OpenAI + client = OpenAI( - base_url=os.getenv("OPENAI_API_BASE"), - api_key=os.getenv("OPENAI_API_KEY") + base_url=os.getenv("OPENAI_API_BASE"), api_key=os.getenv("OPENAI_API_KEY") ) response = client.chat.completions.create( From b0ae7318060b415d64825946070a8f2f30e95838 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 28 Mar 2024 21:03:34 +0100 Subject: [PATCH 25/27] Switching from HTML test to python code example --- docs/examples/open_llms/openai_api_interface.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/docs/examples/open_llms/openai_api_interface.py b/docs/examples/open_llms/openai_api_interface.py index c940840e5c..21650b77c1 100644 --- a/docs/examples/open_llms/openai_api_interface.py +++ b/docs/examples/open_llms/openai_api_interface.py @@ -11,7 +11,7 @@ messages=[ { "role": "user", - "content": "Provide me with the code for a simple HTML web site.", + "content": "Provide me with only the code for a simple python function that sums two numbers.", }, ], temperature=0.7, From 8afeb74b8a1a5443bf6e6d06981f396319f6c659 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 28 Mar 2024 21:55:47 +0100 Subject: [PATCH 26/27] Switching to env variable LOCAL_MODEL --- gpt_engineer/applications/cli/main.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/gpt_engineer/applications/cli/main.py b/gpt_engineer/applications/cli/main.py index 223410f9dd..e4f4b6386e 100644 --- a/gpt_engineer/applications/cli/main.py +++ b/gpt_engineer/applications/cli/main.py @@ -79,9 +79,6 @@ def load_env_if_needed(): openai.api_key = os.getenv("OPENAI_API_KEY") - if openai.api_key == "sk-xxx": - openai.api_base = os.getenv("OPENAI_API_BASE") - if os.getenv("ANTHROPIC_API_KEY") is None: load_dotenv() if os.getenv("ANTHROPIC_API_KEY") is None: @@ -476,7 +473,7 @@ def main( if ai.token_usage_log.is_openai_model(): print("Total api cost: $ ", ai.token_usage_log.usage_cost()) - elif openai.api_key == "sk-xxx": + elif os.getenv("LOCAL_MODEL"): print("Total api cost: $ 0.0 since we are using local LLM.") else: print("Total tokens used: ", ai.token_usage_log.total_tokens()) From 4e7b07232dfe0adf29b1ae6a0716105e7ff8fcc5 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 28 Mar 2024 21:56:16 +0100 Subject: [PATCH 27/27] Switching to env variable LOCAL_MODEL and fixing the lower case variable name. --- docs/open_models.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/open_models.md b/docs/open_models.md index 1faf55eaae..6586e9b8a6 100644 --- a/docs/open_models.md +++ b/docs/open_models.md @@ -98,13 +98,14 @@ Then in another terminal window set the following environment variables: ```bash export OPENAI_API_BASE="http://localhost:8000/v1" export OPENAI_API_KEY="sk-xxx" -export model_name="CodeLLama" +export MODEL_NAME="CodeLLama" +export LOCAL_MODEL=true ``` And run `gpt-engineer` with the following command: ```bash -gpte $model_name --lite --temperature 0.1 +gpte $MODEL_NAME --lite --temperature 0.1 ``` The `--lite` mode is needed for now since open models for some reason behave worse with too many instructions at the moment. Temperature is set to `0.1` to get consistent best possible results.
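Taken as a whole, the series leaves the documented workflow in roughly the shape sketched below. This recap is assembled from the commands scattered across the patches rather than copied from any single one; the weights path is a placeholder, and the trailing comment about a project directory reflects the usual `gpte <project_dir> <model>` invocation, which the patches themselves do not spell out.

```bash
# Terminal 1: serve the GGUF weights through the llama-cpp-python server
# (path is a placeholder; batch size and GPU layer count follow the values used in the docs).
export model_path="models/codellama-13b.Q6_K.gguf"
python -m llama_cpp.server --model $model_path --n_batch 256 --n_gpu_layers 30

# Terminal 2: point gpt-engineer at the local server and run it.
export OPENAI_API_BASE="http://localhost:8000/v1"
export OPENAI_API_KEY="sk-xxx"
export MODEL_NAME="CodeLlama"
export LOCAL_MODEL=true
gpte $MODEL_NAME --lite --temperature 0.1  # prepend your project directory if your gpte build expects one
```

With `LOCAL_MODEL` set, the cost summary touched in patches 10 and 26 prints a $0.0 total instead of an API cost estimate.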