From cb2141a79bc5c7a7d4c83fa2cb83beae69948f62 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Tue, 20 Feb 2024 16:54:18 +0100 Subject: [PATCH 01/27] Removing the legacy docs for open models that were broken --- docs/open_models.md | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/docs/open_models.md b/docs/open_models.md index 07c423b4fe..970cfeba22 100644 --- a/docs/open_models.md +++ b/docs/open_models.md @@ -1,23 +1,17 @@ Using with open/local models ============================ -You can integrate `gpt-engineer` with open-source models by leveraging an OpenAI-compatible API. One such API is provided by the [text-generator-ui _extension_ openai](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/openai/README.md). +You can integrate `gpt-engineer` with open-source models by leveraging an OpenAI-compatible API. Setup ----- -To get started, first set up the API with the Runpod template, as per the [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/openai/README.md). - Running the Example ------------------- -Once the API is set up, you can find the host and the exposed TCP port by checking your Runpod dashboard. - -Then, you can use the port and host to run the following example using WizardCoder-Python-34B hosted on Runpod: +On other inference libraries +------------------- -``` - OPENAI_API_BASE=http://:/v1 python -m gpt_engineer.cli.main benchmark/pomodoro_timer --steps benchmark TheBloke_WizardCoder-Python-34B-V1.0-GPTQ -``` Using Azure models ================== From 9f0d79b13f1b38f5c50337f75c361c1be6c30aa4 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Tue, 20 Feb 2024 16:59:07 +0100 Subject: [PATCH 02/27] Updating the chapter structure --- docs/open_models.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/open_models.md b/docs/open_models.md index 970cfeba22..d7defbadbc 100644 --- a/docs/open_models.md +++ b/docs/open_models.md @@ -3,6 +3,8 @@ Using with open/local models You can integrate `gpt-engineer` with open-source models by leveraging an OpenAI-compatible API. +We provide the minimal and cleanest solution below. It's not the only way to use open/local models but the one we recommend and tested. + Setup ----- @@ -12,6 +14,18 @@ Running the Example On other inference libraries ------------------- +Which open model to use +================== + +Your best choice would be: + +- CodeLlama +- Mixtral 8x7B + +On number of parameters +------------------- + +Use the largest model possible that your hardware allows you to run. Sure the responses might be slower but code quality higher. 
Using Azure models ================== From 7bb1da414eee2500c4053d8e597885c3c925a29b Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Fri, 23 Feb 2024 18:28:26 +0100 Subject: [PATCH 03/27] Adding documentation for running a specific open LLM model --- docs/examples/CodeLlama2.py | 0 docs/examples/test_llm_running.py | 13 +++++++++ docs/examples/test_open_llm/README.md | 33 ++++++++++++++++++++++ docs/open_models.md | 40 +++++++++++++++++++++++---- 4 files changed, 81 insertions(+), 5 deletions(-) create mode 100644 docs/examples/CodeLlama2.py create mode 100644 docs/examples/test_llm_running.py create mode 100644 docs/examples/test_open_llm/README.md diff --git a/docs/examples/CodeLlama2.py b/docs/examples/CodeLlama2.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/examples/test_llm_running.py b/docs/examples/test_llm_running.py new file mode 100644 index 0000000000..4e1c534710 --- /dev/null +++ b/docs/examples/test_llm_running.py @@ -0,0 +1,13 @@ +from openai import OpenAI + +client = OpenAI(base_url="http://localhost:8000/v1", api_key="sk-xxx") + +response = client.chat.completions.create( + model="llama2", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the meaning of life?"}, + ], +) + +print(response) diff --git a/docs/examples/test_open_llm/README.md b/docs/examples/test_open_llm/README.md new file mode 100644 index 0000000000..cab7ba6529 --- /dev/null +++ b/docs/examples/test_open_llm/README.md @@ -0,0 +1,33 @@ +# Test that the Open LLM is running + +First start the server by using only CPU: + +```bash +export model_path="models/llama-2-7b.Q2_K.gguf" +python -m llama_cpp.server --model $model_path +``` + +Or with GPU support (recommended): + +```bash +python -m llama_cpp.server --model models/llama-2-7b.Q2_K.gguf --n_gpu_layers 1 +``` + +If you have more `GPU` layers available set `--n_gpu_layers` to the higher number. + +## Test API call + +Then ping it via `python` using `OpenAI` API: + +```bash +python examples/test_open_llm/test_open_llm.py +``` + +Or via `curl`: + +```bash +curl --request POST \ + --url http://localhost:8000/v1/chat/completions \ + --header "Content-Type: application/json" \ + --data '{ "model": "llama", "prompt": "Who are you?", "max_tokens": 60}' +``` \ No newline at end of file diff --git a/docs/open_models.md b/docs/open_models.md index d7defbadbc..a6a732769c 100644 --- a/docs/open_models.md +++ b/docs/open_models.md @@ -8,25 +8,55 @@ We provide the minimal and cleanest solution below. It's not the only way to use Setup ----- -Running the Example -------------------- +For inference engine we recommend to the users to use [llama.cpp](https://github.com/ggerganov/llama.cpp) with its `python` bindings `llama-cpp-python`. We choose `llama.cpp` because it supports the largest amount of hardware acceleration backends. -On other inference libraries -------------------- +To install `llama-cpp-python` follow the official [installation docs](https://llama-cpp-python.readthedocs.io/en/latest/) and for [MacOS with Metal support](https://llama-cpp-python.readthedocs.io/en/latest/install/macos/). 
+ +If you want to have benefit from proper hardware acceleration on your machine make sure to set up the proper compile flags: + +- `linux`: `CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"` +- `macos` with Metal support: `CMAKE_ARGS="-DLLAMA_METAL=on"` +- `windows`: `$env:CMAKE_ARGS = "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"` + +Before running: + +```bash +pip install llama-cpp-python +``` + +For the use of `API` we also need to set up the web server: + +```bash +pip install 'llama-cpp-python[server]' +``` + +For detailed use consult the [`llama-cpp-python` docs](https://llama-cpp-python.readthedocs.io/en/latest/server/). + +Before we proceed we need to obtain the model weights in the `gguf` format. In case you have weights in other formats check the `llama-cpp-python` docs for conversion to `gguf` format. Which open model to use ================== Your best choice would be: -- CodeLlama +- [CodeLlama](examples/CodeLlama2.py) - Mixtral 8x7B +But to first test the setup go and download weights [CodeLlama-7B-GGUF by the `TheBloke`](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF). Once that works feel free to try out larger models on your hardware and see what happens. + On number of parameters ------------------- Use the largest model possible that your hardware allows you to run. Sure the responses might be slower but code quality higher. +Running the Example +================== + +To see that your setup works see [test open LLM](examples/test_open_llm/README.md). + +On other inference libraries +------------------- + Using Azure models ================== From e8c34d263cba734819f27c8616080e4db3807504 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 29 Feb 2024 18:28:11 +0100 Subject: [PATCH 04/27] Adding an explanation why to use open LLL's --- docs/open_models.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/open_models.md b/docs/open_models.md index a6a732769c..526a903574 100644 --- a/docs/open_models.md +++ b/docs/open_models.md @@ -1,6 +1,8 @@ Using with open/local models ============================ +At the moment the best option for coding is still the use of `gpt-4` models provided by OpenAI. But open models are catching up and are a good free and privacy-oriented alternative if you possess the proper hardware. + You can integrate `gpt-engineer` with open-source models by leveraging an OpenAI-compatible API. We provide the minimal and cleanest solution below. It's not the only way to use open/local models but the one we recommend and tested. 
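The two patches above describe the compile flags, the `llama-cpp-python` install, and the need for `gguf` weights as separate steps. A minimal sketch of how those steps combine on one machine is shown below; the `huggingface-cli` call and the exact weight filename are illustrative assumptions (pick whichever quantisation the model repository actually lists), not commands taken from the patched docs.

```bash
# Install llama-cpp-python plus its server extra with hardware acceleration enabled
# (macOS/Metal shown; swap in the Linux or Windows CMAKE_ARGS from the docs above).
CMAKE_ARGS="-DLLAMA_METAL=on" pip install 'llama-cpp-python[server]'

# Fetch GGUF weights for the small test model the docs suggest starting with.
# The repository is the one referenced above; the filename is an assumed example.
pip install huggingface_hub
huggingface-cli download TheBloke/CodeLlama-7B-GGUF codellama-7b.Q4_K_M.gguf --local-dir models
```

Once the weights are on disk, the path to that single `.gguf` file is what the later patches pass to `llama_cpp.server` via `--model`.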
From db6f5bbea065f29ee15d915d7109ffa4b6782890 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 29 Feb 2024 18:54:39 +0100 Subject: [PATCH 05/27] Updating the example call of the open model --- docs/examples/test_llm_running.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/examples/test_llm_running.py b/docs/examples/test_llm_running.py index 4e1c534710..74b31acd36 100644 --- a/docs/examples/test_llm_running.py +++ b/docs/examples/test_llm_running.py @@ -5,9 +5,10 @@ response = client.chat.completions.create( model="llama2", messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "What is the meaning of life?"}, + {"role": "user", "content": "Provide me with the code for a simple HTML web site."}, ], + temperature=0.7, + max_tokens=200, ) -print(response) +print(response.choices[0].message.content) From bf7a802cfdd61bdd2bd16346c6bc9f576a8399c4 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 29 Feb 2024 19:04:15 +0100 Subject: [PATCH 06/27] Adding the option to load local model URL --- gpt_engineer/applications/cli/main.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/gpt_engineer/applications/cli/main.py b/gpt_engineer/applications/cli/main.py index 81970600db..f0740fd0f3 100644 --- a/gpt_engineer/applications/cli/main.py +++ b/gpt_engineer/applications/cli/main.py @@ -58,7 +58,15 @@ def load_env_if_needed(): if os.getenv("OPENAI_API_KEY") is None: # if there is no .env file, try to load from the current working directory load_dotenv(dotenv_path=os.path.join(os.getcwd(), ".env")) - openai.api_key = os.getenv("OPENAI_API_KEY") + + openai.api_key = os.getenv("OPENAI_API_KEY", default=None) + + local_server_url = os.getenv("OPENAI_API_BASE") + + + if local_server_url: + openai.api_base = local_server_url + openai.api_key = "sk-xxx" def load_prompt(input_repo: DiskMemory, improve_mode): From 5d481f151d42792a94617b91018b23dd7ea91199 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 29 Feb 2024 19:08:24 +0100 Subject: [PATCH 07/27] Explaining how to run the open LLM model with gpte --- docs/open_models.md | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/docs/open_models.md b/docs/open_models.md index 526a903574..48a715a7ca 100644 --- a/docs/open_models.md +++ b/docs/open_models.md @@ -54,7 +54,21 @@ Use the largest model possible that your hardware allows you to run. Sure the re Running the Example ================== -To see that your setup works see [test open LLM](examples/test_open_llm/README.md). +To see that your setup works see [test open LLM](examples/test_open_llm/README.md). 
+ +If the tests work, run the LLM in separate terminal: + +```bash +python -m llama_cpp.server --model $model_path +``` + +Then run `gpt-engineer` with the following environment variables: + +```bash +export OPENAI_API_BASE="http://localhost:8000/v1" +export OPENAI_API_KEY="sk-xxx" +gpte +``` On other inference libraries ------------------- From d12e8b4015ce972030703f171a2d77e3aba1bf0c Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 29 Feb 2024 19:08:34 +0100 Subject: [PATCH 08/27] Formating --- gpt_engineer/applications/cli/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpt_engineer/applications/cli/main.py b/gpt_engineer/applications/cli/main.py index f0740fd0f3..fbaaa70382 100644 --- a/gpt_engineer/applications/cli/main.py +++ b/gpt_engineer/applications/cli/main.py @@ -62,7 +62,7 @@ def load_env_if_needed(): openai.api_key = os.getenv("OPENAI_API_KEY", default=None) local_server_url = os.getenv("OPENAI_API_BASE") - + if local_server_url: openai.api_base = local_server_url From c49eb28e4d4e9e970f1aff37147a2903b26a6e11 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Sun, 17 Mar 2024 12:44:56 +0100 Subject: [PATCH 09/27] Adding the necceseary scripts for testing that openLLM works --- docs/examples/CodeLlama2.py | 0 docs/examples/test_open_llm/test_langchain.py | 16 ++++++++++++++++ .../test_openai_api.py} | 0 docs/open_models.md | 4 +++- 4 files changed, 19 insertions(+), 1 deletion(-) delete mode 100644 docs/examples/CodeLlama2.py create mode 100644 docs/examples/test_open_llm/test_langchain.py rename docs/examples/{test_llm_running.py => test_open_llm/test_openai_api.py} (100%) diff --git a/docs/examples/CodeLlama2.py b/docs/examples/CodeLlama2.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/examples/test_open_llm/test_langchain.py b/docs/examples/test_open_llm/test_langchain.py new file mode 100644 index 0000000000..19481070ce --- /dev/null +++ b/docs/examples/test_open_llm/test_langchain.py @@ -0,0 +1,16 @@ +from langchain_openai import ChatOpenAI +from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler + +temperature = 0.1 +model_name = "CodeLlama" + +model = ChatOpenAI( + model=model_name, + temperature=temperature, + callbacks=[StreamingStdOutCallbackHandler()], + streaming=True +) + +prompt = "Provide me with only the code for a simple python function that sums two numbers." + +model.invoke(prompt) diff --git a/docs/examples/test_llm_running.py b/docs/examples/test_open_llm/test_openai_api.py similarity index 100% rename from docs/examples/test_llm_running.py rename to docs/examples/test_open_llm/test_openai_api.py diff --git a/docs/open_models.md b/docs/open_models.md index 48a715a7ca..a47927e1f0 100644 --- a/docs/open_models.md +++ b/docs/open_models.md @@ -67,7 +67,9 @@ Then run `gpt-engineer` with the following environment variables: ```bash export OPENAI_API_BASE="http://localhost:8000/v1" export OPENAI_API_KEY="sk-xxx" -gpte +export model_name="llama2" + +gpte $model_name ``` On other inference libraries From f717498dc6f275a18997add0068f40daa268b200 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Sun, 17 Mar 2024 12:46:56 +0100 Subject: [PATCH 10/27] In the api cost estimation step we don't pay for running our local LLM. 
--- gpt_engineer/applications/cli/main.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gpt_engineer/applications/cli/main.py b/gpt_engineer/applications/cli/main.py index fbaaa70382..921d6d995d 100644 --- a/gpt_engineer/applications/cli/main.py +++ b/gpt_engineer/applications/cli/main.py @@ -267,7 +267,10 @@ def main( store.upload(files_dict) - print("Total api cost: $ ", ai.token_usage_log.usage_cost()) + if openai.api_key == "sk-xxx": + print("Total api cost: $ 0.0 since we are using local LLM.") + else: + print("Total api cost: $ ", ai.token_usage_log.usage_cost()) if __name__ == "__main__": From 29ec1b90bd4aab650c9f81ed6ad2ec2fb5da5238 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 17:46:09 +0100 Subject: [PATCH 11/27] Updating the commands for running the gpte with open models --- docs/open_models.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/open_models.md b/docs/open_models.md index a47927e1f0..c03c7a4f2c 100644 --- a/docs/open_models.md +++ b/docs/open_models.md @@ -59,7 +59,7 @@ To see that your setup works see [test open LLM](examples/test_open_llm/README.m If the tests work, run the LLM in separate terminal: ```bash -python -m llama_cpp.server --model $model_path +python -m llama_cpp.server --model $model_path --n_batch 256 --n_gpu_layers 30 ``` Then run `gpt-engineer` with the following environment variables: @@ -69,7 +69,7 @@ export OPENAI_API_BASE="http://localhost:8000/v1" export OPENAI_API_KEY="sk-xxx" export model_name="llama2" -gpte $model_name +gpte $model_name --lite --temperature 0.1 ``` On other inference libraries From 130da6fe845a38b12e275b734d909fc9e1c8994f Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 18:13:15 +0100 Subject: [PATCH 12/27] Simplifying the test library examples --- docs/examples/test_open_llm/test_langchain.py | 7 ++----- docs/examples/test_open_llm/test_openai_api.py | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/docs/examples/test_open_llm/test_langchain.py b/docs/examples/test_open_llm/test_langchain.py index 19481070ce..d906ab5f20 100644 --- a/docs/examples/test_open_llm/test_langchain.py +++ b/docs/examples/test_open_llm/test_langchain.py @@ -1,12 +1,9 @@ from langchain_openai import ChatOpenAI from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler -temperature = 0.1 -model_name = "CodeLlama" - model = ChatOpenAI( - model=model_name, - temperature=temperature, + model="CodeLlama", + temperature=0.1, callbacks=[StreamingStdOutCallbackHandler()], streaming=True ) diff --git a/docs/examples/test_open_llm/test_openai_api.py b/docs/examples/test_open_llm/test_openai_api.py index 74b31acd36..75eab5b35a 100644 --- a/docs/examples/test_open_llm/test_openai_api.py +++ b/docs/examples/test_open_llm/test_openai_api.py @@ -3,7 +3,7 @@ client = OpenAI(base_url="http://localhost:8000/v1", api_key="sk-xxx") response = client.chat.completions.create( - model="llama2", + model="CodeLlama", messages=[ {"role": "user", "content": "Provide me with the code for a simple HTML web site."}, ], From 5b8c2fd22e65ce9fb5ebef7664749cbda3acd678 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 18:14:11 +0100 Subject: [PATCH 13/27] Fixing unclear parts in the test docs --- docs/examples/test_open_llm/README.md | 35 +++++++++++++++++++++------ 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/docs/examples/test_open_llm/README.md b/docs/examples/test_open_llm/README.md index cab7ba6529..2d9abc47b9 100644 
--- a/docs/examples/test_open_llm/README.md +++ b/docs/examples/test_open_llm/README.md @@ -3,31 +3,52 @@ First start the server by using only CPU: ```bash -export model_path="models/llama-2-7b.Q2_K.gguf" +export model_path="TheBloke/CodeLlama-13B-GGUF/codellama-13b.Q8_0.gguf" python -m llama_cpp.server --model $model_path ``` Or with GPU support (recommended): ```bash -python -m llama_cpp.server --model models/llama-2-7b.Q2_K.gguf --n_gpu_layers 1 +python -m llama_cpp.server --model TheBloke/CodeLlama-13B-GGUF/codellama-13b.Q8_0.gguf --n_gpu_layers 1 ``` -If you have more `GPU` layers available set `--n_gpu_layers` to the higher number. +If you have more `GPU` layers available set `--n_gpu_layers` to the higher number. To find the amount of avalibale run the above command and look for `llm_load_tensors: offloaded 1/41 layers to GPU` in the output. ## Test API call -Then ping it via `python` using `OpenAI` API: +Set the environment variables: ```bash -python examples/test_open_llm/test_open_llm.py +export OPENAI_API_BASE="http://localhost:8000/v1" +export OPENAI_API_KEY="sk-xxx" +```` + +Then ping the model via `python` using `OpenAI` API: + +```bash +python examples/test_open_llm/test_openai_api.py ``` -Or via `curl`: +If you're not using `CodeLLama` make sure to change the `model` parameter in the test script. + +Or using `curl`: ```bash curl --request POST \ --url http://localhost:8000/v1/chat/completions \ --header "Content-Type: application/json" \ --data '{ "model": "llama", "prompt": "Who are you?", "max_tokens": 60}' -``` \ No newline at end of file +``` + +If this works also make sure that `langchain` interface works since that's how `gpte` interacts with LLMs. + +## Langchain test + +```bash +python examples/test_open_llm/test_langchain.py +``` + +If you're not using `CodeLLama` make sure to change the `model` parameter in the test script. + +That's it 🤓 time to give `gpte` a try. \ No newline at end of file From 31c0dc06ccf20b1677b49d52aa7ba9a2fcb7496b Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 18:22:13 +0100 Subject: [PATCH 14/27] Cleaning up the explanations for open model use --- docs/open_models.md | 64 +++++++++++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 17 deletions(-) diff --git a/docs/open_models.md b/docs/open_models.md index c03c7a4f2c..29ff205d0a 100644 --- a/docs/open_models.md +++ b/docs/open_models.md @@ -1,32 +1,43 @@ Using with open/local models ============================ +**Use `gpte` first with OpenAI models to get a feel for the `gpte` tool. Then go play with experimental Open LLMs 🐉 support and try not to get 🔥!!** + At the moment the best option for coding is still the use of `gpt-4` models provided by OpenAI. But open models are catching up and are a good free and privacy-oriented alternative if you possess the proper hardware. You can integrate `gpt-engineer` with open-source models by leveraging an OpenAI-compatible API. -We provide the minimal and cleanest solution below. It's not the only way to use open/local models but the one we recommend and tested. +We provide the minimal and cleanest solution below. What is described is not the only way to use open/local models but the one we tested and would recommend to most users. + +More details on why the solution below is recommended in [this blog post](https://zigabrencic.com/blog/2024-02-21). 
Setup ----- -For inference engine we recommend to the users to use [llama.cpp](https://github.com/ggerganov/llama.cpp) with its `python` bindings `llama-cpp-python`. We choose `llama.cpp` because it supports the largest amount of hardware acceleration backends. +For inference engine we recommend for the users to use [llama.cpp](https://github.com/ggerganov/llama.cpp) with its `python` bindings `llama-cpp-python`. + +We choose `llama.cpp` because: + +- 1.) It supports the largest amount of hardware acceleration backends. +- 2.) Diverse set of open LLM. +- 3.) Is written in `python` and directly on top of `llama.cpp` inference engine. +- 4.) Supports the `openAI` API and `langchain` interface. To install `llama-cpp-python` follow the official [installation docs](https://llama-cpp-python.readthedocs.io/en/latest/) and for [MacOS with Metal support](https://llama-cpp-python.readthedocs.io/en/latest/install/macos/). -If you want to have benefit from proper hardware acceleration on your machine make sure to set up the proper compile flags: +If you want to benefit from proper hardware acceleration on your machine make sure to set up the proper compiler flags before installing your package. - `linux`: `CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"` - `macos` with Metal support: `CMAKE_ARGS="-DLLAMA_METAL=on"` - `windows`: `$env:CMAKE_ARGS = "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"` -Before running: +Then run: ```bash pip install llama-cpp-python ``` -For the use of `API` we also need to set up the web server: +For the use of `API` we also need to set up the web server that `llama-cpp-python` library provides. To install: ```bash pip install 'llama-cpp-python[server]' @@ -34,46 +45,65 @@ pip install 'llama-cpp-python[server]' For detailed use consult the [`llama-cpp-python` docs](https://llama-cpp-python.readthedocs.io/en/latest/server/). -Before we proceed we need to obtain the model weights in the `gguf` format. In case you have weights in other formats check the `llama-cpp-python` docs for conversion to `gguf` format. +Before we proceed we need to obtain the model weights in the `gguf` format. That should be a single file on your disk. + +In case you have weights in other formats check the `llama-cpp-python` docs for conversion to `gguf` format. Model in other formats `ggml`, `.safetensors`, etc. won't work without prior conversion to `gguf` file format! Which open model to use ================== Your best choice would be: -- [CodeLlama](examples/CodeLlama2.py) +- CodeLlama 70B - Mixtral 8x7B -But to first test the setup go and download weights [CodeLlama-7B-GGUF by the `TheBloke`](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF). Once that works feel free to try out larger models on your hardware and see what happens. +We are still testing this part, but the larger the model you can run the better. Sure the responses might be slower in terms of (token/s), but code quality will be higher. -On number of parameters -------------------- +For testing that the open LLM `gpte` setup works we recommend starting with a smaller model. You can download weights of [CodeLlama-7B-GGUF by the `TheBloke`](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF). -Use the largest model possible that your hardware allows you to run. Sure the responses might be slower but code quality higher. +Feel free to try out larger models on your hardware and see what happens. Running the Example ================== -To see that your setup works see [test open LLM](examples/test_open_llm/README.md). 
+To see that your setup works check [test open LLM](examples/test_open_llm/README.md). In case below isn't clear enough do the same 😉 + +If above tests work proceed. + +For checking that `gpte` works with the `CodeLLama` we recommend for you to create a project with `prompt` file content: + +``` +Write a python script that sums up two numbers. Provide only the `sum_two_numbers` function and nothing else. + +Provide two tests: + +assert(sum_two_numbers(100, 10) == 110) +assert(sum_two_numbers(10.1, 10) == 20.1) +``` -If the tests work, run the LLM in separate terminal: +Now run the LLM in separate terminal: ```bash python -m llama_cpp.server --model $model_path --n_batch 256 --n_gpu_layers 30 ``` -Then run `gpt-engineer` with the following environment variables: +Then in another terminal window set following environment variables: ```bash export OPENAI_API_BASE="http://localhost:8000/v1" export OPENAI_API_KEY="sk-xxx" -export model_name="llama2" +export model_name="CodeLLama" + +And run `gpt-engineer` with the following command: + +```bash gpte $model_name --lite --temperature 0.1 ``` -On other inference libraries -------------------- +The `--lite` mode is needed for now since open models for some reason behave worse with too many instructions at the moment. Temperature is set to `0.1` to get consistent best possible results. + +*That's it. If sth. doesn't work as expected or you figure out how to improve the open LLM support please let us know.* Using Azure models ================== From c2bb6bbd17445e6be1c3a2a2529efb214b1c2a01 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 18:30:07 +0100 Subject: [PATCH 15/27] Readablity changes --- docs/open_models.md | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/docs/open_models.md b/docs/open_models.md index 29ff205d0a..73b89a1b17 100644 --- a/docs/open_models.md +++ b/docs/open_models.md @@ -1,13 +1,15 @@ Using with open/local models ============================ -**Use `gpte` first with OpenAI models to get a feel for the `gpte` tool. Then go play with experimental Open LLMs 🐉 support and try not to get 🔥!!** +**Use `gpte` first with OpenAI models to get a feel for the `gpte` tool.** + +**Then go play with experimental Open LLMs 🐉 support and try not to get 🔥!!** At the moment the best option for coding is still the use of `gpt-4` models provided by OpenAI. But open models are catching up and are a good free and privacy-oriented alternative if you possess the proper hardware. You can integrate `gpt-engineer` with open-source models by leveraging an OpenAI-compatible API. -We provide the minimal and cleanest solution below. What is described is not the only way to use open/local models but the one we tested and would recommend to most users. +We provide the minimal and cleanest solution below. What is described is not the only way to use open/local models, but the one we tested and would recommend to most users. More details on why the solution below is recommended in [this blog post](https://zigabrencic.com/blog/2024-02-21). @@ -19,11 +21,11 @@ For inference engine we recommend for the users to use [llama.cpp](https://githu We choose `llama.cpp` because: - 1.) It supports the largest amount of hardware acceleration backends. -- 2.) Diverse set of open LLM. +- 2.) It supports the diverse set of open LLMs. - 3.) Is written in `python` and directly on top of `llama.cpp` inference engine. - 4.) Supports the `openAI` API and `langchain` interface. 
-To install `llama-cpp-python` follow the official [installation docs](https://llama-cpp-python.readthedocs.io/en/latest/) and for [MacOS with Metal support](https://llama-cpp-python.readthedocs.io/en/latest/install/macos/). +To install `llama-cpp-python` follow the official [installation docs](https://llama-cpp-python.readthedocs.io/en/latest/) and [those docs](https://llama-cpp-python.readthedocs.io/en/latest/install/macos/) for MacOS with Metal support. If you want to benefit from proper hardware acceleration on your machine make sure to set up the proper compiler flags before installing your package. @@ -31,13 +33,15 @@ If you want to benefit from proper hardware acceleration on your machine make su - `macos` with Metal support: `CMAKE_ARGS="-DLLAMA_METAL=on"` - `windows`: `$env:CMAKE_ARGS = "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"` +This will enable the `pip` installer to compile the `llama.cpp` with the proper hardware acceleration backend. + Then run: ```bash pip install llama-cpp-python ``` -For the use of `API` we also need to set up the web server that `llama-cpp-python` library provides. To install: +For our use case we also need to set up the web server that `llama-cpp-python` library provides. To install: ```bash pip install 'llama-cpp-python[server]' @@ -47,9 +51,11 @@ For detailed use consult the [`llama-cpp-python` docs](https://llama-cpp-python. Before we proceed we need to obtain the model weights in the `gguf` format. That should be a single file on your disk. -In case you have weights in other formats check the `llama-cpp-python` docs for conversion to `gguf` format. Model in other formats `ggml`, `.safetensors`, etc. won't work without prior conversion to `gguf` file format! +In case you have weights in other formats check the `llama-cpp-python` docs for conversion to `gguf` format. + +Models in other formats `ggml`, `.safetensors`, etc. won't work without prior conversion to `gguf` file format with the solution described below! -Which open model to use +Which open model to use? ================== Your best choice would be: @@ -59,16 +65,16 @@ Your best choice would be: We are still testing this part, but the larger the model you can run the better. Sure the responses might be slower in terms of (token/s), but code quality will be higher. -For testing that the open LLM `gpte` setup works we recommend starting with a smaller model. You can download weights of [CodeLlama-7B-GGUF by the `TheBloke`](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF). +For testing that the open LLM `gpte` setup works we recommend starting with a smaller model. You can download weights of [CodeLlama-13B-GGUF by the `TheBloke`](https://huggingface.co/TheBloke/CodeLlama-13B-GGUF) choose the largest model version you can run (for example `Q6_K`), since quantisation will degrade LLM performance. Feel free to try out larger models on your hardware and see what happens. Running the Example ================== -To see that your setup works check [test open LLM](examples/test_open_llm/README.md). In case below isn't clear enough do the same 😉 +To see that your setup works check [test open LLM setup](examples/test_open_llm/README.md). -If above tests work proceed. 
+If above tests work proceed 😉 For checking that `gpte` works with the `CodeLLama` we recommend for you to create a project with `prompt` file content: @@ -87,13 +93,13 @@ Now run the LLM in separate terminal: python -m llama_cpp.server --model $model_path --n_batch 256 --n_gpu_layers 30 ``` -Then in another terminal window set following environment variables: +Then in another terminal window set the following environment variables: ```bash export OPENAI_API_BASE="http://localhost:8000/v1" export OPENAI_API_KEY="sk-xxx" export model_name="CodeLLama" - +``` And run `gpt-engineer` with the following command: @@ -103,7 +109,9 @@ gpte $model_name --lite --temperature 0.1 The `--lite` mode is needed for now since open models for some reason behave worse with too many instructions at the moment. Temperature is set to `0.1` to get consistent best possible results. -*That's it. If sth. doesn't work as expected or you figure out how to improve the open LLM support please let us know.* +That's it. + +*If sth. doesn't work as expected, or you figure out how to improve the open LLM support please let us know.* Using Azure models ================== From 6b192a99ed2e627140009278d869abbd00cf1c2a Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 18:32:03 +0100 Subject: [PATCH 16/27] Readablity changes --- docs/examples/test_open_llm/README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/examples/test_open_llm/README.md b/docs/examples/test_open_llm/README.md index 2d9abc47b9..ed19b1ef09 100644 --- a/docs/examples/test_open_llm/README.md +++ b/docs/examples/test_open_llm/README.md @@ -13,7 +13,9 @@ Or with GPU support (recommended): python -m llama_cpp.server --model TheBloke/CodeLlama-13B-GGUF/codellama-13b.Q8_0.gguf --n_gpu_layers 1 ``` -If you have more `GPU` layers available set `--n_gpu_layers` to the higher number. To find the amount of avalibale run the above command and look for `llm_load_tensors: offloaded 1/41 layers to GPU` in the output. +If you have more `GPU` layers available set `--n_gpu_layers` to the higher number. + +To find the amount of available run the above command and look for `llm_load_tensors: offloaded 1/41 layers to GPU` in the output. ## Test API call @@ -38,7 +40,7 @@ Or using `curl`: curl --request POST \ --url http://localhost:8000/v1/chat/completions \ --header "Content-Type: application/json" \ - --data '{ "model": "llama", "prompt": "Who are you?", "max_tokens": 60}' + --data '{ "model": "CodeLlama", "prompt": "Who are you?", "max_tokens": 60}' ``` If this works also make sure that `langchain` interface works since that's how `gpte` interacts with LLMs. @@ -51,4 +53,4 @@ python examples/test_open_llm/test_langchain.py If you're not using `CodeLLama` make sure to change the `model` parameter in the test script. -That's it 🤓 time to give `gpte` a try. \ No newline at end of file +That's it 🤓 time to go back [to](/docs/open_models.md) and give `gpte` a try. 
\ No newline at end of file From 906fe0fcd2725c014ef0f3fe483e3c057dd87270 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 18:32:35 +0100 Subject: [PATCH 17/27] Readablity changes --- docs/examples/test_open_llm/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/examples/test_open_llm/README.md b/docs/examples/test_open_llm/README.md index ed19b1ef09..3a573436bf 100644 --- a/docs/examples/test_open_llm/README.md +++ b/docs/examples/test_open_llm/README.md @@ -53,4 +53,4 @@ python examples/test_open_llm/test_langchain.py If you're not using `CodeLLama` make sure to change the `model` parameter in the test script. -That's it 🤓 time to go back [to](/docs/open_models.md) and give `gpte` a try. \ No newline at end of file +That's it 🤓 time to go back [to](/docs/open_models.md#running-the-example) and give `gpte` a try. \ No newline at end of file From 04822f8c7473f1cf7f4a51c6ddce4c2953194553 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 18:44:33 +0100 Subject: [PATCH 18/27] Removing redundant if/else lines --- gpt_engineer/applications/cli/main.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/gpt_engineer/applications/cli/main.py b/gpt_engineer/applications/cli/main.py index 336fda99c1..a3c8178a4f 100644 --- a/gpt_engineer/applications/cli/main.py +++ b/gpt_engineer/applications/cli/main.py @@ -79,12 +79,8 @@ def load_env_if_needed(): openai.api_key = os.getenv("OPENAI_API_KEY", default=None) - local_server_url = os.getenv("OPENAI_API_BASE") - - - if local_server_url: - openai.api_base = local_server_url - openai.api_key = "sk-xxx" + if openai.api_key == "sk-xxx": + openai.api_base = os.getenv("OPENAI_API_BASE") if os.getenv("ANTHROPIC_API_KEY") is None: load_dotenv() From 276d3a63c1796332eaa9ece8bf5935ebbea1c4b2 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 18:45:41 +0100 Subject: [PATCH 19/27] Removing redundant if/else lines --- gpt_engineer/applications/cli/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpt_engineer/applications/cli/main.py b/gpt_engineer/applications/cli/main.py index a3c8178a4f..223410f9dd 100644 --- a/gpt_engineer/applications/cli/main.py +++ b/gpt_engineer/applications/cli/main.py @@ -77,7 +77,7 @@ def load_env_if_needed(): if os.getenv("OPENAI_API_KEY") is None: load_dotenv(dotenv_path=os.path.join(os.getcwd(), ".env")) - openai.api_key = os.getenv("OPENAI_API_KEY", default=None) + openai.api_key = os.getenv("OPENAI_API_KEY") if openai.api_key == "sk-xxx": openai.api_base = os.getenv("OPENAI_API_BASE") From 99e7dd9c21cd94e77a96b834900c4fc749704938 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 18:51:59 +0100 Subject: [PATCH 20/27] Ruff fixes --- docs/examples/test_open_llm/test_langchain.py | 8 +++++--- docs/examples/test_open_llm/test_openai_api.py | 5 ++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/docs/examples/test_open_llm/test_langchain.py b/docs/examples/test_open_llm/test_langchain.py index d906ab5f20..3ffbf9a859 100644 --- a/docs/examples/test_open_llm/test_langchain.py +++ b/docs/examples/test_open_llm/test_langchain.py @@ -1,13 +1,15 @@ -from langchain_openai import ChatOpenAI from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler +from langchain_openai import ChatOpenAI model = ChatOpenAI( model="CodeLlama", temperature=0.1, callbacks=[StreamingStdOutCallbackHandler()], - streaming=True + streaming=True, ) -prompt = "Provide me with only the code for 
a simple python function that sums two numbers." +prompt = ( + "Provide me with only the code for a simple python function that sums two numbers." +) model.invoke(prompt) diff --git a/docs/examples/test_open_llm/test_openai_api.py b/docs/examples/test_open_llm/test_openai_api.py index 75eab5b35a..cc4bc23984 100644 --- a/docs/examples/test_open_llm/test_openai_api.py +++ b/docs/examples/test_open_llm/test_openai_api.py @@ -5,7 +5,10 @@ response = client.chat.completions.create( model="CodeLlama", messages=[ - {"role": "user", "content": "Provide me with the code for a simple HTML web site."}, + { + "role": "user", + "content": "Provide me with the code for a simple HTML web site.", + }, ], temperature=0.7, max_tokens=200, From 278b3609c6ccab5d211514146ddb3522550ce381 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 19:10:06 +0100 Subject: [PATCH 21/27] Fix: trailing whitespace trim --- docs/examples/test_open_llm/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/examples/test_open_llm/README.md b/docs/examples/test_open_llm/README.md index 3a573436bf..953ddd53e3 100644 --- a/docs/examples/test_open_llm/README.md +++ b/docs/examples/test_open_llm/README.md @@ -13,7 +13,7 @@ Or with GPU support (recommended): python -m llama_cpp.server --model TheBloke/CodeLlama-13B-GGUF/codellama-13b.Q8_0.gguf --n_gpu_layers 1 ``` -If you have more `GPU` layers available set `--n_gpu_layers` to the higher number. +If you have more `GPU` layers available set `--n_gpu_layers` to the higher number. To find the amount of available run the above command and look for `llm_load_tensors: offloaded 1/41 layers to GPU` in the output. @@ -53,4 +53,4 @@ python examples/test_open_llm/test_langchain.py If you're not using `CodeLLama` make sure to change the `model` parameter in the test script. -That's it 🤓 time to go back [to](/docs/open_models.md#running-the-example) and give `gpte` a try. \ No newline at end of file +That's it 🤓 time to go back [to](/docs/open_models.md#running-the-example) and give `gpte` a try. From 28834284f60157af5221a92180a010f7930e7547 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 21 Mar 2024 19:10:34 +0100 Subject: [PATCH 22/27] Fixing style changes --- docs/open_models.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/open_models.md b/docs/open_models.md index 73b89a1b17..1faf55eaae 100644 --- a/docs/open_models.md +++ b/docs/open_models.md @@ -16,7 +16,7 @@ More details on why the solution below is recommended in [this blog post](https: Setup ----- -For inference engine we recommend for the users to use [llama.cpp](https://github.com/ggerganov/llama.cpp) with its `python` bindings `llama-cpp-python`. +For inference engine we recommend for the users to use [llama.cpp](https://github.com/ggerganov/llama.cpp) with its `python` bindings `llama-cpp-python`. We choose `llama.cpp` because: @@ -47,11 +47,11 @@ For our use case we also need to set up the web server that `llama-cpp-python` l pip install 'llama-cpp-python[server]' ``` -For detailed use consult the [`llama-cpp-python` docs](https://llama-cpp-python.readthedocs.io/en/latest/server/). +For detailed use consult the [`llama-cpp-python` docs](https://llama-cpp-python.readthedocs.io/en/latest/server/). Before we proceed we need to obtain the model weights in the `gguf` format. That should be a single file on your disk. -In case you have weights in other formats check the `llama-cpp-python` docs for conversion to `gguf` format. 
+In case you have weights in other formats check the `llama-cpp-python` docs for conversion to `gguf` format. Models in other formats `ggml`, `.safetensors`, etc. won't work without prior conversion to `gguf` file format with the solution described below! @@ -65,14 +65,14 @@ Your best choice would be: We are still testing this part, but the larger the model you can run the better. Sure the responses might be slower in terms of (token/s), but code quality will be higher. -For testing that the open LLM `gpte` setup works we recommend starting with a smaller model. You can download weights of [CodeLlama-13B-GGUF by the `TheBloke`](https://huggingface.co/TheBloke/CodeLlama-13B-GGUF) choose the largest model version you can run (for example `Q6_K`), since quantisation will degrade LLM performance. +For testing that the open LLM `gpte` setup works we recommend starting with a smaller model. You can download weights of [CodeLlama-13B-GGUF by the `TheBloke`](https://huggingface.co/TheBloke/CodeLlama-13B-GGUF) choose the largest model version you can run (for example `Q6_K`), since quantisation will degrade LLM performance. Feel free to try out larger models on your hardware and see what happens. Running the Example ================== -To see that your setup works check [test open LLM setup](examples/test_open_llm/README.md). +To see that your setup works check [test open LLM setup](examples/test_open_llm/README.md). If above tests work proceed 😉 @@ -111,7 +111,7 @@ The `--lite` mode is needed for now since open models for some reason behave wor That's it. -*If sth. doesn't work as expected, or you figure out how to improve the open LLM support please let us know.* +*If sth. doesn't work as expected, or you figure out how to improve the open LLM support please let us know.* Using Azure models ================== From 704bf031826eb70b2d169e490cb00195bb86768f Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Fri, 22 Mar 2024 09:52:43 +0100 Subject: [PATCH 23/27] Removing the test_ from example script names so that pytest doesn't consider them as tests. Adding the shell parameter MODEL_NAME to make example clearer --- docs/examples/{test_open_llm => open_llms}/README.md | 10 +++++----- .../langchain_interface.py} | 3 ++- .../openai_api_interface.py} | 8 ++++++-- 3 files changed, 13 insertions(+), 8 deletions(-) rename docs/examples/{test_open_llm => open_llms}/README.md (81%) rename docs/examples/{test_open_llm/test_langchain.py => open_llms/langchain_interface.py} (88%) rename docs/examples/{test_open_llm/test_openai_api.py => open_llms/openai_api_interface.py} (67%) diff --git a/docs/examples/test_open_llm/README.md b/docs/examples/open_llms/README.md similarity index 81% rename from docs/examples/test_open_llm/README.md rename to docs/examples/open_llms/README.md index 953ddd53e3..93e8f3300f 100644 --- a/docs/examples/test_open_llm/README.md +++ b/docs/examples/open_llms/README.md @@ -24,15 +24,16 @@ Set the environment variables: ```bash export OPENAI_API_BASE="http://localhost:8000/v1" export OPENAI_API_KEY="sk-xxx" +export MODEL_NAME="CodeLlama" ```` Then ping the model via `python` using `OpenAI` API: ```bash -python examples/test_open_llm/test_openai_api.py +python examples/open_llms/openai_api_interface.py ``` -If you're not using `CodeLLama` make sure to change the `model` parameter in the test script. +If you're not using `CodeLLama` make sure to change the `MODEL_NAME` parameter. 
Or using `curl`: @@ -48,9 +49,8 @@ If this works also make sure that `langchain` interface works since that's how ` ## Langchain test ```bash -python examples/test_open_llm/test_langchain.py +export MODEL_NAME="CodeLlama" +python examples/open_llms/langchain_interface.py ``` -If you're not using `CodeLLama` make sure to change the `model` parameter in the test script. - That's it 🤓 time to go back [to](/docs/open_models.md#running-the-example) and give `gpte` a try. diff --git a/docs/examples/test_open_llm/test_langchain.py b/docs/examples/open_llms/langchain_interface.py similarity index 88% rename from docs/examples/test_open_llm/test_langchain.py rename to docs/examples/open_llms/langchain_interface.py index 3ffbf9a859..b039700182 100644 --- a/docs/examples/test_open_llm/test_langchain.py +++ b/docs/examples/open_llms/langchain_interface.py @@ -1,8 +1,9 @@ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler from langchain_openai import ChatOpenAI +import os model = ChatOpenAI( - model="CodeLlama", + model=os.getenv("MODEL_NAME"), temperature=0.1, callbacks=[StreamingStdOutCallbackHandler()], streaming=True, diff --git a/docs/examples/test_open_llm/test_openai_api.py b/docs/examples/open_llms/openai_api_interface.py similarity index 67% rename from docs/examples/test_open_llm/test_openai_api.py rename to docs/examples/open_llms/openai_api_interface.py index cc4bc23984..19828df02e 100644 --- a/docs/examples/test_open_llm/test_openai_api.py +++ b/docs/examples/open_llms/openai_api_interface.py @@ -1,9 +1,13 @@ from openai import OpenAI +import os -client = OpenAI(base_url="http://localhost:8000/v1", api_key="sk-xxx") +client = OpenAI( + base_url=os.getenv("OPENAI_API_BASE"), + api_key=os.getenv("OPENAI_API_KEY") +) response = client.chat.completions.create( - model="CodeLlama", + model=os.getenv("MODEL_NAME"), messages=[ { "role": "user", From 87fa9da628f932e075f386901ee2642a12132590 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Fri, 22 Mar 2024 09:52:59 +0100 Subject: [PATCH 24/27] Fixing style changes --- docs/examples/open_llms/langchain_interface.py | 3 ++- docs/examples/open_llms/openai_api_interface.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/examples/open_llms/langchain_interface.py b/docs/examples/open_llms/langchain_interface.py index b039700182..05a7c2c269 100644 --- a/docs/examples/open_llms/langchain_interface.py +++ b/docs/examples/open_llms/langchain_interface.py @@ -1,6 +1,7 @@ +import os + from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler from langchain_openai import ChatOpenAI -import os model = ChatOpenAI( model=os.getenv("MODEL_NAME"), diff --git a/docs/examples/open_llms/openai_api_interface.py b/docs/examples/open_llms/openai_api_interface.py index 19828df02e..c940840e5c 100644 --- a/docs/examples/open_llms/openai_api_interface.py +++ b/docs/examples/open_llms/openai_api_interface.py @@ -1,9 +1,9 @@ -from openai import OpenAI import os +from openai import OpenAI + client = OpenAI( - base_url=os.getenv("OPENAI_API_BASE"), - api_key=os.getenv("OPENAI_API_KEY") + base_url=os.getenv("OPENAI_API_BASE"), api_key=os.getenv("OPENAI_API_KEY") ) response = client.chat.completions.create( From b0ae7318060b415d64825946070a8f2f30e95838 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 28 Mar 2024 21:03:34 +0100 Subject: [PATCH 25/27] Switching from HTML test to python code example --- docs/examples/open_llms/openai_api_interface.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/docs/examples/open_llms/openai_api_interface.py b/docs/examples/open_llms/openai_api_interface.py index c940840e5c..21650b77c1 100644 --- a/docs/examples/open_llms/openai_api_interface.py +++ b/docs/examples/open_llms/openai_api_interface.py @@ -11,7 +11,7 @@ messages=[ { "role": "user", - "content": "Provide me with the code for a simple HTML web site.", + "content": "Provide me with only the code for a simple python function that sums two numbers.", }, ], temperature=0.7, From 8afeb74b8a1a5443bf6e6d06981f396319f6c659 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 28 Mar 2024 21:55:47 +0100 Subject: [PATCH 26/27] Switching to env variable LOCAL_MODEL --- gpt_engineer/applications/cli/main.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/gpt_engineer/applications/cli/main.py b/gpt_engineer/applications/cli/main.py index 223410f9dd..e4f4b6386e 100644 --- a/gpt_engineer/applications/cli/main.py +++ b/gpt_engineer/applications/cli/main.py @@ -79,9 +79,6 @@ def load_env_if_needed(): openai.api_key = os.getenv("OPENAI_API_KEY") - if openai.api_key == "sk-xxx": - openai.api_base = os.getenv("OPENAI_API_BASE") - if os.getenv("ANTHROPIC_API_KEY") is None: load_dotenv() if os.getenv("ANTHROPIC_API_KEY") is None: @@ -476,7 +473,7 @@ def main( if ai.token_usage_log.is_openai_model(): print("Total api cost: $ ", ai.token_usage_log.usage_cost()) - elif openai.api_key == "sk-xxx": + elif os.getenv("LOCAL_MODEL"): print("Total api cost: $ 0.0 since we are using local LLM.") else: print("Total tokens used: ", ai.token_usage_log.total_tokens()) From 4e7b07232dfe0adf29b1ae6a0716105e7ff8fcc5 Mon Sep 17 00:00:00 2001 From: Ziga Brencic Date: Thu, 28 Mar 2024 21:56:16 +0100 Subject: [PATCH 27/27] Switching to env variable LOCAL_MODEL and fixing the lower case variable name. --- docs/open_models.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/open_models.md b/docs/open_models.md index 1faf55eaae..6586e9b8a6 100644 --- a/docs/open_models.md +++ b/docs/open_models.md @@ -98,13 +98,14 @@ Then in another terminal window set the following environment variables: ```bash export OPENAI_API_BASE="http://localhost:8000/v1" export OPENAI_API_KEY="sk-xxx" -export model_name="CodeLLama" +export MODEL_NAME="CodeLLama" +export LOCAL_MODEL=true ``` And run `gpt-engineer` with the following command: ```bash -gpte $model_name --lite --temperature 0.1 +gpte $MODEL_NAME --lite --temperature 0.1 ``` The `--lite` mode is needed for now since open models for some reason behave worse with too many instructions at the moment. Temperature is set to `0.1` to get consistent best possible results.
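Taken as a whole, the series leaves the documented workflow in roughly the shape sketched below. This recap is assembled from the commands scattered across the patches rather than copied from any single one; the weights path is a placeholder, and the trailing comment about a project directory reflects the usual `gpte <project_dir> <model>` invocation, which the patches themselves do not spell out.

```bash
# Terminal 1: serve the GGUF weights through the llama-cpp-python server
# (path is a placeholder; batch size and GPU layer count follow the values used in the docs).
export model_path="models/codellama-13b.Q6_K.gguf"
python -m llama_cpp.server --model $model_path --n_batch 256 --n_gpu_layers 30

# Terminal 2: point gpt-engineer at the local server and run it.
export OPENAI_API_BASE="http://localhost:8000/v1"
export OPENAI_API_KEY="sk-xxx"
export MODEL_NAME="CodeLlama"
export LOCAL_MODEL=true
gpte $MODEL_NAME --lite --temperature 0.1  # prepend your project directory if your gpte build expects one
```

With `LOCAL_MODEL` set, the cost summary touched in patches 10 and 26 prints a $0.0 total instead of an API cost estimate.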