diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ba3308e57..a5080658d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -7,16 +7,34 @@ on:
   pull_request:
     branches:
       - main
-      - debugging_ipc

 jobs:
-  build:
+  Docker:
     runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - name: Build the Docker image
+      run: docker compose build
+    - name: Run the Docker image
+      run: docker compose up gpt-pilot -d
+    - name: Wait for the Docker image to start
+      run: docker ps
+    - name: Stop the Docker image
+      run: docker compose down
+
+  Test:
+    runs-on: ${{ matrix.os }}
     strategy:
       matrix:
         # 3.10 - 04 Oct 2021
         # 3.11 - 24 Oct 2022
         python-version: ['3.9', '3.10', '3.11', '3.12']
+        os: [ubuntu-latest, macos-latest, windows-latest]
+        exclude:
+          # LINK : fatal error LNK1181: cannot open input file 'libpq.lib'
+          # Maybe related: https://github.com/psycopg/psycopg2/issues/1628
+          - os: windows-latest
+            python-version: '3.12'

     steps:
     - uses: actions/checkout@v4
@@ -42,7 +60,9 @@ jobs:
         #ruff --format=github --target-version=py37 --ignore=F401,E501 .

     - name: Run tests
+      env:
+        PYTHONPATH: .
       run: |
         pip install pytest
         cd pilot
-        PYTHONPATH=. pytest -m "not slow and not uses_tokens and not ux_test"
+        pytest -m "not slow and not uses_tokens and not ux_test"
diff --git a/pilot/const/function_calls.py b/pilot/const/function_calls.py
index 967cfaa95..2619f5032 100644
--- a/pilot/const/function_calls.py
+++ b/pilot/const/function_calls.py
@@ -44,7 +44,7 @@ def command_definition(description_command=f'A single command that needs to be e
                        description_timeout=
                        'Timeout in milliseconds that represent the approximate time this command takes to finish. '
                        'If you need to run a command that doesnt\'t finish by itself (eg. a command to run an app), '
-                       'set the timeout to -1 and provide a process_name. '
+                       'set the timeout to a value long enough to determine that it has started successfully, and provide a process_name. '
                        'If you need to create a directory that doesn\'t exist and is not the root project directory, '
                        'always create it by running a command `mkdir`'):
     return {
@@ -59,6 +59,10 @@ def command_definition(description_command=f'A single command that needs to be e
                 'type': 'number',
                 'description': description_timeout,
             },
+            'success_message': {
+                'type': 'string',
+                'description': 'A message to look for in the output of the command to determine if successful or not.',
+            },
             'process_name': {
                 'type': 'string',
                 'description': 'If the process needs to continue running after the command is executed provide '
@@ -136,7 +140,7 @@ def command_definition(description_command=f'A single command that needs to be e
             'description': 'List of smaller development steps that need to be done to complete the entire task.',
             'items': {
                 'type': 'object',
-                'description': 'A smaller development step that needs to be done to complete the entire task. Remember, if you need to run a command that doesnt\'t finish by itself (eg. a command to run an app), put the timeout to 3000 milliseconds. If you need to create a directory that doesn\'t exist and is not the root project directory, always create it by running a command `mkdir`',
+                'description': 'A smaller development step that needs to be done to complete the entire task. Remember, if you need to run a command that doesn\'t finish by itself (eg. a command to run an app), put the timeout to 3000 milliseconds. If you need to create a directory that doesn\'t exist and is not the root project directory, always create it by running a command `mkdir`',
                 'properties': {
                     'type': {
                         'type': 'string',
@@ -179,14 +183,18 @@ def command_definition(description_command=f'A single command that needs to be e
             'description': 'List of smaller development steps that need to be done to complete the entire task.',
             'items': {
                 'type': 'object',
-                'description': 'A smaller development step that needs to be done to complete the entire task. Remember, if you need to run a command that doesnt\'t finish by itself (eg. a command to run an If you need to create a directory that doesn\'t exist and is not the root project directory, always create it by running a command `mkdir`',
+                'description': 'A smaller development step that needs to be done to complete the entire task. Remember, if you need to run a command that doesn\'t finish by itself (eg. a command to run an app). If you need to create a directory that doesn\'t exist and is not the root project directory, always create it by running a command `mkdir`',
                 'properties': {
                     'type': {
                         'type': 'string',
-                        'enum': ['command', 'code_change', 'human_intervention'],
+                        'enum': ['command', 'kill_process', 'code_change', 'human_intervention'],
                         'description': 'Type of the development step that needs to be done to complete the entire task.',
                     },
                     'command': command_definition(),
+                    'kill_process': {
+                        'type': 'string',
+                        'description': 'To kill a process that was left running by a previous `command` step, provide the `process_name` in this field and set `type` to "kill_process".',
+                    },
                     'code_change': {
                         'type': 'object',
                         'description': 'A code change that needs to be implemented. This should be used only if the task is of a type "code_change".',
@@ -456,7 +464,7 @@ def command_definition(description_command=f'A single command that needs to be e
             },
             'path': {
                 'type': 'string',
-                'description': 'Path of the file that needs to be saved on the disk.',
+                'description': 'Full path of the file with the file name that needs to be saved.',
             },
             'content': {
                 'type': 'string',
diff --git a/pilot/helpers/AgentConvo.py b/pilot/helpers/AgentConvo.py
index 16397851e..e450de84b 100644
--- a/pilot/helpers/AgentConvo.py
+++ b/pilot/helpers/AgentConvo.py
@@ -82,6 +82,8 @@ def send_message(self, prompt_path=None, prompt_data=None, function_calls: Funct
         development_step = save_development_step(self.agent.project, prompt_path, prompt_data, self.messages, response)

         # TODO handle errors from OpenAI
+        # It's complicated because calling functions are expecting different types of responses - string or tuple
+        # https://github.com/Pythagora-io/gpt-pilot/issues/165 & #91
         if response == {}:
             logger.error(f'Aborting with "OpenAI API error happened"')
             raise Exception("OpenAI API error happened.")
diff --git a/pilot/helpers/agents/Developer.py b/pilot/helpers/agents/Developer.py
index 7c88fa9ba..55270ba3c 100644
--- a/pilot/helpers/agents/Developer.py
+++ b/pilot/helpers/agents/Developer.py
@@ -1,3 +1,4 @@
+import platform
 import uuid
 from utils.style import green, red, green_bold, yellow_bold, red_bold, blue_bold, white_bold
 from helpers.exceptions.TokenLimitError import TokenLimitError
@@ -11,7 +12,7 @@ from helpers.Agent import Agent
 from helpers.AgentConvo import AgentConvo
 from utils.utils import should_execute_step, array_of_objects_to_string, generate_app_data
-from helpers.cli import run_command_until_success, execute_command_and_check_cli_response
+from helpers.cli import run_command_until_success, execute_command_and_check_cli_response, running_processes
 from const.function_calls import FILTER_OS_TECHNOLOGIES, EXECUTE_COMMANDS, GET_TEST_TYPE, IMPLEMENT_TASK
 from database.database import save_progress, get_progress_steps, update_app_status
 from utils.utils import get_os_info
@@ -97,10 +98,12 @@ def step_command_run(self, convo, step, i):
         additional_message = 'Let\'s start with the step #0:\n\n' if i == 0 else f'So far, steps { ", ".join(f"#{j}" for j in range(i)) } are finished so let\'s do step #{i + 1} now.\n\n'

         process_name = data['process_name'] if 'process_name' in data else None
+        success_message = data['success_message'] if 'success_message' in data else None

         return run_command_until_success(convo, data['command'],
                                          timeout=data['timeout'],
                                          process_name=process_name,
+                                         success_message=success_message,
                                          additional_message=additional_message)

     def step_human_intervention(self, convo, step: dict):
@@ -171,7 +174,9 @@ def task_postprocessing(self, convo, development_task, continue_development, tas

         if development_task is not None:
             convo.remove_last_x_messages(2)
-            detailed_user_review_goal = convo.send_message('development/define_user_review_goal.prompt', {})
+            detailed_user_review_goal = convo.send_message('development/define_user_review_goal.prompt', {
+                'os': platform.system()
+            })
             convo.remove_last_x_messages(2)

         try:
@@ -340,7 +345,9 @@ def continue_development(self, iteration_convo, last_branch_name, continue_descr

                 # self.debugger.debug(iteration_convo, user_input=user_feedback)

-                task_steps = iteration_convo.send_message('development/parse_task.prompt', {}, IMPLEMENT_TASK)
+                task_steps = iteration_convo.send_message('development/parse_task.prompt', {
+                    'running_processes': running_processes
+                }, IMPLEMENT_TASK)
                 iteration_convo.remove_last_x_messages(2)

                 return self.execute_task(iteration_convo, task_steps, is_root_task=True)
diff --git a/pilot/helpers/cli.py b/pilot/helpers/cli.py
index c2f72f0cc..121df32ad 100644
--- a/pilot/helpers/cli.py
+++ b/pilot/helpers/cli.py
@@ -17,8 +17,8 @@

 interrupted = False

-running_processes: Dict[str, int] = {}
-"""Holds a list of process IDs, mapped to the `process_name` provided in the call to `execute_command()`."""
+running_processes: Dict[str, tuple[str, int]] = {}
+"""Holds a list of (command, process ID)s, mapped to the `process_name` provided in the call to `execute_command()`."""


 def enqueue_output(out, q):
@@ -74,12 +74,12 @@ def run_command(command, root_path, q_stdout, q_stderr) -> subprocess.Popen:

 def terminate_named_process(process_name: str) -> None:
     if process_name in running_processes:
-        terminate_process(running_processes[process_name], process_name)
+        terminate_process(running_processes[process_name][1], process_name)


 def terminate_running_processes():
     for process_name in list(running_processes.keys()):
-        terminate_process(running_processes[process_name], process_name)
+        terminate_process(running_processes[process_name][1], process_name)


 def terminate_process(pid: int, name=None) -> None:
@@ -100,11 +100,12 @@ def terminate_process(pid: int, name=None) -> None:
             logger.error(f'Error while terminating process: {e}')

     for process_name in list(running_processes.keys()):
-        if running_processes[process_name] == pid:
+        if running_processes[process_name][1] == pid:
             del running_processes[process_name]


-def execute_command(project, command, timeout=None, process_name: str = None, force=False):
+def execute_command(project, command, timeout=None, success_message=None, process_name: str = None, force=False) \
+        -> (str, str, int):
     """
     Execute a command and capture its output.

@@ -112,6 +113,7 @@ def execute_command(project, command, timeout=None, process_name: str = None, fo
         project: The project associated with the command.
         command (str): The command to run.
         timeout (int, optional): The maximum execution time in milliseconds. Default is None.
+        success_message: A message to look for in the output of the command to determine if successful or not.
         process_name (str, optional): A name for the process. If `timeout` is not provided,
             can be used to terminate the process.
         force (bool, optional): Whether to execute the command without confirmation. Default is False.
@@ -119,8 +121,8 @@ def execute_command(project, command, timeout=None, process_name: str = None, fo
     Returns:
         cli_response (str): The command output
                             or: '', 'DONE' if user answered 'no' or 'skip'
-        llm_response (str): The response from the agent.
-            TODO: this seems to be 'DONE' (no or skip) or None
+        llm_response (str): 'DONE' if 'no', 'skip' or `success_message` matched.
+            Otherwise `None` - caller should send `cli_response` to LLM
         exit_code (int): The exit code of the process.
     """
     if timeout is not None:
@@ -166,6 +168,7 @@ def execute_command(project, command, timeout=None, process_name: str = None, fo
             return command_run.cli_response, None, None

     return_value = None
+    was_success = None

     q_stderr = queue.Queue()
     q = queue.Queue()
@@ -174,7 +177,7 @@ def execute_command(project, command, timeout=None, process_name: str = None, fo

     if process_name is not None:
         terminate_named_process(process_name)
-        running_processes[process_name] = process.pid
+        running_processes[process_name] = (command, process.pid)

     output = ''
     stderr_output = ''
@@ -189,9 +192,9 @@ def execute_command(project, command, timeout=None, process_name: str = None, fo
     try:
         while True:
             elapsed_time = time.time() - start_time
-            if timeout is not None:
-                # TODO: print to IPC using a different message type so VS Code can ignore it or update the previous value
-                print(white_bold(f'\rt: {round(elapsed_time * 1000)}ms : '), end='', flush=True)
+            # if timeout is not None:
+            #     # TODO: print to IPC using a different message type so VS Code can ignore it or update the previous value
+            #     print(white_bold(f'\rt: {round(elapsed_time * 1000)}ms : '), end='', flush=True)

             # Check if process has finished
             if process.poll() is not None:
@@ -207,6 +210,10 @@ def execute_command(project, command, timeout=None, process_name: str = None, fo

             # If timeout is reached, kill the process
             if timeout is not None and elapsed_time * 1000 > timeout:
+                if process_name is not None:
+                    logger.info(f'Process "{process_name}" running after timeout as pid: {process.pid}')
+                    break
+
                 raise TimeoutError("Command exceeded the specified timeout.")
                 # os.killpg(process.pid, signal.SIGKILL)
                 # break
@@ -220,6 +227,10 @@ def execute_command(project, command, timeout=None, process_name: str = None, fo
                 output += line
                 print(green('CLI OUTPUT:') + line, end='')
                 logger.info('CLI OUTPUT: ' + line)
+                if success_message is not None and success_message in line:
+                    logger.info('Success message found: %s', success_message)
+                    was_success = True
+                    break

             # Read stderr
             try:
@@ -231,10 +242,6 @@ def execute_command(project, command, timeout=None, process_name: str = None, fo
                 stderr_output += stderr_line
                 print(red('CLI ERROR:') + stderr_line, end='')  # Print with different color for distinction
                 logger.error('CLI ERROR: ' + stderr_line)
-
-            if process_name is not None:
-                logger.info(f'Process {process_name} running as pid: {process.pid}')
-                break

     except (KeyboardInterrupt, TimeoutError) as e:
         interrupted = True
@@ -245,11 +252,11 @@ def execute_command(project, command, timeout=None, process_name: str = None, fo
             print('\nTimeout detected. Stopping command execution...')
             logger.warn('Timeout detected. Stopping command execution...')

+        was_success = False
         terminate_process(process.pid)

     elapsed_time = time.time() - start_time
-    print(f'{command} took {round(elapsed_time * 1000)}ms to execute.')
-    logger.info(f'{command} took {round(elapsed_time * 1000)}ms to execute.')
+    logger.info(f'`{command}` took {round(elapsed_time * 1000)}ms to execute.')

     # stderr_output = ''
     # while not q_stderr.empty():
@@ -263,7 +270,7 @@ def execute_command(project, command, timeout=None, process_name: str = None, fo

     save_command_run(project, command, return_value)

-    return return_value, None, process.returncode
+    return return_value, 'DONE' if was_success else None, process.returncode


 def build_directory_tree(path, prefix="", ignore=None, is_last=False, files=None, add_descriptions=False):
@@ -332,6 +339,7 @@ def execute_command_and_check_cli_response(command, timeout, convo):
 def run_command_until_success(convo, command,
                               timeout: Union[int, None],
                               process_name: Union[str, None] = None,
+                              success_message=None,
                               additional_message=None,
                               force=False,
                               return_cli_response=False,
@@ -345,6 +353,7 @@ def run_command_until_success(convo, command,
         timeout (int): The maximum execution time in milliseconds.
         process_name: A name for the process. If `timeout` is not provided,
             can be used to terminate the process.
+        success_message: A message to look for in the output of the command to determine if successful or not.
         additional_message (str, optional): Additional message to include in the response.
         force (bool, optional): Whether to execute the command without confirmation. Default is False.
         return_cli_response (bool, optional): If True, may raise TooDeepRecursionError(cli_response)
@@ -353,11 +362,12 @@ def run_command_until_success(convo, command,
     cli_response, response, exit_code = execute_command(convo.agent.project,
                                                         command,
                                                         timeout=timeout,
+                                                        success_message=success_message,
                                                         process_name=process_name,
                                                         force=force)

     if response is None:
-        logger.info(f'{command} exit code: {exit_code}')
+        logger.info(f'`{command}` exit code: {exit_code}')
         if exit_code is None:
             response = 'DONE'
         else:
diff --git a/pilot/helpers/test_Debugger.py b/pilot/helpers/test_Debugger.py
new file mode 100644
index 000000000..eb7aeb9e2
--- /dev/null
+++ b/pilot/helpers/test_Debugger.py
@@ -0,0 +1,84 @@
+import builtins
+import pytest
+from unittest.mock import patch
+from dotenv import load_dotenv
+
+load_dotenv()
+from pilot.utils.custom_print import get_custom_print
+from pilot.helpers.agents.Developer import Developer
+from pilot.helpers.AgentConvo import AgentConvo
+from pilot.helpers.Debugger import Debugger
+from pilot.helpers.test_Project import create_project
+from pilot.test.mock_questionary import MockQuestionary
+
+
+################## NOTE: this test needs to be run in debug with breakpoints ##################
+
+@pytest.mark.uses_tokens
+@patch('pilot.helpers.AgentConvo.get_saved_development_step')
+@patch('pilot.helpers.AgentConvo.save_development_step')
+@patch('utils.questionary.get_saved_user_input')
+@patch('utils.questionary.save_user_input')
+@patch('helpers.cli.get_saved_command_run')
+@patch('helpers.cli.run_command')
+@patch('helpers.cli.save_command_run')
+# @patch('pilot.helpers.cli.execute_command', return_value=('', 'DONE', 0))
+def test_debug(
+        # mock_execute_command,
+        mock_save_command, mock_run_command, mock_get_saved_command,
+        mock_save_input, mock_user_input, mock_save_step, mock_get_saved_step):
+    # Given
+    builtins.print, ipc_client_instance = get_custom_print({})
+    project = create_project()
+    project.current_step = 'coding'
+    developer = Developer(project)
+    project.developer = developer
+    convo = AgentConvo(developer)
+    convo.load_branch = lambda x: None
+
+    debugger = Debugger(developer)
+    # TODO: mock agent.project.developer.execute_task
+
+    # convo.messages.append()
+    convo.construct_and_add_message_from_prompt('dev_ops/ran_command.prompt', {
+        'cli_response': '''
+stderr:
+```
+node:internal/modules/cjs/loader:1080
+  throw err;
+  ^
+
+Error: Cannot find module 'mime'
+Require stack:
+- /workspace/chat_app/node_modules/send/index.js
+- /workspace/chat_app/node_modules/express/lib/utils.js
+- /workspace/chat_app/node_modules/express/lib/application.js
+- /workspace/chat_app/node_modules/express/lib/express.js
+- /workspace/chat_app/node_modules/express/index.js
+- /workspace/chat_app/server.js
+    at Module._resolveFilename (node:internal/modules/cjs/loader:1077:15)
+    at Module._load (node:internal/modules/cjs/loader:922:27)
+    at Module.require (node:internal/modules/cjs/loader:1143:19)
+    at require (node:internal/modules/cjs/helpers:121:18)
+    at Object.<anonymous> (/workspace/chat_app/node_modules/send/index.js:24:12)
+    at Module._compile (node:internal/modules/cjs/loader:1256:14)
+    at Module._extensions..js (node:internal/modules/cjs/loader:1310:10)
+    at Module.load (node:internal/modules/cjs/loader:1119:32)
+    at Module._load (node:internal/modules/cjs/loader:960:12)
+```
+stdout:
+```
+> chat_app@1.0.0 start
+> node server.js
+```
+'''
+    })
+
+    mock_questionary = MockQuestionary(['', ''])
+
+    with patch('utils.questionary.questionary', mock_questionary):
+        # When
+        result = debugger.debug(convo, command={'command': 'npm run start'}, is_root_task=True)
+
+        # Then
+        assert result == {'success': True}
diff --git a/pilot/helpers/test_cli.py b/pilot/helpers/test_cli.py
new file mode 100644
index 000000000..a39211988
--- /dev/null
+++ b/pilot/helpers/test_cli.py
@@ -0,0 +1,6 @@
+from pilot.helpers.cli import terminate_process
+
+
+def test_terminate_process_not_running():
+    terminate_process('999999999', 'not running')
+    assert True
diff --git a/pilot/prompts/development/define_user_review_goal.prompt b/pilot/prompts/development/define_user_review_goal.prompt
index 4dfc01310..2f9e94060 100644
--- a/pilot/prompts/development/define_user_review_goal.prompt
+++ b/pilot/prompts/development/define_user_review_goal.prompt
@@ -1,4 +1,4 @@
-How can a human user test if this task was completed successfully? If you specify a command that needs to be run or give example, be very specific. You don't want the user to have to think anything through but rather that they just follow your instructions.
+How can a human user test if this task was completed successfully? If you specify a command that needs to be run or give an example, be very specific. You don't want the user to have to think anything through but rather just follow your instructions. Note that the command will run on a {{ os }} machine.

 !IMPORTANT!
 In case the task can be tested by making an API request, do not suggest how can a request be made with Postman but rather write a full cURL command that the user can just run.
diff --git a/pilot/prompts/development/parse_task.prompt b/pilot/prompts/development/parse_task.prompt
index 9232259e5..799263d12 100644
--- a/pilot/prompts/development/parse_task.prompt
+++ b/pilot/prompts/development/parse_task.prompt
@@ -1 +1,8 @@
-Ok, now, take your previous message and convert it to actionable items. An item might be a code change or a command run. When you need to change code, make sure that you put the entire content of the file in the value of `content` key even though you will likely copy and paste the most of the previous message.
\ No newline at end of file
+Ok, now, take your previous message and convert it to actionable items. An item might be a code change or a command run. When you need to change code, make sure that you put the entire content of the file in the value of the `content` key even though you will likely copy and paste most of the previous message.
+{%- if running_processes %}
+Note that the following processes are already running:
+
+{% for key, data in running_processes.items() -%}
+- "{{ key }}" (`{{ data[0] }}`)
+{% endfor -%}
+{% endif -%}
diff --git a/pilot/prompts/test_prompts.py b/pilot/prompts/test_prompts.py
index 51c7180d5..87e2d75f6 100644
--- a/pilot/prompts/test_prompts.py
+++ b/pilot/prompts/test_prompts.py
@@ -43,3 +43,27 @@ def test_prompt_ran_command_0_exit():

 If the command was successfully executed, respond with `DONE`. If it wasn't, respond with `NEEDS_DEBUGGING`.
 '''.strip()
+
+
+def test_parse_task_no_processes():
+    # When
+    prompt = get_prompt('development/parse_task.prompt', {
+        'running_processes': {}
+    })
+
+    # Then
+    assert 'the following processes' not in prompt
+
+
+def test_parse_task_with_processes():
+    # When
+    prompt = get_prompt('development/parse_task.prompt', {
+        'running_processes': {
+            'app': ('npm start', 123),
+            'mongo': ('mongod', 456)
+        }
+    })
+
+    # Then
+    assert 'the following processes are already running:' in prompt
+    assert '- "app" (`npm start`)\n- "mongo" (`mongod`)' in prompt
diff --git a/pilot/utils/llm_connection.py b/pilot/utils/llm_connection.py
index 593b98043..7f3c2cf97 100644
--- a/pilot/utils/llm_connection.py
+++ b/pilot/utils/llm_connection.py
@@ -157,6 +157,8 @@ def set_function_error(args, err_str: str):
         del args[0]['function_buffer']

     def wrapper(*args, **kwargs):
+        wait_duration_ms = None
+
         while True:
             try:
                 # spinner_stop(spinner)
@@ -190,6 +192,7 @@ def wrapper(*args, **kwargs):
                     # or `Expecting value` with `pos` before the end of `e.doc`
                     function_error_count = update_error_count(args)
                     logger.warning('Received invalid character in JSON response from LLM. Asking to retry...')
+                    logger.info(f' received: {e.doc}')
                     set_function_error(args, err_str)
                     if function_error_count < 3:
                         continue
@@ -212,9 +215,13 @@ def wrapper(*args, **kwargs):
                     match = re.search(r"Please try again in (\d+)ms.", err_str)
                     if match:
                         # spinner = spinner_start(colored("Rate limited. Waiting...", 'yellow'))
-                        logger.debug('Rate limited. Waiting...')
-                        wait_duration = int(match.group(1)) / 1000
-                        time.sleep(wait_duration)
+                        if wait_duration_ms is None:
+                            wait_duration_ms = int(match.group(1))
+                        elif wait_duration_ms < 6000:
+                            # waiting 6ms isn't usually long enough - exponential back-off until about 6 seconds
+                            wait_duration_ms *= 2
+                        logger.debug(f'Rate limited. Waiting {wait_duration_ms}ms...')
+                        time.sleep(wait_duration_ms / 1000)
                         continue

             print(red(f'There was a problem with request to openai API:'))
@@ -249,7 +256,6 @@ def stream_gpt_completion(data, req_type, project):
     :param project: NEEDED FOR WRAPPER FUNCTION retry_on_exception
     :return: {'text': str} or {'function_calls': {'name': str, arguments: '{...}'}}
     """
-    # TODO add type dynamically - this isn't working when connected to the external process

     try:
         terminal_width = os.get_terminal_size().columns
@@ -328,11 +334,8 @@ def return_result(result_data, lines_printed):
         stream=True
     )

-    # Log the response status code and message
-    logger.debug(f'Response status code: {response.status_code}')
-
     if response.status_code != 200:
-        logger.info(f'problem with request: {response.text}')
+        logger.info(f'problem with request (status {response.status_code}): {response.text}')
         raise Exception(f"API responded with status code: {response.status_code}. Response text: {response.text}")
Response text: {response.text}") # function_calls = {'name': '', 'arguments': ''} diff --git a/pilot/utils/questionary.py b/pilot/utils/questionary.py index b9877b105..dd1d4a2f9 100644 --- a/pilot/utils/questionary.py +++ b/pilot/utils/questionary.py @@ -1,7 +1,10 @@ -from prompt_toolkit.styles import Style +import platform import questionary -from utils.style import yellow_bold import re +import sys +from prompt_toolkit.styles import Style +from utils.style import yellow_bold + from database.database import save_user_input, get_saved_user_input custom_style = Style.from_dict({ @@ -19,7 +22,7 @@ def remove_ansi_codes(s: str) -> str: def styled_select(*args, **kwargs): - kwargs["style"] = custom_style # Set style here + kwargs["style"] = custom_style return questionary.select(*args, **kwargs).unsafe_ask() # .ask() is included here @@ -38,7 +41,8 @@ def styled_text(project, question, ignore_user_input_count=False, style=None): config = { 'style': style if style is not None else custom_style, } - question = remove_ansi_codes(question) # Colorama and questionary are not compatible and styling doesn't work + question = remove_ansi_codes(question) # Colorama and questionary are not compatible and styling doesn't work + flush_input() response = questionary.text(question, **config).unsafe_ask() # .ask() is included here else: response = print(question, type='user_input_request') @@ -55,4 +59,19 @@ def get_user_feedback(): config = { 'style': custom_style, } - return questionary.text("How did GPT Pilot do? Were you able to create any app that works? Please write any feedback you have or just press ENTER to exit: ", **config).unsafe_ask() + return questionary.text('How did GPT Pilot do? Were you able to create any app that works? ' + 'Please write any feedback you have or just press ENTER to exit: ', **config).unsafe_ask() + + +def flush_input(): + """Flush the input buffer, discarding all that's in the buffer.""" + try: + if platform.system() == 'Windows': + import msvcrt + while msvcrt.kbhit(): + msvcrt.getch() + else: + import termios + termios.tcflush(sys.stdin, termios.TCIOFLUSH) + except (ImportError, OSError): + pass diff --git a/pilot/utils/test_llm_connection.py b/pilot/utils/test_llm_connection.py index 0d9e79b87..cdbb276b8 100644 --- a/pilot/utils/test_llm_connection.py +++ b/pilot/utils/test_llm_connection.py @@ -2,7 +2,7 @@ from json import JSONDecodeError import pytest -from unittest.mock import patch, Mock +from unittest.mock import call, patch, Mock from dotenv import load_dotenv from jsonschema import ValidationError from const.function_calls import ARCHITECTURE, DEVELOPMENT_PLAN @@ -364,6 +364,47 @@ class TestLlmConnection: def setup_method(self): builtins.print, ipc_client_instance = get_custom_print({}) + @patch('utils.llm_connection.requests.post') + @patch('utils.llm_connection.time.sleep') + def test_rate_limit_error(self, mock_sleep, mock_post, monkeypatch): + monkeypatch.setenv('OPENAI_API_KEY', 'secret') + + error_text = '''{ + "error": { + "message": "Rate limit reached for 10KTPM-200RPM in organization org-OASFC7k1Ff5IzueeLArhQtnT on tokens per min. Limit: 10000 / min. Please try again in 6ms. 
+                "type": "tokens",
+                "param": null,
+                "code": "rate_limit_exceeded"
+            }
+        }'''
+        content = 'DONE'
+        success_text = '{"id": "gen-123", "choices": [{"index": 0, "delta": {"role": "assistant", "content": "' + content + '"}}]}'
+
+        error_response = Mock()
+        error_response.status_code = 429
+        error_response.text = error_text
+
+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_response.iter_lines.return_value = [success_text.encode('utf-8')]
+
+        mock_post.side_effect = [error_response, error_response, error_response, error_response, error_response,
+                                 error_response, error_response, error_response, error_response, error_response,
+                                 error_response, error_response, mock_response]
+        wrapper = retry_on_exception(stream_gpt_completion)
+        data = {'model': 'gpt-4'}
+
+        # When
+        response = wrapper(data, 'test', project)
+
+        # Then
+        assert response == {'text': 'DONE'}
+        # assert mock_sleep.call_count == 9
+        assert mock_sleep.call_args_list == [call(0.006), call(0.012), call(0.024), call(0.048), call(0.096),
+                                             call(0.192), call(0.384), call(0.768), call(1.536), call(3.072),
+                                             call(6.144), call(6.144)]
+        # mock_sleep.call
+
     @patch('utils.llm_connection.requests.post')
     def test_stream_gpt_completion(self, mock_post, monkeypatch):
         # Given streaming JSON response
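For reviewers, a minimal sketch (not part of the diff) of how a parsed development step could use the `success_message`, `process_name` and `kill_process` fields introduced above; the command, timeout and message values are illustrative assumptions only:

```python
# Hypothetical step data in the shape accepted by IMPLEMENT_TASK / command_definition().
# 'success_message' marks the run as DONE once the text appears in stdout, and
# 'process_name' lets a later step terminate the process left running.
start_step = {
    'type': 'command',
    'command': {
        'command': 'npm run start',               # illustrative command
        'timeout': 30000,                         # ms to wait for the success message
        'success_message': 'Listening on port',   # assumed app output
        'process_name': 'app',
    },
}

# A follow-up step can stop the process started by the step above.
stop_step = {
    'type': 'kill_process',
    'kill_process': 'app',
}
```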