import logging
import os
from argparse import ArgumentParser
from datetime import datetime

import yaml

from webapp import create_app, socketio

os.makedirs("logs", exist_ok=True)
log_file_name = datetime.now().strftime(
    os.path.join("logs", "llmaix_%H_%M_%d_%m_%Y.log")
)
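# The strftime pattern above yields names like logs/llmaix_14_30_05_01_2025.log (hour_minute_day_month_year).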
logging.basicConfig(
    level=logging.DEBUG,
    filename=log_file_name,
    filemode="w",
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logging.debug("Starting LLM-AIx ...")


def create_parser() -> ArgumentParser:
    parser = ArgumentParser(description='Web app for llama-cpp')
    parser.add_argument("--model_path", type=str, default=os.getenv('MODEL_PATH', "models"),
                        help="Path to the directory containing the models that llama.cpp can load.")
    parser.add_argument("--server_path", type=str, default=os.getenv('SERVER_PATH', r""),
                        help="Path to the llama.cpp server executable.")
    parser.add_argument("--port", type=int, default=int(os.getenv('PORT', 5000)),
                        help="Port on which the web app will be available.")
    parser.add_argument("--host", type=str, default=os.getenv('HOST', "localhost"))
    parser.add_argument("--config_file", type=str, default=os.getenv('CONFIG_FILE', "config.yml"))
    parser.add_argument("--n_gpu_layers", type=int, default=int(os.getenv('N_GPU_LAYERS', 80)))
    parser.add_argument("--llamacpp_port", type=int, default=int(os.getenv('LLAMACPP_PORT', 2929)))
    parser.add_argument("--gpu", type=str, default=os.getenv('GPU', "all"), help="Which GPU to use",
                        choices=["all", "0", "1", "2", "3", "4", "5", "6", "7", "mps"])
    parser.add_argument("--debug", action="store_true", default=os.getenv('DEBUG', 'false') == 'true')
    parser.add_argument("--mode", type=str, default=os.getenv('MODE', "choice"),
                        choices=["anonymizer", "informationextraction", "choice"], help="Which mode to run.")
    parser.add_argument("--disable_parallel", action="store_true",
                        default=os.getenv('DISABLE_PARALLEL', 'false') == 'true',
                        help="Disable parallel llama.cpp processing.")
    parser.add_argument("--no_parallel_preprocessing", action="store_true",
                        default=os.getenv('NO_PARALLEL_PREPROCESSING', 'false') == 'true',
                        help="Disable parallel preprocessing.")
    parser.add_argument("--context_size", type=int, default=int(os.getenv('CONTEXT_SIZE', -1)),
                        help="Set a custom context size for llama.cpp.")
    parser.add_argument("--verbose_llama", action="store_true",
                        default=os.getenv('VERBOSE_LLAMA', 'false') == 'true',
                        help="Verbose llama.cpp output.")
    parser.add_argument("--no_password", action="store_true",
                        default=os.getenv('NO_PASSWORD', 'false') == 'true',
                        help="Disable password protection.")
    return parser
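
# Example invocation (illustrative values):
#   python app.py --model_path ./models --server_path ./llama-server --host 0.0.0.0 --port 5000
# Every flag also has an environment-variable equivalent (see the os.getenv defaults above), e.g.:
#   MODEL_PATH=./models SERVER_PATH=./llama-server PORT=5000 python app.py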

def load_yaml_file(file_path):
    with open(file_path, 'r') as file:
        try:
            data = yaml.safe_load(file)
            return data
        except yaml.YAMLError as exc:
            print(f"Error loading YAML file: {exc}")
            return None
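
# Usage sketch: load_yaml_file("config.yml") returns the parsed document (here a dict with a
# "models" list) on success, or None if the YAML cannot be parsed.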

def is_path(string):
    # Check for directory separators
    if '/' in string or '\\' in string:
        return True
    # Check if it's an absolute path
    if os.path.isabs(string):
        return True
    # Check if the directory part of the path exists
    if os.path.dirname(string) and os.path.exists(os.path.dirname(string)):
        return True
    return False
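
# Examples (illustrative): is_path("configs/config.yml") -> True (contains a separator);
# is_path("/etc/llmaix.yml") -> True (absolute); is_path("config.yml") -> False, so the file
# is looked up inside the model directory by check_model_config below.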

def check_model_config(model_dir, model_config_file):
    if not os.path.exists(model_dir):
        raise ValueError(f"Model directory {model_dir} does not exist")
    if is_path(model_config_file):
        if not os.path.isfile(model_config_file):
            raise ValueError(f"Model config file {model_config_file} does not exist")
    else:
        if not os.path.isfile(os.path.join(model_dir, model_config_file)):
            raise ValueError(
                f"Model config file {model_config_file} does not exist in model directory {model_dir}. "
                f"Did you mount the model directory to the container?")
        model_config_file = os.path.join(model_dir, model_config_file)
    model_config = load_yaml_file(model_config_file)
    if model_config is None:
        raise ValueError(f"Error loading model config file {model_config_file}")
    print("Model config loaded: ", model_config)
    # Check the model config YAML. Example entry:
    # - name: "llama3.1_8b_instruct_q8"
    #   display_name: "LLaMA 3.1 8B Instruct Q8_0"
    #   file_name: "Meta-Llama-3.1-8B-Instruct-Q8_0.gguf"
    #   model_context_size: 128000
    #   kv_cache_size: 128000
    #   kv_cache_quants: "q8_0"  # e.g. "q8_0" or "q4_0" - requires flash attention
    #   flash_attention: true  # does not work for some models
    #   mlock: true
    #   server_slots: 8
    #   seed: 42
    #   n_gpu_layers: 33
    for model_dict in model_config['models']:
        if 'kv_cache_size' not in model_dict:
            raise ValueError(f"Model config for {model_dict['name']} is missing 'kv_cache_size'")
        if 'server_slots' not in model_dict:
            raise ValueError(f"Model config for {model_dict['name']} is missing 'server_slots'")
        if 'file_name' not in model_dict:
            raise ValueError(f"Model config for {model_dict['name']} is missing 'file_name'")
        if 'n_gpu_layers' not in model_dict:
            raise ValueError(f"Model config for {model_dict['name']} is missing 'n_gpu_layers'")
        if 'flash_attention' not in model_dict:
            print("Flash attention not found in model config, defaulting to False")
            # model_dict['flash_attention'] = False
        if 'mlock' not in model_dict:
            print("Mlock not found in model config, defaulting to False")
            # model_dict['mlock'] = False
        if 'seed' not in model_dict:
            print("Seed not found in model config, defaulting to 42")
            # model_dict['seed'] = 42
        if 'kv_cache_quants' not in model_dict:
            print("KV cache quants not found in model config, defaulting to None")
            # model_dict['kv_cache_quants'] = None
        elif model_dict['kv_cache_quants'] == "":
            print("KV cache quants is empty, defaulting to f16")
            # model_dict['kv_cache_quants'] = None
        elif 'flash_attention' not in model_dict:
            raise ValueError(
                f"Model config for {model_dict['name']} is missing 'flash_attention' when kv_cache_quants is set")
        elif not model_dict['flash_attention']:
            raise ValueError(
                f"Model config for {model_dict['name']} has kv_cache_quants set but flash_attention is False")
        elif model_dict['kv_cache_quants'] not in ["q4_0", "q8_0", "f16", "f32", "q5_0", "q5_1", "q4_1", "iq4_nl"]:
            raise ValueError(
                f"Model config for {model_dict['name']} has invalid kv_cache_quants value. "
                f"Please use one of: q4_0, q8_0, f16, f32, q5_0, q5_1, q4_1, iq4_nl")
        if 'model_context_size' not in model_dict:
            raise ValueError(f"Model config for {model_dict['name']} is missing 'model_context_size'")
        if 'display_name' not in model_dict:
            print("Display name not found in model config, defaulting to file name")
            # model_dict['display_name'] = model_dict['file_name']
        model_file = os.path.join(model_dir, model_dict['file_name'])
        if not os.path.isfile(model_file):
            raise ValueError(f"Model file {model_file} not found")

if __name__ == "__main__":
    parser = create_parser()
    args = parser.parse_args()
    app = create_app(auth_required=args.host != "localhost" and not args.no_password)
    app.config["MODEL_PATH"] = args.model_path
    app.config["SERVER_PATH"] = args.server_path
    app.config["SERVER_PORT"] = args.port
    app.config["CONFIG_FILE"] = args.config_file
    app.config["GPU"] = args.gpu
    app.config["LLAMACPP_PORT"] = args.llamacpp_port
    app.config["DEBUG"] = args.debug
    app.config["NO_PARALLEL"] = args.disable_parallel
    app.config["VERBOSE_LLAMA"] = args.verbose_llama
    app.config["PARALLEL_PREPROCESSING"] = not args.no_parallel_preprocessing
    app.config["MODE"] = args.mode
    if args.gpu != "all":
        print("Using GPU " + args.gpu)
    # If the model path is relative, make it absolute
    if not os.path.isabs(app.config["MODEL_PATH"]):
        app.config["MODEL_PATH"] = os.path.abspath(app.config["MODEL_PATH"])
    # If the server path is relative, make it absolute
    if not os.path.isabs(app.config["SERVER_PATH"]):
        app.config["SERVER_PATH"] = os.path.abspath(app.config["SERVER_PATH"])
    check_model_config(app.config["MODEL_PATH"], app.config["CONFIG_FILE"])
    if not os.path.isfile(app.config["SERVER_PATH"]):
        print(f"WARNING: Llama.cpp server executable not found in '{app.config['SERVER_PATH']}'. "
              f"Did you specify --server_path correctly?")
    print(f"Start web server on http://{args.host}:{args.port}")
    if args.host == "0.0.0.0":
        print(f"Please use http://localhost:{args.port} to access the web app locally, or the IP / hostname "
              f"of your server to access it in your local network.")
    if args.host != "localhost":
        print("Requires authentication")
    socketio.run(app, debug=args.debug, use_reloader=args.debug, port=args.port, host=args.host,
                 allow_unsafe_werkzeug=True)