From 755abee9aef143a791e1c816e6907a1dd2bdb7fa Mon Sep 17 00:00:00 2001
From: Jacob-Junqi Tian
Date: Sun, 24 Nov 2024 16:01:54 -0500
Subject: [PATCH 01/13] Custom embeddings: added OpenAI-compatible embedding logic.

---
 vec_inf/cli/_cli.py                    |   8 +-
 vec_inf/cli/_utils.py                  |   1 -
 vec_inf/embed.slurm                    |  38 ++++++
 vec_inf/embedding/__init__.py          |   0
 vec_inf/embedding/openai_api_server.py | 169 +++++++++++++++++++++++++
 vec_inf/launch_server.sh               |  11 +-
 6 files changed, 223 insertions(+), 4 deletions(-)
 create mode 100644 vec_inf/embed.slurm
 create mode 100644 vec_inf/embedding/__init__.py
 create mode 100644 vec_inf/embedding/openai_api_server.py

diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py
index bcfba41..9f1bea3 100644
--- a/vec_inf/cli/_cli.py
+++ b/vec_inf/cli/_cli.py
@@ -131,6 +131,10 @@ def launch(
     if model_name in models_df["model_name"].values:
         default_args = utils.load_default_args(models_df, model_name)
 
+        model_type = default_args.pop("model_type")
+        if model_type == "Text Embedding":
+            launch_cmd += " --slurm-script embed.slurm"
+
         for arg in default_args:
             if arg in locals() and locals()[arg] is not None:
                 default_args[arg] = locals()[arg]
@@ -336,7 +340,9 @@ def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None:
 
     with Live(refresh_per_second=1, console=CONSOLE) as live:
         while True:
-            out_logs = utils.read_slurm_log(slurm_job_name, slurm_job_id, "out", log_dir)
+            out_logs = utils.read_slurm_log(
+                slurm_job_name, slurm_job_id, "out", log_dir
+            )
             metrics = utils.get_latest_metric(out_logs)
             table = utils.create_table(key_title="Metric", value_title="Value")
             for key, value in metrics.items():
diff --git a/vec_inf/cli/_utils.py b/vec_inf/cli/_utils.py
index 6eb5c8e..b070d98 100644
--- a/vec_inf/cli/_utils.py
+++ b/vec_inf/cli/_utils.py
@@ -135,7 +135,6 @@ def load_default_args(models_df: pd.DataFrame, model_name: str) -> dict:
     row_data = models_df.loc[models_df["model_name"] == model_name]
     default_args = row_data.iloc[0].to_dict()
     default_args.pop("model_name")
-    default_args.pop("model_type")
     return default_args
 
 
diff --git a/vec_inf/embed.slurm b/vec_inf/embed.slurm
new file mode 100644
index 0000000..5878bae
--- /dev/null
+++ b/vec_inf/embed.slurm
@@ -0,0 +1,38 @@
+#!/bin/bash
+#SBATCH --cpus-per-task=16
+#SBATCH --mem=64G
+
+# Load CUDA; change to the CUDA version in your environment if it differs
+source /opt/lmod/lmod/init/profile
+module load cuda-12.3
+nvidia-smi
+
+source ${SRC_DIR}/find_port.sh
+
+# Write server url to file
+hostname=${SLURMD_NODENAME}
+vllm_port_number=$(find_available_port $hostname 8080 65535)
+
+echo "Server address: http://${hostname}:${vllm_port_number}/v1"
+echo "http://${hostname}:${vllm_port_number}/v1" > ${VLLM_BASE_URL_FILENAME}
+
+# Activate vllm venv
+if [ "$VENV_BASE" = "singularity" ]; then
+    export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_0.3.4.sif
+    export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1
+    module load singularity-ce/3.8.2
+    singularity exec $SINGULARITY_IMAGE ray stop
+    singularity exec --nv --bind ${MODEL_WEIGHTS_PARENT_DIR}:${MODEL_WEIGHTS_PARENT_DIR} $SINGULARITY_IMAGE \
+        python3.10 ${SRC_DIR}/embedding/openai_api_server.py \
+        --model ${VLLM_MODEL_WEIGHTS} \
+        --port ${vllm_port_number} \
+        --trust-remote-code \
+        --max-num-seqs ${VLLM_MAX_NUM_SEQS}
+else
+    source ${VENV_BASE}/bin/activate
+    python3 ${SRC_DIR}/embedding/openai_api_server.py \
+        --model ${VLLM_MODEL_WEIGHTS} \
+        --port ${vllm_port_number} \
+        --trust-remote-code \
+        --max-num-seqs ${VLLM_MAX_NUM_SEQS}
+fi
diff --git a/vec_inf/embedding/__init__.py b/vec_inf/embedding/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/vec_inf/embedding/openai_api_server.py b/vec_inf/embedding/openai_api_server.py
new file mode 100644
index 0000000..87d45a9
--- /dev/null
+++ b/vec_inf/embedding/openai_api_server.py
@@ -0,0 +1,169 @@
+import argparse
+import asyncio
+import base64
+from asyncio import Queue
+from typing import List, Optional, Union
+import sys
+
+import torch
+import uvicorn
+from fastapi import FastAPI, Response
+from pydantic import BaseModel
+from transformers import AutoModel, AutoTokenizer
+
+
+# Define request and response models
+class EmbeddingsRequest(BaseModel):
+    input: Union[str, List[str]]
+    model: str
+    encoding_format: Optional[str] = "float"  # Default to 'float'
+    user: Optional[str] = None
+
+
+class EmbeddingData(BaseModel):
+    object: str
+    embedding: Union[List[float], str]  # Can be a list of floats or a base64 string
+    index: int
+
+
+class EmbeddingsResponse(BaseModel):
+    object: str
+    data: List[EmbeddingData]
+    model: str
+    usage: dict
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--model")
+parser.add_argument("--port", type=int)
+parser.add_argument("--max-num-seqs", type=int)
+parser.add_argument("--trust-remote-code", type=bool)
+args = parser.parse_args()
+
+
+# Initialize the FastAPI app
+app = FastAPI()
+
+# Load the tokenizer and model from HuggingFace
+tokenizer = AutoTokenizer.from_pretrained(args.model)
+model = AutoModel.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
+
+# Initialize the request queue and batch processing parameters
+request_queue = Queue()
+BATCH_TIMEOUT = 0.01  # in seconds
+
+
+@app.post("/v1/embeddings")
+async def create_embeddings(request: EmbeddingsRequest):
+    """
+    Handle incoming embedding requests by adding them to the processing queue.
+    """
+    # Create a Future to hold the result
+    future = asyncio.get_event_loop().create_future()
+    # Put the request into the queue
+    await request_queue.put((request, future))
+    # Wait for the result
+    result = await future
+    return result
+
+
+@app.get("/health")
+def status_check():
+    """
+    Returns 200.
+    """
+    return Response(status_code=200)
+
+
+async def process_queue():
+    """
+    Continuously process requests from the queue in batches.
+    """
+    while True:
+        requests_futures = []
+        try:
+            # Wait for at least one item
+            request_future = await request_queue.get()
+            requests_futures.append(request_future)
+            # Now, try to get more items with a timeout
+            try:
+                while len(requests_futures) < args.max_num_seqs:
+                    request_future = await asyncio.wait_for(
+                        request_queue.get(), timeout=BATCH_TIMEOUT
+                    )
+                    requests_futures.append(request_future)
+            except asyncio.TimeoutError:
+                pass
+        except Exception:
+            continue
+        # Process the batch
+        requests = [rf[0] for rf in requests_futures]
+        futures = [rf[1] for rf in requests_futures]
+        # Collect input texts and track counts
+        batched_input_texts = []
+        input_counts = []
+        encoding_formats = []
+        for request in requests:
+            input_text = request.input
+            if isinstance(input_text, str):
+                input_text = [input_text]
+            input_counts.append(len(input_text))
+            batched_input_texts.extend(input_text)
+            encoding_formats.append(request.encoding_format)
+        # Tokenize and compute embeddings
+        inputs = tokenizer(
+            batched_input_texts, padding=True, truncation=True, return_tensors="pt"
+        )
+        with torch.no_grad():
+            outputs = model(**inputs)
+        embeddings = outputs.last_hidden_state.mean(dim=1).tolist()
+        # Split embeddings back to individual requests
+        idx = 0
+        for request, future, count, encoding_format in zip(
+            requests, futures, input_counts, encoding_formats
+        ):
+            request_embeddings = embeddings[idx : idx + count]
+            idx += count
+            # Prepare response
+            data = []
+            for i, embedding in enumerate(request_embeddings):
+                if encoding_format == "base64":
+                    # Convert list of floats to bytes
+                    embedding_bytes = (
+                        torch.tensor(embedding, dtype=torch.float32).numpy().tobytes()
+                    )
+                    # Encode bytes to base64 string
+                    embedding_base64 = base64.b64encode(embedding_bytes).decode("utf-8")
+                    data.append(
+                        EmbeddingData(
+                            object="embedding", embedding=embedding_base64, index=i
+                        )
+                    )
+                else:
+                    data.append(
+                        EmbeddingData(object="embedding", embedding=embedding, index=i)
+                    )
+            response = EmbeddingsResponse(
+                object="list",
+                data=data,
+                model=request.model,
+                usage={
+                    "prompt_tokens": int(inputs["attention_mask"].sum()),
+                    "total_tokens": int(inputs["attention_mask"].sum()),
+                },
+            )
+            # Set the result
+            future.set_result(response)
+
+
+@app.on_event("startup")
+async def startup_event():
+    """
+    Start the background task to process the request queue.
+    """
+    asyncio.create_task(process_queue())
+
+
+if __name__ == "__main__":
+    print("INFO: Application startup complete.", file=sys.stderr)
+    uvicorn.run("embedding_server:app", host="0.0.0.0", port=args.port)
diff --git a/vec_inf/launch_server.sh b/vec_inf/launch_server.sh
index 4476ca9..36459f6 100755
--- a/vec_inf/launch_server.sh
+++ b/vec_inf/launch_server.sh
@@ -17,6 +17,7 @@ while [[ "$#" -gt 0 ]]; do
         --data-type) data_type="$2"; shift ;;
         --venv) venv="$2"; shift ;;
         --log-dir) log_dir="$2"; shift ;;
+        --slurm-script) slurm_script="$2"; shift ;;
         --model-weights-parent-dir) model_weights_parent_dir="$2"; shift ;;
         --pipeline-parallelism) pipeline_parallelism="$2"; shift ;;
         *) echo "Unknown parameter passed: $1"; exit 1 ;;
@@ -44,8 +45,8 @@ export VLLM_MAX_MODEL_LEN=$max_model_len
 export VLLM_MAX_LOGPROBS=$vocab_size
 export VLLM_DATA_TYPE=$data_type
 export VENV_BASE=$venv
+export SLURM_SCRIPT=$slurm_script
 export LOG_DIR=$log_dir
-export MODEL_WEIGHTS_PARENT_DIR=$model_weights_parent_dir
 
 if [ -n "$max_num_seqs" ]; then
     export VLLM_MAX_NUM_SEQS=$max_num_seqs
@@ -53,6 +54,12 @@ else
     export VLLM_MAX_NUM_SEQS=256
 fi
 
+if [ -n "$slurm_script" ]; then
+    export SLURM_SCRIPT=$slurm_script
+else
+    export SLURM_SCRIPT="vllm.slurm"
+fi
+
 if [ -n "$pipeline_parallelism" ]; then
     export PIPELINE_PARALLELISM=$pipeline_parallelism
 else
@@ -121,4 +128,4 @@ sbatch --job-name $JOB_NAME \
     --time $WALLTIME \
     --output $LOG_DIR/$JOB_NAME.%j.out \
     --error $LOG_DIR/$JOB_NAME.%j.err \
-    $SRC_DIR/${is_special}vllm.slurm
+    $SRC_DIR/${is_special}${SLURM_SCRIPT}

From f80a31e6bc816d18c6ff3415b48790887db90f23 Mon Sep 17 00:00:00 2001
From: Jacob-Junqi Tian
Date: Sun, 24 Nov 2024 16:54:35 -0500
Subject: [PATCH 02/13] Custom embeddings: Included SRC_DIR mount in singularity config.

---
 vec_inf/embed.slurm | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vec_inf/embed.slurm b/vec_inf/embed.slurm
index 5878bae..c9da467 100644
--- a/vec_inf/embed.slurm
+++ b/vec_inf/embed.slurm
@@ -22,7 +22,10 @@ if [ "$VENV_BASE" = "singularity" ]; then
     export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1
     module load singularity-ce/3.8.2
     singularity exec $SINGULARITY_IMAGE ray stop
-    singularity exec --nv --bind ${MODEL_WEIGHTS_PARENT_DIR}:${MODEL_WEIGHTS_PARENT_DIR} $SINGULARITY_IMAGE \
+    singularity exec --nv \
+        --bind ${MODEL_WEIGHTS_PARENT_DIR}:${MODEL_WEIGHTS_PARENT_DIR} \
+        --bind ${SRC_DIR}:${SRC_DIR} \
+        $SINGULARITY_IMAGE \
         python3.10 ${SRC_DIR}/embedding/openai_api_server.py \
         --model ${VLLM_MODEL_WEIGHTS} \
         --port ${vllm_port_number} \
         --trust-remote-code \
         --max-num-seqs ${VLLM_MAX_NUM_SEQS}

From 98d4dd3bef0dfaaad8cc21602e4dcd75b1ac1661 Mon Sep 17 00:00:00 2001
From: Jacob-Junqi Tian
Date: Sun, 24 Nov 2024 16:57:49 -0500
Subject: [PATCH 03/13] Custom embeddings: added store_true to trust_remote_code cli argument.
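
With argparse, type=bool does not parse "true"/"false" the way it appears
to: bool("false") is True, since any non-empty string is truthy. The flag is
therefore being turned into a presence-based switch. A minimal, standalone
sketch of the intended behavior (illustrative only):

    import argparse

    parser = argparse.ArgumentParser()
    # Presence-based flag: defaults to False, set to True when passed.
    parser.add_argument("--trust-remote-code", action="store_true")

    args = parser.parse_args(["--trust-remote-code"])
    assert args.trust_remote_code is True

Note that keeping type=bool alongside action="store_true", as this commit
does, makes argparse raise a TypeError at parser construction; the follow-up
commit drops the type argument.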
---
 vec_inf/embedding/openai_api_server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vec_inf/embedding/openai_api_server.py b/vec_inf/embedding/openai_api_server.py
index 87d45a9..0c9d02b 100644
--- a/vec_inf/embedding/openai_api_server.py
+++ b/vec_inf/embedding/openai_api_server.py
@@ -37,7 +37,7 @@ class EmbeddingsResponse(BaseModel):
 parser.add_argument("--model")
 parser.add_argument("--port", type=int)
 parser.add_argument("--max-num-seqs", type=int)
-parser.add_argument("--trust-remote-code", type=bool)
+parser.add_argument("--trust-remote-code", type=bool, action="store_true")
 args = parser.parse_args()
 
 

From c7a0c23f0aee75cea3ea21acad14b44a693799b7 Mon Sep 17 00:00:00 2001
From: Jacob-Junqi Tian
Date: Sun, 24 Nov 2024 16:58:39 -0500
Subject: [PATCH 04/13] Custom embeddings: removed type=bool from the trust_remote_code cli argument.

---
 vec_inf/embedding/openai_api_server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vec_inf/embedding/openai_api_server.py b/vec_inf/embedding/openai_api_server.py
index 0c9d02b..334ec48 100644
--- a/vec_inf/embedding/openai_api_server.py
+++ b/vec_inf/embedding/openai_api_server.py
@@ -37,7 +37,7 @@ class EmbeddingsResponse(BaseModel):
 parser.add_argument("--model")
 parser.add_argument("--port", type=int)
 parser.add_argument("--max-num-seqs", type=int)
-parser.add_argument("--trust-remote-code", type=bool, action="store_true")
+parser.add_argument("--trust-remote-code", action="store_true")
 args = parser.parse_args()
 
 

From be5118ee99709d786c7cba5056a223a4dc332b04 Mon Sep 17 00:00:00 2001
From: Jacob-Junqi Tian
Date: Sun, 24 Nov 2024 17:00:02 -0500
Subject: [PATCH 05/13] Custom embeddings: prepended MODEL_WEIGHTS_PARENT_DIR to the model path.

---
 vec_inf/embed.slurm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vec_inf/embed.slurm b/vec_inf/embed.slurm
index c9da467..6184a3e 100644
--- a/vec_inf/embed.slurm
+++ b/vec_inf/embed.slurm
@@ -27,7 +27,7 @@ if [ "$VENV_BASE" = "singularity" ]; then
         --bind ${SRC_DIR}:${SRC_DIR} \
         $SINGULARITY_IMAGE \
         python3.10 ${SRC_DIR}/embedding/openai_api_server.py \
-        --model ${VLLM_MODEL_WEIGHTS} \
+        --model ${MODEL_WEIGHTS_PARENT_DIR}/${VLLM_MODEL_WEIGHTS} \
         --port ${vllm_port_number} \
         --trust-remote-code \
         --max-num-seqs ${VLLM_MAX_NUM_SEQS}

From 85dace992d80582372e70bc1367510c6fde4781a Mon Sep 17 00:00:00 2001
From: Jacob-Junqi Tian
Date: Sun, 24 Nov 2024 17:11:36 -0500
Subject: [PATCH 06/13] Custom embeddings: renamed "embedding" to "embeddings". Updated uvicorn config. Updated model weight mount.
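
Passing the FastAPI application object to uvicorn directly replaces the
"embedding_server:app" import string, which pointed at a module name that
does not exist (the file is openai_api_server.py). A minimal sketch of the
object-style invocation, assuming a FastAPI instance named "app" (the port
is illustrative):

    import uvicorn
    from fastapi import FastAPI

    app = FastAPI()

    if __name__ == "__main__":
        # The app object avoids the "module:app" import-string lookup,
        # which fails whenever the string does not match the file name.
        uvicorn.run(app, host="0.0.0.0", port=8080)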
---
 vec_inf/embed.slurm                                    | 10 +++++-----
 vec_inf/{embedding => embeddings}/__init__.py          |  0
 vec_inf/{embedding => embeddings}/openai_api_server.py |  2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)
 rename vec_inf/{embedding => embeddings}/__init__.py (100%)
 rename vec_inf/{embedding => embeddings}/openai_api_server.py (98%)

diff --git a/vec_inf/embed.slurm b/vec_inf/embed.slurm
index 6184a3e..da39876 100644
--- a/vec_inf/embed.slurm
+++ b/vec_inf/embed.slurm
@@ -23,19 +23,19 @@ if [ "$VENV_BASE" = "singularity" ]; then
     module load singularity-ce/3.8.2
     singularity exec $SINGULARITY_IMAGE ray stop
     singularity exec --nv \
-        --bind ${MODEL_WEIGHTS_PARENT_DIR}:${MODEL_WEIGHTS_PARENT_DIR} \
+        --bind ${MODEL_WEIGHTS_PARENT_DIR}:/model-weights \
         --bind ${SRC_DIR}:${SRC_DIR} \
         $SINGULARITY_IMAGE \
-        python3.10 ${SRC_DIR}/embedding/openai_api_server.py \
-        --model ${MODEL_WEIGHTS_PARENT_DIR}/${VLLM_MODEL_WEIGHTS} \
+        python3.10 ${SRC_DIR}/embeddings/openai_api_server.py \
+        --model /model-weights/${VLLM_MODEL_WEIGHTS} \
         --port ${vllm_port_number} \
         --trust-remote-code \
         --max-num-seqs ${VLLM_MAX_NUM_SEQS}
 else
     source ${VENV_BASE}/bin/activate
-    python3 ${SRC_DIR}/embedding/openai_api_server.py \
+    python3 ${SRC_DIR}/embeddings/openai_api_server.py \
         --model ${VLLM_MODEL_WEIGHTS} \
         --port ${vllm_port_number} \
         --trust-remote-code \
         --max-num-seqs ${VLLM_MAX_NUM_SEQS}
-fi
+fi
\ No newline at end of file
diff --git a/vec_inf/embedding/__init__.py b/vec_inf/embeddings/__init__.py
similarity index 100%
rename from vec_inf/embedding/__init__.py
rename to vec_inf/embeddings/__init__.py
diff --git a/vec_inf/embedding/openai_api_server.py b/vec_inf/embeddings/openai_api_server.py
similarity index 98%
rename from vec_inf/embedding/openai_api_server.py
rename to vec_inf/embeddings/openai_api_server.py
index 334ec48..c154c5a 100644
--- a/vec_inf/embedding/openai_api_server.py
+++ b/vec_inf/embeddings/openai_api_server.py
@@ -166,4 +166,4 @@ async def startup_event():
 
 if __name__ == "__main__":
     print("INFO: Application startup complete.", file=sys.stderr)
-    uvicorn.run("embedding_server:app", host="0.0.0.0", port=args.port)
+    uvicorn.run(app, host="0.0.0.0", port=args.port)

From 4484299c8b6ed448fc18864dc36036cabd95991f Mon Sep 17 00:00:00 2001
From: Jacob-Junqi Tian
Date: Sun, 24 Nov 2024 17:13:56 -0500
Subject: [PATCH 07/13] Addressed potentially missing ": " in slurm output.

---
 vec_inf/cli/_cli.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py
index 9f1bea3..2c12527 100644
--- a/vec_inf/cli/_cli.py
+++ b/vec_inf/cli/_cli.py
@@ -159,6 +159,9 @@ def launch(
     output_dict = {"slurm_job_id": slurm_job_id}
 
     for line in output_lines:
+        if ": " not in line:
+            continue
+
         key, value = line.split(": ")
         table.add_row(key, value)
         output_dict[key.lower().replace(" ", "_")] = value

From 3903bed10df6cd26d231ffcf971d322c343d3f80 Mon Sep 17 00:00:00 2001
From: Jacob-Junqi Tian
Date: Sun, 24 Nov 2024 17:42:19 -0500
Subject: [PATCH 08/13] Added all-MiniLM-L6-v2 model. Marked model-weights-parent-dir as optional to load values from models.csv.
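
Since typing.Optional[str] is not a click ParamType (click cannot call it to
convert a value), the conventional way to express "no value given, fall back
to configuration" is a concrete type with a None default. A minimal sketch,
assuming the models.csv fallback happens downstream of the command:

    import click

    @click.command()
    @click.option(
        "--model-weights-parent-dir",
        type=str,  # concrete type; the None default marks the option as optional
        default=None,
        help="Falls back to the models.csv value when omitted.",
    )
    def launch(model_weights_parent_dir):
        if model_weights_parent_dir is None:
            click.echo("using models.csv default")
        else:
            click.echo(model_weights_parent_dir)

    if __name__ == "__main__":
        launch()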
---
 vec_inf/cli/_cli.py       | 6 +++---
 vec_inf/models/models.csv | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py
index 2c12527..c002eea 100644
--- a/vec_inf/cli/_cli.py
+++ b/vec_inf/cli/_cli.py
@@ -82,8 +82,8 @@ def cli():
 )
 @click.option(
     "--model-weights-parent-dir",
-    type=str,
-    default="/model-weights",
+    type=Optional[str],
+    default=None,
     help="Path to parent directory containing model weights, default to '/model-weights' for supported models",
 )
 @click.option(
@@ -161,7 +161,7 @@ def launch(
     for line in output_lines:
         if ": " not in line:
             continue
-
+
         key, value = line.split(": ")
         table.add_row(key, value)
         output_dict[key.lower().replace(" ", "_")] = value
diff --git a/vec_inf/models/models.csv b/vec_inf/models/models.csv
index 421856a..086a23c 100644
--- a/vec_inf/models/models.csv
+++ b/vec_inf/models/models.csv
@@ -65,3 +65,4 @@ Qwen2.5-72B-Instruct,Qwen2.5,72B-Instruct,LLM,4,1,152064,16384,256,true,m2,08:00:00,a40,auto,singularity,default,/model-weights
 Pixtral-12B-2409,Pixtral,12B-2409,VLM,1,1,131072,8192,256,true,m2,08:00:00,a40,auto,singularity,default,/model-weights
 bge-multilingual-gemma2,bge,multilingual-gemma2,Text Embedding,1,1,256002,4096,256,true,m2,08:00:00,a40,auto,singularity,default,/model-weights
 e5-mistral-7b-instruct,e5,mistral-7b-instruct,Text Embedding,1,1,32000,4096,256,true,m2,08:00:00,a40,auto,singularity,default,/model-weights
+all-MiniLM-L6-v2,sentence-transformers,all-MiniLM-L6-v2,Text Embedding,1,1,30522,512,256,true,m2,08:00:00,a40,auto,singularity,default,/fs01/projects/llm/

From c048776c3a4cbb2be44a648a10ba5747cc262f37 Mon Sep 17 00:00:00 2001
From: Jacob-Junqi Tian
Date: Sun, 24 Nov 2024 17:47:56 -0500
Subject: [PATCH 09/13] Added MODEL_WEIGHTS_PARENT_DIR to exported variables in launch_server.sh

---
 vec_inf/launch_server.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vec_inf/launch_server.sh b/vec_inf/launch_server.sh
index 36459f6..c6ebc19 100755
--- a/vec_inf/launch_server.sh
+++ b/vec_inf/launch_server.sh
@@ -47,6 +47,7 @@ export VLLM_DATA_TYPE=$data_type
 export VENV_BASE=$venv
 export SLURM_SCRIPT=$slurm_script
 export LOG_DIR=$log_dir
+export MODEL_WEIGHTS_PARENT_DIR=$model_weights_parent_dirl
 
 if [ -n "$max_num_seqs" ]; then
     export VLLM_MAX_NUM_SEQS=$max_num_seqs

From bae2b49f3ec572a4317eaf8a5ab67bd34e64b74b Mon Sep 17 00:00:00 2001
From: Jacob-Junqi Tian
Date: Sun, 24 Nov 2024 17:48:46 -0500
Subject: [PATCH 10/13] Custom embeddings: updated model path in slurm file.
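
With MODEL_WEIGHTS_PARENT_DIR exported by launch_server.sh in the previous
commit, VLLM_MODEL_WEIGHTS is assumed to already carry the full weights
path, so the slurm script passes it through unchanged instead of prepending
a hard-coded /model-weights prefix.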
---
 vec_inf/embed.slurm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vec_inf/embed.slurm b/vec_inf/embed.slurm
index da39876..6d9b125 100644
--- a/vec_inf/embed.slurm
+++ b/vec_inf/embed.slurm
@@ -27,7 +27,7 @@ if [ "$VENV_BASE" = "singularity" ]; then
         --bind ${SRC_DIR}:${SRC_DIR} \
         $SINGULARITY_IMAGE \
         python3.10 ${SRC_DIR}/embeddings/openai_api_server.py \
-        --model /model-weights/${VLLM_MODEL_WEIGHTS} \
+        --model ${VLLM_MODEL_WEIGHTS} \
         --port ${vllm_port_number} \
         --trust-remote-code \
         --max-num-seqs ${VLLM_MAX_NUM_SEQS}

From 5ffe7ea6e30af85e960270eefc472367f8126128 Mon Sep 17 00:00:00 2001
From: Jacob-Junqi Tian
Date: Sun, 24 Nov 2024 17:51:07 -0500
Subject: [PATCH 11/13] Fixed typo in the MODEL_WEIGHTS_PARENT_DIR export in launch_server.sh

---
 vec_inf/launch_server.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vec_inf/launch_server.sh b/vec_inf/launch_server.sh
index c6ebc19..ec1988d 100755
--- a/vec_inf/launch_server.sh
+++ b/vec_inf/launch_server.sh
@@ -47,7 +47,7 @@ export VLLM_DATA_TYPE=$data_type
 export VENV_BASE=$venv
 export SLURM_SCRIPT=$slurm_script
 export LOG_DIR=$log_dir
-export MODEL_WEIGHTS_PARENT_DIR=$model_weights_parent_dirl
+export MODEL_WEIGHTS_PARENT_DIR=$model_weights_parent_dir
 
 if [ -n "$max_num_seqs" ]; then
     export VLLM_MAX_NUM_SEQS=$max_num_seqs

From f274b317ccac1b36c18d9b0b3da626165b5c10ba Mon Sep 17 00:00:00 2001
From: Jacob-Junqi Tian
Date: Sun, 24 Nov 2024 17:52:00 -0500
Subject: [PATCH 12/13] Custom embeddings: updated model path bind in slurm file.

---
 vec_inf/embed.slurm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vec_inf/embed.slurm b/vec_inf/embed.slurm
index 6d9b125..5addb38 100644
--- a/vec_inf/embed.slurm
+++ b/vec_inf/embed.slurm
@@ -23,7 +23,7 @@ if [ "$VENV_BASE" = "singularity" ]; then
     module load singularity-ce/3.8.2
     singularity exec $SINGULARITY_IMAGE ray stop
     singularity exec --nv \
-        --bind ${MODEL_WEIGHTS_PARENT_DIR}:/model-weights \
+        --bind ${MODEL_WEIGHTS_PARENT_DIR}:${MODEL_WEIGHTS_PARENT_DIR} \
         --bind ${SRC_DIR}:${SRC_DIR} \
         $SINGULARITY_IMAGE \
         python3.10 ${SRC_DIR}/embeddings/openai_api_server.py \

From 81acc767cfb28e1236c5d79c540f2241b7e4f59d Mon Sep 17 00:00:00 2001
From: Jacob-Junqi Tian
Date: Sun, 24 Nov 2024 18:11:31 -0500
Subject: [PATCH 13/13] Custom embeddings: added BAAI/bge-base-en-v1.5 model.

---
 vec_inf/models/models.csv | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vec_inf/models/models.csv b/vec_inf/models/models.csv
index 086a23c..d48efa3 100644
--- a/vec_inf/models/models.csv
+++ b/vec_inf/models/models.csv
@@ -66,3 +66,4 @@ Pixtral-12B-2409,Pixtral,12B-2409,VLM,1,1,131072,8192,256,true,m2,08:00:00,a40,auto,singularity,default,/model-weights
 bge-multilingual-gemma2,bge,multilingual-gemma2,Text Embedding,1,1,256002,4096,256,true,m2,08:00:00,a40,auto,singularity,default,/model-weights
 e5-mistral-7b-instruct,e5,mistral-7b-instruct,Text Embedding,1,1,32000,4096,256,true,m2,08:00:00,a40,auto,singularity,default,/model-weights
 all-MiniLM-L6-v2,sentence-transformers,all-MiniLM-L6-v2,Text Embedding,1,1,30522,512,256,true,m2,08:00:00,a40,auto,singularity,default,/fs01/projects/llm/
+bge-base-en-v1.5,BAAI,base-en-v1.5,Text Embedding,1,1,30522,512,256,true,m2,08:00:00,a40,auto,singularity,default,/fs01/projects/llm/
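
For reference, a minimal client-side check of the embeddings endpoint added
in this series. The base URL below is illustrative; a launched job writes
the real one (http://<node>:<port>/v1) to the file named by
VLLM_BASE_URL_FILENAME.

    import base64

    import numpy as np
    import requests

    BASE_URL = "http://gpu001:8080/v1"  # illustrative; read it from the URL file

    payload = {
        "model": "bge-base-en-v1.5",
        "input": ["first sentence", "second sentence"],
        "encoding_format": "float",  # or "base64", per EmbeddingsRequest
    }
    resp = requests.post(f"{BASE_URL}/embeddings", json=payload, timeout=60)
    resp.raise_for_status()

    for item in resp.json()["data"]:
        emb = item["embedding"]
        if payload["encoding_format"] == "base64":
            # The server packs float32 values and base64-encodes the bytes.
            emb = np.frombuffer(base64.b64decode(emb), dtype=np.float32)
        print(item["index"], len(emb))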