From 2eaf13626bce8a44c27ee8bc36a7cca53027f27e Mon Sep 17 00:00:00 2001 From: Dina Suehiro Jones Date: Mon, 16 Dec 2024 10:02:08 -0800 Subject: [PATCH 01/10] Add image query support to the backend microservices (#12) * Backend enhancements for image query capabilities for MultimodalQnA * Fix model name var Signed-off-by: dmsuehir * Remove space at end of prompt Signed-off-by: dmsuehir * Add env var for the max number of images sent to the LVM Signed-off-by: dmsuehir * README update for the MAX_IMAGES env var Signed-off-by: dmsuehir * Remove prints Signed-off-by: dmsuehir * Audio query functionality to multimodal backend (#8) Signed-off-by: okhleif-IL * added in audio dict creation Signed-off-by: okhleif-IL * separated audio from prompt Signed-off-by: okhleif-IL * added ASR endpoint Signed-off-by: okhleif-IL * removed ASR endpoints from mm embedding Signed-off-by: okhleif-IL * edited return logic, fixed function call Signed-off-by: okhleif-IL * added megaservice to elif Signed-off-by: okhleif-IL * reworked helper func Signed-off-by: okhleif-IL * Append audio to prompt Signed-off-by: okhleif-IL * Reworked handle messages, added metadata Signed-off-by: okhleif-IL * Moved dictionary logic to right place Signed-off-by: okhleif-IL * changed logic to rely on message len Signed-off-by: okhleif-IL * list --> empty str Signed-off-by: okhleif-IL --------- Signed-off-by: Melanie Buehler Signed-off-by: okhleif-IL Signed-off-by: dmsuehir * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixed role bug where i never was > 0 Signed-off-by: okhleif-IL * Fix after merge Signed-off-by: dmsuehir * removed whitespace Signed-off-by: okhleif-IL * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix call to get role labels Signed-off-by: dmsuehir * Gateway test updates images within the conversation Signed-off-by: dmsuehir * Adds unit test coverage for audio query Signed-off-by: Melanie Buehler * Update test to check the returned b64 types Signed-off-by: dmsuehir * Update test since we don't expect images from the assistant Signed-off-by: dmsuehir * Port number fix Signed-off-by: Melanie Buehler * Formatting Signed-off-by: Melanie Buehler * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fixed place where port number is set Signed-off-by: Melanie Buehler * Remove old comment and added more accurate description Signed-off-by: dmsuehir * add comment in code about MAX_IMAGES Signed-off-by: dmsuehir * Add Gaudi support for image query Signed-off-by: dmsuehir * Fix to pass the retrieved image last Signed-off-by: dmsuehir * Revert out gateway and gateway test code, due to its move to GenAIExamples Signed-off-by: dmsuehir * Fix retriever test for checking for b64_img_str in the result Signed-off-by: dmsuehir --------- Signed-off-by: dmsuehir Signed-off-by: Melanie Buehler Signed-off-by: okhleif-IL Co-authored-by: Omar Khleif Co-authored-by: Melanie Hart Buehler Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Abolfazl Shahbazi <12436063+ashahba@users.noreply.github.com> --- comps/cores/proto/docarray.py | 2 +- comps/embeddings/multimodal/README.md | 7 ++ .../multimodal_langchain/mm_embedding_mmei.py | 18 +++- comps/lvms/llava/README.md | 12 ++- comps/lvms/llava/dependency/llava_server.py | 83 +++++++++++++++---- comps/lvms/llava/lvm.py | 17 ++++ comps/lvms/tgi-llava/lvm_tgi.py | 47 +++++++++-- 
.../redis/langchain/retriever_redis.py | 6 ++ .../embeddings/test_embeddings_multimodal.sh | 16 ++++ tests/lvms/test_lvms_llava.sh | 36 ++++++++ .../lvms/test_lvms_tgi-llava_on_intel_hpu.sh | 35 ++++++++ ...t_retrievers_multimodal_redis_langchain.sh | 26 ++++++ 12 files changed, 275 insertions(+), 30 deletions(-) diff --git a/comps/cores/proto/docarray.py b/comps/cores/proto/docarray.py index 8c71086f58..56de4a8c60 100644 --- a/comps/cores/proto/docarray.py +++ b/comps/cores/proto/docarray.py @@ -278,7 +278,7 @@ class GraphDoc(BaseDoc): class LVMDoc(BaseDoc): - image: str + image: Union[str, List[str]] prompt: str max_new_tokens: conint(ge=0, le=1024) = 512 top_k: int = 10 diff --git a/comps/embeddings/multimodal/README.md b/comps/embeddings/multimodal/README.md index c75a60f12a..c839365bcd 100644 --- a/comps/embeddings/multimodal/README.md +++ b/comps/embeddings/multimodal/README.md @@ -170,11 +170,18 @@ docker compose -f docker_compose_multimodal_embedding.yaml up -d **Compute a joint embedding of an image-text pair** +The image can be passed as a URL: ```bash curl -X POST http://0.0.0.0:6600/v1/embeddings \ -H "Content-Type: application/json" \ -d '{"text": {"text" : "This is some sample text."}, "image" : {"url": "https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true"}}' ``` +Or as a base64 encoded string: +```bash +curl -X POST http://0.0.0.0:6600/v1/embeddings \ + -H "Content-Type: application/json" \ + -d '{"text": {"text" : "This is some sample text."}, "image" : {"base64_image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC"}}' +``` **Compute an embedding of a text** diff --git a/comps/embeddings/multimodal/multimodal_langchain/mm_embedding_mmei.py b/comps/embeddings/multimodal/multimodal_langchain/mm_embedding_mmei.py index fbd972a202..cd052fc288 100644 --- a/comps/embeddings/multimodal/multimodal_langchain/mm_embedding_mmei.py +++ b/comps/embeddings/multimodal/multimodal_langchain/mm_embedding_mmei.py @@ -7,6 +7,7 @@ import requests from fastapi.responses import JSONResponse +from typing import Union from comps import ( CustomLogger, @@ -38,7 +39,7 @@ output_datatype=EmbedMultimodalDoc, ) @register_statistics(names=["opea_service@multimodal_embedding_mmei_langchain"]) -def embedding(input: MultimodalDoc) -> EmbedDoc: +def embedding(input: MultimodalDoc) -> Union[EmbedDoc, EmbedMultimodalDoc]: start = time.time() if logflag: logger.info(input) @@ -48,9 +49,15 @@ def embedding(input: MultimodalDoc) -> EmbedDoc: json["text"] = input.text elif isinstance(input, TextImageDoc): json["text"] = input.text.text - img_bytes = input.image.url.load_bytes() - base64_img = base64.b64encode(img_bytes).decode("utf-8") - json["img_b64_str"] = base64_img + base64_img = "" + if input.image.url: + img_bytes = input.image.url.load_bytes() + base64_img = base64.b64encode(img_bytes).decode("utf-8") + elif input.image.base64_image: + base64_img = input.image.base64_image + + if base64_img: + json["img_b64_str"] = base64_img else: return JSONResponse(status_code=400, content={"message": "Bad request!"}) @@ -66,6 +73,9 @@ def embedding(input: MultimodalDoc) -> EmbedDoc: res = EmbedDoc(text=input.text, embedding=embed_vector) elif isinstance(input, TextImageDoc): res = EmbedMultimodalDoc(text=input.text.text, url=input.image.url, embedding=embed_vector) + + if base64_img: + res.base64_image = base64_img except requests.exceptions.ConnectionError: res = JSONResponse(status_code=503, content={"message": 
"Multimodal embedding endpoint not started!"}) statistics_dict["opea_service@multimodal_embedding_mmei_langchain"].append_latency(time.time() - start, None) diff --git a/comps/lvms/llava/README.md b/comps/lvms/llava/README.md index 998eb4b664..74e1de706f 100644 --- a/comps/lvms/llava/README.md +++ b/comps/lvms/llava/README.md @@ -1,6 +1,6 @@ # LVM Microservice -Visual Question and Answering is one of the multimodal tasks empowered by LVMs (Large Visual Models). This microservice supports visual Q&A by using LLaVA as the base large visual model. It accepts two inputs: a prompt and an image. It outputs the answer to the prompt about the image. +Visual Question and Answering is one of the multimodal tasks empowered by LVMs (Large Visual Models). This microservice supports visual Q&A by using LLaVA as the base large visual model. It accepts two inputs: a prompt and images. It outputs the answer to the prompt about the images. ## 🚀1. Start Microservice with Python (Option 1) @@ -92,10 +92,15 @@ docker run -p 8399:8399 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_M #### 2.2.2 Start LVM service +> Note: The `MAX_IMAGES` environment variable is used to specify the maximum number of images that will be sent from the LVM service to the LLaVA server. +> If an image list longer than `MAX_IMAGES` is sent to the LVM server, a shortened image list will be sent to the LLaVA service. If the image list +> needs to be shortened, the most recent images (the ones at the end of the list) are prioritized to send to the LLaVA service. Some LLaVA models have not +> been trained with multiple images and may lead to inaccurate results. If `MAX_IMAGES` is not set, it will default to `1`. + ```bash ip_address=$(hostname -I | awk '{print $1}') -docker run -p 9399:9399 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LVM_ENDPOINT=http://$ip_address:8399 opea/lvm-llava-svc:latest +docker run -p 9399:9399 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LVM_ENDPOINT=http://$ip_address:8399 -e MAX_IMAGES=1 opea/lvm-llava-svc:latest ``` #### 2.2.3 Test @@ -106,6 +111,9 @@ docker run -p 9399:9399 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$htt # curl with an image and a prompt http_proxy="" curl http://localhost:9399/v1/lvm -XPOST -d '{"image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "prompt":"What is this?"}' -H 'Content-Type: application/json' +# curl with multiple images and a prompt (Note that depending on your MAX_IMAGES value, both images may not be sent to the LLaVA model) +http_proxy="" curl http://localhost:9399/v1/lvm -XPOST -d '{"image": ["iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNkYPhfz0AEYBxVSF+FAP5FDvcfRYWgAAAAAElFTkSuQmCC", "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNk+M9Qz0AEYBxVSF+FAAhKDveksOjmAAAAAElFTkSuQmCC"], "prompt":"What is in these images?"}' -H 'Content-Type: application/json' + # curl with a prompt only (no image) http_proxy="" curl http://localhost:9399/v1/lvm -XPOST -d '{"image": "", "prompt":"What is deep learning?"}' -H 'Content-Type: application/json' diff --git a/comps/lvms/llava/dependency/llava_server.py b/comps/lvms/llava/dependency/llava_server.py index 644e15a82e..4fc0043805 100644 --- a/comps/lvms/llava/dependency/llava_server.py +++ b/comps/lvms/llava/dependency/llava_server.py @@ -13,6 +13,7 @@ import uvicorn from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response +from transformers import 
AutoProcessor from transformers import pipeline from transformers.image_utils import load_image @@ -33,9 +34,16 @@ def pipeline_preprocess(self, image, prompt=None, timeout=None): The original transformers image-to-text pipeline preprocess function requires that an image is passed in, and will fail if the image parameter is null/empty. In order to support multimodal use cases with the same pipeline, this preprocess function handles the case where there is no image with the prompt. + Also, the image-to-text pipeline typically treats multiple images passed in as a list as a batch (where it iterates + over the image inputs for generation). For that reason, the original pipeline_preprocess code would only get a + single image at a time. To support multiple images, the pipeline call is updated to send a list of lists for the + images (so that when iterated, we still get multiple images) and this pipeline_preprocess function has been updated + to handle a list of images in addition to single images. """ - if image: + if isinstance(image, list): + image = [load_image(i, timeout=timeout) for i in image] + elif image: image = load_image(image, timeout=timeout) if prompt is not None: @@ -114,23 +122,52 @@ async def health() -> Response: @app.post("/generate") -async def generate(request: Request) -> Response: # FIXME batch_size=1 for now, only accept single image +async def generate(request: Request) -> Response: # FIXME batch_size=1 for now print("LLaVA generation begin.") request_dict = await request.json() prompt = request_dict.pop("prompt") - img_b64_str = request_dict.pop("img_b64_str") + img_b64_str = request_dict.pop("img_b64_str") # String or list of strings max_new_tokens = request_dict.pop("max_new_tokens", 100) + # Determine the format of the role labels based on the model name + model_name = generator.model.name_or_path + user_label = "USER:" + assistant_label = "ASSISTANT:" + image_tag = "\n" + + # This is the role label that we see in the results from the pipeline. This is used to split the output. 
+ output_assistant_label = "ASSISTANT: " + + if "llava-interleave" in model_name: + user_label = "<|im_start|>user" + assistant_label = "<|im_end|><|im_start|>assistant" + output_assistant_label = "assistant " + elif "llava-v1.6-mistral" in model_name: + user_label = "[INST]" + assistant_label = " [/INST]" + output_assistant_label = "[/INST] " + if img_b64_str: - # Decode and Resize the image - image = PIL.Image.open(BytesIO(base64.b64decode(img_b64_str))) - image = process_image(image) - # format the prompt with an image - prompt = f"\nUSER: {prompt}\nASSISTANT:" + if isinstance(img_b64_str, str): + img_b64_str = [img_b64_str] + + # Decode and Resize the images + images = [] + for img_b64 in img_b64_str: + if img_b64: + image = PIL.Image.open(BytesIO(base64.b64decode(img_b64))) + image = process_image(image) + images.append(image) + + # If the prompt provided does not have all the image tags, format the prompt with images + num_images = len(images) + num_image_tags = prompt.count(image_tag) + image_tags = image_tag * (num_images - num_image_tags) if num_images > num_image_tags else "" + prompt = f"{user_label}{image_tags} {prompt}{assistant_label}" else: - image = None + images = None # format the prompt with text only - prompt = f"USER: {prompt}\nASSISTANT:" + prompt = f"{user_label} {prompt}\n{assistant_label}" if args.device == "hpu": generate_kwargs = { @@ -149,12 +186,13 @@ async def generate(request: Request) -> Response: # FIXME batch_size=1 for now, # Override the pipeline preprocessing generator.preprocess = pipeline_preprocess.__get__(generator, type(generator)) - result = generator(image, prompt=prompt, batch_size=1, generate_kwargs=generate_kwargs) + result = generator([images], prompt=prompt, batch_size=1, generate_kwargs=generate_kwargs) end = time.time() - result = result[0]["generated_text"].split("ASSISTANT: ")[-1] + result = result[0][0]["generated_text"].split(output_assistant_label.strip())[-1].strip() print(f"LLaVA result = {result}, time = {(end-start) * 1000 }ms") - if image: - image.close() + if images: + for i in images: + i.close() ret = {"text": result} return JSONResponse(ret) @@ -191,6 +229,8 @@ async def generate(request: Request) -> Response: # FIXME batch_size=1 for now, device=args.device, ) + processor = AutoProcessor.from_pretrained(model_name_or_path) + # warmup print("LLaVA warmup...") if args.device == "hpu": @@ -214,10 +254,23 @@ async def generate(request: Request) -> Response: # FIXME batch_size=1 for now, images = [] for image_path in image_paths: images.append(PIL.Image.open(requests.get(image_path, stream=True, timeout=3000).raw)) + + # Generate a text prompt to use for warm up + conversation = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What's the content of the image?"}, + ], + }, + ] + text_prompt = processor.apply_chat_template(conversation) + for i in range(args.warmup): generator( images, - prompt="\nUSER: What's the content of the image?\nASSISTANT:", + prompt=text_prompt, batch_size=1, generate_kwargs=generate_kwargs, ) diff --git a/comps/lvms/llava/lvm.py b/comps/lvms/llava/lvm.py index 897f7cbbe4..9d7bde0f90 100644 --- a/comps/lvms/llava/lvm.py +++ b/comps/lvms/llava/lvm.py @@ -28,6 +28,9 @@ logger = CustomLogger("lvm") logflag = os.getenv("LOGFLAG", False) +# The maximum number of images that should be sent to the LVM +max_images = int(os.getenv("MAX_IMAGES", 1)) + @register_microservice( name="opea_service@lvm", @@ -76,6 +79,17 @@ async def lvm(request: Union[LVMDoc, 
LVMSearchedMultimodalDoc]) -> Union[TextDoc prompt = request.prompt max_new_tokens = request.max_new_tokens + # Limit the number of images being sent to the LVM + if isinstance(img_b64_str, list) and len(img_b64_str) > max_images: + img_b64_str=img_b64_str[-max_images:] + + # Adjust the number of images tags in the prompt + image_tag = "\n" + num_tags_in_prompt = prompt.count(image_tag) + + if len(img_b64_str) < num_tags_in_prompt: + prompt = prompt.replace(image_tag, "", num_tags_in_prompt - len(img_b64_str)) + inputs = {"img_b64_str": img_b64_str, "prompt": prompt, "max_new_tokens": max_new_tokens} # forward to the LLaVA server response = requests.post(url=f"{lvm_endpoint}/generate", data=json.dumps(inputs), proxies={"http": None}) @@ -99,5 +113,8 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc if __name__ == "__main__": lvm_endpoint = os.getenv("LVM_ENDPOINT", "http://localhost:8399") + if logflag: + logger.info(f"MAX_IMAGES: {max_images}") + logger.info("[LVM] LVM initialized.") opea_microservices["opea_service@lvm"].start() diff --git a/comps/lvms/tgi-llava/lvm_tgi.py b/comps/lvms/tgi-llava/lvm_tgi.py index 38b492c395..04ceee400c 100644 --- a/comps/lvms/tgi-llava/lvm_tgi.py +++ b/comps/lvms/tgi-llava/lvm_tgi.py @@ -27,6 +27,9 @@ logger = CustomLogger("lvm_tgi") logflag = os.getenv("LOGFLAG", False) +# The maximum number of images that should be sent to the LVM +max_images = int(os.getenv("MAX_IMAGES", 1)) + @register_microservice( name="opea_service@lvm_tgi", @@ -88,15 +91,41 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc top_k = request.top_k top_p = request.top_p - if not img_b64_str: - # Work around an issue where LLaVA-NeXT is not providing good responses when prompted without an image. - # Provide an image and then instruct the model to ignore the image. The base64 string below is the encoded png: - # https://raw.githubusercontent.com/opea-project/GenAIExamples/refs/tags/v1.0/AudioQnA/ui/svelte/src/lib/assets/icons/png/audio1.png - img_b64_str = "iVBORw0KGgoAAAANSUhEUgAAADUAAAAlCAYAAADiMKHrAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAKPSURBVHgB7Zl/btowFMefnUTqf+MAHYMTjN4gvcGOABpM+8E0doLSE4xpsE3rKuAG3KC5Ad0J6MYOkP07YnvvhR9y0lVzupTIVT5SwDjB9fd97WfsMkCef1rUXM8dY9HHK4hWUevzi/oVWAqnF8fzLmAtiPA3Aq0lFsVA1fRKxlgNLIbDPaQUZQuu6YO98aIipHOiFGtIqaYfn1UnUCDds6WPyeANlTFbv9WztbFTK+HNUVAPiz7nbPzq7HsPCoKWIBREGfsJXZit5xT07X0jp6iRdIbEHOnjyyD97OvzH00lVS2K5OS2ax11cBXxJgYxlEIE6XZclzdTX6n8XjkkcEIfbj2nMO0/SNd1vy4vsCNjYPyEovfyy88GZIQCSKOCMf6ORgStoboLJuSWKDYCfK2q4jjrMZ+GOh7Pib/gek5DHxVUJtcgA7mJ4kwZRbN7viQXFzQn0Nl52gXG4Fo7DKAYp0yI3VHQ16oaWV0wYa+iGE8nG+wAdx5DzpS/KGyhFGULpShbKEXZQinqLlBK/IKc2asoh4sZvoXJWhlAzuxV1KBVD3HrfYTFAK8ZHgu0hu36DHLG+Izinw250WUkXHJht02QUnxLP7fZxR7f1I6S7Ir2GgmYvIQM5OYUuYBdainATq2ZjTqPBlnbGXYeBrg9Od18DKmc1U0jpw4OIIwEJFxQSl2b4MN2lf74fw8nFNbHt/5N9xWKTZvJ2S6YZk6RC3j2cKpVhSIShZ0mea6caCOCAjyNHd5gPPxGncMBTvI6hunYdaJ6kf8VoSCP2odxX6RkR6NOtanfj13EswKVqEQrPzzFL1lK+YvCFraiEqs8TrwQLGYraqpX4kr/Hixml+63Z+CoM9DTo438AUmP+KyMWT+tAAAAAElFTkSuQmCC" - prompt = f"Please disregard the image and answer the question. {prompt}" + # Make img_b64_str into a list of strings (if it's not already a list) + if not isinstance(img_b64_str, list): + if img_b64_str: + img_b64_str = [img_b64_str] + else: + # If img_b64_str was an empty string, which means we have just have a text prompt. + # Work around an issue where LLaVA-NeXT is not providing good responses when prompted without an image. 
+ # Provide an image and then instruct the model to ignore the image. The base64 string below is the encoded png: + # https://raw.githubusercontent.com/opea-project/GenAIExamples/refs/tags/v1.0/AudioQnA/ui/svelte/src/lib/assets/icons/png/audio1.png + img_b64_str = ["iVBORw0KGgoAAAANSUhEUgAAADUAAAAlCAYAAADiMKHrAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAKPSURBVHgB7Zl/btowFMefnUTqf+MAHYMTjN4gvcGOABpM+8E0doLSE4xpsE3rKuAG3KC5Ad0J6MYOkP07YnvvhR9y0lVzupTIVT5SwDjB9fd97WfsMkCef1rUXM8dY9HHK4hWUevzi/oVWAqnF8fzLmAtiPA3Aq0lFsVA1fRKxlgNLIbDPaQUZQuu6YO98aIipHOiFGtIqaYfn1UnUCDds6WPyeANlTFbv9WztbFTK+HNUVAPiz7nbPzq7HsPCoKWIBREGfsJXZit5xT07X0jp6iRdIbEHOnjyyD97OvzH00lVS2K5OS2ax11cBXxJgYxlEIE6XZclzdTX6n8XjkkcEIfbj2nMO0/SNd1vy4vsCNjYPyEovfyy88GZIQCSKOCMf6ORgStoboLJuSWKDYCfK2q4jjrMZ+GOh7Pib/gek5DHxVUJtcgA7mJ4kwZRbN7viQXFzQn0Nl52gXG4Fo7DKAYp0yI3VHQ16oaWV0wYa+iGE8nG+wAdx5DzpS/KGyhFGULpShbKEXZQinqLlBK/IKc2asoh4sZvoXJWhlAzuxV1KBVD3HrfYTFAK8ZHgu0hu36DHLG+Izinw250WUkXHJht02QUnxLP7fZxR7f1I6S7Ir2GgmYvIQM5OYUuYBdainATq2ZjTqPBlnbGXYeBrg9Od18DKmc1U0jpw4OIIwEJFxQSl2b4MN2lf74fw8nFNbHt/5N9xWKTZvJ2S6YZk6RC3j2cKpVhSIShZ0mea6caCOCAjyNHd5gPPxGncMBTvI6hunYdaJ6kf8VoSCP2odxX6RkR6NOtanfj13EswKVqEQrPzzFL1lK+YvCFraiEqs8TrwQLGYraqpX4kr/Hixml+63Z+CoM9DTo438AUmP+KyMWT+tAAAAAElFTkSuQmCC"] + prompt = f"Please disregard the image and answer the question. {prompt}" + + # Truncate the list of images if we have too many, only sending the most recent ones at the end of the list + if len(img_b64_str) > max_images: + img_b64_str=img_b64_str[-max_images:] - image = f"data:image/png;base64,{img_b64_str}" - image_prompt = f"![]({image})\n{prompt}\nASSISTANT:" + # Check the number of image tags in the prompt and adjust them to match the number of images that we have + image_tag = "\n" + num_tags_in_prompt = prompt.count(image_tag) + + # We have too many image tags in the prompt replace the first x instance of the tag with an empty string + if len(img_b64_str) < num_tags_in_prompt: + prompt = prompt.replace(image_tag, "", num_tags_in_prompt - len(img_b64_str)) + + # We don't have enough image tags in the prompt, add them + if len(img_b64_str) > num_tags_in_prompt: + num_tags_to_add = len(img_b64_str) - num_tags_in_prompt + tags_to_add = image_tag * num_tags_to_add + prompt = f"{tags_to_add}{prompt}" + + # Replace image tags with the data + for i in img_b64_str: + formatted_image_str = f"![](data:image/png;base64,{i})\n" + prompt = prompt.replace(image_tag, formatted_image_str, 1) + image_prompt = f"{prompt}\nASSISTANT:" if streaming: @@ -152,4 +181,6 @@ async def stream_generator(): lvm_endpoint = os.getenv("LVM_ENDPOINT", "http://localhost:8399") lvm_client = AsyncInferenceClient(lvm_endpoint) logger.info("[LVM] LVM initialized.") + if logflag: + logger.info(f"MAX_IMAGES: {max_images}") opea_microservices["opea_service@lvm_tgi"].start() diff --git a/comps/retrievers/multimodal/redis/langchain/retriever_redis.py b/comps/retrievers/multimodal/redis/langchain/retriever_redis.py index a01b3e20c4..a92d59aba2 100644 --- a/comps/retrievers/multimodal/redis/langchain/retriever_redis.py +++ b/comps/retrievers/multimodal/redis/langchain/retriever_redis.py @@ -69,6 +69,12 @@ async def retrieve( if isinstance(input, EmbedMultimodalDoc): metadata_list = [] for r in search_res: + # If the input had an image, pass that through in the metadata along with the search result image + if input.base64_image: + if r.metadata["b64_img_str"]: + r.metadata["b64_img_str"] = [input.base64_image, r.metadata["b64_img_str"]] + else: + 
r.metadata["b64_img_str"] = input.base64_image metadata_list.append(r.metadata) retrieved_docs.append(TextDoc(text=r.page_content)) result = SearchedMultimodalDoc(retrieved_docs=retrieved_docs, initial_query=input.text, metadata=metadata_list) diff --git a/tests/embeddings/test_embeddings_multimodal.sh b/tests/embeddings/test_embeddings_multimodal.sh index bd2ca93b70..5bb2fd9f93 100644 --- a/tests/embeddings/test_embeddings_multimodal.sh +++ b/tests/embeddings/test_embeddings_multimodal.sh @@ -85,6 +85,22 @@ function validate_microservice_image_text_pair_embedding() { fi } +function validate_microservice_b64_image_text_pair_embedding() { + result=$(http_proxy="" curl http://${ip_address}:$MM_EMBEDDING_PORT_MICROSERVICE/v1/embeddings \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{"text": {"text" : "This is some sample text."}, "image" : {"base64_image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC"}}') + + if [[ $result == *"embedding"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs embedding-multimodal-bridgetower + docker logs embedding-multimodal + exit 1 + fi +} + function validate_microservice() { validate_microservice_text_embedding validate_microservice_image_text_pair_embedding diff --git a/tests/lvms/test_lvms_llava.sh b/tests/lvms/test_lvms_llava.sh index 4627ec6ee7..8558fa5e3d 100644 --- a/tests/lvms/test_lvms_llava.sh +++ b/tests/lvms/test_lvms_llava.sh @@ -48,6 +48,42 @@ function validate_microservice() { exit 1 fi + # Test sending two images with a text prompt with one image tag in the prompt. + # The first image is green and the second image is blue. Since the default MAX_IMAGES is 1, only the blue image should be sent to the LVM. + result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"image": ["iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNk+M9Qz0AEYBxVSF+FAAhKDveksOjmAAAAAElFTkSuQmCC", "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNkYPhfz0AEYBxVSF+FAP5FDvcfRYWgAAAAAElFTkSuQmCC"], "prompt":"\nWhat are in these images?"}' -H 'Content-Type: application/json') + if [[ $result == *"blue"* ]]; then + echo "Result correct." + else + echo "Result wrong." + docker logs test-comps-lvm-llava >> ${LOG_PATH}/llava-dependency.log + docker logs test-comps-lvm-llava-svc >> ${LOG_PATH}/llava-server.log + exit 1 + fi + + # Test sending two images with a text prompt without any image tags. + # The first image is blue and the second image is green. Since the default MAX_IMAGES is 1, only the green image should be sent to the LVM. + result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"image": ["iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNkYPhfz0AEYBxVSF+FAP5FDvcfRYWgAAAAAElFTkSuQmCC", "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNk+M9Qz0AEYBxVSF+FAAhKDveksOjmAAAAAElFTkSuQmCC"], "prompt":"What are in these images?"}' -H 'Content-Type: application/json') + if [[ $result == *"green"* ]]; then + echo "Result correct." + else + echo "Result wrong." + docker logs test-comps-lvm-llava >> ${LOG_PATH}/llava-dependency.log + docker logs test-comps-lvm-llava-svc >> ${LOG_PATH}/llava-server.log + exit 1 + fi + + # Same test as above, except including two image tags with the prompt to ensure the number of image tags is reconciled. + # The first image is blue and the second image is green. Since the default MAX_IMAGES is 1, only the green image should be sent to the LVM. 
+ result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"image": ["iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNkYPhfz0AEYBxVSF+FAP5FDvcfRYWgAAAAAElFTkSuQmCC", "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNk+M9Qz0AEYBxVSF+FAAhKDveksOjmAAAAAElFTkSuQmCC"], "prompt":"\n\nWhat are in these images?"}' -H 'Content-Type: application/json') + if [[ $result == *"green"* ]]; then + echo "Result correct." + else + echo "Result wrong." + docker logs test-comps-lvm-llava >> ${LOG_PATH}/llava-dependency.log + docker logs test-comps-lvm-llava-svc >> ${LOG_PATH}/llava-server.log + exit 1 + fi + result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [{"b64_img_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "transcript_for_inference": "yellow image", "video_id": "8c7461df-b373-4a00-8696-9a2234359fe0", "time_of_frame_ms":"37000000", "source_video":"WeAreGoingOnBullrun_8c7461df-b373-4a00-8696-9a2234359fe0.mp4"}]}' -H 'Content-Type: application/json') if [[ $result == *"yellow"* ]]; then echo "Result correct." diff --git a/tests/lvms/test_lvms_tgi-llava_on_intel_hpu.sh b/tests/lvms/test_lvms_tgi-llava_on_intel_hpu.sh index 1fa0155266..9d1a69a7ae 100644 --- a/tests/lvms/test_lvms_tgi-llava_on_intel_hpu.sh +++ b/tests/lvms/test_lvms_tgi-llava_on_intel_hpu.sh @@ -54,6 +54,41 @@ function validate_microservice() { echo "LVM prompt without image - HTTP status (successful)" fi + # Test sending two images with a text prompt with one image tag in the prompt. + # The first image is green and the second image is blue. Since the default MAX_IMAGES is 1, only the blue image should be sent to the LVM. + result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"image": ["iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNk+M9Qz0AEYBxVSF+FAAhKDveksOjmAAAAAElFTkSuQmCC", "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNkYPhfz0AEYBxVSF+FAP5FDvcfRYWgAAAAAElFTkSuQmCC"], "prompt":"\nWhat are in these images?"}' -H 'Content-Type: application/json') + if [[ $result == *"blue"* ]]; then + echo "Result correct." + else + echo "Result wrong." + docker logs test-comps-lvm-llava >> ${LOG_PATH}/llava-dependency.log + docker logs test-comps-lvm-llava-svc >> ${LOG_PATH}/llava-server.log + exit 1 + fi + + # Test sending two images with a text prompt without any image tags. + # The first image is blue and the second image is green. Since the default MAX_IMAGES is 1, only the green image should be sent to the LVM. + result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"image": ["iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNkYPhfz0AEYBxVSF+FAP5FDvcfRYWgAAAAAElFTkSuQmCC", "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNk+M9Qz0AEYBxVSF+FAAhKDveksOjmAAAAAElFTkSuQmCC"], "prompt":"What are in these images?"}' -H 'Content-Type: application/json') + if [[ $result == *"green"* ]]; then + echo "Result correct." + else + echo "Result wrong." + docker logs test-comps-lvm-llava >> ${LOG_PATH}/llava-dependency.log + docker logs test-comps-lvm-llava-svc >> ${LOG_PATH}/llava-server.log + exit 1 + fi + + # Same test as above, except including two image tags with the prompt to ensure the number of image tags is reconciled. + # The first image is blue and the second image is green. Since the default MAX_IMAGES is 1, only the green image should be sent to the LVM. 
+ result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"image": ["iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNkYPhfz0AEYBxVSF+FAP5FDvcfRYWgAAAAAElFTkSuQmCC", "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNk+M9Qz0AEYBxVSF+FAAhKDveksOjmAAAAAElFTkSuQmCC"], "prompt":"\n\nWhat are in these images?"}' -H 'Content-Type: application/json') + if [[ $result == *"green"* ]]; then + echo "Result correct." + else + echo "Result wrong." + docker logs test-comps-lvm-llava >> ${LOG_PATH}/llava-dependency.log + docker logs test-comps-lvm-llava-svc >> ${LOG_PATH}/llava-server.log + exit 1 + fi } function stop_docker() { diff --git a/tests/retrievers/test_retrievers_multimodal_redis_langchain.sh b/tests/retrievers/test_retrievers_multimodal_redis_langchain.sh index 873516ddc5..06fecec69d 100644 --- a/tests/retrievers/test_retrievers_multimodal_redis_langchain.sh +++ b/tests/retrievers/test_retrievers_multimodal_redis_langchain.sh @@ -58,6 +58,32 @@ function validate_microservice() { docker logs test-comps-retriever-multimodal-redis >> ${LOG_PATH}/retriever.log exit 1 fi + + # Test the retriever with a b64 image that should be passed through + HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding},\"img_b64_str\":\"iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC\"}" -H 'Content-Type: application/json' "$URL") + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ retriever ] HTTP status is 200. Checking content..." + local CONTENT=$(curl -s -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/retriever.log) + + if echo "$CONTENT" | grep -q "retrieved_docs"; then + echo "[ retriever ] Content has retrieved_docs as expected." + if echo "$CONTENT" | grep -q "b64_img_str"; then + echo "[ retriever ] Content has b64_img_str as expected." + else + echo "[ retriever ] Content does not include the b64_img_str: $CONTENT" + docker logs test-comps-retriever-multimodal-redis >> ${LOG_PATH}/retriever.log + exit 1 + fi + else + echo "[ retriever ] Content does not match the expected result: $CONTENT" + docker logs test-comps-retriever-multimodal-redis >> ${LOG_PATH}/retriever.log + exit 1 + fi + else + echo "[ retriever ] HTTP status is not 200. 
Received status was $HTTP_STATUS" + docker logs test-comps-retriever-multimodal-redis >> ${LOG_PATH}/retriever.log + exit 1 + fi } function stop_docker() { From 20a79e1577c29c23ce0145af69cfd4ec030561c5 Mon Sep 17 00:00:00 2001 From: Omar Khleif Date: Mon, 30 Dec 2024 10:06:15 -0800 Subject: [PATCH 02/10] Replaced hard coded ports with dynamically assigned values from set_env file (#17) * changed all hardcoded ports to getenv with defaults instead Signed-off-by: okhleif-IL --------- Signed-off-by: okhleif-IL --- comps/asr/whisper/asr.py | 5 +++-- comps/asr/whisper/dependency/whisper_server.py | 2 +- .../redis/langchain/prepare_videodoc_redis.py | 10 +++++----- .../multimodal_langchain/mm_embedding_mmei.py | 4 ++-- comps/lvms/llava/dependency/llava_server.py | 3 ++- comps/lvms/llava/lvm.py | 2 +- comps/lvms/tgi-llava/lvm_tgi.py | 2 +- comps/retrievers/redis/langchain/retriever_redis.py | 2 +- 8 files changed, 16 insertions(+), 14 deletions(-) diff --git a/comps/asr/whisper/asr.py b/comps/asr/whisper/asr.py index 920c831526..702564149d 100644 --- a/comps/asr/whisper/asr.py +++ b/comps/asr/whisper/asr.py @@ -28,7 +28,7 @@ service_type=ServiceType.ASR, endpoint="/v1/audio/transcriptions", host="0.0.0.0", - port=9099, + port=int(os.getenv("ASR_PORT", 9099)), input_datatype=Base64ByteStrDoc, output_datatype=LLMParamsDoc, ) @@ -48,6 +48,7 @@ async def audio_to_text(audio: Base64ByteStrDoc): if __name__ == "__main__": - asr_endpoint = os.getenv("ASR_ENDPOINT", "http://localhost:7066") + whisper_port = int(os.getenv("WHISPER_PORT", 7066)) + asr_endpoint = os.getenv("ASR_ENDPOINT", f"http://localhost:{whisper_port}") logger.info("[asr - router] ASR initialized.") opea_microservices["opea_service@asr"].start() diff --git a/comps/asr/whisper/dependency/whisper_server.py b/comps/asr/whisper/dependency/whisper_server.py index dcb3dd19cb..979ece0d56 100644 --- a/comps/asr/whisper/dependency/whisper_server.py +++ b/comps/asr/whisper/dependency/whisper_server.py @@ -106,7 +106,7 @@ async def audio_transcriptions( if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default="0.0.0.0") - parser.add_argument("--port", type=int, default=7066) + parser.add_argument("--port", type=int, default=os.getenv("WHISPER_PORT", 7066)) parser.add_argument("--model_name_or_path", type=str, default="openai/whisper-small") parser.add_argument("--language", type=str, default="english") parser.add_argument("--device", type=str, default="cpu") diff --git a/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py b/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py index fa8ed4896e..fee9ac0c82 100644 --- a/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py +++ b/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py @@ -335,7 +335,7 @@ def drop_index(index_name, redis_url=REDIS_URL): @register_microservice( - name="opea_service@prepare_videodoc_redis", endpoint="/v1/generate_transcripts", host="0.0.0.0", port=6007 + name="opea_service@prepare_videodoc_redis", endpoint="/v1/generate_transcripts", host="0.0.0.0", port=int(os.getenv("DATAPREP_MMR_PORT", 6007)) ) async def ingest_generate_transcripts(files: List[UploadFile] = File(None)): """Upload videos or audio files with speech, generate transcripts using whisper and ingest into redis.""" @@ -444,7 +444,7 @@ async def ingest_generate_transcripts(files: List[UploadFile] = File(None)): @register_microservice( - name="opea_service@prepare_videodoc_redis", 
endpoint="/v1/generate_captions", host="0.0.0.0", port=6007 + name="opea_service@prepare_videodoc_redis", endpoint="/v1/generate_captions", host="0.0.0.0", port=int(os.getenv("DATAPREP_MMR_PORT", 6007)) ) async def ingest_generate_caption(files: List[UploadFile] = File(None)): """Upload images and videos without speech (only background music or no audio), generate captions using lvm microservice and ingest into redis.""" @@ -506,7 +506,7 @@ async def ingest_generate_caption(files: List[UploadFile] = File(None)): name="opea_service@prepare_videodoc_redis", endpoint="/v1/ingest_with_text", host="0.0.0.0", - port=6007, + port=int(os.getenv("DATAPREP_MMR_PORT", 6007)), ) async def ingest_with_text(files: List[UploadFile] = File(None)): if files: @@ -602,7 +602,7 @@ async def ingest_with_text(files: List[UploadFile] = File(None)): @register_microservice( - name="opea_service@prepare_videodoc_redis", endpoint="/v1/dataprep/get_files", host="0.0.0.0", port=6007 + name="opea_service@prepare_videodoc_redis", endpoint="/v1/dataprep/get_files", host="0.0.0.0", port=int(os.getenv("DATAPREP_MMR_PORT", 6007)) ) async def rag_get_file_structure(): """Returns list of names of uploaded videos saved on the server.""" @@ -616,7 +616,7 @@ async def rag_get_file_structure(): @register_microservice( - name="opea_service@prepare_videodoc_redis", endpoint="/v1/dataprep/delete_files", host="0.0.0.0", port=6007 + name="opea_service@prepare_videodoc_redis", endpoint="/v1/dataprep/delete_files", host="0.0.0.0", port=int(os.getenv("DATAPREP_MMR_PORT", 6007)) ) async def delete_files(): """Delete all uploaded files along with redis index.""" diff --git a/comps/embeddings/multimodal/multimodal_langchain/mm_embedding_mmei.py b/comps/embeddings/multimodal/multimodal_langchain/mm_embedding_mmei.py index cd052fc288..68b8dc6095 100644 --- a/comps/embeddings/multimodal/multimodal_langchain/mm_embedding_mmei.py +++ b/comps/embeddings/multimodal/multimodal_langchain/mm_embedding_mmei.py @@ -25,7 +25,7 @@ logger = CustomLogger("multimodal_embedding_mmei_langchain") logflag = os.getenv("LOGFLAG", False) -port = int(os.getenv("MM_EMBEDDING_PORT_MICROSERVICE", 6600)) +port = int(os.getenv("MM_EMBEDDING_PORT_MICROSERVICE", 6000)) headers = {"Content-Type": "application/json"} @@ -86,7 +86,7 @@ def embedding(input: MultimodalDoc) -> Union[EmbedDoc, EmbedMultimodalDoc]: if __name__ == "__main__": url_endpoint = os.getenv("MMEI_EMBEDDING_HOST_ENDPOINT", "http://0.0.0.0") - port_endpoint = os.getenv("MMEI_EMBEDDING_PORT_ENDPOINT", "8080") + port_endpoint = os.getenv("MMEI_EMBEDDING_PORT_ENDPOINT", "6006") path_endpoint = os.getenv("MMEI_EMBEDDING_PATH_ENDPOINT", "/v1/encode") mmei_embedding_endpoint = os.getenv("MMEI_EMBEDDING_ENDPOINT", f"{url_endpoint}:{port_endpoint}{path_endpoint}") diff --git a/comps/lvms/llava/dependency/llava_server.py b/comps/lvms/llava/dependency/llava_server.py index 4fc0043805..cd74328b87 100644 --- a/comps/lvms/llava/dependency/llava_server.py +++ b/comps/lvms/llava/dependency/llava_server.py @@ -6,6 +6,7 @@ import base64 import time from io import BytesIO +import os import PIL.Image import requests @@ -201,7 +202,7 @@ async def generate(request: Request) -> Response: # FIXME batch_size=1 for now if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default="0.0.0.0") - parser.add_argument("--port", type=int, default=8399) + parser.add_argument("--port", type=int, default=os.getenv("LLAVA_SERVER_PORT", 8399)) parser.add_argument("--model_name_or_path", 
type=str, default="llava-hf/llava-1.5-7b-hf") parser.add_argument("--use_hpu_graphs", default=False, action="store_true") parser.add_argument("--warmup", type=int, default=1, help="Number of warmup iterations for benchmarking.") diff --git a/comps/lvms/llava/lvm.py b/comps/lvms/llava/lvm.py index 9d7bde0f90..05ad07ae50 100644 --- a/comps/lvms/llava/lvm.py +++ b/comps/lvms/llava/lvm.py @@ -37,7 +37,7 @@ service_type=ServiceType.LVM, endpoint="/v1/lvm", host="0.0.0.0", - port=9399, + port=int(os.getenv("LVM_PORT", 9399)), ) @register_statistics(names=["opea_service@lvm"]) async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc, MetadataTextDoc]: diff --git a/comps/lvms/tgi-llava/lvm_tgi.py b/comps/lvms/tgi-llava/lvm_tgi.py index 04ceee400c..2896ba8b0a 100644 --- a/comps/lvms/tgi-llava/lvm_tgi.py +++ b/comps/lvms/tgi-llava/lvm_tgi.py @@ -36,7 +36,7 @@ service_type=ServiceType.LVM, endpoint="/v1/lvm", host="0.0.0.0", - port=9399, + port=int(os.getenv("LVM_PORT", 9399)), input_datatype=LVMDoc, output_datatype=TextDoc, ) diff --git a/comps/retrievers/redis/langchain/retriever_redis.py b/comps/retrievers/redis/langchain/retriever_redis.py index e99b5884d5..295e257902 100644 --- a/comps/retrievers/redis/langchain/retriever_redis.py +++ b/comps/retrievers/redis/langchain/retriever_redis.py @@ -44,7 +44,7 @@ service_type=ServiceType.RETRIEVER, endpoint="/v1/retrieval", host="0.0.0.0", - port=7000, + port=int(os.getenv("REDIS_RETRIEVER_PORT", 7000)), ) @register_statistics(names=["opea_service@retriever_redis"]) async def retrieve( From 236da36090a46e155193fd01881bba505b986a06 Mon Sep 17 00:00:00 2001 From: Melanie Hart Buehler Date: Wed, 8 Jan 2025 13:42:52 -0800 Subject: [PATCH 03/10] MultimodalQnA PDF Ingestion (#16) * Initial implementation of PDF ingestion Signed-off-by: Melanie Buehler * PDF ingestion fixes Signed-off-by: Melanie Buehler * Adds a test for dataprep microservice Signed-off-by: Melanie Buehler * Improved comments, variable name, and a docstring Signed-off-by: Melanie Buehler * Updated for review feedback Signed-off-by: Melanie Buehler --------- Signed-off-by: Melanie Buehler --- .../multimodal/redis/langchain/README.md | 7 +- .../redis/langchain/prepare_videodoc_redis.py | 149 +++++++++++++++--- .../redis/langchain/requirements.txt | 1 + ...est_dataprep_multimodal_redis_langchain.sh | 31 +++- 4 files changed, 160 insertions(+), 28 deletions(-) diff --git a/comps/dataprep/multimodal/redis/langchain/README.md b/comps/dataprep/multimodal/redis/langchain/README.md index db24b431fd..5c21e16c25 100644 --- a/comps/dataprep/multimodal/redis/langchain/README.md +++ b/comps/dataprep/multimodal/redis/langchain/README.md @@ -5,6 +5,7 @@ This `dataprep` microservice accepts the following from the user and ingests the - Videos (mp4 files) and their transcripts (optional) - Images (gif, jpg, jpeg, and png files) and their captions (optional) - Audio (wav files) +- PDFs (with text and images) ## 🚀1. Start Microservice with Python(Option 1) @@ -111,18 +112,19 @@ docker container logs -f dataprep-multimodal-redis ## 🚀4. Consume Microservice -Once this dataprep microservice is started, user can use the below commands to invoke the microservice to convert images and videos and their transcripts (optional) to embeddings and save to the Redis vector store. +Once this dataprep microservice is started, user can use the below commands to invoke the microservice to convert images, videos, text, and PDF files to embeddings and save to the Redis vector store. 
This microservice provides 3 different ways for users to ingest files into Redis vector store corresponding to the 3 use cases. ### 4.1 Consume _ingest_with_text_ API -**Use case:** This API is used when videos are accompanied by transcript files (`.vtt` format) or images are accompanied by text caption files (`.txt` format). +**Use case:** This API is used for videos accompanied by transcript files (`.vtt` format), images accompanied by text caption files (`.txt` format), and PDF files containing a mix of text and images. **Important notes:** - Make sure the file paths after `files=@` are correct. - Every transcript or caption file's name must be identical to its corresponding video or image file's name (except their extension - .vtt goes with .mp4 and .txt goes with .jpg, .jpeg, .png, or .gif). For example, `video1.mp4` and `video1.vtt`. Otherwise, if `video1.vtt` is not included correctly in the API call, the microservice will return an error `No captions file video1.vtt found for video1.mp4`. +- It is assumed that PDFs will contain at least one image. Each image in the file will be embedded along with the text that appears on the same page as the image. #### Single video-transcript pair upload @@ -157,6 +159,7 @@ curl -X POST \ -F "files=@./image1.txt" \ -F "files=@./image2.jpg" \ -F "files=@./image2.txt" \ + -F "files=@./example.pdf" \ http://localhost:6007/v1/ingest_with_text ``` diff --git a/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py b/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py index fee9ac0c82..f56929c72c 100644 --- a/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py +++ b/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py @@ -1,6 +1,8 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import base64 +import json import os import shutil import time @@ -30,6 +32,7 @@ write_vtt, ) from PIL import Image +import pymupdf from comps import opea_microservices, register_microservice from comps.embeddings.multimodal.bridgetower.bridgetower_embedding import BridgeTowerEmbedding @@ -301,7 +304,53 @@ def prepare_data_and_metadata_from_annotation( return text_list, image_list, metadatas -def ingest_multimodal(videoname, data_folder, embeddings): +def prepare_pdf_data_from_annotation(annotation, path_to_frames, title): + """PDF data processing has some key differences from videos and images. + + 1. Neighboring frames' transcripts are not currently considered relevant. + We are only taking the text located on the same page as the image. + 2. The images/frames are indexed differently, by page and image-within-page + indices, as opposed to a single frame index. + 3. Instead of time of frame in ms, we return the PDF page index through + the pre-existing time_of_frame_ms metadata key to maintain compatibility. 
+ """ + text_list = [] + image_list = [] + metadatas = [] + for frame in annotation: + page_index = frame["frame_no"] + image_index = frame["sub_video_id"] + path_to_frame = os.path.join(path_to_frames, f"page{page_index}_image{image_index}.png") + caption_for_ingesting = frame["caption"] + caption_for_inference = frame["caption"] + + video_id = frame["video_id"] + b64_img_str = frame["b64_img_str"] + embedding_type = "pair" if b64_img_str else "text" + source_video = frame["video_name"] + + text_list.append(caption_for_ingesting) + + if b64_img_str: + image_list.append(path_to_frame) + + metadatas.append( + { + "content": caption_for_ingesting, + "b64_img_str": b64_img_str, + "video_id": video_id, + "source_video": source_video, + "time_of_frame_ms": page_index, # For PDFs save the page number + "embedding_type": embedding_type, + "title": title, + "transcript_for_inference": caption_for_inference, + } + ) + + return text_list, image_list, metadatas + + +def ingest_multimodal(filename, data_folder, embeddings, is_pdf=False): """Ingest text image pairs to Redis from the data/ directory that consists of frames and annotations.""" data_folder = os.path.abspath(data_folder) annotation_file_path = os.path.join(data_folder, "annotations.json") @@ -310,10 +359,13 @@ def ingest_multimodal(videoname, data_folder, embeddings): annotation = load_json_file(annotation_file_path) # prepare data to ingest - text_list, image_list, metadatas = prepare_data_and_metadata_from_annotation(annotation, path_to_frames, videoname) + if is_pdf: + text_list, image_list, metadatas = prepare_pdf_data_from_annotation(annotation, path_to_frames, filename) + else: + text_list, image_list, metadatas = prepare_data_and_metadata_from_annotation(annotation, path_to_frames, filename) MultimodalRedis.from_text_image_pairs_return_keys( - texts=[f"From {videoname}. " + text for text in text_list], + texts=[f"From {filename}. " + text for text in text_list], images=image_list, embedding=embeddings, metadatas=metadatas, @@ -510,7 +562,7 @@ async def ingest_generate_caption(files: List[UploadFile] = File(None)): ) async def ingest_with_text(files: List[UploadFile] = File(None)): if files: - accepted_media_formats = [".mp4", ".png", ".jpg", ".jpeg", ".gif"] + accepted_media_formats = [".mp4", ".png", ".jpg", ".jpeg", ".gif", ".pdf"] # Create a lookup dictionary containing all media files matched_files = {f.filename: [f] for f in files if os.path.splitext(f.filename)[1] in accepted_media_formats} uploaded_files_map = {} @@ -537,25 +589,25 @@ async def ingest_with_text(files: List[UploadFile] = File(None)): elif file_extension not in accepted_media_formats: print(f"Skipping file {file.filename} because of unsupported format.") - # Check if every media file has a caption file - for media_file_name, file_pair in matched_files.items(): - if len(file_pair) != 2: + # Check that every media file that is not a pdf has a caption file + for media_file_name, file_list in matched_files.items(): + if len(file_list) != 2 and os.path.splitext(media_file_name)[1] != ".pdf": raise HTTPException(status_code=400, detail=f"No caption file found for {media_file_name}") if len(matched_files.keys()) == 0: return HTTPException( status_code=400, - detail="The uploaded files have unsupported formats. Please upload at least one video file (.mp4) with captions (.vtt) or one image (.png, .jpg, .jpeg, or .gif) with caption (.txt)", + detail="The uploaded files have unsupported formats. 
Please upload at least one video file (.mp4) with captions (.vtt) or one image (.png, .jpg, .jpeg, or .gif) with caption (.txt) or one .pdf file", ) for media_file in matched_files: print(f"Processing file {media_file}") + file_name, file_extension = os.path.splitext(media_file) # Assign unique identifier to file file_id = generate_id() # Create file name by appending identifier - file_name, file_extension = os.path.splitext(media_file) media_file_name = f"{file_name}_{file_id}{file_extension}" media_dir_name = os.path.splitext(media_file_name)[0] @@ -564,25 +616,72 @@ async def ingest_with_text(files: List[UploadFile] = File(None)): shutil.copyfileobj(matched_files[media_file][0].file, f) uploaded_files_map[file_name] = media_file_name - # Save caption file in upload directory - caption_file_extension = os.path.splitext(matched_files[media_file][1].filename)[1] - caption_file = f"{media_dir_name}{caption_file_extension}" - with open(os.path.join(upload_folder, caption_file), "wb") as f: - shutil.copyfileobj(matched_files[media_file][1].file, f) + if file_extension == ".pdf": + # Set up location to store pdf images and text, reusing "frames" and "annotations" from video + output_dir = os.path.join(upload_folder, media_dir_name) + os.makedirs(output_dir, exist_ok=True) + os.makedirs(os.path.join(output_dir, "frames"), exist_ok=True) + doc = pymupdf.open(os.path.join(upload_folder, media_file_name)) + annotations = [] + for page_idx, page in enumerate(doc, start=1): + text = page.get_text() + images = page.get_images() + for image_idx, image in enumerate(images, start=1): + # Write image and caption file for each image found in pdf + img_fname = f"page{page_idx}_image{image_idx}" + img_fpath = os.path.join(output_dir, "frames", img_fname + ".png") + pix = pymupdf.Pixmap(doc, image[0]) # create pixmap + + if pix.n - pix.alpha > 3: # if CMYK, convert to RGB first + pix = pymupdf.Pixmap(pymupdf.csRGB, pix) + + pix.save(img_fpath) # pixmap to png + pix = None + + # Convert image to base64 encoded string + with open(img_fpath, "rb") as image2str: + encoded_string = base64.b64encode(image2str.read()) # png to bytes + + decoded_string = encoded_string.decode() # bytes to string + + # Create annotations file, reusing metadata keys from video + annotations.append( + { + "video_id": file_id, + "video_name": os.path.basename(os.path.join(upload_folder, media_file_name)), + "b64_img_str": decoded_string, + "caption": text, + "time": 0.0, + "frame_no": page_idx, + "sub_video_id": image_idx, + } + ) + + with open(os.path.join(output_dir, "annotations.json"), "w") as f: + json.dump(annotations, f) + + # Ingest multimodal data into redis + ingest_multimodal(file_name, os.path.join(upload_folder, media_dir_name), embeddings, is_pdf=True) + else: + # Save caption file in upload directory + caption_file_extension = os.path.splitext(matched_files[media_file][1].filename)[1] + caption_file = f"{media_dir_name}{caption_file_extension}" + with open(os.path.join(upload_folder, caption_file), "wb") as f: + shutil.copyfileobj(matched_files[media_file][1].file, f) - # Store frames and caption annotations in a new directory - extract_frames_and_annotations_from_transcripts( - file_id, - os.path.join(upload_folder, media_file_name), - os.path.join(upload_folder, caption_file), - os.path.join(upload_folder, media_dir_name), - ) + # Store frames and caption annotations in a new directory + extract_frames_and_annotations_from_transcripts( + file_id, + os.path.join(upload_folder, media_file_name), + 
os.path.join(upload_folder, caption_file), + os.path.join(upload_folder, media_dir_name), + ) - # Delete temporary caption file - os.remove(os.path.join(upload_folder, caption_file)) + # Delete temporary caption file + os.remove(os.path.join(upload_folder, caption_file)) - # Ingest multimodal data into redis - ingest_multimodal(file_name, os.path.join(upload_folder, media_dir_name), embeddings) + # Ingest multimodal data into redis + ingest_multimodal(file_name, os.path.join(upload_folder, media_dir_name), embeddings) # Delete temporary media directory containing frames and annotations shutil.rmtree(os.path.join(upload_folder, media_dir_name)) diff --git a/comps/dataprep/multimodal/redis/langchain/requirements.txt b/comps/dataprep/multimodal/redis/langchain/requirements.txt index b368bb2336..55c8ba3c87 100644 --- a/comps/dataprep/multimodal/redis/langchain/requirements.txt +++ b/comps/dataprep/multimodal/redis/langchain/requirements.txt @@ -11,6 +11,7 @@ opentelemetry-sdk Pillow prometheus-fastapi-instrumentator pydantic +pymupdf python-multipart redis shortuuid diff --git a/tests/dataprep/test_dataprep_multimodal_redis_langchain.sh b/tests/dataprep/test_dataprep_multimodal_redis_langchain.sh index 664f72c628..b19cee10a7 100644 --- a/tests/dataprep/test_dataprep_multimodal_redis_langchain.sh +++ b/tests/dataprep/test_dataprep_multimodal_redis_langchain.sh @@ -20,6 +20,8 @@ audio_fn="${tmp_dir}/${audio_name}.wav" image_name="apple" image_fn="${tmp_dir}/${image_name}.png" caption_fn="${tmp_dir}/${image_name}.txt" +pdf_name="nke-10k-2023" +pdf_fn="${tmp_dir}/${pdf_name}.pdf" function build_docker_images() { cd $WORKPATH @@ -132,6 +134,9 @@ tire.""" > ${transcript_fn} echo "Downloading Audio" wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav -O ${audio_fn} + echo "Downloading PDF" + wget https://raw.githubusercontent.com/opea-project/GenAIComps/main/comps/retrievers/redis/data/nke-10k-2023.pdf -O ${pdf_fn} + } function validate_microservice() { @@ -256,6 +261,30 @@ function validate_microservice() { echo "[ $SERVICE_NAME ] Content is as expected." fi + # test v1/ingest_with_text with a PDF file + echo "Testing ingest_with_text API with a PDF file" + URL="http://${ip_address}:$dataprep_service_port/v1/ingest_with_text" + + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@$pdf_fn" -H 'Content-Type: multipart/form-data' "$URL") + HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') + SERVICE_NAME="dataprep - upload - file" + + if [ "$HTTP_STATUS" -ne "200" ]; then + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-multimodal-redis >> ${LOG_PATH}/dataprep_upload_file.log + exit 1 + else + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + fi + if [[ "$RESPONSE_BODY" != *"Data preparation succeeded"* ]]; then + echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs test-comps-dataprep-multimodal-redis >> ${LOG_PATH}/dataprep_upload_file.log + exit 1 + else + echo "[ $SERVICE_NAME ] Content is as expected." 
+ fi + # test v1/generate_captions upload video file echo "Testing generate_captions API with video" URL="http://${ip_address}:$dataprep_service_port/v1/generate_captions" @@ -319,7 +348,7 @@ function validate_microservice() { else echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." fi - if [[ "$RESPONSE_BODY" != *${image_name}* || "$RESPONSE_BODY" != *${video_name}* || "$RESPONSE_BODY" != *${audio_name}* ]]; then + if [[ "$RESPONSE_BODY" != *${image_name}* || "$RESPONSE_BODY" != *${video_name}* || "$RESPONSE_BODY" != *${audio_name}* || "$RESPONSE_BODY" != *${pdf_name}* ]]; then echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" docker logs test-comps-dataprep-multimodal-redis >> ${LOG_PATH}/dataprep_file.log exit 1 From cc4f41fc0f4905399c87bfc0d335b5d0c4f808fc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 10 Jan 2025 22:36:25 +0000 Subject: [PATCH 04/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../redis/langchain/prepare_videodoc_redis.py | 30 ++++++++++++++----- comps/lvms/llava/dependency/llava_server.py | 7 ++--- comps/lvms/llava/lvm.py | 2 +- comps/lvms/tgi-llava/lvm_tgi.py | 10 ++++--- 4 files changed, 32 insertions(+), 17 deletions(-) diff --git a/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py b/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py index 3267f3d6ff..6ad1e35625 100644 --- a/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py +++ b/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py @@ -10,6 +10,7 @@ from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Type, Union +import pymupdf from config import EMBED_MODEL, INDEX_NAME, INDEX_SCHEMA, LVM_ENDPOINT, REDIS_URL, WHISPER_MODEL from fastapi import File, HTTPException, UploadFile from langchain_community.utilities.redis import _array_to_buffer @@ -32,7 +33,6 @@ write_vtt, ) from PIL import Image -import pymupdf from comps import opea_microservices, register_microservice from comps.third_parties.bridgetower.src.bridgetower_embedding import BridgeTowerEmbedding @@ -306,7 +306,7 @@ def prepare_data_and_metadata_from_annotation( def prepare_pdf_data_from_annotation(annotation, path_to_frames, title): """PDF data processing has some key differences from videos and images. - + 1. Neighboring frames' transcripts are not currently considered relevant. We are only taking the text located on the same page as the image. 2. The images/frames are indexed differently, by page and image-within-page @@ -362,7 +362,9 @@ def ingest_multimodal(filename, data_folder, embeddings, is_pdf=False): if is_pdf: text_list, image_list, metadatas = prepare_pdf_data_from_annotation(annotation, path_to_frames, filename) else: - text_list, image_list, metadatas = prepare_data_and_metadata_from_annotation(annotation, path_to_frames, filename) + text_list, image_list, metadatas = prepare_data_and_metadata_from_annotation( + annotation, path_to_frames, filename + ) MultimodalRedis.from_text_image_pairs_return_keys( texts=[f"From {filename}. 
" + text for text in text_list], @@ -387,7 +389,10 @@ def drop_index(index_name, redis_url=REDIS_URL): @register_microservice( - name="opea_service@prepare_videodoc_redis", endpoint="/v1/generate_transcripts", host="0.0.0.0", port=int(os.getenv("DATAPREP_MMR_PORT", 6007)) + name="opea_service@prepare_videodoc_redis", + endpoint="/v1/generate_transcripts", + host="0.0.0.0", + port=int(os.getenv("DATAPREP_MMR_PORT", 6007)), ) async def ingest_generate_transcripts(files: List[UploadFile] = File(None)): """Upload videos or audio files with speech, generate transcripts using whisper and ingest into redis.""" @@ -496,7 +501,10 @@ async def ingest_generate_transcripts(files: List[UploadFile] = File(None)): @register_microservice( - name="opea_service@prepare_videodoc_redis", endpoint="/v1/generate_captions", host="0.0.0.0", port=int(os.getenv("DATAPREP_MMR_PORT", 6007)) + name="opea_service@prepare_videodoc_redis", + endpoint="/v1/generate_captions", + host="0.0.0.0", + port=int(os.getenv("DATAPREP_MMR_PORT", 6007)), ) async def ingest_generate_caption(files: List[UploadFile] = File(None)): """Upload images and videos without speech (only background music or no audio), generate captions using lvm microservice and ingest into redis.""" @@ -639,7 +647,7 @@ async def ingest_with_text(files: List[UploadFile] = File(None)): pix = None # Convert image to base64 encoded string - with open(img_fpath, "rb") as image2str: + with open(img_fpath, "rb") as image2str: encoded_string = base64.b64encode(image2str.read()) # png to bytes decoded_string = encoded_string.decode() # bytes to string @@ -701,7 +709,10 @@ async def ingest_with_text(files: List[UploadFile] = File(None)): @register_microservice( - name="opea_service@prepare_videodoc_redis", endpoint="/v1/dataprep/get_files", host="0.0.0.0", port=int(os.getenv("DATAPREP_MMR_PORT", 6007)) + name="opea_service@prepare_videodoc_redis", + endpoint="/v1/dataprep/get_files", + host="0.0.0.0", + port=int(os.getenv("DATAPREP_MMR_PORT", 6007)), ) async def rag_get_file_structure(): """Returns list of names of uploaded videos saved on the server.""" @@ -715,7 +726,10 @@ async def rag_get_file_structure(): @register_microservice( - name="opea_service@prepare_videodoc_redis", endpoint="/v1/dataprep/delete_files", host="0.0.0.0", port=int(os.getenv("DATAPREP_MMR_PORT", 6007)) + name="opea_service@prepare_videodoc_redis", + endpoint="/v1/dataprep/delete_files", + host="0.0.0.0", + port=int(os.getenv("DATAPREP_MMR_PORT", 6007)), ) async def delete_files(): """Delete all uploaded files along with redis index.""" diff --git a/comps/lvms/llava/dependency/llava_server.py b/comps/lvms/llava/dependency/llava_server.py index cd74328b87..5540a02b62 100644 --- a/comps/lvms/llava/dependency/llava_server.py +++ b/comps/lvms/llava/dependency/llava_server.py @@ -4,9 +4,9 @@ import argparse import base64 +import os import time from io import BytesIO -import os import PIL.Image import requests @@ -14,8 +14,7 @@ import uvicorn from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response -from transformers import AutoProcessor -from transformers import pipeline +from transformers import AutoProcessor, pipeline from transformers.image_utils import load_image model_name_or_path = None @@ -263,7 +262,7 @@ async def generate(request: Request) -> Response: # FIXME batch_size=1 for now "content": [ {"type": "image"}, {"type": "text", "text": "What's the content of the image?"}, - ], + ], }, ] text_prompt = processor.apply_chat_template(conversation) diff --git 
a/comps/lvms/llava/lvm.py b/comps/lvms/llava/lvm.py index 05ad07ae50..7b1b6e4cbe 100644 --- a/comps/lvms/llava/lvm.py +++ b/comps/lvms/llava/lvm.py @@ -81,7 +81,7 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc # Limit the number of images being sent to the LVM if isinstance(img_b64_str, list) and len(img_b64_str) > max_images: - img_b64_str=img_b64_str[-max_images:] + img_b64_str = img_b64_str[-max_images:] # Adjust the number of images tags in the prompt image_tag = "\n" diff --git a/comps/lvms/tgi-llava/lvm_tgi.py b/comps/lvms/tgi-llava/lvm_tgi.py index 81f07945e9..58244062bb 100644 --- a/comps/lvms/tgi-llava/lvm_tgi.py +++ b/comps/lvms/tgi-llava/lvm_tgi.py @@ -100,21 +100,23 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc # Work around an issue where LLaVA-NeXT is not providing good responses when prompted without an image. # Provide an image and then instruct the model to ignore the image. The base64 string below is the encoded png: # https://raw.githubusercontent.com/opea-project/GenAIExamples/refs/tags/v1.0/AudioQnA/ui/svelte/src/lib/assets/icons/png/audio1.png - img_b64_str = ["iVBORw0KGgoAAAANSUhEUgAAADUAAAAlCAYAAADiMKHrAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAKPSURBVHgB7Zl/btowFMefnUTqf+MAHYMTjN4gvcGOABpM+8E0doLSE4xpsE3rKuAG3KC5Ad0J6MYOkP07YnvvhR9y0lVzupTIVT5SwDjB9fd97WfsMkCef1rUXM8dY9HHK4hWUevzi/oVWAqnF8fzLmAtiPA3Aq0lFsVA1fRKxlgNLIbDPaQUZQuu6YO98aIipHOiFGtIqaYfn1UnUCDds6WPyeANlTFbv9WztbFTK+HNUVAPiz7nbPzq7HsPCoKWIBREGfsJXZit5xT07X0jp6iRdIbEHOnjyyD97OvzH00lVS2K5OS2ax11cBXxJgYxlEIE6XZclzdTX6n8XjkkcEIfbj2nMO0/SNd1vy4vsCNjYPyEovfyy88GZIQCSKOCMf6ORgStoboLJuSWKDYCfK2q4jjrMZ+GOh7Pib/gek5DHxVUJtcgA7mJ4kwZRbN7viQXFzQn0Nl52gXG4Fo7DKAYp0yI3VHQ16oaWV0wYa+iGE8nG+wAdx5DzpS/KGyhFGULpShbKEXZQinqLlBK/IKc2asoh4sZvoXJWhlAzuxV1KBVD3HrfYTFAK8ZHgu0hu36DHLG+Izinw250WUkXHJht02QUnxLP7fZxR7f1I6S7Ir2GgmYvIQM5OYUuYBdainATq2ZjTqPBlnbGXYeBrg9Od18DKmc1U0jpw4OIIwEJFxQSl2b4MN2lf74fw8nFNbHt/5N9xWKTZvJ2S6YZk6RC3j2cKpVhSIShZ0mea6caCOCAjyNHd5gPPxGncMBTvI6hunYdaJ6kf8VoSCP2odxX6RkR6NOtanfj13EswKVqEQrPzzFL1lK+YvCFraiEqs8TrwQLGYraqpX4kr/Hixml+63Z+CoM9DTo438AUmP+KyMWT+tAAAAAElFTkSuQmCC"] + img_b64_str = [ + "iVBORw0KGgoAAAANSUhEUgAAADUAAAAlCAYAAADiMKHrAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAKPSURBVHgB7Zl/btowFMefnUTqf+MAHYMTjN4gvcGOABpM+8E0doLSE4xpsE3rKuAG3KC5Ad0J6MYOkP07YnvvhR9y0lVzupTIVT5SwDjB9fd97WfsMkCef1rUXM8dY9HHK4hWUevzi/oVWAqnF8fzLmAtiPA3Aq0lFsVA1fRKxlgNLIbDPaQUZQuu6YO98aIipHOiFGtIqaYfn1UnUCDds6WPyeANlTFbv9WztbFTK+HNUVAPiz7nbPzq7HsPCoKWIBREGfsJXZit5xT07X0jp6iRdIbEHOnjyyD97OvzH00lVS2K5OS2ax11cBXxJgYxlEIE6XZclzdTX6n8XjkkcEIfbj2nMO0/SNd1vy4vsCNjYPyEovfyy88GZIQCSKOCMf6ORgStoboLJuSWKDYCfK2q4jjrMZ+GOh7Pib/gek5DHxVUJtcgA7mJ4kwZRbN7viQXFzQn0Nl52gXG4Fo7DKAYp0yI3VHQ16oaWV0wYa+iGE8nG+wAdx5DzpS/KGyhFGULpShbKEXZQinqLlBK/IKc2asoh4sZvoXJWhlAzuxV1KBVD3HrfYTFAK8ZHgu0hu36DHLG+Izinw250WUkXHJht02QUnxLP7fZxR7f1I6S7Ir2GgmYvIQM5OYUuYBdainATq2ZjTqPBlnbGXYeBrg9Od18DKmc1U0jpw4OIIwEJFxQSl2b4MN2lf74fw8nFNbHt/5N9xWKTZvJ2S6YZk6RC3j2cKpVhSIShZ0mea6caCOCAjyNHd5gPPxGncMBTvI6hunYdaJ6kf8VoSCP2odxX6RkR6NOtanfj13EswKVqEQrPzzFL1lK+YvCFraiEqs8TrwQLGYraqpX4kr/Hixml+63Z+CoM9DTo438AUmP+KyMWT+tAAAAAElFTkSuQmCC" + ] prompt = f"Please disregard the image and answer the question. 
{prompt}" # Truncate the list of images if we have too many, only sending the most recent ones at the end of the list if len(img_b64_str) > max_images: - img_b64_str=img_b64_str[-max_images:] + img_b64_str = img_b64_str[-max_images:] # Check the number of image tags in the prompt and adjust them to match the number of images that we have image_tag = "\n" num_tags_in_prompt = prompt.count(image_tag) # We have too many image tags in the prompt replace the first x instance of the tag with an empty string - if len(img_b64_str) < num_tags_in_prompt: + if len(img_b64_str) < num_tags_in_prompt: prompt = prompt.replace(image_tag, "", num_tags_in_prompt - len(img_b64_str)) - + # We don't have enough image tags in the prompt, add them if len(img_b64_str) > num_tags_in_prompt: num_tags_to_add = len(img_b64_str) - num_tags_in_prompt From 7b045f68557cbfa41066e5cf03e5eaa83f0036db Mon Sep 17 00:00:00 2001 From: Melanie Hart Buehler Date: Fri, 10 Jan 2025 16:28:50 -0800 Subject: [PATCH 05/10] Fixed download location to permanent link (#18) Signed-off-by: Melanie Buehler --- tests/dataprep/test_dataprep_multimodal_redis_langchain.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/dataprep/test_dataprep_multimodal_redis_langchain.sh b/tests/dataprep/test_dataprep_multimodal_redis_langchain.sh index af4424c63a..ba9a5c6ddf 100644 --- a/tests/dataprep/test_dataprep_multimodal_redis_langchain.sh +++ b/tests/dataprep/test_dataprep_multimodal_redis_langchain.sh @@ -135,7 +135,7 @@ tire.""" > ${transcript_fn} wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav -O ${audio_fn} echo "Downloading PDF" - wget https://raw.githubusercontent.com/opea-project/GenAIComps/main/comps/retrievers/redis/data/nke-10k-2023.pdf -O ${pdf_fn} + wget https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf -O ${pdf_fn} } From d7a73cba23702ed7559bcfab638c538e4162744c Mon Sep 17 00:00:00 2001 From: Dina Suehiro Jones Date: Mon, 13 Jan 2025 11:20:42 -0800 Subject: [PATCH 06/10] Multimodal retriever redis test fixes (#19) * Fixing Multimodal Retriever Redis tests Signed-off-by: dmsuehir * Code cleanup Signed-off-by: dmsuehir * Remove debug changes Signed-off-by: dmsuehir * Formatting Signed-off-by: dmsuehir --------- Signed-off-by: dmsuehir --- .../redis/langchain/retriever_redis.py | 99 ---------------- ...t_retrievers_multimodal_redis_langchain.sh | 110 ------------------ tests/retrievers/test_retrievers_redis.sh | 34 ++++-- 3 files changed, 24 insertions(+), 219 deletions(-) delete mode 100644 comps/retrievers/multimodal/redis/langchain/retriever_redis.py delete mode 100644 tests/retrievers/test_retrievers_multimodal_redis_langchain.sh diff --git a/comps/retrievers/multimodal/redis/langchain/retriever_redis.py b/comps/retrievers/multimodal/redis/langchain/retriever_redis.py deleted file mode 100644 index a92d59aba2..0000000000 --- a/comps/retrievers/multimodal/redis/langchain/retriever_redis.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import time -from typing import Union - -from langchain_community.vectorstores import Redis -from multimodal_config import INDEX_NAME, INDEX_SCHEMA, REDIS_URL - -from comps import ( - EmbedMultimodalDoc, - SearchedMultimodalDoc, - ServiceType, - TextDoc, - opea_microservices, - register_microservice, - register_statistics, - statistics_dict, -) -from 
comps.cores.proto.api_protocol import ( - ChatCompletionRequest, - RetrievalRequest, - RetrievalResponse, - RetrievalResponseData, -) -from comps.embeddings.multimodal.bridgetower import BridgeTowerEmbedding - - -@register_microservice( - name="opea_service@multimodal_retriever_redis", - service_type=ServiceType.RETRIEVER, - endpoint="/v1/multimodal_retrieval", - host="0.0.0.0", - port=7000, -) -@register_statistics(names=["opea_service@multimodal_retriever_redis"]) -async def retrieve( - input: Union[EmbedMultimodalDoc, RetrievalRequest, ChatCompletionRequest] -) -> Union[SearchedMultimodalDoc, RetrievalResponse, ChatCompletionRequest]: - - start = time.time() - # check if the Redis index has data - if vector_db.client.keys() == []: - search_res = [] - else: - # if the Redis index has data, perform the search - if input.search_type == "similarity": - search_res = await vector_db.asimilarity_search_by_vector(embedding=input.embedding, k=input.k) - elif input.search_type == "similarity_distance_threshold": - if input.distance_threshold is None: - raise ValueError("distance_threshold must be provided for " + "similarity_distance_threshold retriever") - search_res = await vector_db.asimilarity_search_by_vector( - embedding=input.embedding, k=input.k, distance_threshold=input.distance_threshold - ) - elif input.search_type == "similarity_score_threshold": - docs_and_similarities = await vector_db.asimilarity_search_with_relevance_scores( - query=input.text, k=input.k, score_threshold=input.score_threshold - ) - search_res = [doc for doc, _ in docs_and_similarities] - elif input.search_type == "mmr": - search_res = await vector_db.amax_marginal_relevance_search( - query=input.text, k=input.k, fetch_k=input.fetch_k, lambda_mult=input.lambda_mult - ) - else: - raise ValueError(f"{input.search_type} not valid") - - # return different response format - retrieved_docs = [] - if isinstance(input, EmbedMultimodalDoc): - metadata_list = [] - for r in search_res: - # If the input had an image, pass that through in the metadata along with the search result image - if input.base64_image: - if r.metadata["b64_img_str"]: - r.metadata["b64_img_str"] = [input.base64_image, r.metadata["b64_img_str"]] - else: - r.metadata["b64_img_str"] = input.base64_image - metadata_list.append(r.metadata) - retrieved_docs.append(TextDoc(text=r.page_content)) - result = SearchedMultimodalDoc(retrieved_docs=retrieved_docs, initial_query=input.text, metadata=metadata_list) - else: - for r in search_res: - retrieved_docs.append(RetrievalResponseData(text=r.page_content, metadata=r.metadata)) - if isinstance(input, RetrievalRequest): - result = RetrievalResponse(retrieved_docs=retrieved_docs) - elif isinstance(input, ChatCompletionRequest): - input.retrieved_docs = retrieved_docs - input.documents = [doc.text for doc in retrieved_docs] - result = input - - statistics_dict["opea_service@multimodal_retriever_redis"].append_latency(time.time() - start, None) - return result - - -if __name__ == "__main__": - - embeddings = BridgeTowerEmbedding() - vector_db = Redis(embedding=embeddings, index_name=INDEX_NAME, index_schema=INDEX_SCHEMA, redis_url=REDIS_URL) - opea_microservices["opea_service@multimodal_retriever_redis"].start() diff --git a/tests/retrievers/test_retrievers_multimodal_redis_langchain.sh b/tests/retrievers/test_retrievers_multimodal_redis_langchain.sh deleted file mode 100644 index 06fecec69d..0000000000 --- a/tests/retrievers/test_retrievers_multimodal_redis_langchain.sh +++ /dev/null @@ -1,110 +0,0 @@ -#!/bin/bash -# 
Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -x - -WORKPATH=$(dirname "$PWD") -LOG_PATH="$WORKPATH/tests" -ip_address=$(hostname -I | awk '{print $1}') - -function build_docker_images() { - cd $WORKPATH - docker build --no-cache -t opea/retriever-multimodal-redis:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/multimodal/redis/langchain/Dockerfile . - if [ $? -ne 0 ]; then - echo "opea/retriever-multimodal-redis built fail" - exit 1 - else - echo "opea/retriever-multimodal-redis built successful" - fi -} - -function start_service() { - # redis - docker run -d --name test-comps-multimodal-retriever-redis-vector-db -p 5689:6379 -p 5011:8001 -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy redis/redis-stack:7.2.0-v9 - sleep 10s - - # redis retriever - export REDIS_URL="redis://${ip_address}:5689" - export INDEX_NAME="rag-redis" - retriever_port=5434 - unset http_proxy - docker run -d --name="test-comps-retriever-multimodal-redis" -p ${retriever_port}:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -e INDEX_NAME=$INDEX_NAME opea/retriever-multimodal-redis:comps - - sleep 5m -} - -function validate_microservice() { - retriever_port=5434 - export PATH="${HOME}/miniforge3/bin:$PATH" - source activate - URL="http://${ip_address}:$retriever_port/v1/multimodal_retrieval" - test_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(512)]; print(embedding)") - - HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL") - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ retriever ] HTTP status is 200. Checking content..." - local CONTENT=$(curl -s -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/retriever.log) - - if echo "$CONTENT" | grep -q "retrieved_docs"; then - echo "[ retriever ] Content is as expected." - else - echo "[ retriever ] Content does not match the expected result: $CONTENT" - docker logs test-comps-retriever-multimodal-redis >> ${LOG_PATH}/retriever.log - exit 1 - fi - else - echo "[ retriever ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs test-comps-retriever-multimodal-redis >> ${LOG_PATH}/retriever.log - exit 1 - fi - - # Test the retriever with a b64 image that should be passed through - HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding},\"img_b64_str\":\"iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC\"}" -H 'Content-Type: application/json' "$URL") - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ retriever ] HTTP status is 200. Checking content..." - local CONTENT=$(curl -s -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/retriever.log) - - if echo "$CONTENT" | grep -q "retrieved_docs"; then - echo "[ retriever ] Content has retrieved_docs as expected." - if echo "$CONTENT" | grep -q "b64_img_str"; then - echo "[ retriever ] Content has b64_img_str as expected." 
- else - echo "[ retriever ] Content does not include the b64_img_str: $CONTENT" - docker logs test-comps-retriever-multimodal-redis >> ${LOG_PATH}/retriever.log - exit 1 - fi - else - echo "[ retriever ] Content does not match the expected result: $CONTENT" - docker logs test-comps-retriever-multimodal-redis >> ${LOG_PATH}/retriever.log - exit 1 - fi - else - echo "[ retriever ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs test-comps-retriever-multimodal-redis >> ${LOG_PATH}/retriever.log - exit 1 - fi -} - -function stop_docker() { - cid_retrievers=$(docker ps -aq --filter "name=test-comps-*") - if [[ ! -z "$cid_retrievers" ]]; then - docker stop $cid_retrievers && docker rm $cid_retrievers && sleep 1s - fi -} - -function main() { - - stop_docker - - build_docker_images - start_service - - validate_microservice - - stop_docker - echo y | docker system prune - -} - -main diff --git a/tests/retrievers/test_retrievers_redis.sh b/tests/retrievers/test_retrievers_redis.sh index 5e7a5d87dd..393d5f5309 100644 --- a/tests/retrievers/test_retrievers_redis.sh +++ b/tests/retrievers/test_retrievers_redis.sh @@ -49,7 +49,7 @@ function start_multimodal_service() { # redis retriever export REDIS_URL="redis://${ip_address}:5689" - export INDEX_NAME="rag-redis" + export INDEX_NAME="mm-rag-redis" retriever_port=5435 unset http_proxy docker run -d --name="test-comps-retriever-redis-server" -p ${retriever_port}:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -e INDEX_NAME=$INDEX_NAME -e BRIDGE_TOWER_EMBEDDING=true -e LOGFLAG=true -e RETRIEVER_TYPE="redis" opea/retriever-redis:comps @@ -82,30 +82,43 @@ function validate_microservice() { docker logs test-comps-retriever-redis-server >> ${LOG_PATH}/retriever.log exit 1 fi +} + +function validate_mm_microservice() { + local test_embedding="$1" + + retriever_port=5435 + export PATH="${HOME}/miniforge3/bin:$PATH" + source activate + URL="http://${ip_address}:$retriever_port/v1/retrieval" # Test the retriever with a b64 image that should be passed through - HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding},\"img_b64_str\":\"iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC\"}" -H 'Content-Type: application/json' "$URL") + HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding},\"base64_image\":\"iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC\"}" -H 'Content-Type: application/json' "$URL") if [ "$HTTP_STATUS" -eq 200 ]; then echo "[ retriever ] HTTP status is 200. Checking content..." local CONTENT=$(curl -s -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/retriever.log) if echo "$CONTENT" | grep -q "retrieved_docs"; then echo "[ retriever ] Content has retrieved_docs as expected." - if echo "$CONTENT" | grep -q "b64_img_str"; then - echo "[ retriever ] Content has b64_img_str as expected." 
- else - echo "[ retriever ] Content does not include the b64_img_str: $CONTENT" - docker logs test-comps-retriever-multimodal-redis >> ${LOG_PATH}/retriever.log - exit 1 + empty_search_results=$(echo "$CONTENT" | grep "\"retrieved_docs\":\[\]") + if [ -z "$empty_search_results" ]; then + # If search results are not empty, check for b64 image string + if echo "$CONTENT" | grep -q "b64_img_str"; then + echo "[ retriever ] Content has b64_img_str as expected." + else + echo "[ retriever ] Content does not include the b64_img_str: $CONTENT" + docker logs test-comps-retriever-redis-server >> ${LOG_PATH}/retriever.log + exit 1 + fi fi else echo "[ retriever ] Content does not match the expected result: $CONTENT" - docker logs test-comps-retriever-multimodal-redis >> ${LOG_PATH}/retriever.log + docker logs test-comps-retriever-redis-server >> ${LOG_PATH}/retriever.log exit 1 fi else echo "[ retriever ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs test-comps-retriever-multimodal-redis >> ${LOG_PATH}/retriever.log + docker logs test-comps-retriever-redis-server >> ${LOG_PATH}/retriever.log exit 1 fi } @@ -132,6 +145,7 @@ function main() { start_multimodal_service test_embedding_multi=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(512)]; print(embedding)") validate_microservice "$test_embedding_multi" + validate_mm_microservice "$test_embedding_multi" # clean env stop_docker From 10c010109057aafc3131b974d3b2b47299497dea Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 13 Jan 2025 20:19:26 +0000 Subject: [PATCH 07/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/lvms/src/integrations/opea_llava.py | 4 ++-- comps/lvms/src/integrations/opea_tgi_llava.py | 10 ++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/comps/lvms/src/integrations/opea_llava.py b/comps/lvms/src/integrations/opea_llava.py index 61539b3bbe..95713dffeb 100644 --- a/comps/lvms/src/integrations/opea_llava.py +++ b/comps/lvms/src/integrations/opea_llava.py @@ -99,9 +99,9 @@ async def invoke( prompt = request.prompt max_new_tokens = request.max_new_tokens - # Limit the number of images being sent to the LVM + # Limit the number of images being sent to the LVM if isinstance(img_b64_str, list) and len(img_b64_str) > max_images: - img_b64_str=img_b64_str[-max_images:] + img_b64_str = img_b64_str[-max_images:] # Adjust the number of images tags in the prompt image_tag = "\n" diff --git a/comps/lvms/src/integrations/opea_tgi_llava.py b/comps/lvms/src/integrations/opea_tgi_llava.py index 35796ac0cf..0de6b7ede6 100644 --- a/comps/lvms/src/integrations/opea_tgi_llava.py +++ b/comps/lvms/src/integrations/opea_tgi_llava.py @@ -113,7 +113,7 @@ async def invoke( top_k = request.top_k top_p = request.top_p - # Make img_b64_str into a list of strings (if it's not already a list) + # Make img_b64_str into a list of strings (if it's not already a list) if not isinstance(img_b64_str, list): if img_b64_str: img_b64_str = [img_b64_str] @@ -122,19 +122,21 @@ async def invoke( # Work around an issue where LLaVA-NeXT is not providing good responses when prompted without an image. # Provide an image and then instruct the model to ignore the image. 
The base64 string below is the encoded png: # https://raw.githubusercontent.com/opea-project/GenAIExamples/refs/tags/v1.0/AudioQnA/ui/svelte/src/lib/assets/icons/png/audio1.png - img_b64_str = ["iVBORw0KGgoAAAANSUhEUgAAADUAAAAlCAYAAADiMKHrAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAKPSURBVHgB7Zl/btowFMefnUTqf+MAHYMTjN4gvcGOABpM+8E0doLSE4xpsE3rKuAG3KC5Ad0J6MYOkP07YnvvhR9y0lVzupTIVT5SwDjB9fd97WfsMkCef1rUXM8dY9HHK4hWUevzi/oVWAqnF8fzLmAtiPA3Aq0lFsVA1fRKxlgNLIbDPaQUZQuu6YO98aIipHOiFGtIqaYfn1UnUCDds6WPyeANlTFbv9WztbFTK+HNUVAPiz7nbPzq7HsPCoKWIBREGfsJXZit5xT07X0jp6iRdIbEHOnjyyD97OvzH00lVS2K5OS2ax11cBXxJgYxlEIE6XZclzdTX6n8XjkkcEIfbj2nMO0/SNd1vy4vsCNjYPyEovfyy88GZIQCSKOCMf6ORgStoboLJuSWKDYCfK2q4jjrMZ+GOh7Pib/gek5DHxVUJtcgA7mJ4kwZRbN7viQXFzQn0Nl52gXG4Fo7DKAYp0yI3VHQ16oaWV0wYa+iGE8nG+wAdx5DzpS/KGyhFGULpShbKEXZQinqLlBK/IKc2asoh4sZvoXJWhlAzuxV1KBVD3HrfYTFAK8ZHgu0hu36DHLG+Izinw250WUkXHJht02QUnxLP7fZxR7f1I6S7Ir2GgmYvIQM5OYUuYBdainATq2ZjTqPBlnbGXYeBrg9Od18DKmc1U0jpw4OIIwEJFxQSl2b4MN2lf74fw8nFNbHt/5N9xWKTZvJ2S6YZk6RC3j2cKpVhSIShZ0mea6caCOCAjyNHd5gPPxGncMBTvI6hunYdaJ6kf8VoSCP2odxX6RkR6NOtanfj13EswKVqEQrPzzFL1lK+YvCFraiEqs8TrwQLGYraqpX4kr/Hixml+63Z+CoM9DTo438AUmP+KyMWT+tAAAAAElFTkSuQmCC"] + img_b64_str = [ + "iVBORw0KGgoAAAANSUhEUgAAADUAAAAlCAYAAADiMKHrAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAKPSURBVHgB7Zl/btowFMefnUTqf+MAHYMTjN4gvcGOABpM+8E0doLSE4xpsE3rKuAG3KC5Ad0J6MYOkP07YnvvhR9y0lVzupTIVT5SwDjB9fd97WfsMkCef1rUXM8dY9HHK4hWUevzi/oVWAqnF8fzLmAtiPA3Aq0lFsVA1fRKxlgNLIbDPaQUZQuu6YO98aIipHOiFGtIqaYfn1UnUCDds6WPyeANlTFbv9WztbFTK+HNUVAPiz7nbPzq7HsPCoKWIBREGfsJXZit5xT07X0jp6iRdIbEHOnjyyD97OvzH00lVS2K5OS2ax11cBXxJgYxlEIE6XZclzdTX6n8XjkkcEIfbj2nMO0/SNd1vy4vsCNjYPyEovfyy88GZIQCSKOCMf6ORgStoboLJuSWKDYCfK2q4jjrMZ+GOh7Pib/gek5DHxVUJtcgA7mJ4kwZRbN7viQXFzQn0Nl52gXG4Fo7DKAYp0yI3VHQ16oaWV0wYa+iGE8nG+wAdx5DzpS/KGyhFGULpShbKEXZQinqLlBK/IKc2asoh4sZvoXJWhlAzuxV1KBVD3HrfYTFAK8ZHgu0hu36DHLG+Izinw250WUkXHJht02QUnxLP7fZxR7f1I6S7Ir2GgmYvIQM5OYUuYBdainATq2ZjTqPBlnbGXYeBrg9Od18DKmc1U0jpw4OIIwEJFxQSl2b4MN2lf74fw8nFNbHt/5N9xWKTZvJ2S6YZk6RC3j2cKpVhSIShZ0mea6caCOCAjyNHd5gPPxGncMBTvI6hunYdaJ6kf8VoSCP2odxX6RkR6NOtanfj13EswKVqEQrPzzFL1lK+YvCFraiEqs8TrwQLGYraqpX4kr/Hixml+63Z+CoM9DTo438AUmP+KyMWT+tAAAAAElFTkSuQmCC" + ] prompt = f"Please disregard the image and answer the question. 
{prompt}" # Truncate the list of images if we have too many, only sending the most recent ones at the end of the list if len(img_b64_str) > max_images: - img_b64_str=img_b64_str[-max_images:] + img_b64_str = img_b64_str[-max_images:] # Check the number of image tags in the prompt and adjust them to match the number of images that we have image_tag = "\n" num_tags_in_prompt = prompt.count(image_tag) # We have too many image tags in the prompt replace the first x instance of the tag with an empty string - if len(img_b64_str) < num_tags_in_prompt: + if len(img_b64_str) < num_tags_in_prompt: prompt = prompt.replace(image_tag, "", num_tags_in_prompt - len(img_b64_str)) # We don't have enough image tags in the prompt, add them From fc378d6a077d7164c63091a5a83fa02122348a79 Mon Sep 17 00:00:00 2001 From: Melanie Hart Buehler Date: Mon, 13 Jan 2025 12:53:13 -0800 Subject: [PATCH 08/10] Improved local variable names (#20) Signed-off-by: Melanie Buehler --- .../redis/langchain/prepare_videodoc_redis.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py b/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py index 6ad1e35625..dc52bdae3b 100644 --- a/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py +++ b/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py @@ -304,12 +304,12 @@ def prepare_data_and_metadata_from_annotation( return text_list, image_list, metadatas -def prepare_pdf_data_from_annotation(annotation, path_to_frames, title): +def prepare_pdf_data_from_annotation(annotation, path_to_files, title): """PDF data processing has some key differences from videos and images. - 1. Neighboring frames' transcripts are not currently considered relevant. + 1. Neighboring transcripts are not currently considered relevant. We are only taking the text located on the same page as the image. - 2. The images/frames are indexed differently, by page and image-within-page + 2. The images within PDFs are indexed by page and image-within-page indices, as opposed to a single frame index. 3. Instead of time of frame in ms, we return the PDF page index through the pre-existing time_of_frame_ms metadata key to maintain compatibility. 
@@ -317,29 +317,29 @@ def prepare_pdf_data_from_annotation(annotation, path_to_frames, title): text_list = [] image_list = [] metadatas = [] - for frame in annotation: - page_index = frame["frame_no"] - image_index = frame["sub_video_id"] - path_to_frame = os.path.join(path_to_frames, f"page{page_index}_image{image_index}.png") - caption_for_ingesting = frame["caption"] - caption_for_inference = frame["caption"] - - video_id = frame["video_id"] - b64_img_str = frame["b64_img_str"] + for item in annotation: + page_index = item["frame_no"] + image_index = item["sub_video_id"] + path_to_image = os.path.join(path_to_files, f"page{page_index}_image{image_index}.png") + caption_for_ingesting = item["caption"] + caption_for_inference = item["caption"] + + pdf_id = item["video_id"] + b64_img_str = item["b64_img_str"] embedding_type = "pair" if b64_img_str else "text" - source_video = frame["video_name"] + source = item["video_name"] text_list.append(caption_for_ingesting) if b64_img_str: - image_list.append(path_to_frame) + image_list.append(path_to_image) metadatas.append( { "content": caption_for_ingesting, "b64_img_str": b64_img_str, - "video_id": video_id, - "source_video": source_video, + "video_id": pdf_id, + "source_video": source, "time_of_frame_ms": page_index, # For PDFs save the page number "embedding_type": embedding_type, "title": title, From 20d6302c064d9961633bc4102eb46eac38d4fedb Mon Sep 17 00:00:00 2001 From: Omar Khleif Date: Mon, 13 Jan 2025 15:12:17 -0800 Subject: [PATCH 09/10] reverted configurable asr port (#21) Signed-off-by: okhleif-IL --- comps/asr/src/opea_asr_microservice.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/asr/src/opea_asr_microservice.py b/comps/asr/src/opea_asr_microservice.py index 95318b9bb6..decf602a69 100644 --- a/comps/asr/src/opea_asr_microservice.py +++ b/comps/asr/src/opea_asr_microservice.py @@ -34,7 +34,7 @@ service_type=ServiceType.ASR, endpoint="/v1/audio/transcriptions", host="0.0.0.0", - port=int(os.getenv("ASR_PORT", 9099)), + port=9099, input_datatype=Base64ByteStrDoc, output_datatype=LLMParamsDoc, ) From b1c9c764003f444cf1202259ad69fbdbfd96d562 Mon Sep 17 00:00:00 2001 From: Dina Suehiro Jones Date: Thu, 16 Jan 2025 15:11:35 -0800 Subject: [PATCH 10/10] Revert env var used for internal whisper port (#22) Signed-off-by: dmsuehir --- comps/asr/src/integrations/dependency/whisper/whisper_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/asr/src/integrations/dependency/whisper/whisper_server.py b/comps/asr/src/integrations/dependency/whisper/whisper_server.py index 110346c32e..5221dc9d50 100644 --- a/comps/asr/src/integrations/dependency/whisper/whisper_server.py +++ b/comps/asr/src/integrations/dependency/whisper/whisper_server.py @@ -106,7 +106,7 @@ async def audio_transcriptions( if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default="0.0.0.0") - parser.add_argument("--port", type=int, default=os.getenv("WHISPER_PORT", 7066)) + parser.add_argument("--port", type=int, default=7066) parser.add_argument("--model_name_or_path", type=str, default="openai/whisper-small") parser.add_argument("--language", type=str, default="english") parser.add_argument("--device", type=str, default="cpu")
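The last two patches pin the ASR microservice and its Whisper dependency server back to fixed internal ports (9099 and 7066 respectively), so a deployment that needs a different externally visible port should remap it when starting the container rather than relying on the removed ASR_PORT/WHISPER_PORT variables. A minimal sketch of that remapping is shown below; the image tags are hypothetical placeholders for locally built images, not names taken from this patch series.

```bash
# Remap host ports to the now-fixed internal ports: 7066 for the Whisper
# dependency server and 9099 for the ASR microservice.
# The image tags below are hypothetical examples of locally built images.
docker run -d --name whisper-dependency -p 7070:7066 opea/whisper:comps
docker run -d --name asr-microservice -p 9100:9099 opea/asr:comps
```

This keeps the port configuration at the container level while the services themselves listen on their hardcoded defaults.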