From dcec9d7bb7b530a01607f8edb7defe808f22b599 Mon Sep 17 00:00:00 2001
From: Xiao Ley <xiao.ley@outlook.com>
Date: Mon, 24 Jun 2024 01:06:26 +0800
Subject: [PATCH] feat: add new features to enhance image and link handling in
 Jina tool (#5517)

---
 .../builtin/jina/tools/jina_reader.py         | 19 ++++++---
 .../builtin/jina/tools/jina_reader.yaml       | 42 +++++++++++++++++++
 .../builtin/jina/tools/jina_search.py         |  9 ++++
 .../builtin/jina/tools/jina_search.yaml       | 42 +++++++++++++++++++
 4 files changed, 107 insertions(+), 5 deletions(-)

diff --git a/api/core/tools/provider/builtin/jina/tools/jina_reader.py b/api/core/tools/provider/builtin/jina/tools/jina_reader.py
index b0bd4788466132..0d0eaef25b1005 100644
--- a/api/core/tools/provider/builtin/jina/tools/jina_reader.py
+++ b/api/core/tools/provider/builtin/jina/tools/jina_reader.py
@@ -10,10 +10,10 @@
 class JinaReaderTool(BuiltinTool):
     _jina_reader_endpoint = 'https://r.jina.ai/'
 
-    def _invoke(self, 
+    def _invoke(self,
                 user_id: str,
-               tool_parameters: dict[str, Any], 
-        ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
+                tool_parameters: dict[str, Any],
+                ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
         """
             invoke tools
         """
@@ -34,6 +34,15 @@ def _invoke(self,
         if wait_for_selector is not None and wait_for_selector != '':
             headers['X-Wait-For-Selector'] = wait_for_selector
 
+        if tool_parameters.get('image_caption', False):
+            headers['X-With-Generated-Alt'] = 'true'
+
+        if tool_parameters.get('gather_all_links_at_the_end', False):
+            headers['X-With-Links-Summary'] = 'true'
+
+        if tool_parameters.get('gather_all_images_at_the_end', False):
+            headers['X-With-Images-Summary'] = 'true'
+
         proxy_server = tool_parameters.get('proxy_server', None)
         if proxy_server is not None and proxy_server != '':
             headers['X-Proxy-Url'] = proxy_server
@@ -42,12 +51,12 @@ def _invoke(self,
             headers['X-No-Cache'] = 'true'
 
         response = ssrf_proxy.get(
-            str(URL(self._jina_reader_endpoint + url)), 
+            str(URL(self._jina_reader_endpoint + url)),
             headers=headers,
             timeout=(10, 60)
         )
 
         if tool_parameters.get('summary', False):
             return self.create_text_message(self.summary(user_id, response.text))
-        
+
         return self.create_text_message(response.text)
diff --git a/api/core/tools/provider/builtin/jina/tools/jina_reader.yaml b/api/core/tools/provider/builtin/jina/tools/jina_reader.yaml
index 703fa3d389ad75..5eb2692ea555da 100644
--- a/api/core/tools/provider/builtin/jina/tools/jina_reader.yaml
+++ b/api/core/tools/provider/builtin/jina/tools/jina_reader.yaml
@@ -51,6 +51,48 @@ parameters:
       pt_BR: css selector for waiting for specific elements
     llm_description: css selector of the target element to wait for
     form: form
+  - name: image_caption
+    type: boolean
+    required: false
+    default: false
+    label:
+      en_US: Image caption
+      zh_Hans: 图片说明
+      pt_BR: Legenda da imagem
+    human_description:
+      en_US: "Captions all images at the specified URL, adding 'Image [idx]: [caption]' as an alt tag for those without one. This allows downstream LLMs to interact with the images in activities such as reasoning and summarizing."
+      zh_Hans: "为指定 URL 上的所有图像添加标题，为没有标题的图像添加“Image [idx]: [caption]”作为 alt 标签。这允许下游 LLM 在推理和总结等活动中与图像进行交互。"
+      pt_BR: "Captions all images at the specified URL, adding 'Image [idx]: [caption]' as an alt tag for those without one. This allows downstream LLMs to interact with the images in activities such as reasoning and summarizing."
+    llm_description: Captions all images at the specified URL
+    form: form
+  - name: gather_all_links_at_the_end
+    type: boolean
+    required: false
+    default: false
+    label:
+      en_US: Gather all links at the end
+      zh_Hans: 将所有链接集中到最后
+      pt_BR: Coletar todos os links ao final
+    human_description:
+      en_US: A "Buttons & Links" section will be created at the end. This helps downstream LLMs or web agents navigate the page or take further actions.
+      zh_Hans: 最后会创建一个“按钮和链接”部分。这可以帮助下游 LLM 或 Web 代理浏览页面或采取进一步的行动。
+      pt_BR: A "Buttons & Links" section will be created at the end. This helps downstream LLMs or web agents navigate the page or take further actions.
+    llm_description: Gather all links at the end
+    form: form
+  - name: gather_all_images_at_the_end
+    type: boolean
+    required: false
+    default: false
+    label:
+      en_US: Gather all images at the end
+      zh_Hans: 将所有图片集中到最后
+      pt_BR: Coletar todas as imagens ao final
+    human_description:
+      en_US: An "Images" section will be created at the end. This gives the downstream LLMs an overview of all visuals on the page, which may improve reasoning.
+      zh_Hans: 最后会创建一个“图像”部分。这可以让下游的 LLM 概览页面上的所有视觉效果，从而提高推理能力。
+      pt_BR: An "Images" section will be created at the end. This gives the downstream LLMs an overview of all visuals on the page, which may improve reasoning.
+    llm_description: Gather all images at the end
+    form: form
   - name: proxy_server
     type: string
     required: false
diff --git a/api/core/tools/provider/builtin/jina/tools/jina_search.py b/api/core/tools/provider/builtin/jina/tools/jina_search.py
index c13f58d0cd163c..3eda2c5a22e47b 100644
--- a/api/core/tools/provider/builtin/jina/tools/jina_search.py
+++ b/api/core/tools/provider/builtin/jina/tools/jina_search.py
@@ -24,6 +24,15 @@ def _invoke(
         if 'api_key' in self.runtime.credentials and self.runtime.credentials.get('api_key'):
             headers['Authorization'] = "Bearer " + self.runtime.credentials.get('api_key')
 
+        if tool_parameters.get('image_caption', False):
+            headers['X-With-Generated-Alt'] = 'true'
+
+        if tool_parameters.get('gather_all_links_at_the_end', False):
+            headers['X-With-Links-Summary'] = 'true'
+
+        if tool_parameters.get('gather_all_images_at_the_end', False):
+            headers['X-With-Images-Summary'] = 'true'
+
         proxy_server = tool_parameters.get('proxy_server', None)
         if proxy_server is not None and proxy_server != '':
             headers['X-Proxy-Url'] = proxy_server
diff --git a/api/core/tools/provider/builtin/jina/tools/jina_search.yaml b/api/core/tools/provider/builtin/jina/tools/jina_search.yaml
index f3b6c0737a9699..da0a300c6c7520 100644
--- a/api/core/tools/provider/builtin/jina/tools/jina_search.yaml
+++ b/api/core/tools/provider/builtin/jina/tools/jina_search.yaml
@@ -22,6 +22,48 @@ parameters:
       zh_Hans: 在网络上搜索信息
     llm_description: simple question to ask on the web
     form: llm
+  - name: image_caption
+    type: boolean
+    required: false
+    default: false
+    label:
+      en_US: Image caption
+      zh_Hans: 图片说明
+      pt_BR: Legenda da imagem
+    human_description:
+      en_US: "Captions all images at the specified URL, adding 'Image [idx]: [caption]' as an alt tag for those without one. This allows downstream LLMs to interact with the images in activities such as reasoning and summarizing."
+      zh_Hans: "为指定 URL 上的所有图像添加标题，为没有标题的图像添加“Image [idx]: [caption]”作为 alt 标签。这允许下游 LLM 在推理和总结等活动中与图像进行交互。"
+      pt_BR: "Captions all images at the specified URL, adding 'Image [idx]: [caption]' as an alt tag for those without one. This allows downstream LLMs to interact with the images in activities such as reasoning and summarizing."
+    llm_description: Captions all images at the specified URL
+    form: form
+  - name: gather_all_links_at_the_end
+    type: boolean
+    required: false
+    default: false
+    label:
+      en_US: Gather all links at the end
+      zh_Hans: 将所有链接集中到最后
+      pt_BR: Coletar todos os links ao final
+    human_description:
+      en_US: A "Buttons & Links" section will be created at the end. This helps downstream LLMs or web agents navigate the page or take further actions.
+      zh_Hans: 最后会创建一个“按钮和链接”部分。这可以帮助下游 LLM 或 Web 代理浏览页面或采取进一步的行动。
+      pt_BR: A "Buttons & Links" section will be created at the end. This helps downstream LLMs or web agents navigate the page or take further actions.
+    llm_description: Gather all links at the end
+    form: form
+  - name: gather_all_images_at_the_end
+    type: boolean
+    required: false
+    default: false
+    label:
+      en_US: Gather all images at the end
+      zh_Hans: 将所有图片集中到最后
+      pt_BR: Coletar todas as imagens ao final
+    human_description:
+      en_US: An "Images" section will be created at the end. This gives the downstream LLMs an overview of all visuals on the page, which may improve reasoning.
+      zh_Hans: 最后会创建一个“图像”部分。这可以让下游的 LLM 概览页面上的所有视觉效果，从而提高推理能力。
+      pt_BR: An "Images" section will be created at the end. This gives the downstream LLMs an overview of all visuals on the page, which may improve reasoning.
+    llm_description: Gather all images at the end
+    form: form
   - name: proxy_server
     type: string
     required: false