From db59481dbeeaf58996888da8d1ac836f9e821247 Mon Sep 17 00:00:00 2001 From: "patrick@cryptolock.ai" Date: Mon, 7 Oct 2024 10:45:51 -0700 Subject: [PATCH 1/5] Add Azure AI Search Support --- README.md | 2 +- examples/storm_examples/run_storm_wiki_gpt.py | 12 +- knowledge_storm/rm.py | 109 +++++++++++++++++- requirements.txt | 3 +- 4 files changed, 119 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index e027b3c..50c8f36 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ You could also install the source code which allows you to modify the behavior o Currently, our package support: - `OpenAIModel`, `AzureOpenAIModel`, `ClaudeModel`, `VLLMClient`, `TGIClient`, `TogetherClient`, `OllamaClient`, `GoogleModel`, `DeepSeekModel`, `GroqModel` as language model components -- `YouRM`, `BingSearch`, `VectorRM`, `SerperRM`, `BraveRM`, `SearXNG`, `DuckDuckGoSearchRM`, `TavilySearchRM`, `GoogleSearch` as retrieval module components +- `YouRM`, `BingSearch`, `VectorRM`, `SerperRM`, `BraveRM`, `SearXNG`, `DuckDuckGoSearchRM`, `TavilySearchRM`, `GoogleSearch`, and `AzureAISearch` as retrieval module components :star2: **PRs for integrating more language models into [knowledge_storm/lm.py](knowledge_storm/lm.py) and search engines/retrievers into [knowledge_storm/rm.py](knowledge_storm/rm.py) are highly appreciated!** diff --git a/examples/storm_examples/run_storm_wiki_gpt.py b/examples/storm_examples/run_storm_wiki_gpt.py index ac07968..b1740a1 100644 --- a/examples/storm_examples/run_storm_wiki_gpt.py +++ b/examples/storm_examples/run_storm_wiki_gpt.py @@ -20,10 +20,11 @@ """ import os + from argparse import ArgumentParser from knowledge_storm import STORMWikiRunnerArguments, STORMWikiRunner, STORMWikiLMConfigs from knowledge_storm.lm import OpenAIModel, AzureOpenAIModel -from knowledge_storm.rm import YouRM, BingSearch, BraveRM, SerperRM, DuckDuckGoSearchRM, TavilySearchRM, SearXNG +from knowledge_storm.rm import YouRM, BingSearch, BraveRM, SerperRM, 
DuckDuckGoSearchRM, TavilySearchRM, SearXNG, AzureAISearch from knowledge_storm.utils import load_api_key @@ -72,6 +73,7 @@ def main(args): # STORM is a knowledge curation system which consumes information from the retrieval module. # Currently, the information source is the Internet and we use search engine API as the retrieval module. + match args.retriever: case 'bing': rm = BingSearch(bing_search_api=os.getenv('BING_SEARCH_API_KEY'), k=engine_args.search_top_k) @@ -87,8 +89,10 @@ def main(args): rm = TavilySearchRM(tavily_search_api_key=os.getenv('TAVILY_API_KEY'), k=engine_args.search_top_k, include_raw_content=True) case 'searxng': rm = SearXNG(searxng_api_key=os.getenv('SEARXNG_API_KEY'), k=engine_args.search_top_k) + case 'azure_ai_search': + rm = AzureAISearch(azure_ai_search_api_key=os.getenv('AZURE_AI_SEARCH_API_KEY'), k=engine_args.search_top_k) case _: - raise ValueError(f'Invalid retriever: {args.retriever}. Choose either "bing", "you", "brave", "duckduckgo", "serper", "tavily", or "searxng"') + raise ValueError(f'Invalid retriever: {args.retriever}. Choose either "bing", "you", "brave", "duckduckgo", "serper", "tavily", "searxng", or "azure_ai_search"') runner = STORMWikiRunner(engine_args, lm_configs, rm) @@ -113,7 +117,7 @@ def main(args): help='Maximum number of threads to use. The information seeking part and the article generation' 'part can speed up by using multiple threads. 
Consider reducing it if keep getting ' '"Exceed rate limit" error when calling LM API.') - parser.add_argument('--retriever', type=str, choices=['bing', 'you', 'brave', 'serper', 'duckduckgo', 'tavily', 'searxng'], + parser.add_argument('--retriever', type=str, choices=['bing', 'you', 'brave', 'serper', 'duckduckgo', 'tavily', 'searxng', 'azure_ai_search'], help='The search engine API to use for retrieving information.') # stage of the pipeline parser.add_argument('--do-research', action='store_true', @@ -138,4 +142,4 @@ def main(args): parser.add_argument('--remove-duplicate', action='store_true', help='If True, remove duplicate content from the article.') - main(parser.parse_args()) \ No newline at end of file + main(parser.parse_args()) diff --git a/knowledge_storm/rm.py b/knowledge_storm/rm.py index 7f029e7..a078383 100644 --- a/knowledge_storm/rm.py +++ b/knowledge_storm/rm.py @@ -13,6 +13,8 @@ from .utils import WebPageHelper +from azure.core.credentials import AzureKeyCredential +from azure.search.documents import SearchClient class YouRM(dspy.Retrieve): def __init__(self, ydc_api_key=None, k=3, is_valid_source: Callable = None): @@ -77,7 +79,6 @@ def forward( return collected_results - class BingSearch(dspy.Retrieve): def __init__( self, @@ -1093,3 +1094,109 @@ def forward( collected_results.append(r) return collected_results + +class AzureAISearch(dspy.Retrieve): + """Retrieve information from custom queries using Azure AI Search. General Documentation can be found at: https://learn.microsoft.com/en-us/azure/search/search-create-service-portal. Python Documentation and examples can be found at https://learn.microsoft.com/en-us/python/api/overview/azure/search-documents-readme?view=azure-python. 
Requires pip install azure-search-documents""" + + def __init__( + self, + azure_ai_search_api_key=None, + azure_ai_search_url=None, + azure_ai_search_index_name=None, + k=3, + is_valid_source: Callable = None + ): + """ + Params: + azure_ai_search_api_key: Azure AI Search API key. Check out https://learn.microsoft.com/en-us/azure/search/search-security-api-keys?tabs=rest-use%2Cportal-find%2Cportal-query + "API key" section + azure_ai_search_url: Custom Azure AI Search Endpoint URL. Check out https://learn.microsoft.com/en-us/azure/search/search-create-service-portal#name-the-service + azure_ai_search_index_name: Custom Azure AI Search Index Name. Check out https://learn.microsoft.com/en-us/azure/search/search-how-to-create-search-index?tabs=portal + k: Number of top results to retrieve. + is_valid_source: Optional function to filter valid sources. + min_char_count: Minimum character count for the article to be considered valid. + snippet_chunk_size: Maximum character count for each snippet. + webpage_helper_max_threads: Maximum number of threads to use for webpage helper. 
+ """ + super().__init__(k=k) + if not azure_ai_search_api_key and not os.environ.get("AZURE_AI_SEARCH_API_KEY"): + raise RuntimeError( + "You must supply azure_ai_search_api_key or set environment variable AZURE_AI_SEARCH_API_KEY" + ) + elif azure_ai_search_api_key: + self.azure_ai_search_api_key = azure_ai_search_api_key + else: + self.azure_ai_search_api_key = os.environ["AZURE_AI_SEARCH_API_KEY"] + + if not azure_ai_search_url and not os.environ.get("AZURE_AI_SEARCH_URL"): + raise RuntimeError( + "You must supply azure_ai_search_url or set environment variable AZURE_AI_SEARCH_URL" + ) + elif azure_ai_search_url: + self.azure_ai_search_url = azure_ai_search_url + else: + self.azure_ai_search_url = os.environ["AZURE_AI_SEARCH_URL"] + + if not azure_ai_search_index_name and not os.environ.get("AZURE_AI_SEARCH_INDEX_NAME"): + raise RuntimeError( + "You must supply azure_ai_search_index_name or set environment variable AZURE_AI_SEARCH_INDEX_NAME" + ) + elif azure_ai_search_index_name: + self.azure_ai_search_index_name = azure_ai_search_index_name + else: + self.azure_ai_search_index_name = os.environ["AZURE_AI_SEARCH_INDEX_NAME"] + + self.usage = 0 + + # If not None, is_valid_source shall be a function that takes a URL and returns a boolean. + if is_valid_source: + self.is_valid_source = is_valid_source + else: + self.is_valid_source = lambda x: True + + def get_usage_and_reset(self): + usage = self.usage + self.usage = 0 + + return {"AzureAISearch": usage} + + def forward( + self, query_or_queries: Union[str, List[str]], exclude_urls: List[str] = [] + ): + """Search with Azure AI Search for self.k top passages for query or queries + + Args: + query_or_queries (Union[str, List[str]]): The query or queries to search for. + exclude_urls (List[str]): A list of urls to exclude from the search results. 
+ + Returns: + a list of Dicts, each dict has keys of 'description', 'snippets' (list of strings), 'title', 'url' + """ + queries = ( + [query_or_queries] + if isinstance(query_or_queries, str) + else query_or_queries + ) + self.usage += len(queries) + collected_results = [] + + client = SearchClient(self.azure_ai_search_url,self.azure_ai_search_index_name,AzureKeyCredential(self.azure_ai_search_api_key)) + for query in queries: + try: + # https://learn.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.searchclient?view=azure-python#azure-search-documents-searchclient-search + results = client.search(search_text=query, top=1) + + for result in results: + document = { + "url": result['metadata_storage_path'], + "title": result['title'], + "description": "N/A", + "snippets": [result['chunk']] + } + collected_results.append(document) + except Exception as e: + logging.error(f"Error occurs when searching query {query}: {e}") + + return collected_results + + diff --git a/requirements.txt b/requirements.txt index 7e1a88a..07e40c3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,5 @@ trafilatura langchain-huggingface qdrant-client langchain-qdrant -numpy==1.26.4 \ No newline at end of file +numpy==1.26.4 +azure-search-documents==11.5.1 \ No newline at end of file From 3922a84c4ea233cf94532cdca7e924b85372eb87 Mon Sep 17 00:00:00 2001 From: "patrick@cryptolock.ai" Date: Wed, 16 Oct 2024 14:01:10 -0700 Subject: [PATCH 2/5] Azure AI Search has optional requirements --- knowledge_storm/rm.py | 55 ++++++++++++++++++++++++++++--------------- requirements.txt | 3 +-- 2 files changed, 37 insertions(+), 21 deletions(-) diff --git a/knowledge_storm/rm.py b/knowledge_storm/rm.py index a078383..00bb3cb 100644 --- a/knowledge_storm/rm.py +++ b/knowledge_storm/rm.py @@ -13,8 +13,6 @@ from .utils import WebPageHelper -from azure.core.credentials import AzureKeyCredential -from azure.search.documents import SearchClient class YouRM(dspy.Retrieve): 
def __init__(self, ydc_api_key=None, k=3, is_valid_source: Callable = None): @@ -79,6 +77,7 @@ def forward( return collected_results + class BingSearch(dspy.Retrieve): def __init__( self, @@ -1095,16 +1094,19 @@ def forward( return collected_results + class AzureAISearch(dspy.Retrieve): - """Retrieve information from custom queries using Azure AI Search. General Documentation can be found at: https://learn.microsoft.com/en-us/azure/search/search-create-service-portal. Python Documentation and examples can be found at https://learn.microsoft.com/en-us/python/api/overview/azure/search-documents-readme?view=azure-python. Requires pip install azure-search-documents""" + """Retrieve information from custom queries using Azure AI Search. + General Documentation can be found at: https://learn.microsoft.com/en-us/azure/search/search-create-service-portal. Python Documentation and examples can be found at https://learn.microsoft.com/en-us/python/api/overview/azure/search-documents-readme?view=azure-python. Requires pip install azure-search-documents + """ def __init__( - self, - azure_ai_search_api_key=None, - azure_ai_search_url=None, - azure_ai_search_index_name=None, - k=3, - is_valid_source: Callable = None + self, + azure_ai_search_api_key=None, + azure_ai_search_url=None, + azure_ai_search_index_name=None, + k=3, + is_valid_source: Callable = None, ): """ Params: @@ -1119,7 +1121,18 @@ def __init__( webpage_helper_max_threads: Maximum number of threads to use for webpage helper. """ super().__init__(k=k) - if not azure_ai_search_api_key and not os.environ.get("AZURE_AI_SEARCH_API_KEY"): + + try: + from azure.core.credentials import AzureKeyCredential + from azure.search.documents import SearchClient + except ImportError as err: + raise ImportError( + "AzureAISearch requires `pip install azure-search-documents`." 
+ ) from err + + if not azure_ai_search_api_key and not os.environ.get( + "AZURE_AI_SEARCH_API_KEY" + ): raise RuntimeError( "You must supply azure_ai_search_api_key or set environment variable AZURE_AI_SEARCH_API_KEY" ) @@ -1127,7 +1140,7 @@ def __init__( self.azure_ai_search_api_key = azure_ai_search_api_key else: self.azure_ai_search_api_key = os.environ["AZURE_AI_SEARCH_API_KEY"] - + if not azure_ai_search_url and not os.environ.get("AZURE_AI_SEARCH_URL"): raise RuntimeError( "You must supply azure_ai_search_url or set environment variable AZURE_AI_SEARCH_URL" @@ -1136,8 +1149,10 @@ def __init__( self.azure_ai_search_url = azure_ai_search_url else: self.azure_ai_search_url = os.environ["AZURE_AI_SEARCH_URL"] - - if not azure_ai_search_index_name and not os.environ.get("AZURE_AI_SEARCH_INDEX_NAME"): + + if not azure_ai_search_index_name and not os.environ.get( + "AZURE_AI_SEARCH_INDEX_NAME" + ): raise RuntimeError( "You must supply azure_ai_search_index_name or set environment variable AZURE_AI_SEARCH_INDEX_NAME" ) @@ -1180,7 +1195,11 @@ def forward( self.usage += len(queries) collected_results = [] - client = SearchClient(self.azure_ai_search_url,self.azure_ai_search_index_name,AzureKeyCredential(self.azure_ai_search_api_key)) + client = SearchClient( + self.azure_ai_search_url, + self.azure_ai_search_index_name, + AzureKeyCredential(self.azure_ai_search_api_key), + ) for query in queries: try: # https://learn.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.searchclient?view=azure-python#azure-search-documents-searchclient-search @@ -1188,15 +1207,13 @@ def forward( for result in results: document = { - "url": result['metadata_storage_path'], - "title": result['title'], + "url": result["metadata_storage_path"], + "title": result["title"], "description": "N/A", - "snippets": [result['chunk']] + "snippets": [result["chunk"]], } collected_results.append(document) except Exception as e: logging.error(f"Error occurs when searching query 
{query}: {e}") return collected_results - - diff --git a/requirements.txt b/requirements.txt index 07e40c3..7e1a88a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,4 @@ trafilatura langchain-huggingface qdrant-client langchain-qdrant -numpy==1.26.4 -azure-search-documents==11.5.1 \ No newline at end of file +numpy==1.26.4 \ No newline at end of file From da52da2620b0f3f976e69818561e78526d191252 Mon Sep 17 00:00:00 2001 From: "patrick@cryptolock.ai" Date: Wed, 16 Oct 2024 14:33:03 -0700 Subject: [PATCH 3/5] azure ai search scoped imports included --- knowledge_storm/rm.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/knowledge_storm/rm.py b/knowledge_storm/rm.py index 00bb3cb..624d248 100644 --- a/knowledge_storm/rm.py +++ b/knowledge_storm/rm.py @@ -1187,6 +1187,13 @@ def forward( Returns: a list of Dicts, each dict has keys of 'description', 'snippets' (list of strings), 'title', 'url' """ + try: + from azure.core.credentials import AzureKeyCredential + from azure.search.documents import SearchClient + except ImportError as err: + raise ImportError( + "AzureAISearch requires `pip install azure-search-documents`." + ) from err queries = ( [query_or_queries] if isinstance(query_or_queries, str) From 628bf0ec9dd829476f4544c859fd766644a9ea38 Mon Sep 17 00:00:00 2001 From: Yijia Shao <67158122+shaoyijia@users.noreply.github.com> Date: Sat, 19 Oct 2024 11:06:15 -0700 Subject: [PATCH 4/5] Nit. --- knowledge_storm/rm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/knowledge_storm/rm.py b/knowledge_storm/rm.py index 624d248..bb770af 100644 --- a/knowledge_storm/rm.py +++ b/knowledge_storm/rm.py @@ -1097,7 +1097,9 @@ def forward( class AzureAISearch(dspy.Retrieve): """Retrieve information from custom queries using Azure AI Search. - General Documentation can be found at: https://learn.microsoft.com/en-us/azure/search/search-create-service-portal. 
Python Documentation and examples can be found at https://learn.microsoft.com/en-us/python/api/overview/azure/search-documents-readme?view=azure-python. Requires pip install azure-search-documents + + General Documentation: https://learn.microsoft.com/en-us/azure/search/search-create-service-portal. + Python Documentation: https://learn.microsoft.com/en-us/python/api/overview/azure/search-documents-readme?view=azure-python. """ def __init__( From 9511be7b0058c5e0b94cbe7ad0ed66b3dc0b5f95 Mon Sep 17 00:00:00 2001 From: Yijia Shao <67158122+shaoyijia@users.noreply.github.com> Date: Sat, 19 Oct 2024 11:11:13 -0700 Subject: [PATCH 5/5] Fix format issue. --- knowledge_storm/rm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/knowledge_storm/rm.py b/knowledge_storm/rm.py index bb770af..ec57d79 100644 --- a/knowledge_storm/rm.py +++ b/knowledge_storm/rm.py @@ -1097,7 +1097,7 @@ def forward( class AzureAISearch(dspy.Retrieve): """Retrieve information from custom queries using Azure AI Search. - + General Documentation: https://learn.microsoft.com/en-us/azure/search/search-create-service-portal. Python Documentation: https://learn.microsoft.com/en-us/python/api/overview/azure/search-documents-readme?view=azure-python. """