From 58fadbebbc3bb62f67cf9d0b2c7135a5e03269d3 Mon Sep 17 00:00:00 2001 From: Jacob-Junqi Tian Date: Sun, 24 Nov 2024 18:20:03 -0500 Subject: [PATCH 1/3] Added example KScope OpenAI-compatible embedding code for langchain. --- .../document_search_langchain.ipynb | 155 +++++++++++------- 1 file changed, 93 insertions(+), 62 deletions(-) diff --git a/document_search/document_search_langchain.ipynb b/document_search/document_search_langchain.ipynb index 6841733..24df10c 100644 --- a/document_search/document_search_langchain.ipynb +++ b/document_search/document_search_langchain.ipynb @@ -54,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "742aa343-c90c-4e4a-8099-a3fa218e256d", "metadata": {}, "outputs": [], @@ -79,8 +79,8 @@ "from langchain.chains import RetrievalQA\n", "from langchain_community.vectorstores import FAISS\n", "from langchain.document_loaders.pdf import PyPDFDirectoryLoader\n", - "from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n", - "from langchain_openai import ChatOpenAI\n", + "# from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n", + "from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n", "from langchain.text_splitter import RecursiveCharacterTextSplitter" ] }, @@ -94,7 +94,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 13, "id": "1e70d51a", "metadata": {}, "outputs": [], @@ -110,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 14, "id": "e9e0bec6-a89c-4fca-a218-c784ec18e109", "metadata": {}, "outputs": [], @@ -130,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "dd4e2417", "metadata": {}, "outputs": [], @@ -153,7 +153,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 15, "id": "74b61e4f", "metadata": {}, "outputs": [], @@ -179,13 +179,13 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 28, "id": "2553d130-5b02-4852-928f-beb7ecd05d3f", "metadata": {}, "outputs": [], "source": [ "GENERATOR_MODEL_NAME = \"Meta-Llama-3.1-8B-Instruct\"\n", - "EMBEDDING_MODEL_NAME = \"BAAI/bge-base-en-v1.5\"" + "EMBEDDING_MODEL_NAME = \"bge-base-en-v1.5\"" ] }, { @@ -204,7 +204,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "id": "6133a928", "metadata": {}, "outputs": [], @@ -222,7 +222,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 16, "id": "00061d61", "metadata": {}, "outputs": [ @@ -283,7 +283,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 19, "id": "5710c72d", "metadata": {}, "outputs": [ @@ -292,7 +292,7 @@ "output_type": "stream", "text": [ "Number of source documents: 42\n", - "Number of text chunks: 228\n" + "Number of text chunks: 196\n" ] } ], @@ -319,7 +319,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 29, "id": "24b42902-d145-4f61-80c2-334a4b1da886", "metadata": {}, "outputs": [ @@ -332,14 +332,13 @@ } ], "source": [ - "model_kwargs = {'device': 'cuda', 'trust_remote_code': True}\n", - "encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity\n", - "\n", - "print(f\"Setting up the embeddings model...\")\n", - "embeddings = HuggingFaceEmbeddings(\n", - " model_name=EMBEDDING_MODEL_NAME,\n", - " model_kwargs=model_kwargs,\n", - " encode_kwargs=encode_kwargs,\n", + "print(\"Setting up the embeddings model...\")\n", + "embeddings = OpenAIEmbeddings(\n", + " model=EMBEDDING_MODEL_NAME,\n", + " # Leverage the RoBERTa tokenizer to make sure that \n", + " # the chunks stay within the 512-token context window.\n", + " tiktoken_model_name=\"roberta-base\",\n", + " tiktoken_enabled=False\n", ")" ] }, @@ -361,7 +360,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 30, "id": "1048c42a", "metadata": {}, "outputs": [], @@ -383,7 +382,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 31, "id": "51dc81d7-8333-41e6-9e77-47a45ee0b374", "metadata": {}, "outputs": [ @@ -394,19 +393,22 @@ "Document 1:\n", "\n", "5 \n", - "Annual Report 2021–22 Vector Institute\n", + "Annual Report 2021–22Vector Institute\n", "SPOTLIGHT ON FIVE YEARS OF AI \n", "LEADERSHIP FOR CANADIANS \n", "SINCE THE VECTOR INSTITUTE WAS FOUNDED IN 2017: \n", "2,080+ \n", "Students have graduated from \n", "Vector-recognized AI programs and \n", - "study paths $6.2 M \n", + "study paths \n", + "$6.2 M \n", "Scholarship funds committed to \n", - "students in AI programs 3,700+ \n", + "students in AI programs \n", + "3,700+ \n", "Postings for AI-focused jobs and \n", "internships ofered on Vector’s \n", - "Digital Talent Hub $103 M \n", + "Digital Talent Hub \n", + "$103 M \n", "In research funding committed to \n", "Vector-afliated researchers \n", "94 \n", @@ -415,8 +417,11 @@ "Document 2:\n", "\n", "26 \n", - " VECTOR SCHOLARSHIPS IN \n", - "AI ATTRACT TOP TALENT TO ONTARIO UNIVERSITIES \n", + " \n", + " \n", + "VECTOR SCHOLARSHIPS IN \n", + "AI ATTRACT TOP TALENT \n", + "TO ONTARIO UNIVERSITIES \n", "109 \n", "Vector Scholarships in AI awarded \n", "34 \n", @@ -425,54 +430,80 @@ "Universities \n", "351 \n", "Scholarships awarded since the \n", - "program launched in 2018 Supported with funding from the Province of Ontario, the Vector Institute Scholarship in Artifcial Intelligence (VSAI) helps Ontario universities to attract the best and brightest students to study in AI-related master’s programs. \n", + "program launched in 2018 \n", + "Supported with funding from the Province of \n", + "Ontario, the Vector Institute Scholarship in Artifcial \n", + "Intelligence (VSAI) helps Ontario universities to attract \n", + "the best and brightest students to study in AI-related \n", + "master’s programs. \n", "Scholarship recipients connect directly with leading\n", "----------------------------------------------------------------------------------------------------\n", "Document 3:\n", "\n", - "Arrows indicate year-over-year (YoY) directional change since 2020–21 The complete Ontario AI Snapshot for 2021–22 will be available soon on the Vector Institute website at vectorinstitute.ai. \n", + "The complete Ontario AI Snapshot for 2021–22 will be available soon on the \n", + "Vector Institute website at vectorinstitute.ai. \n", "YoY \n", "22,458 \n", - "AI jobs created YoY \n", - "59,673 \n", - "AI jobs retained YoY \n", + "AI jobs created \n", + "YoY \n", + "59,67 3 \n", + "AI jobs retained \n", + "YoY \n", "1,775 \n", - "New AI Master’s & study path enrolments YoY \n", + "New AI Master’s & study \n", + "path enrolments \n", + "YoY \n", "1,007 \n", - "New AI Master’s graduates from Vector-recognized programs \n", + "New AI Master’s graduates from \n", + "Vector-recognized programs \n", "YoY \n", "66 \n", - "New AI-related patents fled across Canada YoY \n", + "New AI-related patents \n", + "fled across Canada \n", + "YoY \n", "$2.86 BILLION \n", - "In AI-related VC investment * YoY \n", - "273\n", + "In AI-related VC investment* \n", + "YoY \n", + "273 \n", + "Companies invested in \n", + "the Ontario AI ecosystem \n", + "YoY \n", + "50 \n", + "Companies moved into\n", "----------------------------------------------------------------------------------------------------\n", "Document 4:\n", "\n", + "my professional and academic journey.” \n", + "Alex Cui, Vector Scholarship in AI Recipient 2021–22 \n", + "“The scholarship funding from the Vector Institute \n", + "has played an instrumental role in expanding \n", + "graduate teaching, learning, and research \n", + "opportunities in AI at Queen’s University.” \n", + "Dr. Fahim Quadir, Vice-Provost and Dean, School of \n", + "Graduate Studies & Professor of Global Developmental \n", + "Studies, Queen’s University \n", + "PRACTICAL, HANDS-ON \n", + "PROGRAMMING TO FOSTER \n", + "WORKFORCE SKILLS \n", + "AND EXPERIENCE\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 5:\n", + "\n", "23 \n", "RESEARCH AWARDS AND \n", "ACHIEVEMENTS \n", "Each year, members of Vector’s research community \n", - "are recognized for outstanding contributions to AI and machine learning felds. Highlights of 2021–22 include: \n", + "are recognized for outstanding contributions to AI and \n", + "machine learning felds. Highlights of 2021–22 include: \n", "GLOBAL REACH OF VECTOR \n", "RESEARCHERS AND THEIR WORK \n", "Vector researchers published papers, gave \n", - "presentations, or led workshops at many of the top AI conferences this year, including NeurIPS, CVPR, ICLR, ICML, and ACM FAccT. \n", - "380+ Research papers presented at\n", - "----------------------------------------------------------------------------------------------------\n", - "Document 5:\n", - "\n", - "24 \n", - "Annual Report 2021–22 Vector Institute\n", - " \n", - " \n", - " TALENT & \n", - "WORKFORCE DEVELOPMENT \n", - "Vector is helping to attract, develop, and \n", - "connect the AI-skilled workforce that will transform Ontario’s economy 1,775 \n", - "AI master’s students began their studies in \n", - "recognized AI-related programs and study paths, up 27% from last year V\n", - "ector is working with both universities and employers\n" + "presentations, or led workshops at many of the \n", + "top AI conferences this year, including NeurIPS, \n", + "CVPR, ICLR, ICML, and ACM FAccT. \n", + "380+ Research papers presented at \n", + "high-impact global \n", + "conferences and in top-\n" ] } ], @@ -490,7 +521,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 32, "id": "e26d9f46-a082-4497-8ffc-9fa3eccc2ef3", "metadata": {}, "outputs": [ @@ -500,7 +531,7 @@ "text": [ "Result: \n", "\n", - "The text does not provide the number of Vector Scholarships in AI awarded in 2022. It does provide the total number of Vector Scholarships in AI awarded since the program launched in 2018, which is 109.\n" + "According to the context, 109 Vector Scholarships in AI were awarded.\n" ] } ], @@ -525,9 +556,9 @@ ], "metadata": { "kernelspec": { - "display_name": "rag_dataloaders", + "display_name": "Python 3", "language": "python", - "name": "rag_dataloaders" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -539,7 +570,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.12.5" } }, "nbformat": 4, From 35b9ecd1e7edf557027d2d289bb51a247f502398 Mon Sep 17 00:00:00 2001 From: Jacob-Junqi Tian Date: Sun, 24 Nov 2024 18:35:05 -0500 Subject: [PATCH 2/3] Added example KScope OpenAI-compatible embedding code for llamaindex. --- .../document_search_llamaindex.ipynb | 158 ++++++++++++------ 1 file changed, 108 insertions(+), 50 deletions(-) diff --git a/document_search/document_search_llamaindex.ipynb b/document_search/document_search_llamaindex.ipynb index 4373c32..e48b2cf 100644 --- a/document_search/document_search_llamaindex.ipynb +++ b/document_search/document_search_llamaindex.ipynb @@ -52,6 +52,19 @@ "#### Import libraries" ] }, + { + "cell_type": "markdown", + "id": "bf703d8a", + "metadata": {}, + "source": [ + "```bash\n", + "pip install \\\n", + " llama-index-embeddings-openai \\\n", + " llama-index-vector-stores-faiss \\\n", + " llama-index-llms-openai-like\n", + "```" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -68,7 +81,17 @@ "execution_count": 2, "id": "2f637730", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jacobtian/python/3.12/usr/local/lib/python3.12/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n" + ] + } + ], "source": [ "import faiss\n", "import os\n", @@ -82,7 +105,7 @@ "from llama_index.core.llms import ChatMessage\n", "from llama_index.core.node_parser import LangchainNodeParser\n", "from llama_index.core.query_engine import RetrieverQueryEngine\n", - "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n", + "from llama_index.embeddings.openai import OpenAIEmbedding\n", "from llama_index.llms.openai_like import OpenAILike\n", "from llama_index.vector_stores.faiss import FaissVectorStore" ] @@ -113,14 +136,13 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 11, "id": "4be12a21-c830-4aa3-a76d-3684b9445950", "metadata": {}, "outputs": [], "source": [ - "GENERATOR_BASE_URL = os.environ.get(\"OPENAI_BASE_URL\")\n", - "\n", - "OPENAI_API_KEY = os.environ.get(\"OPENAI_API_KEY\")" + "GENERATOR_BASE_URL = os.environ[\"OPENAI_BASE_URL\"]\n", + "OPENAI_API_KEY = os.environ[\"OPENAI_API_KEY\"]" ] }, { @@ -133,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "dd4e2417", "metadata": {}, "outputs": [], @@ -156,7 +178,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "74b61e4f", "metadata": {}, "outputs": [], @@ -182,13 +204,13 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "78a10552-cb1a-4088-9081-05494fca9410", "metadata": {}, "outputs": [], "source": [ "GENERATOR_MODEL_NAME = \"Meta-Llama-3.1-8B-Instruct\"\n", - "EMBEDDING_MODEL_NAME = \"BAAI/bge-base-en-v1.5\"" + "EMBEDDING_MODEL_NAME = \"bge-base-en-v1.5\"" ] }, { @@ -207,7 +229,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "6133a928", "metadata": {}, "outputs": [], @@ -225,7 +247,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "3f3d559a-74cf-4406-9ee4-61944f3e4b65", "metadata": {}, "outputs": [ @@ -246,7 +268,8 @@ " temperature=0,\n", " max_tokens=None,\n", " api_base=GENERATOR_BASE_URL,\n", - " api_key=OPENAI_API_KEY\n", + " api_key=OPENAI_API_KEY,\n", + " api_version=\"1\"\n", ")\n", "message = [\n", " ChatMessage(\n", @@ -290,7 +313,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "id": "5710c72d", "metadata": {}, "outputs": [ @@ -299,7 +322,7 @@ "output_type": "stream", "text": [ "Number of source documents: 42\n", - "Number of text chunks: 228\n" + "Number of text chunks: 196\n" ] } ], @@ -325,7 +348,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 14, "id": "268ab345-4676-4700-8965-4639751e7fe8", "metadata": {}, "outputs": [ @@ -333,17 +356,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "Setting up the embeddings model...\n" + "Setting up the embeddings model bge-base-en-v1.5 from https://kscope.vectorinstitute.ai/v1\n" ] } ], "source": [ - "print(f\"Setting up the embeddings model...\")\n", - "embeddings = HuggingFaceEmbedding(\n", + "print(f\"Setting up the embeddings model {EMBEDDING_MODEL_NAME} from {GENERATOR_BASE_URL}\")\n", + "embeddings = OpenAIEmbedding(\n", " model_name=EMBEDDING_MODEL_NAME,\n", - " device='cuda',\n", - " trust_remote_code=True,\n", - ")" + " embed_batch_size=10,\n", + " api_base=GENERATOR_BASE_URL,\n", + " api_key=OPENAI_API_KEY,\n", + ")\n" ] }, { @@ -356,7 +380,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "id": "7446327c-d8b9-4928-92c7-fb0af4fb0fdc", "metadata": {}, "outputs": [], @@ -383,7 +407,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "id": "c49d0093-0105-499a-a7e3-ebf6326a85d9", "metadata": {}, "outputs": [], @@ -403,7 +427,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "id": "37f512cb-36f8-4afb-a8c6-0c187a0d9cae", "metadata": {}, "outputs": [], @@ -424,7 +448,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "id": "43ff6d3c-b6e8-4702-8591-44e0d7b7d484", "metadata": {}, "outputs": [ @@ -434,9 +458,24 @@ "text": [ "Document 1:\n", "\n", + "number and a 27 per cent increase over the previous \n", + "year. Last year also saw more than 1,000 new graduates \n", + "from AI master’s programs in Ontario; a milestone \n", + "achieved ahead of the province’s 2023 target. These \n", + "skilled AI graduates will hold an envied role in the \n", + "workforce of the future. Further, our research \n", + "community has now grown to more than 700, whose \n", + "infuence continues to grow; they published more than \n", + "380 research papers in high-impact global conferences\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 2:\n", + "\n", "26 \n", - " VECTOR SCHOLARSHIPS IN \n", - "AI ATTRACT TOP TALENT TO ONTARIO UNIVERSITIES \n", + " \n", + " \n", + "VECTOR SCHOLARSHIPS IN \n", + "AI ATTRACT TOP TALENT \n", + "TO ONTARIO UNIVERSITIES \n", "109 \n", "Vector Scholarships in AI awarded \n", "34 \n", @@ -445,52 +484,71 @@ "Universities \n", "351 \n", "Scholarships awarded since the \n", - "program launched in 2018 Supported with funding from the Province of Ontario, the Vector Institute Scholarship in Artifcial Intelligence (VSAI) helps Ontario universities to attract the best and brightest students to study in AI-related master’s programs. \n", + "program launched in 2018 \n", + "Supported with funding from the Province of \n", + "Ontario, the Vector Institute Scholarship in Artifcial \n", + "Intelligence (VSAI) helps Ontario universities to attract \n", + "the best and brightest students to study in AI-related \n", + "master’s programs. \n", "Scholarship recipients connect directly with leading\n", "----------------------------------------------------------------------------------------------------\n", - "Document 2:\n", + "Document 3:\n", "\n", "5 \n", - "Annual Report 2021–22 Vector Institute\n", + "Annual Report 2021–22Vector Institute\n", "SPOTLIGHT ON FIVE YEARS OF AI \n", "LEADERSHIP FOR CANADIANS \n", "SINCE THE VECTOR INSTITUTE WAS FOUNDED IN 2017: \n", "2,080+ \n", "Students have graduated from \n", "Vector-recognized AI programs and \n", - "study paths $6.2 M \n", + "study paths \n", + "$6.2 M \n", "Scholarship funds committed to \n", - "students in AI programs 3,700+ \n", + "students in AI programs \n", + "3,700+ \n", "Postings for AI-focused jobs and \n", "internships ofered on Vector’s \n", - "Digital Talent Hub $103 M \n", + "Digital Talent Hub \n", + "$103 M \n", "In research funding committed to \n", "Vector-afliated researchers \n", "94 \n", "Research awards earned by\n", "----------------------------------------------------------------------------------------------------\n", - "Document 3:\n", + "Document 4:\n", "\n", - "studies in a Vector-recognized AI-related master’s program or other study paths in AI — both a record number and a 27 per cent increase over the previous year. Last year also saw more than 1,000 new graduates from AI master’s programs in Ontario; a milestone achieved ahead of the province’s 2023 target. These skilled AI graduates will hold an envied role in the workforce of the future. Further, our research community has now grown to more than 700, whose infuence continues to grow; they published more than\n", + "my professional and academic journey.” \n", + "Alex Cui, Vector Scholarship in AI Recipient 2021–22 \n", + "“The scholarship funding from the Vector Institute \n", + "has played an instrumental role in expanding \n", + "graduate teaching, learning, and research \n", + "opportunities in AI at Queen’s University.” \n", + "Dr. Fahim Quadir, Vice-Provost and Dean, School of \n", + "Graduate Studies & Professor of Global Developmental \n", + "Studies, Queen’s University \n", + "PRACTICAL, HANDS-ON \n", + "PROGRAMMING TO FOSTER \n", + "WORKFORCE SKILLS \n", + "AND EXPERIENCE\n", "----------------------------------------------------------------------------------------------------\n", - "Document 4:\n", + "Document 5:\n", "\n", "23 \n", "RESEARCH AWARDS AND \n", "ACHIEVEMENTS \n", "Each year, members of Vector’s research community \n", - "are recognized for outstanding contributions to AI and machine learning felds. Highlights of 2021–22 include: \n", + "are recognized for outstanding contributions to AI and \n", + "machine learning felds. Highlights of 2021–22 include: \n", "GLOBAL REACH OF VECTOR \n", "RESEARCHERS AND THEIR WORK \n", "Vector researchers published papers, gave \n", - "presentations, or led workshops at many of the top AI conferences this year, including NeurIPS, CVPR, ICLR, ICML, and ACM FAccT. \n", - "380+ Research papers presented at\n", - "----------------------------------------------------------------------------------------------------\n", - "Document 5:\n", - "\n", - "and Universities \n", - "1,007 \n", - "graduates from recognized AI-master’s programs at Ontario universities, exceeding the province’s target to graduate 1,000 AI master’s students per year by 2023 ahead of schedule\n" + "presentations, or led workshops at many of the \n", + "top AI conferences this year, including NeurIPS, \n", + "CVPR, ICLR, ICML, and ACM FAccT. \n", + "380+ Research papers presented at \n", + "high-impact global \n", + "conferences and in top-\n" ] } ], @@ -508,7 +566,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, "id": "23499f4a", "metadata": {}, "outputs": [ @@ -539,9 +597,9 @@ ], "metadata": { "kernelspec": { - "display_name": "rag_dataloaders", + "display_name": "Python 3", "language": "python", - "name": "rag_dataloaders" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -553,7 +611,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.12.5" } }, "nbformat": 4, From 4b76a3b622ad625ad649c148a76fb4c4f49f0be1 Mon Sep 17 00:00:00 2001 From: Jacob-Junqi Tian Date: Sun, 24 Nov 2024 19:11:52 -0500 Subject: [PATCH 3/3] Added example KScope OpenAI-compatible embedding code for langchain and llamaindex (google search, local documents, sql.) --- .../document_search_langchain.ipynb | 7 +- sql_search/sql_search_langchain.ipynb | 42 +- web_search/web_search_langchain.ipynb | 491 ++---------------- web_search/web_search_llamaindex.ipynb | 151 ++++-- 4 files changed, 180 insertions(+), 511 deletions(-) diff --git a/document_search/document_search_langchain.ipynb b/document_search/document_search_langchain.ipynb index 24df10c..8edb51f 100644 --- a/document_search/document_search_langchain.ipynb +++ b/document_search/document_search_langchain.ipynb @@ -65,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "2f637730", "metadata": {}, "outputs": [], @@ -79,7 +79,6 @@ "from langchain.chains import RetrievalQA\n", "from langchain_community.vectorstores import FAISS\n", "from langchain.document_loaders.pdf import PyPDFDirectoryLoader\n", - "# from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n", "from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n", "from langchain.text_splitter import RecursiveCharacterTextSplitter" ] @@ -319,7 +318,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "id": "24b42902-d145-4f61-80c2-334a4b1da886", "metadata": {}, "outputs": [ @@ -332,7 +331,7 @@ } ], "source": [ - "print(\"Setting up the embeddings model...\")\n", + "print(f\"Setting up the embeddings model {EMBEDDING_MODEL_NAME} at {GENERATOR_BASE_URL}\")\n", "embeddings = OpenAIEmbeddings(\n", " model=EMBEDDING_MODEL_NAME,\n", " # Leverage the RoBERTa tokenizer to make sure that \n", diff --git a/sql_search/sql_search_langchain.ipynb b/sql_search/sql_search_langchain.ipynb index 16bd784..767b857 100644 --- a/sql_search/sql_search_langchain.ipynb +++ b/sql_search/sql_search_langchain.ipynb @@ -45,6 +45,19 @@ "## Set up the RAG workflow environment" ] }, + { + "cell_type": "markdown", + "id": "0c347778", + "metadata": {}, + "source": [ + "```bash\n", + "pip install \\\n", + " langchain-community \\\n", + " langchain-experimental \\\n", + " langchain-openai\n", + "```" + ] + }, { "cell_type": "markdown", "id": "c7c85ba1-df09-49c6-a1c3-11d0b26e9e00", @@ -108,14 +121,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 9, "id": "c4ccb9b4-cb43-4779-8b06-1640f2a8a2be", "metadata": {}, "outputs": [], "source": [ - "GENERATOR_BASE_URL = os.environ.get(\"OPENAI_BASE_URL\")\n", + "GENERATOR_BASE_URL = os.environ[\"OPENAI_BASE_URL\"]\n", "\n", - "OPENAI_API_KEY = os.environ.get(\"OPENAI_API_KEY\")" + "OPENAI_API_KEY = os.environ[\"OPENAI_API_KEY\"]" ] }, { @@ -536,7 +549,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "5710c72d", "metadata": {}, "outputs": [], @@ -555,7 +568,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "0925da1b-d837-49bc-abec-fc51d85c5940", "metadata": {}, "outputs": [ @@ -608,7 +621,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "1048c42a", "metadata": {}, "outputs": [ @@ -620,7 +633,14 @@ "\n", "\u001b[1m> Entering new SQLDatabaseChain chain...\u001b[0m\n", "What is the average balance of all management jobs who applied for banking deposits?\n", - "SQLQuery:\u001b[32;1m\u001b[1;3mSELECT AVG(\"balance\") FROM banking_term_deposits WHERE \"job\" = 'management'\u001b[0m\n", + "SQLQuery:" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32;1m\u001b[1;3mSELECT AVG(\"balance\") FROM banking_term_deposits WHERE \"job\" = 'management'\u001b[0m\n", "SQLResult: \u001b[33;1m\u001b[1;3m[(1763.6168323112709,)]\u001b[0m\n", "Answer:\u001b[32;1m\u001b[1;3mQuestion: What is the average balance of all management jobs who applied for banking deposits?\n", "SQLQuery: SELECT AVG(\"balance\") FROM banking_term_deposits WHERE \"job\" = 'management'\u001b[0m\n", @@ -642,7 +662,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "d6a049a0-aa60-4a49-8d48-4aff5171d214", "metadata": {}, "outputs": [ @@ -671,9 +691,9 @@ ], "metadata": { "kernelspec": { - "display_name": "rag_dataloaders", + "display_name": "Python 3", "language": "python", - "name": "rag_dataloaders" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -685,7 +705,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.12.5" } }, "nbformat": 4, diff --git a/web_search/web_search_langchain.ipynb b/web_search/web_search_langchain.ipynb index 5bdcebe..a37eaee 100644 --- a/web_search/web_search_langchain.ipynb +++ b/web_search/web_search_langchain.ipynb @@ -64,12 +64,11 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "2f637730", "metadata": {}, "outputs": [], "source": [ - "import faiss\n", "import os\n", "import requests\n", "import sys\n", @@ -81,7 +80,6 @@ "from langchain.chains import RetrievalQA\n", "from langchain_community.vectorstores import FAISS\n", "from langchain.docstore.document import Document\n", - "from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n", "from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n", "from langchain.text_splitter import RecursiveCharacterTextSplitter" ] @@ -112,14 +110,13 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 15, "id": "1c2d6b52-7ad1-478e-be82-36bedfa505db", "metadata": {}, "outputs": [], "source": [ - "GENERATOR_BASE_URL = os.environ.get(\"OPENAI_BASE_URL\")\n", - "\n", - "OPENAI_API_KEY = os.environ.get(\"OPENAI_API_KEY\")" + "GENERATOR_BASE_URL = os.environ[\"OPENAI_BASE_URL\"]\n", + "OPENAI_API_KEY = os.environ[\"OPENAI_API_KEY\"]" ] }, { @@ -155,13 +152,19 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "9fd9e704-2c8f-41ac-b733-05b32d2caf7c", "metadata": {}, "outputs": [], "source": [ "GENERATOR_MODEL_NAME = \"Meta-Llama-3.1-8B-Instruct\"\n", - "EMBEDDING_MODEL_NAME = \"BAAI/bge-base-en-v1.5\"" + "\n", + "## Select one of the two options: \n", + "## - \"all-MiniLM-L6-v2\" (22M parameters)\n", + "## - \"bge-base-en-v1.5\" (110M parameters)\n", + "\n", + "# EMBEDDING_MODEL_NAME = \"bge-base-en-v1.5\"\n", + "EMBEDDING_MODEL_NAME = \"all-MiniLM-L6-v2\"" ] }, { @@ -270,7 +273,7 @@ "output_type": "stream", "text": [ "Number of source documents: 10\n", - "Number of text chunks: 745\n", + "Number of text chunks: 869\n", "\n" ] } @@ -278,7 +281,7 @@ "source": [ "# Do a Google web search and parse the results into a big text string\n", "web_documents = []\n", - "for result_url in search(query, tld=\"com\", num=5, stop=10, pause=2):\n", + "for result_url in search(query):\n", " response = requests.get(result_url)\n", " soup = BeautifulSoup(response.content, 'html.parser')\n", " web_documents.append(soup.get_text())\n", @@ -303,7 +306,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 17, "id": "81a0f241-a654-4ab2-8851-57a23e063e47", "metadata": {}, "outputs": [ @@ -311,7 +314,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Setting up the embeddings model...\n" + "Setting up the embeddings model all-MiniLM-L6-v2 at https://kscope.vectorinstitute.ai/v1\n" ] } ], @@ -319,11 +322,13 @@ "model_kwargs = {'device': 'cuda', 'trust_remote_code': True}\n", "encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity\n", "\n", - "print(f\"Setting up the embeddings model...\")\n", - "embeddings = HuggingFaceEmbeddings(\n", - " model_name=EMBEDDING_MODEL_NAME,\n", - " model_kwargs=model_kwargs,\n", - " encode_kwargs=encode_kwargs,\n", + "print(f\"Setting up the embeddings model {EMBEDDING_MODEL_NAME} at {GENERATOR_BASE_URL}\")\n", + "embeddings = OpenAIEmbeddings(\n", + " model=EMBEDDING_MODEL_NAME,\n", + " # Leverage the RoBERTa tokenizer to make sure that \n", + " # the chunks stay within the 512-token context window.\n", + " tiktoken_model_name=\"roberta-base\",\n", + " tiktoken_enabled=False\n", ")" ] }, @@ -340,12 +345,16 @@ "id": "b9bfc4e4-4c62-492d-a0b5-9de6d6985a0d", "metadata": {}, "source": [ - "The retriever will identify the document chunks that most closely match our original query. (This takes about 1-2 minutes)" + "The retriever will identify the document chunks that most closely match our original query. \n", + "\n", + "Depending on the number of chunks provided, generating the embeddings might require:\n", + "- about 5 minutes, when using \"bge-base-en-v1.5\" (110M parameters);\n", + "- about 1 minute, when using \"all-MiniLM-L6-v2\" (22M parameters)." ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 18, "id": "ae20b23b-43ff-4677-a534-0f507b090d9f", "metadata": {}, "outputs": [], @@ -367,7 +376,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 19, "id": "bec4a394-cfe3-465f-9873-d86132bb47a4", "metadata": {}, "outputs": [ @@ -377,443 +386,27 @@ "text": [ "Document 1:\n", "\n", - "Dodgers win World Series 2024\n", + "The 2024 Major League Baseball season (MLB) began on March 20–21 with a two-game series between the Los Angeles Dodgers and the San Diego Padres held in Seoul, South Korea, before the regular season proper ran from March 28 to September 30.[1][2] The 94th All-Star Game was played on July 16 at Globe Life Field in Arlington, Texas,[3] with the American League winning, 5–3.[4] The postseason then began on October 1 and concluded with Game 5 of the World Series on October 30.[5] Going into the season, the\n", "----------------------------------------------------------------------------------------------------\n", "Document 2:\n", "\n", - "Dodgers win the 2024 World Series | 10/30/2024 | MLB.com\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Globe iconLogin iconRecap iconSearch iconTickets icon\n", + "The 2024 Major League Baseball season (MLB) began on March 20–21 with a two-game series between the Los Angeles Dodgers and the San Diego Padres held in Seoul, South Korea, before the regular season proper ran from March 28 to September 30.[1][2] The 94th All-Star Game was played on July 16 at Globe Life Field in Arlington, Texas,[3] with the American League winning, 5–3.[4] The postseason then began on October 1 and concluded with Game 5 of the World Series on October 30.[5] Going into the season, the\n", "----------------------------------------------------------------------------------------------------\n", "Document 3:\n", "\n", - "2024 World Series - Wikipedia\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Jump to content\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Main menu\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Main menu\n", - "move to sidebar\n", - "hide\n", - "\n", - "\n", - "\n", - "\t\tNavigation\n", - "\t\n", - "\n", - "\n", - "Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\t\tContribute\n", - "\t\n", - "\n", - "\n", - "HelpLearn to editCommunity portalRecent changesUpload file\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Search\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Search\n", - "\n", - "\n", - "\n", + "^ \"Bart, Reynolds homer, Pirates beat White Sox 9-4\". ESPN.com. Associated Press. July 14, 2024. Chicago entered the day as the first team in MLB history with 70 losses before the All-Star break.\n", "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Appearance\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Donate\n", - "\n", - "Create account\n", - "\n", - "Log in\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Personal tools\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Donate Create account Log in\n", + "^ Axisa, Mike (August 25, 2024). \"White Sox lose 100th game of 2024 MLB season: Chicago club on pace to beat 1962 Mets for most losses\". CBSSports.com. Retrieved August 25, 2024.\n", "----------------------------------------------------------------------------------------------------\n", "Document 4:\n", "\n", - "The 2024 World Series was the championship series of Major League Baseball's (MLB) 2024 season. The 120th edition of the World Series, it was a best-of-seven playoff between the National League (NL) champion Los Angeles Dodgers and the American League (AL) champion New York Yankees. It was the Dodgers' first World Series appearance and win since 2020, and the Yankees' first World Series appearance since 2009. The series began on October 25 and ended on October 30 with the Dodgers winning in five\n", + "^ \"Bart, Reynolds homer, Pirates beat White Sox 9-4\". ESPN.com. Associated Press. July 14, 2024. Chicago entered the day as the first team in MLB history with 70 losses before the All-Star break.\n", + "\n", + "^ Axisa, Mike (August 25, 2024). \"White Sox lose 100th game of 2024 MLB season: Chicago club on pace to beat 1962 Mets for most losses\". CBSSports.com. Retrieved August 25, 2024.\n", "----------------------------------------------------------------------------------------------------\n", "Document 5:\n", "\n", - "2024 Japan Series - Wikipedia\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Jump to content\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Main menu\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Main menu\n", - "move to sidebar\n", - "hide\n", - "\n", - "\n", - "\n", - "\t\tNavigation\n", - "\t\n", - "\n", - "\n", - "Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\t\tContribute\n", - "\t\n", - "\n", - "\n", - "HelpLearn to editCommunity portalRecent changesUpload file\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Search\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Search\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Appearance\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Donate\n", - "\n", - "Create account\n", - "\n", - "Log in\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Personal tools\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Donate Create account Log in\n" + "The 2024 World Series was the championship series of Major League Baseball's (MLB) 2024 season. The 120th edition of the World Series, it was a best-of-seven playoff between the National League (NL) champion Los Angeles Dodgers and the American League (AL) champion New York Yankees. It was the Dodgers' first World Series appearance and win since 2020, and the Yankees' first World Series appearance since 2009. The series began on October 25 and ended on October 30 with the Dodgers winning in five\n" ] } ], @@ -831,7 +424,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 20, "id": "64b191b1-e25f-49e0-a377-e0b50023dd4c", "metadata": {}, "outputs": [ @@ -862,9 +455,9 @@ ], "metadata": { "kernelspec": { - "display_name": "rag_dataloaders", + "display_name": "Python 3", "language": "python", - "name": "rag_dataloaders" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -876,7 +469,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.12.5" } }, "nbformat": 4, diff --git a/web_search/web_search_llamaindex.ipynb b/web_search/web_search_llamaindex.ipynb index 1b4a54c..1ca3963 100644 --- a/web_search/web_search_llamaindex.ipynb +++ b/web_search/web_search_llamaindex.ipynb @@ -64,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "id": "2f637730", "metadata": {}, "outputs": [], @@ -85,7 +85,7 @@ "from llama_index.core.node_parser import LangchainNodeParser\n", "from llama_index.core.query_engine import RetrieverQueryEngine\n", "from llama_index.core.readers import StringIterableReader\n", - "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n", + "from llama_index.embeddings.openai import OpenAIEmbedding\n", "from llama_index.llms.openai_like import OpenAILike\n", "from llama_index.vector_stores.faiss import FaissVectorStore" ] @@ -100,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "id": "8fac6ed4-691d-4669-b72b-1ceb4d27944d", "metadata": {}, "outputs": [], @@ -116,14 +116,13 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "id": "4c528521-8bb4-49c6-b181-e0ecd6862e3e", "metadata": {}, "outputs": [], "source": [ - "GENERATOR_BASE_URL = os.environ.get(\"OPENAI_BASE_URL\")\n", - "\n", - "OPENAI_API_KEY = os.environ.get(\"OPENAI_API_KEY\")" + "GENERATOR_BASE_URL = os.environ[\"OPENAI_BASE_URL\"]\n", + "OPENAI_API_KEY = os.environ[\"OPENAI_API_KEY\"]" ] }, { @@ -136,7 +135,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "id": "dd4e2417", "metadata": {}, "outputs": [], @@ -159,13 +158,13 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "id": "84b28b5e-d9d9-4788-9887-5c71aedfd8a1", "metadata": {}, "outputs": [], "source": [ "GENERATOR_MODEL_NAME = \"Meta-Llama-3.1-8B-Instruct\"\n", - "EMBEDDING_MODEL_NAME = \"BAAI/bge-base-en-v1.5\"" + "EMBEDDING_MODEL_NAME = \"bge-base-en-v1.5\"" ] }, { @@ -184,7 +183,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 10, "id": "6133a928", "metadata": {}, "outputs": [], @@ -202,7 +201,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "id": "40c2663f", "metadata": {}, "outputs": [ @@ -223,7 +222,8 @@ " temperature=0,\n", " max_tokens=None,\n", " api_base=GENERATOR_BASE_URL,\n", - " api_key=OPENAI_API_KEY\n", + " api_key=OPENAI_API_KEY,\n", + " api_version=\"1\"\n", ")\n", "message = [\n", " ChatMessage(\n", @@ -269,7 +269,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 15, "id": "5710c72d", "metadata": {}, "outputs": [ @@ -277,15 +277,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "Number of source documents: 10\n", - "Number of text chunks: 745\n" + "Number of source documents: 8\n", + "Number of text chunks: 573\n" ] } ], "source": [ "# Do a Google web search and store the results in a documents list\n", "web_documents = []\n", - "for result_url in search(query, tld=\"com\", num=5, stop=10, pause=2):\n", + "for result_url in search(query):\n", " response = requests.get(result_url)\n", " soup = BeautifulSoup(response.content, 'html.parser')\n", " web_documents.append(soup.get_text())\n", @@ -309,7 +309,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 19, "id": "54f69ffb-44f0-411f-bdc3-75efe8fb0d23", "metadata": {}, "outputs": [ @@ -317,16 +317,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "Setting up the embeddings model...\n" + "Setting up the embeddings model bge-base-en-v1.5 from https://kscope.vectorinstitute.ai/v1\n" ] } ], "source": [ - "print(f\"Setting up the embeddings model...\")\n", - "embeddings = HuggingFaceEmbedding(\n", + "print(f\"Setting up the embeddings model {EMBEDDING_MODEL_NAME} from {GENERATOR_BASE_URL}\")\n", + "embeddings = OpenAIEmbedding(\n", " model_name=EMBEDDING_MODEL_NAME,\n", - " device='cuda',\n", - " trust_remote_code=True,\n", + " embed_batch_size=128,\n", + " api_base=GENERATOR_BASE_URL,\n", + " api_key=OPENAI_API_KEY,\n", ")" ] }, @@ -340,7 +341,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 17, "id": "b7484523-4a7d-43de-9c9c-bf46425e83ac", "metadata": {}, "outputs": [], @@ -367,7 +368,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 20, "id": "996df356-db4f-4e1e-90cb-9414ed0d2ec2", "metadata": {}, "outputs": [], @@ -387,7 +388,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 21, "id": "891457d4-60cb-470c-b11a-3840eb85c882", "metadata": {}, "outputs": [], @@ -408,7 +409,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 23, "id": "7dffa44d-c61c-4d35-835b-eb3aa9989f7a", "metadata": {}, "outputs": [ @@ -418,7 +419,7 @@ "text": [ "Document 1:\n", "\n", - "Dodgers win the 2024 World Series | 10/30/2024 | MLB.com\n", + "2024 World Series - Wikipedia\n", "\n", "\n", "\n", @@ -455,6 +456,7 @@ "\n", "\n", "\n", + "Jump to content\n", "\n", "\n", "\n", @@ -462,23 +464,33 @@ "\n", "\n", "\n", + "Main menu\n", "\n", "\n", "\n", "\n", "\n", + "Main menu\n", + "move to sidebar\n", + "hide\n", "\n", "\n", "\n", + "\t\tNavigation\n", + "\t\n", "\n", "\n", + "Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n", "\n", "\n", "\n", "\n", "\n", + "\t\tContribute\n", + "\t\n", "\n", "\n", + "HelpLearn to editCommunity portalRecent changesUpload file\n", "\n", "\n", "\n", @@ -498,6 +510,7 @@ "\n", "\n", "\n", + "Search\n", "\n", "\n", "\n", @@ -509,28 +522,30 @@ "\n", "\n", "\n", + "Search\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", "\n", "\n", "\n", "\n", "\n", "\n", - "Globe iconLogin iconRecap iconSearch iconTickets icon\n", - "----------------------------------------------------------------------------------------------------\n", - "Document 2:\n", "\n", - "Dodgers win World Series 2024\n", - "----------------------------------------------------------------------------------------------------\n", - "Document 3:\n", "\n", - "The 2024 World Series was the championship series of Major League Baseball's (MLB) 2024 season. The 120th edition of the World Series, it was a best-of-seven playoff between the National League (NL) champion Los Angeles Dodgers and the American League (AL) champion New York Yankees. It was the Dodgers' first World Series appearance and win since 2020, and the Yankees' first World Series appearance since 2009. The series began on October 25 and ended on October 30 with the Dodgers winning in five\n", - "----------------------------------------------------------------------------------------------------\n", - "Document 4:\n", "\n", - "2024 World Series - Wikipedia\n", "\n", "\n", "\n", + "Appearance\n", "\n", "\n", "\n", @@ -547,6 +562,52 @@ "\n", "\n", "\n", + "Donate\n", + "\n", + "Create account\n", + "\n", + "Log in\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Personal tools\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Donate Create account Log in\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 2:\n", + "\n", + "The 2024 World Series was the championship series of Major League Baseball's (MLB) 2024 season. The 120th edition of the World Series, it was a best-of-seven playoff between the National League (NL) champion Los Angeles Dodgers and the American League (AL) champion New York Yankees. It was the Dodgers' first World Series appearance and win since 2020, and the Yankees' first World Series appearance since 2009. The series began on October 25 and ended on October 30 with the Dodgers winning in five\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 3:\n", + "\n", + "MLB World Series 2024 Prize Money: How much will the MLB champions make?The MLB is about to end its 2024 season Only the World Series between the New York Yankees and the Los Angeles Dodgers remains ahead.The New York Yankees were crowned in the East Division and achieved the best record in the American League. The Cleveland Guardians took the Central, while the Houston Astros won the West Division. The Baltimore Orioles, the Detroit Tigers, and the Kansas City Royals were awarded the wild card tickets in\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 4:\n", + "\n", + "Dodgers beat Yankees to win 2024 World Series\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 5:\n", + "\n", + "2024 Major League Baseball season - Wikipedia\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", "\n", "\n", "\n", @@ -689,11 +750,7 @@ "\n", "\n", "\n", - "Donate Create account Log in\n", - "----------------------------------------------------------------------------------------------------\n", - "Document 5:\n", - "\n", - "Dodgers secure their 2024 World Series win over the Yankees\",\"displayAsVideoGif\":false,\"duration\":\"00:01:30\",\"slug\":\"radio-call-of-dodgers-world-series-victory\",\"tags\":[{\"__typename\":\"GameTag\"},{\"__typename\":\"TeamTag\",\"slug\":\"teamid-119\",\"title\":\"Los Angeles Dodgers\",\"team\":{\"__ref\":\"Team:119\"},\"type\":\"team\"},{\"__typename\":\"PersonTag\",\"slug\":\"playerid-621111\",\"title\":\"Walker\n" + "Donate Create account Log in\n" ] } ], @@ -711,7 +768,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 26, "id": "ab480021-1266-45c9-b4ba-45d4c22dd5bc", "metadata": {}, "outputs": [ @@ -742,9 +799,9 @@ ], "metadata": { "kernelspec": { - "display_name": "rag_dataloaders", + "display_name": "Python 3", "language": "python", - "name": "rag_dataloaders" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -756,7 +813,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.12.5" } }, "nbformat": 4,