diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..b403da38 --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +# Backup files +~* + +# Python build / packaging +build/ +dist/ +__pycache__/ +llmware.egg-info/ +venv/ + +# Mac +.DS_Store diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..290235ea --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity +and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +info@aibloks.com. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1.
Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or +permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, available at +https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at +https://www.contributor-covenant.org/translations. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..c4984b2d --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,57 @@ +# Contributing to llmware +Contributions to `llmware` are welcome from everyone. Our goal is to make the process as simple, transparent, and straightforward as possible. + +The following are a set of guidelines for contributing to `llmware`. As with everything in the project, the contributions to `llmware` are governed by our [Code of Conduct](https://github.com/llmware-ai/llmware/blob/main/CODE_OF_CONDUCT.md). + +## How can you contribute? + +### Contributing code + +**If you encounter a bug, you can** + +- File an issue about the bug. +- Provide clear and concrete ways/scripts to reproduce the bug. +- Provide possible solutions for the bug. +- Pull a request to fix the bug. + +**If you're interested in existing issues, you can** + +- Provide answers for questions in our [github discussions](https://github.com/llmware-ai/llmware/discussions) +- Provide help for bug or enhancement issues. + - Ask questions, reproduce the issue, or provide solutions. + - Pull a request to fix the issue. 
+ +**If you'd like to contribute a new feature or significantly change an existing one, you can** + +- Start a discussion with us in our [github discussions](https://github.com/llmware-ai/llmware/discussions). + +**If you want to become a contributor to llmware, submit your pull requests!!** + +- For those just getting started, see [GitHub workflow](https://github.com/llmware-ai/llmware/blob/main/CONTRIBUTING.md#github-workflow) below. +- All submissions will be reviewed as quickly as possible. +- Once the review is complete, your PR will be merged into the main branch. + +**If you believe you've found a security vulnerability** + +Please _do not_ submit an issue ticket or pull request or otherwise publicly disclose the issue. Follow the process at [Reporting a Vulnerability](https://github.com/llmware-ai/llmware/blob/main/SECURITY.md). + +### GitHub workflow + +Generally, we follow the "fork-and-pull" Git workflow. + +1. [Fork](https://docs.github.com/en/github/getting-started-with-github/fork-a-repo) the repository on GitHub. +2. Clone your fork to your local machine with `git clone git@github.com:<your-username>/llmware.git`. +3. Create a branch with `git checkout -b my-topic-branch`. +4. [Commit](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/committing-changes-to-a-pull-request-branch-created-from-a-fork) changes to your own branch, then push to GitHub with `git push origin my-topic-branch`. +5. Submit a [pull request](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests) so that we can review your changes. + +Remember to [sync your forked repository](https://docs.github.com/en/github/getting-started-with-github/fork-a-repo#keep-your-fork-synced) _before_ submitting proposed changes upstream. If you have an existing local repository, please update it before you start, to minimize the chance of merge conflicts. + +```shell +git remote add upstream git@github.com:llmware-ai/llmware.git +git fetch upstream +git checkout upstream/main -b my-topic-branch +``` + +### Do you have questions or just want to bounce around an idea? +Questions and discussions are welcome in our [github discussions](https://github.com/llmware-ai/llmware/discussions)! diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License.
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/NOTICE b/NOTICE new file mode 100644 index 00000000..cb2f319e --- /dev/null +++ b/NOTICE @@ -0,0 +1,92 @@ + +Copyright 2023, llmware + +This software contains links to the llmware public model repository. Models in this repository are licensed under the Apache License 2.0. 
(https://www.apache.org/licenses/LICENSE-2.0) + +This software contains code copied from, derived from or inspired by Nils Reimers and the UKP Lab Sentence Transformers Model. (https://github.com/UKPLab/sentence-transformers) +Copyright 2019 Nils Reimers + +This software contains code copied from, derived from or inspired by the PyTorch BERT model. (https://github.com/huggingface/transformers) +Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + +This software contains code copied from, derived from or inspired by the Huggingface transformers generation code. (https://github.com/huggingface/transformers/src/transformers/generation) +Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + +================================================================================================= + +Open-Source dependencies for the llmware package: + +3-clause BSD License (https://opensource.org/license/bsd-3-clause/) + * Software: libzip (https://libzip.org/) + * Software: lxml (https://github.com/lxml/lxml) + * Software: numpy (https://github.com/numpy/numpy) + * Software: scipy (https://github.com/scipy/scipy) + * Software: torch (https://github.com/pytorch/pytorch) + * Software: Werkzeug (https://github.com/pallets/werkzeug/) + +Apache License 2.0 (https://www.apache.org/licenses/LICENSE-2.0) + * Software: boto3 (https://github.com/boto/boto3) + * Software: google-cloud-aiplatform (https://github.com/googleapis/python-aiplatform) + * Software: mongo-c-driver (https://github.com/mongodb/mongo-c-driver) + * Software: pymilvus (https://github.com/milvus-io/pymilvus) + * Software: pymongo (http://github.com/mongodb/mongo-python-driver) + * Software: pytesseract (https://github.com/madmaze/pytesseract) + * Software: tesseract (https://github.com/tesseract-ocr/tesseract) + * Software: tokenizers (https://github.com/huggingface/tokenizers) + * Software: yfinance (https://github.com/ranaroussi/yfinance) + +GNU GENERAL PUBLIC LICENSE 3.0 (https://www.gnu.org/licenses/gpl-3.0.html#license-text) + * Software: poppler (https://poppler.freedesktop.org/) + +Historical Permission Notice and Disclaimer (HPND) (https://spdx.org/licenses/HPND) + * Software: pillow (https://github.com/python-pillow/Pillow) + +libtiff License (https://spdx.org/licenses/libtiff.html) + * Software: libtiff (http://www.libtiff.org/) + +MIT License (https://opensource.org/license/mit/) + * Software: ai21 (https://pypi.org/project/ai21/) + * Software: anthropic (https://github.com/anthropics/anthropic-sdk-python) + * Software: beautifulsoup4 (https://pypi.org/project/beautifulsoup4/) + * Software: cohere (https://github.com/cohere-ai/cohere-python) + * Software: faiss-cpu (https://github.com/kyamagu/faiss-wheels) + * Software: openai (https://github.com/openai/openai-python) + * Software: pdf2image (https://github.com/Belval/pdf2image) + * Software: word2number (https://github.com/akshaynagpal/w2n) + * Software: Wikipedia-API (https://github.com/martin-majlis/Wikipedia-API) + +PNG Reference Library version 2 (https://spdx.org/licenses/libpng-2.0.html) + * Software: libpng (http://www.libpng.org/pub/png/libpng.html) + +================================================================================================= + +Citations for Open Source Software and Research used in the development of llmware: + +NumPy +Harris, C.R., Millman, K.J., van der Walt,
S.J. et al. Array programming with NumPy. Nature 585, 357–362 (2020). + +Tokenizers +Moi, A., & Patry, N. (2023). HuggingFace's Tokenizers (Version 0.13.4) [Computer software]. https://github.com/huggingface/tokenizers + +Torch +Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., Desmaison, A., Kopf, A., Yang, E., DeVito, Z., Raison, M., Tejani, A., Chilamkurthy, S., Steiner, B., Fang, L., Bai, J., & Chintala, S. (2019). PyTorch: An Imperative Style, High-Performance Deep Learning Library [Conference paper]. Advances in Neural Information Processing Systems 32, 8024–8035. http://papers.neurips.cc/paper/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf + +BERT +Turc, Iulia; Chang, Ming-Wei; Lee, Kenton; Toutanova, Kristina. Well-Read Students Learn Better: On the Importance of Pre-training Compact Models, arXiv preprint arXiv:1908.08962v2 (2019). + +Devlin, Jacob; Chang, Ming-Wei; Lee, Kenton; Toutanova, Kristina. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding, arXiv preprint arXiv:1810.04805 (2018). + +GPT2 +Radford, Alec; Wu, Jeff; Child, Rewon; Luan, David; Amodei, Dario; Sutskever, Ilya. Language Models are Unsupervised Multitask Learners. (2019) + +Roberta +Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. RoBERTa: A Robustly Optimized BERT Pretraining Approach. CoRR, abs/1907.11692, 2019. http://arxiv.org/abs/1907.11692. + +Sentence-BERT +Reimers, Nils and Gurevych, Iryna. Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing. November 2019. Association for Computational Linguistics. https://arxiv.org/abs/1908.10084. + +Huggingface Transformers +Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P., Ma, C., Jernite, Y., Plu, J., Xu, C., Le Scao, T., Gugger, S., Drame, M., Lhoest, Q., & Rush, A. M. (2020). Transformers: State-of-the-Art Natural Language Processing [Conference paper]. 38–45. https://www.aclweb.org/anthology/2020.emnlp-demos.6 + diff --git a/README.md b/README.md index 44808e74..3a1f03b7 100644 --- a/README.md +++ b/README.md @@ -1 +1,266 @@ -# llmware \ No newline at end of file +# llmware +![Static Badge](https://img.shields.io/badge/python-3.9_%7C_3.10-blue?color=blue) +![PyPI - Version](https://img.shields.io/pypi/v/llmware?color=blue) +![PyPI - Downloads](https://img.shields.io/pypi/dw/llmware?color=blue) + + +`llmware` is a unified, open, extensible framework for LLM-based application patterns including Retrieval Augmented Generation (RAG). This project provides a comprehensive set of tools that anyone can use – from beginner to the most sophisticated AI developer – to rapidly build industrial-grade enterprise LLM-based applications. + +With `llmware`, our goal is to contribute to and help catalyze an open community around the new combination of open, extensible technologies being assembled to accomplish fact-based generative workflows. + +## 🎯 Key features +`llmware` is an integrated framework comprised of four major components: +
+Retrieval: Assemble fact-sets + +- A comprehensive set of querying methods: semantic, text, and hybrid retrieval with integrated metadata. +- Ranking and filtering strategies to enable semantic search and rapid retrieval of information. +- Web scrapers, Wikipedia integration, and Yahoo Finance API integration as additional tools to assemble fact-sets for generation. +
+ + +
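As a quick illustration of the retrieval tools described above, here is a minimal sketch based on the Quick Start and examples referenced later in this README. The library name and query string are placeholders, and the semantic query assumes an embedding model has already been installed for the library:

```python
import os
from llmware.library import Library
from llmware.retrieval import Query
from llmware.setup import Setup

# Build a small library from the llmware sample files (placeholder library name)
library = Library().create_new_library("retrieval_demo")
sample_files_path = Setup().load_sample_files()
library.add_files(os.path.join(sample_files_path, "SmallLibrary"))

# Text query against the library's text index
text_results = Query(library).query("salary")

# Semantic query - assumes vector embeddings were installed first (see Getting Started below)
library.install_new_embedding(embedding_model_name="mini-lm-sbert", vector_db="milvus")
semantic_results = Query(library).semantic_query(query="salary", result_count=3, results_only=True)

for result in semantic_results:
    print(result["file_source"], result["page_num"], result["text"])
```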
+Prompt: Tools for sophisticated generative scenarios + +- **Connect Models:** Open interface designed to support AI21, Ai Bloks READ-GPT, Anthropic, Cohere, HuggingFace Generative models, OpenAI. +- **Prepare Sources:** Tools for packaging and tracking a wide range of materials into model context window sizes. Sources include files, websites, audio, AWS Transcribe transcripts, Wikipedia and Yahoo Finance. +- **Prompt Catalog:** Dynamically configurable prompts to experiment with multiple models without any change in the code. +- **Post Processing:** A full set of metadata and tools for evidence verification, classification of a response, and fact-checking. +- **Human in the Loop:** Ability to enable user ratings, feedback, and corrections of AI responses. +- **Auditability:** A flexible state mechanism to capture, track, analyze and audit the LLM prompt lifecycle. +
+ +
+Vector Embeddings: swappable embedding models and vector databases + +- Custom trained sentence transformer embedding models and support for embedding models from Cohere, Google, HuggingFace Embedding models, and OpenAI. +- Mix-and-match among multiple options to find the right solution for any particular application. +- Out-of-the-box support for 3 vector databases - Milvus, FAISS, and Pinecone. +
+ +
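As a quick illustration of the mix-and-match idea described above, here is a minimal sketch using the embedding model and vector database names that appear elsewhere in this README ("Agreements" is the library built in the Quick Start below):

```python
from llmware.library import Library

# Load a library that has already been created and parsed (see the Quick Start below)
library = Library().load_library("Agreements")

# The same call installs embeddings for any supported model / vector database pairing
library.install_new_embedding(embedding_model_name="industry-bert-contracts", vector_db="milvus")

# Swap in a different embedding model or vector database without changing the rest of the workflow
library.install_new_embedding(embedding_model_name="mini-lm-sbert", vector_db="faiss")
```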
+Parsing and Text Chunking: Prepare your data for RAG + +* Parsers for: PDF, PowerPoint, Word, Excel, HTML, Text, WAV, AWS Transcribe transcripts. +* A complete set of text-chunking tools to separate information and associated metadata to a consistent block format. +
+ +Explore [additional llmware capabilities](https://github.com/llmware-ai/llmware/blob/main/examples/README.md) +## 🌱 Getting Started + +### 1. Install llmware: + +```bash +python3 -m pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ llmware +``` +See [Working with llmware](#%EF%B8%8F-working-with-the-llmware-github-repository) for other options to get up and running. + +### 2. MongoDB and Milvus + +MongoDB and Milvus are optional and used to provide production-grade database and vector embedding capabilities. The fastest way to get started is to use the provided Docker Compose file which takes care of running them both: +```bash +curl -o docker-compose.yaml https://raw.githubusercontent.com/llmware-ai/llmware/main/mongodb-milvus-compose.yaml +``` +and then run the containers: +```bash +docker compose up -d +``` +Not ready to install MongoDB or Milvus? Check out what you can do without them in our [examples section](https://github.com/llmware-ai/llmware/blob/main/examples/README.md#using-llmware-without-mongodb-or-an-embedding-database). + +See [Running MongoDB and Milvus](#%EF%B8%8F-alternate-options-for-running-mongodb-and-milvus) for other options to get up and running with these optional dependencies. + +### 3. 🔥 Start coding - Quick Start For RAG 🔥 +```python +# This example demonstrates Retrieval Augmented Generation (RAG): +import os +from llmware.library import Library +from llmware.retrieval import Query +from llmware.prompts import Prompt +from llmware.setup import Setup + +# Update this value with your own API Key, either by setting the env var or editing it directly here: +openai_api_key = os.environ["OPENAI_API_KEY"] + +# A self-contained end-to-end example of RAG +def end_to_end_rag(): + + # Create a library called "Agreements", and load it with llmware sample files + print (f"\n > Creating library 'Agreements'...") + library = Library().create_new_library("Agreements") + sample_files_path = Setup().load_sample_files() + library.add_files(os.path.join(sample_files_path,"Agreements")) + + # Create vector embeddings for the library using the "industry-bert-contracts" model and store them in Milvus + print (f"\n > Generating vector embeddings using embedding model: 'industry-bert-contracts'...") + library.install_new_embedding(embedding_model_name="industry-bert-contracts", vector_db="milvus") + + # Perform a semantic search against our library.
This will gather evidence to be used in the LLM prompt + print (f"\n > Performing a semantic query...") + os.environ["TOKENIZERS_PARALLELISM"] = "false" # Avoid a HuggingFace tokenizer warning + query_results = Query(library).semantic_query("Termination", result_count=20) + + # Create a new prompter using GPT-4 and add the query_results captured above + prompt_text = "Summarize the termination provisions" + print (f"\n > Prompting LLM with '{prompt_text}'") + prompter = Prompt().load_model("gpt-4", api_key=openai_api_key) + sources = prompter.add_source_query_results(query_results) + + # Prompt the LLM with the sources and a query string + responses = prompter.prompt_with_source(prompt_text, prompt_name="summarize_with_bullets") + for response in responses: + print ("\n > LLM response\n" + response["llm_response"]) + + # Finally, generate a CSV report that can be shared + print (f"\n > Generating CSV report...") + report_data = prompter.send_to_human_for_review() + print ("File: " + report_data["report_fp"] + "\n") + +end_to_end_rag() +``` +#### Response from end-to-end RAG example + +``` +> python examples/rag.py + + > Creating library 'Agreements'... + + > Generating vector embeddings using embedding model: 'industry-bert-contracts'... + + > Performing a semantic query... + + > Prompting LLM with 'Summarize the termination provisions' + + > LLM response +- Employment period ends on the first occurrence of either the 6th anniversary of the effective date or a company sale. +- Early termination possible as outlined in sections 3.1 through 3.4. +- Employer can terminate executive's employment under section 3.1 anytime without cause, with at least 30 days' prior written notice. +- If notice is given, the executive is allowed to seek other employment during the notice period. + + > Generating CSV report... +File: /Users/llmware/llmware_data/prompt_history/interaction_report_Fri Sep 29 12:07:42 2023.csv +``` +#### See additional [llmware examples](https://github.com/llmware-ai/llmware/blob/main/examples/README.md) for more code samples and ideas. + +### 4. Accessing LLMs and setting up API keys & secrets +To get started with a proprietary model, you need to provide your own API Keys. If you don't yet have one, more information can be found at: [AI21](https://docs.ai21.com/docs/quickstart), [Ai Bloks](https://www.aibloks.com/contact-us), [Anthropic](https://docs.anthropic.com/claude/reference/getting-started-with-the-api), [Cohere](https://cohere.com/), [Google](https://cloud.google.com/vertex-ai/docs/generative-ai/start/quickstarts/api-quickstart), [OpenAI](https://help.openai.com/en/collections/3675940-getting-started-with-openai-api). + +API keys and secrets for models, AWS, and Pinecone can be set up for use in environment variables or managed however you prefer. + +You can also access the `llmware` public model repository which includes out-of-the-box custom trained sentence transformer embedding models fine-tuned for the following industries: Insurance, Contracts, Asset Management, SEC. These domain specific models along with llmware's generative BLING model series ("Best Little Instruction-following No-GPU-required") are available at [llmware on Huggingface](https://huggingface.co/llmware). Explore using the model repository and the `llmware` Huggingface integration in [llmware examples](https://github.com/llmware-ai/llmware/blob/main/examples/README.md). + + +## 🔹 Alternate options for running MongoDB and Milvus + +There are several options for getting MongoDB running +
+🐳 A. Run mongo container with docker + + ```bash +docker run -d -p 27017:27017 -v mongodb-volume:/data/db --name=mongodb mongo:latest +``` +
+ +
+🐳 B. Run container with docker compose + + Create a _docker-compose.yaml_ file with the content: +```yaml +version: "3" + +services: + mongodb: + container_name: mongodb + image: 'mongo:latest' + volumes: + - mongodb-volume:/data/db + ports: + - '27017:27017' + +volumes: + mongodb-volume: + driver: local +``` +and then run: +```bash +docker compose up +``` +
+ +
+📖 C. Install MongoDB natively + +See the [Official MongoDB Installation Guide](https://www.mongodb.com/docs/manual/installation/) + +
+ +## ✍️ Working with the llmware Github repository + +The llmware repo can be pulled locally to get access to all the examples, or to work directly with the llmware code + +### Pull the repo locally + +```bash +git clone git@github.com:llmware-ai/llmware.git +``` +or download/extract a [zip of the llmware repository](https://github.com/llmware-ai/llmware/archive/refs/heads/main.zip) + +### Other options for running llmware - TODO UPDATE THESE INSTRUCTIONS + +
+Run llmware in a container + + ```bash +TODO insert command for pulling the container here +``` +
+ +
+Run llmware natively + +At the top level of the llmware repository run the following command: + +```bash +pip install . +``` + +
+ +## ✨ Getting help or sharing your ideas with the community +Questions and discussions are welcome in our [github discussions](https://github.com/llmware-ai/llmware/discussions). + +Interested in contributing to llmware? We welcome involvement from the community to extend and enhance the framework! +- 💡 What's your favorite model or is there one you'd like to check out in your experiments? +- 💡 Have you had success with a different embedding database? +- 💡 Is there a prompt that shines in a RAG workflow? + +Information on ways to participate can be found in our [Contributors Guide](https://github.com/llmware-ai/llmware/blob/main/CONTRIBUTING.md#contributing-to-llmware). As with all aspects of this project, contributing is governed by our [Code of Conduct](https://github.com/llmware-ai/llmware/blob/main/CODE_OF_CONDUCT.md). + +## 📣 Release notes and Change Log + +**Supported OSs:** +- macOS +- Linux +- (Windows is a roadmap item) + +**Supported Vector Databases:** +- Milvus +- FAISS +- Pinecone + +**Prerequisites:** +- [Homebrew](https://docs.brew.sh/Installation) +- [python v3.9 - 3.10](https://www.python.org/about/gettingstarted/) +- On Linux, the pip package attempts to install the native dependencies. If it is run without root permission or a package manager other than Apt is used, you will need to manually install the following native packages (see the example install command below): + * mongo-c-driver, libpng, libzip, libtiff, zlib, tesseract, poppler + +**Optional:** +- [Docker](https://docs.docker.com/get-docker/) + +
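To illustrate the manual install mentioned in the Linux prerequisite above, a single Homebrew command along the following lines covers the listed native packages. This is a sketch only; exact package names can differ across package managers and distributions:

```bash
# Illustrative only - verify package names for your package manager / distribution
brew install mongo-c-driver libpng libzip libtiff zlib tesseract poppler
```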
+ Change Log + +- **Oct 2, 2023:** 🔥 Initial release of llmware to open source!! 🔥 + +
+ diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000..f972d76a --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,11 @@ +# Security Policy + +## Reporting a Vulnerability + +If you believe you have found a security vulnerability in llmware, we encourage you to let us know right away. We will investigate all reports and do our best to quickly fix the problem. + +Please report security issues by sending an email to security@aibloks.com. + +Do __not__ submit an issue ticket or pull request or otherwise publicly disclose the issue. + +After receiving your email, we will assess respond as soon as possible and share what we plan to do. diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 00000000..03c498cf --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,82 @@ +version: "3.5" + +services: + mongodb: + container_name: mongodb + image: mongo:5.0.10 + environment: + # To secure the root account, provide credentials here + MONGO_INITDB_ROOT_USERNAME: + MONGO_INITDB_ROOT_PASSWORD: + volumes: + - llmware-mongodb:/data/db + ports: + - '27017:27017' + + etcd: + container_name: milvus-etcd + image: quay.io/coreos/etcd:v3.5.5 + environment: + - ETCD_AUTO_COMPACTION_MODE=revision + - ETCD_AUTO_COMPACTION_RETENTION=1000 + - ETCD_QUOTA_BACKEND_BYTES=4294967296 + - ETCD_SNAPSHOT_COUNT=50000 + volumes: + - llmware-etcd:/etcd + command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd + healthcheck: + test: ["CMD", "etcdctl", "endpoint", "health"] + interval: 30s + timeout: 20s + retries: 3 + + minio: + container_name: milvus-minio + image: minio/minio:RELEASE.2023-03-20T20-16-18Z + environment: + MINIO_ACCESS_KEY: minioadmin + MINIO_SECRET_KEY: minioadmin + ports: + - "9001:9001" + - "9000:9000" + volumes: + - llmware-minio:/minio_data + command: minio server /minio_data --console-address ":9001" + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + interval: 30s + timeout: 20s + retries: 3 + + milvus: + container_name: milvus + image: milvusdb/milvus:v2.3.0 + command: ["milvus", "run", "standalone"] + environment: + ETCD_ENDPOINTS: etcd:2379 + MINIO_ADDRESS: minio:9000 + volumes: + - llmware-milvus:/var/lib/milvus + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"] + interval: 30s + start_period: 90s + timeout: 20s + retries: 3 + ports: + - "19530:19530" + - "9091:9091" + depends_on: + - "etcd" + - "minio" + +volumes: + llmware-mongodb: + driver: local + llmware-etcd: + driver: local + llmware-minio: + driver: local + llmware-milvus: + driver: local + diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 00000000..4cef00ec --- /dev/null +++ b/examples/README.md @@ -0,0 +1,35 @@ +# Getting started with llmware + +| Example | Detail | +--------------|--------------| +| 1. Getting Started ([code](getting_started.py)) | Create and populate your first library, prepare the library for semantic search with vector embeddings, and run a semantic search. | +| 2. Working with LLMs ([code](working_with_llms.py)) | Connect to your favorite LLM and perform basic prompts. | +| 3. LLM Prompts ([code](llm_prompts.py)) | Prompt LLMs with various sources, explore the out-of-the-box Prompt Catalog, and use different prompt styles.| +| 4. Retrieval ([code](retrieval.py)) | Explore the breadth of retrieval capabilities and persisting, loading and saving retrieval history.| +| 5. 
RAG ([code](rag.py)) | Integrate Prompts and Retrieval using various information Sources to accomplish Retrieval Augmented Generation (RAG).| +| 6. Working with Prompts ([code](working_with_prompts.py)) | Inspection of Prompt history which is useful in AI Audit scenarios.| +| 7. Parsing ([code](parsing.py)) | Ingest at scale into a library and 'at runtime' into any Prompt.| +| 8. Embedding ([code](embedding.py)) | Simple access to multiple embedding models and vector DBs (“mix and match”). | +| 9. Huggingface Integration ([code](huggingface_integration.py)) | How to bring your favorite HF model into llmware seamlessly. Customize a generative model with weights from a custom fine-tuned model. | +| 10. `llmware` BLING model ([code](llmware_bling.py)) | Experiment with RAG scenarios using ~1B parameter GPT models that can run on your laptop. BLING models are fine-tuned for common RAG scenarios, specifically: question-answering, key-value extraction, and basic summarization. | +| 11. Knowledge Graph ([code](knowledge_graph.py)) | Generate scalable, statistical NLP artifacts - knowledge graphs & document graphs. | +| 12. Datasets ([code](datasets.py)) | Dataset generation streamlined for fine-tuning generative and embedding models and formats such as Alpaca, ChatGPT, Human-Bot. | +| 13. Working without Databases ([code](working_without_a_database.py))| Parse, Prompt and generate Datasets from Prompt history without installing MongoDB or a vector database.| +| 14. Working with Libraries ([code](working_with_libraries.py)) | Explore all Library operations. | + +# Using llmware without MongoDB or an embedding database +You can do some interesting things using `llmware` without a database or vector embeddings. Parsing can be done in memory and outputted to text or json. Prompts can be crafted with sources from files, Wikipedia or the Yahoo Finance API. The [Working without a Database](working_without_a_database.py), [LLM Prompts](llm_prompts.py), and [Parsing](parsing.py) examples show scenarios that can be accomplished, and throughout the examples are specific methods that do not require MongoDB or embeddings. + +# Additional llmware capabilities +- The `llmware` public model repository with out-of-the-box custom trained sentence transformer embedding models fine-tuned for the following industries: Insurance, Contracts, Asset Management, SEC. These domain specific models along with `llmware`'s generative BLING model series ("Best Little Instruction-following No-GPU-required") are available at [llmware on Huggingface](https://huggingface.co/llmware). Explore their use in the [Embedding](embedding.py), [Huggingface Integration](huggingface_integration.py), and [`llmware` BLING model](llmware_bling.py) examples. + +- Create knowledge graphs with a high-powered and fast C-based co-occurrence table matrix builder, the output of which can feed NLP statistics as well as potentially graph databases. Explore the [Knowledge Graph](knowledge_graph.py) example. + +- Generate datasets for fine-tuning both generative and embedding models. llmware uses sophisticated data-crafting strategies, leveraging the data captured throughout the system. Explore the [Datasets](datasets.py) example. + +- Library is the simple, flexible, unifying construct in llmware to assemble and normalize parsed text chunks, and is linked to both a text search index, and an open platform of embedding models and vector databases. Explore the [Working with Libraries](working_with_libraries.py) example.
+ +- The llmware parsers follow a consistent 27 key metadata dictionary, so that you can extract the same information from a PDF as a PowerPoint or Text file. The parsers generally extract images, tables, and all available document metadata. There is a complete set of text chunking tools to parse a batch of documents (across multiple formats) and chunk and store in consistent format in a document store. Explore the [Parsing](parsing.py) example. + +- All data artifacts are published in standard formats – json, txt files, pytorch_model.bin files, and fully portable and exportable to any platform. + diff --git a/examples/datasets.py b/examples/datasets.py new file mode 100644 index 00000000..0bdeb1ce --- /dev/null +++ b/examples/datasets.py @@ -0,0 +1,87 @@ +''' This example demonstrates creating and using datasets + 1. Datasets suitable for fine tuning embedding models + 2. Completion and other types of datasets + 3. Generating datasets from all data in a library or with filtered data + 4. Creating datasets from AWS Transcribe transcripts +''' +import json +import os +from llmware.util import Datasets +from llmware.library import Library +from llmware.retrieval import Query +from llmware.parsers import Parser +from llmware.setup import Setup + +def build_and_use_dataset(library_name): + + # Setup a library and build a knowledge graph. Datasets will use the data in the knowledge graph + print (f"\n > Creating library {library_name}...") + library = Library().create_new_library(library_name) + sample_files_path = Setup().load_sample_files() + library.add_files(os.path.join(sample_files_path,"SmallLibrary")) + library.generate_knowledge_graph() + + # Create a Datasets object + datasets = Datasets(library) + + # Build a basic dataset useful for industry domain adaptation for finetuning embedding models + print (f"\n > Building basic text dataset...") + basic_embedding_dataset = datasets.build_text_ds(min_tokens=500, max_tokens=1000) + dataset_location = os.path.join(library.dataset_path, basic_embedding_dataset["ds_id"]) + print (f"\n > Dataset:") + print (f"(Files referenced below are found in {dataset_location})") + print (f"\n{json.dumps(basic_embedding_dataset, indent=2)}") + sample = datasets.get_dataset_sample(datasets.current_ds_name) + print (f"\nRandom sample from the dataset:\n{json.dumps(sample, indent=2)}") + + # Other Dataset Generation and Usage Examples: + + # Build a simple self-supervised generative dataset- extracts text and splits into 'text' & 'completion' + # Several generative "prompt_wrappers" are available - chat_gpt | alpaca | + basic_generative_completion_dataset= datasets.build_gen_ds_targeted_text_completion(prompt_wrapping="alpaca") + + # Build a generative self-supervised training sets created by pairing 'header_text' with 'text' + xsum_generative_completion_dataset = datasets.build_gen_ds_headline_text_xsum(prompt_wrapping="human_bot") + topic_prompter_dataset = datasets.build_gen_ds_headline_topic_prompter(prompt_wrapping="chat_gpt") + + # Filter a library by a key term as part of building the dataset + filtered_dataset = datasets.build_text_ds(query="agreement", filter_dict={"master_index":1}) + + # Pass a set of query results to create a dataset from those results only + query_results = Query(library=library).query("africa") + query_filtered_dataaset = datasets.build_text_ds(min_tokens=250,max_tokens=600, qr=query_results) + + # Images with text dataset + images_with_text_dataset = datasets.build_visual_ds_image_labels() + + +def 
build_aws_transcribe_datasets(library_name): + + # Setup a library and build a knowledge graph. Datasets will use the data in the knowledge graph + print (f"\n > Creating library {library_name}...") + library = Library().create_new_library(library_name) + sample_files_path = Setup().load_sample_files() + library.add_dialogs(os.path.join(sample_files_path,"AWS-Transcribe")) + library.generate_knowledge_graph() + + # Create a Datasets object + datasets = Datasets(library) + + # Build generative conversation dataset + print (f"\n > Building generative conversation dataset...") + generative_conversation_dataset = datasets.build_gen_dialog_ds(prompt_wrapping="human_bot",human_first=True) + dataset_location = os.path.join(library.dataset_path, generative_conversation_dataset["ds_id"]) + print (f"\n > Dataset:") + print (f"(Files referenced below are found in {dataset_location})") + print (f"\n{json.dumps(generative_conversation_dataset, indent=2)}") + sample = datasets.get_dataset_sample(datasets.current_ds_name) + print (f"\nRandom sample from the dataset:\n{json.dumps(sample, indent=2)}") + + # Other Dataset Generation and Usage Examples: + + # Build generative model fine-tuning dataset from llm prompt state history + # supports 3 popular formats - alpaca, chatgpt, and 'human_bot' + generative_curated_dataset = datasets.build_gen_ds_from_prompt_history(prompt_wrapping="alpaca") + +build_and_use_dataset("test_txt_datasets") +build_aws_transcribe_datasets("aws_transcribe_datasets") \ No newline at end of file diff --git a/examples/embedding.py b/examples/embedding.py new file mode 100644 index 00000000..8cbcff1d --- /dev/null +++ b/examples/embedding.py @@ -0,0 +1,40 @@ +''' This example demonstrates creating vector embeddings (used for doing semantic queries) + Note: Pinecone is not used in the example below as it requires an API key. If you have a Pinecone account, you can set these two variables: + os.environ["USER_MANAGED_PINECONE_API_KEY"] = + os.environ["USER_MANAGED_PINECONE_ENVIRONMENT"] = (for example "gcp-starter") +''' +import os +from llmware.library import Library +from llmware.retrieval import Query +from llmware.setup import Setup + +# Generate vector embeddings and store them in each of Milvus and FAISS +# This is only to demonstrate how to work with the various embedding DBs. Typically, you'd pick just one +def generate_vector_embeddings(library_name): + + # Create and populate a library + print (f"\n > Creating and populating library: {library_name}...") + library = Library().create_new_library(library_name) + sample_files_path = Setup().load_sample_files() + library.add_files(input_folder_path=os.path.join(sample_files_path, "SmallLibrary")) + + # To create vector embeddings you just need to specify the embedding model and the vector embedding DB + # For examples of using HuggingFace and SentenceTransformer models, see those examples in this same folder + vector_dbs = ["milvus", "faiss"] + embedding_model = "mini-lm-sbert" + for vector_db in vector_dbs: + print (f"\n > Generating embedding vectors and storing in '{vector_db}'...") + library.install_new_embedding(embedding_model_name=embedding_model, vector_db=vector_db) + + # Then when doing semantic queries, the most recent vector DB used for embeddings will be used.
+ # We just find the best 3 hits for "Salary" + query = Query(library,embedding_model_name=embedding_model) + print (f"\n > Running a query for 'Salary'...") + query_results = Query(library).semantic_query(query="Salary", result_count=3, results_only=True) + print (query_results) + print (f"\n\nHits for 'Salary' in '{library_name}':\n") + for query_result in query_results: + print("File: " + query_result["file_source"] + " (Page " + str(query_result["page_num"]) + "):\n" + query_result["text"] + "\n") + +generate_vector_embeddings("embedding_tests") + diff --git a/examples/getting_started.py b/examples/getting_started.py new file mode 100644 index 00000000..0b2710cf --- /dev/null +++ b/examples/getting_started.py @@ -0,0 +1,32 @@ +''' This example demonstrates: + 1. Creating your first library + 2. Adding some files to it + 3. Generating vector embeddings and storing them in Milvus + 4. Doing a semantic query +''' + +import os +from llmware.library import Library +from llmware.retrieval import Query +from llmware.setup import Setup + +library_name="getting_started" + +print (f"\n > Creating library {library_name}...") +library = Library().create_new_library(library_name) + +print (f"\n > Loading the llmware Sample Files...") +sample_files_path = Setup().load_sample_files() + +print (f"\n > Adding some files to the library...") +library.add_files(input_folder_path=os.path.join(sample_files_path, "SmallLibrary")) + +print (f"\n > Generating embedding vectors (using the 'mini-lm-sbert' model) and storing them (using 'Milvus')...") +library.install_new_embedding(embedding_model_name="mini-lm-sbert", vector_db="milvus") + +print (f"\n > Running a query for 'Salary'...") +query_results = Query(library).semantic_query(query="Salary", result_count=3, results_only=True) +print (query_results) +print (f"\n\nHits for 'Salary' in {library_name}:\n") +for query_result in query_results: + print("File: " + query_result["file_source"] + " (Page " + str(query_result["page_num"]) + "):\n" + query_result["text"] + "\n") diff --git a/examples/huggingface_integration.py b/examples/huggingface_integration.py new file mode 100644 index 00000000..213ac735 --- /dev/null +++ b/examples/huggingface_integration.py @@ -0,0 +1,193 @@ +''' This example demonstrates the use of HuggingFace models + 1. Use llmware models available on HuggingFace for generating vector embeddings + 2. Load a basic decoder generative model from Huggingface and use it + 3. Customizing a generative model with weights from a custom fine-tuned model + 4. Using a Transformers model for embedding + 5. 
Using a SentenceTransformers model for embedding +''' + +import os +import torch +from llmware.configs import LLMWareConfig +from llmware.library import Library +from llmware.retrieval import Query +from llmware.models import ModelCatalog, HFEmbeddingModel +from llmware.prompts import Prompt +from llmware.setup import Setup +from llmware.util import CloudBucketManager + +try: + from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM +except ImportError: + raise ImportError ( + "This example requires classes from the 'transformers' Python package" + "You can install it with 'pip install transformers'" + ) +try: + from sentence_transformers import SentenceTransformer +except ImportError: + raise ImportError ( + "This example requires classes from the 'sentence-transformers' Python package" + "You can install it with 'pip install sentence-transformers'" + ) + +# Load an llmware model from Hugging Face and generate vector embeddings which you can store in whatever vector DB you like +# See the embeddings.py and other examples for how to use llmware's supported vector DBs and perform semantic queries +def use_llmware_hf_models_for_embedding(): + + # llmware industry models currently published on HuggingFace (more will be coming!) + llmware_industry_models = ["llmware/industry-bert-sec-v0.1", + "llmware/industry-bert-asset-management-v0.1", + "llmware/industry-bert-contracts-v0.1", + "llmware/industry-bert-insurance-v0.1"] + + # Choose one + hf_model_name = "llmware/industry-bert-sec-v0.1" + + # Load the model using the Transformer classes and then into llmware using an HFEmbeddingModel + print (f"\n > Loading model '{hf_model_name}'from HuggingFace...") + hf_tokenizer = AutoTokenizer.from_pretrained(hf_model_name) + hf_model = AutoModel.from_pretrained(hf_model_name) + llmware_model = HFEmbeddingModel(model=hf_model, tokenizer=hf_tokenizer,model_name=hf_model_name) + + # Generate an vector embedding + sample = "This is a sample sentence" + vector_embedding = llmware_model.embedding(sample) + print (f"\n > Generating a vector embedding for: '{sample}'\n\n{vector_embedding}") + +# Load a basic decoder generative model from Huggingface and use it +def load_and_use_decoder_generative_model(): + + # These are some good 'off-the-shelf' smaller testing generative models from HuggingFace + hf_model_testing_list = ["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b", + "EleutherAI/pythia-70m-v0", "EleutherAI/pythia-160m-v0", "EleutherAI/pythia-410m-v0", + "EleutherAI/pythia-1b-v0", "EleutherAI/pythia-1.4b-v0"] + + # Here we'll just select one of the above models + model_name = hf_model_testing_list[5] + + # Load the model using the Transformer classes + print (f"\n > Loading model '{model_name}'from HuggingFace...") + hf_model = AutoModelForCausalLM.from_pretrained(model_name) + hf_tokenizer = AutoTokenizer.from_pretrained(model_name) + + # Bring the model into llware. 
These models were not trained on instruction following, + # so we set instruction_following to False + model = ModelCatalog().load_hf_generative_model(hf_model, hf_tokenizer, instruction_following=False) + + # Make a call to the model + prompt_text = "The future of artificial intelligence is likely to be" + print (f"\n > Prompting the model with '{prompt_text}'") + output = model.inference(prompt_text)["llm_response"] + print(f"\nResponse:\n{prompt_text}{output}") + + # Integrate the model into a llmware prompt + prompt_text = "As the man turned around, off in the distance, he noticed" + print (f"\n > Prompting the model with '{prompt_text}'") + prompter = Prompt(llm_model=hf_model,tokenizer=hf_tokenizer,from_hf=True) + output = prompter.prompt_main(prompt_text)["llm_response"] + print(f"\nResponse:\n{prompt_text}{output}") + +# Load a HuggingFace generative model and override the weights to use a custom user-developed fine-tuned model +def override_generative_model_weights_with_custom_fine_tuned_model(): + + # These are some good 'off-the-shelf' smaller testing generative models from HuggingFace + hf_model_testing_list = ["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b", + "EleutherAI/pythia-70m-v0", "EleutherAI/pythia-160m-v0", "EleutherAI/pythia-410m-v0", + "EleutherAI/pythia-1b-v0", "EleutherAI/pythia-1.4b-v0"] + + # Select a model + model_name = "EleutherAI/pythia-410m-v0" + + # Load the model using the Transformer classes + print (f"\n > Loading model '{model_name}'from HuggingFace...") + hf_model = AutoModelForCausalLM.from_pretrained(model_name) + hf_tokenizer = AutoTokenizer.from_pretrained(model_name) + + # Retrive the custom fine-tuned model + # Note: This is a custom model that has been developed only for testing and demonstration purposes + custom_model_name = "contracts-pythia-hf-410m-v0" + print (f"\n > Loading custom model '{custom_model_name}'from llmware...") + custom_model_path = os.path.join(LLMWareConfig.get_model_repo_path(),custom_model_name) + if not os.path.exists(custom_model_path): + CloudBucketManager().pull_single_model_from_llmware_public_repo(custom_model_name) + + # Override the hf_model default model weights with our own custom-trained weights and load it into llmware + print (f"\n > Overring model '{model_name}' to use custom-trained weights from '{custom_model_name}'...") + hf_model.load_state_dict(torch.load(os.path.join(custom_model_path,"pytorch_model.bin"), map_location=torch.device('cpu')), strict=False) + model = ModelCatalog().load_hf_generative_model(hf_model, hf_tokenizer, instruction_following=False) + + # Interact with the model + prompt_text = "According to the terms of the executive stock option plan," + print (f"\n > Prompting the model with '{prompt_text}'") + output = model.inference(prompt_text)["llm_response"] + print(f"\nResponse:\n{prompt_text}{output}") + + +# Use a Transformers model for embedding +def use_transformers_model_for_embedding(library_name, model_name): + + # Create a library and add some documents so we can do some vector embeddings + print (f"\n > Creating a library...") + library = Library().create_new_library(library_name) + sample_files_path = Setup().load_sample_files() + library.add_files(input_folder_path=os.path.join(sample_files_path, "SmallLibrary")) + + # Load the model + print (f"\n > Loading model '{model_name}'") + hf_model = AutoModel.from_pretrained(model_name) + hf_tokenizer = AutoTokenizer.from_pretrained(model_name) + + # Create vector embeddings + print (f"\n > Creating vector 
embeddings...") + library.install_new_embedding(model=hf_model, tokenizer=hf_tokenizer, from_hf=True, vector_db="faiss", batch_size=50) + + # Perform a query + query_term = "salary" + print (f"\n > Performing query for {query_term}...") + query = Query(library=library, embedding_model_name=model_name, embedding_model=hf_model, tokenizer=hf_tokenizer, from_hf=True) + query_results = query.semantic_query(query_term,result_count=3) + print (f"Top 3 Results:") + for i, result in enumerate(query_results): + file_source = result["file_source"] + page_num = result["page_num"] + text = result["text"] + print(f"\n - From {file_source} (page {page_num}):\n{text}") + +# Use a SentenceTransformers model for embedding +def use_sentence_transformers_model_for_embedding(library_name, model_name): + + # Create a library and add some documents so we can do some vector embeddings + print (f"\n > Creating a library...") + library = Library().create_new_library(library_name) + sample_files_path = Setup().load_sample_files() + library.add_files(input_folder_path=os.path.join(sample_files_path, "SmallLibrary")) + + # Load the model + print (f"\n > Loading model '{model_name}'") + sbert_model = SentenceTransformer(model_name) + + # Create vector embeddings + print (f"\n > Creating vector embeddings...") + library.install_new_embedding(model=sbert_model, embedding_model_name=model_name, + from_sentence_transformer=True, vector_db="faiss", batch_size=100) + + # Perform a query + query_term = "salary" + print (f"\n > Performing query for {query_term}...") + query= Query(library=library, embedding_model_name=model_name, embedding_model=sbert_model, + from_sentence_transformer=True) + query_results = query.semantic_query(query_term, result_count=3) + print (f"Top 3 Results:") + for i, result in enumerate(query_results): + file_source = result["file_source"] + page_num = result["page_num"] + text = result["text"] + print(f"\n - From {file_source} (page {page_num}):\n{text}") + + +use_llmware_hf_models_for_embedding() +load_and_use_decoder_generative_model() +override_generative_model_weights_with_custom_fine_tuned_model() +use_transformers_model_for_embedding("test_transformers", "bert-base-cased") +use_sentence_transformers_model_for_embedding("test_sentence_transformers", "all-distilroberta-v1") diff --git a/examples/knowledge_graph.py b/examples/knowledge_graph.py new file mode 100644 index 00000000..49c74116 --- /dev/null +++ b/examples/knowledge_graph.py @@ -0,0 +1,130 @@ +''' This example demonstrates creating and using knowledge graphs and document graphs +''' + +import os +from llmware.library import Library +from llmware.setup import Setup +from llmware.util import Graph + +# Create a library (if it doesn't already exist), add files to it +def create_and_populate_a_library(library_name): + + # Load the library or create and populate it if doesn't exist + if Library().check_if_library_exists(library_name): + # Load the library + library = Library().load_library(library_name) + else: + print (f" > Creating library {library_name}...") + # Create the library + library = Library().create_new_library(library_name) + # Load the llmware sample file repository + sample_files_path = Setup().load_sample_files() + # Add files from the "SmallLibrary" folder to library + library.add_files(os.path.join(sample_files_path,"SmallLibrary")) + # Return the library + return library + +# Just a helper method to print large lists in the following methods more cleanly +def summarize_top_10(the_object): + try: + iterator = 
iter(the_object) + except TypeError: + return the_object + + output = "" + for i, item in enumerate(iterator): + if i >= 10: + return output + output += (f"\n - {item}") + return output + +# Building a knowledge graph is easy. +# It is an analytically intensive process and can take a few minutes for larger collections +def build_and_use_knowledge_graph (library): + + # Build the knowledge graph + print (f" > Building knowledge graph for library '{library.library_name}'...") + library.generate_knowledge_graph() + + # Knowledge graph artifacts are stored in the library's /nlp folder + print (f" > Generated knowledge graph artifacts\nFrom: {library.nlp_path}:") + for file_name in os.listdir(library.nlp_path): + print (f" - {file_name}") + + # Load Graph object with my_library + graph = Graph(library) + + # Get the overall nlp stats + print (f"\n > Knowledge graph - nlp stats") + library_analytics = graph.get_library_data_stats() + for key, value in library_analytics.items(): + if key not in ['graph_top']: + if key in ['bigrams','mcw']: + print(f" - {key} (top 10 only):{summarize_top_10(value)}") + else: + print(f" - {key}: {summarize_top_10(value)}") + + # Run a pseudo query against the knowledge graph to find related terms + # These terms could be used to 'enhance' search query and weigh more heavily on related concepts + query_term = 'united nations' + print (f"\n > Knowledge graph - query for '{query_term}'") + query_results = graph.kg_query(query_term) + for key, value in query_results.items(): + print(f" - {key}: {value}") + + # Related bigrams + print (f"\n > Knowledge graph - bigrams for '{query_term}'") + bigrams = graph.kg_query_related_bigrams(query_term) + for key, value in query_results.items(): + print(f" - {key}: {value}") + + # Query counts + query_term_2 = "sustainable social development" + print (f"\n > Knowledge graph - query counts for '{query_term_2}'") + query_counts = graph.kg_query_counts(query_term_2) + print(f" - {query_counts}") + + # Export for visualization + print (f"\n > Knowledge graph - export for visualization for query '{query_term}'") + red_nodes, nodes, edges = graph.export_graph_with_query_to_visualize(10, query_term) + red_nodes = summarize_top_10(red_nodes) + nodes = summarize_top_10(nodes) + edges = summarize_top_10(edges) + print(f" - Red Nodes: {(red_nodes)}\n - Nodes (top 10 only): {nodes}\n - Edges (top 10 only): {edges}") + + # Export whole graph for visualization + print (f"\n > Knowledge graph - export for visualization for whole graph") + nodes, edges = graph.export_graph_to_visualize(10) + nodes = summarize_top_10(nodes) + edges = summarize_top_10(edges) + print(f" - Nodes (top 10 only): {nodes}\n - Edges (top 10 only): {edges}") + + # Build document graph + print (f"\n > Building document graphs...") + doc_graph = graph.doc_graph_builder() + + first_doc = doc_graph[0] + + print (f"\n > Document graph information for the 1st document in the library") + for key, value in first_doc.items(): + if key in ['last_block_in_doc', 'first_block_in_doc', 'doc_ID']: + print(f" - {key} : {value}") + continue + if key in ['context_table']: + print(f" - {key} (top 3 only):") + for i, item in enumerate(value): + if (i >= 3): + continue + print (f" - {item}") + + else: + print(f" - {key} (top 10 only):{summarize_top_10(value)}") + + + # Assemble top blocks + print (f"\n > Top blocks") + block_output = graph.assemble_top_blocks(first_doc["block_scores"],first_doc["doc_ID"], max_samples=3) + print (f"block_output:\n{block_output}") + +library = 
create_and_populate_a_library("knowledge_graph") +build_and_use_knowledge_graph(library) diff --git a/examples/llm_prompts.py b/examples/llm_prompts.py new file mode 100644 index 00000000..f5532e74 --- /dev/null +++ b/examples/llm_prompts.py @@ -0,0 +1,94 @@ +''' This example demonstrates: + 1. Prompting LLMs with different kinds of sources/context + 2. The Prompt Catalog and the use different prompt styles + + Note: This example uses OpenAI's gpt-4 LLM. See the "working_with_llms.py" for examples of connecting to other LLMs +''' +import os +from llmware.prompts import Prompt +from llmware.setup import Setup +from llmware.util import PromptCatalog + +# Update this value with your own API Key, either by setting the env var or editing it directly here: +openai_api_key = os.environ["OPENAI_API_KEY"] + +# llmware provides many out of the box prompt instructions such as yes_no, number_or_none, summarize_with_bullets,etc +def print_all_prompt_instructions(): + print (f"\n > ALL AVAILABLE PROMPT INSTRUCTIONS") + for prompt in PromptCatalog().get_all_prompts(): + print (" - " + prompt["prompt_name"]) + +# With the provided context submit the given prompt to the LLM +def simple_prompt_with_context_string(prompt, context, llm_name, api_key): + print (f"\n > SIMPLE PROMPT WITH CONTEXT STRING") + prompter = Prompt(llm_name=llm_name, llm_api_key=api_key) + response = prompter.prompt_main(prompt=prompt, context=context)["llm_response"] + print (f"- Context: {context}\n- Prompt: {prompt}\n- LLM Response:\n{response}") + +# Use an llmware prompt_instruction to submit the given prompt and prompt_instruction to the LLM +def prompt_with_prompt_instruction(prompt, context, prompt_instruction, llm_name, api_key): + print (f"\n > PROMPT WITH CONTEXT USING '{prompt_instruction}' PROMPT INSTRUCTION") + prompter = Prompt(llm_name=llm_name, llm_api_key=api_key) + response = prompter.prompt_from_catalog(prompt=prompt, context=context, prompt_name=prompt_instruction)["llm_response"] + print (f"- Context: {context}\n- Prompt: {prompt}\n- LLM Response:\n{response}") + + +# In some cases you may want to add additional configuraton. 
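+# For example, an inference_config dict can pass generation settings such as {"temperature": 0.8, "llm_max_output_len": 1000, "max_tokens": 1000} (the same settings used in the prompt_with_inference_config call at the bottom of this file).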
+def prompt_with_inference_config(prompt, context, prompt_instruction, inference_config, llm_name, api_key): + print (f"\n > PROMPT WITH CONTEXT USING '{prompt_instruction}' PROMPT INSTRUCTION") + prompter = Prompt(llm_name=llm_name, llm_api_key=api_key) + response = prompter.prompt_main(prompt=prompt, context=context, prompt_name=prompt_instruction, + inference_dict=inference_config)["llm_response"] + print (f"- Context: {context}\n- Prompt: {prompt}\n- LLM Response:\n{response}") + +# If the context you need to pass to an LLM is contained in Wikipedia you can easily add it as a source +def prompt_with_wiki_source(prompt, wiki_topic, prompt_instruction, llm_name, api_key): + print (f"\n > PROMPT WITH CONTEXT FROM WIKIPEDIA USING '{prompt_instruction}' PROMPT INSTRUCTION") + prompter = Prompt(llm_name=llm_name, llm_api_key=api_key) + prompter.add_source_wikipedia(wiki_topic, article_count=1) + response = prompter.prompt_with_source(prompt=prompt, prompt_name=prompt_instruction)[0]["llm_response"] + print (f"- Context: Wikepedia article(s) for '{wiki_topic}'\n- Prompt: {prompt}\n- LLM Response:\n{response}") + +# If the context you need to pass is in local files, you can easily add then as sources +def prompt_with_local_file_sources(prompt, local_folder, local_files, prompt_instruction, llm_name, api_key): + print (f"\n > PROMPT WITH CONTEXT FROM LOCAL FILE USING '{prompt_instruction}' PROMPT INSTRUCTION") + prompter = Prompt(llm_name=llm_name, llm_api_key=api_key) + for local_file in local_files: + prompter.add_source_document(local_folder, local_file) + response = prompter.prompt_with_source(prompt=prompt, prompt_name=prompt_instruction)[0]["llm_response"] + print (f"- Context: {local_files}\n- Prompt: {prompt}\n- LLM Response:\n{response}") + +print_all_prompt_instructions() + +simple_prompt_with_context_string( prompt = "What is my 3rd favorite type of food?", + context = "My favorite foods are Sushi, Italian and Greek", + llm_name = "gpt-4", + api_key = openai_api_key + ) + +prompt_with_prompt_instruction( prompt = "How old is my oldest sibling?", + context = "My brother is 20 years old and my sister is 1.5 times older", + prompt_instruction = "number_or_none", + llm_name = "gpt-4", + api_key = openai_api_key + ) + +prompt_with_inference_config( prompt = "Why is it difficult?", + context = "I am interested in building rockets", + prompt_instruction = "explain_child", + inference_config = {"temperature": 0.8, "llm_max_output_len": 1000, "max_tokens": 1000}, + llm_name = "gpt-4", + api_key = openai_api_key) + +prompt_with_wiki_source( prompt = "Was Barack Obama the Prime Minister of Canada?", + wiki_topic = "Barack Obama", + prompt_instruction = "yes_no", + llm_name = "gpt-4", + api_key = openai_api_key) + +prompt_with_local_file_sources( prompt = "What was the effective date of this agreement?", + local_folder = os.path.join(Setup().load_sample_files(), "SmallLibrary"), + local_files = ['Gaia EXECUTIVE EMPLOYMENT AGREEMENT.pdf'], + prompt_instruction = "just_the_facts", + llm_name = "gpt-4", + api_key = openai_api_key) \ No newline at end of file diff --git a/examples/llmware_bling.py b/examples/llmware_bling.py new file mode 100644 index 00000000..48f08b25 --- /dev/null +++ b/examples/llmware_bling.py @@ -0,0 +1,148 @@ +''' This example demonstrates using llmware's bling model: https://huggingface.co/llmware/bling-1.4b-0.1 + BLING ("Best Little Instruction-following No-GPU-required") models are fine-tuned with distilled high-quality custom instruct datasets. 
+ They are targeted at a specific subset of instruct tasks with the objective of providing a high-quality Instruct model that is 'inference-ready' + on a CPU laptop even without using any advanced quantization optimizations. +''' + +import os +import torch +from llmware.configs import LLMWareConfig +from llmware.prompts import Prompt +from llmware.models import ModelCatalog + +try: + from transformers import AutoModelForCausalLM, AutoTokenizer +except ImportError: + raise ImportError ( + "This example requires classes from the 'transformers' Python package" + "You can install it with 'pip install transformers'" + ) + +# Bling models published to date: +bling_models = ['llmware/bling-1b-0.1','llmware/bling-1.4b-0.1'] + +def use_llmware_bling(): + + # Load a llmware BLING model from HuggingFace + hf_model_name = "llmware/bling-1.4b-0.1" + print (f"\n > Loading model '{hf_model_name}'from HuggingFace...") + custom_hf_model = AutoModelForCausalLM.from_pretrained(hf_model_name) + hf_tokenizer = AutoTokenizer.from_pretrained(hf_model_name) + + # Bring the model into llmware + bling_model = ModelCatalog().load_hf_generative_model(custom_hf_model, hf_tokenizer, + instruction_following=False, prompt_wrapper="human_bot") + + # Setup a varity of test prompts with contexts + prompt_list = [ + + {"query": "What is the CEO's salary?", + "context": "The CEO has a salary of $350,000. The CFO has a salary of $285,000."}, + + {"query": "What is the stock price on Thursday?", + "context": "The stock was trading at $33 on Thursday, and is now trading at $36 on Friday."}, + + {"query": "What is Bob's age?", + "context": "John is 32 years old. Margaret is 46 years old. Bob is 61 years old."}, + + {"query": "What is the company's address?", + "context": "The company's headquarters are located at: 555 California Street, San Francisco, California 94123."}, + + {"query": "When was Biden inaugurated?", + "context": "Joe Biden's tenure as the 46th president of the United States began with " + "his inauguration on January 20, 2021. Biden, a Democrat from Delaware who " + "previously served as vice president under Barack Obama, " + "took office following his victory in the 2020 presidential election over " + "Republican incumbent president Donald Trump. Upon his inauguration, he " + "became the oldest president in American history."}, + + {"query": "Who was Biden's opponent in the 2020 presidential election?", + "context": "Joe Biden's tenure as the 46th president of the United States began with " + "his inauguration on January 20, 2021. Biden, a Democrat from Delaware who " + "previously served as vice president under Barack Obama, " + "took office following his victory in the 2020 presidential election over " + "Republican incumbent president and opponent Donald Trump. Upon his inauguration, he " + "became the oldest president in American history."}, + + {"query": "What is a list of the top summary points?", + "context": "Joe Biden's tenure as the 46th president of the United States began with " + "his inauguration on January 20, 2021. Biden, a Democrat from Delaware who " + "previously served as vice president under Barack Obama, " + "took office following his victory in the 2020 presidential election over " + "Republican incumbent president Donald Trump. 
Upon his inauguration, he " + "became the oldest president in American history."}, + + {"query": "Who refused to acknowledge Biden as the winner of the election?", + "context": "Though Biden was generally acknowledged as the winner, " + "General Services Administration head Emily W. Murphy " + "initially refused to begin the transition to the president-elect, " + "thereby denying funds and office space to his team. " + "On November 23, after Michigan certified its results, Murphy " + "issued the letter of ascertainment, granting the Biden transition " + "team access to federal funds and resources for an orderly transition. " + "Two days after becoming the projected winner of the 2020 election, " + "Biden announced the formation of a task force to advise him on the " + "COVID-19 pandemic during the transition, co-chaired by former " + "Surgeon General Vivek Murthy, former FDA commissioner David A. Kessler, " + "and Yale University's Marcella Nunez-Smith.On January 5, 2021, " + "the Democratic Party won control of the United States Senate, " + "effective January 20, as a result of electoral victories in " + "Georgia by Jon Ossoff in a runoff election for a six-year term " + "and Raphael Warnock in a special runoff election for a two-year term. " + "President-elect Biden had supported and campaigned for both " + "candidates prior to the runoff elections on January 5.On January 6, " + "a mob of thousands of Trump supporters violently stormed the Capitol " + "in the hope of overturning Biden's election, forcing Congress to " + "evacuate during the counting of the Electoral College votes. More " + "than 26,000 National Guard members were deployed to the capital " + "for the inauguration, with thousands remaining into the spring. "}, + + {"query": "What is the name of the Company?", + "context": "THIS EXECUTIVE EMPLOYMENT AGREEMENT (this “Agreement”) is entered " + "into this 2nd day of April, 2012, by and between Aphrodite Apollo " + "(“Executive”) and TestCo Software, Inc. (the “Company” or “Employer”), " + "and shall become effective upon Executive’s commencement of employment " + "(the “Effective Date”) which is expected to commence on April 16, 2012. " + "The Company and Executive agree that unless Executive has commenced " + "employment with the Company as of April 16, 2012 (or such later date as " + "agreed by each of the Company and Executive) this Agreement shall be " + "null and void and of no further effect."}, + + {"query": "What are the names of the two parties?", + "context": "THIS EXECUTIVE EMPLOYMENT AGREEMENT (this “Agreement”) is entered " + "into this 2nd day of April, 2012, by and between Aphrodite Apollo " + "(“Executive”) and TestCo Software, Inc. (the “Company” or “Employer”), " + "and shall become effective upon Executive’s commencement of employment " + "(the “Effective Date”) which is expected to commence on April 16, 2012. " + "The Company and Executive agree that unless Executive has commenced " + "employment with the Company as of April 16, 2012 (or such later date as " + "agreed by each of the Company and Executive) this Agreement shall be " + "null and void and of no further effect."}, + + {"query": "When will employment start?", + "context": "THIS EXECUTIVE EMPLOYMENT AGREEMENT (this “Agreement”) is entered " + "into this 2nd day of April, 2012, by and between Aphrodite Apollo " + "(“Executive”) and TestCo Software, Inc. 
(the “Company” or “Employer”), " + "and shall become effective upon Executive’s commencement of employment " + "(the “Effective Date”) which is expected to commence on April 16, 2012. " + "The Company and Executive agree that unless Executive has commenced " + "employment with the Company as of April 16, 2012 (or such later date as " + "agreed by each of the Company and Executive) this Agreement shall be " + "null and void and of no further effect."} + ] + + # Iterate through all the prompts and interact with the model + for i, entries in enumerate(prompt_list): + prompt = entries["query"] + context = entries["context"] + output = bling_model.inference(prompt, add_context=context, add_prompt_engineering=True)["llm_response"] + print(f"\nPrompt: {prompt}\nResponse:\n{output.strip()}") + + # You can also integrate the model into an llmware Prompt + prompt = "What is my age?" + context = "I am 33 years old" + prompter = Prompt(llm_model=custom_hf_model, tokenizer=hf_tokenizer, from_hf=True) + output = prompter.prompt_main(prompt, context=context)["llm_response"] + print(f"\nPrompt: {prompt}\nResponse:\n{output.strip()}") + +use_llmware_bling() diff --git a/examples/parsing.py b/examples/parsing.py new file mode 100644 index 00000000..34889e42 --- /dev/null +++ b/examples/parsing.py @@ -0,0 +1,128 @@ +''' This example demonstrates the parsing capablities of llmware + 1. Parsing files into libraries + 2. Parsing files into Memory + 3. Paraing files to json +''' + +import os +from llmware.configs import LLMWareConfig +from llmware.library import Library +from llmware.parsers import Parser, WebSiteParser, WikiParser +from llmware.resources import LibraryCatalog +from llmware.setup import Setup + +# Demonstrate adding files to a library, which implicitly parses them and creates blocks +def parsing_files_into_library(library_name): + + # Create new library + print (f"\n > Creating library {library_name}...") + library = Library().create_new_library(library_name) + + # Load the llmware sample files + print (f"\n > Loading the llmware sample files...") + sample_files_path = Setup().load_sample_files() + pdf_file_path = os.path.join(sample_files_path,"UN-Resolutions-76th") + office_file_path = os.path.join(sample_files_path,"Agreements") + + # Add files from a local path (this will pull in all supported file types: + # .pdf, .pptx, .docx, .xlsx, .csv, .txt, .json, .wav, and .zip, .jpg, .png + print (f"\n > Adding (parsing) files from {pdf_file_path}...") + library.add_files(pdf_file_path) + + # Add only files of a speciic type + print (f"\n > Adding (parsing) Office files only, from {office_file_path}...") + library.add_office(office_file_path) + + # Note: An alternate method is to call the Parser directly and pass in the library. For example: + #Parser(library=library).parse_pdf(sample_files_path) + #Parser(library=library).parse_office(sample_files_path) + + # Add other/specialized content to library + print (f"\n > Adding Website and Wiki content....") + website_results = library.add_website("https://www.politico.com") + wikipedia_results = library.add_wiki("Joe Biden") + + # Note: The default size of blocks is set to 400 characters (~100 tokens). 
This can be configured by + # setting the following value prior to adding files and the parsers will use it has a guide when creating blocks + library.block_size_target_characters = 800 + + # Print the library stats + library_card = LibraryCatalog().get_library_card(library_name) + blocks = library_card["blocks"] + documents = library_card["documents"] + images = library_card["images"] + print (f"\n > Library Stats") + print (f" - {blocks} blocks, {documents} documents, {images} images") + print (f" Note: images extracted during parsing can be found here: {library.image_path}") + +# For some use cases you may only need to parse one or a few files. +# You can do so completely in memory and no state/parsing output will be saved +def parsing_files_into_memory(): + + # Load the llmware sample files + print (f"\n > Loading the llmware sample files...") + sample_files_path = Setup().load_sample_files() + + # Parse individual documents. The output will be a list of blocks (dicts with metadata) + pdf_file_path = os.path.join(sample_files_path,"SmallLibrary") + pdf_file = "Gaia EXECUTIVE EMPLOYMENT AGREEMENT.pdf" + print (f"\n > Parsing {pdf_file}") + pdf_parsed_output = Parser().parse_one_pdf(pdf_file_path, pdf_file) + page_number = pdf_parsed_output[0]["master_index"] + block_text = pdf_parsed_output[0]["text"] + print(f"\nFirst block found on page {page_number}:\n{block_text}") + + # Parse an MS Office document. The parser handles .pptx, .docx and .xlsx + office_file_path = os.path.join(sample_files_path,"SmallLibrary") + office_file = "Janis-Joplin-s-Biography.docx" + print (f"\n > Parsing {office_file}") + office_parsed_output = Parser().parse_one_office(office_file_path, office_file) + page_number = office_parsed_output[0]["master_index"] + block_text = office_parsed_output[0]["text"] + print(f"\nFirst block found on page {page_number}:\n{block_text}") + + # Perform OCR to extract text from iamges (e.g scanned documents) + image_file_path = os.path.join(sample_files_path,"Images") + image_file = "Apache2_License.png" + print (f"\n > Parsing {image_file}") + image_parsed_output = Parser().parse_one_image(image_file_path, image_file) + block_text = image_parsed_output[0]["text"] + print(f"\nFirst block found in image:\n{block_text}") + + # Parse website + # website = "https://politico.com" + # print (f"\n > Parsing {website}") + # website_parsed_output = Parser().parse_website(website, write_to_db=False,save_history=False,get_links=False) + # block_text = website_parsed_output[0]["text"] + # print(f"\nFirst block found in website:\n{block_text}") + + # Parse wiki + wiki_topic = "Canada" + print (f"\n > Parsing wiki article '{wiki_topic}'") + wiki_parsed_output = Parser().parse_wiki([wiki_topic], write_to_db=False,save_history=False, target_results = 10) + block_text = wiki_parsed_output[0]["text"] + print(f"\nFirst block found in wiki:\n{block_text}") + + +# Parse an entire folder to json (import all supported file types) +def parse_to_json(): + + # Load the llmware sample files + print (f"\n > Loading the llmware sample files...") + sample_files_path = Setup().load_sample_files() + input_folder = os.path.join(sample_files_path,"SmallLibrary") + + # Create a parser + parser = Parser() + + # Parse entire folder to json + print (f"\n > Parsing folder: {input_folder}...") + blocks = parser.ingest_to_json(input_folder) + print (f"Total Blocks: {len(parser.parser_output)}") + print (f"Files Parsed:") + for processed_file in blocks["processed_files"]: + print(f" - {processed_file}") + 
+parsing_files_into_library("parsing_tests") +parsing_files_into_memory() +parse_to_json() \ No newline at end of file diff --git a/examples/rag.py b/examples/rag.py new file mode 100644 index 00000000..6f987a62 --- /dev/null +++ b/examples/rag.py @@ -0,0 +1,70 @@ +# This example demonstrates Retrieval Augmented Generation (RAG): +import os +from llmware.library import Library +from llmware.retrieval import Query +from llmware.prompts import Prompt +from llmware.setup import Setup + +# Update this value with your own API Key, either by setting the env var or editing it directly here: +openai_api_key = os.environ["OPENAI_API_KEY"] + +# A self-contained end-to-end example of RAG +def end_to_end_rag(): + + # Create a library called "Agreements", and load it with llmware sample files + print (f"\n > Creating library 'Agreements'...") + library = Library().create_new_library("Agreements") + sample_files_path = Setup().load_sample_files() + library.add_files(os.path.join(sample_files_path,"Agreements")) + + # Create vector embeddings for the library using the "industry-bert-contracts" model and store them in Milvus + print (f"\n > Generating vector embeddings using embedding model: 'industry-bert-contracts'...") + library.install_new_embedding(embedding_model_name="industry-bert-contracts", vector_db="milvus") + + # Perform a semantic search against our library. This will gather evidence to be used in the LLM prompt + print (f"\n > Performing a semantic query...") + os.environ["TOKENIZERS_PARALLELISM"] = "false" # Avoid a HuggingFace tokenizer warning + query_results = Query(library).semantic_query("Termination", result_count=20) + + # Create a new prompter using GPT-4 and add the query_results captured above + prompt_text = "Summarize the termination provisions" + print (f"\n > Prompting LLM with '{prompt_text}'") + prompter = Prompt().load_model("gpt-4", api_key=openai_api_key) + sources = prompter.add_source_query_results(query_results) + + # Prompt the LLM with the sources and a query string + responses = prompter.prompt_with_source(prompt_text, prompt_name="summarize_with_bullets") + for response in responses: + print ("\n > LLM response\n" + response["llm_response"]) + + # Finally, generate a CSV report that can be shared + print (f"\n > Generating CSV report...") + report_data = prompter.send_to_human_for_review() + print ("File: " + report_data["report_fp"] + "\n") + +end_to_end_rag() + +# ********************************************************************************************************** +# ************************************** SAMPLE OUTPUT ***************************************************** +# ********************************************************************************************************** +''' +> python examples/rag.py + + > Creating library 'Agreements'... + + > Generating vector embeddings using embedding model: 'industry-bert-contracts'... + + > Performing a semantic query... + + > Prompting LLM with 'Summarize the termination provisions' + + > LLM response +- Employment period ends on the first occurrence of either the 6th anniversary of the effective date or a company sale. +- Early termination possible as outlined in sections 3.1 through 3.4. +- Employer can terminate executive's employment under section 3.1 anytime without cause, with at least 30 days' prior written notice. +- If notice is given, the executive is allowed to seek other employment during the notice period. + + > Generating CSV report... 
+File: /Users/turnham/llmware_data/prompt_history/interaction_report_Fri Sep 29 12:07:42 2023.csv + +''' diff --git a/examples/retrieval.py b/examples/retrieval.py new file mode 100644 index 00000000..c7b892e8 --- /dev/null +++ b/examples/retrieval.py @@ -0,0 +1,232 @@ +''' This example demonstrates the various ways to retrieve data from libraries: + 1. Basic retrieval + 2. Retrieval with filters + 3. Bulk retrieval + 4. Retrieval State and Export + ''' + +import os +from llmware.configs import LLMWareConfig +from llmware.library import Library +from llmware.resources import LibraryCatalog +from llmware.retrieval import Query +from llmware.setup import Setup +from sentence_transformers import SentenceTransformer + +# Create a library (if it doesn't already exist), add files to it and generate vector embeddings which enables semantic query +def create_and_populate_a_library(library_name): + + print (f" > Creating and populating a library: {library_name}") + # Load the library or create and populate it if doesn't exist + if Library().check_if_library_exists(library_name): + # Load the library + library = Library().load_library(library_name) + else: + # Create the library + library = Library().create_new_library(library_name) + # Load the llmware sample file repository + sample_files_path = Setup().load_sample_files() + # Add files from the "SmallLibrary" folder to library + library.add_files(os.path.join(sample_files_path,"SmallLibrary")) + # Load an embedding model and create vector embeddings for the library + library.install_new_embedding(from_sentence_transformer=True, embedding_model_name=embedding_model_name, model=embedding_model, vector_db="milvus") + # Return the library + return library + +# A retrieval returns a query_result dict that contains information about the query including the "results" +def perform_basic_text_retrieval(library, query_text): + + # Create a Query instance + query = Query(library) + # Set the keys that should be returned + query.query_result_return_keys = ["file_source", "page_num", "text"] + # Perform a simple query + query_results = query.query(query_text) + # Get the top result: + top_result = query_results[0] + # Print the file, page_num and text from from the first result + file_source = top_result["file_source"] + page_number = top_result["page_num"] + result_text = top_result["text"] + print (f"\n> Top result for '{query_text}': {file_source} (page {page_number}):\nText:{result_text}") + +# A retrieval returns a query_result dict that contains information about the query including the "results" +def perform_text_retrieval_by_author(library, query_text, author): + + # Create a Query instance + query = Query(library) + # Set the keys that should be returned + query.query_result_return_keys = ["file_source", "page_num", "text", "author_or_speaker"] + # Perform a text query by author + query_results = query.text_query_by_author_or_speaker(query_text, author) + # Get the top result: + top_result = query_results[0] + # Print the file, page_num, text and author from from the first result + file_source = top_result["file_source"] + page_num = top_result["page_num"] + text = top_result["text"] + author = top_result["author_or_speaker"] + print (f"\n> Top result for '{query_text}': {file_source} (page {page_num}), Author: {author}:\nText:{text}") + +# A bibliography is the lsit of documents and their pages referenced in a set of query results. 
+# The format is: [{'Gaia EXECUTIVE EMPLOYMENT AGREEMENT.pdf': [3, 5, 2, 4, 1]}] +def get_bibliography_from_query_results(library, query_text): + + # Create a Query instance + query = Query(library) + # Perform a simple query + query_results = query.query(query_text, result_count=20) + # Get a bibliography + biblio = query.bibliography_builder_from_qr(query_results=query_results) + # Print out the bibliography + print (f"\n> Bibliography for '{query_text}':\n{biblio}") + +# If a particular result is interesting, you can widen the context window to retrieve more text before and/or after +def focus_on_and_expand_result(library, query_text): + + # Create a Query instance + query = Query(library) + # Perform a simple query + query_results = query.query(query_text, result_count=20) + # Capture the third result + interesting_result = query_results[2] + # Pull a 500 character window from before the result + result_before = query.expand_text_result_before(interesting_result,window_size=500) + # Pull a 100 character window from after the result + result_after = query.expand_text_result_after(interesting_result, window_size=100) + # Print the original result and the expanded result + original_result_text = interesting_result["text"] + expanded_result_text = result_before["expanded_text"] + original_result_text + result_after["expanded_text"] + print (f"\n> Expanding a result context window'") + print (f"\nOriginal result: {original_result_text}") + print (f"\nExpanded Result: {expanded_result_text}") + +# A very powerful form of retrieval involves document filters. Once a 'document filter' is created, it can be +# applied to query further only in that document set +# For example:You could set up a document filter to get all documents that mention a topic like 'Artificial Intelligence' +# and then within that subset of documents, look for details on leading researchers. +def perform_retrieval_with_document_filters(library, doc_filter_text, query_text): + + # Create a Query instance + query = Query(library,from_sentence_transformer=True,embedding_model_name=embedding_model_name, embedding_model=embedding_model) + # Create a document filter using exact (text) search mode + doc_filter = query.document_filter(doc_filter_text, query_mode="text", exact_mode=True) + # Perform a semantic query with the document filter + semantic_results = query.semantic_query_with_document_filter(query_text, doc_filter, result_count=3) + # Print the text from the results + print (f"\n> Retrieval with a document filter'") + for i, result in enumerate(semantic_results): + result_text = result["text"] + print (f"\n{1}. {result_text}") + + +# Sometimes you want to retrieve all data so you can further process it yourself +def perform_bulk_retrieval(library): + + # Create a Query instance + query = Query(library) + # Create a list of keys of interest. This can be omitted if you want all keys + key_dict = ["file_source", "text", "page_num", "author_or_speaker"] + # Get the whole libary. 
The returns a list of all blocks + all_blocks = query.get_whole_library(selected_keys=key_dict) + print (f"\n> Bulk retrieval Retrieval'") + print (f"\n{len(all_blocks)} blocks were retrieved") + +# A dual pass retrieval combines semantic + text query +def perform_dual_pass_retrieval(library, query_text): + + # Create a Query instance configured for semantic search + query = Query(library,from_sentence_transformer=True,embedding_model_name=embedding_model_name, embedding_model=embedding_model) + # Do a dual_pass_query + hybrid_qr_results = query.dual_pass_query(query_text,result_count=20, primary="semantic") + num_of_results = len(hybrid_qr_results) + print (f"\n> Dual Pass Retrieval'") + print (f"\n{num_of_results} were found") + +# Demonstrate some methods involved with persisting and loading Query state as well as export +def retreival_state_and_export(library): + + # Create a Query instance with history peristence + query = Query(library, save_history=True) + # Capture the query_id + query_id = query.query_id + # Run a series of queries + query_results = query.text_query("sustainable development", result_count=20) + query_results = query.text_query("africa", result_count=26) + query_results = query.text_query("pandemic risk", result_count=15) + # Save state + query.save_query_state() + # Generate Retrieval Report. The report will be stored in the llmware_data/query_history folder + csv_file = query.generate_csv_report() + csv_file_path = os.path.join(LLMWareConfig().get_query_path(), csv_file) + print (f"\n> Retrieval State and Export'") + print (f"\nExport for query id '{query_id}': {csv_file_path}") + + # Additionally here is how can clear state and reload based on a query_id: + query.clear_query_state() + query.load_query_state(query_id) + +# Demonstrate the methods and capabilities available for doing filtered retrieval +# Note: This method is not meant to be run as-is. +def retrieval_filter_options(library): + + # Create a Query instance + query = Query(library) + # Basic filters by block fields + filter_dict = {"content_type": "text", "author_or_speaker": "Margaret Smith"} + query_results = query.text_query_with_custom_filter("human rights", filter_dict, result_count=20) + + # Document filters + doc_id_list = [0,1,5,13,22,135] + query_results = query.text_query_with_document_filter("human rights", doc_id_list) + + # Page Lookup - especially useful when looking for data on a notable or known page in your documents, e.g., the first page + page_list = [1,6] + query_results = query.page_lookup(page_list=page_list,doc_id_list=doc_id_list,text_only=True) + + # Predefined filters + query_results_with_images = query.image_query("africa") + query_results_with_tables = query.table_query("revenue") + query_results_by_page = query.text_search_by_page("recitals", page_num=1) + query_results_from_filter = query.text_query_by_author_or_speaker("company stock price", "John Smith") + query_results_docs_only = query.document_filter("tesla stock",exact_mode=True, result_count=50) + + # Timestamp filters + first_date = "2005-05-10" + last_datae = "2023-12-31" + query_results_time_window = query.filter_by_time_stamp(query_reults, first_date=first_date, last_date=last_date) + +# Demonstrate the methods and capabilities available when doing semantic queries +# Note: This method is not meant to be run as-is. 
+def semantic_retrieval_strategies(library): + + # Create a Query instance configured for semantic search + query = Query(library,from_sentence_transformer=True,embedding_model_name=embedding_model_name, embedding_model=embedding_model) + + # Use semantic embeddings to 're-rank' query results + text_query_results = query.text_query("stock trends", result_count=30) + reranked_results = query.apply_semantic_ranking(text_query_results,"stock trends") + + # Use semantic embedding space directly by passing 'text block' directly, rather than a query + one_block = reranked_results["results"][0] + direct_embedding_results = query.similar_blocks_embedding(one_block, result_count=30, embedding_distance_threshold=1.5) + + # Augment text query with semantic + text_query_results = query.text_query("stock trends", result_count=30) + augmented_qr = query.augment_qr(texts_query_results,"stock trends",augment_query="semantic") + + +# Embedding model used for the examples below +embedding_model_name = "all-MiniLM-L6-v1" +print (f"Loading embedding model: '{embedding_model_name}'") +embedding_model = SentenceTransformer(embedding_model_name) + +library = create_and_populate_a_library(library_name="retrieval_tests") +perform_basic_text_retrieval(library=library, query_text='salary') +perform_text_retrieval_by_author(library=library, query_text='United Nations', author='Andrea Chambers') +get_bibliography_from_query_results(library=library, query_text='salary') +focus_on_and_expand_result(library=library, query_text='salary') +perform_retrieval_with_document_filters(library=library, doc_filter_text="Agreement", query_text='Agreement') +perform_bulk_retrieval(library) +perform_dual_pass_retrieval(library=library, query_text='africa') +retreival_state_and_export(library) diff --git a/examples/working_with_libraries.py b/examples/working_with_libraries.py new file mode 100644 index 00000000..dee6d841 --- /dev/null +++ b/examples/working_with_libraries.py @@ -0,0 +1,55 @@ +''' This example demonstrates creating and using libraries + 1. Library creation and loading existig libraries + 2. LibraryCatalog and library cards + 3. Exporting libraires to json or csv +''' +import json +import os +import tempfile +from llmware.configs import LLMWareConfig +from llmware.library import Library, LibraryCatalog +from llmware.setup import Setup + +def core_library_functions(library_name): + + # Create a library + print (f"\n > Creating library {library_name}...") + library = Library().create_new_library(library_name) + + # Load an existing library. 
This is not required after library creation and is only shown here for reference + library = Library().load_library(library_name) + + # The LibraryCatalog is used to query all libraries + print (f"\n > All libraries") + for library_card in LibraryCatalog().all_library_cards(): + lib_name = library_card["library_name"] + docs = library_card["documents"] + print (f" - {lib_name} ({docs} documents)") + + # Add a few files to our library + print (f"\n > Adding some files to {library_name}") + sample_files_path = Setup().load_sample_files() + library.add_files(os.path.join(sample_files_path,"SmallLibrary")) + + # View the library card to confirm document, block and other counts + print (f"\n > Library Card") + library_card = library.get_library_card() + library_card["_id"] = str(library_card["_id"]) # The _id needs to be converted to a str before printing + print (json.dumps(library_card, indent=2)) + + # Library Export to JSON + print (f"\n > Exporting library to jsonl file...") + temp_export_dir = tempfile.gettempdir() + json_export_path = library.export_library_to_jsonl_file(temp_export_dir, "lib_export") + print (f" - library exported to {json_export_path}") + + # Library export to txt file + print (f"\n > Exporting library to text file...") + text_export_path = library.export_library_to_txt_file(temp_export_dir, "lib_export") + print (f" - library exported to {text_export_path}") + + # Delete the library + print (f"\n > Deleting the library...") + library.delete_library(confirm_delete=True) + +core_library_functions("library_tests") \ No newline at end of file diff --git a/examples/working_with_llms.py b/examples/working_with_llms.py new file mode 100644 index 00000000..b9cb9b8e --- /dev/null +++ b/examples/working_with_llms.py @@ -0,0 +1,77 @@ +''' This example demonstrates connecting to the following LLMs and doing basic completions + 1. OpenAI - gpt-4 + 2. Google - text-bison@001 + 3. Anthropic - claude-instant-v1 + 4. AI21 - j2-grande-instruct + + Notes: + 1. API Keys for the given LLMs are assumed to be set as environment variables below + 2. Google API Keys are handled differently from others. The key needs to be the full text of your .json credential file. + This can be set as follows: + export GOOGLE_API_KEY=$(cat credentials.json) +''' + +import os +from llmware.prompts import Prompt + +# Update these values with your own API Keys, either by setting env vars or editing them directly here: +openai_api_key = os.environ["OPENAI_API_KEY"] +anthropic_api_key = os.environ["ANTHROPIC_API_KEY"] +ai21_api_key = os.environ["AI21_API_KEY"] +google_api_key = os.environ["GOOGLE_API_KEY"] + +def prompt_llm_and_print_response(query, vendor_name, llm_name, llm_api_key): + # Create an instance of the Prompt class using the given LLM + prompter = Prompt(llm_name=llm_name, llm_api_key=llm_api_key) + # Perform an LLM completion with the given query + response = prompter.completion(query) + # The response is a dict that contains "llm_response" which may contain some whitespace, so we'll strip() it + answer = response["llm_response"].strip() + # Print the vendor, model name and answer + print (f"\n > {vendor_name}:{llm_name}\nAnswer: {answer}") + +query = "what is artificial intelligence?" 
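+# Send the same query to each of the LLMs listed above and print each response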
+print (f"\n > Prompting LLMs with: '{query}'") +prompt_llm_and_print_response(query, "OpenAI", "gpt-4", openai_api_key) +prompt_llm_and_print_response(query, "Google", "text-bison@001", os.environ["GOOGLE_API_KEY"]) +prompt_llm_and_print_response(query, "Anthropic", "claude-instant-v1", anthropic_api_key) +prompt_llm_and_print_response(query, "AI21", "j2-grande-instruct", ai21_api_key) + + +# ********************************************************************************************************** +# ************************************** SAMPLE OUTPUT ***************************************************** +# ********************************************************************************************************** +''' +> python examples/working_with_llms.py + + > Prompting LLMs with: 'what is artificial intelligence?' + +> OpenAI:gpt-4 +Answer: Artificial Intelligence (AI) refers to the simulation of human intelligence processes by machines, especially computer systems. These processes can include activities such as learning (the acquisition of information and rules for using the information), reasoning (using the rules to reach approximate or definite conclusions), and self-correction. Essentially, it involves creating systems that behave intelligently, making complex decisions, solving problems, understanding language, recognizing patterns and learning from experience. + +There are two types of AI: narrow AI, which is designed to + +> Google:text-bison@001 +Answer: Artificial Intelligence (AI) is a branch of computer science that deals with the creation of intelligent agents, which are systems that can reason, learn, and act autonomously. AI research has been highly successful in developing effective techniques for solving a wide range of problems, including natural language processing, computer vision, and robotics. However, AI is still in its early stages of development, and there are many challenges that need to be overcome before AI systems can achieve human-level intelligence. + +One of the main challenges in AI is the problem of representation. AI systems need to be able to represent the world in a way that they can understand and reason + +> Anthropic:claude-instant-v1 +Answer: Here is a brief overview of artificial intelligence: + +- Artificial intelligence (AI) refers to intelligence demonstrated by machines, as opposed to the natural intelligence displayed by humans and animals. + +- AI technologies include machine learning, deep learning, natural language processing, expert systems, robotics, and computer vision. These technologies allow machines to sense, comprehend, act, and learn from experiences. + +- Machine learning is a core driver of AI. It allows computer systems to automatically learn and improve + +> AI21:j2-grande-instruct +Answer: Artificial intelligence (AI) is a field of computer science that aims to create intelligent machines that can perform tasks normally requiring human intelligence. The goal of AI is to develop systems that can understand and reason about the world around them, and that can solve problems and make decisions in the same way that humans do. + +There are several different approaches to AI, including rule-based systems, expert systems, machine learning, and natural language processing. Rule-based systems use hard-coded rules to make decisions, while expert systems use a combination of rules and knowledge from a human expert. 
Machine learning systems use algorithms to learn from data and make predictions, and natural language processing systems use algorithms to process and understand human language.''' + + + + + + diff --git a/examples/working_with_prompts.py b/examples/working_with_prompts.py new file mode 100644 index 00000000..0382a325 --- /dev/null +++ b/examples/working_with_prompts.py @@ -0,0 +1,116 @@ +''' This example demonstrates inspection of prompt history which can be particularly useful in AI Audit scenarios + 1. Prompt save persistence + 2. Prompt interaction history + 3. Prompt dialog tracker + 4. Prompt Interaction Report generation + 5. Prompt evidence verfication +''' +import json +import os +from llmware.library import Library +from llmware.prompts import Prompt +from llmware.retrieval import Query +from llmware.setup import Setup +from llmware.util import PromptState, Datasets + +# Update these values with your own API Keys, either by setting env vars or editing them directly here: +openai_api_key = os.environ["OPENAI_API_KEY"] + +os.environ["TOKENIZERS_PARALLELISM"] = "false" # Avoiding a HuggingFace warning about process forking + +# Demonstrate interacting with the prompts in a variety of ways +def prompt_operations(llm_model): + + # Create a new prompter with state persistence + prompter = Prompt(save_state=True) + # Capture the prompt_id (which can be used later to reload state) + prompt_id = prompter.prompt_id + # Load the model + prompter.load_model(llm_model,api_key=openai_api_key) + # Define a list of prompts + prompts = [ + {"query": "How old is Bob?", "context": "John is 43 years old. Bob is 27 years old."}, + {"query": "When did COVID start?", "context": "COVID started in March of 2020 in most of the world."}, + {"query": "What is the current stock price?", "context": "The stock is trading at $26 today."}, + {"query": "When is the big game?", "context": "The big game will be played on November 14, 2023."}, + {"query": "What is the CFO's salary?", "context": "The CFO has a salary of $285,000."}, + {"query": "What year is Michael in school?", "context": "Michael is starting 11th grade."} + ] + + # Iterate through the prompt which will save each response dict in in the prompt_state + print (f"\n > Sending a series of prompts to {llm_model}...") + for i, prompt in enumerate(prompts): + print (" - " + prompt["query"]) + response = prompter.prompt_main(prompt["query"],context=prompt["context"],register_trx=True) + + # Print how many interactions are now in the prompt history + interaction_history = prompter.interaction_history + print (f"\n > Prompt Interaction History now contains {len(interaction_history)} interactions") + + # Use the dialog_tracker to regenerate the conversation with the LLM + print (f"\n > Reconstructed Dialog") + dialog_history = prompter.dialog_tracker + for i, conversation_turn in enumerate(dialog_history): + print(" - ", i, "[user]: ", conversation_turn["user"]) + print(" - ", i, "[ bot]: ", conversation_turn["bot"]) + + # Saving and cleae the prompt state + prompter.save_state() + prompter.clear_history() + + # Print the number of interactions + interaction_history = prompter.interaction_history + print (f"\n > Prompt history has been cleared") + print (f"\n > Prompt Interaction History now contains {len(interaction_history)} interactions") + + # Reload the prompt state using the prompt_id and print again the number of interactions + prompter.load_state(prompt_id) + interaction_history = prompter.interaction_history + print (f"\n > The previous prompt state 
has been re-loaded") + print (f"\n > Prompt Interaction History now contains {len(interaction_history)} interactions") + + # Generate a Promppt transaction report + prompt_transaction_report = PromptState().generate_interaction_report([prompt_id]) + print (f"\n > A prompt transaction report has been generated: {prompt_transaction_report}") + + +def prompt_fact_checking(library_name, llm_model): + + print (f"\n > Creating library {library_name}...") + library = Library().create_new_library(library_name) + sample_files_path = Setup().load_sample_files() + library.add_files(os.path.join(sample_files_path,"SmallLibrary")) + + # Create vector embeddings for the library using the "industry-bert-contracts" model and store them in faiss + print (f"\n > Generating vector embeddings using embedding model: 'industry-bert-contracts'...") + library.install_new_embedding(embedding_model_name="industry-bert-contracts", vector_db="faiss") + + # Perform a semantic search against our library. This will gather evidence to be used in the LLM prompt + print (f"\n > Performing a semantic query...") + query_results = Query(library).semantic_query("what are the termination provisions", result_count=20) + + # Create a new prompter based on the query results and the given llm_model + print (f"\n > Prompting with query results...") + prompter = Prompt(save_state=True) + prompter.load_model(llm_model,api_key=openai_api_key) + sources = prompter.add_source_query_results(query_results) + response = prompter.prompt_with_source("Is the termination provision 12 months?") + + # Fact-check the first response + print (f"\n > Checking sources") + + # Check sources + source_check = prompter.evidence_check_sources(response)[0]["source_review"] + print (f"\nEvidence check of sources:\n{json.dumps(source_check, indent=2)}") + + # Check numbers + number_check = prompter.evidence_check_numbers(response)[0]["fact_check"] + print (f"\nEvidence check of numbers:\n{json.dumps(number_check, indent=2)}") + + # Check comparison stats + token_comparison = prompter.evidence_comparison_stats(response)[0]["comparison_stats"] + print (f"\nEvidence check of comparison stats:\n{json.dumps(token_comparison, indent=2)}") + + +prompt_operations(llm_model="gpt-3.5-turbo") +prompt_fact_checking("test_fact_checking", llm_model="gpt-3.5-turbo") diff --git a/examples/working_without_a_database.py b/examples/working_without_a_database.py new file mode 100644 index 00000000..2caf6be1 --- /dev/null +++ b/examples/working_without_a_database.py @@ -0,0 +1,288 @@ +''' This example demonstrates what can be accomplished with llmware with no databases (e.g no MongoDB) + 1. 
+'''
+
+import json
+import os
+from llmware.parsers import Parser
+from llmware.prompts import Prompt
+from llmware.setup import Setup
+from llmware.util import PromptCatalog, Datasets
+from llmware.resources import PromptState
+
+# Update these values with your own API Keys, either by setting env vars or editing them directly here:
+openai_api_key = os.environ["OPENAI_API_KEY"]
+anthropic_api_key = os.environ["ANTHROPIC_API_KEY"]
+
+# Iterate through and analyze the contracts in a folder
+def analyze_contracts_on_the_fly():
+
+    # Load the llmware sample files
+    print (f"\n > Loading the llmware sample files...")
+    sample_files_path = Setup().load_sample_files()
+    contracts_path = os.path.join(sample_files_path, "Agreements")
+
+    # Create a Prompt instance
+    prompter = Prompt(save_state=True).load_model("claude-instant-v1", api_key=anthropic_api_key)
+
+    # Iterate through contracts
+    prompt_text = "What is the executive's annual base salary?"
+    print (f"\n > Analyzing contracts with prompt: '{prompt_text}'")
+    for i, contract in enumerate(os.listdir(contracts_path)):
+        print (f"\n > Analyzing {contract}")
+
+        # Add contract as a prompt source
+        source = prompter.add_source_document(contracts_path, contract, query="base salary")
+
+        # Prompt LLM and display response
+        responses = prompter.prompt_with_source(prompt_text, prompt_name="number_or_none")
+        for response in responses:
+            print("LLM Response: " + response["llm_response"])
+
+        # Fact check response and display result
+        updated_responses = prompter.evidence_check_numbers(responses)
+        for response in updated_responses:
+            for fact_check in response["fact_check"]:
+                status = fact_check["status"]
+                text = fact_check["text"]
+                print(f"Fact Check: {status} -> {text}")
+
+        # We're done with this contract, clear the source from the prompt
+        prompter.clear_source_materials()
+
+    # Save jsonl report to the /prompt_history folder
+    prompter.save_state()
+
+# Use prompt history to easily create model-ready fine-tuning datasets
+def create_datasets_from_prompt_history():
+
+    context = "Joe Biden is the 46th President of the United States. He was born in Scranton, " \
+              "Pennsylvania. He served as Vice President from 2008 through 2016."
+
+    # Start a new PromptState session
+    #PromptState().initiate_new_state_session()
+
+    # Create a Prompt
+    prompter = Prompt(llm_name="gpt-4", llm_api_key=openai_api_key, save_state=True)
+
+    # Perform several prompts
+    print (f"\n > Performing several prompts to populate the prompt state...")
+    response = prompter.prompt_main(prompt="Who was the 46th president?", context=context)
+    response = prompter.number_or_none(prompt="What year did Joe Biden start as vice president?", context=context)
+    response = prompter.summarize_with_bullets(prompt="Who is Joe Biden?", context=context)
+
+    # Create a Datasets object
+    datasets = Datasets()
+
+    # Create dataset wrapped in "Alpaca format"
+    print (f"\n > Creating a dataset from prompt history in ALPACA format...")
+    alpaca_dataset = datasets.build_gen_ds_from_prompt_history(prompt_wrapping="alpaca")
+    print (f"\nThe dataset dict:\n{json.dumps(alpaca_dataset, indent=2)}")
+    sample = datasets.get_dataset_sample(datasets.current_ds_name)
+    print (f"\nRandom sample from the dataset:\n{json.dumps(sample, indent=2)}")
+
+    # Create dataset wrapped in "Chat GPT format"
+    print (f"\n > Creating a dataset from prompt history in CHAT GPT format...")
+    chatgpt_dataset = datasets.build_gen_ds_from_prompt_history(prompt_wrapping="chat_gpt")
+    print (f"\nThe dataset dict:\n{json.dumps(chatgpt_dataset, indent=2)}")
+    sample = datasets.get_dataset_sample(datasets.current_ds_name)
+    print (f"\nRandom sample from the dataset:\n{json.dumps(sample, indent=2)}")
+
+    # Create dataset wrapped in "Human Bot format"
+    print (f"\n > Creating a dataset from prompt history in HUMAN BOT format...")
+    humanbot_dataset = datasets.build_gen_ds_from_prompt_history(prompt_wrapping="human_bot")
+    print (f"\nThe dataset dict:\n{json.dumps(humanbot_dataset, indent=2)}")
+    sample = datasets.get_dataset_sample(datasets.current_ds_name)
+    print (f"\nRandom sample from the dataset:\n{json.dumps(sample, indent=2)}")
+
+# Parse files/content of various types
+def parsing_with_no_library():
+
+    # Load the llmware sample files
+    print (f"\n > Loading the llmware sample files...")
+    sample_files_path = Setup().load_sample_files()
+
+    # Create a parser
+    parser = Parser()
+
+    # Parse PDF files
+    print (f"\n > Parsing PDF files...")
+    pdf_path = os.path.join(sample_files_path, "SmallLibrary")
+    pdf_output = parser.parse_pdf(pdf_path, write_to_db=False, save_history=False)
+    print(f"Running block count: {len(parser.parser_output)}")
+
+    # Parse MS Office files
+    print (f"\n > Parsing MS Office files...")
+    office_path = os.path.join(sample_files_path, "SmallLibrary")
+    office_output = parser.parse_office(office_path, write_to_db=False, save_history=False)
+    print(f"Running block count: {len(parser.parser_output)}")
+
+    # Parse website
+    print (f"\n > Parsing Website...")
+    website = "https://www.politico.com"
+    website_output = parser.parse_website(website, write_to_db=False, save_history=False, get_links=False)
+    print(f"Running block count: {len(parser.parser_output)}")
+
+    # Parse AWS Transcribe transcripts
+    print (f"\n > Parsing AWS Transcribe transcripts...")
+    transcripts_path = os.path.join(sample_files_path, "AWS-Transcribe")
+    transcripts_output = parser.parse_dialog(transcripts_path, write_to_db=False, save_history=False)
+    print(f"Running block count: {len(parser.parser_output)}")
+
+    # Save state
+    print (f"\n > Saving parser state...")
+    parser.save_state()
+    parser_state_file = os.path.join(parser.parser_folder, "parser_job_" + parser.parser_job_id + ".jsonl")
+    print(f"File: 
{parser_state_file}") + +# Parse an entire folder to json (import all supported file types) +def parse_all_to_json(): + + # Load the llmware sample files + print (f"\n > Loading the llmware sample files...") + sample_files_path = Setup().load_sample_files() + input_folder = os.path.join(sample_files_path,"SmallLibrary") + + # Create a parser + parser = Parser() + + # Parse entire folder to json + print (f"\n > Parsing folder: {input_folder}...") + blocks = parser.ingest_to_json(input_folder) + print (f"Total Blocks: {len(parser.parser_output)}") + print (f"Files Parsed:") + for processed_file in blocks["processed_files"]: + print(f" - {processed_file}") + +# Try out all prompt instruction types +def try_all_prompt_instructions(): + + + test_sample = "Joseph Robinette Biden Jr. ( BY-dən; born November 20, 1942) is an American politician " \ + "who is the 46th and current president of the United States. A member of the Democratic Party, " \ + "he previously served as the 47th vice president from 2009 to 2017 under President Barack Obama " \ + "and represented Delaware in the United States Senate from 1973 to 2009. Born in Scranton, " \ + "Pennsylvania, Biden moved with his family to Delaware in 1953. He studied at the University of " \ + "Delaware before earning his law degree from Syracuse University. He was elected to the New Castle " \ + "County Council in 1970 and to the U.S. Senate in 1972. As a senator, Biden drafted and led the " \ + "effort to pass the Violent Crime Control and Law Enforcement Act and the Violence Against Women " \ + "Act; and oversaw six U.S. Supreme Court confirmation hearings, including the contentious hearings " \ + "for Robert Bork and Clarence Thomas. Biden ran unsuccessfully for the Democratic presidential " \ + "nomination in 1988 and 2008. In 2008, Barack Obama chose Biden as his running mate, and Biden " \ + "was a close counselor to Obama during his two terms as vice president.In the 2020 presidential " \ + "election, Biden and his running mate, Kamala Harris, defeated incumbents Donald Trump and " \ + "Mike Pence. Taking office at age 78, Biden is the oldest president in U.S. history, the " \ + "first to have a female vice president, and the first from Delaware. In 2021, he signed a " \ + "bipartisan infrastructure bill, as well as a $1.9 trillion economic stimulus package in " \ + "response to the COVID-19 pandemic and subsequent recession." + + test_sample_short = "Joe Biden is the 46th President of the United States. He was born in Scranton, " \ + "Pennsylvania. He served as Vice President from 2008 through 2016." + + # Create a prompt + prompter = Prompt(save_state=True).load_model("gpt-3.5-turbo", api_key=openai_api_key) + + # Iterate through all prompt instructions and display the responses for the same prompt question + prompt_question = "Who is Joe Biden?" + print (f"\n > Running all available prompt instructions with provided context and asking '{prompt_question}'") + for i, prompt in enumerate(PromptCatalog().list_all_prompts()): + response = prompter.prompt_from_catalog(prompt_question, context=test_sample, prompt_name=prompt)["llm_response"] + print(f"\n{i+1}. {prompt}\n{response}") + +# Use specific methods to invoke various prompt instructions +def use_specific_prompt_instructions(): + + test_sample = "Joseph Robinette Biden Jr. ( BY-dən; born November 20, 1942) is an American politician " \ + "who is the 46th and current president of the United States. 
A member of the Democratic Party, " \ + "he previously served as the 47th vice president from 2009 to 2017 under President Barack Obama " \ + "and represented Delaware in the United States Senate from 1973 to 2009. Born in Scranton, " \ + "Pennsylvania, Biden moved with his family to Delaware in 1953. He studied at the University of " \ + "Delaware before earning his law degree from Syracuse University. He was elected to the New Castle " \ + "County Council in 1970 and to the U.S. Senate in 1972. As a senator, Biden drafted and led the " \ + "effort to pass the Violent Crime Control and Law Enforcement Act and the Violence Against Women " \ + "Act; and oversaw six U.S. Supreme Court confirmation hearings, including the contentious hearings " \ + "for Robert Bork and Clarence Thomas. Biden ran unsuccessfully for the Democratic presidential " \ + "nomination in 1988 and 2008. In 2008, Barack Obama chose Biden as his running mate, and Biden " \ + "was a close counselor to Obama during his two terms as vice president.In the 2020 presidential " \ + "election, Biden and his running mate, Kamala Harris, defeated incumbents Donald Trump and " \ + "Mike Pence. Taking office at age 78, Biden is the oldest president in U.S. history, the " \ + "first to have a female vice president, and the first from Delaware. In 2021, he signed a " \ + "bipartisan infrastructure bill, as well as a $1.9 trillion economic stimulus package in " \ + "response to the COVID-19 pandemic and subsequent recession." + + test_sample_short = "Joe Biden is the 46th President of the United States. He was born in Scranton, " \ + "Pennsylvania. He served as Vice President from 2008 through 2016." + + # Create a prompt + prompter = Prompt(save_state=True).load_model("gpt-3.5-turbo", api_key=openai_api_key) + + print (f"\n > Running specific prompt instructions") + + # yes_no + response = prompter.yes_or_no("Was Joe Biden born in Michigan?",test_sample_short) + print("\nyes/no\n" + response["llm_response"]) + + # summarize with bullets + response = prompter.summarize_with_bullets("Who is Joe Biden?", test_sample, number_of_bullets=9) + print("\nnumbered bullets\n" + response["llm_response"]) + + # multiple choice + prompt = "Where was Joe Biden born?" + choice_list = ["Scranton, Pennsylvania", "Detroit, Michigan", "Cleveland, Ohio", "None of the Above"] + response = prompter.multiple_choice(prompt,test_sample_short, choice_list) + print("\nmultiple choice\n" + response["llm_response"]) + + # xsummary + response = prompter.xsummary(test_sample,number_of_words=20) + print("\nxsummary\n" + response["llm_response"]) + + # number_or_none + prompt = "What is the stock price?" + context = "The stock price is currently $15.50" + response = prompter.number_or_none(prompt,context=context) + print("\nnumber_or_none\n" + response["llm_response"]) + + # completion + response = prompter.completion("In the dark of the night, the man heard a noise and ...", temperature=1.0, + target_len=200) + print("\ncompletion\n" + response["llm_response"]) + + # title generator + response = prompter.title_generator_from_source("who is joe biden?", context=test_sample,title_only=True) + print("\ntitle generator\n" + response) + +# Add your own custom prompt +def create_custom_prompt(): + + test_sample_short = "Joe Biden is the 46th President of the United States. He was born in Scranton, " \ + "Pennsylvania. He served as Vice President from 2008 through 2016." 
+ + # Run Order List - How to construct the prompt + run_order_list = ["blurb1", "$context", "blurb2", "$query", "instruction"] + + # Dictionary to use for the prompt + my_prompt_dict = {"blurb1": "Please use the following materials- ", + "blurb2": "Please answer the following question - ", + "instruction": "In answering the question, please mention 'Lucy told you that'.", + "system_message": "You are a helpful assistant."} + + print (f"\n > Prompting LLM with a custom prompt (to add in the response that 'Lucy' was the source of the answer)") + + # Add the new custom prompt + prompt_catalog = PromptCatalog() + prompt_catalog.add_custom_prompt_card("my_prompt",run_order_list,my_prompt_dict) + + # Create a new prompt + prompter = Prompt(save_state=True,prompt_catalog=prompt_catalog).load_model("gpt-3.5-turbo", api_key=openai_api_key) + + # Prompt the LLM + response = prompter.prompt_from_catalog("Where was Joe Biden born?",context=test_sample_short, prompt_name="my_prompt") + print("\nLLM Response:\n" + response["llm_response"]) + +analyze_contracts_on_the_fly() +create_datasets_from_prompt_history() +parsing_with_no_library() +parse_all_to_json() +try_all_prompt_instructions() +use_specific_prompt_instructions() +create_custom_prompt() diff --git a/llmware/__init__.py b/llmware/__init__.py new file mode 100644 index 00000000..b60833ec --- /dev/null +++ b/llmware/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2023 llmware + +# Licensed under the Apache License, Version 2.0 (the "License"); you +# may not use this file except in compliance with the License. You +# may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. + +__version__ = '0.0.9' +__author__ = 'llmware' +__license__ = 'Apache 2.0 License' diff --git a/llmware/configs.py b/llmware/configs.py new file mode 100644 index 00000000..ef8b4aaa --- /dev/null +++ b/llmware/configs.py @@ -0,0 +1,187 @@ + + +# Copyright 2023 llmware + +# Licensed under the Apache License, Version 2.0 (the "License"); you +# may not use this file except in compliance with the License. You +# may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. 
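A minimal configuration sketch (illustrative only, not part of the patch itself): because the LLMWareConfig class added below keeps its settings in class-level dictionaries, calling code can override the MongoDB URI or the workspace home path before creating any libraries. The URI and folder shown are hypothetical placeholders.

```python
from llmware.configs import LLMWareConfig

# Point llmware at a remote MongoDB instead of the localhost default (hypothetical URI)
LLMWareConfig.set_config("collection_db_uri", "mongodb://db-server:27017/")

# Relocate the llmware_data workspace (hypothetical folder - must already exist)
LLMWareConfig.set_home("/opt/llmware")
LLMWareConfig.setup_llmware_workspace()

print(LLMWareConfig.get_config("collection_db_uri"))
print(LLMWareConfig.get_llmware_path())
```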
+
+
+import os
+
+from llmware.exceptions import HomePathDoesNotExistException
+
+
+class LLMWareConfig:
+
+    # initial setting will pull from environ variables - future updates must be set directly
+
+    _base_fp = {"home_path": os.environ.get("HOME"),
+                "llmware_path_name": "llmware_data/"}
+
+    _fp = {"model_repo_path_name": "model_repo/",
+           "library_path_name": "accounts/",
+           "input_path_name": "input_channel/",
+           "parser_path_name": "parser_history/",
+           "query_path_name": "query_history/",
+           "prompt_path_name": "prompt_history/",
+           "tmp_path_name": "tmp/"}
+
+    _conf = {"collection_db_uri": "mongodb://localhost:27017/",
+             "collection_db_username": "",
+             "collection_db_password": "",
+             "collection_db": "mongo",
+             "milvus_host": "localhost",
+             "milvus_port": 19530,
+             "debug_mode": 0,
+             "llmware_sample_files_bucket": "llmware-sample-docs",
+             "llmware_public_models_bucket": "llmware-public-models",
+             "shared_lib_path": os.path.join(os.path.dirname(os.path.realpath(__file__)), "lib")
+             }
+
+    @classmethod
+    def get_config(cls, name):
+        if name in cls._conf:
+            return cls._conf[name]
+        raise KeyError("Key not found in configs")
+
+    @classmethod
+    def set_config(cls, name, value):
+        cls._conf[name] = value
+
+    @classmethod
+    def get_home(cls):
+        return cls._base_fp["home_path"]
+
+    @classmethod
+    def set_home(cls, new_value):
+        cls._base_fp["home_path"] = new_value
+
+    @classmethod
+    def set_llmware_path_name(cls, new_value):
+        cls._base_fp["llmware_path_name"] = new_value
+
+    @classmethod
+    def get_fp_name(cls, file_path):
+        if file_path in cls._fp:
+            return cls._fp[file_path]
+        raise KeyError("File path not found in configs")
+
+    @classmethod
+    def set_fp_name(cls, file_path, new_value):
+        if file_path in cls._fp:
+            cls._fp.update({file_path: new_value})
+
+    @classmethod
+    def get_llmware_path(cls):
+        return os.path.join(cls._base_fp["home_path"], cls._base_fp["llmware_path_name"])
+
+    @classmethod
+    def get_library_path(cls):
+        return os.path.join(cls._base_fp["home_path"], cls._base_fp["llmware_path_name"], cls._fp["library_path_name"])
+
+    @classmethod
+    def get_model_repo_path(cls):
+        return os.path.join(cls._base_fp["home_path"], cls._base_fp["llmware_path_name"], cls._fp["model_repo_path_name"])
+
+    @classmethod
+    def get_input_path(cls):
+        return os.path.join(cls._base_fp["home_path"], cls._base_fp["llmware_path_name"], cls._fp["input_path_name"])
+
+    @classmethod
+    def get_parser_path(cls):
+        return os.path.join(cls._base_fp["home_path"], cls._base_fp["llmware_path_name"], cls._fp["parser_path_name"])
+
+    @classmethod
+    def get_query_path(cls):
+        return os.path.join(cls._base_fp["home_path"], cls._base_fp["llmware_path_name"], cls._fp["query_path_name"])
+
+    @classmethod
+    def get_prompt_path(cls):
+        return os.path.join(cls._base_fp["home_path"], cls._base_fp["llmware_path_name"], cls._fp["prompt_path_name"])
+
+    @classmethod
+    def get_tmp_path(cls):
+        return os.path.join(cls._base_fp["home_path"], cls._base_fp["llmware_path_name"], cls._fp["tmp_path_name"])
+
+    @classmethod
+    def get_path(cls, name):
+
+        if name + "_name" in cls._fp:
+            return os.path.join(cls._base_fp["home_path"], cls._base_fp["llmware_path_name"],
+                                cls._fp[name + "_name"])
+
+        raise HomePathDoesNotExistException(name)
+
+    @classmethod
+    def setup_llmware_workspace (cls):
+
+        # create file structure - configured through use of env variable ["HOME"]
+        home_path = cls._base_fp["home_path"]
+
+        if not os.path.exists(home_path):
+            raise HomePathDoesNotExistException(home_path)
+
+        llmware_path = cls.get_llmware_path()
+        if not 
os.path.exists(llmware_path): + os.mkdir(llmware_path) + + library_path = cls.get_library_path() + if not os.path.exists(library_path): + os.mkdir(library_path) + + input_path = cls.get_input_path() + if not os.path.exists(input_path): + os.mkdir(input_path) + + model_repo_path = cls.get_model_repo_path() + if not os.path.exists(model_repo_path): + os.mkdir(model_repo_path) + + parser_path = cls.get_parser_path() + if not os.path.exists(parser_path): + os.mkdir(parser_path) + + query_path = cls.get_query_path() + if not os.path.exists(query_path): + os.mkdir(query_path) + + prompt_path = cls.get_prompt_path() + if not os.path.exists(prompt_path): + os.mkdir(prompt_path) + + tmp_path = cls.get_tmp_path() + if not os.path.exists(tmp_path): + os.mkdir(tmp_path) + + # set 'open' read/write directory permissions, e.g., chmod 777 + os.chmod(library_path, 0o777) + os.chmod(input_path, 0o777) + os.chmod(model_repo_path, 0o777) + os.chmod(parser_path, 0o777) + os.chmod(query_path, 0o777) + os.chmod(prompt_path, 0o777) + os.chmod(tmp_path, 0o777) + + return 0 + + @classmethod + def create_new_account(cls, account_name): + + # will set up a secondary account file structure + # no management of account permissions in llmware- assumed to be handled in calling application + + library_path = cls.get_library_path() + new_account_path = os.path.join(library_path, account_name) + os.mkdir(new_account_path) + + return 0 + diff --git a/llmware/embeddings.py b/llmware/embeddings.py new file mode 100644 index 00000000..9da602e6 --- /dev/null +++ b/llmware/embeddings.py @@ -0,0 +1,523 @@ + +# Copyright 2023 llmware + +# Licensed under the Apache License, Version 2.0 (the "License"); you +# may not use this file except in compliance with the License. You +# may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. + + +import os +import faiss +import logging +import numpy as np +import re + +from bson import ObjectId +from pymilvus import connections, utility, FieldSchema, CollectionSchema, DataType, Collection + +from llmware.configs import LLMWareConfig +from llmware.resources import CollectionRetrieval, CollectionWriter +from llmware.exceptions import UnsupportedEmbeddingDatabaseException + + +class EmbeddingHandler: + + # An EmbeddingHandler is used for all embedding-related interactions with a library + # It provides a common set of methods that wrap the specific embedding classes. + + def __init__(self, library): + + self.supported_embedding_dbs = ["milvus", "faiss", "pinecone"] + self.library = library + + # Create a new embedding. + def create_new_embedding(self, embedding_db, model, doc_ids=None, batch_size=500): + + embedding_class = self._load_embedding_db(embedding_db, model) + embedding_status = embedding_class.create_new_embedding(doc_ids, batch_size) + + if embedding_status: + self.library.update_embedding_status("yes", model.model_name, embedding_db) + + return embedding_status + + # Search the vector space + def search_index(self, query_vector, embedding_db, model, sample_count=10): + # Need to normalize the query_vector. 
Sometimes it comes in as [[1.1,2.1,3.1]] (from Transformers) and sometimes as [1.1,2.1,3.1] + # We'll make sure it's the latter and then each Embedding Class will deal with it how it needs to + if len(query_vector) == 1: + query_vector = query_vector[0] + + embedding_class = self._load_embedding_db(embedding_db, model) + return embedding_class.search_index(query_vector,sample_count=sample_count) + + # Delete a specific index (for a given model) + def delete_index(self, embedding_db, model): + + embedding_class = self._load_embedding_db(embedding_db, model) + embedding_class.delete_embedding() + self.library.update_embedding_status(None, None, None) + + # Delete all embeddings for the given library across all embedding dbs + def delete_all_indexes(self): + EmbeddingMilvus(self.library).delete_all_indexes() + EmbeddingFAISS(self.library).delete_all_indexes() + try: + EmbeddingPinecone(self.library).delete_all_indexes() + except ImportError: + logging.info("Not deleting any pinecone indexes due to pinecone module not being present") + + # Load the appropriate embedding class and update the class variables + def _load_embedding_db(self, embedding_db, model): + + if not embedding_db in self.supported_embedding_dbs: + raise UnsupportedEmbeddingDatabaseException(embedding_db) + + if embedding_db == "milvus": + return EmbeddingMilvus(self.library, model) + + if embedding_db == "faiss": + return EmbeddingFAISS(self.library, model) + + if embedding_db == "pinecone": + return EmbeddingPinecone(self.library, model) + + def generate_index_name(self, account_name, library_name, model_name, max_component_length=19): + + index_name = account_name + + # Remove non-alphanumerics from the remaining components and if still longer than the max, remove middle chars + for s in [library_name, model_name]: + s = re.sub(r'\W+', '', s) + if len(s) > max_component_length: + excess_length = len(s) - max_component_length + left_length = (len(s) - excess_length) // 2 + right_start = left_length + excess_length + index_name += s[:left_length] + s[right_start:] + + # Return the lowercase name: + return index_name.lower() + + +class EmbeddingMilvus: + + def __init__(self, library, model): + + self.library = library + self.milvus_alias = "default" + + # Connect to milvus + connections.connect(self.milvus_alias, + host=LLMWareConfig.get_config("milvus_host"), + port=LLMWareConfig.get_config("milvus_port")) + + # look up model card + self.model = model + self.model_name = self.model.model_name + + # milvus - 255 chars - letters, numbers and "_" OK -> does not accept "-" or " " in collection name + # removes a few common non-alpha characters - we can expand the regex to be wider + # caps at 43 chars + two '_'s in collection name - conforms with Pinecone char size + # puts in lower case - conforms with Pinecone requirement + + converted_library_name = re.sub("[-@_.\/ ]", "", self.library.library_name).lower() + if len(converted_library_name) > 18: + converted_library_name = converted_library_name[0:18] + + converted_model_name = re.sub("[-@_.\/ ]", "", self.model_name).lower() + if len(converted_model_name) > 18: + # chops off the start of the model name if longer than 18 chars + starter = len(converted_model_name) - 18 + converted_model_name = converted_model_name[starter:] + + converted_account_name = re.sub("[-@_.\/ ]","", self.library.account_name).lower() + if len(converted_model_name) > 7: + converted_account_name = converted_account_name[0:7] + + # get collection name here + self.collection_name = 
f"{converted_account_name}_{converted_library_name}_{converted_model_name}" + + # If the Collection doesn't already exist, create it + if not utility.has_collection(self.collection_name): + fields = [ + FieldSchema(name="block_mongo_id", dtype=DataType.VARCHAR, is_primary=True, max_length=30,auto_id=False), + FieldSchema(name="block_doc_id", dtype=DataType.INT64), + FieldSchema(name="embedding_vector", dtype=DataType.FLOAT_VECTOR, dim=self.model.embedding_dims) + ] + + collection = Collection(self.collection_name, CollectionSchema(fields)) + index_params = { + "metric_type": "L2", + "index_type": "IVF_FLAT", + "params": {"nlist": 1024} + } + collection.create_index("embedding_vector", index_params) + + self.collection = Collection(self.collection_name) + + # will leave "-" and "_" in file path, but remove "@" and " " + model_safe_path = re.sub("[@ ]", "", self.model_name).lower() + self.mongo_key = "embedding_milvus_" + model_safe_path + + def __del__(self): + connections.disconnect("default") + + def create_new_embedding(self, doc_ids = None, batch_size=500): + + if doc_ids: + all_blocks_cursor = CollectionRetrieval(self.library.collection).filter_by_key_value_range("doc_ID", doc_ids) + else: + all_blocks_cursor = CollectionRetrieval\ + (self.library.collection).custom_filter({self.mongo_key: {"$exists": False }}) + + num_of_blocks = self.library.collection.count_documents({}) + embeddings_created = 0 + current_index = 0 + finished = False + + all_blocks_iter = iter(all_blocks_cursor) + while not finished: + block_ids, doc_ids, sentences = [], [], [] + # Build the next batch + for i in range(batch_size): + block = next(all_blocks_iter, None) + if not block: + finished = True + break + text_search = block["text_search"].strip() + if not text_search or len(text_search) < 1: + continue + block_ids.append(str(block["_id"])) + doc_ids.append(int(block["doc_ID"])) + sentences.append(text_search) + + if len(sentences) > 0: + # Process the batch + vectors = self.model.embedding(sentences) + data = [block_ids, doc_ids, vectors] + self.collection.insert(data) + + # Update mongo + for block_id in block_ids: + self.library.collection.update_one({"_id": ObjectId(block_id)}, {"$set": {self.mongo_key: + current_index}}) + current_index += 1 + + embeddings_created += len(sentences) + print (f"Embeddings Created: {embeddings_created} of {num_of_blocks}") + + self.collection.flush() + embedding_summary = {"embeddings_created": embeddings_created} + + return embedding_summary + + def search_index(self, query_embedding_vector, sample_count=10): + + self.collection.load() + + search_params = { + "metric_type": "L2", + "params": {"nprobe": 10} + } + result = self.collection.search( + data=[query_embedding_vector], + anns_field="embedding_vector", + param=search_params, + limit=sample_count, + output_fields=["block_mongo_id"] + ) + + block_list = [] + for hits in result: + for hit in hits: + _id = hit.entity.get('block_mongo_id') + block_cursor = CollectionRetrieval(self.library.collection).filter_by_key("_id", ObjectId(_id)) + if block_cursor: + block_list.append( (block_cursor[0], hit.distance) ) + + return block_list + + def delete_embedding(self): + collection = Collection(self.collection_name) + collection.release() + utility.drop_collection(self.collection_name) + connections.disconnect(self.milvus_alias) + + # Delete mongo fields + block_cursor = CollectionWriter(self.library.collection).update_many_records_custom({}, { + "$unset": {self.mongo_key: ""}}) + + def convert_to_underscores(self, 
input_string): + return input_string.replace("-", "_").replace(" ", "_") + + +class EmbeddingFAISS: + + def __init__(self, library, model=None): + + self.library = library + self.index = None + + self.model = model + self.model_name = model.model_name + if not self.model_name: + self.model_name = model.__class__.__name__ + + self.embedding_dims = self.model.embedding_dims + + # embedding file name here + + # will leave "-" and "_" in file path, but remove "@" and " " + model_safe_path = re.sub("[@\/. ]", "", self.model_name).lower() + + self.embedding_file_path = os.path.join(self.library.embedding_path, model_safe_path, "embedding_file_faiss") + self.mongo_key = "embedding_faiss_" + model_safe_path + + def create_new_embedding(self, doc_ids=None, batch_size=100): + + # Load or create index + if not self.index: + if os.path.exists(self.embedding_file_path): + self.index = faiss.read_index(self.embedding_file_path) + else: + self.index = faiss.IndexFlatL2(self.embedding_dims) + + if doc_ids: + all_blocks_cursor = CollectionRetrieval(self.library.collection).filter_by_key_value_range("doc_ID", doc_ids) + else: + all_blocks_cursor = CollectionRetrieval(self.library.collection).\ + custom_filter({self.mongo_key: { "$exists": False }}) + + num_of_blocks = self.library.collection.count_documents({}) + + # print("update: num_of_blocks = ", num_of_blocks) + + embeddings_created = 0 + finished = False + + # batch_size = 50 + + all_blocks_iter = iter(all_blocks_cursor) + while not finished: + + block_ids, sentences = [], [] + current_index = self.index.ntotal + # Build the next batch + for i in range(batch_size): + + block = next(all_blocks_iter, None) + + # print("update: faiss iteration thru collection - ", i) + + if not block: + finished = True + break + + text_search = block["text_search"].strip() + + # print("update: text_search - ", text_search) + + if not text_search or len(text_search) < 1: + continue + block_ids.append(str(block["_id"])) + sentences.append(text_search) + + if len(sentences) > 0: + # Process the batch + vectors = self.model.embedding(sentences) + self.index.add(np.array(vectors)) + + # Update mongo + for block_id in block_ids: + self.library.collection.update_one({"_id": ObjectId(block_id)}, {"$set": {self.mongo_key: current_index}}) + current_index += 1 + + embeddings_created += len(sentences) + print (f"Embeddings Created: {embeddings_created} of {num_of_blocks}") + + # Ensure any existing file is removed before saving + if os.path.exists(self.embedding_file_path): + os.remove(self.embedding_file_path) + os.makedirs(os.path.dirname(self.embedding_file_path), exist_ok=True) + faiss.write_index(self.index, self.embedding_file_path) + + embedding_summary = {"embeddings_created": embeddings_created} + return embedding_summary + + def search_index (self, query_embedding_vector, sample_count=10): + + if not self.index: + self.index = faiss.read_index(self.embedding_file_path) + + distance_list, index_list = self.index.search(np.array([query_embedding_vector]), sample_count) + + block_list = [] + for i, index in enumerate(index_list[0]): + index_int = int(index.item()) + block_cursor = CollectionRetrieval(self.library.collection).filter_by_key(self.mongo_key, index_int) + if block_cursor and block_cursor[0]: + block_list.append( (block_cursor[0], distance_list[0][i]) ) + + return block_list + + def delete_embedding(self): + if os.path.exists(self.embedding_file_path): + os.remove(self.embedding_file_path) + + # Delete mongo fields + block_cursor = 
CollectionWriter(self.library.collection).update_many_records_custom({}, { + "$unset": {self.mongo_key: ""}}) + + +class EmbeddingPinecone: + + def __init__(self, library, model=None): + + # Try to import pinecone + try: + import pinecone + except ImportError: + raise ImportError ( + "Could not import the pinecone Python package. " + "Please install it with 'pip install pinecone-client'" + ) + + self.api_key = os.environ.get("USER_MANAGED_PINECONE_API_KEY") + self.environment = os.environ.get("USER_MANAGED_PINECONE_ENVIRONMENT") + + self.library = library + + # look up model card + self.model_name = model.model_name + self.model = model + self.embedding_dims = model.embedding_dims + + # initialize pinecone + self.index = None + + # initiate connection to Pinecone + pinecone.init(api_key=self.api_key, environment=self.environment) + + # check index name - pinecone - 45 chars - numbers, letters and "-" ok - no "_" and all lowercase + + converted_library_name = re.sub("[-@_.\/ ]", "", self.library.library_name).lower() + if len(converted_library_name) > 18: + converted_library_name = converted_library_name[0:18] + + converted_model_name = re.sub("[-@_.\/ ]", "", self.model_name).lower() + if len(converted_model_name) > 18: + # chops off the start of the model name if longer than 18 chars + starter = len(converted_model_name) - 18 + converted_model_name = converted_model_name[starter:] + # converted_model_name = converted_model_name[0:18] + + converted_account_name = re.sub("[-@_.\/ ]", "", self.library.account_name).lower() + if len(converted_model_name) > 7: + converted_account_name = converted_account_name[0:7] + + # converted_library_name = self.convert_to_hyphens(self.library.library_name) + # converted_model_name = self.convert_to_hyphens(self.model_name) + + # build new name here + self.index_name = f"{converted_account_name}-{converted_library_name}-{converted_model_name}" + + if self.index_name not in pinecone.list_indexes(): + pinecone.create_index(self.index_name, dimension=self.embedding_dims, metric="euclidean") + pinecone.describe_index(self.index_name) # Waits for index to be created + # describe_index_stats() # Returns: {'dimension': 8, 'index_fullness': 0.0, 'namespaces': {'': {'vector_count': 5}}} + + # connect to index + self.index = pinecone.Index(self.index_name) + + # will leave "-" and "_" in file path, but remove "@" and " " + model_safe_path = re.sub("[@ ]", "", self.model_name).lower() + self.mongo_key = "embedding_pinecone_" + model_safe_path + + def create_new_embedding(self, doc_ids = None, batch_size=500): + + if doc_ids: + all_blocks_cursor = CollectionRetrieval(self.library.collection).filter_by_key_value_range("doc_ID", doc_ids) + else: + all_blocks_cursor = CollectionRetrieval(self.library.collection).\ + custom_filter({self.mongo_key: { "$exists": False }}) + + num_of_blocks = self.library.collection.count_documents({}) + embeddings_created = 0 + + # starting current_index @ 0 + current_index = 0 + + finished = False + + all_blocks_iter = iter(all_blocks_cursor) + while not finished: + block_ids, doc_ids, sentences = [], [], [] + # Build the next batch + for i in range(batch_size): + block = next(all_blocks_iter, None) + if not block: + finished = True + break + text_search = block["text_search"].strip() + if not text_search or len(text_search) < 1: + continue + block_ids.append(str(block["_id"])) + doc_ids.append(int(block["doc_ID"])) + sentences.append(text_search) + + if len(sentences) > 0: + # Process the batch + vectors = 
self.model.embedding(sentences).tolist() + + # expects records as tuples - (batch of _ids, batch of vectors, batch of dict metadata) + records = zip(block_ids, vectors) #, doc_ids) + # upsert to Pinecone + self.index.upsert(vectors=records) + + # Update mongo + for block_id in block_ids: + self.library.collection.update_one({"_id": ObjectId(block_id)}, + {"$set": {self.mongo_key: current_index}}) + current_index += 1 + + embeddings_created += len(sentences) + print (f"Embeddings Created: {embeddings_created} of {num_of_blocks}") + + embedding_summary = {"embeddings_created": embeddings_created} + return embedding_summary + + def search_index(self, query_embedding_vector, sample_count=10): + + result = self.index.query(vector=query_embedding_vector.tolist(), top_k=sample_count,include_values=True) + + block_list = [] + for match in result["matches"]: + _id = match["id"] + block_cursor = CollectionRetrieval(self.library.collection).filter_by_key("_id", ObjectId(_id)) + if block_cursor and block_cursor[0]: + distance = match["score"] + block_list.append( (block_cursor[0], distance) ) + + return block_list + + def delete_index(self, index_name): + pinecone.delete_index(index_name) + + # Delete mongo fields + block_cursor = CollectionWriter(self.library.collection).update_many_records_custom({}, { + "$unset": {self.mongo_key: ""}}) + + def delete_all_indexes(self): + placeholder_no_action_taken_currently = 0 + + def convert_to_hyphens(self, input_string): + return input_string.replace("_", "-").replace(" ", "-").lower() + diff --git a/llmware/exceptions.py b/llmware/exceptions.py new file mode 100644 index 00000000..a9b83626 --- /dev/null +++ b/llmware/exceptions.py @@ -0,0 +1,141 @@ + +# Copyright 2023 llmware + +# Licensed under the Apache License, Version 2.0 (the "License"); you +# may not use this file except in compliance with the License. You +# may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. 
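A short sketch of how calling code might use the exception hierarchy defined below (illustrative only, not part of the patch; the library name is hypothetical). Since every class derives from LLMWareException, a caller can handle one specific failure and still keep a broad fallback.

```python
from llmware.library import Library
from llmware.exceptions import LibraryNotFoundException, LLMWareException

try:
    lib = Library().load_library("agreements_lib")        # hypothetical library name
except LibraryNotFoundException:
    lib = Library().create_new_library("agreements_lib")  # fall back to creating it
except LLMWareException as e:
    print(f"llmware error: {e.message}")                  # base class exposes .message
```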
+
+
+# Base exception class for all others
+class LLMWareException(Exception):
+
+    __module__ = 'llmware'
+
+    def __init__(self, message="An unspecified error occurred"):
+        super().__init__(message)
+        self.message = message
+
+
+class UnsupportedEmbeddingDatabaseException(LLMWareException):
+
+    def __init__(self, embedding_db):
+        message = f"'{embedding_db}' is not a supported vector embedding database"
+        super().__init__(message)
+
+
+class LLMInferenceResponseException(LLMWareException):
+
+    def __init__(self, cloud_api_response):
+        message = f"LLM inference produced an error response: '{cloud_api_response}'"
+        super().__init__(message)
+
+
+class HomePathDoesNotExistException(LLMWareException):
+
+    def __init__(self, home_path):
+        message = f"'{home_path}' file path does not exist"
+        super().__init__(message)
+
+
+class FilePathDoesNotExistException(LLMWareException):
+
+    def __init__(self, file_path):
+        message = f"'{file_path}' file path does not exist"
+        super().__init__(message)
+
+
+class UnsupportedCollectionDatabaseException(LLMWareException):
+
+    def __init__(self, collection_db):
+        message = f"'{collection_db}' is not currently a supported collection database"
+        super().__init__(message)
+
+
+class CollectionDatabaseNotFoundException(LLMWareException):
+
+    def __init__(self, uri):
+        message = f"'{uri}' path to collection database is not connected currently. Library functions, such " \
+                  f"as add_files, Query, Embedding, and Graph require connection to a collection database to " \
+                  f"store, organize and index artifacts."
+
+        super().__init__(message)
+
+
+class PromptStateNotFoundException(LLMWareException):
+
+    def __init__(self, prompt_id):
+        message = f"'{prompt_id}' could not be located"
+        super().__init__(message)
+
+
+class PromptNotInCatalogException(LLMWareException):
+
+    def __init__(self, prompt_name):
+        message = f"'{prompt_name}' could not be located in the Prompt Catalog."
+        super().__init__(message)
+
+
+class DependencyNotInstalledException(LLMWareException):
+
+    def __init__(self, required_library_dependency):
+        message = f"'{required_library_dependency}' needs to be installed to use this function. Please refer to the " \
+                  f"documentation with any questions. "
+        super().__init__(message)
+
+
+class LibraryNotFoundException(LLMWareException):
+
+    def __init__(self, library_name, account_name):
+        message = f"'{library_name}' in '{account_name}' could not be located"
+        super().__init__(message)
+
+
+# when Library obj passed, and either null or lacking correct attributes
+class LibraryObjectNotFoundException(LLMWareException):
+
+    def __init__(self, library):
+        message = f"'{library}' object must be passed to use this function."
+        super().__init__(message)
+
+
+class ModelNotFoundException(LLMWareException):
+
+    def __init__(self, model_name):
+        message = f"'{model_name}' could not be located"
+        super().__init__(message)
+
+
+class ImportingSentenceTransformerRequiresModelNameException(LLMWareException):
+
+    def __init__(self):
+        message = f"Importing a sentence transformer model requires that a name is provided so that the model " \
+                  f"can be looked up in the future to retrieve the embeddings."
+        super().__init__(message)
+
+
+class APIKeyNotFoundException(LLMWareException):
+
+    def __init__(self, model_name):
+        message = f"API key for '{model_name}' could not be located"
+        super().__init__(message)
+
+
+class SetUpLLMWareWorkspaceException(LLMWareException):
+
+    def __init__(self, home_path):
+        message = f"Setting up llmware workspace at '{home_path}'. 
To set up a custom path, " \ + f"call Setup() explicitly with Setup().setup_llmware_workspace(home_path='/my/folder/')" + + super().__init__(message) + + + + diff --git a/llmware/lib/darwin/arm64/libgraph_llmware.dylib b/llmware/lib/darwin/arm64/libgraph_llmware.dylib new file mode 100755 index 00000000..2f7ebe8f Binary files /dev/null and b/llmware/lib/darwin/arm64/libgraph_llmware.dylib differ diff --git a/llmware/lib/darwin/arm64/liboffice_llmware.dylib b/llmware/lib/darwin/arm64/liboffice_llmware.dylib new file mode 100755 index 00000000..3f68e134 Binary files /dev/null and b/llmware/lib/darwin/arm64/liboffice_llmware.dylib differ diff --git a/llmware/lib/darwin/arm64/libpdf_llmware.dylib b/llmware/lib/darwin/arm64/libpdf_llmware.dylib new file mode 100755 index 00000000..b08746dc Binary files /dev/null and b/llmware/lib/darwin/arm64/libpdf_llmware.dylib differ diff --git a/llmware/lib/darwin/x86_64/libgraph_llmware.dylib b/llmware/lib/darwin/x86_64/libgraph_llmware.dylib new file mode 100755 index 00000000..cfeca5f1 Binary files /dev/null and b/llmware/lib/darwin/x86_64/libgraph_llmware.dylib differ diff --git a/llmware/lib/darwin/x86_64/liboffice_llmware.dylib b/llmware/lib/darwin/x86_64/liboffice_llmware.dylib new file mode 100755 index 00000000..c950066a Binary files /dev/null and b/llmware/lib/darwin/x86_64/liboffice_llmware.dylib differ diff --git a/llmware/lib/darwin/x86_64/libpdf_llmware.dylib b/llmware/lib/darwin/x86_64/libpdf_llmware.dylib new file mode 100755 index 00000000..8733ff21 Binary files /dev/null and b/llmware/lib/darwin/x86_64/libpdf_llmware.dylib differ diff --git a/llmware/lib/linux/aarch64/libgraph_llmware.so b/llmware/lib/linux/aarch64/libgraph_llmware.so new file mode 100755 index 00000000..7f603406 Binary files /dev/null and b/llmware/lib/linux/aarch64/libgraph_llmware.so differ diff --git a/llmware/lib/linux/aarch64/liboffice_llmware.so b/llmware/lib/linux/aarch64/liboffice_llmware.so new file mode 100755 index 00000000..5cf22076 Binary files /dev/null and b/llmware/lib/linux/aarch64/liboffice_llmware.so differ diff --git a/llmware/lib/linux/aarch64/libpdf_llmware.so b/llmware/lib/linux/aarch64/libpdf_llmware.so new file mode 100755 index 00000000..c264189a Binary files /dev/null and b/llmware/lib/linux/aarch64/libpdf_llmware.so differ diff --git a/llmware/lib/linux/x86_64/libgraph_llmware.so b/llmware/lib/linux/x86_64/libgraph_llmware.so new file mode 100755 index 00000000..8a104e2d Binary files /dev/null and b/llmware/lib/linux/x86_64/libgraph_llmware.so differ diff --git a/llmware/lib/linux/x86_64/liboffice_llmware.so b/llmware/lib/linux/x86_64/liboffice_llmware.so new file mode 100755 index 00000000..82de2cef Binary files /dev/null and b/llmware/lib/linux/x86_64/liboffice_llmware.so differ diff --git a/llmware/lib/linux/x86_64/libpdf_llmware.so b/llmware/lib/linux/x86_64/libpdf_llmware.so new file mode 100755 index 00000000..669a4124 Binary files /dev/null and b/llmware/lib/linux/x86_64/libpdf_llmware.so differ diff --git a/llmware/library.py b/llmware/library.py new file mode 100644 index 00000000..60cfddf3 --- /dev/null +++ b/llmware/library.py @@ -0,0 +1,580 @@ + +# Copyright 2023 llmware + +# Licensed under the Apache License, Version 2.0 (the "License"); you +# may not use this file except in compliance with the License. 
You +# may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. + + +from werkzeug.utils import secure_filename +import shutil +import os +import json +import logging + +from llmware.configs import LLMWareConfig +from llmware.util import Utilities, Graph +from llmware.parsers import Parser +from llmware.models import ModelCatalog +from llmware.resources import LibraryCatalog, LibraryCollection, CollectionRetrieval, CollectionWriter, \ + CloudBucketManager, check_db_uri +from llmware.embeddings import EmbeddingHandler +from llmware.exceptions import LibraryNotFoundException, SetUpLLMWareWorkspaceException, \ + CollectionDatabaseNotFoundException, ImportingSentenceTransformerRequiresModelNameException + + +class Library: + + def __init__(self): + + # default settings for basic parameters + self.account_name = None + self.library_name = None + + # base file paths in each library + self.library_main_path = None + + # each of these paths hang off library_main_path + self.file_copy_path = None + self.image_path = None + self.dataset_path = None + self.nlp_path = None + self.output_path = None + self.tmp_path = None + self.embedding_path = None + + # default key structure of block -> re-order for nicer display + self.default_keys = ["block_ID", "doc_ID", "content_type", "file_type","master_index","master_index2", + "coords_x", "coords_y", "coords_cx", "coords_cy", "author_or_speaker", "modified_date", + "created_date", "creator_tool", "added_to_collection", "file_source", + "table", "external_files", "text", "header_text", "text_search", + "user_tags", "special_field1", "special_field2", "special_field3","graph_status","dialog"] + + # default library card elements + self.default_library_card = ["library_name", "embedding_status", "embedding_model", "embedding_db", + "knowledge_graph", "unique_doc_id", "documents", "blocks", "images", "pages", + "tables"] + + self.block_size_target_characters = 400 + + # attributes used in parsing workflow + self.doc_ID = 0 + self.block_ID = 0 + + # db settings + self.collection = None + + # check for llmware path & create if not already set up + if not os.path.exists(LLMWareConfig.get_llmware_path()): + + # if not explicitly set up by user, then create folder directory structure + LLMWareConfig.setup_llmware_workspace() + + # check if collection datastore is connected + if not check_db_uri(timeout_secs=3): + raise CollectionDatabaseNotFoundException(LLMWareConfig.get_config("collection_db_uri")) + + # explicit constructor to create a new library + def create_new_library(self, library_name, account_name="llmware"): + + # note: default behavior - if library with same name already exists, then it loads existing library + + self.library_name = library_name + self.account_name = account_name + + # apply safety check to library_name path + library_name = secure_filename(library_name) + + library_exists = self.check_if_library_exists(library_name,account_name) + + if library_exists: + # do not create + logging.info("update: library already exists - returning library - %s - %s ", library_name, account_name) + return self.load_library(library_name, account_name) + + # allow 'dynamic' creation of a new account path + account_path 
= os.path.join(LLMWareConfig.get_library_path(), account_name) + if not os.path.exists(account_path): + os.makedirs(account_path,exist_ok=True) + + self.library_main_path = os.path.join(LLMWareConfig.get_library_path(), account_name, library_name) + + # add new file dir for this collection + self.file_copy_path = os.path.join(self.library_main_path,"uploads/") + self.image_path = os.path.join(self.library_main_path, "images/") + self.dataset_path = os.path.join(self.library_main_path, "datasets/") + self.nlp_path = os.path.join(self.library_main_path, "nlp/") + self.output_path = os.path.join(self.library_main_path, "output/") + self.tmp_path = os.path.join(self.library_main_path, "tmp/") + self.embedding_path = os.path.join(self.library_main_path, "embedding/") + + library_folder = os.path.exists(self.library_main_path) + + # this is a new library to create -> build file paths for work products + if not library_folder: + os.mkdir(self.library_main_path) + os.mkdir(self.file_copy_path) + os.mkdir(self.image_path) + os.mkdir(self.dataset_path) + os.mkdir(self.nlp_path) + os.mkdir(self.output_path) + os.mkdir(self.tmp_path) + os.mkdir(self.embedding_path) + os.chmod(self.dataset_path, 0o777) + os.chmod(self.nlp_path, 0o777) + os.chmod(self.output_path, 0o777) + os.chmod(self.tmp_path, 0o777) + os.chmod(self.embedding_path, 0o777) + + new_library_entry = {"library_name": self.library_name, + + # track embedding status - each embedding tracked as new dict in list + # --by default, when library created, no embedding in place + + "embedding": [{"embedding_status": "no", "embedding_model": "none", "embedding_db": "none"}], + + # knowledge graph + "knowledge_graph": "no", + + # doc trackers + "unique_doc_id": 0, "documents": 0, "blocks": 0, "images": 0, "pages": 0, "tables": 0} + + # LibraryCatalog will register the new library card + new_library_card = LibraryCatalog(self).create_new_library_card(new_library_entry) + + # assumes DB Connection for saving .collection + self.collection = LibraryCollection(self).create_library_collection() + + return self + + def load_library(self, library_name, account_name="llmware"): + + # first check that library exists + + library_exists = self.check_if_library_exists(library_name) + + if not library_exists: + logging.error("error: library/account not found - %s - %s ", library_name, account_name) + raise LibraryNotFoundException(library_name, account_name) + + self.library_name = library_name + self.account_name = account_name + self.library_main_path = os.path.join(LLMWareConfig.get_library_path(), account_name, library_name) + + # add new file dir for this collection + self.file_copy_path = os.path.join(self.library_main_path, "uploads/") + self.image_path = os.path.join(self.library_main_path, "images/") + self.dataset_path = os.path.join(self.library_main_path, "datasets/") + self.nlp_path = os.path.join(self.library_main_path, "nlp/") + self.output_path = os.path.join(self.library_main_path, "output/") + self.tmp_path = os.path.join(self.library_main_path, "tmp/") + self.embedding_path = os.path.join(self.library_main_path, "embedding/") + os.makedirs(self.library_main_path, exist_ok=True) + os.makedirs(self.file_copy_path,exist_ok=True) + os.makedirs(self.image_path,exist_ok=True) + os.makedirs(self.dataset_path,exist_ok=True) + os.makedirs(self.nlp_path,exist_ok=True) + os.makedirs(self.output_path,exist_ok=True) + os.makedirs(self.tmp_path,exist_ok=True) + os.makedirs(self.embedding_path,exist_ok=True) + # assumes DB Connection for saving collection + 
self.collection = LibraryCollection(self).create_library_collection() + + return self + + def get_library_card(self, library_name=None, account_name="llmware"): + + if library_name: + self.library_name = library_name + + if account_name: + self.account_name = account_name + + library_card= LibraryCatalog().get_library_card(self.library_name, account_name=account_name) + + if not library_card: + logging.warning("warning: error retrieving library card - not found - %s - %s ", library_name, account_name) + + return library_card + + def check_if_library_exists(self, library_name, account_name="llmware"): + + # first look in library catalog + library_card = LibraryCatalog().get_library_card(library_name, account_name=account_name) + + # check file path + lib_path = os.path.join(LLMWareConfig.get_library_path(), account_name, library_name) + library_folder = os.path.exists(lib_path) + + # if all checks consistent + if library_card and library_folder: + # library exists and is in good state + return library_card + + if not library_card and not library_folder: + # library does not exist conclusively + return None + + # may be error state - some artifacts exist and others do not + if library_card: + # view the library_card as the definitive record + return library_card + + return library_card + + def update_embedding_status (self, status_message, embedding_model, embedding_db): + + # sets three parameters for embedding - + # "embedding_status", e.g., is embedding completed for library + # "embedding_model", e.g., what is the embedding model used to create the embedding + # "embedding_db":, e.g., Milvus, FAISS, Pinecone + + # special handling for updating "embedding" in update_library_card + # -- will append this new embedding dict to the end of the embedding list + + update_dict = {"embedding": {"embedding_status": status_message, + "embedding_model": embedding_model, + "embedding_db": embedding_db}} + + updater = LibraryCatalog(self).update_library_card(self.library_name, update_dict) + + return True + + def get_embedding_status (self): + + library_card = LibraryCatalog(self).get_library_card(self.library_name, account_name=self.account_name) + + if not library_card: + + logging.error("error: library/account not found - %s - %s ", self.library_name, self.account_name) + raise LibraryNotFoundException(self.library_name, self.account_name) + + # embedding record will be a list of {"embedding_status" | "embedding_model" | "embedding_db"} + logging.info("update: library_card - %s ", library_card) + + if "embedding" in library_card: + embedding_record = library_card["embedding"] + else: + logging.warning("warning: could not identify embedding record in library card - %s ", library_card) + embedding_record = None + + """ + embedding_record = {"embedding_status": library_card["embedding_status"], + "embedding_model": library_card["embedding_model"], + "embedding_db": library_card["embedding_db"]} + """ + + return embedding_record + + def get_knowledge_graph_status (self): + + library_card = LibraryCatalog(self).get_library_card(self.library_name, self.account_name) + + if not library_card: + logging.error("error: library/account not found - %s - %s ", self.library_name, self.account_name) + raise LibraryNotFoundException(self.library_name, self.account_name) + + status_message = library_card["knowledge_graph"] + + return status_message + + def set_knowledge_graph_status (self, status_message): + + update_dict = {"knowledge_graph": status_message} + updater = 
LibraryCatalog(self).update_library_card(self.library_name,update_dict) + + return True + + def get_and_increment_doc_id(self): + unique_doc_id = LibraryCatalog(self).get_and_increment_doc_id(self.library_name) + return unique_doc_id + + def set_incremental_docs_blocks_images(self, added_docs=0, added_blocks=0, added_images=0, added_pages=0, + added_tables=0): + + # updates counting parameters at end of parsing + updater = LibraryCatalog(self).set_incremental_docs_blocks_images(added_docs=added_docs, + added_blocks=added_blocks, + added_images=added_images, + added_pages=added_pages, + added_tables=added_tables) + + return True + + # Helper method to support adding a single file to a library + def add_file(self, file_path): + # Ensure the input path exists + os.makedirs(LLMWareConfig.get_input_path(), exist_ok=True) + + file_name = os.path.basename(file_path) + target_path = os.path.join(LLMWareConfig.get_input_path(), file_name) + + shutil.copyfile(file_path,target_path) + self.add_files() + + # main method for adding file - pass local filepath and appropriate parsers will be called + def add_files (self, input_folder_path=None): + + if not input_folder_path: + input_folder_path = LLMWareConfig.get_input_path() + + # get overall counters at start of process + lib_counters_before = self.get_library_card() + + logging.info("update: lib_counters_before - %s ", lib_counters_before) + + parsing_results = Parser(library=self).ingest(input_folder_path,dupe_check=True) + + logging.info("update: parsing results - %s ", parsing_results) + + # post-processing: get the updated lib_counters + lib_counters_after = self.get_library_card() + + # parsing_results = {"processed_files" | "rejected_files" | "duplicate_files"} + output_results = None + + if lib_counters_after and lib_counters_before: + output_results = {"docs_added": lib_counters_after["documents"] - lib_counters_before["documents"], + "blocks_added": lib_counters_after["blocks"] - lib_counters_before["blocks"], + "images_added": lib_counters_after["images"] - lib_counters_before["images"], + "pages_added": lib_counters_after["pages"] - lib_counters_before["pages"], + "tables_added": lib_counters_after["tables"] - lib_counters_before["tables"], + "rejected_files": parsing_results["rejected_files"]} + else: + logging.error("error: unexpected - could not identify the library_card correctly") + + logging.info("update: output_results - %s ", output_results) + + # update collection text index in collection after adding documents + LibraryCollection(self).create_index() + + return output_results + + def export_library_to_txt_file(self, output_fp=None, output_fn=None, include_text=True, include_tables=True, + include_images=False): + + if not output_fp: + output_fp = self.output_path + + if not output_fn: + output_fn = self.library_name + "_" + str(Utilities().get_current_time_now()) + + filter_list = [] + if include_text: filter_list.append("text") + if include_tables: filter_list.append("table") + if include_images: filter_list.append("images") + + if not filter_list: + # go with default - text only + filter_list = ["text"] + + results = CollectionRetrieval(self.collection).filter_by_key_value_range("content_type",filter_list) + + file_location = os.path.join(output_fp + "/", output_fn + ".txt") + output_file = open(file_location, "w") + text_field = "text_search" + for elements in results: + new_entry = elements[text_field].strip() + "\n" + output_file.write(new_entry) + + output_file.close() + + return file_location + + def 
export_library_to_jsonl_file(self, output_fp, output_fn, include_text=True, include_tables=True, + include_images=False, dict_keys=None): + + if not output_fp: + output_fp = self.output_path + + if not output_fn: + output_fn = self.library_name + "_" + str(Utilities().get_current_time_now()) + + # expects dict_keys to be a list of dictionary keys + if not dict_keys: + dict_keys = self.default_keys + + filter_list = [] + if include_text: filter_list.append("text") + if include_tables: filter_list.append("table") + if include_images: filter_list.append("images") + + if not filter_list: + # go with default - text only + filter_list = ["text"] + + results = CollectionRetrieval(self.collection).filter_by_key_value_range("content_type", filter_list) + + file_location = os.path.join(output_fp, output_fn + ".jsonl") + output_file = open(file_location, "w") + + for elements in results: + + # package up each jsonl entry as dict with selected keys to extract + new_dict_entry = {} + for keys in dict_keys: + if keys in elements: + new_dict_entry.update({keys:elements[keys]}) + + if new_dict_entry: + jsonl_row = json.dumps(new_dict_entry) + output_file.write(jsonl_row) + output_file.write("\n") + + output_file.close() + + return file_location + + def pull_files_from_cloud_bucket (self, aws_access_key=None, aws_secret_key=None, bucket_name=None): + + # pull files into local cache for processing + files_copied = CloudBucketManager().connect_to_user_s3_bucket (aws_access_key, aws_secret_key, + bucket_name, LLMWareConfig.get_input_path()) + + return files_copied + + def generate_knowledge_graph(self): + kg = Graph(library=self).build_graph() + self.set_knowledge_graph_status("yes") + return 0 + + def install_new_embedding (self, embedding_model_name=None, vector_db="milvus", + from_hf= False, from_sentence_transformer=False, model=None, tokenizer=None, model_api_key=None, + vector_db_api_key=None, batch_size=500): + + embeddings = None + my_model = None + + # step 1 - load selected model from ModelCatalog - will pass 'loaded' model to the EmbeddingHandler + + # check if instantiated model and tokenizer -> load as HuggingFace model + if model: + if from_hf: + logging.warning("update: loading hf model") + my_model = ModelCatalog().load_hf_embedding_model(model, tokenizer) + batch_size = 50 + + if from_sentence_transformer: + logging.warning("update: loading sentence transformer model") + if not embedding_model_name: + raise ImportingSentenceTransformerRequiresModelNameException + + my_model = ModelCatalog().load_sentence_transformer_model(model,embedding_model_name) + else: + # if no model explicitly passed, then look up in the model catalog + if embedding_model_name: + my_model = ModelCatalog().load_model(selected_model=embedding_model_name, api_key=model_api_key) + + if not my_model: + logging.error("error: install_new_embedding - can not identify a selected model") + return -1 + + # step 2 - pass loaded embedding model to EmbeddingHandler, which will route to the appropriate resource + embeddings = EmbeddingHandler(self).create_new_embedding(vector_db, my_model, batch_size=batch_size) + + if not embeddings: + logging.warning("warning: no embeddings created") + + return embeddings + + def delete_library(self, library_name=None, confirm_delete=False): + + # remove all artifacts from library to wipe the slate clean + + if library_name: + self.library_name = library_name + + success_code = 1 + + try: + if confirm_delete: + + # 1st - remove the blocks - drop the collection in database + 
CollectionWriter(self.collection).destroy_collection(confirm_destroy=True) + + # 2nd - Eliminate the local file structure + file_path = self.library_main_path + shutil.rmtree(file_path) + + # 3rd - remove record in LibraryCatalog + LibraryCatalog(self).delete_library_card(self.library_name) + + logging.info("update: deleted all library file artifacts + folders") + + except: + logging.exception("Error destroying library") + success_code = -1 + + return success_code + + def update_block (self, doc_id, block_id, key, new_value): + completed = CollectionWriter(self.collection).update_block(doc_id, block_id,key,new_value,self.default_keys) + return completed + + def add_website (self, url, get_links=True, max_links=5): + + Parser(library=self).parse_website(url,get_links=get_links,max_links=max_links) + LibraryCollection(self).create_index(self.library_name) + + return self + + def add_wiki(self, topic_list,target_results=10): + Parser(library=self).parse_wiki(topic_list,target_results=target_results) + LibraryCollection(self).create_index(self.library_name) + return self + + def add_dialogs(self, input_folder=None): + if not input_folder: + input_folder = LLMWareConfig.get_input_path() + + output = Parser(library=self).parse_dialog(input_folder) + + return self + + def add_image(self, input_folder=None): + if not input_folder: + input_folder = LLMWareConfig.get_input_path() + + output = Parser(library=self).parse_image(input_folder) + + return self + + def add_pdf_by_ocr(self, input_folder=None): + + if not input_folder: + input_folder = LLMWareConfig.get_input_path() + + output = Parser(library=self).parse_pdf_by_ocr_images(input_folder) + + return self + + def add_pdf(self, input_folder=None): + + if not input_folder: + input_folder = LLMWareConfig.get_input_path() + + output = Parser(library=self).parse_pdf(input_folder) + + return self + + def add_office(self, input_folder=None): + + if not input_folder: + input_folder = LLMWareConfig.get_input_path() + + output = Parser(library=self).parse_office(input_folder) + + return self + + def get_all_library_cards(self, account_name='llmware'): + library_cards = LibraryCatalog(account_name=account_name).all_library_cards() + return library_cards + diff --git a/llmware/models.py b/llmware/models.py new file mode 100644 index 00000000..512d5cf7 --- /dev/null +++ b/llmware/models.py @@ -0,0 +1,2828 @@ + +# Copyright 2023 llmware + +# Licensed under the Apache License, Version 2.0 (the "License"); you +# may not use this file except in compliance with the License. You +# may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. 
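+ 
+ # models.py brings together the ModelCatalog (model card lookup and loading) with wrapper classes for
+ # API-based generative models (OpenAI, Anthropic Claude, Google PaLM, Cohere, AI21 Jurassic, AIB Read GPT)
+ # and embedding models (OpenAI, Cohere, Google, HuggingFace, and sentence-transformer / LLMWare semantic models)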
+ + +import logging +import json +import numpy as np +import os +import re +import requests +import tempfile +import traceback +import ast +from collections import OrderedDict +from tqdm.auto import trange + +import torch +from torch.utils.data import Dataset +import torch.nn.functional as F +from torch import Tensor, nn +from tqdm.autonotebook import trange +import math +import inspect + +import torch.utils.checkpoint +from torch.nn import CrossEntropyLoss, BCEWithLogitsLoss + +from llmware.util import Utilities, PromptCatalog +from llmware.configs import LLMWareConfig +from llmware.resources import CloudBucketManager +from llmware.exceptions import ModelNotFoundException, LLMInferenceResponseException + +# api model imports +import openai, anthropic, ai21, cohere +from vertexai.preview.language_models import TextGenerationModel, TextEmbeddingModel +from vertexai import init +import google.cloud.aiplatform as aiplatform + + +global_model_repo_catalog_list = [ + + # embedding models + {"model_name": 'mini-lm-sbert', "display_name": "Sentence_Transformers (MPNet-Base)", "model_family": "LLMWareSemanticModel", + "model_category": "embedding", "model_location": "llmware_repo", "is_trainable": "yes", "embedding_dims": 384}, + + {"model_name": 'industry-bert-insurance', "display_name": "Insurance_LLMWare_Accelerator", "model_family": "LLMWareSemanticModel", + "model_category": "embedding", "model_location": "llmware_repo", "is_trainable": "yes", "embedding_dims": 768}, + + {"model_name": 'industry-bert-contracts', "display_name": "Contracts_LLMWare_Accelerator", "model_family": "LLMWareSemanticModel", + "model_category": "embedding", "model_location": "llmware_repo", "is_trainable": "yes", "embedding_dims": 768}, + + {"model_name": 'industry-bert-asset-management', "display_name": "Asset_Management_LLMWare_Accelerator", + "model_family": "LLMWareSemanticModel", + "model_category": "embedding", "model_location": "llmware_repo", "is_trainable": "yes", "embedding_dims": 768}, + + {"model_name": 'industry-bert-sec', "display_name": "SEC_LLMWare_Accelerator", "model_family": "LLMWareSemanticModel", + "model_category": "embedding", "model_location": "llmware_repo", "is_trainable": "yes", "embedding_dims": 768}, + + # add open ai embeddings + {"model_name": 'text-embedding-ada-002', "display_name": "OpenAI-Embedding", "model_family": "OpenAIEmbeddingModel", + "model_category": "embedding", "model_location": "api", "is_trainable": "no", "context_window": 2048, + "embedding_dims": 1536}, + + # add cohere embeddings + {"model_name": 'medium', "display_name": "Cohere-Medium-Embedding", "model_family": "CohereEmbeddingModel", + "model_category": "embedding", "model_location": "api", "is_trainable": "no","context_window": 2048, + "embedding_dims": 4096}, + + {"model_name": 'xlarge', "display_name": "Cohere-XLarge-Embedding", "model_family": "CohereEmbeddingModel", + "model_category": "embedding", "model_location": "api", "is_trainable": "no", "context_window": 2048, + "embedding_dims": 4096}, + + # add google embeddings + # textembedding-gecko@001 + {"model_name": 'textembedding-gecko@latest', "display_name": "Google-Embedding", "model_family": "GoogleEmbeddingModel", + "model_category": "embedding","model_location": "api", "is_trainable": "no", "context_window": 4000, + "embedding_dims": 768}, + + # generative-api models + {"model_name": 'claude-v1', "display_name": "Anthropic Claude-v1", "model_family": "ClaudeModel", + "model_category": "generative-api", "model_location": "api", "is_trainable": "no", + 
"context_window": 8000}, + {"model_name": 'claude-instant-v1', "display_name": "Anthropic Claude-Instant-v1", "model_family": "ClaudeModel", + "model_category": "generative-api","model_location": "api", "is_trainable": "no", + "context_window": 8000}, + {"model_name": 'command-medium-nightly', "display_name": "Cohere Command Medium", "model_family": "CohereGenModel", + "model_category": "generative-api","model_location": "api", "is_trainable": "no", + "context_window": 2048}, + {"model_name": 'command-xlarge-nightly', "display_name": "Cohere Command XLarge", "model_family": "CohereGenModel", + "model_category": "generative-api","model_location": "api", "is_trainable": "no", + "context_window": 2048}, + + {"model_name": 'summarize-xlarge', "display_name": "Cohere Summarize Xlarge", "model_family": "CohereGenModel", + "model_category":"generative-api","model_location": "api", "is_trainable": "no", + "context_window": 2048}, + {"model_name": 'summarize-medium', "display_name": "Cohere Summarize Medium", "model_family": "CohereGenModel", + "model_category":"generative-api","model_location": "api", "is_trainable": "no", + "context_window": 2048}, + {"model_name": 'j2-jumbo-instruct', "display_name": "Jurassic-2-Jumbo-Instruct", "model_family": "JurassicModel", + "model_category":"generative-api", "model_location": "api", "is_trainable": "no", + "context_window": 2048}, + {"model_name": 'j2-grande-instruct', "display_name": "Jurassic-2-Grande-Instruct", "model_family": "JurassicModel", + "model_category":"generative-api","model_location": "api", "is_trainable": "no", + "context_window": 2048}, + {"model_name": 'text-bison@001', "display_name": "Google Palm", "model_family": "GoogleGenModel", + "model_category": "generative-api","model_location": "api", "is_trainable": "no", + "context_window": 8192}, + {"model_name": 'chat-bison@001', "display_name": "Google Chat", "model_family": "GoogleGenModel", + "model_category": "generative-api","model_location": "api", "is_trainable": "no", + "context_window": 8192}, + {"model_name": 'text-davinci-003', "display_name": "GPT3-Davinci", "model_family": "OpenAIGenModel", + "model_category": "generative-api","model_location": "api", "is_trainable": "no", + "context_window": 4096}, + {"model_name": 'text-curie-001', "display_name": "GPT3-Curie", "model_family": "OpenAIGenModel", + "model_category": "generative-api","model_location": "api", "is_trainable": "no", + "context_window": 2048}, + {"model_name": 'text-babbage-001', "display_name": "GPT3-Babbage", "model_family": "OpenAIGenModel", + "model_category": "generative-api","model_location": "api", "is_trainable": "no", + "context_window": 2048}, + {"model_name": 'text-ada-001', "display_name": "GPT3-Ada", "model_family": "OpenAIGenModel", + "model_category": "generative-api","model_location": "api", "is_trainable": "no", + "context_window": 2048}, + {"model_name": "gpt-3.5-turbo", "display_name": "ChatGPT", "model_family": "OpenAIGenModel", + "model_category": "generative-api","model_location": "api", "is_trainable": "no", + "context_window": 4000}, + + # gpt-4 add + {"model_name": "gpt-4", "display_name": "GPT-4", "model_family": "OpenAIGenModel", + "model_category": "generative-api", "model_location": "api", "is_trainable": "no", + "context_window": 8000}, + + # gpt-3.5-turbo-instruct + {"model_name": "gpt-3.5-turbo-instruct", "display_name": "GPT-3.5-Instruct", "model_family": "OpenAIGenModel", + "model_category": "generative-api", "model_location": "api", "is_trainable": "no", + "context_window": 4000}, 
+ + # generative AIB models - aib-read-gpt - "main model" + {"model_name": "aib-read-gpt", "display_name": "AIB-READ-GPT", "model_family": "AIBReadGPTModel", + "model_category": "generative-api", "model_location": "api", "is_trainable": "no", + "context_window": 2048}, + + # HF embedding models + {"model_name": "HF-Embedding", "display_name": "HF-Embedding", "model_family": "HFEmbeddingModel", + "model_category": "semantic-hf", "model_location": "api", "is_trainable": "no", + "context_window": 2048}, + + # HF generative models + {"model_name": "HF-Generative", "display_name": "HF-Generative", "model_family": "HFGenerativeModel", + "model_category": "generative-hf", "model_location": "api", "is_trainable": "no", + "context_window": 2048}, + + # base supporting models and components + {"model_name": "bert", "display_name": "Bert", "model_family": "BaseModel", "model_category": "base", + "is_trainable": "no","model_location": "llmware_repo"}, + {"model_name": "roberta", "display_name": "Roberta", "model_family": "BaseModel", "model_category": "base", + "is_trainable": "no","model_location": "llmware_repo"}, + {"model_name": "gpt2", "display_name": "GPT-2", "model_family": "BaseModel", "model_category": "base", + "is_trainable": "no","model_location": "llmware_repo"} + ] + + +def build_json_models_manifest(manifest_dict, fp, fn="llmware_supported_models_manifest.json"): + + json_dict = json.dumps(manifest_dict,indent=1) + with open(os.path.join(fp,fn), "w") as outfile: + outfile.write(json_dict) + + return 0 + + +class ModelCatalog: + + # ModelCatalog responsible for model lookup of (1) Model Card, and (2) Finding Model Class + + def __init__(self): + + # ModelCatalog is simple, flexible mechanism to track registered models + # Easy to create "model repo" with mix of model types and instantiation approaches + # Builds on standard model classes with standard inference + + self.model_classes = [ + # generative model classes + "OpenAIGenModel", "ClaudeModel", "GoogleGenModel", + "CohereGenModel", "JurassicModel", "AIBReadGPTModel", + "HFGenerativeModel", + + # embedding model classes + "LLMWareSemanticModel", + "OpenAIEmbeddingModel", "CohereEmbeddingModel", + "GoogleEmbeddingModel", "HFEmbeddingModel" + ] + + self.global_model_list = global_model_repo_catalog_list + + self.account_name = None + self.library_name= None + + def pull_latest_manifest(self): + # will add to check manifest in global repo and make available for pull down + return 0 + + def lookup_model_card (self, selected_model_name): + + model_card = None + + # first check in the global_model_repo + confirm location + for models in self.global_model_list: + if models["model_name"] == selected_model_name: + model_card = models + model_card.update({"standard":True}) + break + + # if model not found, then return None, and downstream calling function responsible for handling + + return model_card + + def locate_and_retrieve_model_bits (self, model_card): + + # check for llmware path & create if not already set up + if not os.path.exists(LLMWareConfig.get_llmware_path()): + # if not explicitly set up by user, then create folder directory structure + LLMWareConfig.setup_llmware_workspace() + + model_name = model_card["model_name"] + + if not os.path.exists(LLMWareConfig.get_model_repo_path()): + os.mkdir(LLMWareConfig.get_model_repo_path()) + + model_location = os.path.join(LLMWareConfig.get_model_repo_path(), model_name) + + if os.path.exists(model_location): + model_parts_in_folder = os.listdir(model_location) + if 
len(model_parts_in_folder) > 0: + # print("path exists - ", model_location) + return model_location + + logging.info("update: ModelCatalog - this model - %s - is not in local repo - %s, so pulling " + "from global repo - please note that this may take a little time to load " + "for the first time.", model_name, LLMWareConfig.get_model_repo_path()) + + logging.info("update: ModelCatalog - pulling model from global repo - %s ", model_name) + + CloudBucketManager().pull_single_model_from_llmware_public_repo(model_name) + + logging.info("update: ModelCatalog - done pulling model into local folder - %s ", model_location) + + if os.path.exists(model_location): + return model_location + + raise ModelNotFoundException(model_name) + + def _instantiate_model_class_from_string(self, model_class, model_name, model_card): + + # by default - if model not found - return None + my_model = None + + if model_class in self.model_classes: + + # generative models + if model_class == "ClaudeModel": my_model = ClaudeModel(model_name=model_name) + if model_class == "OpenAIGenModel": my_model = OpenAIGenModel(model_name=model_name) + if model_class == "CohereGenModel": my_model = CohereGenModel(model_name=model_name) + if model_class == "JurassicModel": my_model = JurassicModel(model_name=model_name) + if model_class == "GoogleGenModel": my_model = GoogleGenModel(model_name=model_name) + + # stub for READ GPT provided -> will add other 3rd party models too + if model_class == "AIBReadGPTModel": my_model = AIBReadGPTModel(model_name=model_name) + + # embedding models + embedding_dims = None + + if "embedding_dims" in model_card: + embedding_dims = model_card["embedding_dims"] + + if model_class == "OpenAIEmbeddingModel": my_model = OpenAIEmbeddingModel(model_name=model_name, + embedding_dims=embedding_dims) + + if model_class == "CohereEmbeddingModel": my_model = CohereEmbeddingModel(model_name=model_name, + embedding_dims=embedding_dims) + + if model_class == "GoogleEmbeddingModel": my_model = GoogleEmbeddingModel(model_name=model_name, + embedding_dims=embedding_dims) + + if model_class == "LLMWareSemanticModel": my_model = LLMWareSemanticModel(model_name=model_name, + embedding_dims=embedding_dims) + + # placeholder for HF models + if model_class == "HFGenerativeModel": my_model = HFGenerativeModel(None,None, + model_name=model_name) + + if model_class == "HFEmbeddingModel": my_model = HFEmbeddingModel(None,None, + model_name=model_name) + + return my_model + + # completes all preparatory steps, and returns 'ready-for-inference' model + def load_model (self, selected_model, api_key=None): + + # step 1- lookup model card from the catalog + model_card = self.lookup_model_card(selected_model) + if not model_card: + logging.error("error: ModelCatalog - unexpected - could not identify model card for " + "selected model - %s ", selected_model) + + raise ModelNotFoundException(selected_model) + + # step 2- instantiate the right model class + my_model = self.get_model_by_name(model_card["model_name"]) + if not my_model: + logging.error("error: ModelCatalog - unexpected - could not identify the model - %s ", my_model) + raise ModelNotFoundException(selected_model) + + # step 3- if physical model, then find the location on local server, and if not available, then pull from s3 + if model_card["model_location"] != "api": + loading_directions = self.locate_and_retrieve_model_bits(model_card) + my_model = my_model.load_model_for_inference(loading_directions) + else: + # if api_key passed, save as environ variable + # TODO - 
look at this + if api_key: + my_model.set_api_key(api_key) + os.environ[selected_model] = api_key + + # pass model name to the model directly + my_model.model_name = selected_model + + return my_model + + def add_api_key (self, selected_model_name, api_key): + + # step 1- lookup model card from the catalog + model_card = self.lookup_model_card(selected_model_name) + + if not model_card: + + logging.error("error: ModelCatalog - could not identify model card for " + "selected model - %s ", selected_model_name) + + raise ModelNotFoundException(selected_model_name) + + # step 2 - save api key as environmental variable + model_name = model_card["model_name"] + os.environ[model_name] = api_key + + return self + + # enables passing of a 'loaded' sentence transformer model + def load_sentence_transformer_model(self,model, model_name): + model = LLMWareSemanticModel(model=model,model_name=model_name) + return model + + # integrate hf model passed + def load_hf_embedding_model(self, model, tokenizer): + model = HFEmbeddingModel(model, tokenizer) + return model + + # integrate pretrained decoder-based hf 'causal' model + # Provide options to control model preprocessing prompt behavior + def load_hf_generative_model(self, model,tokenizer,prompt_wrapper=None, + instruction_following=False): + + model = HFGenerativeModel(model, tokenizer, prompt_wrapper=prompt_wrapper, + instruction_following=instruction_following) + + return model + + # master handler to be used by any calling function, especially Retrieval / Query + def load_embedding_model (self, model_name=None, + model=None, tokenizer=None,from_hf=False, + from_sentence_transformers=False): + + loaded_model = None + + # if user passed a 'loaded model' object, then apply directly + if model: + # first, check for 'from_hf' flag and load as HuggingFace model + if from_hf: + loaded_model = ModelCatalog().load_hf_embedding_model(model,tokenizer) + else: + # second, check for 'from_sentence_transformer' flag and load as SBERT model + if from_sentence_transformers: + loaded_model = ModelCatalog().load_sentence_transformer_model(model,model_name) + + if not model: + logging.error("error: ModelCatalog load_embedding_model could not identify the " + "passed model - if model is from HuggingFace, then mark optional " + "'from_hf' flag to True. If model is from Sentence Transformers, " + " then mark optional 'from_sentence_transformers' flag " + "to True. 
Note: setting search mode to text search, in absence of embedding " + "model.") + else: + # main case - load embedding model from Catalog + loaded_model = ModelCatalog().load_model(selected_model=model_name) + + return loaded_model + + def list_embedding_models(self): + + embedding_models = [] + + for x in self.global_model_list: + if x["model_category"] == "embedding": + embedding_models.append(x) + + return embedding_models + + def list_generative_models(self): + + gen_models = [] + + for x in self.global_model_list: + if x["model_category"].startswith("generative"): + gen_models.append(x) + + gen_models = sorted(gen_models, key=lambda x: x["model_name"], reverse=False) + + return gen_models + + def list_all_models(self): + + all_models = [] + for x in self.global_model_list: + all_models.append(x) + + all_models = sorted(all_models, key=lambda x: x["model_category"], reverse=False) + + return all_models + + def model_lookup(self,model_name): + my_model = None + + for models in self.global_model_list: + if models["model_name"] == model_name: + my_model = models + break + + return my_model + + def get_model_by_name(self, model_name): + + my_model = None + + for models in self.global_model_list: + + if models["model_name"] == model_name: + selected_model = models + my_model = self._instantiate_model_class_from_string(selected_model["model_family"], + model_name, models) + break + + return my_model + + +class OpenAIGenModel: + + def __init__(self, model_name=None, api_key=None): + + self.api_key = api_key + self.model_name = model_name + + self.error_message = "\nUnable to connect to OpenAI. Please try again later." + + self.separator = "\n" + + # set max_total_len -> adjust input and output based on use case + # TODO - need to update these parameters by model + self.max_total_len = 4000 + self.max_input_len = 2000 + self.llm_max_output_len = 2000 + + # inference settings + self.temperature = 0.7 + self.target_requested_output_tokens = 100 + self.add_prompt_engineering = False + self.add_context = "" + + def set_api_key (self, api_key, env_var="USER_MANAGED_OPENAI_API_KEY"): + + # set api_key + os.environ[env_var] = api_key + logging.info("update: added and stored OpenAI api_key in environmental variable- %s", env_var) + + return self + + def _get_api_key (self, env_var="USER_MANAGED_OPENAI_API_KEY"): + + self.api_key = os.environ.get(env_var) + + if not self.api_key: + logging.error("error: _get_api_key could not successfully retrieve value from: %s ", env_var) + + return self.api_key + + def token_counter(self, text_sample): + + # open ai recommends using the open source gpt2 tokenizer to count tokens + tokenizer = Utilities().get_default_tokenizer + toks = tokenizer.encode(text_sample).ids + + return len(toks) + + def prompt_engineer_chatgpt3(self, query, context, inference_dict=None): + + if not self.add_prompt_engineering: + if context: + selected_prompt = "default_with_context" + else: + selected_prompt = "default_no_context" + else: + selected_prompt = self.add_prompt_engineering + + prompt_dict = PromptCatalog().build_core_prompt(prompt_name=selected_prompt, + separator=self.separator, + query=query, context=context, + inference_dict=inference_dict) + + system_message = prompt_dict["prompt_card"]["system_message"] + if not system_message: + system_message = "You are a helpful assistant." 
+ + core_prompt = prompt_dict["core_prompt"] + + messages = [ + {"role": "system", "content": system_message}, + {"role": "user", "content": core_prompt} + ] + + return messages + + def prompt_engineer (self, query, context, inference_dict=None): + + if not self.add_prompt_engineering: + if context: + selected_prompt = "default_with_context" + else: + selected_prompt = "default_no_context" + + else: + selected_prompt = self.add_prompt_engineering + + prompt_dict = PromptCatalog().build_core_prompt(prompt_name=selected_prompt, + separator=self.separator, + query=query, context=context, + inference_dict=inference_dict) + + core_prompt = prompt_dict["core_prompt"] + + return core_prompt + + def inference(self, prompt, add_context=None, add_prompt_engineering=None, inference_dict=None, + api_key=None): + + if add_context: + self.add_context = add_context + + if add_prompt_engineering: + self.add_prompt_engineering = add_prompt_engineering + + if inference_dict: + + if "temperature" in inference_dict: + self.temperature = inference_dict["temperature"] + + if "max_tokens" in inference_dict: + self.target_requested_output_tokens = inference_dict["max_tokens"] + + # api_key + if api_key: + self.api_key = api_key + + if not self.api_key: + self.api_key = self._get_api_key() + + if not self.api_key: + logging.error("error: invoking OpenAI Generative model with no api_key") + + # default case - pass the prompt received without change + prompt_enriched = prompt + + usage = {} + + try: + + if self.model_name in ["gpt-3.5-turbo","gpt-4"]: + + messages = self.prompt_engineer_chatgpt3(prompt_enriched, self.add_context, inference_dict) + + # different api for "chat completion" -> only applies to ChatGPT = 'gpt-3.5-turbo' + openai.api_key = self.api_key + response = openai.ChatCompletion.create(model=self.model_name,messages=messages, + max_tokens=self.target_requested_output_tokens) + + text_out = response["choices"][0]["message"]["content"] + + usage = {"input": response["usage"]["prompt_tokens"], + "output": response["usage"]["completion_tokens"], + "total": response["usage"]["total_tokens"], + "metric": "tokens"} + + # logging.info("update: open ai response: %s ", response) + + else: + # 'instruct gpt' models + + prompt_enriched = self.prompt_engineer(prompt_enriched, self.add_context, inference_dict=inference_dict) + + prompt_final = prompt_enriched + + text_prompt = prompt_final + self.separator + logging.info("update: openai model - FINAL PROMPT: %s %s ", self.model_name, prompt_final) + openai.api_key = self.api_key + response = openai.Completion.create(model=self.model_name, prompt=text_prompt, + temperature=self.temperature, + max_tokens=self.target_requested_output_tokens) + + logging.info("update: open ai response: %s ", response["choices"]) + text_out = response["choices"][0]["text"] + # openai response "usage" dict - {"completion_tokens" | "prompt_tokens" | total_tokens"} + + usage = {"input": response["usage"]["prompt_tokens"], + "output": response["usage"]["completion_tokens"], + "total": response["usage"]["total_tokens"], + "metric": "tokens"} + + except Exception as e: + # this is special error code that will be picked and handled in AIModels().inference handler + text_out = "/***ERROR***/" + usage = {"input":0, "output":0, "total":0, "metric": "tokens"} + + # raise LLMInferenceResponseException(e) + logging.error("error: OpenAI model inference produced error - %s ", e) + + # will look to capture usage metadata + # "usage" = {"completion_tokens", "prompt_tokens", "total_tokens"} + + 
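+ # package the generated text and token usage into the standard llmware response dict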
output_response = {"llm_response": text_out, "usage": usage} + + return output_response + + +class ClaudeModel: + + def __init__(self, model_name=None, api_key=None): + + self.api_key = api_key + self.model_name = model_name + + self.error_message = "\nUnable to connect to Anthropic/Claude. Please try again later." + + self.separator = "\n" + + # Claude/Anthropic model - 8000 max token context window + self.max_total_len = 8000 + self.max_input_len = 4000 + self.llm_max_output_len = 4000 + + # inference settings + self.temperature = 0.7 + self.target_requested_output_tokens = 100 + self.add_prompt_engineering = False + self.add_context = "" + + def set_api_key(self, api_key, env_var="USER_MANAGED_ANTHROPIC_API_KEY"): + + # set api_key + os.environ[env_var] = api_key + logging.info("update: added and stored ANTHROPIC api_key in environmental variable- %s", env_var) + + return self + + def _get_api_key(self, env_var="USER_MANAGED_ANTHROPIC_API_KEY"): + + self.api_key = os.environ.get(env_var) + + if not self.api_key: + logging.error("error: _get_api_key could not successfully retrieve value from: %s ", env_var) + + return self.api_key + + def token_counter(self, text_sample): + + tokenizer = Utilities().get_default_tokenizer() + toks = tokenizer.encode(text_sample).ids + return len(toks) + + def prompt_engineer (self, query, context, inference_dict=None): + + # default case -> prompt = input query + + prompt_engineered = "" + + if not self.add_prompt_engineering: + if context: + selected_prompt = "default_with_context" + else: + selected_prompt = "default_no_context" + else: + selected_prompt = self.add_prompt_engineering + + prompt_dict = PromptCatalog().build_core_prompt(prompt_name=selected_prompt, + separator=self.separator, + query=query,context=context, + inference_dict=inference_dict) + + if prompt_dict: + + core_prompt = prompt_dict["core_prompt"] + + # prototype prompt for Anthropic: + # "\n\nHuman:" + {text} + "\n\nAssistant:" + # per Anthropic docs, usually best to include the query at the END, rather than the Beginning + + prompt_engineered = "\n\nHuman: " + core_prompt + "\n\nAssistant:" + + return prompt_engineered + + def inference(self, prompt, add_context=None, add_prompt_engineering=None, inference_dict=None, + api_key=None): + + if add_context: + self.add_context = add_context + + if add_prompt_engineering: + self.add_prompt_engineering = add_prompt_engineering + + if inference_dict: + + if "temperature" in inference_dict: + self.temperature = inference_dict["temperature"] + + if "max_tokens" in inference_dict: + self.target_requested_output_tokens = inference_dict["max_tokens"] + + if api_key: + self.api_key = api_key + + if not self.api_key: + self.api_key = self._get_api_key() + + if not self.api_key: + logging.error("error: invoking Anthropic Claude Generative model with no api_key") + + client = anthropic.Client(api_key=self.api_key) + + # prototype prompt sample: prompt_enriched = "\n\nHuman:" + " please read the following- " + + # self.add_context + " Based on these materials, " + prompt["prompt"] + "\n\nAssistant:" + + prompt_enriched = self.prompt_engineer(prompt,self.add_context, inference_dict=inference_dict) + + # preferred model = "claude-instant-v1" + + try: + response = client.completions.create(prompt=prompt_enriched, + stop_sequences=[anthropic.HUMAN_PROMPT], + max_tokens_to_sample=self.target_requested_output_tokens, + model=self.model_name, + stream=False, + temperature=self.temperature) + + #text_out = list(response)[-1].completion + text_out = 
response.completion + + input_count = client.count_tokens(prompt_enriched) + output_count = client.count_tokens(text_out) + + usage = {"input": input_count, "output": output_count, "total": input_count + output_count, "metric": "tokens"} + + except Exception as e: + # this is special error code that will be picked and handled by calling function + text_out = "/***ERROR***/" + usage = {"input":0, "output":0, "total":0, "metric": "tokens"} + + # raise LLMInferenceResponseException(e) + logging.error("error: Anthropic model inference produced error - %s ", e) + + output_response = {"llm_response": text_out, "usage": usage} + + logging.info(f"update: output_response - anthropic: {output_response}") + + return output_response + + +class GoogleGenModel: + + def __init__(self, model_name=None, api_key=None): + + self.api_key = api_key + self.model_name = model_name + self.model = None + self.error_message = "\nUnable to connect to Google/PALM Model. Please try again later." + self.separator = "\n" + + # need to confirm max input and output + # set max_total_len -> adjust input and output based on use case + self.max_total_len = 8192 + 1024 + self.max_input_len = 8192 + + self.llm_max_output_len = 1024 + + # inference settings + self.temperature = 0.7 + self.target_requested_output_tokens = 100 + self.add_prompt_engineering = False + self.add_context = "" + + def set_api_key(self, api_key, env_var="USER_MANAGED_GOOGLE_API_KEY"): + + # set api_key + os.environ[env_var] = api_key + logging.info("update: added and stored GOOGLE api_key in environmental variable- %s", env_var) + + return self + + def _get_api_key(self, env_var="USER_MANAGED_GOOGLE_API_KEY"): + + self.api_key = os.environ.get(env_var) + return self.api_key + + def token_counter(self, text_sample): + + tokenizer = Utilities().get_default_tokenizer() + toks = tokenizer.encode(text_sample).ids + + return len(toks) + + def prompt_engineer (self, query, context, inference_dict=None): + + if not self.add_prompt_engineering: + if context: + selected_prompt = "default_with_context" + else: + selected_prompt = "default_no_context" + else: + selected_prompt = self.add_prompt_engineering + + prompt_dict = PromptCatalog().build_core_prompt(prompt_name=selected_prompt, + separator=self.separator, + query=query, + context=context, + inference_dict=inference_dict) + + if prompt_dict: + prompt_engineered = prompt_dict["core_prompt"] + + else: + # default case -> prompt = input query + prompt_engineered = "Please read the following text: " + context + \ + " and answer the question: " + query + + return prompt_engineered + + def inference(self, prompt, add_context=None, add_prompt_engineering=None, inference_dict=None, + api_key=None): + + if add_context: + self.add_context = add_context + + if add_prompt_engineering: + self.add_prompt_engineering = add_prompt_engineering + + if inference_dict: + + if "temperature" in inference_dict: + self.temperature = inference_dict["temperature"] + + if "max_tokens" in inference_dict: + self.target_requested_output_tokens = inference_dict["max_tokens"] + + # api_key + if api_key: + self.api_key = api_key + + if not self.api_key: + self.api_key = self._get_api_key() + + if not self.api_key: + logging.error("error: invoking Google Generative model with no api_key") + + prompt_enriched = self.prompt_engineer(prompt,self.add_context, inference_dict=inference_dict) + + self.target_requested_output_tokens= 2000 + # note: google api is not well-documented + + try: + + # Important: Before calling the model, we need to 
ensure the contents of the + # api_key (the json dict string) have been persisted to a file + # and the environment variable GOOGLE_APPLICATION_CREDENTIALS points to that file path + + google_json_credentials = self.api_key_to_json() + os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = google_json_credentials + + self.model = TextGenerationModel.from_pretrained("text-bison@001") + response = self.model.predict(prompt=prompt_enriched, + temperature=0.7) + + logging.info(f"google model response: {response.text}") + + text_out = response.text + + input_count = len(prompt_enriched) + output_count = len(text_out) + + usage = {"input": input_count, "output": output_count, "total": input_count + output_count, + "metric": "characters"} + + except Exception as e: + + # this is special error code that will be picked and handled in AIModels().inference handler + text_out = "/***ERROR***/" + usage = {"input":0, "output":0, "total":0, "metric": "characters"} + + # raise LLMInferenceResponseException(e) + logging.error("error: Google model inference produced error: %s", e) + + finally: + # Close the credentials json which automatically deletes it (since it is a NamedTemporaryFile) + os.remove(google_json_credentials) + + output_response = {"llm_response": text_out, "usage": usage} + + logging.info("update: output_response - google: %s ", output_response) + + return output_response + + def api_key_to_json(self): + + # Google authentication key is an entire json dictionary which we have the user pass in as an env var + # We write out the json and we need to escape newlines which seem to be always present in + # google auth json files + + temp_json_path = tempfile.NamedTemporaryFile(prefix="googlecreds", delete=False).name + + with open(temp_json_path, "w") as f: + f.write(self.api_key.replace("\n", "\\n")) + + return temp_json_path + + +class JurassicModel: + + def __init__(self, model_name=None, api_key=None): + + self.api_key = api_key + self.model_name = model_name + + self.error_message = "\nUnable to connect to Jurassic. Please try again later." 
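+ # note: the prompt construction below uses " -- " as the separator between context and question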
+ + self.separator = " -- " + + # set max_total_len -> adjust input and output based on use case + self.max_total_len = 2048 + self.max_input_len = 1024 + + self.llm_max_output_len = 1024 + + # inference settings + self.temperature = 0.7 + self.target_requested_output_tokens = 100 + self.add_prompt_engineering = False + self.add_context = "" + + # 'j2-jumbo-instruct', 'j2-grande-instruct','j2-jumbo','j2-grande', 'j2-large' + + def set_api_key(self, api_key, env_var="USER_MANAGED_AI21_API_KEY"): + + # set api_key + os.environ[env_var] = api_key + logging.info("update: added and stored AI21 api_key in environmental variable- %s", env_var) + + return self + + def _get_api_key(self, env_var="USER_MANAGED_AI21_API_KEY"): + self.api_key = os.environ.get(env_var) + return self.api_key + + def token_counter(self, text_sample): + + tokenizer = Utilities().get_default_tokenizer() + toks = tokenizer.encode(text_sample).ids + + return len(toks) + + def prompt_engineer (self, query, context, inference_dict=None): + + if not self.add_prompt_engineering: + if context: + selected_prompt = "default_with_context" + else: + selected_prompt = "default_no_context" + else: + selected_prompt = self.add_prompt_engineering + + prompt_dict = PromptCatalog().build_core_prompt(prompt_name=selected_prompt, + separator=self.separator, + query=query, + context=context, + inference_dict=inference_dict) + + if prompt_dict: + prompt_engineered = prompt_dict["core_prompt"] + else: + + # default case + prompt_engineered = "Please read the following text: " + context + " -- " + prompt_engineered += " ## " + prompt_engineered += "Please answer the following question based on the text: " + query + prompt_engineered += " ## " + + return prompt_engineered + + def inference(self, prompt, add_context=None, add_prompt_engineering=None, inference_dict=None, + api_key=None): + + if add_context: + self.add_context = add_context + + if add_prompt_engineering: + self.add_prompt_engineering = add_prompt_engineering + + if inference_dict: + + if "temperature" in inference_dict: + self.temperature = inference_dict["temperature"] + + if "max_tokens" in inference_dict: + self.target_requested_output_tokens = inference_dict["max_tokens"] + + if api_key: + self.api_key = api_key + + if not self.api_key: + self.api_key = self._get_api_key() + + if not self.api_key: + logging.error("error: invoking AI21 Jurassic model with no api_key") + + prompt_enriched = prompt + + prompt_enriched = self.prompt_engineer(prompt_enriched,self.add_context, inference_dict=inference_dict) + + try: + ai21.api_key = self.api_key + + response = ai21.Completion.execute( + model=self.model_name, + prompt=prompt_enriched, + numResults=1, + maxTokens=self.target_requested_output_tokens, + temperature=0.7, + topKReturn=0, + topP=1, + stopSequences=["##"] + ) + + # api parameters: {"prompt", "numResults", "maxTokens", "minTokens", "temperature", "topP", + # "stopSequences" = list of sequences that when generated will cause the model to stop + # "topKReturn" = number of top scoring tokens to consider in each generation step + # "frequencyPenalty" = penalty applied to frequently generated tokens + # "presencePenalty" = penalty applied to tokens already present in the prompt. + # "countPenalty" = penalty applied to tokens based on frequency in the generated responses. 
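+ # pull the generated text from the first completion returned in the AI21 response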
+ + text_out = response["completions"][0]["data"]["text"] + + usage = {"input": len(prompt_enriched), "output": len(text_out), + "total": len(prompt_enriched) + len(text_out), "metric": "chars"} + + except Exception as e: + + # this is special error code that will be picked and handled in inference handler + + text_out = "/***ERROR***/" + + usage = {"input": 0, "output": 0, "total": 0, "metric": "chars"} + + # raise LLMInferenceResponseException(e) + logging.error("error: Jurassic model inference produced error - %s ", e) + + # will look to capture usage metadata + + output_response = {"llm_response": text_out, "usage": usage} + + return output_response + + +class CohereGenModel: + + def __init__(self, model_name=None, api_key=None): + + self.api_key = api_key + self.model_name = model_name + + self.error_message = "\nUnable to connect to Cohere. Please try again later." + + self.separator = " -- " + + # set max_total_len -> adjust input and output based on use case + # confirmed - Cohere generation models - 2048 max context window + self.max_total_len = 2048 + self.max_input_len = 1024 + + self.llm_max_output_len = 1024 + + # inference settings + self.temperature = 0.7 + self.target_requested_output_tokens = 100 + self.add_prompt_engineering = False + self.add_context = "" + + # cohere generative models - 'command-medium-nightly', + # 'command-xlarge-nightly','xlarge','medium', "summarize-xlarge", "summarize-medium" + + def set_api_key(self, api_key, env_var="USER_MANAGED_COHERE_API_KEY"): + + # set api_key + os.environ[env_var] = api_key + logging.info("update: added and stored COHERE api_key in environmental variable- %s", env_var) + + return self + + def _get_api_key(self, env_var="USER_MANAGED_COHERE_API_KEY"): + + self.api_key = os.environ.get(env_var) + + return self.api_key + + def token_counter(self, text_sample): + + tokenizer = Utilities().get_default_tokenizer() + toks = tokenizer.encode(text_sample).ids + + return len(toks) + + def prompt_engineer (self, query, context, inference_dict=None): + + # Cohere prompt prototype - very simple - uses " -- " as separators - does not like " " at the end + + if not self.add_prompt_engineering: + if context: + selected_prompt = "default_with_context" + else: + selected_prompt = "default_no_context" + else: + selected_prompt = self.add_prompt_engineering + + prompt_dict = PromptCatalog().build_core_prompt(prompt_name=selected_prompt, + separator=self.separator, + query=query, + context=context, + inference_dict=inference_dict) + + if prompt_dict: + prompt_engineered = prompt_dict["core_prompt"] + else: + # default case + prompt_engineered = "Please read the following materials: " + context + self.separator + prompt_engineered += "Please answer the following question: " + query + self.separator + prompt_engineered += "Please answer the question only with facts provided in the materials. 
" \ + "If the question can not be answered in the materials, then please " \ + "respond 'Not Found.'" + + return prompt_engineered + + def inference(self, prompt, add_context=None, add_prompt_engineering=None, inference_dict=None, + api_key=None): + + if add_context: + self.add_context = add_context + + if add_prompt_engineering: + self.add_prompt_engineering = add_prompt_engineering + + if inference_dict: + + if "temperature" in inference_dict: + self.temperature = inference_dict["temperature"] + + if "max_tokens" in inference_dict: + self.target_requested_output_tokens = inference_dict["max_tokens"] + + #tokens_in_prompt = self.token_counter(prompt) + #tokens_in_context = self.token_counter(self.add_context) + + prompt_enriched = prompt + + logging.info("update: in cohere model inference: %s - %s", prompt_enriched, self.add_prompt_engineering) + + prompt_enriched = self.prompt_engineer(prompt_enriched,self.add_context, inference_dict=inference_dict) + + if api_key: + self.api_key = api_key + + if not self.api_key: + self.api_key = self._get_api_key() + + if not self.api_key: + logging.error("error: invoking Cohere Generative model with no api_key") + + co = cohere.Client(self.api_key) + + try: + + if self.model_name in ["summarize-xlarge", "summarize-medium"]: + # alternate - summarize api + response = co.summarize(text=self.add_context, model=self.model_name, length='short', temperature=0.7, + format="bullets", extractiveness='medium', additional_command=prompt) + + text_out = response.summary + + usage = {"input": len(prompt_enriched), "output": len(text_out), + "total": len(prompt_enriched) + len(text_out), "metric": "chars"} + + else: + # generate api + response = co.generate(model=self.model_name, prompt=prompt_enriched, + max_tokens=self.target_requested_output_tokens, temperature=0.6, + stop_sequences=["--"]) + + text_out = response.generations[0].text + + usage = {"input": len(prompt_enriched), "output": len(text_out), + "total": len(prompt_enriched) + len(text_out), "metric": "chars"} + + except Exception as e: + + # print(traceback.format_exc()) + + text_out = "/***ERROR***/" + + usage = {"input": 0, "output": 0, "total": 0, "metric": "chars"} + # raise LLMInferenceResponseException(e) + logging.error("error: Cohere model inference produced error - %s - ", e) + + # will look to capture usage metadata + + output_response = {"llm_response": text_out, "usage": usage} + + logging.info("update: output response - cohere : %s ", output_response) + + return output_response + + +class AIBReadGPTModel: + + def __init__(self, model_name=None, api_key=None): + + self.api_key = api_key + + self.model_name = model_name + self.model = None + self.tokenizer = None + + self.error_message = "\nUnable to connect to AIB READ GPT API. Please try again later." 
+ + # set max_total_len -> adjust input and output based on use case + self.max_total_len = 2048 + self.max_input_len = 1024 + + self.llm_max_output_len = 1024 + + self.separator = "\n" + + # inference settings + self.temperature = 0.2 + self.target_requested_output_tokens = 200 + self.add_prompt_engineering = True + self.add_context = "" + + def set_api_key(self, api_key, env_var="USER_MANAGED_READ_GPT_API_KEY"): + + # set api_key + os.environ[env_var] = api_key + logging.info("update: added and stored READ_GPT api_key in environmental variable- %s", env_var) + + return self + + def _get_api_key(self, env_var="USER_MANAGED_READ_GPT_API_KEY"): + + self.api_key = os.environ.get(env_var) + + return self.api_key + + def token_counter(self, text_sample): + + tokenizer = Utilities().get_default_tokenizer() + toks = tokenizer.encode(text_sample).ids + return len(toks) + + # very simple prompt construction used for now -> will likely evolve over time + def prompt_engineer(self, query, context, inference_dict=None): + + if not query: + query = "What is a list that summarizes the key points?" + + # default_case + prompt_engineered = context + "\n" + query + + if self.add_prompt_engineering == "top_level_summary_select": + prompt_engineered += query + "\n" + prompt_engineered += "Which of the following selections best answers the question?" + prompt_engineered += context + + if self.add_prompt_engineering == "summarize_with_bullets_no_query": + issue = "What is a list of the most important points?" + prompt_engineered = context + "\n" + issue + + return prompt_engineered + + def load_model_for_inference(self, model_name=None, fp=None): + # look up model_name in configs + if model_name: + self.model_name = model_name + return self + + def load_pretrained_model(self, model_name=None): + if model_name: + self.model_name = model_name + # convenience method for pretrained models as a single step + return self + + def inference(self, prompt, add_context=None, add_prompt_engineering=None, inference_dict=None, + api_key=None): + + if add_context: + self.add_context = add_context + + if add_prompt_engineering: + self.add_prompt_engineering = add_prompt_engineering + + if inference_dict: + + if "temperature" in inference_dict: + self.temperature = inference_dict["temperature"] + + if "max_tokens" in inference_dict: + self.target_requested_output_tokens = inference_dict["max_tokens"] + + prompt_enriched = self.prompt_engineer(prompt, self.add_context, inference_dict=inference_dict) + + # safety check on length - set cap with small 'buffer' + input_tokens = self.token_counter(prompt_enriched) + buffer = 10 + available_tokens_in_output_context_window = self.max_total_len - input_tokens - buffer + # if target requested output is less, then keep - otherwise, cap with 'safe' maximum len + target_len = min(self.target_requested_output_tokens, available_tokens_in_output_context_window) + + output_dict_new = {} + output_response = {} + usage = {"input": input_tokens} + + if api_key: + self.api_key = api_key + + if not self.api_key: + self.api_key = self._get_api_key() + + params = {"prompt": prompt_enriched, "max_output_tokens": target_len, "api_key": self.api_key} + + try: + # linked to TEST SERVER + output = requests.post(os.environ.get("AIB_READ_GPT_URI"), data=params) + output_dict_new = ast.literal_eval(output.text) + success_path = 1 + + except: + + text_output = "/***ERROR***/" + usage = {"input": 0, "output": 0, "total": 0, "metric": "tokens"} + + logging.error("error: no response from aib remote server for 
aib-read-gpt model - " + "check api key and connection") + + success_path = -1 + output_response = {"llm_response": "", "usage": usage} + + # quick postprocessing + + if success_path == 1: + + for keys, values in output_dict_new.items(): + if keys.startswith("response_"): + response = output_dict_new[keys] + + output_len = self.token_counter(response) + usage.update({"output": output_len}) + usage.update({"total": usage["input"] + output_len}) + usage.update({"metric": "tokens"}) + + output_response = {"llm_response": response, "usage": usage} + + logging.info("update: output_response - aib-read-gpt - %s", output_response) + + if keys == "message": + logging.error("error - output not received from model") + + return output_response + + +class OpenAIEmbeddingModel: + + def __init__(self, model_name=None, api_key=None, embedding_dims=None): + + # must have elements for embedding model + self.model_name = model_name + self.api_key = api_key + + if not embedding_dims: + self.embedding_dims = 1536 + else: + self.embedding_dims = embedding_dims + + self.max_total_len = 2048 + + self.error_message = "\nUnable to connect to OpenAI. Please try again later." + + def set_api_key(self, api_key,env_var="USER_MANAGED_OPENAI_API_KEY"): + + # set api_key + os.environ[env_var] = api_key + logging.info("update: added and stored OpenAI api_key in environmental variable- %s", env_var) + + return self + + def _get_api_key(self, env_var="USER_MANAGED_OPENAI_API_KEY"): + + self.api_key = os.environ.get(env_var) + return self.api_key + + def token_counter(self, text_sample): + tokenizer = Utilities().get_default_tokenizer() + toks = tokenizer.encode(text_sample).ids + return len(toks) + + def embedding(self, text_sample, api_key=None): + + model = "text-embedding-ada-002" + + if api_key: + self.api_key = api_key + + if not self.api_key: + self.api_key = self._get_api_key() + + if not self.api_key: + logging.error("error: invoking OpenAI Embedding model with no api_key") + + # need to prepare for batches + if isinstance(text_sample, list): + text_prompt = text_sample + input_len = len(text_sample) + else: + text_prompt = [text_sample] + input_len = 1 + + openai.api_key = self.api_key + response = openai.Embedding.create(model=model, input=text_prompt) + + logging.info("update: response: %s ", response) + + if input_len == 1: + embedding = response['data'][0]['embedding'] + else: + embedding = [] + for i, entries in enumerate(response['data']): + embedding.append(response['data'][i]['embedding']) + + # logging.info("update: embedding only: %s ", embedding) + logging.info("update: embedding dims: %s ", len(embedding)) + + # embedding = np.array(embedding) + # embedding_2d = np.expand_dims(embedding, 0) + + return embedding + + +class CohereEmbeddingModel: + + def __init__(self, model_name = None, api_key=None, embedding_dims=None): + + self.api_key = api_key + self.model_name = model_name + + if not embedding_dims: + self.embedding_dims = 4096 + else: + self.embedding_dims = embedding_dims + + self.max_total_len = 2048 + self.error_message = "\nUnable to connect to Cohere. Please try again later." 
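+ # note: embedding() below normalizes each Cohere vector to unit length before returning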
+ + def set_api_key(self, api_key, env_var="USER_MANAGED_COHERE_API_KEY"): + + # set api_key + os.environ[env_var] = api_key + logging.info("update: added and stored COHERE api_key in environmental variable- %s", env_var) + + return self + + def _get_api_key(self, env_var="USER_MANAGED_COHERE_API_KEY"): + + self.api_key = os.environ.get(env_var) + + return self.api_key + + def token_counter(self, text_sample): + tokenizer = Utilities().get_default_tokenizer() + toks = tokenizer.encode(text_sample).ids + return len(toks) + + def embedding(self,text_sample): + + if not self.api_key: + self.api_key = self._get_api_key() + + if not self.api_key: + logging.error("error: invoking Cohere embedding model with no api_key") + + co = cohere.Client(self.api_key) + + # need safety check on length of text_sample + + # need to prepare for batches + if isinstance(text_sample, list): + text_prompt = text_sample + input_len = len(text_sample) + else: + text_prompt = [text_sample] + input_len = 1 + + response = co.embed(text_prompt) + + output = [] + for i, emb in enumerate(response.embeddings): + + logging.info("update: embedding - %s - %s ", i, emb) + + # normalization of the Cohere embedding vector improves performance + emb_vec = np.array(emb) / np.linalg.norm(emb) + + output.append(emb_vec) + + return output + + +class GoogleEmbeddingModel: + + def __init__(self, model_name=None, api_key=None, embedding_dims=None): + + self.api_key = api_key + self.model_name = model_name + + self.max_total_len = 3072 + + # supports context window up to 3072 tokens for embedding + + if not embedding_dims: + self.embedding_dims = 768 # Google text-embedding-gecko-001 has 768 dims + else: + self.embedding_dims = embedding_dims + + self.error_message = "\nUnable to connect to Google/Text Embedding Model. Please try again later." 
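+ # note: embedding() below batches requests in groups of 5 text samples, which appears to be the Google per-call cap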
+ + def set_api_key(self, api_key, env_var="USER_MANAGED_GOOGLE_API_KEY"): + + # set api_key + os.environ[env_var] = api_key + logging.info("update: added and stored GOOGLE api_key in environmental variable- %s", env_var) + + return self + + def _get_api_key(self, env_var="USER_MANAGED_GOOGLE_API_KEY"): + + self.api_key = os.environ.get(env_var) + return self.api_key + + def token_counter(self, text_sample): + tokenizer = Utilities().get_default_tokenizer() + toks = tokenizer.encode(text_sample).ids + return len(toks) + + def embedding(self,text_sample, api_key= None): + + if api_key: + self.api_key = api_key + + if not self.api_key: + self.api_key = self._get_api_key() + + if not self.api_key: + logging.error("error: invoking Google Embedding model with no api_key") + + # Important: Before calling the model, we need to ensure the contents of the api_key + # (the json dict string) have been persisted to a file + # and the environment variable GOOGLE_APPLICATION_CREDENTIALS points to that file path + + google_json_credentials = self.api_key_to_json() + os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = google_json_credentials + + embeddings_output = [] + + try: + + model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001") + + if isinstance(text_sample,list): + text_list = text_sample + else: + text_list = [text_sample] + + # need to batch the text list + # Google appears to set a cap of 5 text samples per embedding inference call + + google_max_samples_per_inference = 5 + + batch_count = len(text_list) // google_max_samples_per_inference + if batch_count * google_max_samples_per_inference < len(text_list): + batch_count += 1 + + for x in range(0, batch_count): + new_batch = text_list[x*google_max_samples_per_inference: + min((x+1)*google_max_samples_per_inference, len(text_list))] + + logging.info("update: new batch - %s - %s ", x, len(new_batch)) + + embeddings_from_google = model.get_embeddings(new_batch) + + for i, embedding in enumerate(embeddings_from_google): + embeddings_output.append(np.array(embedding.values)) + + except Exception as e: + # raise LLMInferenceResponseException(e) + logging.error("error: Google model inference produced error - %s ", e) + + finally: + os.remove(google_json_credentials) + + return embeddings_output + + def api_key_to_json(self): + + # Google authentication key is an entire json dictionary which we have the user pass in as an env var + # We write out the json and we need to escape newlines which seem to be always present in + # google auth json files + + temp_json_path = tempfile.NamedTemporaryFile(prefix="googlecreds", delete=False).name + with open(temp_json_path, "w") as f: + f.write(self.api_key.replace("\n", "\\n")) + return temp_json_path + + +class HFEmbeddingModel: + + def __init__(self, model=None, tokenizer=None, model_name=None, api_key=None, + embedding_dims=None): + + # pull in expected hf input + self.model_name = model_name + self.model = model + self.tokenizer= tokenizer + self.embedding_dims = embedding_dims + self.model_type = None + self.max_total_len = 2048 + self.model_architecture = None + + logging.info("update - loading HF Model - %s", model.config.to_dict()) + + if self.model: + + self.config = model.config.to_dict() + + if "hidden_size" in self.config: + self.embedding_dims = self.config["hidden_size"] + logging.info("warning: embedding_dims - from config - %s ", self.embedding_dims) + + if "model_type" in self.config: + self.model_type = self.config["model_type"] + + if "max_position_embeddings" in self.config: + 
self.max_total_len = self.config["max_position_embeddings"] + + if "_name_or_path" in self.config: + self.model_name = self.config["_name_or_path"] + logging.info("update: model_name - from config - %s ", self.model_name) + + if "architectures" in self.config: + if isinstance(self.config["architectures"],list): + self.model_architectures = self.config["architectures"][0] + else: + self.model_architectures = self.config["architectures"] + + else: + raise ModelNotFoundException(model_name) + + # no api key expected or required + self.api_key = api_key + + def token_counter(self, text_sample): + # need to support HF tokenizer + toks = self.tokenizer.encode(text_sample).ids + return len(toks) + + # this is here for temporary reference - will be removed + def stransformer_embedding(self, sentence): + embedding = self.model.encode(sentence, convert_to_tensor=True) + embedding_2d = embedding.unsqueeze(0) + return embedding_2d + + def embedding (self, text_sample, api_key=None): + + # return embeddings only + if isinstance(text_sample,list): + sequence = text_sample + + else: + sequence = [text_sample] + + logging.info("update: HFEmbedding.embedding() - %s ", len(text_sample)) + + # shorter than 512 + model_inputs = self.tokenizer(sequence, truncation=True, max_length=500, return_tensors="pt",padding=True) + + model_outputs = self.model(model_inputs.input_ids, + attention_mask=model_inputs.attention_mask, output_hidden_states=True) + + # the [cls] aggregated embedding is in the last hidden state + # dims of [1, 768] + + embedding = model_outputs.hidden_states[-1][:,0] + + # embedding = embedding.detach().numpy() + logging.info("update: hf embeddings output shape - %s ", embedding.shape) + + # normalize hf embeddings + embeddings_normalized = torch.nn.functional.normalize(embedding, p=2, dim=1) + embeddings_normalized = embeddings_normalized.detach().numpy() + + return embeddings_normalized + + +class HFGenerativeModel: + + def __init__(self, model=None, tokenizer=None, model_name=None, api_key=None, + prompt_wrapper=None, instruction_following=False): + + # pull in expected hf input + self.model_name = model_name + self.model = model + self.tokenizer= tokenizer + + # note - these two parameters will control how prompts are handled - model-specific + self.prompt_wrapper = prompt_wrapper + self.instruction_following = instruction_following + + self.model_type = None + self.config = None + self.max_total_len = 2048 + self.max_input_len = 1024 + self.llm_max_output_len = 1024 + self.model_architecture = None + self.separator = "\n" + + if self.model: + + if isinstance(self.model.config, dict): + self.config = self.model.config + else: + self.config = self.model.config.to_dict() + + if "model_type" in self.config: + self.model_type = self.config["model_type"] + + if "hidden_size" in self.config: + self.embedding_dims = self.config["hidden_size"] + + if "max_position_embeddings" in self.config: + self.max_total_len = self.config["max_position_embeddings"] + + if "architectures" in self.config: + if isinstance(self.config["architectures"],list): + self.model_architectures = self.config["architectures"][0] + else: + self.model_architectures = self.config["architectures"] + + else: + logging.error("error: HFGenerativeModel - could not identify model - ", model_name) + + # no api key expected or required + self.api_key = api_key + + self.error_message = "\nUnable to identify and load HuggingFace model." 
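        # note: the values set below are only defaults - inference() accepts an inference_dict
        # and, when provided, reads "temperature" and "max_tokens" from it to override
        # self.temperature and self.target_requested_output_tokens for that call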
+ + # inference settings + self.temperature = 0.5 + self.target_requested_output_tokens = 100 + self.add_prompt_engineering = False + self.add_context = "" + + def token_counter(self, text_sample): + tokenizer = Utilities().get_default_tokenizer() + toks = tokenizer.encode(text_sample).ids + return len(toks) + + def prompt_engineer (self, query, context, inference_dict): + + # if loaded model was not pretrained on instruction_following, then skip any instructions + if not self.instruction_following: + + if context: + output = context + "\n" + query + else: + output = query + + # unlikely that there would be an 'instruct wrapping' on text, but allow for possibility + if self.prompt_wrapper: + output = PromptCatalog().apply_prompt_wrapper(output, self.prompt_wrapper, + instruction=None) + + return output + + # move ahead to add instructions and prompt engineering + + if not self.add_prompt_engineering: + if context: + selected_prompt = "default_with_context" + else: + selected_prompt = "default_no_context" + else: + selected_prompt = self.add_prompt_engineering + + prompt_dict = PromptCatalog().build_core_prompt(prompt_name=selected_prompt, + separator=self.separator, + query=query, + context=context, + inference_dict=inference_dict) + + if prompt_dict: + prompt_engineered = prompt_dict["core_prompt"] + else: + # default case + prompt_engineered = "Please read the following text: " + context + self.separator + prompt_engineered += "Based on this text, please answer the question: " + query + self.separator + prompt_engineered += "Please answer the question only with facts provided in the materials. " \ + "If the question can not be answered in the materials, then please " \ + "respond 'Not Found.'" + + # final wrapping, based on model-specific instruct training format + # --provides a final 'wrapper' around the core prompt text, based on model expectations + + if self.prompt_wrapper: + prompt_engineered = PromptCatalog().apply_prompt_wrapper(prompt_engineered, self.prompt_wrapper, + instruction=None) + + return prompt_engineered + + @torch.no_grad() + def inference(self, prompt,add_context=None, add_prompt_engineering=None, api_key=None, + inference_dict=None): + + # first prepare the prompt + + if add_context: + self.add_context = add_context + + if add_prompt_engineering: + self.add_prompt_engineering = add_prompt_engineering + + if inference_dict: + + if "temperature" in inference_dict: + self.temperature = inference_dict["temperature"] + + if "max_tokens" in inference_dict: + self.target_requested_output_tokens = inference_dict["max_tokens"] + + text_prompt = prompt + + if self.add_prompt_engineering: + prompt_enriched = self.prompt_engineer(prompt, self.add_context, inference_dict=inference_dict) + prompt_final = prompt_enriched + text_prompt = prompt_final + "\n" + + # second - tokenize to get the input_ids + + tokenizer_output = self.tokenizer.encode(text_prompt) + input_token_len = len(tokenizer_output) + input_ids = torch.tensor(tokenizer_output).unsqueeze(0) + + # Note: this is a simplified 'sampling' generation loop, derived from the far more + # sophisticated Generation capabilities provided by the Transformers library + # It is included here to enable transformers users to easily extend llmware to include + # their favorite generative models in the transformers library. + + # The code below contains code copied from, derived from or inspired from the Huggingface + # transformers generation code. 
+ # (https: // github.com / huggingface / transformers / src / transformers / generation) + + # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc.team. + # Copyright(c) 2018, NVIDIA CORPORATION.All rights reserved. + # Licensed under the Apache License, Version 2.0(the "License"); you may not use this + # file except in compliance with the License. You may obtain a copy of the License at + # http: // www.apache.org / licenses / LICENSE - 2.0 Unless required by applicable law or agreed + # to in writing, software distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License + # for the specific language governing permissions and limitations under the License. + + # default settings + pad_token_id = 0 + eos_token_id = [0] + + eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) + + # keep track of which sequences are already finished + unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device) + + this_peer_finished = False # used by synced_gpus only + # auto-regressive generation + new_tokens_generated = 0 + + attn_mask = torch.ones(input_ids.shape[1]).unsqueeze(0) + + batch_size = input_ids.shape[0] + seq_len = input_ids.shape[1] + + pkv = None + + while True: + + inp_one_time: torch.LongTensor = input_ids + + if new_tokens_generated > 0: + inp_one_time = input_ids[:, -1:] + + inp0 = inp_one_time + inp1 = attn_mask + # inp3 = torch.LongTensor([new_tokens_generated]) + + # need to invoke forward pass on model + # outputs = self.model(inp0,inp1,pkv) + outputs = self.model(input_ids=inp0,attention_mask=inp1, past_key_values=pkv, + return_dict=True) + + new_tokens_generated += 1 + + next_token_logits = outputs.logits[:,-1,:] + + if self.temperature: + next_token_scores = next_token_logits / self.temperature + else: + next_token_scores = next_token_logits + + # sample + probs = nn.functional.softmax(next_token_scores, dim=-1) + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + + # finished sentences should have their next token be a padding token + if eos_token_id is not None: + next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + + # testing output in progress starts here + """ + print("update: input_ids -", input_ids) + # outputs_detached = outputs.to('cpu') + outputs_np = np.array(input_ids[0]) + output_str = self.tokenizer.decode(outputs_np) + print("update: output string - ", output_str) + """ + # end - testing output in progress + + pkv = outputs.past_key_values + + # update attention mask + attn_mask = torch.cat([attn_mask, attn_mask.new_ones((attn_mask.shape[0], 1))], dim=-1) + + # if eos_token was found in one sentence, set sentence to finished + if eos_token_id_tensor is not None: + unfinished_sequences = unfinished_sequences.mul( + next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) + ) + + # stop when each sentence is finished + if unfinished_sequences.max() == 0: + this_peer_finished = True + + # stop if we exceed the maximum length + if new_tokens_generated > self.target_requested_output_tokens: + this_peer_finished = True + + if this_peer_finished: + break + + # Generation completed - prepare the output + + # outputs_detached = outputs.to('cpu') + outputs_np = 
np.array(input_ids[0]) + output_only = outputs_np[input_token_len:] + output_str = self.tokenizer.decode(output_only) + + if output_str.endswith("<|endoftext|>"): + output_str = output_str[:-len("<|endoftext|>")] + if output_str.strip().startswith(":"): + output_str = output_str[len(":"):] + output_str.strip() + + total_len = len(outputs_np) + + usage = {"input": input_token_len, + "output": total_len - input_token_len, + "total": total_len, + "metric": "tokens"} + + output_response = {"llm_response": output_str, "usage": usage} + + return output_response + + +class LLMWareSemanticModel: + + def __init__(self, model_name=None, model=None, embedding_dims=None, max_seq_length=150): + + self.model_name = model_name + self.error_message = "\nUnable to process LLMWare Semantic Model. Please try again later" + + self.max_input_len = 512 + self.max_output_len = 512 + self.max_seq_length = max_seq_length + + # to be applied to 'passed-in' Sentence Transformers model + self.normalize_embeddings = True + self.received_loaded_model = False + + # need to parameterize the embedding dims based on model config + if not embedding_dims: + self.embedding_dims = 768 + if model_name == 'mini-lm-sbert': + self.embedding_dims = 384 + + else: + self.embedding_dims = embedding_dims + + self.model_repo_location = LLMWareConfig.get_model_repo_path() + self.model_size="standard" + if model_name == 'mini-lm-sbert': + self.model_size = "mini" + self.transformer_base_model = None + + if model: + logging.info("update: SemanticEmbedding model received model - will attempt to load as " + "Sentence Transformer model") + + self.model = model + self.received_loaded_model = True + + if len(model) >= 2: + + try: + # general case is that embedding dimension is the "word_embedding_dimension" of the + # 'Pooling' layer, which is generally the second and last layer of the sbert model + self.embedding_dims = model[1].word_embedding_dimension + + # there are at least 2 edge cases, in which a "Dense" layer is attached after the + # Pooling layer, and further consolidates the embeddings + + if len(model) > 2: + logging.info("update: Sentence Transformer model with more than two layers - unusual - " + " depending upon the architecture, there may be issues loading the model- %s", + len(model)) + + # note: the most common case is with a Dense 3rd layer that maps the Pooling output to + # a different dimension - in this case - this should give the dimensions: + # + # last_layer_config = model[-1].get_config_dict() + # if "out_features" in last_layer_config: + # self.embedding_dims = last_layer_config["out_features"] + + except: + logging.error("error: could not identify model to run embedding - ", model_name) + raise ModelNotFoundException(model_name) + + def load_model_for_inference(self,fp=None): + + if fp: + self.model_repo_location = fp + + self.model = STransformer(self.model_repo_location, model_size=self.model_size, + max_seq_length=self.max_seq_length) + + return self + + def embedding(self, sentence): + + # embedding = self.model.encode(sentence, convert_to_tensor=True) + embedding = self.model.encode(sentence) + + # add normalization for imported sentence transformer models + """ + if self.received_loaded_model and self.normalize_embeddings: + # normalize embeddings + embedding = torch.tensor(embedding).squeeze(0) + embedding = torch.nn.functional.normalize(embedding, p=2, dim=1) + embedding = embedding.detach().numpy() + """ + + # embedding_2d = embedding.unsqueeze(0) + return embedding + + def cosine_similarity(self, a, b): + 
return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)) + + def euclidean_distance(self,a,b): + # aligning with FAISS - which returns square of Euclidean distance + return np.linalg.norm(a - b) * np.linalg.norm(a-b) + + +# The code that follows contains code copied from, derived from or inspired by Nils Reimers and the +# UKP Lab Sentence Transformers Model. (https://github.com/UKPLab/sentence-transformers) +# Copyright 2019 Nils Reimers +# Modifications Copyright 2023 llmware + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class STransformer (nn.Sequential): + + def __init__(self, model_path, max_seq_length=150, model_size="standard", + torch_dtype=torch.float32): + + super().__init__() + + self.do_lower_case=False + self.max_seq_length = max_seq_length + self.torch_dtype = torch_dtype + self.device = 'cuda' if torch.cuda.is_available() else "cpu" + + logging.info("update - creating Transformer - model dims - %s ", model_size) + + self.word_embedding_model = Transformer(model_path, model_size=model_size) + + # pooling mode = "mean" by default + self.pooling_model = Pooling(self.word_embedding_model.get_word_embedding_dimension()) + + modules=[self.word_embedding_model, self.pooling_model] + self.model = OrderedDict([(str(idx), module) for idx, module in enumerate(modules)]) + + def tokenize(self, texts): + return self.word_embedding_model.tokenize_wrapper(texts) + + def encode(self, sentences, batch_size=32, normalize_embeddings=True): + + self.eval() + + output_value = "sentence_embedding" + convert_to_numpy = True + convert_to_tensor = False + normalize_embeddings = True + device = None + + # output expected to be in numpy array + + input_was_string = False + if isinstance(sentences, str) or not hasattr(sentences, '__len__'): + sentences = [sentences] + input_was_string = True + + self.to(self.device) + + all_embeddings = [] + length_sorted_idx = np.argsort([-self._text_length(sen) for sen in sentences]) + sentences_sorted = [sentences[idx] for idx in length_sorted_idx] + + show_progress_bar = None + + for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar): + sentences_batch = sentences_sorted[start_index:start_index+batch_size] + features = self.tokenize(sentences_batch) + + for key in features: + if isinstance(features[key], Tensor): + features[key] = features[key].to(self.device) + + with torch.no_grad(): + + out_features = self.forward(features) + + # assume sentence_embeddings only + embeddings = out_features[output_value] + embeddings = embeddings.detach() + + if normalize_embeddings: + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + if convert_to_numpy: + embeddings = embeddings.cpu() + + all_embeddings.extend(embeddings) + + all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)] + + if convert_to_tensor: + all_embeddings = torch.stack(all_embeddings) + + elif convert_to_numpy: + all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings]) + + if input_was_string: + 
all_embeddings = all_embeddings[0] + + return all_embeddings + + def _text_length(self, text): + + if isinstance(text, dict): + # {key: value} case + return len(next(iter(text.values()))) + + elif not hasattr(text, '__len__'): + # Object has no len() method + return 1 + + elif len(text) == 0 or isinstance(text[0], int): + # Empty string or list of ints + return len(text) + + else: + # Sum of length of individual strings + return sum([len(t) for t in text]) + + +class Transformer (nn.Module): + + def __init__(self, model_path, max_seq_length=150, do_lower_case= False, model_size="standard"): + super().__init__() + + # need to look up model config first + try: + self.config = json.load(open(os.path.join(model_path,"config.json"), "r")) + + except: + if model_size == "mini": + self.config = bert_mini_config + else: + self.config = bert_base_config + + self.config_keys = ['max_seq_length', 'do_lower_case'] + + self.do_lower_case = do_lower_case + self.max_seq_length = max_seq_length + + bert_config = BertConfig(config_dict=self.config) + # print("loading weights from path - ", model_path) + + # by default, assume BERT based model - TODO: extend to Roberta base options + self.auto_model = BertModel(bert_config).load_weights_from_file(model_path) + + tokenizer_file = "tokenizer.json" + self.tokenizer = Utilities().load_tokenizer_from_file(os.path.join(model_path, tokenizer_file)) + + # tokenizer is where the max_length is applied + self.tokenizer.enable_truncation(max_length=self.max_seq_length,strategy="longest_first") + self.tokenizer.enable_padding(pad_id=0) + + def forward(self, features): + + # note: features in forward from Transformer passed to Pooling layer for final output + trans_features = {'input_ids': features['input_ids'], 'attention_mask': features['attention_mask']} + output_states = self.auto_model(**trans_features) + output_tokens = output_states[0] + features.update({'token_embeddings': output_tokens, 'attention_mask': features['attention_mask']}) + + return features + + def get_word_embedding_dimension(self): + return self.auto_model.config.hidden_size + + def tokenize_wrapper(self, text, padding=True, truncation="longest_first"): + + self.tokenizer.enable_truncation(max_length=self.max_seq_length, strategy=truncation) + if padding: + self.tokenizer.enable_padding(pad_id=0) + + batch_input = self.tokenizer.encode_batch(text) + + input_id_list = [] + token_id_list = [] + am_list = [] + + for i, encoding_obj in enumerate(batch_input): + input_id_list.append(encoding_obj.ids) + token_id_list.append(encoding_obj.type_ids) + am_list.append(encoding_obj.attention_mask) + + inputs_agg = {"input_ids": torch.tensor(input_id_list, dtype=torch.long), + "token_type_ids": torch.tensor(token_id_list, dtype=torch.long), + "attention_mask": torch.tensor(am_list, dtype=torch.long)} + + return inputs_agg + + +class Pooling(nn.Module): + + def __init__(self, word_embedding_dimension): + + super(Pooling, self).__init__() + + self.pooling_mode = "mean" + self.word_embedding_dimension = word_embedding_dimension + self.pooling_mode_mean_tokens = True + + def forward(self, features): + + token_embeddings = features['token_embeddings'] + attention_mask = features['attention_mask'] + + # Pooling strategy - "pooling_mode_mean_tokens" + output_vectors = [] + + self.pooling_mode_mean_tokens = True + + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) + + sum_mask = input_mask_expanded.sum(1) + 
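        # mean pooling: sum_embeddings is the mask-weighted sum of token vectors and sum_mask the
        # number of non-padded positions per sentence; dividing the two (after clamping sum_mask
        # away from zero below) gives the average token embedding used as the sentence vector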
sum_mask = torch.clamp(sum_mask, min=1e-9) + + output_vectors.append(sum_embeddings / sum_mask) + + output_vector = torch.cat(output_vectors, 1) + features.update({'sentence_embedding': output_vector}) + + return features + + +"""PyTorch BERT model.""" + +# The code below contains code copied from, derived from or inspired from the PyTorch BERT model. +# (https://github.com/huggingface/transformers) +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + +# Note: this is a very streamlined implementation of the BERT model, optimized for use in LLMWARE +# There are many features and options that have been purposefully omitted +# For a more robust implementation of BERT, please see the Google BERT repository, or HuggingFace + +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +bert_base_config = { + "_name_or_path": "bert-base-uncased", + "architectures": [ + "BertModel" + ], + "attention_probs_dropout_prob": 0.1, + "classifier_dropout": None, + "gradient_checkpointing": False, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "position_embedding_type": "absolute", + "torch_dtype": "float32", + "type_vocab_size": 2, + "use_cache": True, + "vocab_size": 30522, + "model_size": "standard" +} + + +bert_mini_config = { + "_name_or_path": "nreimers/MiniLM-L6-H384-uncased", + "architectures": [ + "BertModel" + ], + "attention_probs_dropout_prob": 0.1, + "gradient_checkpointing": False, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, # vs 768 + "initializer_range": 0.02, + "intermediate_size": 1536, # vs. 3072 + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 6, # vs. 
12 + "pad_token_id": 0, + "position_embedding_type": "absolute", + "type_vocab_size": 2, + "use_cache": True, + "vocab_size": 30522, + "model_size": "mini" +} + + +class BertConfig: + + # note: if no config passed, then defaults to standard 'bert-base-uncased' model + def __init__(self, config_dict=None, **kwargs): + + # set default parameters -> will be over-ridden by any passed configs + self.vocab_size =30522, + self.hidden_size =768, + self.num_hidden_layers =12, + self.num_attention_heads =12, + self.intermediate_size =3072, + self.hidden_act ="gelu", + self.hidden_dropout_prob =0.1, + self.attention_probs_dropout_prob =0.1, + self.max_position_embeddings =512, + self.type_vocab_size =2, + self.initializer_range =0.02, + self.layer_norm_eps =1e-12, + self.pad_token_id =0, + self.position_embedding_type ="absolute", + self.use_cache =True, + self.classifier_dropout =None, + self.model_size ="standard" + + for key in config_dict: + setattr(self, key, config_dict[key]) + + self.output_hidden_states = False + self.output_attentions = False + self.torch_dtype = kwargs.pop("torch_dtype", None) + self.pruned_heads = kwargs.pop("pruned_heads", {}) + + # self.tie_word_embeddings = kwargs.pop("tie_word_embeddings", True) + + +class BertEmbeddings(nn.Module): + + def __init__(self, config): + super().__init__() + + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.position_embedding_type = "absolute" + + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + def forward(self, input_ids): + + past_key_values_length = 0 + + input_shape = input_ids.size() + seq_length = input_shape[1] + + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + + def __init__(self, config): + super().__init__() + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + 
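        # query, key and value are each projected to all_head_size = num_attention_heads * attention_head_size;
        # transpose_for_scores() below reshapes them per head before the scaled dot-product attention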
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + self.position_embedding_type = "absolute" + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask= None): + + output_attentions = False + + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to (taken from original Transformer paper) + attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class BertSelfOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + + def __init__(self, config): + super().__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward(self, hidden_states, attention_mask= None): + + self_outputs = self.self(hidden_states, attention_mask) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + + def __init__(self, config): + 
super().__init__() + + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + self.intermediate_act_fn = nn.functional.gelu + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + + def __init__(self, config): + super().__init__() + + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states, attention_mask= None): + + self_attention_outputs = self.attention(hidden_states, attention_mask) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] + + layer_output = self.feed_forward_chunk(attention_output) + + outputs = (layer_output,) + outputs + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + + def __init__(self, config): + super().__init__() + + self.config = config + self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward(self, hidden_states, attention_mask= None): + + for i, layer_module in enumerate(self.layer): + layer_outputs = layer_module(hidden_states, attention_mask) + hidden_states = layer_outputs[0] + + return (hidden_states,) + + +class BertPooler(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding to the first token. 
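        # (i.e., the [CLS] position), followed by a dense projection and a tanh activation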
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertModel (nn.Module): + + def __init__(self, config, add_pooling_layer=True, torch_dtype=torch.float16): + super().__init__() + + self.config = config + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + self.pooler = BertPooler(config) if add_pooling_layer else None + self.torch_dtype = torch_dtype + self.dtype = torch_dtype + + def load_weights_from_file(self, fp=None): + model_file = "pytorch_model.bin" + self.load_state_dict(torch.load(os.path.join(fp,model_file), map_location=torch.device('cpu')), strict=False) + logging.info("update: re-loaded model weights from file") + self.eval() + return self + + def _prune_heads(self, heads_to_prune): + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward(self, input_ids, attention_mask= None): + + token_type_ids = None + input_shape = input_ids.size() + batch_size, seq_length = input_shape + device = input_ids.device + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + embedding_output = self.embeddings(input_ids=input_ids) + + encoder_outputs = self.encoder(embedding_output, attention_mask=extended_attention_mask) + + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + return (sequence_output, pooled_output) + encoder_outputs[1:] + + def get_extended_attention_mask(self, attention_mask, input_shape): + + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is an encoder, make the mask: [batch_size, num_heads, seq_length, seq_length] + extended_attention_mask = attention_mask[:, None,None, :] + + else: + raise ValueError( + f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" + ) + + extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(self.dtype).min + return extended_attention_mask + + +def find_pruneable_heads_and_indices(heads, n_heads, head_size, already_pruned_heads): + + mask = torch.ones(n_heads, head_size) + heads = set(heads) - already_pruned_heads # Convert to set and remove already pruned heads + for head in heads: + # Compute how many pruned heads are before the head and move the index accordingly + head = head - sum(1 if h < head else 0 for h in already_pruned_heads) + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index: torch.LongTensor = torch.arange(len(mask))[mask].long() + return heads, index + + +def prune_linear_layer(layer, index, dim= 0): + + index = index.to(layer.weight.device) + W = layer.weight.index_select(dim, index).clone().detach() + 
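    # keep only the selected rows/columns of the weight (and the matching bias entries below),
    # then copy them into a smaller nn.Linear - this is how BertAttention.prune_heads() removes heads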
if layer.bias is not None: + if dim == 1: + b = layer.bias.clone().detach() + else: + b = layer.bias[index].clone().detach() + new_size = list(layer.weight.size()) + new_size[dim] = len(index) + new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device) + new_layer.weight.requires_grad = False + new_layer.weight.copy_(W.contiguous()) + new_layer.weight.requires_grad = True + if layer.bias is not None: + new_layer.bias.requires_grad = False + new_layer.bias.copy_(b.contiguous()) + new_layer.bias.requires_grad = True + return new_layer + + + diff --git a/llmware/parsers.py b/llmware/parsers.py new file mode 100644 index 00000000..3bc75650 --- /dev/null +++ b/llmware/parsers.py @@ -0,0 +1,3372 @@ + +# Copyright 2023 llmware + +# Licensed under the Apache License, Version 2.0 (the "License"); you +# may not use this file except in compliance with the License. You +# may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. + + +import time +import json +import re +from werkzeug.utils import secure_filename +import os +import numpy as np +from zipfile import ZipFile, ZIP_DEFLATED +import ssl +import shutil +from collections import Counter +import requests +from bs4 import BeautifulSoup +from urllib.request import urlopen, Request +from PIL import Image +from PIL.ExifTags import TAGS, GPSTAGS +import pytesseract +from pdf2image import convert_from_path +import struct +import logging +import random +from ctypes import * +import platform +import sysconfig + +from llmware.configs import LLMWareConfig +from llmware.util import Utilities, WikiKnowledgeBase, TextChunker +from llmware.resources import CollectionRetrieval, CollectionWriter, check_db_uri, ParserState +from llmware.exceptions import DependencyNotInstalledException, FilePathDoesNotExistException + +# setting important when testing locally - should be removed in production +# ssl._create_default_https_context = ssl._create_unverified_context + +# Best ways we've found to detect machine architecture +system = platform.system().lower() +machine = os.uname().machine.lower() +file_ext = { "darwin": "dylib", "linux": "so", "windows": "dll" } + +# Default to known architectures if we encounter an unknown one +if system == 'darwin' and machine not in ['arm64','x86_64']: + machine = 'arm64' +if system == 'linux' and machine not in ['aarch64','x86_64']: + machine = 'x86_64' + +# Constuct the path to a specific lib folder. Eg. .../llmware/lib/darwin/x86_64 +machine_dependent_lib_path = os.path.join(LLMWareConfig.get_config("shared_lib_path"), system, machine) + +_path_office = os.path.join(machine_dependent_lib_path, "liboffice_llmware." + file_ext[system]) +_path_pdf = os.path.join(machine_dependent_lib_path, "libpdf_llmware." + file_ext[system]) +_path_graph = os.path.join(machine_dependent_lib_path, "libgraph_llmware." 
+ file_ext[system]) + +_mod = cdll.LoadLibrary(_path_office) +_mod_pdf = cdll.LoadLibrary(_path_pdf) +_mod_initialize = cdll.LoadLibrary(_path_graph) + + +""" +# Load shared libraries based on current platform/architecture +system = platform.system().lower() +machine = sysconfig.get_platform().split("-")[-1].lower() +_path_office, _path_pdf, _path_graph = None, None, None + + +if system == "darwin" and machine == "x86_64": + _path_office = os.path.join(LLMWareConfig.get_config("shared_lib_path"), "darwin", "x86_64", "liboffice_llmware.dylib") + _path_pdf = os.path.join(LLMWareConfig.get_config("shared_lib_path"), "darwin", "x86_64", "libpdf_llmware.dylib") + _path_graph = os.path.join(LLMWareConfig.get_config("shared_lib_path"), "darwin", "x86_64", "libgraph_llmware.dylib") + +print("system - machine - ", system, machine, LLMWareConfig.get_config("shared_lib_path")) + +if system == "darwin" and machine in ["arm64", "universal2"]: + _path_office = os.path.join(LLMWareConfig.get_config("shared_lib_path"), "darwin", "arm64", "liboffice_llmware.dylib") + _path_pdf = os.path.join(LLMWareConfig.get_config("shared_lib_path"), "darwin", "arm64", "libpdf_llmware.dylib") + _path_graph = os.path.join(LLMWareConfig.get_config("shared_lib_path"), "darwin", "arm64", "libgraph_llmware.dylib") + +if system == "linux" and machine == "x86_64": + _path_office = os.path.join(LLMWareConfig.get_config("shared_lib_path"), "x86_64", "liboffice_llmware.so") + _path_pdf = os.path.join(LLMWareConfig.get_config("shared_lib_path"), "x86_64", "libpdf_llmware.so") + _path_graph = os.path.join(LLMWareConfig.get_config("shared_lib_path"), "x86_64", "libgraph_llmware.so") + +if system == "linux" and machine == "aarch64": + _path_office = os.path.join(LLMWareConfig.get_config("shared_lib_path"), "aarch64", "liboffice_llmware.so") + _path_pdf = os.path.join(LLMWareConfig.get_config("shared_lib_path"), "aarch64", "libpdf_llmware.so") + _path_graph = os.path.join(LLMWareConfig.get_config("shared_lib_path"), "aarch64", "libgraph_llmware.so") + +_mod = cdll.LoadLibrary(_path_office) +_mod_pdf = cdll.LoadLibrary(_path_pdf) +_mod_initialize = cdll.LoadLibrary(_path_graph) + +""" + + +class Parser: + + def __init__(self, library=None, account_name="llmware", parse_to_db=False, file_counter=1): + + # check for llmware path & create if not already set up + if not os.path.exists(LLMWareConfig.get_llmware_path()): + # if not explicitly set up by user, then create folder directory structure + LLMWareConfig.setup_llmware_workspace() + + # new path for parser history - records of parse job outputs (outside of library construct) + self.parser_folder = LLMWareConfig.get_parser_path() + + if not os.path.exists(self.parser_folder): + os.mkdir(self.parser_folder) + + # create tmp workspace for parser + parser_tmp_work_folder = os.path.join(LLMWareConfig.get_tmp_path(), "parser_tmp/") + + # if tmp workspace folder already exists, then delete - start fresh + if os.path.exists(parser_tmp_work_folder): + shutil.rmtree(parser_tmp_work_folder) + os.mkdir(parser_tmp_work_folder) + + self.parser_tmp_folder = parser_tmp_work_folder + + # shift library to optional parameter - allows calls to Parser class without a library declared + self.account_name = account_name + + # placeholder used if no library passed in constructor + self.library_name = "default" + + self.library = library + self.block_size_target_characters = 600 + + # will track and increment files processed within same parsing job + self.file_counter = file_counter + + # by default, 
parse_to_db = False + self.parse_to_db = parse_to_db + + self.parser_job_id = ParserState().issue_new_parse_job_id() + + # if library is passed to parser, then assumes will write to library db, if available + if library: + self.account_name = library.account_name + self.library_name = library.library_name + self.block_size_target_characters = library.block_size_target_characters + + self.parser_image_folder = library.image_path + + # sets parse_to_db == True only if (a) library passed in constructor, and (b) collection db found + if check_db_uri(timeout_secs=3): + self.parse_to_db = True + else: + logging.warning("warning: Parser not able to connect to document store collection database" + "at uri - %s - will write parsing output to a parsing file.", + LLMWareConfig().get_config("collection_db_uri")) + + self.parse_to_db = False + else: + # if no library passed + self.parse_to_db = False + self.parser_image_folder = self.parser_tmp_folder + + # used to pass to the C parsers in pdf/office parsing paths + self.collection_path = LLMWareConfig.get_config("collection_db_uri") + self.collection_db_username = LLMWareConfig.get_config("collection_db_username") + self.collection_db_password = LLMWareConfig.get_config("collection_db_password") + + # 'active' output state tracker + self.parser_output = [] + + self.ACCEPTED_FILE_FORMATS = ["pptx","xlsx","docx","pdf","txt","csv","html","jsonl", + "jpg","jpeg","png","wav","zip"] + self.office_types = ["PPTX", "pptx", "XLSX", "xlsx", "DOCX", "docx"] + self.pdf_types = ["PDF", "pdf"] + self.text_types = ["txt", "csv", "html", "jsonl"] + self.ocr_types = ["jpg", "jpeg", "png"] + self.voice_types = ["wav"] + self.zip_types = ["zip"] + self.office_work_folder = None + self.pdf_work_folder = None + self.text_work_folder = None + self.voice_work_folder = None + self.zip_work_folder = None + self.ocr_work_folder = None + self.dialog_work_folder = None + self.website_work_folder = None + self.supported_parser_types = ["pdf", "office", "text", "voice", "dialog", "web", "image", + "pdf_by_ocr"] + + def clear_state(self): + self.parser_output = [] + return self + + def save_state(self): + ParserState().save_parser_output(self.parser_job_id, self.parser_output) + return self + + def _setup_workspace(self, local_work_path): + + # set up local workspace folders + if not local_work_path: + + if self.library: + local_work_path = self.library.tmp_path + else: + # if no library selected, then default to parser_tmp_folder + local_work_path = self.parser_tmp_folder + + if not os.path.exists(local_work_path): + os.makedirs(local_work_path, exist_ok=True) + + office_fp = os.path.join(local_work_path, "process_office_files/") + pdf_fp = os.path.join(local_work_path, "process_pdf_files/") + text_fp = os.path.join(local_work_path, "process_text_files/") + ocr_fp = os.path.join(local_work_path, "process_ocr_files/") + voice_fp = os.path.join(local_work_path, "process_voice_files/") + zip_fp = os.path.join(local_work_path, "process_zip_files/") + + office_workspace_fp = os.path.join(local_work_path, "office_tmp/") + + # start clean with new directories for both office + pdf + if os.path.exists(office_fp): + shutil.rmtree(office_fp, ignore_errors=True) + os.mkdir(office_fp) + self.office_work_folder = office_fp + + if os.path.exists(pdf_fp): + shutil.rmtree(pdf_fp, ignore_errors=True) + os.mkdir(pdf_fp) + self.pdf_work_folder = pdf_fp + + if os.path.exists(text_fp): + shutil.rmtree(text_fp, ignore_errors=True) + os.mkdir(text_fp) + self.text_work_folder = text_fp + + if 
os.path.exists(ocr_fp): + shutil.rmtree(ocr_fp, ignore_errors=True) + os.mkdir(ocr_fp) + self.ocr_work_folder = ocr_fp + + if os.path.exists(voice_fp): + shutil.rmtree(voice_fp, ignore_errors=True) + os.mkdir(voice_fp) + self.voice_work_folder = voice_fp + + if os.path.exists(zip_fp): + shutil.rmtree(zip_fp, ignore_errors=True) + os.mkdir(zip_fp) + self.zip_work_folder = zip_fp + + if os.path.exists(office_workspace_fp): + shutil.rmtree(office_workspace_fp, ignore_errors=True) + os.mkdir(office_workspace_fp) + self.office_tmp = office_workspace_fp + + def _collator(self, input_folder_path, dupe_check=False): + + # run comparison for existing files if dupe_check set True + # default case - no checking for dupes + existing_files = [] + + # run comparison for existing files if dupe_check set True + if self.library: + if dupe_check and os.path.exists(self.library.file_copy_path): + existing_files = os.listdir(self.library.file_copy_path) + + # counters + dup_counter = 0 + office_found = 0 + pdf_found = 0 + zip_found = 0 + text_found = 0 + ocr_found = 0 + voice_found = 0 + + # list of input files + input_file_names = os.listdir(input_folder_path) + files_to_be_processed = [] + duplicate_files = [] + + for filename in input_file_names: + + filetype = filename.split(".")[-1] + + go_ahead = True + + if dupe_check: + if filename in existing_files: + go_ahead= False + dup_counter += 1 + duplicate_files.append(filename) + + if go_ahead: + + files_to_be_processed.append(filename) + + # copy file into specific channel for targeted parser + + if filetype.lower() in self.office_types: + shutil.copy(os.path.join(input_folder_path,filename), os.path.join(self.office_work_folder,filename)) + office_found += 1 + + if filetype.lower() in self.pdf_types: + shutil.copy(os.path.join(input_folder_path,filename), os.path.join(self.pdf_work_folder, filename)) + pdf_found += 1 + + if filetype.lower() in self.text_types: + shutil.copy(os.path.join(input_folder_path,filename), os.path.join(self.text_work_folder,filename)) + text_found += 1 + + if filetype.lower() in self.ocr_types: + shutil.copy(os.path.join(input_folder_path,filename), os.path.join(self.ocr_work_folder,filename)) + ocr_found += 1 + + if filetype.lower() in self.voice_types: + shutil.copy(os.path.join(input_folder_path,filename), os.path.join(self.voice_work_folder,filename)) + voice_found += 1 + + if filetype.lower() in self.zip_types: + shutil.copy(os.path.join(input_folder_path,filename), os.path.join(self.zip_work_folder,filename)) + zip_found += 1 + + logging.info("update: Duplicate files (skipped): %s ", dup_counter) + logging.info("update: Total uploaded: %s ", len(input_file_names)) + + if zip_found > 0: + + # if any zip files found in upload, then unpack and process first + # --once zip extracted, push all files into the appropriate work folder for pdf, office, etc. 
+ # --inside zip_extract_handler- will update counters + + zip_work_order = self.zip_extract_handler() + pdf_found += zip_work_order["pdf_found"] + office_found += zip_work_order["office_found"] + text_found += zip_work_order["text_found"] + voice_found += zip_work_order["voice_found"] + ocr_found += zip_work_order["ocr_found"] + + work_order = {"pdf": pdf_found, + "office": office_found, + "text": text_found, + "ocr": ocr_found, + "voice": voice_found, + "duplicate_files": duplicate_files, + "file_list": files_to_be_processed} + + return work_order + + def ingest (self, input_folder_path, dupe_check=True): + + # input_folder_path = where the input files are located + + # first - confirm that library and connection to collection db are in place + if not self.library or not self.parse_to_db: + + logging.error("error: Parser().ingest() method requires loading a library, e.g., Parser(library=my_library)," + "and a connection to a document data store - please try Parse().parse_one set of methods" + "to parse a document of any type directly into list of dictionaries in memory, and written" + "to /parser_history as a .json file") + + parsing_results = {"processed_files": 0, "rejected_files": 0, "duplicate_files": []} + return parsing_results + + # prepares workspace for individual parsers + self._setup_workspace(self.parser_tmp_folder) + + # collate and sort the file types in the work path + work_order = self._collator(input_folder_path, dupe_check=dupe_check) + + # write to db - True only if library loaded + collection connect in place + write_to_db = self.parse_to_db + + if work_order["office"] > 0: + self.parse_office(self.office_work_folder, save_history=False) + self.uploads(self.office_work_folder) + + if work_order["pdf"] > 0: + self.parse_pdf(self.pdf_work_folder, save_history=False) + self.uploads(self.pdf_work_folder) + + if work_order["text"] > 0: + self.parse_text(self.text_work_folder, save_history=False) + self.uploads(self.text_work_folder) + + if work_order["ocr"] > 0: + self.parse_image(self.ocr_work_folder, save_history=False) + self.uploads(self.ocr_work_folder) + + if work_order["voice"] > 0: + self.parse_voice(self.voice_work_folder, save_history=False) + self.uploads(self.voice_work_folder) + + # need to systematically capture list of rejected docs + + processed, not_processed = self.input_ingestion_comparison(work_order["file_list"]) + + parsing_results = {"processed_files": processed, + "rejected_files": not_processed, + "duplicate_files": work_order["duplicate_files"]} + + return parsing_results + + def ingest_to_json(self, input_folder_path): + + # prepares workspace for individual parsers + self._setup_workspace(self.parser_tmp_folder) + + # collate and sort the file types in the work path + work_order = self._collator(input_folder_path, dupe_check=False) + + # write to db - True only if library loaded + collection connect in place + self.parse_to_db = False + self.library = None + + if work_order["office"] > 0: + self.parse_office(self.office_work_folder, write_to_db=False, save_history=False) + + if work_order["pdf"] > 0: + self.parse_pdf(self.pdf_work_folder, write_to_db=False, save_history=False) + + if work_order["text"] > 0: + self.parse_text(self.text_work_folder, write_to_db=False, save_history=False) + + if work_order["ocr"] > 0: + self.parse_image(self.ocr_work_folder, write_to_db=False, save_history=False) + + if work_order["voice"] > 0: + self.parse_voice(self.voice_work_folder, write_to_db=False, save_history=False) + + # need to systematically capture 
list of rejected docs + + fn = ParserState().save_parser_output(self.parser_job_id, self.parser_output) + + processed, not_processed = self.input_ingestion_comparison_from_parser_state(work_order["file_list"]) + + parsing_results = {"processed_files": processed, + "rejected_files": not_processed, + "parser_output_filename": fn} + + return parsing_results + + def parse_by_type(self, parser_type, input_folder_path, url=None): + + output = None + + if parser_type in self.supported_parser_types: + + if parser_type == "pdf": + output = self.parse_pdf(input_folder_path, write_to_db=self.parse_to_db) + + if parser_type == "office": + output = self.parse_office(input_folder_path, write_to_db=self.parse_to_db) + + if parser_type == "text": + output = self.parse_text(input_folder_path, write_to_db=self.parse_to_db) + + if parser_type == "voice": + output = self.parse_voice(input_folder_path, write_to_db=self.parse_to_db) + + if parser_type == "dialog": + output = self.parse_dialog(input_folder_path, write_to_db=self.parse_to_db) + + if parser_type == "web": + output = self.parse_website(url, write_to_db=self.parse_to_db) + + if parser_type == "pdf_by_ocr": + output = self.parse_pdf_by_ocr_images(input_folder_path, write_to_db=self.parse_to_db) + + return output + + # designed to take any input zip files and iteratively unzip and push files to specific fp + def zip_extract_handler(self): + + # tracker for files found inside the zip + pdf_found = 0 + office_found = 0 + text_found = 0 + ocr_found = 0 + voice_found = 0 + + z = "" + + zip_files = os.listdir(self.zip_work_folder) + + for my_zip_names in zip_files: + + # iterate thru all of the .zip files found + + my_zip = self.zip_work_folder + my_zip_names + + # create fresh /tmp file to extract the zip files + if os.path.exists(os.path.join(self.zip_work_folder,"tmp")): + shutil.rmtree(os.path.join(self.zip_work_folder,"tmp"), ignore_errors=True) + os.mkdir(os.path.join(self.zip_work_folder,"tmp")) + + try: + # unzip and extract into /tmp folder + z = ZipFile(my_zip, 'r', compression=ZIP_DEFLATED) + ZipFile.extractall(z, os.path.join(self.zip_work_folder, "tmp")) + success_code = 1 + + except: + # may fail + success_code = -1 + logging.info("error: caution - could not open Zip- %s ", my_zip) + + if success_code == 1: + + # iterate thru all of the files found in the zip archive + # apply secure_filename and prep_filename + # route to the appropriate work folder, if applicable + + for f in z.namelist(): + + # will apply secure name and cap length, but does not run duplicate file check + fn = self.prep_filename(f, max_len=240, secure_name=True) + ext = fn.split(".")[-1] + + if success_code == 1: + + if ext in ["pptx", "docx", "xlsx"]: + shutil.copy(os.path.join(self.zip_work_folder,"tmp/",f), + os.path.join(self.office_work_folder,fn)) + office_found += 1 + + if ext in ["pdf"]: + shutil.copy(os.path.join(self.zip_work_folder, "tmp/", f), + os.path.join(self.pdf_work_folder,fn)) + pdf_found += 1 + + if ext in ["txt", "csv"]: + shutil.copy(os.path.join(self.zip_work_folder, "tmp/", f), + os.path.join(self.text_work_folder,fn)) + text_found += 1 + + if ext in ["png", "jpg", "jpeg"]: + shutil.copy(os.path.join(self.zip_work_folder,"tmp/",f), + os.path.join(self.ocr_work_folder,fn)) + ocr_found += 1 + + if ext in ["wav"]: + shutil.copy(os.path.join(self.zip_work_folder,"tmp/",f), + os.path.join(self.voice_work_folder, fn)) + voice_found += 1 + + work_order = {"pdf": pdf_found, "office": office_found, "text": text_found, "ocr": ocr_found, "voice": voice_found} 
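        # work_order summarizes how many files of each type were extracted from the zip archives
        # and routed into the per-type work folders for the downstream parsers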
+ + return work_order + + # new method - picks up .txt file from Office or PDF parser and converts to list of dictionaries for insertion in DB + def convert_parsing_txt_file_to_json(self, file_path=None, fn="pdf_internal_test0.txt"): + + default_keys = ["block_ID", "doc_ID", "content_type", "file_type", "master_index", "master_index2", + "coords_x", "coords_y", "coords_cx", "coords_cy", "author_or_speaker", "modified_date", + "created_date", "creator_tool", "added_to_collection", "file_source", + "table", "external_files", "text", "header_text", "text_search", + "user_tags", "special_field1", "special_field2", "special_field3", "graph_status", "dialog"] + + if not file_path: + # this is the default path where parser will put the txt file + file_path = self.parser_tmp_folder + + # test script for parsing txt file + try: + output_file = open(os.path.join(file_path, fn), "r").read() + + except: + logging.warning("warning: Parser - could not find parsing output - %s - %s ", file_path, fn) + return [] + + # this seems to work with a few library sets, but we can probably enhance the 'splitting' + # \n marks the end of a block of text with ~28 dictionary keys + blocks = output_file.split("\n") + + output_list = [] + + for i, b in enumerate(blocks): + + # split of "\n<" will split the block into ~28 individual slices + splitter = b.split("\n<") + block_dict = {} + # it is likely redundant to have 'double loop' but it is a little extra insurance + for j, keys in enumerate(default_keys): + # iterates thru each of the default keys + match_found = -1 + for k, entries in enumerate(splitter): + + key_string = keys + ">: " + if entries.startswith(key_string): + + value = entries[len(key_string):].strip() + + # remove trailing ',' + if value.endswith(","): + value= value[:-1] + + block_dict.update({keys: value}) + match_found = 1 + break + + if match_found == -1: + # note: could not find a key - i, keys, splitter - no action required + do_nothing = 1 + + if block_dict: + if len(block_dict) == len(default_keys): + output_list.append(block_dict) + else: + logging.warning("update: Parser - potential error- parsing-to-dict conversion - " + "lengths don't match - %s - %s", len(block_dict), len(default_keys)) + + return output_list + + # this is new parser endpoint designed for llmware - aligns to latest native branch + def parse_pdf (self, fp, write_to_db=True, save_history=True, image_save=1): + + output = [] + + write_to_filename = "pdf_parse_output_0.txt" + + # must have three conditions in place - (a) user selects, (b) ping successfully, and (c) library loadedd + if write_to_db and self.parse_to_db and self.library: + write_to_db_on = 1 + unique_doc_num = -1 + else: + write_to_db_on = 0 + unique_doc_num = int(self.file_counter) + + # warning to user that no library loaded in Parser constructor + if write_to_db and not self.library: + logging.warning("warning: Parser().parse_pdf - request to write to database but no library loaded " + "in Parser constructor. Will write parsing output to file and will place the " + "file in /parser_history path.") + + # warning to user that database connection not found + if write_to_db and not self.parse_to_db: + logging.error("warning: Parser().parse_pdf - could not connect to database at %s. 
Will write " + "parsing output to file and will place the file in /parser_history path.", + LLMWareConfig.get_config("collection_db_uri")) + + # function declaration for .add_pdf_main_llmware + # char * input_account_name, + # char * input_library_name, + # char * input_fp, + # char * input_mongo_db_path, + # char * input_images_fp, + # int input_debug_mode, + # int input_image_save_mode, + # int write_to_db_on, + # char * write_to_filename, + # int user_block_size, + # int unique_doc_num, + # char * db_user_name, + # char * db_pw + + # pdf_handler = _mod_pdf.add_pdf_main_customize_parallel + pdf_handler = _mod_pdf.add_pdf_main_llmware + + pdf_handler.argtypes = (c_char_p, c_char_p, c_char_p, c_char_p, c_char_p, + c_int, c_int, c_int, + c_char_p, + c_int, c_int, + c_char_p, c_char_p) + + pdf_handler.restypes = c_int + + # prepare all of the inputs to invoke the c library + + t0 = time.time() + + # config options pulled from the Library object + account_name = create_string_buffer(self.account_name.encode('ascii', 'ignore')) + library_name = create_string_buffer(self.library_name.encode('ascii', 'ignore')) + + # image_fp = self.library.image_path + image_fp = self.parser_image_folder + + if not image_fp.endswith("/"): + image_fp += "/" + + image_fp_c = create_string_buffer(image_fp.encode('ascii', 'ignore')) + input_collection_db_path = self.collection_path + collection_db_path_c = create_string_buffer(input_collection_db_path.encode('ascii', 'ignore')) + + # fp = passed as parameter -> this is the input file path folder containing the .PDF docs to be parsed + if not fp.endswith("/"): + fp += "/" + + fp_c = create_string_buffer(fp.encode('ascii', 'ignore')) + + # pull debug mode 'verbosity' levels from LLMWareConfig + if LLMWareConfig.get_config("debug_mode") == 1: + debug_mode = 1 + else: + debug_mode = 0 + + input_debug_mode = c_int(debug_mode) # default - 0 = "off" + input_image_save_mode = c_int(image_save) # default - 1 = "on" | use 0 = "off" in production + + write_to_db_on_c = c_int(write_to_db_on) + write_to_filename_c = create_string_buffer(write_to_filename.encode('ascii','ignore')) + + # pull target block size from library parameters + user_block_size = c_int(self.block_size_target_characters) # standard 400-600 + + # unique_doc_num -> if <0: interpret as "OFF" ... 
if >=0 then use and increment doc_id directly + # unique_doc_num = -1 + unique_doc_num_c = c_int(unique_doc_num) + + # db credentials + db_user_name = self.collection_db_username + db_user_name_c = create_string_buffer(db_user_name.encode('ascii', 'ignore')) + + db_pw = self.collection_db_password + db_pw_c = create_string_buffer(db_pw.encode('ascii', 'ignore')) + + # + # * main call to pdf library * + # + + logging.info("update: start parsing of PDF Documents...") + + # function declaration for .add_pdf_main_llmware + # char * input_account_name, + # char * input_library_name, + # char * input_fp, + # char * input_mongodb_path, + # char * input_images_fp, + # int input_debug_mode, + # int input_image_save_mode, + # int write_to_db_on, + # char * write_to_filename, + # int user_block_size, + # int unique_doc_num, + # char * db_user_name, + # char * db_pw + + pages_created = pdf_handler(account_name, library_name, fp_c, collection_db_path_c, image_fp_c, + input_debug_mode, input_image_save_mode, write_to_db_on_c, + write_to_filename_c, user_block_size, unique_doc_num_c, + db_user_name_c, db_pw_c) + + logging.info("update: completed parsing of pdf documents - time taken: %s ", time.time() - t0) + + if write_to_db_on == 0: + # package up results in Parser State + parser_output = self.convert_parsing_txt_file_to_json(self.parser_image_folder,write_to_filename) + if len(parser_output) > 0: + last_entry = parser_output[-1] + last_doc_id = last_entry["doc_ID"] + + # print("update: last doc_ID = ", last_doc_id) + + self.file_counter = int(last_doc_id) + + logging.info("update: adding new entries to parser output state - %s", len(parser_output)) + + self.parser_output += parser_output + output += parser_output + + if save_history: + ParserState().save_parser_output(self.parser_job_id,parser_output) + + return output + + # new office parser entry point for llmware specifically + def parse_office (self, input_fp, write_to_db=True, save_history=True): + + output = [] + + # used internally by parser to capture text + write_to_filename = "office_parser_output_0.txt" + + # must have three conditions in place - (a) user selects, (b) ping successfully, and (c) library loadedd + if write_to_db and self.parse_to_db and self.library: + write_to_db_on = 1 + unique_doc_num = -1 + else: + write_to_db_on = 0 + unique_doc_num = int(self.file_counter) + + # warning to user that no library loaded in Parser constructor + if write_to_db and not self.library: + logging.warning("error: Parser().parse_office - request to write to database but no library loaded " + "in Parser constructor. Will write parsing output to file and will place the " + "file in Parser /parser_history path.") + + # warning to user that database connection not found + if write_to_db and not self.parse_to_db: + logging.error("error: Parser().parse_office - could not connect to database at %s. 
Will write " + "parsing output to file and will place the file in Library /images path.", + LLMWareConfig.get_config("collection_db_uri")) + + # designed for bulk upload of office parse into library structure + + if not input_fp.endswith("/"): + input_fp += "/" + + office_fp = input_fp + + workspace_fp = os.path.join(self.parser_tmp_folder,"office_tmp/") + + if not os.path.exists(workspace_fp): + os.mkdir(workspace_fp) + os.chmod(workspace_fp, 0o777) + + # need to synchronize as config parameter + + # start timing track for parsing job + t0 = time.time() + + # only one tmp work folder used currently - can consolidate over time + for z in range(0, 5): + + if os.path.exists(os.path.join(workspace_fp,str(z))): + shutil.rmtree(os.path.join(workspace_fp,str(z)), ignore_errors=True) + + if not os.path.exists(os.path.join(workspace_fp,str(z))): + os.mkdir(os.path.join(workspace_fp,str(z))) + os.chmod(os.path.join(workspace_fp, str(z)), 0o777) + + # end -initialize workspace + + # new endpoint for llmware + main_handler = _mod.add_files_main_llmware + + # int add_files_main_llmware: + # char * input_account_name, + # char * input_library_name, + # char * input_fp, + # char * workspace_fp, + # char * input_mongodb_path, + # char * image_fp, + # int input_debug_mode, + # int write_to_db_on, + # char * write_to_filename, + # int unique_doc_num, + # char *db_user_name, + # char *db_pw + + # main_handler = _mod.add_files_main_customize_parallel + main_handler.argtypes = (c_char_p, c_char_p, c_char_p, c_char_p, c_char_p, c_char_p, + c_int, c_int, + c_char_p, + c_int, + c_char_p, c_char_p) + + main_handler.restype = c_int + + # three inputs - account_name // library_name // fp to web_dir - files to be processed + # prep each string: account_name = create_string_buffer(py_account_str.encode('ascii','ignore')) + + account_name = create_string_buffer(self.account_name.encode('ascii', 'ignore')) + library_name = create_string_buffer(self.library_name.encode('ascii', 'ignore')) + + fp_c = create_string_buffer(office_fp.encode('ascii', 'ignore')) + workspace_fp_c = create_string_buffer(workspace_fp.encode('ascii', 'ignore')) + + # debug_mode global parameter + # "on" = 1 + # "off" = all other values + + if LLMWareConfig.get_config("debug_mode") == 1: + debug_mode = 1 + else: + debug_mode = 0 + + debug_mode_c = c_int(debug_mode) + + # image_fp = self.library.image_path + + image_fp = self.parser_image_folder + if not image_fp.endswith("/"): + image_fp += "/" + + image_fp_c = create_string_buffer(image_fp.encode('ascii', 'ignore')) + + input_collection_db_path = self.collection_path + collection_path_c = create_string_buffer(input_collection_db_path.encode('ascii', 'ignore')) + + write_to_db_on_c = c_int(write_to_db_on) + + write_to_fn_c = create_string_buffer(write_to_filename.encode('ascii', 'ignore')) + + # unique_doc_num is key parameter - if <0: will pull from incremental db, if >=0, then will start at this value + # unique_doc_num = -1 + unique_doc_num_c = c_int(unique_doc_num) + + # db credentials + db_user_name = "llmware" + db_user_name_c = create_string_buffer(db_user_name.encode('ascii', 'ignore')) + + db_pw = "test-123" + db_pw_c = create_string_buffer(db_pw.encode('ascii', 'ignore')) + + # int add_files_main_llmware: + # char * input_account_name, + # char * input_library_name, + # char * input_fp, + # char * workspace_fp, + # char * input_mongodb_path, + # char * image_fp, + # int input_debug_mode, + # int write_to_db_on, + # char * write_to_filename, + # int unique_doc_num, + # char * 
db_user_name, + # char * db_pw + + logging.info("update: start parsing of office documents...") + + pages_created = main_handler(account_name, library_name, fp_c, workspace_fp_c, collection_path_c, image_fp_c, + debug_mode_c, write_to_db_on_c, write_to_fn_c, unique_doc_num_c, + db_user_name_c, db_pw_c) + + logging.info("update: completed parsing of office documents - time taken: %s ", time.time() - t0) + + if write_to_db_on == 0: + # package up results in Parser State + parser_output = self.convert_parsing_txt_file_to_json(self.parser_image_folder,write_to_filename) + if len(parser_output) > 0: + last_entry = parser_output[-1] + last_doc_id = last_entry["doc_ID"] + + self.file_counter = int(last_doc_id) + + self.parser_output += parser_output + output += parser_output + + if save_history: + # save parser state + ParserState().save_parser_output(self.parser_job_id,parser_output) + + return output + + def parse_text(self, input_fp, write_to_db=True, save_history=True): + + output = [] + + # must have three conditions in place - (a) user selects, (b) ping successfully, and (c) library loadedd + if write_to_db and self.parse_to_db and self.library: + write_to_db_on = 1 + else: + write_to_db_on = 0 + + # warning to user that no library loaded in Parser constructor + if write_to_db and not self.library: + logging.warning("warning: Parser().parse_text - request to write to database but no library loaded " + "in Parser constructor. Will write parsing output to file and will place the " + "file in /parser_history path.") + + # warning to user that database connection not found + if write_to_db and not self.parse_to_db: + logging.error("warning: Parser().parse_text - could not connect to database at %s. Will write " + "parsing output to file and will place the file in /parser_history path.", + LLMWareConfig.get_config("collection_db_uri")) + + # set counters + blocks_created = 0 + docs_added = 0 + pages_added = 0 + text_output = [] + content_type = "text" + + for file in os.listdir(input_fp): + + # increment and get new doc_id + if write_to_db_on == 1: + self.library.doc_ID = self.library.get_and_increment_doc_id() + + file_type = file.split(".")[-1] + + # sub-routing by type of text file to appropriate handler + + if file_type.lower() in ["txt"]: + # will parse as text + text_output = TextParser(self).text_file_handler (input_fp, file) + content_type = "text" + file_type = "txt" + + if file_type.lower() in ["csv"]: + # will parse as table + interpret_as_table=True + text_output = TextParser(self).csv_file_handler(input_fp, file, interpret_as_table=True) + content_type = "text" + file_type = "csv" + if interpret_as_table: + content_type = "table" + + if file_type.lower() in ["json","jsonl"]: + # will parse each line item as separate entry + + interpret_as_table=False + keys = ["text"] + text_output = TextParser(self).jsonl_file_handler(input_fp,file, + key_list=keys, + interpret_as_table=interpret_as_table, + separator="\n") + content_type = "text" + file_type = "jsonl" + if interpret_as_table: + content_type = "table" + + # consolidate into single function - breaking down output rows + + if write_to_db_on == 1: + new_output, new_blocks, new_pages = self._write_output_to_db(text_output, file, + content_type=content_type, + file_type=file_type) + else: + new_output, new_blocks, new_pages = self._write_output_to_dict(text_output,file, + content_type=content_type, + file_type=file_type) + + output += new_output + + docs_added += 1 + blocks_created += new_blocks + pages_added += new_pages + + # update 
overall library counter at end of parsing + + if write_to_db_on == 1: + dummy = self.library.set_incremental_docs_blocks_images(added_docs=docs_added,added_blocks=blocks_created, + added_images=0, added_pages=pages_added) + + if save_history and write_to_db_on == 0: + ParserState().save_parser_output(self.parser_job_id, self.parser_output) + + return output + + def parse_pdf_by_ocr_images(self, input_fp, write_to_db=True, save_history=True): + + output = [] + + # must have three conditions in place - (a) user selects, (b) ping successfully, and (c) library loadedd + + if write_to_db and self.parse_to_db and self.library: + write_to_db_on = 1 + else: + write_to_db_on = 0 + + # warning to user that no library loaded in Parser constructor + if write_to_db and not self.library: + logging.warning("warning: Parser().parse_text - request to write to database but no library loaded " + "in Parser constructor. Will write parsing output to file and will place the " + "file in /parser_history path.") + + # warning to user that database connection not found + if write_to_db and not self.parse_to_db: + logging.error("warning: Parser().parse_text - could not connect to database at %s. Will write " + "parsing output to file and will place the file in /parser_history path.", + LLMWareConfig.get_config("collection_db_uri")) + + # set counters + blocks_added = 0 + docs_added = 0 + pages_added = 0 + + content_type = "text" + + for file in os.listdir(input_fp): + + ext = file.split(".")[-1] + if ext == "pdf": + doc_fn = secure_filename(file) + + # get new doc_ID number + if write_to_db_on == 1: + self.library.doc_ID = self.library.get_and_increment_doc_id() + + docs_added += 1 + + output_by_page = ImageParser(self).process_pdf_by_ocr(input_fp, file) + + for i, pages in enumerate(output_by_page): + for j, blocks in enumerate(pages): + + if write_to_db_on == 1: + new_output, new_blocks, _ = self._write_output_to_db(blocks,doc_fn,page_num=(j+1)) + else: + new_output, new_blocks, _ = self._write_output_to_dict(blocks,doc_fn,page_num=(j+1)) + + output += new_blocks + blocks_added += new_blocks + pages_added += 1 + + # update overall library counter at end of parsing + + if write_to_db_on == 1: + dummy = self.library.set_incremental_docs_blocks_images(added_docs=docs_added,added_blocks=blocks_added, + added_images=0, added_pages=pages_added) + + if save_history and write_to_db_on == 0: + ParserState().save_parser_output(self.parser_job_id, self.parser_output) + + return output + + def _write_output_to_db(self, output, file, content_type="text", file_type="text",page_num=1): + + db_record_output = [] + + # trackers + blocks_added = 0 + pages_added = 0 + + meta = {"author": "", "modified_date": "", "created_date": "", "creator_tool": ""} + coords_dict = {"coords_x": 0, "coords_y": 0, "coords_cx": 0, "coords_cy": 0} + + counter = 0 + + for entries in output: + + if content_type == "text": + # table entry = "" [7] + new_entry = (content_type, file_type, (page_num, 0), counter, "", "", file, "", entries, "", + "", entries, entries, "", entries, "", "", "", "", "") + else: + # could be table if csv file -> in this case, keep both text [11] and table [7] + new_entry = (content_type, file_type, (page_num, 0), counter, "", "", file, entries, entries, "", + "", entries, entries, "", entries, "", "", "", "", "") + + counter += 1 + + new_db_entry = self.add_create_new_record(self.library,new_entry, meta, coords_dict) + db_record_output.append(new_db_entry) + + blocks_added += 1 + self.library.block_ID += 1 + + # need to adapt 
potentially for longer text files + pages_added = 1 + + return db_record_output, blocks_added, pages_added + + def _write_output_to_dict(self, wp_output, input_fn, content_type="text", file_type="text", page_num=1): + + output = [] + # consolidate output + counter = 0 + blocks_added = 0 + pages_added = 0 + + meta = {"author": "", "modified_date": "", "created_date": "", "creator_tool": ""} + coords_dict = {"coords_x": 0, "coords_y": 0, "coords_cx": 0, "coords_cy": 0} + + for j, blocks in enumerate(wp_output): + + if content_type == "text": + new_entry = ("text", file_type, (page_num, 0), counter, "", "", input_fn, "", blocks, "", + "", blocks, blocks, "", blocks, "", "", "", "", "") + else: + # could be table if csv file -> in this case, keep both text [11] and table [7] + new_entry = ("table", file_type, (page_num, 0), counter, "", "", input_fn, blocks, blocks, "", + "", blocks, blocks, "", blocks, "", "", "", "", "") + + # creates a single 'unbound' parsing output dict -> no storage + parsing_output_dict = self.create_one_parsing_output_dict(counter, + new_entry, meta, coords_dict, + dialog_value="false") + + output.append(parsing_output_dict) + blocks_added += 1 + + pages_added = 1 + + self.parser_output += output + + return output, blocks_added, pages_added + + def add_create_new_record(self, library, new_entry, meta, coords_dict,dialog_value="false", + write_to_db=True): + + # assumes that new_entry is packaged in individual handler + # objective is to keep one single place where new entry gets loaded into db + # ensure consistency of db data model + + time_stamp = Utilities().get_current_time_now() + + new_entry = { + "block_ID": library.block_ID, # note - needs caution + "doc_ID": library.doc_ID, # note - needs caution + "content_type": new_entry[0], + "file_type": new_entry[1], + "master_index": new_entry[2][0], + "master_index2": new_entry[2][1:], + "coords_x": coords_dict["coords_x"], + "coords_y": coords_dict["coords_y"], + "coords_cx": coords_dict["coords_cx"], + "coords_cy": coords_dict["coords_cy"], + "author_or_speaker": meta["author"], + "modified_date": meta["modified_date"], + "created_date": meta["created_date"], + "creator_tool": meta["creator_tool"], + "added_to_collection": time_stamp, + "file_source": new_entry[6], + "table": new_entry[7], + "external_files": new_entry[10], + "text": new_entry[11], + "header_text": new_entry[13], + "text_search": new_entry[14], + "user_tags": new_entry[15], + "special_field1": new_entry[17], + "special_field2": new_entry[18], + "special_field3": new_entry[19], + "graph_status": "false", + "dialog": dialog_value + } + + if write_to_db: + # registry_id = library.collection.insert_one(new_entry).inserted_id + registry_id = CollectionWriter(library.collection).write_new_record(new_entry) + + return new_entry + + def create_one_parsing_output_dict(self, block_id,new_entry, meta, coords_dict,dialog_value="false"): + + # Mirrors the data structure in "self.add_create_new_record" + # --does not write_to_db or storage + # --does not assume that there is a library index + # --creates one parsing output dict that can be used and stored for any purpose (outside of library) + + # Note: expects explicit passing of a block_id and doc_id as reference numbers + + time_stamp = Utilities().get_current_time_now() + + new_entry = { + "block_ID": block_id, + "doc_ID": self.file_counter, + "content_type": new_entry[0], + "file_type": new_entry[1], + "master_index": new_entry[2][0], + "master_index2": new_entry[2][1:], + "coords_x": coords_dict["coords_x"], 
+ "coords_y": coords_dict["coords_y"], + "coords_cx": coords_dict["coords_cx"], + "coords_cy": coords_dict["coords_cy"], + "author_or_speaker": meta["author"], + "modified_date": meta["modified_date"], + "created_date": meta["created_date"], + "creator_tool": meta["creator_tool"], + "added_to_collection": time_stamp, + "file_source": new_entry[6], + "table": new_entry[7], + "external_files": new_entry[10], + "text": new_entry[11], + "header_text": new_entry[13], + "text_search": new_entry[14], + "user_tags": new_entry[15], + "special_field1": new_entry[17], + "special_field2": new_entry[18], + "special_field3": new_entry[19], + "graph_status": "false", + "dialog": dialog_value + } + + return new_entry + + def parse_wiki(self, topic_list, write_to_db=True, save_history=False, target_results=10): + + output = [] + + # must have three conditions in place - (a) user selects, (b) ping successfully, and (c) library loadedd + if write_to_db and self.parse_to_db and self.library: + write_to_db_on = 1 + else: + write_to_db_on = 0 + + # warning to user that no library loaded in Parser constructor + if write_to_db and not self.library: + logging.warning("warning: Parser().parse_text - request to write to database but no library loaded " + "in Parser constructor. Will write parsing output to file and will place the " + "file in /parser_history path.") + + # warning to user that database connection not found + if write_to_db and not self.parse_to_db: + logging.error("warning: Parser().parse_text - could not connect to database at %s. Will write " + "parsing output to file and will place the file in /parser_history path.", + LLMWareConfig.get_config("collection_db_uri")) + + # set counters + blocks_added = 0 + docs_added = 0 + pages_added = 0 + + for i, topic in enumerate(topic_list): + + fn = "wiki-topic-" + secure_filename(topic) + ".txt" + + logging.info("update: parse_wiki - %s - %s", topic, fn) + + # increment and get new doc_id + if write_to_db_on == 1: + self.library.doc_ID = self.library.get_and_increment_doc_id() + + # topic_results = {"search_results": topic_query_results, "articles": articles_output, + # "text_chunks": text_chunks} + + topic_results = WikiParser(self).add_wiki_topic(topic, target_results=target_results) + + wp_output = topic_results["text_chunks"] + + if write_to_db_on == 1: + new_output, new_blocks, new_pages = self._write_output_to_db(wp_output, fn, content_type="text", + file_type="wiki") + + else: + new_output, new_blocks, new_pages = self._write_output_to_dict(wp_output,fn, content_type="text", + file_type="wiki") + output += new_output + + docs_added += 1 + blocks_added += new_blocks + pages_added += new_pages + + for i, articles in enumerate(topic_results["articles"]): + + # need to copy into library_copy path + if self.library: + upload_fp = self.library.file_copy_path + else: + upload_fp = self.parser_tmp_folder + + # save as the article title now + article_txt = articles["title"]+".txt" + safe_name = self.prep_filename(article_txt) + + art = open(os.path.join(upload_fp,safe_name), "w") + art.write(articles["text"]) + art.close() + + if write_to_db_on == 1: + dummy = self.library.set_incremental_docs_blocks_images(added_docs=docs_added, added_blocks=blocks_added, + added_images=0, added_pages=pages_added) + + if save_history and write_to_db_on == 0: + ParserState().save_parser_output(self.parser_job_id, self.parser_output) + + return output + + def parse_image(self, input_folder, write_to_db=True, save_history=True): + + output = [] + + # must have three conditions 
in place - (a) user selects, (b) ping successfully, and (c) library loadedd + if write_to_db and self.parse_to_db and self.library: + write_to_db_on = 1 + else: + write_to_db_on = 0 + + # warning to user that no library loaded in Parser constructor + if write_to_db and not self.library: + logging.warning("warning: Parser().parse_text - request to write to database but no library loaded " + "in Parser constructor. Will write parsing output to file and will place the " + "file in /parser_history path.") + + # warning to user that database connection not found + if write_to_db and not self.parse_to_db: + logging.error("warning: Parser().parse_text - could not connect to database at %s. Will write " + "parsing output to file and will place the file in /parser_history path.", + LLMWareConfig.get_config("collection_db_uri")) + + # set counters + blocks_added = 0 + docs_added = 0 + pages_added = 0 + + for file in os.listdir(input_folder): + + # increment and get new doc_id + if write_to_db_on == 1: + self.library.doc_ID = self.library.get_and_increment_doc_id() + + ip_output = ImageParser(self).process_ocr(input_folder, file) + + if write_to_db_on == 1: + new_output, new_blocks, new_pages = self._write_output_to_db(ip_output,file,content_type="text", + file_type="ocr") + else: + new_output, new_blocks, new_pages = self._write_output_to_dict(ip_output,file, content_type="text", + file_type="ocr") + output += new_output + + docs_added += 1 + blocks_added += new_blocks + pages_added += new_pages + + if write_to_db_on == 1: + dummy = self.library.set_incremental_docs_blocks_images(added_docs=docs_added, added_blocks=blocks_added, + added_images=0, added_pages=pages_added) + + if save_history and write_to_db_on == 0: + ParserState().save_parser_output(self.parser_job_id, self.parser_output) + + return output + + def parse_voice(self, input_folder, write_to_db=True, save_history=True): + + output = [] + + # must have three conditions in place - (a) user selects, (b) ping successfully, and (c) library loaded + if write_to_db and self.parse_to_db and self.library: + write_to_db_on = 1 + else: + write_to_db_on = 0 + + # warning to user that no library loaded in Parser constructor + if write_to_db and not self.library: + logging.warning("warning: Parser().parse_text - request to write to database but no library loaded " + "in Parser constructor. Will write parsing output to file and will place the " + "file in /parser_history path.") + + # warning to user that database connection not found + if write_to_db and not self.parse_to_db: + logging.error("warning: Parser().parse_text - could not connect to database at %s. 
Will write " + "parsing output to file and will place the file in /parser_history path.", + LLMWareConfig.get_config("collection_db_uri")) + + # set counters + blocks_added = 0 + docs_added = 0 + pages_added = 0 + + for file in os.listdir(input_folder): + + # increment and get new doc_id + if write_to_db_on == 1: + self.library.doc_ID = self.library.get_and_increment_doc_id() + + vp_output = VoiceParser(self).add_voice_file(input_folder, file) + + if write_to_db_on == 1: + new_output, new_blocks, new_pages = self._write_output_to_db(vp_output, file, content_type="text", + file_type="voice-wav") + else: + new_output, new_blocks, new_pages = self._write_output_to_dict(vp_output,file, content_type="text", + file_type="voice-wav") + output += new_output + + docs_added += 1 + blocks_added += new_blocks + pages_added += new_pages + + if write_to_db_on == 1: + dummy = self.library.set_incremental_docs_blocks_images(added_docs=docs_added, added_blocks=blocks_added, + added_images=0, added_pages=pages_added) + + if save_history and write_to_db_on == 0: + ParserState().save_parser_output(self.parser_job_id, self.parser_output) + + return output + + def parse_dialog(self, input_folder, write_to_db=True, save_history=True): + + output = [] + + # must have three conditions in place - (a) user selects, (b) ping successfully, and (c) library loadedd + if write_to_db and self.parse_to_db and self.library: + write_to_db_on = 1 + else: + write_to_db_on = 0 + + # warning to user that no library loaded in Parser constructor + if write_to_db and not self.library: + logging.warning("warning: Parser().parse_text - request to write to database but no library loaded " + "in Parser constructor. Will write parsing output to file and will place the " + "file in /parser_history path.") + + # warning to user that database connection not found + if write_to_db and not self.parse_to_db: + logging.error("warning: Parser().parse_text - could not connect to database at %s. 
Will write " + "parsing output to file and will place the file in /parser_history path.", + LLMWareConfig.get_config("collection_db_uri")) + + # set counters + conversation_turns = 0 + dialog_transcripts_added = 0 + counter = 0 + + for file in os.listdir(input_folder): + + if file.endswith(".json"): + + # increment and get new doc_id + if write_to_db_on == 1: + self.library.doc_ID = self.library.get_and_increment_doc_id() + + logging.info(f"update: dialog file - {file}") + + dp_parse_output = DialogParser(self).parse_aws_json_file_format(input_folder, file) + + block_id = 0 + + for i, blocks in enumerate(dp_parse_output): + + logging.info(f"update: dialog turn - {i} {blocks}") + + # iterate thru each block -> add to metadata + speaker_name = blocks["speaker_name"] + + meta = {"author": speaker_name, "modified_date": "", "created_date": "", "creator_tool": ""} + + coords_dict = {"coords_x": blocks["start_time"], "coords_y": blocks["stop_time"], + "coords_cx": 0, "coords_cy": 0} + + text_entry = blocks["text"] + + # conforming file format with full path of dialog intake path + + format_type = "aws_json" + + new_entry = ("text", format_type, (1, 0), counter, "", "", input_folder + file, + text_entry, text_entry, "", "", text_entry, text_entry, "", text_entry, + "", "", "", "", "") + + counter += 1 + dialog_transcripts_added += 1 + conversation_turns += 1 + + if write_to_db_on == 1: + output = self.add_create_new_record(self.library, new_entry, meta, coords_dict, + dialog_value="true") + self.library.block_ID += 1 + else: + entry_output = self.create_one_parsing_output_dict(block_id,new_entry,meta,coords_dict, + dialog_value="true") + block_id += 1 + self.parser_output.append(output) + output.append(entry_output) + + pages_added = dialog_transcripts_added + + if write_to_db_on == 1: + dummy = self.library.set_incremental_docs_blocks_images(added_docs=dialog_transcripts_added, + added_blocks=conversation_turns, + added_images=0, + added_pages=pages_added) + + self.uploads(input_folder) + + if save_history and write_to_db_on == 0: + ParserState().save_parser_output(self.parser_job_id, self.parser_output) + + return output + + # entry point for website handler should come through Parser class + def parse_website(self, url_base, write_to_db=True, save_history=True, get_links=True, max_links=10): + + output = [] + + # must have three conditions in place - (a) user selects, (b) ping successfully, and (c) library loadedd + if write_to_db and self.parse_to_db and self.library: + write_to_db_on = 1 + else: + write_to_db_on = 0 + + # warning to user that no library loaded in Parser constructor + if write_to_db and not self.library: + logging.warning("warning: Parser().parse_website - request to write to database but no library loaded " + "in Parser constructor. Will write parsing output to file and will place the " + "file in /parser_history path.") + + # warning to user that database connection not found + if write_to_db and not self.parse_to_db: + logging.error("warning: Parser().parse_website - could not connect to database at %s. 
Will write " + "parsing output to file and will place the file in /parser_history path.", + LLMWareConfig.get_config("collection_db_uri")) + + local_work_folder = self.parser_tmp_folder + # local_work_folder = self.library.tmp_path + + if not os.path.exists(local_work_folder): + os.mkdir(local_work_folder) + + self.website_work_folder = os.path.join(local_work_folder, "process_website/") + + # start clean + if os.path.exists(self.website_work_folder): + shutil.rmtree(self.website_work_folder, ignore_errors=True) + os.mkdir(self.website_work_folder) + + # iterative parse thru website to follow links enabled + + website = WebSiteParser(url_base, reset_img_folder=True, local_file_path=self.website_work_folder) + + if website.success_code == 1: + + # increment and get new doc_id + if write_to_db_on == 1: + self.library.doc_ID = self.library.get_and_increment_doc_id() + + entries, img_counter = website.website_main_processor(website.image_counter, + output_index=False) + + # if get_links, then pursue internal links and 'add' to indexed output gathered + if get_links: + + if len(website.internal_links) > 0: + + max_links = min(len(website.internal_links), max_links) + + # img_counter = new_image_count + + for z in range(0, max_links): + + logging.info("\nupdate: WebSite Parser iterate - " + "child site link - %s - %s - %s", z, url_base, website.internal_links[z]) + + child_site = WebSiteParser(url_base + website.internal_links[z], reset_img_folder=False, + local_file_path=self.website_work_folder) + + if child_site.success_code == 1: + new_child_entries, img_counter = child_site.website_main_processor(img_counter, + output_index=False) + + for c in range(0, len(child_site.core_index)): + website.core_index.append(child_site.core_index[c]) + + # write parser output to storage + + entries_created = 0 + images_created = 0 + running_links = "" + file_type = "html" + + file_source = str(random.randint(100000, 999999)) + "_" + website.url_main.split(".")[-2] + ".html" + # file_source = website.url_main.split(".")[-2] + ".html" + + meta = {"author": "", "modified_date": "", "created_date": "", "creator_tool": ""} + coords_dict = {"coords_x": 0, "coords_y": 0, "coords_cx": 0, "coords_cy": 0} + + # prep loop - consolidate links with text or image + for z in range(0, len(website.core_index)): + + """ + # core index entry is dictionary + entry = {"content_type": entry_type, + "text": text, + "image": {"image_name": img_name, "image_url": img_url}, + "link": {"link_type": link_type, "link": link}, + "master_index": master_index, + "last_header": last_header} + """ + + content_type = website.core_index[z]["content_type"] + + if content_type == "link": + link_type = website.core_index[z]["link"]["link_type"] + + if link_type == "internal": + # attach internal links to last piece of text or image + running_links += website.core_index[z]["link"]["link"] + " , " + + if content_type == "text" or content_type == "image": + # close out last entry & start new one + save_entry = 1 + text1_core = website.core_index[z]["text"] + if not text1_core: + text1_core = website.core_index[z]["last_header"] + + # no tables currently extracted in website parser + content1_core = "" + + text3_format = website.core_index[z]["last_header"] + text2_spatial = running_links + links = running_links + running_links = "" + + master_index = (entries_created, 0) + coords = master_index + user_tags = [] + external_files = "" + + if content_type == "image": + + fp_tmp = self.website_work_folder + image_num = 
website.core_index[z]["image"]["image_name"] + + if self.library: + doc_id = self.library.doc_ID + save_file_path = self.library.image_path + else: + doc_id = self.file_counter + save_file_path = self.parser_image_folder + + new_image_name, created = website._save_image_website(fp_tmp, image_num, doc_id, save_file_path) + + images_created += 1 + external_files = new_image_name + + if not text1_core: + # take adjacent header_text, if no text linked to image + text1_core = text3_format + + new_entry = (content_type, file_type, master_index, "", "", "", + file_source, content1_core,"","", external_files, text1_core,text2_spatial, + text3_format,text1_core, user_tags,links,"","" ,"") + + if write_to_db_on == 1: + entry_output = self.add_create_new_record(self.library, new_entry,meta,coords_dict) + else: + entry_output = self.create_one_parsing_output_dict(entries_created, + new_entry,meta,coords_dict, + dialog_value="false") + self.parser_output.append(entry_output) + output.append(entry_output) + entries_created += 1 + + # once done with all of the record updates- update the master counters + # need to save new block_ID & new doc_ID + docs_created = 1 + self.file_counter += 1 + + if write_to_db_on == 1: + dummy = self.library.set_incremental_docs_blocks_images(added_docs=docs_created, + added_blocks=entries_created, + added_images=images_created, + added_pages=1) + + # c.uploads - upload website_file + fp_tmp = os.path.join(local_work_folder, "process_website/") + + website_name = "my_website.html" + out_name = str(random.randint(100000, 999999)) + "_" + website.url_main.split(".")[-2] + ".html" + + if self.library: + upload_fp = self.library.file_copy_path + else: + upload_fp = self.parser_tmp_folder + + shutil.copy(os.path.join(fp_tmp,website_name), os.path.join(upload_fp, out_name)) + + if save_history and write_to_db_on == 0: + ParserState().save_parser_output(self.parser_job_id, self.parser_output) + + return output + + def uploads(self, tmp_dir): + + # designed for upload of input files into library structure + + if not self.library: + logging.error("error: Parser().uploads is designed for connecting files " + "into library - no library selected - to use, create Parser with library loaded, e.g., " + "Parser(library=my_library)") + return -1 + + upload_fp = self.library.file_copy_path + + files = os.listdir(tmp_dir) + for x in range(0, len(files)): + safe_name = self.prep_filename(files[x]) + + # exclude any folders + if not os.path.isdir(os.path.join(tmp_dir,files[x])): + shutil.copy(os.path.join(tmp_dir, files[x]), os.path.join(upload_fp, files[x])) + + return len(files) + + def prep_filename(self, fn, secure_name=True, prepend_string=None, postpend_string=None, max_len=None): + + fn_out = fn + + # default - apply basic secure name, e.g., remove / and insert _ + if secure_name: + fn_out= secure_filename(fn) + + # if requested prepend or postpend + if prepend_string: + fn_out= prepend_string + fn_out + + if postpend_string: + fn_base, ext = fn_out.split(".") + fn_out = fn_base + postpend_string + ext + + # if max len applied + if max_len: + if len(fn_out) > max_len: + fn_base, ext = fn_out.split(".") + fn_out = fn_base[0:max_len-len(ext)] + ext + + return fn_out + + def input_ingestion_comparison (self, file_list): + + # simple approach - compares input file_list from ingestion 'work_order' with state of library collection + # --if input file found, then added to 'found_list' -> else, added to 'not_found_list' + + if not self.library: + logging.error("error: 
Parser().input_ingestion_comparison is designed for bulk parsing of files " + "into library - no library selected - to use, create Parser with library loaded, e.g., " + "Parser(library=my_library)") + return -1 + + found_list = [] + not_found_list = [] + + doc_fn_raw_list = CollectionRetrieval(self.library.collection).get_distinct_list("file_source") + + doc_fn_out = [] + for i, file in enumerate(doc_fn_raw_list): + doc_fn_out.append(file.split("/")[-1]) + + for i, input_file in enumerate(file_list): + found_file = -1 + for j, ingested_file in enumerate(doc_fn_out): + + # need to confirm 'symmetrical' transformations, e.g., secure_filename and any prepend/postpend + if input_file == ingested_file: + found_file = 1 + found_list.append(input_file) + break + if found_file == -1: + not_found_list.append(input_file) + + return found_list, not_found_list + + def input_ingestion_comparison_from_parser_state (self, file_list): + + # simple approach - compares input file_list from ingestion 'work_order' with state of library collection + # --if input file found, then added to 'found_list' -> else, added to 'not_found_list' + + doc_fn_out = [] + + for i, doc_fn in enumerate(self.parser_output): + if "file_source" in doc_fn: + if doc_fn["file_source"] not in doc_fn_out: + doc_fn_out.append(doc_fn["file_source"]) + + found_list = [] + not_found_list = [] + + for i, input_file in enumerate(file_list): + found_file = -1 + for j, ingested_file in enumerate(doc_fn_out): + + # need to confirm 'symmetrical' transformations, e.g., secure_filename and any prepend/postpend + if input_file == ingested_file: + found_file = 1 + found_list.append(input_file) + break + if found_file == -1: + not_found_list.append(input_file) + + return found_list, not_found_list + + def parse_one (self, fp, fn, save_history=True): + + # new method for 'ad hoc' 'unbound' parsing of a single document in memory -> no library required + + # check that path exists + if not os.path.exists(os.path.join(fp, fn)): + raise FilePathDoesNotExistException(os.path.join(fp,fn)) + + output = None + + ext = fn.split(".")[-1].lower() + + if ext == "pdf": + output = self.parse_one_pdf(fp, fn, save_history=False) + + if ext in self.office_types: + output = self.parse_one_office(fp, fn, save_history=False) + + if ext in self.text_types: + output = self.parse_one_text(fp, fn, save_history=False) + + if ext in self.voice_types: + output = self.parse_one_voice(fp, fn, save_history=False) + + # no history saved by the individual parsers, as it will be saved below + if save_history: + ParserState().save_parser_output(self.parser_job_id, output) + + return output + + def parse_one_office (self, fp, fn, save_history=True): + + # Designed for 'ad hoc' and 'unbound' quick parse of a single office document with no storage + # --output provided as list of Dicts in memory with same structure as parsing output + + # check that path exists + if not os.path.exists(os.path.join(fp, fn)): + raise FilePathDoesNotExistException(os.path.join(fp,fn)) + + workspace_fp = self.parser_tmp_folder + + if not os.path.exists(workspace_fp): + os.mkdir(workspace_fp) + os.chmod(workspace_fp, 0o777) + + # safety check - will need to improve + expand for supporting windows path + if not workspace_fp.endswith("/"): + workspace_fp += "/" + logging.warning("warning: workspace_fp did not end with trailing '/' as expected by parser") + + # need to update this + for z in range(0, 1): + + if os.path.exists(os.path.join(workspace_fp,str(z))): + shutil.rmtree(os.path.join(workspace_fp,str(z)), 
ignore_errors=True) + + if not os.path.exists(os.path.join(workspace_fp,str(z))): + os.mkdir(os.path.join(workspace_fp,str(z))) + os.chmod(os.path.join(workspace_fp, str(z)), 0o777) + + # end -initialize workspace + + # int add_one_office + # char * input_account_name, + # char * input_library_name, + # char * input_fp, + # char * input_fn, + # char * workspace_fp, + # char * image_fp, + # char * write_to_filename + + main_handler = _mod.add_one_office + main_handler.argtypes = (c_char_p, c_char_p, c_char_p, c_char_p, c_char_p, c_char_p, c_char_p) + + main_handler.restype = c_int + + # three inputs - account_name // library_name // fp to web_dir - files to be processed + # prep each string: account_name = create_string_buffer(py_account_str.encode('ascii','ignore')) + + if not self.account_name: + self.account_name = "llmware" + + account_name = create_string_buffer(self.account_name.encode('ascii', 'ignore')) + library_name = create_string_buffer(self.library_name.encode('ascii', 'ignore')) + + if not fp.endswith("/"): + fp += "/" + + fp_c = create_string_buffer(fp.encode('ascii', 'ignore')) + fn_c = create_string_buffer(fn.encode('ascii', 'ignore')) + + workspace_fp_c = create_string_buffer(workspace_fp.encode('ascii', 'ignore')) + + # image_fp = self.library.image_path + + # will need to fix this - C code expects trailing "/" + # image_fp = self.parser_tmp_folder # + "/" + image_fp = self.parser_image_folder + + if not image_fp.endswith("/"): + image_fp += "/" + logging.warning("warning: adding '/' to image_fp as expected by c parser") + + image_fp_c = create_string_buffer(image_fp.encode('ascii', 'ignore')) + + write_to_filename = "office_internal_test0.txt" + write_to_fn_c = create_string_buffer(write_to_filename.encode('ascii', 'ignore')) + + # int add_one_office + # char * input_account_name, + # char * input_library_name, + # char * input_fp, + # char * input_fn, + # char * workspace_fp, + # char * image_fp, + # char * write_to_filename + + pages_created = main_handler(account_name, library_name, fp_c, fn_c, workspace_fp_c, + image_fp_c, write_to_fn_c) + + # self.library.image_path + output = self.convert_parsing_txt_file_to_json(file_path=self.parser_tmp_folder,fn=write_to_filename) + + if len(output) > 0: + self.parser_output += output + + if save_history: + ParserState().save_parser_output(self.parser_job_id, self.parser_output) + + return output + + def parse_one_pdf (self, fp, fn, save_history=True): + + # check that path exists + if not os.path.exists(os.path.join(fp,fn)): + raise FilePathDoesNotExistException(os.path.join(fp,fn)) + + # c function header - add_one_pdf( + # char * account_name, + # char * library_name, + # char * input_fp, + # char * input_filename, + # char * input_images_fp, + # char * write_to_filename, + # int user_block_size) + + pdf_handler = _mod_pdf.add_one_pdf + + # c function header- + pdf_handler.argtypes = (c_char_p, c_char_p, c_char_p, c_char_p, c_char_p, c_char_p, c_int) + + pdf_handler.restypes = c_int + + # prepare all of the inputs to invoke the c library + + t0 = time.time() + + # config options pulled from the Library object + if not self.account_name: + acct_name = "llmware" + + account_name = create_string_buffer(self.account_name.encode('ascii', 'ignore')) + + library_name = create_string_buffer(self.library_name.encode('ascii', 'ignore')) + + # fp = passed as parameter -> this is the input file path folder containing the .PDF docs to be parsed + + if not fp.endswith("/"): + fp += "/" + + fp_c = create_string_buffer(fp.encode('ascii', 
'ignore')) + + fn_c = create_string_buffer(fn.encode('ascii', 'ignore')) + + # shift output fp to + # image_fp = self.library.image_path + image_fp = self.parser_tmp_folder + if not image_fp.endswith("/"): + image_fp += "/" + + image_fp_c = create_string_buffer(image_fp.encode('ascii', 'ignore')) + + # prep parameters passed in the method invocation above + write_to_filename = "pdf_internal_test0.txt" + write_to_filename_c = create_string_buffer(write_to_filename.encode('ascii','ignore')) + + # pull target block size from library parameters + + user_block_size = c_int(self.block_size_target_characters) # standard 400-600 + + # + # * main call to pdf library * + # + + # c function header - add_one_pdf( + # char * account_name, + # char * library_name, + # char * input_fp, + # char * input_filename, + # char * input_images_fp, + # char * write_to_filename, + # int user_block_size) + + logging.info("update: starting pdf_parser ...") + + pages_created = pdf_handler(account_name, library_name, fp_c, fn_c, image_fp_c, + write_to_filename_c, user_block_size) + + logging.info("update: completed pdf_parser - time taken: %s ", time.time() - t0) + + output = self.convert_parsing_txt_file_to_json(file_path=self.parser_tmp_folder,fn=write_to_filename) + + if len(output) > 0: + self.parser_output += output + + if save_history: + ParserState().save_parser_output(self.parser_job_id, self.parser_output) + + return output + + def parse_one_pdf_by_ocr_images(self, input_fp, input_fn, save_history=True): + + # check that path exists + if not os.path.exists(os.path.join(input_fp, input_fn)): + raise FilePathDoesNotExistException(os.path.join(input_fp,input_fn)) + + # Designed for parse of a single PDF_BY_OCR - no storage, no link into Library + # --output returned as in-memory list of Dicts + + # set counters + output = [] + doc_id = 0 + + ext = input_fn.split(".")[-1] + + if ext == "pdf": + + doc_fn = secure_filename(input_fn) + + output_by_page = ImageParser(self).process_pdf_by_ocr(input_fp, input_fn) + + meta = {"author": "", "modified_date": "", "created_date": "", "creator_tool": ""} + coords_dict = {"coords_x": 0, "coords_y": 0, "coords_cx": 0, "coords_cy": 0} + + counter = 0 + for i, pages in enumerate(output_by_page): + for j, blocks in enumerate(pages): + + new_entry = ("text", "pdf-ocr", (j+1, 0), counter, "", "", doc_fn, "", blocks, "", + "", blocks, blocks, "", blocks, "", "", "", "", "") + + # creates a single 'unbound' parsing output dict -> no storage + parsing_output_dict = self.create_one_parsing_output_dict(counter, + new_entry, meta, coords_dict, + dialog_value="false") + + output.append(parsing_output_dict) + self.parser_output.append(parsing_output_dict) + + counter += 1 + + if save_history: + ParserState().save_parser_output(self.parser_job_id, self.parser_output) + + return output + + def parse_one_image(self, input_fp, input_fn, save_history=True): + + # Designed to parse a single image using OCR - no storage or link to library + + # check that path exists + if not os.path.exists(os.path.join(input_fp, input_fn)): + raise FilePathDoesNotExistException(os.path.join(input_fp,input_fn)) + + # set counters + output= [] + counter = 0 + ext = input_fn.split(".")[-1].lower() + + if ext in self.ocr_types: + + doc_fn = secure_filename(input_fn) + ocr_output = ImageParser(self).process_ocr(input_fp, input_fn) + + meta = {"author": "", "modified_date": "", "created_date": "", "creator_tool": ""} + coords_dict = {"coords_x": 0, "coords_y": 0, "coords_cx": 0, "coords_cy": 0} + + for j, blocks in 
enumerate(ocr_output): + + new_entry = ("text", "pdf-ocr", (1, 0), counter, "", "", doc_fn, "", blocks, "", + "", blocks, blocks, "", blocks, "", "", "", "", "") + + # creates a single 'unbound' parsing output dict -> no storage + parsing_output_dict = self.create_one_parsing_output_dict(counter, new_entry, meta, coords_dict, + dialog_value="false") + + output.append(parsing_output_dict) + self.parser_output.append(parsing_output_dict) + + counter += 1 + + if save_history: + ParserState().save_parser_output(self.parser_job_id, output) + + return output + + def parse_one_text(self, input_fp, input_fn, save_history=True): + + # Designed as single document parse with no storage or linkage into library + + # check that path exists + if not os.path.exists(os.path.join(input_fp, input_fn)): + raise FilePathDoesNotExistException(os.path.join(input_fp,input_fn)) + + # set counters + output = [] + content_type = "text" + parser_output = [] + counter = 0 + + file_type = input_fn.split(".")[-1].lower() + + if file_type not in self.text_types: + return output + + # sub-routing by type of text file to appropriate handler + + if file_type in ["txt"]: + # will parse as text + parser_output = TextParser(self).text_file_handler (input_fp, input_fn) + content_type = "text" + file_type = "txt" + + if file_type.lower() in ["csv"]: + # will parse as table + interpret_as_table=True + parser_output = TextParser(self).csv_file_handler(input_fp, input_fn, interpret_as_table=True) + content_type = "text" + file_type = "csv" + if interpret_as_table: + content_type = "table" + + if file_type.lower() in ["json","jsonl"]: + # will parse each line item as separate entry + + interpret_as_table=False + keys = ["text"] + parser_output = TextParser(self).jsonl_file_handler(input_fp,input_fn, + key_list=keys, + interpret_as_table=interpret_as_table, + separator="\n") + content_type = "text" + file_type = "jsonl" + if interpret_as_table: + content_type = "table" + + # consolidate output + meta = {"author": "", "modified_date": "", "created_date": "", "creator_tool": ""} + coords_dict = {"coords_x": 0, "coords_y": 0, "coords_cx": 0, "coords_cy": 0} + + for j, blocks in enumerate(parser_output): + + if content_type == "text": + new_entry = ("text", file_type, (1, 0), counter, "", "", input_fn, "", blocks, "", + "", blocks, blocks, "", blocks, "", "", "", "", "") + else: + # could be table if csv file -> in this case, keep both text [11] and table [7] + new_entry = ("table", file_type, (1, 0), counter, "", "", input_fn, blocks, blocks, "", + "", blocks, blocks, "", blocks, "", "", "", "", "") + + # creates a single 'unbound' parsing output dict -> no storage + parsing_output_dict = self.create_one_parsing_output_dict(counter, + new_entry, meta, coords_dict, + dialog_value="false") + + output.append(parsing_output_dict) + self.parser_output.append(parsing_output_dict) + + counter += 1 + + if save_history: + ParserState().save_parser_output(self.parser_job_id, self.parser_output) + + return output + + def parse_one_dialog(self, input_fp, input_fn, save_history=True): + + # Designed as single dialog parse - no storage or link to library + # --note: only supports AWS dialog standard for now + + # check that path exists + if not os.path.exists(os.path.join(input_fp, input_fn)): + raise FilePathDoesNotExistException(os.path.join(input_fp, input_fn)) + + # set counters + counter = 0 + output = [] + + ext = input_fn.split(".")[-1].lower() + + if ext == ".json": + + output = DialogParser(self).parse_aws_json_file_format(input_fp, 
input_fn) + + for i, blocks in enumerate(output): + + # iterate thru each block -> add to metadata + speaker_name = blocks["speaker_name"] + + meta = {"author": speaker_name, "modified_date": "", "created_date": "", "creator_tool": ""} + + coords_dict = {"coords_x": blocks["start_time"], + "coords_y": blocks["stop_time"], + "coords_cx": 0, + "coords_cy": 0} + + text_entry = blocks["text"] + + # conforming file format with full path of dialog intake path + + format_type = "aws_json" + + new_entry = ("text", format_type, (1, 0), counter, "", "", input_fn, + text_entry, text_entry, "", "", text_entry, text_entry, "", text_entry, + "", "", "", "", "") + + # creates a single 'unbound' parsing output dict -> no storage + parsing_output_dict = self.create_one_parsing_output_dict(counter, + new_entry, meta, coords_dict, + dialog_value="true") + + output.append(parsing_output_dict) + self.parser_output.append(parsing_output_dict) + counter += 1 + + if save_history: + ParserState().save_parser_output(self.parser_job_id, self.parser_output) + + return output + + def parse_one_voice(self, input_fp, input_fn, save_history=True): + + # Designed to parse a single WAV/voice file - no storage or linkage to library + + # check that path exists + if not os.path.exists(os.path.join(input_fp, input_fn)): + raise FilePathDoesNotExistException(os.path.join(input_fp,input_fn)) + + # set counters + counter = 0 + output = [] + + ext = input_fn.split(".")[-1].lower() + + if ext in self.voice_types: + + parser_output = VoiceParser(self).add_voice_file(input_fp, input_fn) + + meta = {"author": "", "modified_date": "", "created_date": "", "creator_tool": ""} + coords_dict = {"coords_x": 0, "coords_y": 0, "coords_cx": 0, "coords_cy": 0} + + for j, blocks in enumerate(parser_output): + + new_entry = ("text", "ocr-wav", (1, 0), counter, "", "", input_fn, "", blocks, "", + "", blocks, blocks, "", blocks, "", "", "", "", "") + + # creates a single 'unbound' parsing output dict -> no storage + parsing_output_dict = self.create_one_parsing_output_dict(counter, + new_entry, meta, coords_dict, + dialog_value="false") + + output.append(parsing_output_dict) + self.parser_output.append(parsing_output_dict) + + if save_history: + ParserState().save_parser_output(self.parser_job_id, self.parser_output) + + return output + + def query_parser_state(self, query, results=None, remove_stop_words=True): + + if not results: + results = self.parser_output + + output = Utilities().fast_search_dicts(query,results, text_key="text",remove_stop_words=remove_stop_words) + + return output + + +class WebSiteParser: + + def __init__(self, url_or_fp, link="/", save_images=True, reset_img_folder=False, local_file_path=None, + from_file=False, text_only=False): + + # by default, assume that url_or_fp is a url path + self.url_main = url_or_fp + + # by default, will get images and links + self.text_only = text_only + + # by passing link - provides option for recursive calls to website for internal links + if link == "/": + self.url_link = "" + else: + self.url_link = link + + self.url_base = self.url_main + self.url_link + + # check for llmware path & create if not already set up + if not os.path.exists(LLMWareConfig.get_llmware_path()): + # if not explicitly set up by user, then create folder directory structure + LLMWareConfig.setup_llmware_workspace() + + if not local_file_path: + # need to update this path + self.local_dir = os.path.join(LLMWareConfig.get_llmware_path(),"process_website/") + else: + self.local_dir = local_file_path + + if 
reset_img_folder: + if os.path.exists(self.local_dir): + # important step to remove & clean out any old artifacts in the /tmp/ directory + shutil.rmtree(self.local_dir) + os.makedirs(self.local_dir, exist_ok=True) + + if not os.path.exists(self.local_dir): + os.makedirs(self.local_dir, exist_ok=True) + + if from_file: + # interpret url as file_path and file_name + try: + html = open(url_or_fp, encoding='utf-8', errors='ignore').read() + bs = BeautifulSoup(html, features="lxml") + self.html = bs.findAll() + success_code = 1 + self.text_only = True + except: + logging.error("error: WebSite parser- could not find html file to parse at %s ", url_or_fp) + success_code = -1 + self.text_only = True + else: + # this is the most likely default case -interpret url_or_fp as url + try: + req = Request(self.url_base, headers={'User-Agent': 'Mozilla/5.0'}) + html = urlopen(req).read() + + bs = BeautifulSoup(html,features="lxml") + + self.bs = bs + self.html = bs.findAll() + + out_str = "" + for x in self.html: + out_str += str(x) + " " + + with open(self.local_dir + "my_website.html", "w") as f: + f.write(out_str) + f.close() + + success_code = 1 + + except Exception as e: + logging.error("error: website_parser could not find website to open - caught error - %s ", e) + success_code = -1 + + self.save_images = save_images + self.image_counter = 0 + self.links = [] + self.mc = None + self.entries = None + self.core_index = [] + self.header_text = [] + self.internal_links = [] + self.external_links = [] + self.other_links = [] + + # meta-data expected in library add process + self.source = str(self.url_base) + self.success_code = success_code + + def website_main_processor(self, img_start, output_index=True): + + output = [] + counter = 0 + # by passing img_start explicitly- enables recursive calls to links/children sites + img_counter = img_start + + long_running_string = "" + + # new all_text to remove duplications + all_text = [] + + internal_links = [] + external_links = [] + header_text = [] + unique_text_list = [] + unique_header_list = [] + + last_text = "" + last_header = "" + + text = "" + + for elements in self.html: + + content_found = 0 + img = "" + img_success = 0 + img_url = "" + img_name = "" + + link = "" + link_type = "" + + # text = "" + + entry_type = "text" + + # if text only, then skip checks for images and links + if not self.text_only: + + if "property" in elements.attrs: + if elements.attrs["property"] == "og:image": + if "content" in elements.attrs: + + img_extension = elements["content"] + img_success, img, img_url, img_name = \ + self.image_handler(img_extension, elements, img_counter) + + if img_success == 1: + img_counter += 1 + content_found += 1 + + if "src" in elements.attrs: + + img_extension = elements["src"] + img_success, img, img_url, img_name = self.image_handler(img_extension, elements, img_counter) + + if img_success == 1: + img_counter += 1 + content_found += 1 + + if "href" in elements.attrs: + + if elements.attrs["href"]: + link_success, link, link_type = self.link_handler(elements) + content_found += 1 + + if link_success == 0: + # skip .js files and other formatting in link crawling + # link_success == 0 if not .js // ==1 if .js file + + if link_type == "internal": + if link != "/": + if link not in internal_links: + internal_links.append(link) + + if link_type == "external": + external_links.append(link) + + # main check for text + if elements.get_text(): + get_text = 1 + + if "type" in elements.attrs: + # skip css and javascript + if elements.attrs["type"] == 
"text/css" or elements.attrs["type"] == "text/javascript": + get_text = -1 + + if get_text == 1: + + # text handler + s_out = "" + + # alt for consideration to clean up string + # s_out += string.replace('\n', ' ').replace('\r', ' ').replace('\xa0', ' ').replace('\t', ' ') + + for string in elements.stripped_strings: + s_out += string + " " + + text += s_out + + if text: + header_entry = [] + + if text not in unique_text_list: + unique_text_list.append(text) + content_found += 1 + long_running_string += text + " " + last_text = text + + if "h1" in elements.name: + header_entry = (counter, "h1", text) + + if "h2" in elements.name: + header_entry = (counter, "h2", text) + + if "h3" in elements.name: + header_entry = (counter, "h3", text) + + if header_entry: + if text not in unique_header_list: + last_header = text + header_text.append(header_entry) + unique_header_list.append(text) + + # if looking for images and links, then prioritize in attribution + if not self.text_only: + if img and img_success == 1: + entry_type = "image" + else: + if link: + entry_type = "link" + else: + if text: + entry_type = "text" + else: + content_found = 0 + else: + entry_type = "text" + + if content_found > 0: + master_index = (self.url_main, self.url_link, counter) + if not text: + text = last_text + + entry = {"content_type": entry_type, + "text": text, + "image": {"image_name": img_name, "image_url": img_url}, + "link": {"link_type": link_type, "link": link}, + "master_index": master_index, + "last_header": last_header} + + # entry = (entry_type, text, (img_name, img_url), (link_type, link), master_index, last_header) + + counter += 1 + # save entry if image, or if (A) text > 50 and (B) not a dupe + if entry_type == "image" or (len(text) > 50 and text not in all_text): + output.append(entry) + all_text.append(text) + text = "" + + self.image_counter = img_counter + self.internal_links = internal_links + self.external_links = external_links + self.header_text = header_text + + if header_text: + header_text_sorted = sorted(header_text, key=lambda x: x[1]) + self.header_text = header_text_sorted + + self.core_index = output + self.entries = len(output) + + if not output_index: + return len(output), img_counter + + return self.core_index + + def link_handler(self, elements): + + link_out = "" + link_type = "" + js_skip = 0 + + if elements.attrs["href"].endswith(".js"): + link_out = elements.attrs["href"] + link_type = "js" + js_skip = 1 + + if elements.attrs["href"].endswith(".ico") or elements.attrs["href"].endswith(".ttf"): + link_out = elements.attrs["href"] + link_type = "other_formatting" + js_skip = 1 + + if elements.attrs["href"].endswith(".css"): + link_out = elements.attrs["href"] + link_type = "css" + js_skip = 1 + + if elements.attrs["href"].startswith(self.url_base): + # save relative link only + link_out = elements.attrs["href"][len(self.url_base):] + link_type = "internal" + + if str(elements.attrs["href"])[0] == "/": + # relative link + if elements.attrs["href"]: + if not elements.attrs["href"].startswith("//"): + link_out = elements.attrs["href"] + link_type = "internal" + + if elements.attrs["href"].startswith("https://") and \ + not elements.attrs["href"].startswith(self.url_base): + # website but not the url_base - external link + link_out = elements.attrs["href"] + link_type = "external" + + return js_skip, link_out, link_type + + def image_handler(self, img_extension, elements, img_counter): + + success = -1 + img_raw = [] + image_name = "" + full_url = "" + + try: + img_raw, response_code, 
full_url = self._request_image(img_extension, elements) + + if response_code == 200: + + if self.save_images: + + # need to capture img type, e.g., .jpg + img_type = "" + if img_extension.endswith("png"): img_type = "png" + if img_extension.endswith("jpg") or img_extension.endswith("jpeg"): img_type = "jpg" + if img_extension.endswith("tiff"): img_type = "tiff" + if img_extension.endswith("svg"): img_type = "svg" + + # secondary check if not at end - break off at '?' query string + if img_type == "": + original_img_name = img_extension.split("/")[-1] + original_img_name = original_img_name.split("?")[0] + if original_img_name.endswith("png"): img_type = "png" + if original_img_name.endswith("jpg") or img_extension.endswith("jpeg"): img_type = "jpg" + if original_img_name.endswith("tiff"): img_type = "tiff" + if original_img_name.endswith("svg"): img_type = "svg" + + # only save image if valid img format found + if img_type in ("png", "jpg", "svg", "tiff"): + image_name = "image{}.{}".format(img_counter, img_type) + fp = self.local_dir + image_name + s = self._save_image(img_raw, fp) + success = 1 + + else: + logging.info("update: WebSite - found image OK but could not " + "figure out img type: %s ", img_extension) + + except: + logging.info("warning: WebSite - could not retrieve potential image: %s ", elements.attrs["src"]) + success = -1 + + return success, img_raw, full_url, image_name + + # called in two different places + def _save_image(self, img_raw, fp): + + with open(fp, 'wb') as f: + img_raw.decode_content = True + shutil.copyfileobj(img_raw, f) + + return 0 + + def _save_image_website(self, fp, img_num, doc_id, save_file_path): + + # internal method to save image files and track counters + + img_type = img_num.split(".")[-1] + img_core = img_num[len("image"):].split(".")[0] + + # image name of format: image{{doc_ID}}_{{img_num}}.png + new_img_name = "image" + str(doc_id) + "_" + str(img_core) + "." + img_type + # new_img_name = "image" + str(library.image_ID) + "." 
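A hypothetical usage sketch for the WebSiteParser above: fetch a page, build the core index with website_main_processor, and read the entry dictionaries it produces. The URL is a placeholder, and text_only=True skips the image and link handling:

from llmware.parsers import WebSiteParser

site = WebSiteParser("https://example.com", text_only=True, save_images=False)

if site.success_code == 1:
    entries = site.website_main_processor(img_start=0)
    for entry in entries:
        if entry["content_type"] == "text":
            print(entry["master_index"], entry["last_header"], entry["text"][:60])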
+ img_type + created = 0 + + img = open(os.path.join(fp,img_num), "rb").read() + if img: + f = open(os.path.join(save_file_path,new_img_name), "wb") + f.write(img) + f.close() + created += 1 + + return new_img_name, created + + # called by main handler + def _request_image(self, img_extension, img): + + # relative link - refers back to main index page + # check if url_main gives better performance than .url_base + + url_base = self.url_main + # url_ext = img.attrs['src'] + url_ext = img_extension + + full_url = url_ext + + if url_ext: + if url_ext.startswith("https:"): + # this is an external link - just use the source + full_url = url_ext + + if url_ext.startswith("/"): + # relative ID - add url_base to get img + + full_url = url_base + url_ext + + r = requests.get(full_url, stream=True, headers={'User-Agent': 'Mozilla/5.0'}) + + return r.raw, r.status_code, full_url + + # not called by the main handler - keep as direct callable method + def get_all_links(self): + + internal_links = [] + external_links = [] + other_links = [] + js_links = [] + + for content in self.html: + + found = 0 + js = 0 + + if "href" in content.attrs: + if content.attrs["href"]: + + if content.attrs["href"].endswith(".js"): + js_links.append(content.attrs["href"]) + js = 1 + + if content.attrs["href"].startswith(self.url_base): + # save relative link only + out = content.attrs["href"][len(self.url_base):] + internal_links.append(out) + found = 1 + + if str(content.attrs["href"])[0] == "/": + # relative link + out = content.attrs["href"] + if out: + # skip double // + if not out.startswith("//"): + internal_links.append(out) + found = 1 + + if content.attrs["href"].startswith("https://") and \ + not content.attrs["href"].startswith(self.url_base): + # website but not the url_base - external link + out = content.attrs["href"] + external_links.append(out) + found = 1 + + if found == 0: + other_links.append(content.attrs["href"]) + + self.internal_links = internal_links + self.external_links = external_links + self.other_links = other_links + + top_links = [] + + for z in range(0, len(internal_links)): + + link_tokens = internal_links[z].split("/") + for y in range(0, len(self.mc)): + if self.mc[y][0].lower() in link_tokens: + if internal_links[z] not in top_links: + top_links.append(internal_links[z]) + break + + link_results = {"internal_links": internal_links, "external_links": external_links, + "other_links": other_links, "top_links": top_links} + + return link_results + + # not called by main handler - keep as separate standalone method + def get_all_img(self, save_dir): + + counter = 0 + for content in self.html: + counter += 1 + if "src" in content.attrs: + if str(content).startswith(" will need to test/experiment + text_out = self.voice_to_text(input_fp, fn, 16000) + + # will chop up the long text into individual blocks + text_chunks = TextChunker(text_chunk=text_out, + max_char_size=self.text_chunk_size, + look_back_char_range=self.look_back_range).convert_text_to_chunks() + + return text_chunks + + +class TextParser: + + def __init__(self, parser=None, library=None, text_chunk_size=600, look_back_range=300): + + self.parser = parser + + # defaults + self.text_chunk_size = text_chunk_size + self.look_back_range = look_back_range + + if library: + self.text_chunk_size = library.block_size_target_characters + 200 + self.look_back_range = 300 + + if parser and not library: + if parser.library: + self.text_chunk_size = library.block_size_target_characters + 200 + self.look_back_range = 300 + + def 
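The image path through _request_image and _save_image above streams the response body and copies it to disk. A minimal standalone sketch of that pattern, with a placeholder URL and output path:

import shutil
import requests

r = requests.get("https://example.com/logo.png", stream=True,
                 headers={'User-Agent': 'Mozilla/5.0'})
if r.status_code == 200:
    r.raw.decode_content = True
    with open("/tmp/image0.png", "wb") as f:
        shutil.copyfileobj(r.raw, f)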
jsonl_file_handler (self, dir_fp,sample_file, key_list=None, interpret_as_table=False, + separator="\n"): + + # will extract each line in jsonl as separate sample + # --based on key_list and interpret_as_table + + output = [] + my_file = open(os.path.join(dir_fp, sample_file), 'r', encoding='utf-8') + + if not key_list: + # as default, if no key_list, then look for "text" attribute in jsonl by default + key_list = ["text"] + + for i, lines in enumerate(my_file): + + row_tmp = json.loads(lines) + + if not interpret_as_table: + row_text = "" + for keys in key_list: + if keys in row_tmp: + row_text += row_tmp[keys] + separator + output.append(row_text) + + else: + row_table = [] + for keys in key_list: + if keys in row_tmp: + row_table.append(keys) + output.append(row_table) + + return output + + def text_file_handler (self, dir_fp, sample_file): + + text_out = open(os.path.join(dir_fp,sample_file), encoding='utf-8', errors='ignore').read() + + # will chop up the long text into individual text chunks + text_chunks = TextChunker(text_chunk=text_out, + max_char_size=self.text_chunk_size, + look_back_char_range=self.look_back_range).convert_text_to_chunks() + + return text_chunks + + def csv_file_handler (self, dir_fp,sample_file, max_rows=100, interpret_as_table=True): + + if interpret_as_table: + + # will split the table by rows and columns (\n for rows and ',' for cells in row) + t = Utilities().file_load(os.path.join(dir_fp,sample_file)) + tables_out= [] + + if len(t) < max_rows: + tables_out = [t] + else: + table_chunks = len(t) // max_rows + if max_rows > table_chunks * len(t): + # there is a remainder, so create one additional partial chunk with last set of rows + table_chunks += 1 + starter = 0 + stopper = 0 + for x in range(0,table_chunks): + starter = starter + stopper + stopper = starter + min(len(t)-starter, max_rows) + tables_out.append(t[starter:stopper]) + + return tables_out + + else: + # chunk and split as a big piece of text + raw_csv = open(os.path.join(dir_fp,sample_file), encoding='utf-8', errors='ignore').read() + # replace ',' & '\n' & '\r' with spaces + text_out = re.sub("[,\n\r]", " ", raw_csv) + + # will chop up the long text into individual text chunks + text_chunks = TextChunker(text_chunk=text_out, + max_char_size=self.text_chunk_size, + look_back_char_range=self.look_back_range).convert_text_to_chunks() + + return text_chunks + + +class WikiParser: + + def __init__(self, parser=None, library=None, text_chunk_size=600, look_back_range=300): + + self.wiki = WikiKnowledgeBase() + + self.parser = parser + self.library = library + + self.text_chunk_size = text_chunk_size + self.look_back_range = look_back_range + + if library: + self.text_chunk_size = self.library.block_size_target_characters + 200 + self.look_back_range = 300 + + if parser and not library: + if parser.library: + self.text_chunk_size = parser.library.block_size_target_characters + 200 + self.look_back_range = 300 + + def add_wiki_topic(self, topic, target_results=10): + + # used in both Parser / Library, as well as directly in Prompts (integrate as "Source" into Prompt) + + articles_output = [] + text_only = "" + blocks = [] + topic_query_results = self.wiki.search_wikipedia(topic,result_count=target_results, suggestion=False) + + text_chunks_all = [] + + for j, title in enumerate(topic_query_results): + article = self.wiki.get_article(title["title"]) + article.update({"topic": topic}) + articles_output.append(article) + + text_chunks = TextChunker(text_chunk=article["text"], + 
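A hypothetical usage sketch for the TextParser handlers above; the directory and file names are placeholders:

from llmware.parsers import TextParser

tp = TextParser(text_chunk_size=600, look_back_range=300)

chunks = tp.text_file_handler("/path/to/files", "report.txt")                 # list of text chunks
tables = tp.csv_file_handler("/path/to/files", "table.csv",
                             max_rows=50, interpret_as_table=True)            # list of row batches
rows = tp.jsonl_file_handler("/path/to/files", "data.jsonl", key_list=["text"])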
max_char_size=self.text_chunk_size, + look_back_char_range=self.look_back_range).convert_text_to_chunks() + + for i, chunk in enumerate(text_chunks): + new_block = {"file_source": title["title"], "page_num": max(1, i // 5), "text": chunk} + blocks.append(new_block) + + text_chunks_all += text_chunks + + topic_results = {"search_results": topic_query_results, "articles": articles_output, + "text_chunks": text_chunks_all, "blocks": blocks} + + return topic_results + + +class DialogParser: + + def __init__(self, parser=None, library=None, text_chunk_size=600, look_back_range=300): + + self.parser = parser + self.library = library + + self.text_chunk_size = text_chunk_size + self.look_back_range = look_back_range + + if library: + self.text_chunk_size = self.library.block_size_target_characters + 200 + self.look_back_range = 300 + + if parser and not library: + if parser.library: + self.text_chunk_size = parser.library.block_size_target_characters + 200 + self.look_back_range = 300 + + # currently only has support for AWS dialog format + self.supported_format_types = ["aws"] + + # map to aws transcript json output format + def parse_aws_json_file_format(self, input_folder, fn_json): + + f = json.load(open(os.path.join(input_folder, fn_json), "r")) + + # aws standard call transcript format: ["results"]["items"] -> key conversation elements to aggregate + # note: we will need many more documents for testing + # --possible that AWS call transcript has different formats and/or has evolved over time! + + block_output = [] + + # quick format check - will need to enhance over time + + format_validated = False + + if "results" in f: + if "items" in f["results"]: + format_validated = True + + # improve validation of format + user message back with link to AWS documents + if not format_validated: + logging.error("error: DialogParser currently only supports AWS Transcribe dialog format - For more " + "information, please see Amazon Web Services Transcription - " + "https://docs.aws.amazon.com/transcribe/latest/dg/how-input.html#how-it-works-output ") + + return block_output + + # end - quick format check + + # speaker label conversation snippets + conversation_snippets = f["results"]["items"] + + if len(conversation_snippets) == 0: + # no results to parse + logging.error("error: unexpected - AWS JSON dialog transcript empty") + return block_output + + text= "" + current_speaker = "spk_0" + start_time = float(0) + end_time = float(0) + + for i, items in enumerate(conversation_snippets): + + if i == 0: + current_speaker = items["speaker_label"] + start_time = float(items["start_time"]) + end_time = float(items["end_time"]) + # initialize text with the first word + text="" + if "alternatives" in items: + if "content" in items["alternatives"][0]: + text = items["alternatives"][0]["content"] + + else: + # general case after first snippet + new_block = False + + # if found switch in speakers - write block and re-set + if "speaker_label" in items: + if items["speaker_label"] != current_speaker: + + new_block = True + + new_entry = {"speaker_name": current_speaker, + "speaker_id": current_speaker, "text": text, + "start_time": start_time, "stop_time": end_time} + + block_output.append(new_entry) + current_speaker = items["speaker_label"] + start_time = float(items["start_time"]) + end_time = float(items["end_time"]) + # re-initialize text with the first word of the new speaker + text = "" + if "alternatives" in items: + if "content" in items["alternatives"][0]: + text = items["alternatives"][0]["content"] + + if 
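add_wiki_topic above pulls articles for a topic and chunks them into blocks keyed by article title and an approximate page number. A hypothetical usage sketch (requires network access to Wikipedia; the topic is a placeholder):

from llmware.parsers import WikiParser

wiki = WikiParser(text_chunk_size=600)
topic_results = wiki.add_wiki_topic("solid state batteries", target_results=3)

for block in topic_results["blocks"]:
    print(block["file_source"], block["page_num"], block["text"][:60])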
not new_block: + if "alternatives" in items: + if "content" in items["alternatives"][0]: + if items["type"] == "punctuation": + text += items["alternatives"][0]["content"] + else: + # general case - type = "pronunciation" [insert space] + text += " " + items["alternatives"][0]["content"] + + if "end_time" in items: + end_time = float(items["end_time"]) + + # pick up the last block, if any + if text: + new_entry = {"speaker_name": current_speaker, "speaker_id": current_speaker, "text": text, + "start_time": start_time, "stop_time": end_time} + block_output.append(new_entry) + + return block_output + diff --git a/llmware/prompts.py b/llmware/prompts.py new file mode 100644 index 00000000..4a801587 --- /dev/null +++ b/llmware/prompts.py @@ -0,0 +1,1752 @@ + +# Copyright 2023 llmware + +# Licensed under the Apache License, Version 2.0 (the "License"); you +# may not use this file except in compliance with the License. You +# may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. + + +from bson import ObjectId +import statistics +from collections import Counter +import re +import time +import logging +import os + +from llmware.util import Utilities, CorpTokenizer, PromptCatalog, YFinance, Graph +from llmware.resources import PromptState +from llmware.models import ModelCatalog +from llmware.parsers import Parser +from llmware.retrieval import Query +from llmware.library import Library +from llmware.exceptions import LibraryObjectNotFoundException, PromptNotInCatalogException + + +class Prompt: + + def __init__(self, llm_name=None, tokenizer=None, model_card=None, library=None, account_name="llmware", + prompt_id=None, save_state=False, llm_api_key=None, llm_model=None, from_hf=False, + prompt_catalog=None): + + self.account_name = account_name + self.library = library + + # model specific attributes + self.model_card = model_card + self.tokenizer = tokenizer + + self.llm_model = None + self.llm_model_api_key = llm_api_key + self.llm_name = llm_name + + if from_hf and llm_model: + self.llm_model = ModelCatalog().load_hf_generative_model(llm_model, tokenizer) + # print("update: loading HF Generative model - ", self.llm_model) + + """ + if llm_model: + # intended for seamless passing of an in-memory HF Generative Model + self.llm_model = llm_model + """ + + if llm_name: + self.llm_model = ModelCatalog().load_model(llm_name, api_key=llm_api_key) + + # default batch size, assuming all LLMs have min 2048 full context (50% in / 50% out) + self.context_window_size = 1000 + + if model_card: + + if "context_window" in model_card: + self.context_window_size = int(0.5 * model_card["context_window"]) + + if "model_name" in model_card: + self.llm_model = ModelCatalog().load_model(model_card["model_name"], api_key=llm_api_key) + + if not tokenizer: + self.tokenizer = Utilities().get_default_tokenizer() + else: + self.tokenizer = tokenizer + + # if model card is passed, it will be used to derive batch sizes for packaging evidence + self.model_card = model_card + + # inference parameters + self.temperature = 0.5 + self.prompt_type = "" + self.llm_max_output_len = 200 + + # state attributes + if prompt_id: + PromptState(self).load_state(prompt_id) + self.prompt_id = 
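parse_aws_json_file_format above aggregates AWS Transcribe items into per-speaker blocks with start and stop times. A hypothetical usage sketch with a placeholder transcript file:

from llmware.parsers import DialogParser

dp = DialogParser()
turns = dp.parse_aws_json_file_format("/path/to/transcripts", "call_transcript.json")

for t in turns:
    print(f'{t["speaker_name"]} [{t["start_time"]}-{t["stop_time"]}]: {t["text"][:60]}')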
prompt_id + else: + new_prompt_id = PromptState(self).issue_new_prompt_id() + self.prompt_id = PromptState(self).initiate_new_state_session(new_prompt_id) + + logging.info(f"update: creating new prompt id - {new_prompt_id}") + + self.save_prompt_state = save_state + + # interaction_history is the main running 'active' tracker of current prompt history + # interaction_history is added by each 'register' invocation + # interaction_history can also be pulled from PromptState, or from database lookup + + self.interaction_history = [] + + # dialog tracker is an extract from the interaction history, consisting of running series of tuples: + # --"prompt" & "llm_response" response + + self.dialog_tracker = [] + + self.llm_state_vars = ["llm_response", "prompt", + "instruction", "usage", "time_stamp", "calling_app_ID", "account_name", + "prompt_id", "batch_id", "event_type", + # source/evidence + "evidence", "evidence_metadata", "biblio" + # fact-checking + "source_review", "comparison_stats", "fact_check", + # human-in-the-loop feedback + "human_feedback","human_assessed_accuracy", "human_rating", "change_log"] + + # prompt catalog options + if prompt_catalog: + self.pc = prompt_catalog + # print("update: loading custom prompt catalog") + + else: + self.pc = PromptCatalog() + + self.prompt_catalog = self.pc.get_all_prompts() + + # source materials - available for all prompts, passed as 'context' + # this is a 'stateful' list that aggregates and tracks all of the source materials added to the prompt + # each list entry consists of a dict with keys - "batch_id" | "text" | "batch_metadata" | "batch_stats" + # --batch_metadata is a list of metadata for each 'sub-source' integrated into the batch + # --batch_stats is a sub-list that tracks that # of elements in the batch_metadata + + self.source_materials = [] + + self.batch_separator = "\n" + + """ + if self.llm: + self.batch_separator = self.llm.separator + """ + + self.query_results = None + + def load_model(self, gen_model,api_key=None): + + if api_key: + self.llm_model_api_key = api_key + + self.llm_model = ModelCatalog().load_model(gen_model, api_key=self.llm_model_api_key) + self.llm_name = gen_model + + return self + + def set_inference_parameters(self, temperature=0.5, llm_max_output_len=200): + self.temperature = temperature + self.llm_max_output_len = llm_max_output_len + return self + + def get_current_history(self, key_list=None): + + # will return selected state vars from current prompt session, based on key list + + if not key_list: + key_list = self.llm_state_vars + + output_dict = {} + for i, keys in enumerate(key_list): + output_dict.update({keys: []}) + for j, entries in enumerate(self.interaction_history): + if keys in entries: + output_dict[keys].append(entries[keys]) + + return output_dict + + def clear_history(self): + # removes elements from interaction history + self.interaction_history = [] + self.dialog_tracker = [] + return self + + def clear_source_materials(self): + self.source_materials = [] + return self + + def register_llm_inference (self, ai_dict, prompt_id=None, trx_dict=None): + + if not prompt_id: + prompt_id = self.prompt_id + + # update elements from interaction + ai_dict.update({"prompt_id": prompt_id}) + ai_dict.update({"event_type": "inference"}) + ai_dict.update({"human_feedback": ""}) + ai_dict.update({"human_assessed_accuracy": ""}) + + # if trx_dict passed -> append key/value pairs into ai_dict + if isinstance(trx_dict, dict): + for key,value in trx_dict.items(): + ai_dict.update({key:value}) + + # 
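A hypothetical setup sketch for the Prompt class above: load a generative model and set inference parameters. The model name and API key are placeholders:

from llmware.prompts import Prompt

prompter = Prompt(save_state=True).load_model("claude-instant-v1", api_key="my_api_key")
prompter.set_inference_parameters(temperature=0.3, llm_max_output_len=300)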
captures new interaction into the interaction history + logging.info("update: ai_dict getting registered - %s", ai_dict["event_type"]) + + PromptState(self).register_interaction(ai_dict) + new_dialog = {"user": ai_dict["prompt"], "bot": ai_dict["llm_response"]} + self.dialog_tracker.append(new_dialog) + + return ai_dict + + def lookup_llm_trx_all (self): + ai_trx_list = PromptState(self).full_history() + return ai_trx_list + + def load_state(self, prompt_id, clear_current_state=True): + PromptState(self).load_state(prompt_id,clear_current_state=clear_current_state) + for entries in self.interaction_history: + self.dialog_tracker.append({"user": entries["prompt"], "bot": entries["llm_response"]}) + + return self + + def save_state(self): + PromptState(self).save_state(self.prompt_id) + return self + + def lookup_by_prompt_id (self, prompt_id): + ai_trx_list = PromptState(self).lookup_by_prompt_id(prompt_id) + return ai_trx_list + + def lookup_ai_trx_with_filter(self, filter_dict): + ai_trx_list = PromptState(self).lookup_prompt_with_filter(filter_dict) + return ai_trx_list + + # prepare sources + + def add_source_new_query(self, library, query=None, query_type="semantic", result_count=10): + + # step 1 - run selected query against library + query_results = Query(library).query(query,query_type=query_type, result_count=result_count, results_only=True) + + # step 2 - package query_results directly as source, loaded to prompt, and packaged as 'llm context' + sources = Sources(self).package_source(query_results,aggregate_source=True) + + # enables use of 'prompt_with_sources' + + return sources + + def add_source_query_results(self, query_results): + + # example use - run a query directly, and then 'add' the query results to a prompt + # query_results = Query(self.library).semantic_query("what is the duration of the non-compete clause?") + # prompter = Prompt().load_model("claude-instant-v1",api_key="my_api_key") + # sources = prompter.add_source_query_results(query_results["results"]) + + sources = Sources(self).package_source(query_results,aggregate_source=True) + + return sources + + def add_source_library(self, library_name): + + # example use - created a small library with a few key documents in a previous step + # my_lib.add_documents(fp) + # sources = prompter.add_source_library("my_lib") + + lib = Library().load_library(library_name) + query_results = Query(lib).get_whole_library() + + sources = Sources(self).package_source(query_results,aggregate_source=True) + + return sources + + def add_source_wikipedia(self, topic, article_count=3, query=None): + + # step 1 - get wikipedia article + output = Parser().parse_wiki([topic],write_to_db=False,target_results=article_count) + + if query: + output = Utilities().fast_search_dicts(query, output, remove_stop_words=True) + + for i, entries in enumerate(output): + logging.info("update: source entries - %s - %s", i, entries) + + # step 2 - package wiki article results as source, loaded to prompt, and packaged as 'llm context' + sources = Sources(self).package_source(output,aggregate_source=True) + + return sources + + def add_source_yahoo_finance(self, ticker=None, key_list=None): + + # example: primary use is to quickly grab a factset about a specific company / stock ticker + # and 'inject' real-time, up-to-date fact set into the prompt to minimize hallucination risk + + fin_info = YFinance().ticker(ticker).info + + logging.info("update: fin_info - %s ", fin_info) + + output = "" + if key_list: + for keys in key_list: + if keys in fin_info: + 
output += keys + " : " + str(fin_info[keys]) + self.batch_separator + else: + for keys, values in fin_info.items(): + output += keys + " : " + str(values) + self.batch_separator + + results = {"file_source": "yfinance-" + str(ticker), "page_num": "na", "text": output} + + logging.info("update: yfinance results - %s ", results) + + # step 2 - package as source + sources = Sources(self).package_source([results], aggregate_source=True) + + return sources + + # TODO - WIP - how to pass/add knowledge graph context to a prompt + def add_source_knowledge_graph(self, library, query): + + # need to check for library and for graph + if library: + self.library = library + + if not self.library: + raise LibraryObjectNotFoundException + + if self.library.get_knowledge_graph_status() == "yes": + + kg_output = Graph(self.library).kg_query(query,th=10) + text_string_out = "" + + for key, values in kg_output.items(): + if key: + text_string_out += key + " " + for entries in values: + text_string_out += entries + " " + + # print("update: kg_output - ", kg_output, text_string_out) + source_output = [{"text": text_string_out, "page_num":0, "file_source": "knowledge_graph"}] + + sources = Sources(self).package_source(source_output, aggregate_source=True) + else: + raise LibraryObjectNotFoundException + + return sources + + def add_source_website(self, url, query=None): + + # get website content + output = Parser().parse_website(url,write_to_db=False,max_links=3) + + if query: + output = Utilities().fast_search_dicts(query, output, remove_stop_words=True) + + """ + for i, entry in enumerate(output): + logging.info("update: website parse - %s - %s", i, entry) + """ + + sources = Sources(self).package_source(output, aggregate_source=True) + + return sources + + def add_source_document(self, input_fp,input_fn, query=None): + + # example: intended for use to rapidly parse and add a document (of any type) from local file to a prompt + + output = Parser().parse_one(input_fp,input_fn) + + # run in memory filtering to sub-select from document only items matching query + if query: + output = Utilities().fast_search_dicts(query, output, remove_stop_words=True) + + """ + for i, entry in enumerate(output): + print("update: results - ", i, len(entry["text"]),entry) + # logging.info("update: parsing output - %s - %s ", i, entry) + """ + + sources = Sources(self).package_source(output, aggregate_source=True) + + return sources + + def add_source_last_interaction_step(self): + + # will take the last interaction and add to source, useful in conversational dialog + + interaction= "" + if len(self.dialog_tracker) > 0: + interaction += self.dialog_tracker[-1]["user"] + "\n" + self.dialog_tracker[-1]["bot"] + "\n" + + interaction_source = [{"text": interaction, "page_num":0, "file_source":"dialog_tracker"}] + + # print("interaction_source - ", interaction_source) + + sources = Sources(self).package_source(interaction_source, aggregate_source=True) + + return sources + + def review_sources_summary(self): + + # Source metadata for each entry - ["batch_id", "text", "metadata", "biblio", "batch_stats", + # "batch_stats.tokens", "batch_stats.chars", "batch_stats.samples"] + + source_summary_output = [] + for i, sources in enumerate(self.source_materials): + new_entry = {"batch_id": sources["batch_id"], "batch_stats": sources["batch_stats"]} + source_summary_output.append(new_entry) + + return source_summary_output + + def prompt_with_source(self, prompt, prompt_name=None, source_id_list=None, first_source_only=True): + + # this method 
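The add_source_* helpers above all package retrieval material into the prompt's source_materials, batched to the model's context window. A hypothetical sketch combining a local document and a Wikipedia topic as sources; the paths, model name, and queries are placeholders:

from llmware.prompts import Prompt

prompter = Prompt().load_model("claude-instant-v1")
prompter.add_source_document("/path/to/files", "contract.pdf", query="non-compete")
prompter.add_source_wikipedia("restrictive covenants", article_count=2)

print(prompter.review_sources_summary())
responses = prompter.prompt_with_source("What is the duration of the non-compete clause?",
                                        first_source_only=True)
print(responses[0]["llm_response"])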
is intended to be used in conjunction with sources as follows: + # prompter = Prompt().load_model("claude-instant-v1", api_key=None) + # source = prompter.add_source (....) + # response = prompter.prompt_with_source("what is the stock price of XYZ?") + # + # if multiple loaded sources, then the method will automatically call the model several times + # --user can select either 'call once' with first_source_only = True + # --OR ... by selecting specific sources by their batch_id, + # e.g., source_id_list = [0,1,5] would iterate through sources 0, 1, 5 + + response_list = [] + + if prompt_name: + self.prompt_type = prompt_name + + # this method assumes a 'closed context' with set of preloaded sources into the prompt + if len(self.source_materials) == 0: + logging.error("error: to use prompt_with_source, there must be a loaded source - try '.add_sources' first") + return [{}] + + # this is the 'default' and will use the first batch of source material only + if first_source_only: + + response_dict = self.prompt_main(prompt,prompt_name=self.prompt_type, + context=self.source_materials[0]["text"], + register_trx=False) + + # add details on the source materials to the response dict + if "metadata" in self.source_materials[0]: + response_dict.update({"evidence_metadata": self.source_materials[0]["metadata"]}) + + if "biblio" in self.source_materials[0]: + response_dict.update({"biblio": self.source_materials[0]["biblio"]}) + + response_list.append(response_dict) + + else: + # if first_source_only is false, then run prompts with all of the sources available + for i, batch in enumerate(self.source_materials): + if source_id_list: + + if i in source_id_list: + response_dict = self.prompt_main(prompt,prompt_name=self.prompt_type, + context=self.source_materials[i]["text"], + register_trx=False) + + # add details on the source materials to the response dict + if "metadata" in self.source_materials[i]: + response_dict.update({"evidence_metadata": self.source_materials[i]["metadata"]}) + + if "biblio" in self.source_materials[i]: + response_dict.update({"biblio": self.source_materials[i]["biblio"]}) + + response_list.append(response_dict) + + else: + + response_dict = self.prompt_main(prompt, prompt_name=self.prompt_type, + context=self.source_materials[i]["text"], + register_trx=False) + + # add details on the source materials to the response dict + if "metadata" in self.source_materials[i]: + response_dict.update({"evidence_metadata": self.source_materials[i]["metadata"]}) + + if "biblio" in self.source_materials[i]: + response_dict.update({"biblio": self.source_materials[i]["biblio"]}) + + response_list.append(response_dict) + + # register inferences in state history, linked to prompt_id + for l, llm_inference in enumerate(response_list): + + logging.info ("update: llm inference - %s - %s - %s", l, len(response_list),llm_inference) + + self.register_llm_inference(llm_inference) + + return response_list + + # prompt management + + def select_prompt_from_catalog(self, prompt_name): + + if prompt_name in self.pc.list_all_prompts(): + self.prompt_type = prompt_name + else: + raise PromptNotInCatalogException(prompt_name) + + return self + + def prompt_from_catalog(self, prompt, context=None, prompt_name=None, inference_dict=None): + + if prompt_name not in self.pc.list_all_prompts(): + raise PromptNotInCatalogException(prompt_name) + + # self.llm_model.add_prompt_engineering= prompt_name + response = self.prompt_main(prompt,context=context, prompt_name=prompt_name,inference_dict=inference_dict) + + return 
response + + # useful 'out of the box' prompts + def number_or_none(self, prompt, context=None): + # rename - "facts_or_not_found" + output = self.prompt_from_catalog(prompt, context=context,prompt_name="number_or_none") + return output + + def summarize_with_bullets(self, prompt, context, number_of_bullets=5): + + # useful 'out of the box' summarize capability with ability to parameterize the number_of_bullets + # note: most models are 'approximately' accurate when specifying a number of bullets + + inference_dict = {"number_of_bullets": number_of_bullets} + output = self.prompt_from_catalog(prompt, context=context,prompt_name="summarize_with_bullets", + inference_dict=inference_dict) + + return output + + def yes_or_no(self, prompt, context): + + # useful classification prompt, assumes prompt is a question that expects a "yes" or "no" answer + response = self.prompt_from_catalog(prompt, context=context,prompt_name="yes_no") + + return response + + def completion(self, prompt, temperature=0.7, target_len=200): + + self.llm_model.temperature = temperature + self.llm_model.ai_max_output_len = target_len + + response = self.prompt_from_catalog(prompt, prompt_name="completion") + + return response + + def multiple_choice(self, prompt, context, choice_list): + + prompt += "\nWhich of the following choices best answers the question - " + for i, choice in enumerate(choice_list): + prompt += "(" + chr(65+i) + ") " + choice + ", " + + if prompt.endswith(", "): + prompt = prompt[:-2] + "?" + + response = self.prompt_from_catalog(prompt, context=context, prompt_name="multiple_choice") + + return response + + def xsummary(self, context, number_of_words=20): + + # provides an 'extreme summary', e.g., 'xsum' with ability to parameterize the number of words + # --most models are reasonably accurate when asking for specific number of words + + prompt="" + inference_dict = {"number_of_words": number_of_words} + response = self.prompt_from_catalog(prompt, context=context, prompt_name="xsummary",inference_dict=inference_dict) + + return response + + def title_generator_from_source (self, prompt, context=None, title_only=True): + + response = self.prompt_from_catalog(prompt, context=context,prompt_name="report_title") + + if title_only: + return response["llm_response"] + + return response + + # core basic prompt inference method + def prompt_main (self, prompt, prompt_name=None, context=None, call_back_attempts=1, calling_app_id="", + prompt_id=0,batch_id=0, trx_dict=None, selected_model= None, register_trx=False, + inference_dict=None): + + usage = {} + + if not prompt_name: + + # pull from .add_prompt_engineering state + if self.llm_model.add_prompt_engineering: + prompt_name = self.llm_model.add_prompt_engineering + + else: + # defaults + if context: + prompt_name = "default_with_context" + else: + prompt_name = "default_no_context" + + if selected_model: + self.llm_model = ModelCatalog().load_model(selected_model) + + self.llm_model.temperature = self.temperature + self.llm_model.max_tokens = self.llm_model.llm_max_output_len + self.llm_model.add_context = context + self.llm_model.add_prompt_engineering = prompt_name + + output_dict = self.llm_model.inference(prompt, inference_dict=inference_dict) + + output = output_dict["llm_response"] + + if isinstance(output,list): + output = output[0] + + # triage process - if output is ERROR code, then keep trying up to parameter- call_back_attempts + # by default - will not attempt to triage, e.g., call_back_attempts = 1 + # --depending upon the calling function, 
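A hypothetical sketch of the out-of-the-box prompt helpers above, assuming prompter is a Prompt already loaded with a model (as in the earlier sketch); the context string is made up:

context = "The purchase price is $42 million, payable in two equal installments."

print(prompter.yes_or_no("Is the purchase price above $40 million?", context)["llm_response"])
print(prompter.number_or_none("What is the purchase price?", context=context)["llm_response"])
print(prompter.xsummary(context, number_of_words=10)["llm_response"])
print(prompter.summarize_with_bullets("Summarize the key terms.", context,
                                      number_of_bullets=3)["llm_response"])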
it can decide the criticality and # of attempts + + if output == "/***ERROR***/": + # try again + attempts = 1 + + while attempts < call_back_attempts: + + # wait 5 seconds to try back + time.sleep(5) + + # exact same call to inference + output_dict = self.llm_model.inference(prompt) + + output = output_dict["llm_response"] + # if list output, then take the string from the first output + if isinstance(output, list): + output = output[0] + + # keep trying until not ERROR message found + if output != "/***ERROR***/": + break + + attempts += 1 + + # if could not triage, then present "pretty" error output message + if output == "/***ERROR***/": + if "error_message" in output_dict: + output = output_dict["error_message"] + else: + output = "AI Output Not Available" + + # strip & which are used by some models as end of text marker + output = str(output).replace("","") + output = str(output).replace("","") + + # output = re.sub("","", output) + # output = re.sub("","", output) + + if "usage" in output_dict: + usage = output_dict["usage"] + + output_dict = {"llm_response": output, "prompt": prompt, + "evidence": context, + "instruction": prompt_name, "model": self.llm_model.model_name, + "usage": usage, + "time_stamp": Utilities().get_current_time_now("%a %b %d %H:%M:%S %Y"), + "calling_app_ID": calling_app_id, + "rating": "", + "account_name": self.account_name, + # "library_name": self.library_name, + "prompt_id": prompt_id, + "batch_id": batch_id, + } + + if register_trx: + self.register_llm_inference(output_dict,prompt_id,trx_dict) + + return output_dict + + # useful "call backs" + def summarize_multiple_responses(self, list_of_response_dict=None, response_id_list=None): + + batch = None + + if list_of_response_dict: + batch = list_of_response_dict + elif response_id_list: + batch = [] + for response_id in response_id_list: + batch += PromptState(self).lookup_by_prompt_id + + if not batch: + batch = self.interaction_history + + # batch of response dictionaries -> need to aggregate the llm_responses- and run prompt + aggregated_response_dict = {} + + return aggregated_response_dict + + def select_among_multiple_responses(self, list_of_response_dict=None, response_id_list=None): + + batch = None + + if list_of_response_dict: + batch = list_of_response_dict + elif response_id_list: + batch = [] + for response_id in response_id_list: + batch += PromptState(self).lookup_by_prompt_id + + if not batch: + batch = self.interaction_history + + # batch of response dictionaries -> need to aggregate the llm_responses- and run prompt + aggregated_response_dict = {} + + return aggregated_response_dict + + # post processing + + def evidence_check_numbers(self, response): + + # expect that response is a list of response dictionaries + if isinstance(response, dict): + response = [response] + + response_out = [] + + for i, response_dict in enumerate(response): + qc = QualityCheck(self).fact_checker_numbers(response_dict) + + # print("FACT CHECK - ", qc) + + response_dict.update({"fact_check": qc}) + response_out.append(response_dict) + + return response_out + + def evidence_check_sources(self, response): + + # expect that response is a list of response dictionaries + if isinstance(response, dict): + response = [response] + + response_out = [] + for i, response_dict in enumerate(response): + qc = QualityCheck(self).source_reviewer(response_dict) + + # print("SOURCE REVIEW - ", qc) + + response_dict.update({"source_review": qc}) + response_out.append(response_dict) + + return response_out + + def 
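prompt_main above is the low-level inference entry point with the retry loop; most callers reach it through prompt_with_source or the catalog helpers. A hypothetical direct call, assuming prompter is already loaded with a model; the prompt and context are placeholders:

response = prompter.prompt_main("Summarize the key risks.",
                                prompt_name="default_with_context",
                                context="... evidence text ...",
                                call_back_attempts=3)
print(response["llm_response"], response["usage"])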
evidence_comparison_stats(self, response): + + # expect that response is a list of response dictionaries + if isinstance(response, dict): + response = [response] + + response_out = [] + for i, response_dict in enumerate(response): + qc = QualityCheck(self).token_comparison(response_dict) + + # print("COMPARISON STATS - ", qc) + + response_dict.update({"comparison_stats": qc}) + response_out.append(response_dict) + + return response_out + + def classify_not_found_response(self, response_list,parse_response=True,evidence_match=True,ask_the_model=True): + + output_response_all = [] + + if isinstance(response_list,dict): + response_list = [response_list] + + for i, response_dict in enumerate(response_list): + output_response_all.append(self._classify_not_found_one_response(response_dict, + parse_response=parse_response, + evidence_match=evidence_match, + ask_the_model=ask_the_model)) + + return output_response_all + + def _classify_not_found_one_response(self, response_dict, parse_response=True, evidence_match=True, ask_the_model=True): + + output_response = {} + nf = [] + + if parse_response: + nf1 = QualityCheck(self).classify_not_found_parse_llm_response(response_dict) + output_response.update({"parse_llm_response": nf1}) + if nf1 not in nf: + nf.append(nf1) + + if evidence_match: + nf2 = QualityCheck(self).classify_not_found_evidence_match(response_dict) + output_response.update({"evidence_match": nf2}) + if nf2 not in nf: + nf.append(nf2) + + if ask_the_model: + nf3 = QualityCheck(self).classify_not_found_ask_the_model(response_dict) + output_response.update({"ask_the_model": nf3}) + if nf3 not in nf: + nf.append(nf3) + + if len(nf) == 0: + logging.warning("error: Prompt().classify_not_response() expects at least one of the tests to be marked" + "as True - none of the tests were executed - please try again with one test as 'True'") + + return output_response + + # simple case - all of the tests are conforming + if len(nf) == 1: + output_response.update({"not_found_classification": nf[0]}) + else: + output_response.update({"not_found_classification": "undetermined"}) + + return output_response + + # user ratings + + def send_to_human_for_review(self, output_path=None, output_fn=None): + + output = HumanInTheLoop(prompt=self).export_current_interaction_to_csv(output_path=output_path,report_name=output_fn) + return output + + def apply_user_ratings(self, ratings_dict): + + output = HumanInTheLoop(prompt=self).add_or_update_human_rating(self.prompt_id,ratings_dict) + return output + + def apply_user_corrections(self, updates_dict): + + output = HumanInTheLoop(prompt=self).update_llm_response_record(self.prompt_id,updates_dict,keep_change_log=True) + return output + + +class Sources: + + # Sources can accept any Python iterable consisting of dictionary entries + # Each dictionary entry must support minimally the keys in self.source_input_keys + # By default, this is a minimum of 3 keys - "text", "file_source", "page_num" + + def __init__(self, prompt): + + self.prompt= prompt + # self.tokenizer = prompt.tokenizer + self.tokenizer = Utilities().get_default_tokenizer() + + self.source_input_keys = ["text", "file_source", "page_num"] + self.source_output_keys = [] + + self.source_keys = ["batch_id", "text", "metadata", "biblio", "batch_stats", "batch_stats.tokens", + "batch_stats.chars", "batch_stats.samples"] + + self.source_metadata = ["batch_source_num", "evidence_start_char", "evidence_stop_char", + "doc_fn", "page_num"] + + def token_counter(self, text_sample): + toks = 
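The post-processing wrappers above annotate each response dictionary with fact-check, source-review, and comparison data. A hypothetical sketch, assuming the responses came from prompt_with_source so that the evidence and evidence_metadata fields are populated:

responses = prompter.prompt_with_source("What was total revenue in 2022?")

responses = prompter.evidence_check_numbers(responses)
responses = prompter.evidence_check_sources(responses)
responses = prompter.evidence_comparison_stats(responses)
not_found = prompter.classify_not_found_response(responses, parse_response=True,
                                                 evidence_match=True, ask_the_model=False)

for r in responses:
    print(r["fact_check"], r["source_review"], r["comparison_stats"])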
self.tokenizer.encode(text_sample).ids + return len(toks) + + def tokenize (self, text_sample): + toks = self.tokenizer.encode(text_sample).ids + return toks + + def package_source(self, retrieval_material, aggregate_source=True, add_to_prompt=True, + backup_source_filename="user_provided_unknown_source"): + + # generalized source packager + # --assumes minimal metadata - doc_name, page_num and text chunk + # --add to existing 'state' source & create new batch on top if overflow + + # tracking variables + tokens_per_batch = [] + samples_per_batch = [] + sample_counter = 0 + doc_sources = {} + doc_sources_per_batch = {} + biblio_per_batch = [] + batches = [] + meta = [] + + samples = [] + + for i, q in enumerate(retrieval_material): + + # simple deduplication check to remove identical entries - more 'cleaning' options can be offered over time + if q not in samples: + samples.append(q) + + # default + current_batch = "" + token_counter = 0 + batch_metadata = [] + batch_id = 0 + char_counter = 0 + + if aggregate_source: + # start current batch with the last entry in source materials and aggregate from this point + if len(self.prompt.source_materials) > 0: + + # pull up the last 'in-progress' entry in current source materials state + current_batch = self.prompt.source_materials[-1]["text"] + token_counter = self.token_counter(current_batch) + char_counter = len(current_batch) + batch_metadata = self.prompt.source_materials[-1]["metadata"] + batch_stats = self.prompt.source_materials[-1]["batch_stats"] + batch_id = len(self.prompt.source_materials) - 1 + + # 'pop' the last entry 'in-progress' off the list + self.prompt.source_materials = self.prompt.source_materials[:-1] + + samples_chunked = [] + + # print("update: input samples len - ", len(samples)) + + for x in range(0,len(samples)): + + t = self.token_counter(samples[x]["text"]) + + if t > self.prompt.context_window_size: + chunks = self.chunk_large_sample(samples[x]) + samples_chunked += chunks + else: + samples_chunked.append(samples[x]) + + samples = samples_chunked + + # print("update: chunked samples len - ", len(samples)) + + for x in range(0, len(samples)): + + t = self.token_counter(samples[x]["text"]) + + if "file_source" in samples[x]: + source_fn = samples[x]["file_source"] + else: + source_fn = backup_source_filename + + if "page_num" in samples[x]: + page_num = samples[x]["page_num"] + else: + # if can not retrieve from metadata, then set as default - page 1 + page_num = 1 + + # keep aggregating text batch up to the size of the target context_window for selected model + if (t + token_counter) < self.prompt.context_window_size: + + # appends separator at end of sample text before adding the next chunk of text + current_batch += samples[x]["text"] + self.prompt.batch_separator + batch_char_len = len(current_batch) + + new_source = {"batch_source_id": len(batch_metadata), + "evidence_start_char": char_counter, + # remove adding char_counter to evidence_stop_char + "evidence_stop_char": batch_char_len, + "source_name": source_fn, + "page_num": page_num, + } + + batch_metadata.append(new_source) + + char_counter = batch_char_len + token_counter += t + + # new trackers + sample_counter += 1 + if source_fn not in doc_sources: + doc_sources.update({source_fn: [page_num]}) + else: + if page_num not in doc_sources[source_fn]: + doc_sources[source_fn].append(page_num) + + if source_fn not in doc_sources_per_batch: + doc_sources_per_batch.update({source_fn: [page_num]}) + else: + if page_num not in doc_sources_per_batch[source_fn]: + 
doc_sources_per_batch[source_fn].append(page_num) + + else: + # capture number of tokens in batch + tokens_per_batch.append(token_counter) + samples_per_batch.append(sample_counter) + sample_counter = 1 + + biblio_per_batch.append(doc_sources_per_batch) + + doc_sources_per_batch = {} + + if "file_source" in samples[x]: + doc_filename = samples[x]["file_source"] + else: + doc_filename = backup_source_filename + + if "page_num" in samples[x]: + page_num = samples[x]["page_num"] + else: + page_num = 1 + + doc_sources_per_batch.update({doc_filename: [page_num]}) + + batches.append(current_batch) + meta.append(batch_metadata) + + if add_to_prompt: + new_batch_dict = {"batch_id": batches, "text": current_batch, "metadata": batch_metadata, + "biblio": doc_sources_per_batch, "batch_stats": + {"tokens": token_counter, + "chars": len(current_batch), + "samples": len(batch_metadata)}} + + self.prompt.source_materials.append(new_batch_dict) + + # reset current_batch -> current snippet + current_batch = samples[x]["text"] + token_counter = t + new_source = {"batch_source_num": 0, + "evidence_start_char": 0, + "evidence_stop_char": len(samples[x]["text"]), + "doc_fn": source_fn, + "page_num": page_num, + } + + batch_metadata = [new_source] + char_counter = len(samples[x]["text"]) + + if len(current_batch) > 0: + batches.append(current_batch) + meta.append(batch_metadata) + + if add_to_prompt: + new_batch_dict = {"batch_id": batches, "text": current_batch, "metadata": batch_metadata, + "biblio": doc_sources_per_batch, "batch_stats": {"tokens": token_counter, + "chars": len(current_batch), + "samples": len(batch_metadata)}} + + self.prompt.source_materials.append(new_batch_dict) + + # add new stats for last batch + tokens_per_batch.append(token_counter) + samples_per_batch.append(sample_counter) + biblio_per_batch.append(doc_sources_per_batch) + + new_sources = {"text_batch": batches, "metadata_batch": meta, "batches_count": len(batches)} + + return new_sources + + def chunk_large_sample(self, sample): + + # if there is a very large single sample, which is bigger than the context window, + # then, break up into smaller chunks + + chunks = [] + max_size = self.prompt.context_window_size + sample_len = self.token_counter(sample["text"]) + + chunk_count = sample_len // max_size + if max_size * chunk_count < sample_len: + chunk_count += 1 + + stopper = 0 + base_dict = {} + for key, values in sample.items(): + base_dict.update({key:values}) + + sample_tokens = self.tokenize(sample["text"]) + + for x in range(0,chunk_count): + starter = stopper + stopper = min((x+1)*max_size,sample_len) + new_chunk_tokens = sample_tokens[starter:stopper] + new_dict = base_dict + new_dict.update({"text":self.tokenizer.decode(new_chunk_tokens)}) + chunks.append(new_dict) + + # print("update: created sample chunks - ", chunk_count, max_size, sample_len, len(chunks)) + + return chunks + + +class QualityCheck: + + def __init__(self, prompt=None): + + self.llm_response = None + self.evidence = None + self.evidence_metadata= None + self.add_markup = False + + self.prompt = prompt + + # add instruction + self.instruction = None + + self.comparison_stats = {} + self.fact_check = {} + self.ner_fact_check = {} + self.source_review = {} + + def review (self, response_dict, add_markup=False, review_numbers=True, comparison_stats=True, + source_review=True, instruction=None): + + self.llm_response = response_dict["llm_response"] + self.evidence= response_dict["evidence"] + self.evidence_metadata = response_dict["evidence_metadata"] + 
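package_source above deduplicates entries, chunks oversized samples, and aggregates text into batches sized to the model's context window, tracking per-batch metadata and a biblio of source pages. A hypothetical direct call; normally this is reached through the add_source_* helpers, and the sample data, model name, and file names are made up:

from llmware.prompts import Prompt, Sources

prompter = Prompt().load_model("claude-instant-v1")
retrieval_material = [{"text": "Revenue was $12.5 million in 2022.",
                       "file_source": "annual_report.pdf", "page_num": 3}]

packaged = Sources(prompter).package_source(retrieval_material, aggregate_source=True)
print(packaged["batches_count"], prompter.review_sources_summary())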
self.add_markup = add_markup + + # add instruction + self.instruction = instruction + + # review - main entry point into Quality Check - runs several methods for output + + if comparison_stats: + self.comparison_stats = self.token_comparison (response_dict) + + if review_numbers: + self.fact_check = self.fact_checker_numbers(response_dict) + + if source_review: + self.source_review = self.source_reviewer(response_dict) + + return self + + def fact_checker_numbers (self, response_dict): + + ai_gen_output = response_dict["llm_response"] + evidence = response_dict["evidence"] + evidence_metadata = response_dict["evidence_metadata"] + add_markup= False + + # looks for numbers only right now + llm_response_markup = "" + fact_check = [] + + ai_numbers = [] + ai_numbers_token_tracker = [] + ai_numbers_char_tracker = [] + + confirmations = [] + unconfirmations = [] + + tokens = ai_gen_output.split(" ") + percent_on = -1 + char_counter = 0 + for i, tok in enumerate(tokens): + + tok_len = len(tok) + + # minimal cleaning of tokens + + # remove bullet point + if len(tok) > 0: + if ord(tok[-1]) == 8226: + tok = tok[:-1] + + if len(tok) > 1: + if tok.startswith("\n"): + tok = tok[1:] + + if tok.endswith("\n"): + tok = tok[:-1] + + if tok.endswith(",") or tok.endswith(".") or tok.endswith("-") or tok.endswith(";") or \ + tok.endswith(")") or tok.endswith("]"): + tok = tok[:-1] + + if tok.startswith("$") or tok.startswith("(") or tok.startswith("["): + tok = tok[1:] + + if tok.endswith("%"): + tok = tok[:-1] + percent_on = 1 + + tok = re.sub("[,-]","",tok) + # look for integer numbers - will not find floats + if Utilities().isfloat(tok): + + if percent_on == 1: + tok_fl = float(tok) / 100 + # turn off + percent_on = -1 + else: + tok_fl = float(tok) + ai_numbers.append(tok_fl) + ai_numbers_token_tracker.append(i) + ai_numbers_char_tracker.append((char_counter,char_counter+tok_len)) + + char_counter += tok_len + 1 + + # iterate thru all of the numbers generated - and look for match in evidence + found_confirming_match = [] + tokens = evidence.split(" ") + evidence_char_counter = 0 + percent_on = -1 + current_str_token = "" + + for x in range(0, len(ai_numbers)): + match_tmp = -1 + match_token = -1 + + percent_on = -1 + for i, tok in enumerate(tokens): + + tok_len = len(tok) + + if tok.endswith("\n"): + tok = tok[:-1] + + current_str_token = tok + + if tok.endswith(",") or tok.endswith(".") or tok.endswith("-") or tok.endswith(";") or \ + tok.endswith(")") or tok.endswith("]"): + tok = tok[:-1] + + if tok.startswith("$") or tok.startswith("(") or tok.startswith("["): + tok = tok[1:] + + if tok.endswith("%"): + tok = tok[:-1] + percent_on = 1 + + tok = re.sub("[,-]","",tok) + + if Utilities().isfloat(tok): + tok = float(tok) + if percent_on == 1: + tok = tok / 100 + # turn off + percent_on = -1 + + if tok == ai_numbers[x]: + match_token = i + + if i > 10: + start = i-10 + else: + start = 0 + + if i+10 < len(tokens): + stop = i+10 + else: + stop = len(tokens) + + context_window = " ... " + for j in range(start,stop): + context_window += tokens[j] + " " + context_window = re.sub("[\n\r]","",context_window) + context_window += " ... 
" + + # insert page_num - future update + # default - set to the last batch + minibatch = len(evidence_metadata)-1 + for m in range(0,len(evidence_metadata)): + + starter = evidence_metadata[m]["evidence_start_char"] + stopper = evidence_metadata[m]["evidence_stop_char"] + if starter <= char_counter <= stopper: + minibatch = m + break + + # set default as "NA" - will update once confirmed found in evidence_metadata below + page_num = "NA" + source_fn = "NA" + + if len(evidence_metadata[minibatch]) > 1: + if "page_num" in evidence_metadata[minibatch]: + page_num = evidence_metadata[minibatch]["page_num"] + + if "file_source" in evidence_metadata[minibatch]: + source_fn = evidence_metadata[minibatch]["file_source"] + + new_fact_check_entry = {"fact": current_str_token, + "status": "Confirmed", + "text": context_window, + "page_num": page_num, + "source": source_fn} + fact_check.append(new_fact_check_entry) + + confirmations.append(current_str_token) + + match_tmp = 1 + break + + evidence_char_counter += tok_len + 1 + + if match_tmp == -1: + new_fact_check_entry = {"fact": current_str_token, + "status": "Not Confirmed", + "text": "", + "page_num": "", + "source": ""} + + fact_check.append(new_fact_check_entry) + unconfirmations.append(current_str_token) + + # provide markup highlighting confirmations and non-confirmations + confirm_updates = [] + if add_markup: + for i,f in enumerate(fact_check): + + char_slice = ai_numbers_char_tracker[i] + + # if not confirmed status, then markup as "unconfirm" + markup_entry = [i, ai_numbers_char_tracker[i], "unconfirm"] + + # test to update mark_up entry to "confirm" + if len(f) > 1: + if "status" in f: + if f["status"] == "Confirmed": + markup_entry = [i, ai_numbers_char_tracker[i], "confirm"] + + confirm_updates.append(markup_entry) + + confirm_updates = sorted(confirm_updates, key=lambda x:x[0], reverse=True) + + ai_output_markup = ai_gen_output + + for c in confirm_updates: + + output_tmp = ai_output_markup + + if c[2] == "confirm": + ai_output_markup = output_tmp[0:c[1][0]] + " " + ai_output_markup += output_tmp[c[1][0]:c[1][1]] + " " + ai_output_markup += output_tmp[c[1][1]:] + else: + ai_output_markup = output_tmp[0:c[1][0]] + " " + ai_output_markup += output_tmp[c[1][0]:c[1][1]] + " " + ai_output_markup += output_tmp[c[1][1]:] + + # fact_check.update({"confirmations": confirmations}) + # fact_check.update({"unconfirmations": unconfirmations}) + # fact_check.update({"ai_web_markup": ai_output_markup}) + + # note: ai_web_markup not passed + + return fact_check + + def source_reviewer (self, response_dict): + + ai_tmp_output = response_dict["llm_response"] + evidence_batch = response_dict["evidence"] + evidence_metadata = response_dict["evidence_metadata"] + add_markup = False + + # insert test starts here + # text_snippet_dict = self._evidence_token_matcher(ai_tmp_output, evidence_batch) + # end - insert test here + + min_th = 0.25 + conclusive_th = 0.75 + min_match_count = 3 + + # remove numbers from source review match ??? 
+ c = CorpTokenizer(remove_stop_words=True, one_letter_removal=True, remove_punctuation=True, + remove_numbers=False, lower_case=False) + + c2 = CorpTokenizer(remove_stop_words=False, one_letter_removal=False, remove_punctuation=True, + remove_numbers=False, lower_case=False) + + # ai_tmp_output = re.sub("[()\"\u201d\u201c]"," ", ai_tmp_output) + ai_tokens = c.tokenize(ai_tmp_output) + ai_token_len = len(ai_tokens) + + if ai_token_len == 0: + # rare case - no ai output, so no need to do any further work + empty_results = [] + return empty_results + + matching_evidence_score = [] + for x in range(0, len(evidence_metadata)): + match = 0 + ev_match_tokens = [] + + ev_starter = evidence_metadata[x]["evidence_start_char"] + ev_stopper = evidence_metadata[x]["evidence_stop_char"] + + local_text = evidence_batch[ev_starter:ev_stopper] + # local_text = re.sub("[()\"\u201d\u201c]", "", local_text) + evidence_tokens_tmp = c2.tokenize(local_text) + # evidence_tokens_tmp = local_text.split(" ") + + for tok in ai_tokens: + for i, etoks in enumerate(evidence_tokens_tmp): + if tok.lower() == etoks.lower(): + match += 1 + ev_match_tokens.append(i) + break + + match_score = match / ai_token_len + + # min threshold to count as source -> % of total or absolute # of matching tokens + if match_score > min_th or len(ev_match_tokens) > min_match_count: + matching_evidence_score.append([match_score, x, ev_match_tokens, evidence_tokens_tmp, + evidence_metadata[x]["page_num"]]) + + mes = sorted(matching_evidence_score, key=lambda x: x[0], reverse=True) + + sources_output = [] + text_output = [] + + if len(mes) > 3: + top_sources = 3 + else: + top_sources = len(mes) + + for m in range(0, top_sources): + + page_num = mes[m][4] + + # text_snippet = "Page {}- ... ".format(str(page_num)) + text_snippet = "" + + median_token = int(statistics.median(mes[m][2])) + if median_token >= 10: + starter = median_token - 10 + else: + starter = 0 + + if median_token + 10 < len(mes[m][3]): + stopper = median_token + 10 + else: + stopper = len(mes[m][3]) + + for y in range(starter, stopper): + text_snippet += str(mes[m][3][y]) + " " + + # text_snippet += " ... " + + text_snippet = re.sub("[\n\r]", " ... 
", text_snippet) + + if text_snippet not in text_output: + text_output.append(text_snippet) + # print("update: source_reviewer: ", evidence_metadata[mes[m][1]][1]) + try: + # mes[m][1] = array index corresponding to the 'batch' of evidence metadata + # the batch = (index, content), so look at index [1] to get the actual content + source = evidence_metadata[mes[m][1]][1]["doc_fn"] + except: + source = "" + # new_output = {"text": text_snippet, "match_score": mes[m][0],"source": evidence_metadata[mes[m][1]]} + new_output = {"text": text_snippet, "match_score": mes[m][0], "source": source, + "page_num": page_num} + + sources_output.append(new_output) + + if mes[m][0] > conclusive_th: + # found conclusive source -> no need to look for others + break + + return sources_output + + # enhanced token comparison + # --applies different rules by instruction, e.g., yes-no exclude + # --if number in output, looks to handle 'word numbers' + float value comparison + # --if multiple points in output, will run comparison separately against each "key point" + + def token_comparison (self, response_dict): + + ai_output_text = response_dict["llm_response"] + evidence_batch = response_dict["evidence"] + evidence_metadata = response_dict["evidence_metadata"] + + yes_no = False + key_point_output_list = [] + + if self.instruction == "yes_no": + yes_no = True + + key_point_list = [ai_output_text] + + c = CorpTokenizer(remove_stop_words=True, remove_numbers=False,one_letter_removal=True, remove_punctuation=False) + evidence_tokens = c.tokenize(evidence_batch) + + # iterate thru each key point and analyze comparison match + confirmed_match_agg = [] + unmatched_agg = [] + ai_tokens_agg = [] + + evidence_with_numbers = "" + evidence_numbers_list = [] + + for i, kp in enumerate(key_point_list): + + ai_tokens = c.tokenize(kp) + ai_tokens_agg += ai_tokens + + # skip any empty kp + if len(ai_tokens) > 0: + + confirmed_match = [] + unmatched = [] + + for tok in ai_tokens: + match = -1 + + # change starts here - july 19 + # sharpen matching rules for dollar amounts + if tok.endswith("."): + tok = tok[:-1] + + # only remove "." 
or "," if at the end + tok = re.sub("[,();$\"\n\r\t\u2022\u201c\u201d]","",tok) + + float_check_on = Utilities().isfloat(tok) + + run_compare = True + + if float_check_on: + if not evidence_with_numbers: + + evidence_with_numbers, evidence_numbers_list, \ + token_index_location = Utilities().replace_word_numbers(evidence_batch) + + for ev_num in evidence_numbers_list: + try: + if float(ev_num) == float(tok): + confirmed_match.append(tok) + match = 1 + run_compare = False + except: + pass + + if run_compare: + for etoks in evidence_tokens: + + # change here - mirrrors check in the evidence + if etoks.endswith("."): + etoks = etoks[:-1] + + etoks = re.sub("[(),;$\n\r\t\"\u2022\u201c\u201d]","",etoks) + + # removed lemmatizer and other approximate string matches - look for exact match + if tok == etoks: + confirmed_match.append(tok) + match = 1 + break + + # add token compare check if number -> look for numeric equality (even if strings different) + if float_check_on: + if Utilities().isfloat(etoks): + if float(tok) == float(etoks): + confirmed_match.append(tok) + match = 1 + break + + if match == -1: + # no duplicates + if tok not in unmatched: + unmatched.append(tok) + + # create new entry for kp + match = len(confirmed_match) / len(ai_tokens) + new_entry = {"key_point": kp, "entry": len(key_point_output_list), "verified_match": match} + key_point_output_list.append(new_entry) + unmatched_agg += unmatched + confirmed_match_agg += confirmed_match + + # match_percent = 0.0 + match_percent = "{0:.1f}%".format(0.0) + match_fr = 0.0 + + if len(ai_tokens_agg) > 0: + + match_fr = len(confirmed_match_agg) / len(ai_tokens_agg) + if match_fr > 1.0: + match_fr = 1.0 + match_percent = "{0:.1f}%".format((match_fr * 100)) + + # how to handle, if at all? + if yes_no and match_fr == 0: + no_action_for_now = 0 + + comparison_stats = {"percent_display": match_percent, + "confirmed_words": confirmed_match_agg, + "unconfirmed_words": unmatched_agg, + "verified_token_match_ratio": match_fr, + "key_point_list": key_point_output_list} + + return comparison_stats + + def classify_not_found_parse_llm_response(self, response_dict): + + # Simple, but reasonably accurate way to classify as "not found" - especially with "not found" instructions + # --(1) most models will follow the "not found" instruction and this will be the start of the response + # --(2) if a model gets confused and does not provide any substantive response, then this will get flagged too + + # minimal cleaning of response output + llm_response = response_dict["llm_response"] + llm_response_cleaned = re.sub("[;!?•(),.\n\r\t\u2022]", "", llm_response).strip().lower() + + # first test: if no content in 'cleaned' response + if not llm_response_cleaned: + return True + + # second test: if response starts with 'not found' + if llm_response_cleaned.lower().startswith("not found"): + return True + + return False + + # alternative to calling the model to classify - may be more reliable + def classify_not_found_evidence_match (self, response_dict, verified_token_match_threshold=0.25): + + # Objective of this method is to classify a LLM response as "not found" + # --this is a key requirement of 'evidence-based' retrieval augmented generation + # Note on output: "True" - indicates that classification of 'Not Found' + # "False" - indicates not 'Not Found' - in other words, use as a valid response + + if "comparison_stats" not in response_dict: + comparison_stats = self.token_comparison(response_dict) + else: + comparison_stats = 
response_dict["comparison_stats"] + + verified_token_match = comparison_stats["verified_token_match_ratio"] + + # simple threshold passed as parameter - assumes 0.25 as baseline + # --e.g., if there is less than 1 in 4 tokens verified in evidence, SKIP + # --we could make this higher filter, but occasionally might exclude a valid answer in different format + + llm_response = response_dict["llm_response"] + llm_response_cleaned = re.sub("[;!?•(),.\n\r\t\u2022]", "", llm_response).strip().lower() + + # carve-out "yes" | "no" answers - special case - will not having 'matching tokens' in evidence + if llm_response_cleaned in ["yes", "yes.", "no","no."]: + return False + + if verified_token_match < verified_token_match_threshold: + return True + + return False + + def classify_not_found_ask_the_model(self, response_dict, selected_model_name=None, model_api_key=None): + + if not selected_model_name: + selected_model_name = self.prompt.llm_name + model_api_key = self.prompt.llm_model_api_key + + new_prompt = Prompt().load_model(selected_model_name,api_key=model_api_key) + new_response = new_prompt.prompt_from_catalog(prompt="", context=response_dict["llm_response"], + prompt_name="not_found_classifier") + + # print("new response - ", new_response) + + llm_response = new_response["llm_response"] + llm_response_cleaned = re.sub("[;!?•(),.\n\r\t\u2022]", "", llm_response).strip().lower() + + if llm_response_cleaned.startswith("yes"): + return True + + if llm_response_cleaned.startswith("no"): + return False + + # if the test is inconclusive, then it returns False + + return False + + +class HumanInTheLoop: + + # add user ratings to queries and analysis + package for 'human-in-the-loop' review + + def __init__(self, prompt, prompt_id_list=None): + + self.prompt= prompt + self.user_rating_keys = ["human_rating", "human_feedback", "human_assessed_accuracy", "change_log"] + + def export_interaction_to_csv(self, prompt_id_list=None, output_path=None, report_name=None): + + # this method will take a list of one or more prompt_ids and dump to csv for user to review and edit + + output = PromptState(self.prompt).generate_interaction_report(prompt_id_list, + output_path=output_path, + report_name=report_name) + + return output + + def export_current_interaction_to_csv(self, output_path=None, report_name=None): + + # this method will take the current interaction state and dump to csv ffor user to review and edit + + output = PromptState(self.prompt).generate_interaction_report_current_state(output_path=output_path, + report_name=report_name) + + return output + + # TODO-WIP + def import_updated_csv(self, fp, fn, prompt_id): + + # allows corrections to be uploaded by csv spreadsheet and corrections made in the history + return 0 + + def add_or_update_human_rating (self, prompt_id, rating_dict): + + rating = -1 + accuracy = "" + feedback = "" + + f = {"prompt_id": prompt_id} + + if "human_rating" in rating_dict: + rating = int(rating_dict["human_rating"]) + + if "human_feedback" in rating_dict: + feedback = rating_dict["human_feedback"] + + if "human_assessed_accuracy" in rating_dict: + accuracy = rating_dict["human_assessed_accuracy"] + + update_dict = {"human_rating": rating, "human_feedback": feedback, "human_assessed_accuracy": accuracy} + + PromptState(self).update_records(prompt_id, f, update_dict) + + return 0 + + # new method to allow user to edit/update an llm_response record + def update_llm_response_record(self,prompt_id, update_dict,keep_change_log=True): + + # as default option, preserve the 
current values in a change_log list + # --over time, we can evaluate whether to capture more metadata about the change, roll-back, etc. + + if keep_change_log: + # get original record - will save in "change_log" list below changing + current_record = list(PromptState(self).lookup_by_prompt_id(prompt_id=prompt_id)) + # current_record = list(coll.find(f)) + + if len(current_record) == 1: + current_dict = {} + for keys in update_dict: + if keys in current_record[0]: + # this is what will be saved in the list of 'change log' events within the record + current_dict.update({keys:current_record[0][keys], + "time_stamp":Utilities().get_current_time_now()}) + + if "change_log" in current_record[0]: + change_log = current_record[0]["change_log"] + else: + change_log = [] + change_log.append(current_dict) + update_dict.update({"change_log": change_log}) + + # save and update records + confirmation = PromptState(self).update_records(prompt_id,f,update_dict) + + return confirmation + + + diff --git a/llmware/requirements.txt b/llmware/requirements.txt new file mode 100644 index 00000000..3950f236 --- /dev/null +++ b/llmware/requirements.txt @@ -0,0 +1,23 @@ +ai21==1.0.3 +anthropic==0.3.11 +beautifulsoup4==4.11.1 +boto3==1.24.53 +cohere==4.1.3 +faiss-cpu==1.7.4 +google-cloud-aiplatform==1.33.1 +lxml==4.9.3 +numpy==1.23.2 +openai==0.27.7 +pdf2image==1.16.0 +Pillow==9.2.0 +pymilvus==2.3.0 +pymongo==4.5.0 +pytesseract==0.3.10 +scipy==1.11.2 +tokenizers==0.13.3 +torch==1.13.1 +Werkzeug==2.3.7 +word2number==1.1 +Wikipedia-API==0.6.0 +yfinance==0.2.28 +python_on_whales==0.64.3 \ No newline at end of file diff --git a/llmware/resources.py b/llmware/resources.py new file mode 100644 index 00000000..68d21d7d --- /dev/null +++ b/llmware/resources.py @@ -0,0 +1,1339 @@ + +# Copyright 2023 llmware + +# Licensed under the Apache License, Version 2.0 (the "License"); you +# may not use this file except in compliance with the License. You +# may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. 
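+# resources.py gathers the collection and state management utilities used across llmware:
+# DBManager / CollectionWriter / CollectionRetrieval (MongoDB collection access),
+# CloudBucketManager (S3 sample files and model repositories), LibraryCollection and
+# LibraryCatalog (library collections and library cards), plus ParserState, PromptState,
+# QueryState and StateResourceUtil for persisting parser, prompt and query job state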
+ + +from bson.int64 import Int64 + +from pymongo import ReturnDocument +from pymongo import MongoClient +import pymongo +from bson.objectid import ObjectId +import sys +import bson.json_util as json_util +import boto3 +from botocore import UNSIGNED +from botocore.config import Config +from werkzeug.utils import secure_filename +import shutil +from botocore.exceptions import ClientError +import os +import json +import csv +import uuid +import re +from datetime import datetime +import random +import logging +from pymongo.errors import ConnectionFailure + +from llmware.configs import LLMWareConfig +from llmware.exceptions import LibraryNotFoundException, UnsupportedCollectionDatabaseException + + +class DBManager: + + class __DBManager: + + def __init__(self): + + self.collection_db_path = LLMWareConfig.get_config("collection_db_uri") + username = LLMWareConfig.get_config("collection_db_username") + password = LLMWareConfig.get_config("collection_db_password") + + # default client is Mongo currently + self.client = MongoClient(self.collection_db_path, username=username, password=password, + unicode_decode_error_handler='ignore') + #self.client.admin.authenticate(username, password) + + __instance = None + + def __init__(self): + + if not DBManager.__instance: + DBManager.__instance = DBManager.__DBManager() + + def __getattr__(self, item): + return getattr(self.__instance, item) + + +def check_db_uri(timeout_secs=5): + + username = LLMWareConfig.get_config("collection_db_username") + password = LLMWareConfig.get_config("collection_db_password") + uri_string = LLMWareConfig.get_config("collection_db_uri") + + # default client is Mongo currently + client = MongoClient(uri_string, username=username, password=password,unicode_decode_error_handler='ignore') + + # self.client.admin.authenticate(username, password) + + try: + # catch if mongo not available + with pymongo.timeout(timeout_secs): + client.admin.command('ping') + logging.info("update: confirmed - Collection DB available at uri string - %s ", uri_string) + db_status = True + + except ConnectionFailure: + logging.warning("warning: Collection DB not found at uri string in LLMWareConfig - %s - check " + "connection and/or reset LLMWareConfig 'collection_db_uri' to point to the correct uri.", + uri_string) + + db_status = False + + return db_status + + +class CollectionWriter: + + def __init__(self, collection): + self.collection = collection + + # write - this is where new blocks get added to the library collection + def write_new_record(self, new_record): + registry_id = self.collection.insert_one(new_record).inserted_id + return 1 + + # note: this will delete the entire collection + def destroy_collection(self, confirm_destroy=False): + + if confirm_destroy: + self.collection.drop() + return 1 + + logging.warning("update: library not destroyed - need to set confirm_destroy = True") + return 0 + + # write/update specific record + def update_block (self, doc_id, block_id, key, new_value, default_keys): + + # for specific (doc_id, block_id) identified, update {key:new_value} + completed = False + + f = {"$and": [{"block_ID": block_id}, {"doc_ID": doc_id}]} + + if key in default_keys: + new_values = {"$set": {key: new_value}} + self.collection.update_one(f,new_values) + completed = True + + return completed + + def update_one_record(self, filter_dict, key,new_value): + new_values = {"$set": {key:new_value}} + self.collection.update_one(filter_dict, new_values) + return 0 + + def update_many_records(self, filter_dict, key, new_value): + 
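+        # applies {"$set": {key: new_value}} to every record matching filter_dict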
new_values = {"$set": {key:new_value}} + self.collection.update_many(filter_dict, new_values) + return 0 + + def update_many_records_custom(self, filter_dict, update_dict): + self.collection.update_many(filter_dict, update_dict) + return 0 + + +class CollectionRetrieval: + + def __init__(self, collection): + self.collection = collection + + def get_whole_collection(self): + all_output = list(self.collection.find({},no_cursor_timeout=True)) + return all_output + + def basic_query(self, query): + match_results_cursor = self.collection.find( + {"$text": {"$search": query}}, + {"score": {"$meta": "textScore"}}).sort([('score', {'$meta': 'textScore'})]).allow_disk_use(True) + + return match_results_cursor + + def filter_by_key(self, key, value): + match_results_cursor = self.collection.find({key:value}) + return match_results_cursor + + def text_search_with_key_low_high_range(self, query, key, low, high, key_value_dict=None): + + # accepts key with low & high value + optional key_value_dict with additional parameters + d = [] + f = {} + + text_search = {"$text": {"$search": query}} + d.append(text_search) + key_value_range = {key: {"$gte": low, "$lte": high}} + d.append(key_value_range) + + if key_value_dict: + for key, value in key_value_dict.items(): + d.append({key: value}) + + # if one key-value pair, then simple filter + if len(d) == 1: f = d[0] + + # if multiple key-value pairs, then insert list with "$and" + if len(d) >= 2: + f = {"$and": d} + + results = list(self.collection.find(f, + {"score": {"$meta": "textScore"}}). + sort([('score', {'$meta': 'textScore'})]).allow_disk_use(True)) + + return results + + def text_search_with_key_value_range(self, query, key, value_range_list, key_value_dict=None): + + f = {} + text_search = {"$text": {"$search": query}} + + d = [text_search] + range_filter = {key: {"$in": value_range_list}} + d.append(range_filter) + + if key_value_dict: + for key, value in key_value_dict.items(): + d.append({key: value}) + + # if one key-value pair, then simple filter + if len(d) == 1: f = d[0] + + # if multiple key-value pairs, then insert list with "$and" + if len(d) >= 2: + f = {"$and": d} + + results = list(self.collection.find(f, + {"score": {"$meta": "textScore"}}). + sort([('score', {'$meta': 'textScore'})]).allow_disk_use(True)) + + return results + + def text_search_with_key_value_dict_filter(self, query, key_value_dict): + + f = {} + text_search = {"$text": {"$search": query}} + d = [text_search] + for key, value in key_value_dict.items(): + + if isinstance(value, list): + # if value is a list, then interpret as "$in" + range_filter = {key: {"$in": value}} + d.append(range_filter) + else: + # if value is not a list, then look for exact match + d.append({key: value}) + + # if one key-value pair, then simple filter + if len(d) == 1: f = d[0] + + # if multiple key-value pairs, then insert list with "$and" + if len(d) >= 2: + f = {"$and": d} + + results = list(self.collection.find(f, + {"score": {"$meta": "textScore"}}). 
+ sort([('score', {'$meta': 'textScore'})]).allow_disk_use(True)) + + return results + + def get_distinct_list(self, key): + distinct_list = list(self.collection.distinct(key)) + return distinct_list + + def filter_by_key_dict (self, key_dict): + + f = {} + d = [] + for key, value in key_dict.items(): + d.append({key:value}) + + # if one key-value pair, then simple filter + if len(d) == 1: f = d[0] + + # if multiple key-value pairs, then insert list with "$and" + if len(d) >= 2: + f= {"$and":d} + + results = list(self.collection.find(f)) + + return results + + def filter_by_key_value_range(self, key, value_range): + # e.g., {"doc_ID": [1,2,3,4,5]} + results = list(self.collection.find({key: {"$in": value_range}})) + return results + + def filter_by_key_ne_value(self, key, value): + f = {key: {"$ne":value}} + output = list(self.collection.find(f)) + return output + + def custom_filter(self, custom_filter): + results = list(self.collection.find(custom_filter)) + return results + + def get_cursor_by_block(self, doc_id, block_id, selected_page): + + block_cursor = self.collection.find_one({"$and": [ + {"doc_ID": int(doc_id)}, + {"block_ID": {"$gt": block_id}}, + {"content_type": {"$ne": "image"}}, + {"master_index": {"$in": [selected_page, selected_page + 1]}}]}) + + return block_cursor + + +class CloudBucketManager: + + def __init__(self): + # placeholder - no state / config required currently + self.start = 0 + + # used in Setup() to get sample test documents + def pull_file_from_public_s3(self, object_key, local_file, bucket_name): + + # return list of successfully downloaded files + downloaded_files = [] + + try: + # Ensure the local file's folder exists + os.makedirs(os.path.dirname(local_file), exist_ok=True) + + s3 = boto3.resource('s3', config=Config(signature_version=UNSIGNED)) + s3.Bucket(bucket_name).download_file(object_key, local_file) + downloaded_files.append(object_key) + + except ClientError as e: + logging.error(e) + + return downloaded_files + + def create_local_model_repo(self, access_key=None,secret_key=None): + + # list of models retrieved from cloud repo + models_retrieved = [] + + # check for llmware path & create if not already set up + if not os.path.exists(LLMWareConfig.get_llmware_path()): + # if not explicitly set up by user, then create folder directory structure + LLMWareConfig.setup_llmware_workspace() + + # confirm that local model repo path has been created + local_model_repo_path = LLMWareConfig.get_model_repo_path() + if not os.path.exists(local_model_repo_path): + os.mkdir(local_model_repo_path) + + aws_repo_bucket = LLMWareConfig.get_config("llmware_public_models_bucket") + + # if specific model_list passed, then only retrieve models on the list + + bucket = boto3.resource('s3', aws_access_key_id=access_key, + aws_secret_access_key=secret_key).Bucket(aws_repo_bucket) + + files = bucket.objects.all() + + s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key) + + # bucket = s3.Bucket(bucket_name) + # files = bucket.objects.all() + + for file in files: + + name_parts = file.key.split("/") + + # confirm that file.key is correctly structure as [0] model name, and [1] model component + if len(name_parts) == 2: + + logging.info("update: identified models in model_repo: %s ", name_parts) + + if name_parts[0] and name_parts[1]: + + model_folder = os.path.join(local_model_repo_path,name_parts[0]) + + if not os.path.exists(model_folder): + os.mkdir(model_folder) + models_retrieved.append(name_parts[0]) + + logging.info("update: 
downloading file from s3 bucket - %s - %s ", name_parts[1], file.key) + + s3.download_file(aws_repo_bucket, file.key, os.path.join(model_folder,name_parts[1])) + + logging.info("update: created local model repository - # of models - %s - model list - %s ", + len(models_retrieved), models_retrieved) + + return models_retrieved + + def pull_single_model_from_llmware_public_repo(self, model_name=None): + + # if no model name selected, then get all + bucket_name = LLMWareConfig().get_config("llmware_public_models_bucket") + + # check for llmware path & create if not already set up + if not os.path.exists(LLMWareConfig.get_llmware_path()): + # if not explicitly set up by user, then create folder directory structure + LLMWareConfig().setup_llmware_workspace() + + model_path_local = LLMWareConfig.get_model_repo_path() + + if not os.path.exists(model_path_local): + os.makedirs(model_path_local) + + # assumes that files in model folder are something like: + # "pytorch_model.bin" | "config.json" | "tokenizer.json" + + bucket = boto3.resource('s3', config=Config(signature_version=UNSIGNED)).Bucket(bucket_name) + + files = bucket.objects.all() + + for file in files: + + if file.key.startswith(model_name): + + # found component of model in repo, so go ahead and create local model folder, if needed + local_model_folder = os.path.join(model_path_local, model_name) + if not os.path.exists(local_model_folder): + os.mkdir(local_model_folder) + + # simple model_repo structure - each model is a separate folder + # each model is a 'flat list' of files, so safe to split on ("/") to get key name + if not file.key.endswith("/"): + local_file_path = os.path.join(local_model_folder,file.key.split("/")[-1]) + bucket.download_file(file.key, local_file_path) + + logging.info("update: successfully downloaded model - %s - from aws s3 bucket for future access", + model_name) + + return files + + # called in Library as convenience method to connect to user S3 bucket and download into library path + def connect_to_user_s3_bucket (self, aws_access_key, aws_secret_key, + user_bucket_name, local_download_path, max_files=1000): + + files_copied = [] + + accepted_file_formats = ["pptx", "xlsx", "docx", "pdf", "txt", "csv", "html", "jsonl", + "jpg", "jpeg", "png", "wav", "zip"] + + try: + s3 = boto3.client('s3', aws_access_key_id=aws_access_key, aws_secret_access_key=aws_secret_key) + + bucket = s3.Bucket(user_bucket_name) + files = bucket.objects.all() + + for file in files: + f = secure_filename(file.key) + file_type = f.split(".")[-1].lower() + if file_type in accepted_file_formats: + s3.download_file(user_bucket_name, file.key, local_download_path + f) + files_copied.append(f) + if len(files_copied) > max_files: + break + + except: + logging.error("error: could not connect to s3 bucket - % ", user_bucket_name) + + return files_copied + + return files_copied + + +class LibraryCollection: + + def __init__(self, library=None): + + self.library = library + + if library: + self.library_name = library.library_name + self.account_name = library.account_name + else: + self.library_name = None + self.account_name = "llmware" # default, if no account name specified + + def create_library_collection(self, library_name=None, account_name="llmware"): + + collection = None + if not library_name: + library_name = self.library_name + + logging.info("update: LibraryCollection().create_library_collection - %s - %s - %s", + library_name, self.account_name, LLMWareConfig.get_config("collection_db")) + + if 
LLMWareConfig().get_config("collection_db") == "mongo": + + if check_db_uri(timeout_secs=5): + + collection = DBManager().client[self.account_name][library_name] + logging.info("update: creating library collection with Mongo - %s ", + LLMWareConfig.get_config("collection_db_uri")) + + else: + logging.error("error: tried unsuccessfully to connect to Mongo - %s ", + LLMWareConfig.get_config("collection_db_uri")) + + else: + raise UnsupportedCollectionDatabaseException(LLMWareConfig.get_config("collection_db")) + + return collection + + def get_library_collection(self, library_name, account_name="llmware"): + + if LLMWareConfig.get_config("collection_db") == "mongo": + + if check_db_uri(timeout_secs=5): + collection = DBManager().client[account_name][library_name] + logging.info("update: creating library collection with Mongo - %s ", + LLMWareConfig.get_config("collection_db_uri")) + + else: + logging.error("error: tried unsuccessfully to connect to Mongo - %s - " + "please check connection and reset LLMWare Config collection db settings" + "if needed to point to correction uri.", LLMWareConfig.get_config("collection_db_uri")) + + collection = None + + return collection + else: + raise UnsupportedCollectionDatabaseException(LLMWareConfig.get_config("collection_db")) + + def get_library_card_collection(self, account_name="llmware"): + + if LLMWareConfig.get_config("collection_db") == "mongo": + + if check_db_uri(timeout_secs=5): + collection = DBManager().client[account_name].library + logging.info("update: creating library collection with Mongo - %s ", + LLMWareConfig.get_config("collection_db_uri")) + else: + logging.error("error: tried unsuccessfully to connect to Mongo - %s ", + LLMWareConfig.get_config("collection_db_uri")) + collection = None + + return collection + + else: + raise UnsupportedCollectionDatabaseException(LLMWareConfig.get_config("collection_db")) + + def create_index(self, library_name=None): + + if not library_name: + library_name = self.library_name + + if LLMWareConfig.get_config("collection_db") == "mongo": + self.library.collection.create_index([("text_search", "text")]) + else: + raise UnsupportedCollectionDatabaseException(LLMWareConfig.get_config("collection_db")) + + return 0 + + +class LibraryCatalog: + + def __init__(self, library=None, library_path=None, account_name="llmware"): + + self.library = library + if library: + self.library_name = library.library_name + self.account_name = library.account_name + else: + self.library_name = None + self.account_name = account_name + + self.library_card_collection = LibraryCollection().get_library_card_collection(self.account_name) + + # check for llmware path & create if not already set up + if not os.path.exists(LLMWareConfig.get_llmware_path()): + LLMWareConfig.setup_llmware_workspace() + + if not library_path: + self.library_path = LLMWareConfig.get_llmware_path() + else: + self.library_path = library_path + + def get_library_card (self, library_name, account_name="llmware"): + + # return either library_card {} or None + + if account_name != "llmware": + # dynamically change to point to selected account_name - else points to default + # self.library_card_collection = DBManager().client[account_name].library + self.library_card_collection = LibraryCollection().get_library_card_collection(account_name=account_name) + + db_record = list(self.library_card_collection.find({"library_name": library_name})) + + if len(db_record) == 1: + library_card = db_record[0] + else: + library_card = None + + return library_card + + 
def all_library_cards(self): + all_library_cards = list(self.library_card_collection.find({})) + return all_library_cards + + def create_new_library_card(self, new_library_card): + registry_id = self.library_card_collection.insert_one(new_library_card).inserted_id + return 0 + + def update_library_card(self, library_name, update_dict, account_name="llmware"): + + f = {"library_name": library_name} + new_values = {"$set": update_dict} + + if account_name != "llmware": + self.library_card_collection = LibraryCollection().get_library_card_collection(account_name=account_name) + + # standard collection update for all except embedding + if "embedding" not in update_dict: + self.library_card_collection.update_one(f,new_values) + + else: + # special handling for "embedding" attribute + + lib_card = self.get_library_card(library_name) + embedding_list = lib_card["embedding"] + + if len(embedding_list) > 0: + # if the last row is a "no" embedding, then remove it + if embedding_list[-1]["embedding_status"] == "no": + del embedding_list[-1] + + embedding_list.append(update_dict["embedding"]) + embedding_update_dict = {"embedding": embedding_list} + self.library_card_collection.update_one(f, {"$set": embedding_update_dict}) + + return 1 + + def delete_library_card(self, library_name=None, account_name="llmware"): + + if not library_name: + library_name = self.library_name + + f = {"library_name": library_name} + + if account_name != "llmware": + self.library_card_collection = LibraryCollection().get_library_card_collection(account_name=account_name) + + self.library_card_collection.delete_one(f) + + return 1 + + def get_and_increment_doc_id (self, library_name, account_name="llmware"): + + # controller for setting the library_collection and pointer to the DB Collection library + + if account_name != "llmware": + self.library_card_collection = LibraryCollection().get_library_card_collection(account_name=account_name) + + # method called at the start of parsing each new doc -> each parser gets a new doc_id + library_counts = self.library_card_collection.find_one_and_update( + {"library_name": self.library_name}, + {"$inc": {"unique_doc_id": 1}}, + return_document=ReturnDocument.AFTER + ) + + unique_doc_id = library_counts.get("unique_doc_id",-1) + + return unique_doc_id + + def set_incremental_docs_blocks_images(self, added_docs=0, added_blocks=0, added_images=0, added_pages=0, + added_tables=0): + + # updates counting parameters at end of parsing + + self.library_card_collection.update_one( + {"library_name": self.library_name}, + {"$inc": {"documents": added_docs, "blocks": added_blocks, "images": added_images, "pages": added_pages, + "tables": added_tables + }}) + + return 0 + + def bulk_update_graph_status(self): + + update_dict = {"graph_status": "true"} + self.update_library_card(self.library_name,update_dict) + + return 0 + + +class ParserState: + + def __init__(self, parsing_output=None, parse_job_id=None): + + self.parse_job_id = parse_job_id + self.parsing_output = parsing_output + self.parser_job_output_base_name = "parser_job_" + self.parser_output_format = ".jsonl" + self.parser_output_fp = LLMWareConfig.get_parser_path() + + # check for llmware path & create if not already set up + if not os.path.exists(LLMWareConfig.get_llmware_path()): + + # if not explicitly set up by user, then create folder directory structure + LLMWareConfig.setup_llmware_workspace() + + def get_parser_state_fn_from_id(self, parser_job_id): + + fn = self.parser_job_output_base_name + str(parser_job_id) + 
self.parser_output_format + return fn + + def get_parser_id_from_parser_state_fn(self, fn): + + core_fn = fn.split(".")[0] + id = core_fn.split("_")[-1] + return id + + def lookup_by_parser_job_id(self, parser_id): + + parser_output = self.lookup_by_parse_job_id(parser_id) + return parser_output + + def save_parser_output(self, parser_job_id, parser_output): + + fn = self.get_parser_state_fn_from_id(parser_job_id) + fp = os.path.join(self.parser_output_fp, fn) + + outfile = open(fp, "w") + + for entry_dict in parser_output: + jsonl_row = json.dumps(entry_dict) + outfile.write(jsonl_row) + outfile.write("\n") + + outfile.close() + + return fn + + def issue_new_parse_job_id(self, custom_id=None, mode="uuid"): + + # issue new parse_job_id + + if custom_id: + self.parse_job_id = custom_id + else: + if mode == "time_stamp": + self.parse_job_id = StateResourceUtil().get_current_time_now() + elif mode == "uuid": + self.parse_job_id = str(StateResourceUtil().get_uuid()) + elif mode == "random_number": + self.parse_job_id = str(random.randint(1000000, 9999999)) + + return self.parse_job_id + + def lookup_by_parse_job_id (self, prompt_id): + + output = [] + + fn = self.get_parser_state_fn_from_id(prompt_id) + fp = os.path.join(self.parser_output_fp, fn) + + try: + my_file = open(fp, 'r', encoding='utf-8') + for lines in my_file: + new_row = json.loads(lines) + output.append(new_row) + + except: + logging.info("warning: ParserState - could not find previous parse job record - %s ", prompt_id) + output = [] + + return output + + +class PromptState: + + def __init__(self, prompt=None): + + self.prompt = prompt + self.prompt_state_base_name = "prompt_" + self.prompt_state_format = ".jsonl" + + # check for llmware path & create if not already set up + if not os.path.exists(LLMWareConfig.get_llmware_path()): + + # if not explicitly set up by user, then create folder directory structure + LLMWareConfig.setup_llmware_workspace() + + self.prompt_path = LLMWareConfig.get_prompt_path() + self.output_path = LLMWareConfig.get_prompt_path() + + # prompt state written to files + self.prompt_collection = None + self.write_to_db = False + + def get_prompt_state_fn_from_id(self, prompt_id): + fn = self.prompt_state_base_name + str(prompt_id) + self.prompt_state_format + return fn + + def get_prompt_id_from_prompt_state_fn(self, fn): + core_fn = fn.split(".")[0] + id = core_fn.split("_")[-1] + return id + + def lookup_by_prompt_id(self, prompt_id): + ai_trx_list = self.lookup_by_prompt_id_from_file(prompt_id) + return ai_trx_list + + def register_interaction(self, ai_dict): + + # by default, add to the interaction_history in memory + self.prompt.interaction_history.append(ai_dict) + + return ai_dict + + def initiate_new_state_session(self, prompt_id=None): + + if not prompt_id: + prompt_id = self.issue_new_prompt_id() + + # reset key trackers + self.prompt.llm_history = [] + self.prompt.prompt_id = prompt_id + return prompt_id + + def issue_new_prompt_id(self, custom_id=None, mode="uuid"): + + # issue new prompt_id + if custom_id: + self.prompt.prompt_id = custom_id + else: + if mode == "time_stamp": + self.prompt.prompt_id = StateResourceUtil().get_current_time_now() + elif mode == "uuid": + self.prompt.prompt_id = str(StateResourceUtil().get_uuid()) + elif mode == "random_number": + self.prompt.prompt_id = str(random.randint(1000000, 9999999)) + + return self.prompt.prompt_id + + def load_state(self, prompt_id, prompt_path=None,clear_current_state=True): + + output = None + + if not prompt_path: + prompt_path = 
self.prompt_path + + fn = self.get_prompt_state_fn_from_id(prompt_id) + fp = os.path.join(prompt_path, fn) + + try: + if clear_current_state: + self.prompt.interaction_history = [] + + my_file = open(fp, 'r', encoding='utf-8') + for lines in my_file: + new_row = json.loads(lines) + self.prompt.interaction_history.append(new_row) + self.prompt.prompt_id = prompt_id + output = self.prompt.interaction_history + + except: + logging.info("update: PromptState - could not find previous prompt interaction state- %s ", prompt_id) + output = None + + return output + + def lookup_by_prompt_id_from_file(self, prompt_id): + + output = [] + + fn = self.get_prompt_state_fn_from_id(prompt_id) + fp = os.path.join(self.prompt_path, fn) + + try: + my_file = open(fp, 'r', encoding='utf-8') + for lines in my_file: + new_row = json.loads(lines) + output.append(new_row) + except: + logging.info("warning: PromptState - could not find previous prompt interaction state- %s ", prompt_id) + output = [] + + return output + + def full_history(self): + ai_trx_list = self.full_history_from_file() + return ai_trx_list + + def full_history_from_file(self): + + output= [] + + all_prompts = os.listdir(self.prompt_path) + + for i, files in enumerate(all_prompts): + + # will iterate through all files in the prompt path that start with the expected + # prompt base name + + if files.startswith(self.prompt_state_base_name): + prompt_id = self.get_prompt_id_from_prompt_state_fn(files) + records = self.lookup_by_prompt_id(prompt_id) + output += records + + return output + + def lookup_prompt_with_filter(self, filter_dict): + + # default - return [] + output = [] + + # may want to add safety check that filter_dict is dict + all_prompt_records = self.full_history_from_file() + + for i, prompt in enumerate(all_prompt_records): + match = -1 + for keys, values in filter_dict.items(): + + # must match every key in the filter dict + if keys in prompt: + if values == prompt[keys]: + match = 1 + else: + match = -1 + break + else: + # if key not in record, then not a match + match = -1 + break + + if match == 1: + output.append(prompt) + + return output + + def update_records(self, prompt_id, filter_dict, update_dict): + + updated_prompt_records = [] + matching_record = {} + prompt_records = self.lookup_by_prompt_id(prompt_id) + for record in prompt_records: + match = -1 + for keys, values in filter_dict.items(): + if keys in record: + if record[keys] == values: + match = 1 + else: + match = -1 + break + else: + match = -1 + break + + if match == -1: + updated_prompt_records.append(record) + else: + # found matching record + matching_record = record + + # update records according to update_dict + updated_record = {} + for key, value in matching_record.items(): + for update_key, update_value in update_dict.items(): + if key == update_key: + updated_record.update({key: update_value}) + else: + updated_record.update({key:value}) + + updated_prompt_records.append(updated_record) + + self.save_custom_state(prompt_id, updated_prompt_records) + + return 0 + + def save_custom_state(self, prompt_id, custom_history, prompt_path=None): + + if not prompt_path: + prompt_path = LLMWareConfig.get_prompt_path() + + fn = self.get_prompt_state_fn_from_id(prompt_id) + fp = os.path.join(prompt_path, fn) + + outfile = open(fp, "w") + + for entry_dict in custom_history: + jsonl_row = json.dumps(entry_dict) + outfile.write(jsonl_row) + outfile.write("\n") + + outfile.close() + + return fp + + def save_state(self, prompt_id, prompt_path=None): + + if not 
prompt_path: + prompt_path = LLMWareConfig.get_prompt_path() + + fn = self.get_prompt_state_fn_from_id(prompt_id) + fp = os.path.join(prompt_path, fn) + + outfile = open(fp, "w") + + for entry_dict in self.prompt.interaction_history: + jsonl_row = json.dumps(entry_dict) + outfile.write(jsonl_row) + outfile.write("\n") + + outfile.close() + + return fp + + def available_states(self, prompt_path=None): + + available_states = [] + + if not prompt_path: + prompt_path = self.prompt_path + + for x in os.listdir(prompt_path): + if x.startswith(self.prompt_state_base_name): + prompt_id = self.get_prompt_id_from_prompt_state_fn(x) + new_entry = {"prompt_id": prompt_id, "prompt_fn": x} + available_states.append(new_entry) + + logging.info("update: PromptState - available states - ", available_states) + + return available_states + + def generate_interaction_report(self, prompt_id_list, output_path=None, report_name=None): + + # prepares a csv report that can be extracted to a spreadsheet + + if not output_path: + output_path = self.output_path + + if not report_name: + report_name = "interaction_report_" + str(StateResourceUtil().get_current_time_now()) + ".csv" + + result_count = 0 + + report_fp = os.path.join(output_path,report_name) + + with open(report_fp, 'w', newline='') as csvfile: + c = csv.writer(csvfile, dialect='excel', doublequote=False, delimiter=',', escapechar=']') + + header_row = ["Prompt_ID", "Prompt", "LLM_Response", "Instruction", "Evidence", "Model", "Time-Stamp"] + c.writerow(header_row) + + for i, prompt_id in enumerate(prompt_id_list): + + fn = self.get_prompt_state_fn_from_id(prompt_id) + fp = os.path.join(self.prompt_path, fn) + + my_file = open(fp, 'r', encoding='utf-8') + for lines in my_file: + new_row = json.loads(lines) + + # create new csv row + + csv_row = [prompt_id, + new_row["prompt"], + new_row["llm_response"], + new_row["instruction"], + new_row["evidence"], + new_row["model"], + new_row["time_stamp"] + ] + + c.writerow(csv_row) + result_count += 1 + + csvfile.close() + + output_response = {"report_name": report_name, "report_fp": report_fp, "results": result_count} + + return output_response + + def generate_interaction_report_current_state(self, output_path=None, report_name=None): + + # prepares a csv report that can be extracted to a spreadsheet + + if not output_path: + output_path = self.output_path + + if not report_name: + report_name = "interaction_report_" + str(StateResourceUtil().get_current_time_now()) + ".csv" + + result_count = 0 + + report_fp = os.path.join(output_path,report_name) + + with open(report_fp, 'w', newline='') as csvfile: + c = csv.writer(csvfile, dialect='excel', doublequote=False, delimiter=',', escapechar=']') + + header_row = ["Prompt_ID", "Prompt", "LLM_Response", "Instruction", "Evidence", "Model", "Time-Stamp"] + c.writerow(header_row) + + for i, new_row in enumerate(self.prompt.interaction_history): + + # create new csv row + + csv_row = [self.prompt.prompt_id, + new_row["prompt"], + new_row["llm_response"], + new_row["instruction"], + new_row["evidence"], + new_row["model"], + new_row["time_stamp"] + ] + + c.writerow(csv_row) + result_count += 1 + + csvfile.close() + + output_response = {"report_name": report_name, "report_fp": report_fp, "results": result_count} + + return output_response + + +class QueryState: + + def __init__(self, query=None, query_id=None): + + if query: + self.query = query + self.query_id = query.query_id + + if query_id: + self.query_id = query_id + self.query = None + + self.query_state_base_name = 
"query_" + self.query_state_format = ".jsonl" + + self.query_path = LLMWareConfig.get_query_path() + self.output_path = LLMWareConfig.get_query_path() + + # check for llmware path & create if not already set up + if not os.path.exists(LLMWareConfig.get_llmware_path()): + + # if not explicitly set up by user, then create folder directory structure + LLMWareConfig.setup_llmware_workspace() + + def get_query_state_fn_from_id(self, prompt_id): + fn = self.query_state_base_name + str(prompt_id) + self.query_state_format + return fn + + def get_query_id_from_prompt_state_fn(self, fn): + core_fn = fn.split(".")[0] + id = core_fn.split("_")[-1] + return id + + def initiate_new_state_session(self, query_id=None): + + if not query_id: + query_id = self.issue_new_query_id() + + # reset key trackers + self.query.query_history = [] + self.query.results = [] + self.query.doc_id_list = [] + self.query.doc_fn_list = [] + + self.query_id = query_id + + return query_id + + def issue_new_query_id(self, custom_id=None, mode="uuid"): + + # issue new query_id + if not custom_id: + + if mode == "time_stamp": + custom_id = StateResourceUtil().get_current_time_now() + elif mode == "uuid": + custom_id = StateResourceUtil().get_uuid() + elif mode == "random_number": + custom_id = str(random.randint(1000000, 9999999)) + + return custom_id + + def available_states(self): + + available_states = [] + + for x in os.listdir(self.query_path): + if x.startswith(self.query_state_base_name): + query_id = self.get_query_id_from_prompt_state_fn(x) + new_entry = {"query_id": query_id, "query_fn": x} + available_states.append(new_entry) + + logging.info("update: QueryState - available saved query states - ", available_states) + + return available_states + + def load_state (self, query_id): + + output = [] + doc_id_list = [] + doc_fn_list = [] + query_history = [] + + fn = self.get_query_state_fn_from_id(query_id) + fp = os.path.join(self.query_path, fn) + + try: + my_file = open(fp, 'r', encoding='utf-8') + for lines in my_file: + new_row = json.loads(lines) + output.append(new_row) + + if "doc_ID" in new_row: + if new_row["doc_ID"] not in doc_id_list: + doc_id_list.append(new_row["doc_ID"]) + + if "file_source" in new_row: + if new_row["file_source"] not in doc_fn_list: + doc_fn_list.append(new_row["file_source"]) + + if "query" in new_row: + if new_row["query"] not in query_history: + query_history.append(new_row["query"]) + + except: + logging.info("warning: QueryState - could not find previous query state- %s ", query_id) + output = [] + + self.query.results = output + self.query.doc_id_list = doc_id_list + self.query.doc_fn_list = doc_fn_list + self.query.query_history = query_history + + return self + + def save_state(self, query_id=None): + + if not query_id: + query_id = self.query.query_id + + fn = self.get_query_state_fn_from_id(query_id) + fp = os.path.join(self.query_path, fn) + + outfile = open(fp, "w") + + for entry_dict in self.query.results: + jsonl_row = json.dumps(entry_dict) + outfile.write(jsonl_row) + outfile.write("\n") + + outfile.close() + + return fn + + def generate_query_report_current_state(self, report_name=None): + + # prepares a csv report that can be extracted to a spreadsheet + + if not self.query: + logging.error("error: QueryState - report generation - must load a current query") + return -1 + + query_name = "" + for entries in self.query.query_history: + query_name += re.sub(" ", "_", entries) + "-" + + if len(query_name) > 100: + query_name = query_name[0:100] + if query_name.endswith("-"): + 
query_name = query_name[:-1] + + if not report_name: + report_name = "query_report_" + query_name + ".csv" + + report_out = [] + col_headers = ["Query", "File_Source", "Doc_ID", "Block_ID", "Page", "Text"] + + report_out.append(col_headers) + + for j, results in enumerate(self.query.results): + + query = "" + if "query" in results: + query = results["query"] + + file_source = "" + if "file_source" in results: + file_source = results["file_source"] + + doc_id = "NA" + if "doc_ID" in results: + doc_id = results["doc_ID"] + + block_id = "NA" + if "block_ID" in results: + block_id = results["block_ID"] + + page_num = "NA" + if "page_num" in results: + page_num = results["page_num"] + + text = "" + if "text" in results: + text = re.sub("[,\"]"," ", results["text"]) + + new_row = [query, file_source, doc_id, block_id, page_num, text] + + report_out.append(new_row) + + fp = os.path.join(self.query_path, report_name) + + StateResourceUtil().file_save(report_out, self.output_path, report_name) + + return report_name + + +class StateResourceUtil: + + def __init__(self): + self.do_nothing = 0 # placeholder - may add attributes in the future + + def get_uuid(self): + # uses unique id creator from uuid library + return uuid.uuid4() + + @staticmethod + def get_current_time_now (time_str="%a %b %e %H:%M:%S %Y"): + time_stamp = datetime.now().strftime(time_str) + return time_stamp + + @staticmethod + def file_save (cfile, file_path, file_name): + + max_csv_size = 20000 + csv.field_size_limit(max_csv_size) + + out_file = file_path + file_name + with open(out_file, 'w', newline='') as csvfile: + c = csv.writer(csvfile, dialect='excel', doublequote=False, delimiter=',', escapechar= ']') + # c.writerow(first_row) + for z in range(0 ,len(cfile)): + # intercept a line too large here + if sys.getsizeof(cfile[z]) < max_csv_size: + c.writerow(cfile[z]) + else: + logging.error("error: CSV ERROR: Row exceeds MAX SIZE: %s %s", sys.getsizeof(cfile[z]) + ,cfile[z]) + + csvfile.close() + + return 0 + + + + diff --git a/llmware/retrieval.py b/llmware/retrieval.py new file mode 100644 index 00000000..475677ad --- /dev/null +++ b/llmware/retrieval.py @@ -0,0 +1,1387 @@ + +# Copyright 2023 llmware + +# Licensed under the Apache License, Version 2.0 (the "License"); you +# may not use this file except in compliance with the License. You +# may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. 
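+# retrieval.py implements the Query class, which runs text and semantic queries against a
+# library collection and manages query state. Minimal usage sketch (library name and query
+# string are illustrative; assumes the library has already been created and parsed):
+#
+#   q = Query(my_library)                              # my_library is an existing library object
+#   results = q.text_query("base salary", result_count=10)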
+ + +import logging +import os +from collections import Counter +from datetime import datetime +from bson.objectid import ObjectId + +from llmware.configs import LLMWareConfig +from llmware.embeddings import EmbeddingHandler +from llmware.resources import CollectionRetrieval, QueryState +from llmware.util import Utilities, CorpTokenizer +from llmware.models import ModelCatalog +from llmware.exceptions import LibraryObjectNotFoundException,UnsupportedEmbeddingDatabaseException,\ + ImportingSentenceTransformerRequiresModelNameException + + +class Query: + + def __init__(self, library, embedding_model=None, tokenizer=None, vector_db_api_key=None, + query_id=None, from_hf=False, from_sentence_transformer=False,embedding_model_name=None, + save_history=True, query_mode=None): + + # load user profile & instantiate core library assets linked to profile + + self.library = library + + if library: + self.library_name = library.library_name + self.account_name = library.account_name + else: + # throw error if library object does not have library_name and account_name attributes + raise LibraryObjectNotFoundException(library) + + # explicitly pass name of embedding model, if multiple embeddings on library + self.embedding_model_name = embedding_model_name + + # added option to pass embedding_model and tokenizer + self.user_passed_model = embedding_model + self.user_passed_tokenizer = tokenizer + self.from_hf = from_hf + self.from_sentence_transformer = from_sentence_transformer + + # edge case - if a user tries to load a sentence_transformer model but does not pass a model name + if from_sentence_transformer and not embedding_model_name: + raise ImportingSentenceTransformerRequiresModelNameException + + # load default configs + # embedding initialization parameters + self.query_embedding = None + self.embedding_model = None + self.embedding_db = None + self.embeddings = None + + if self.library: + self.embeddings = EmbeddingHandler(self.library) + + self.semantic_distance_threshold = 1000 # basic shut off at such a high level + + # keys that will be included in query results + + # full list + self.query_result_standard_keys = ["_id", "text", "doc_ID", "block_ID","page_num","content_type", + "author_or_speaker", "special_field1", "file_source","added_to_collection", + "table", "coords_x", "coords_y", "coords_cx", "coords_cy", "external_files", + "score", "similarity", "distance", "matches"] + + # short_list + self.query_result_short_keys = ["text", "file_source", "page_num", "score", "distance","matches"] + + # minimum_list + self.query_result_min_required_keys = ["text", "file_source", "page_num"] + + # default - set at 'full list' + self.query_result_return_keys = self.query_result_standard_keys + + # default is semantic if embedding in place + embedding_record = self.library.get_embedding_status() + + matched_lib_model = False + + if embedding_model_name: + for emb in embedding_record: + + logging.info("update: Query - embedding record lookup - %s - %s", embedding_model_name, emb) + + if emb["embedding_model"] == embedding_model_name: + + if emb["embedding_status"] == "yes": + self.embedding_db = emb["embedding_db"] + self.search_mode = "semantic" + matched_lib_model = True + else: + if len(embedding_record) > 0: + last_emb_record = embedding_record[-1] + if last_emb_record["embedding_status"] == "yes": + self.embedding_db = last_emb_record["embedding_db"] + self.search_mode = "semantic" + self.embedding_model_name = last_emb_record["embedding_model"] + matched_lib_model = True + + if 
matched_lib_model: + + logging.info("update: Query - found matches in embedding record - %s - %s", + self.embedding_db, self.embedding_model_name) + + if not self.embedding_model: + self.load_embedding_model() + + else: + self.search_mode = "text" + + # passed for accessing api_based vector db + self.vector_db_api_key = vector_db_api_key + + # if query_id passed, then load that state + if query_id: + self.query_id = query_id + self.load_query_state(query_id) + else: + self.query_id = QueryState().issue_new_query_id() + + self.result_text_chunk_size = self.library.block_size_target_characters + + # state variables + self.results = [] + self.query_history = [] + self.doc_id_list = [] + self.doc_fn_list = [] + + self.save_history = save_history + + if query_mode: + self.search_mode = query_mode + + def load_embedding_model(self): + + # skip if already instantiated self.embedding_model + + if not self.embedding_model: + + if self.user_passed_model: + + if self.from_hf: + self.embedding_model = ModelCatalog().load_hf_embedding_model(self.user_passed_model, + self.user_passed_tokenizer) + if self.from_sentence_transformer: + self.embedding_model = ModelCatalog().load_sentence_transformer_model(self.user_passed_model, + self.embedding_model_name) + + else: + if ModelCatalog().lookup_model_card(self.embedding_model_name): + self.embedding_model = ModelCatalog().load_model(selected_model=self.embedding_model_name) + else: + logging.info("update: Query - selected embedding model could not be found- %s ", + self.embedding_model_name) + + return self + + def get_output_keys(self): + # list of keys that will be provided in each query_result + return self.query_result_return_keys + + def set_output_keys(self, result_key_list): + + # set the output keys + validated_list = [] + for key in result_key_list: + if key in self.library.default_keys: + validated_list.append(key) + + # minimum required list + for rk in self.query_result_min_required_keys: + if rk not in validated_list: + validated_list.append(rk) + logging.warning("warning: Query - adding required keys useful in downstream processing - %s ", rk) + + # setting updated query_return_keys that is used in packaging query results + self.query_result_return_keys = validated_list + + return validated_list + + def start_query_session(self, query_id=None): + + if query_id: + self.query_id = query_id + + if self.query_id: + QueryState(self).load_state(self.query_id) + else: + query_id = QueryState(self).initiate_new_state_session() + + return query_id + + def register_query (self, retrieval_dict): + + # qr_dict = ["query", "results", "doc_ID", "file_source"] + + # add query results as new "column" in query state + self.results += retrieval_dict["results"] + + if retrieval_dict["query"] not in self.query_history: + self.query_history.append(retrieval_dict["query"]) + + for doc_id in retrieval_dict["doc_ID"]: + if doc_id not in self.doc_id_list: + self.doc_id_list.append(doc_id) + + for doc_fn in retrieval_dict["file_source"]: + if doc_fn not in self.doc_fn_list: + self.doc_fn_list.append(doc_fn) + + # QueryState(self).save_state(self.query_id) + + return self + + def load_query_state(self, query_id): + state = QueryState(self).load_state(query_id) + return self + + def save_query_state(self): + QueryState(self).save_state() + return self + + def clear_query_state(self): + # need to reset state variables + QueryState(self).initiate_new_state_session() + return self + + def dump_current_query_state(self): + + query_state_dict = {"query_id": self.query_id, + 
"query_history": self.query_history, + "results": self.results, + "doc_ID": self.doc_id_list, + "file_source": self.doc_fn_list + } + + return query_state_dict + + def query(self, query, query_type="text", result_count=20, results_only=True): + + output_result = {"results": [], "doc_ID": [], "file_source": []} + + if query_type not in ["text", "semantic"]: + logging.error("error: Query().query expects a query type of either 'text' or 'semantic'") + return output_result + + if query_type == "text": + output_result = self.text_query(query,result_count=result_count,results_only=results_only) + + if query_type == "semantic": + + # check that embedding model is available, and if not, flip to text search + + if not self.embedding_model: + self.load_embedding_model() + + if self.search_mode == "text" or not self.embedding_model: + output_result = self.text_query(query, result_count=result_count,results_only=results_only) + else: + output_result = self.semantic_query(query, result_count=result_count,results_only=results_only) + + return output_result + + # basic simple text query method - only requires entering the query + def text_query (self, query, exact_mode=False, result_count=20, exhaust_full_cursor=False, results_only=True): + + # prepare query if exact match required + if exact_mode: + query = self.exact_query_prep(query) + + # query the text collection + cursor = CollectionRetrieval(self.library.collection).basic_query(query) + + # package results, with correct sample counts and output keys requested + results_dict = self._cursor_to_qr(query, cursor,result_count=result_count,exhaust_full_cursor= + exhaust_full_cursor) + + if results_only: + return results_dict["results"] + + return results_dict + + def text_query_with_document_filter(self, query, doc_filter, result_count=20, exhaust_full_cursor=False, + results_only=True): + + key = None + value_range = [] + + if "doc_ID" in doc_filter: + key = "doc_ID" + value_range = doc_filter["doc_ID"] + + elif "file_source" in doc_filter: + key = "file_source" + value_range = doc_filter["file_source"] + + else: + logging.warning("warning: Query - expected to receive document filter with keys of 'doc_ID' or " + "'file_source' - as a safe fall-back - will run the requested query without a filter.") + + if key: + cursor = CollectionRetrieval(self.library.collection). 
\ + text_search_with_key_value_range(query, key, value_range) + else: + # as fallback, if no key found, then run query without filter + cursor = CollectionRetrieval(self.library.collection).basic_query(query) + + result_dict = self._cursor_to_qr(query, cursor, result_count=result_count, + exhaust_full_cursor=exhaust_full_cursor) + + if results_only: + return result_dict["results"] + + return result_dict + + def text_query_by_content_type (self, query, content_type,results_only=True): + + filter_dict = {"content_type": content_type} + retrieval_dict = self.text_query_with_custom_filter(query,filter_dict,results_only=True) + return retrieval_dict + + def image_query(self, query, results_only=True): + + filter_dict = {"content_type": "image"} + retrieval_dict = self.text_query_with_custom_filter(query, filter_dict,results_only=True) + return retrieval_dict + + def table_query(self, query, export_tables_to_csv=False, results_only=True): + + filter_dict = {"content_type": "table"} + retrieval_dict = self.text_query_with_custom_filter(query, filter_dict,results_only=True) + + # output and write tables to csv files + if export_tables_to_csv: + for i, entry in enumerate(retrieval_dict["results"]): + f = self.export_one_table_to_csv(entry,output_fp=LLMWareConfig.get_query_path(), + output_fn="table_{}.csv".format(i)) + + logging.warning("update: csv created - %s - %s", LLMWareConfig.get_query_path(),f) + + return retrieval_dict + + def text_search_by_page (self, query, page_num=1, results_only=True): + + key = "master_index" # parsing uses "master_index" across multiple input sources, interpret as "page_num" + + if not isinstance(page_num, list): + page_num = [page_num] + + cursor_results = CollectionRetrieval(self.library.collection).\ + text_search_with_key_value_range(query, key, page_num) + + retrieval_dict = self._cursor_to_qr(query, cursor_results) + + if results_only: + return retrieval_dict["results"] + + return retrieval_dict + + def text_query_by_author_or_speaker(self, query, author_or_speaker, results_only=True): + + filter_dict = {"author_or_speaker": author_or_speaker} + retrieval_dict = self.text_query_with_custom_filter(query,filter_dict,results_only=results_only) + return retrieval_dict + + def text_query_with_custom_filter (self, query, filter_dict, result_count=20, + exhaust_full_cursor=False, results_only=True): + + # filter_dict is a dict with indefinite number of key:value pairs - each key will be interpreted + # as "$and" in the query, requiring a match against all of the key:values in the filter_dict + + # validate filter dict + validated_filter_dict = {} + for key, values in filter_dict.items(): + for valid_keys in self.library.default_keys: + if key in valid_keys: + validated_filter_dict.update({key:values}) + + if validated_filter_dict: + cursor = CollectionRetrieval(self.library.collection).\ + text_search_with_key_value_dict_filter(query,validated_filter_dict) + + else: + logging.error("error: Query text_query_with_custom_filter - keys in filter_dict are not" + "recognized as part of the library.collection default_keys list.") + + return -1 + + result_dict = self._cursor_to_qr_with_secondary_filter(query, cursor,filter_dict, + result_count=result_count, + exhaust_full_cursor=exhaust_full_cursor) + + if results_only: + return result_dict["results"] + + return result_dict + + def _cursor_to_qr_with_secondary_filter(self, query, cursor_results, filter_dict, + result_count=20, exhaust_full_cursor=False): + + qr = [] + counter = 0 + doc_id_list = [] + doc_fn_list = [] + + 
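+        # each raw cursor row below is first enriched with match / locator fields and then
+        # checked against filter_dict - a filter value may be a scalar (exact match required)
+        # or a list (membership match, e.g., doc_ID in [5, 9, 13])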
for raw_qr in cursor_results: + + # update to locate match and add to result + matches_found = self.locate_query_match(query, raw_qr["text"]) + raw_qr.update({"matches": matches_found}) + raw_qr.update({"page_num": raw_qr["master_index"]}) + + raw_qr.update({"_id": str(raw_qr["_id"])}) + + if "score" not in raw_qr: + raw_qr.update({"score": 0.0}) + + if "similarity" not in raw_qr: + raw_qr.update({"similarity": 0.0}) + + if "distance" not in raw_qr: + raw_qr.update({"distance": 0.0}) + + # apply secondary filter dict + match = -1 + for key, value in filter_dict.items(): + if key in raw_qr: + # support case in which filter_dict is a list, e.g., doc_id = {5,9,13} + if raw_qr[key] == value or (isinstance(value,list) and raw_qr[key] in value): + match = 1 + else: + match = -1 + break + + if match == 1: + + # output target keys + output_dict = {} + output_dict.update({"query": query}) + + for key in self.query_result_return_keys: + if key not in raw_qr: + logging.warning("warning: Query() - selected output key not found in result - %s ", key) + else: + output_dict.update({key: raw_qr[key]}) + + output_dict.update({"account_name": self.account_name}) + output_dict.update({"library_name": self.library_name}) + + qr.append(output_dict) + + if raw_qr["doc_ID"] not in doc_id_list: + doc_id_list.append(raw_qr["doc_ID"]) + doc_fn_list.append(raw_qr["file_source"]) + + counter += 1 + + # will exhaust full cursor if .exhaust_full_cursor = True + if counter >= result_count and not exhaust_full_cursor: + break + + qr_dict = {"query": query, "results": qr, "doc_ID": doc_id_list, "file_source": doc_fn_list} + + if self.save_history: + self.register_query(qr_dict) + + return qr_dict + + def _cursor_to_qr (self, query, cursor_results, result_count=20, exhaust_full_cursor=False): + + qr = [] + counter = 0 + doc_id_list = [] + doc_fn_list = [] + + for raw_qr in cursor_results: + + # update to locate match and add to result + matches_found = self.locate_query_match(query, raw_qr["text"]) + raw_qr.update({"matches": matches_found}) + raw_qr.update({"page_num": raw_qr["master_index"]}) + + raw_qr.update({"_id": str(raw_qr["_id"])}) + + if "score" not in raw_qr: + raw_qr.update({"score": 0.0}) + + if "similarity" not in raw_qr: + raw_qr.update({"similarity": 0.0}) + + if "distance" not in raw_qr: + raw_qr.update({"distance": 0.0}) + + # output target keys + output_dict = {} + output_dict.update({"query": query}) + + for key in self.query_result_return_keys: + if key not in raw_qr: + logging.warning("warning: Query() - selected output key not found in result - %s ", key) + else: + output_dict.update({key: raw_qr[key]}) + + output_dict.update({"account_name": self.account_name}) + output_dict.update({"library_name": self.library_name}) + + qr.append(output_dict) + + if raw_qr["doc_ID"] not in doc_id_list: + doc_id_list.append(raw_qr["doc_ID"]) + doc_fn_list.append(raw_qr["file_source"]) + + counter += 1 + + # will exhaust full cursor if .exhaust_full_cursor = True + if counter >= result_count and not exhaust_full_cursor: + break + + qr_dict = {"query": query,"results": qr, "doc_ID": doc_id_list, "file_source": doc_fn_list} + + if self.save_history: + self.register_query(qr_dict) + + return qr_dict + + # basic semantic query + def semantic_query (self, query, result_count=20, embedding_distance_threshold=None, results_only=True): + + if not embedding_distance_threshold: + embedding_distance_threshold = self.semantic_distance_threshold + + self.load_embedding_model() + + # will run semantic query and get blocks by 
similiarity + self.query_embedding = self.embedding_model.embedding(query) + + if self.embedding_db and self.embedding_model: + + semantic_block_results = self.embeddings.search_index(self.query_embedding, + embedding_db=self.embedding_db, + model=self.embedding_model, + sample_count=result_count) + + else: + logging.error("error: Query - embedding record does not indicate embedding db - %s " + "and/or embedding model - %s ", self.embedding_db, self.embedding_model) + + raise UnsupportedEmbeddingDatabaseException(self.embedding_db) + + qr_raw = [] + + # may need to conform the output structure of semantic_block_results + for i, blocks in enumerate(semantic_block_results): + + # assume that each block has at least two components: [0] core mongo block, and [1] distance metric + if blocks[1] < embedding_distance_threshold: + + blocks[0]["distance"] = blocks[1] + blocks[0]["semantic"] = "semantic" + blocks[0]["score"] = 0.0 + + qr_raw.append(blocks[0]) + + # pick up with boilerplate + results_dict = self._cursor_to_qr (query, qr_raw,result_count=result_count) + + if results_only: + return results_dict["results"] + + return results_dict + + # basic semantic query + def semantic_query_with_document_filter(self, query, filter_dict, embedding_distance_threshold=None, + result_count=100, results_only=True): + + # checks for filter to offer option to do semantic query in specific doc, page or content range + if not embedding_distance_threshold: + embedding_distance_threshold = self.semantic_distance_threshold + + # note: by default, retrieves a much larger set of results to try to account for filter + + th = self.semantic_distance_threshold + + # run semantic query + + self.query_embedding = self.embedding_model.embedding(query) + + if self.embedding_db and self.embedding_model: + semantic_block_results = self.embeddings.search_index(self.query_embedding, + embedding_db=self.embedding_db, + model=self.embedding_model, + sample_count=result_count) + + else: + logging.error("error: Query - embedding record does not indicate embedding db- %s and/or " + "an embedding_model - %s ", self.embedding_db, self.embedding_model) + + raise UnsupportedEmbeddingDatabaseException(self.embedding_db) + + qr_raw = [] + + # may need to conform the output structure of semantic_block_results + for i, blocks in enumerate(semantic_block_results): + # assume that each block has at least two components: [0] core mongo block, and [1] distance metric + if blocks[1] < embedding_distance_threshold: + + blocks[0].update({"distance": blocks[1]}) + blocks[0].update({"semantic": "semantic"}) + blocks[0].update({"score": 0.0}) + + qr_raw.append(blocks[0]) + + result_output = self._cursor_to_qr_with_secondary_filter(query,qr_raw,filter_dict,result_count=result_count) + + if results_only: + return result_output["results"] + + return result_output + + def similar_blocks_embedding(self, block, result_count=20, embedding_distance_threshold=10, results_only=True): + + # will use embedding to find similar blocks from a given block + block_ev = self.embedding_model.embedding(block["text"]) + + if self.embedding_model and self.embedding_db: + semantic_block_results = self.embeddings.search_index(self.query_embedding, + embedding_db=self.embedding_db, + model=self.embedding_model, + sample_count=result_count) + + else: + logging.error("error: Query - embedding record does not indicate embedding db- %s and/or " + "embedding model - %s ", self.embedding_db, self.embedding_model) + + raise UnsupportedEmbeddingDatabaseException(self.embedding_db) 
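+        # as in semantic_query above, each result is expected to be a (block, distance) pair,
+        # and only pairs under embedding_distance_threshold (default 10 for this method) are kept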
+ + qr_raw = [] + + # may need to conform the output structure of semantic_block_results + for i, blocks in enumerate(semantic_block_results): + # assume that each block has at least two components: [0] core mongo block, and [1] distance metric + if blocks[1] < embedding_distance_threshold: + + blocks[0].update({"distance": blocks[1]}) + blocks[0].update({"semantic": "semantic"}) + blocks[0].update({"score": 0.0}) + + qr_raw.append(blocks[0]) + + # pick up with boilerplate + results_dict = self._cursor_to_qr("", qr_raw, result_count=result_count) + + if results_only: + return results_dict["results"] + + return results_dict + + def dual_pass_query (self, query, result_count=20, primary="text", safety_check=True,results_only=True): + + # safety check + if safety_check and result_count > 100: + + logging.warning("warning: Query().dual_pass_query runs a comparison of output rankings using semantic " + "and text. This particular implementation is not optimized for sample lists longer " + "than ~100 X 100. To remove this warning, there are two options - (1) set the " + "safety_check to False in the method declaration, or (2) keep sample count below 100.") + + result_count = 100 + + # run dual pass - text + semantic + retrieval_dict_text = self.text_query(query, result_count=result_count,results_only=True) + retrieval_dict_semantic = self.semantic_query(query, result_count=result_count,results_only=True) + + if primary == "text": + first_list = retrieval_dict_text + second_list = retrieval_dict_semantic + else: + first_list = retrieval_dict_semantic + second_list = retrieval_dict_text + + confirming_list = [] + primary_only = [] + secondary_only = [] + matched_second_list = [] + + # this is the time intensive "n-squared" loop - probably OK up to 100+ + + for i, entry in enumerate(first_list): + match = -1 + for j, entry2 in enumerate(second_list): + if entry["_id"] == entry2["_id"]: + confirming_list.append(entry) + match = 1 + matched_second_list.append(entry2["_id"]) + break + if match == -1: + primary_only.append(entry) + + for k, entry2 in enumerate(second_list): + if entry2["_id"] not in matched_second_list: + secondary_only.append(entry2) + + # assemble merged top results + merged_results = [] + merged_results += confirming_list + + select_primary = min(len(primary_only),5) + select_secondary = min(len(secondary_only),5) + + merged_results += primary_only[0:select_primary] + merged_results += secondary_only[0:select_secondary] + + doc_id_list = [] + doc_fn_list = [] + + for qr in merged_results: + if qr["doc_ID"] not in doc_id_list: + doc_id_list.append(qr["doc_ID"]) + if qr["file_source"] not in doc_fn_list: + doc_fn_list.append(qr["file_source"]) + + retrieval_dict = {"results": merged_results, + "text_results": retrieval_dict_semantic, + "semantic_results": retrieval_dict_semantic, + "doc_ID": doc_id_list, + "file_source": doc_fn_list} + + if results_only: + return merged_results + + return retrieval_dict + + def augment_qr (self, query_result, query_topic, augment_query="semantic"): + + if augment_query == "semantic": + qr_aug = self.semantic_query(query_topic,result_count=20, results_only=True) + else: + qr_aug = self.text_query(query_topic,result_count=20, results_only=True) + + # consolidate the qr lists + updated_qr = [] + for qr in query_result: + updated_qr.append(qr) # start with original qr list + + # add up to 10 entries from semantic list + semantic_return_max = 10 + + for j, sem_entries in enumerate(qr_aug): + if sem_entries not in updated_qr: + updated_qr.append(sem_entries) 
+ if j > semantic_return_max: + break + + return updated_qr + + def apply_semantic_ranking(self, qr, issue_semantic): + + # designed to take a set of query results, and re-rank the order of results by their semantic distance + # --note: possible to use a different query term for issue_semantic than the original query result + + # heuristic - look for result targets of at least 20, but up to the exact len of the qr + result_target = max(len(qr),20) + + semantic_qr = self.semantic_query(issue_semantic,result_count=result_target) + + reranked_qr = [] + for i, s in enumerate(semantic_qr): + + for q in qr: + if s["_id"] == q["_id"]: + reranked_qr.append(q) + break + + for q in qr: + if q not in reranked_qr: + reranked_qr.append(q) + + return reranked_qr + + def document_filter (self, filter_topic, query_mode="text", result_count=30, + exact_mode = False, exhaust_full_cursor=True): + + result_dict = None + + if query_mode not in ["text", "semantic", "hybrid"]: + + logging.error("error: Query document_filter supports query types - 'text', " + "'semantic', and 'hybrid' - type selected not recognized - %s ", query_mode) + + return result_dict + + if query_mode == "text": + result_dict = self.text_query(filter_topic,exact_mode=exact_mode,result_count=result_count, + exhaust_full_cursor=exhaust_full_cursor,results_only=False) + + if query_mode == "semantic": + result_dict = self.semantic_query(filter_topic,result_count=result_count, results_only=False) + + if query_mode == "hybrid": + result_dict = self.dual_pass_query(filter_topic) + + if not result_dict: + + logging.error("error: Query file_selector_only could not find a result - unexpected error - %s ", + filter_topic) + + return result_dict + + doc_filter_output = {"doc_ID": result_dict["doc_ID"], "file_source": result_dict["file_source"]} + + return doc_filter_output + + def page_lookup(self, page_list=None, doc_id_list=None, text_only=False): + + doc_id = doc_id_list + page = page_list + + if text_only: + page_dict = {"doc_ID": doc_id, "master_index": page, "content_type": "text"} + else: + page_dict = {"doc_ID": doc_id, "master_index": page} + + cursor_results = CollectionRetrieval(self.library.collection).filter_by_key_dict(page_dict) + + counter = 0 + output = [] + + for x in cursor_results: + x.update({"matches": []}) + x.update({"page_num": x["master_index"]}) + + output.append(x) + counter += 1 + if counter > 10: + break + + return output + + # new method to extract whole library + def get_whole_library(self, selected_keys=None): + + match_results = CollectionRetrieval(self.library.collection).get_whole_collection() + + qr = [] + + # option to retrieve only user selected keys + if not selected_keys: + selected_keys = self.library.default_keys + + for i, block in enumerate(match_results): + + new_row = {} + new_row.update({"_id": str(block["_id"])}) + new_row.update({"matches": []}) + new_row.update({"page_num": block["master_index"]}) + new_row.update({"score": 0.0}) + new_row.update({"similarity": 0.0}) + new_row.update({"distance": 0.0}) + + for keys in selected_keys: + if keys in block: + if keys not in new_row: + new_row.update({keys:block[keys]}) + + qr.append(new_row) + + return qr + + # new method to generate csv files for each table entry + def export_all_tables(self, query="", output_fp=None): + + table_csv_files_created = [] + + if not output_fp: + output_fp = self.library.misc_path + + if not query: + + match_results = CollectionRetrieval(self.library.collection).filter_by_key("content_type","table") + + else: + kv_dict = 
{"content_type": "table"} + match_results = CollectionRetrieval(self.library.collection).\ + text_search_with_key_value_dict_filter(query,kv_dict) + + counter = 0 + + for i, entries in enumerate(match_results): + + table = entries["table"] + + output = [] + + table_raw = table + rows = table_raw.split("") + cols_tracker = [] + coords_master = [] + + for row in rows: + + new_row = [] + if row.strip().endswith(""): + row = row.strip()[:-5] + + cells = row.lstrip().split("") + cols_count = 0 + coords = [] + + for c in cells: + + if c.strip().endswith(""): + c = c.strip()[:-5] + + clean_cell = "" + bracket_on = 0 + + fields = c.split("<") + + if fields[0]: + index = fields[1].rstrip()[0:-1] + + main_entry = fields[2].split(">") + value = main_entry[-1] + + co = main_entry[0].split(" ") + + if len(co) > 2: + x = co[1] + y = co[2] + + coords.append((int(x), int(y))) + + for c1 in c: + if bracket_on == 0 and c1 not in ("<", ">"): + clean_cell += c1 + if c1 == "<": + bracket_on = 1 + if c1 == ">": + bracket_on = 0 + + if c: + c_strip = c.split(">")[-1] + new_row.append(c_strip.strip()) + cols_count += 1 + + coords_master.append(coords) + cols_tracker.append(cols_count) + output.append(new_row) + + new_file = "table_{}.csv".format(counter) + + counter += 1 + f = Utilities().file_save(output, output_fp, new_file) + output = [] + + table_csv_files_created.append(new_file) + + output_dict = {"library": self.library_name, "query": query, "tables_created": counter, + "file_names": table_csv_files_created, "output_fp": output_fp} + + return output_dict + + def export_one_table_to_csv(self, query_result, output_fp=None, output_fn=None): + + table = query_result["table"] + + output = [] + + table_raw = table + rows = table_raw.split("") + cols_tracker = [] + coords_master = [] + + for row in rows: + + new_row = [] + if row.strip().endswith(""): + row = row.strip()[:-5] + + cells = row.lstrip().split("") + cols_count = 0 + coords = [] + + for c in cells: + + if c.strip().endswith(""): + c = c.strip()[:-5] + + clean_cell = "" + bracket_on = 0 + + fields = c.split("<") + + if fields[0]: + index = fields[1].rstrip()[0:-1] + main_entry = fields[2].split(">") + value = main_entry[-1] + co = main_entry[0].split(" ") + if len(co) > 2: + x = co[1] + y = co[2] + coords.append((int(x), int(y))) + + for c1 in c: + if bracket_on == 0 and c1 not in ("<", ">"): + clean_cell += c1 + if c1 == "<": + bracket_on = 1 + if c1 == ">": + bracket_on = 0 + + if c: + c_strip = c.split(">")[-1] + new_row.append(c_strip.strip()) + cols_count += 1 + coords_master.append(coords) + cols_tracker.append(cols_count) + output.append(new_row) + + if not output_fn: + new_file = "table_0.csv" + else: + new_file = output_fn + + f = Utilities().file_save(output, output_fp, new_file) + + return new_file + + def list_doc_id(self): + + # utility function - returns list of all doc_ids in the library + doc_id_list = CollectionRetrieval(self.library.collection).get_distinct_list("doc_ID") + + return doc_id_list + + def list_doc_fn(self): + + # utility function -returns list of all document names in the library + doc_fn_raw_list = CollectionRetrieval(self.library.collection).get_distinct_list("file_source") + + doc_fn_out = [] + for i, file in enumerate(doc_fn_raw_list): + doc_fn_out.append(file.split("/")[-1]) + return doc_fn_out + + def block_lookup(self, block_id, doc_id): + + result = None + + kv_dict = {"doc_ID": doc_id, "block_ID": block_id} + + output = CollectionRetrieval(self.library.collection).filter_by_key_dict(kv_dict) + + if len(output) 
== 0: + logging.info("update: Query - Library - block_lookup - block not found: %s ", block_id) + result = None + + if len(output) > 1: + result = output[0] + + if len(output) == 1: + result = output[0] + + result.update({"matches": []}) + result.update({"page_num": result["master_index"]}) + + return result + + def get_header_text_from_collection(self, text_field="header_text"): + + ds_folder = self.library.nlp_path + + results = CollectionRetrieval(self.library.collection).get_whole_collection() + + f = open(ds_folder + "header_text.txt", "w") + counter = 0 + for elements in results: + text_sample = elements[text_field] + if text_sample: + f.write(text_sample) + f.write("\n") + f.write(elements["text"]) + f.write("\n") + counter += 1 + + f.close() + results.close() + return counter + + def get_core_text_from_collection(self, text_field="text"): + + ds_folder = self.library.nlp_path + + results = CollectionRetrieval(self.library.collection).get_whole_collection() + + f = open(os.path.join(ds_folder,"core_text.txt"), "w") + counter = 0 + for elements in results: + text_sample = elements[text_field] + if text_sample: + f.write(text_sample) + f.write("\n") + counter += 1 + + f.close() + results.close() + return counter + + def get_user_tags(self): + + # look for all non-empty user_tags + output = CollectionRetrieval(self.library.collection).filter_by_key_ne_value("user_tags", "") + + counter = 0 + user_tags_out = [] + for elements in output: + counter += 1 + user_tags_out.append((elements["block_ID"], elements["user_tags"])) + + return user_tags_out + + def filter_by_time_stamp (self, qr, first_date="", last_date=""): + + # apply filter dict to the qr results found + time_str = "%Y-%m-%d" + if first_date: + first_date = datetime.strptime(first_date,time_str) + + if last_date: + last_date = datetime.strptime(last_date, time_str) + + filtered_qr = [] + + for i, entry in enumerate(qr): + + if entry["added_to_collection"]: + + time_str="%a %b %d %H:%M:%S %Y" + + doc_date = datetime.strptime(entry["added_to_collection"], time_str) + + time_accept = self._time_window_filter(first_date,last_date,doc_date) + + if time_accept: + filtered_qr.append(entry) + + return filtered_qr + + def _time_window_filter(self, start_time,end_time, test_time, time_str="%a %b %d %H:%M:%S %Y"): + + if start_time and end_time: + if start_time <= test_time <= end_time: + return True + + if start_time and not end_time: + if start_time <= test_time: + return True + + if not start_time and end_time: + if test_time <= end_time: + return True + + return False + + def locate_query_match (self, query, core_text): + + matches_found = [] + + b = CorpTokenizer(one_letter_removal=False, remove_stop_words=False, remove_punctuation=False, + remove_numbers=False) + + query_tokens = b.tokenize(query) + + for x in range(0, len(core_text)): + match = 0 + for key_term in query_tokens: + if key_term.startswith('"'): + key_term = key_term[1:-1] + + if core_text[x].lower() == key_term[0].lower(): + match += 1 + if (x + len(key_term)) <= len(core_text): + for y in range(1, len(key_term)): + if key_term[y].lower() == core_text[x + y].lower(): + match += 1 + else: + match = -1 + break + + if match == len(key_term): + new_entry = [x, key_term] + matches_found.append(new_entry) + + return matches_found + + def exact_query_prep(self, query): + + if query.startswith('"') and query.endswith('"'): + prepared_query = '\"' + query[1:-1] + '\"' + + else: + # even if user did not wrap in quotes, treat as exact search + prepared_query = '\"' + query + '\"' 
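+            # wrapping the query in escaped double quotes is treated as an exact-phrase
+            # search by the underlying text index (e.g., MongoDB $text) when the query runs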
+ + return prepared_query + + def bibliography_builder_from_qr(self, query_results): + + bibliography = [] + doc_id_reviewed = [] + doc_fn_reviewed = [] + + # first - assemble the list of docs in the query_results + for y in range(0,len(query_results)): + if "doc_ID" in query_results[y]: + if query_results[y]["doc_ID"] not in doc_id_reviewed: + doc_id_reviewed.append(query_results[y]["doc_ID"]) + doc_fn_reviewed.append(query_results[y]["file_source"]) + + # second - identify and sort the key pages associated with the doc + for x in range(0,len(doc_id_reviewed)): + pages_reviewed = [] + for z in range(0,len(query_results)): + if "doc_ID" in query_results[z]: + if query_results[z]["doc_ID"] == doc_id_reviewed[x]: + pages_reviewed.append(query_results[z]["page_num"]) + + pr = Counter(pages_reviewed) + mc = pr.most_common() + page_output_list = [] + for m in mc: + page_output_list.append(m[0]) + + if len(doc_fn_reviewed) > x: + doc_fn_tmp = doc_fn_reviewed[x] + else: + doc_fn_tmp = "Doc# " + str(doc_id_reviewed[x]) + + bibliography.append({doc_fn_tmp:page_output_list}) + + return bibliography + + def filter_cursor_list(self, cursor, filter_dict, sample_count=20, exhaust_full_cursor=None): + + validated_filter_dict = self.prep_validated_filter_dict(filter_dict) + result_output = [] + + for i, entry in enumerate(cursor): + + for key, value in validated_filter_dict.items(): + if key not in entry: + logging.warning("warning: Query - retrieval cursor does not contain filter key - %s ", key) + else: + if entry[key] == value: + result_output.append(entry) + + if len(result_output) > sample_count and not exhaust_full_cursor: + break + + return result_output + + def prep_validated_filter_dict(self, filter_dict): + + validated_filter_dict = {} + + for key, values in filter_dict.items(): + if key in self.library.default_keys: + validated_filter_dict.update({key:values}) + else: + logging.warning("warning: Query - filter key not in library collection - %s ", key) + + return validated_filter_dict + + def block_lookup_by_collection_id(self, _id): + # specific to Mongo lookup - uses mongo '_id' which needs to be wrapped in ObjectId + return CollectionRetrieval(self.library.collection).filter_by_key("_id", ObjectId(_id)) + + def compare_text_blocks(self, t1, t2): + + b = CorpTokenizer(one_letter_removal=True, remove_numbers=True, remove_stop_words=True) + tokens1 = b.tokenize(t1) + tokens2 = b.tokenize(t2) + match_per = 0 + match = 0 + + for x in range(0, len(tokens1)): + for y in range(0, len(tokens2)): + if tokens1[x].lower() == tokens2[y].lower(): + match += 1 + break + + if len(tokens1) > 0: + match_per = match / len(tokens1) + + return match_per + + def block_similarity_retrieval_more_like_this (self, target_text, qr, similarity_threshold=0.25): + + # will rank and order a list of query results using a target text as the reference point + output = [] + + for i, block in enumerate(qr): + + compare_text = block["text"] + similarity = self.compare_text_blocks(target_text, compare_text) + + if similarity > similarity_threshold: + block.update({"similarity": similarity}) + + output.append(block) + + output = sorted(output, key=lambda x:x["similarity"], reverse=True) + + return output + + def build_doc_id_fn_list(self, qr): + + doc_id_list = [] + fn_list = [] + + for q in qr: + if q["doc_ID"] not in doc_id_list: + doc_id_list.append(q["doc_ID"]) + fn_list.append(q["file_source"]) + + return doc_id_list, fn_list + + def expand_text_result_before(self, block, window_size=400): + + block_id = block["block_ID"] -1 
+ doc_id = block["doc_ID"] + + before_text = "" + pre_blocks = [] + + while len(before_text) < window_size and block_id >= 0: + + before_block = self.block_lookup(block_id, doc_id) + + if before_block: + before_text += before_block["text"] + pre_blocks.append(before_block) + + output = {"expanded_text": before_text, "results": pre_blocks} + + return output + + def expand_text_result_after (self, block, window_size=400): + + block_id = block["block_ID"] + 1 + doc_id = block["doc_ID"] + + after_text = "" + post_blocks = [] + + while len(after_text) < window_size and block_id >= 0: + + after_block = self.block_lookup(block_id, doc_id) + + if after_block: + after_text += after_block["text"] + post_blocks.append(after_block) + + output = {"expanded_text": after_text, "results": post_blocks} + + return output + + def generate_csv_report(self): + output = QueryState(self).generate_query_report_current_state() + return output + diff --git a/llmware/setup.py b/llmware/setup.py new file mode 100644 index 00000000..c98f2bb5 --- /dev/null +++ b/llmware/setup.py @@ -0,0 +1,58 @@ + +# Copyright 2023 llmware + +# Licensed under the Apache License, Version 2.0 (the "License"); you +# may not use this file except in compliance with the License. You +# may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. + + +import shutil +import os +from llmware.resources import CloudBucketManager +from llmware.configs import LLMWareConfig +from llmware.library import Library +from llmware.retrieval import Query + +from python_on_whales import DockerClient + +import subprocess +import logging + + +class Setup: + + @staticmethod + def load_sample_files(): + + # changed name from demo to 'sample_files' + # simplified: no user config - pulls into llmware_path + + if not os.path.exists(LLMWareConfig.get_llmware_path()): + LLMWareConfig.setup_llmware_workspace() + + # not configurable - will pull into /sample_files under llmware_path + sample_files_path = os.path.join(LLMWareConfig.get_llmware_path(), "sample_files") + + if not os.path.exists(sample_files_path): + os.makedirs(sample_files_path,exist_ok=True) + else: + logging.info("update: sample_files path already exists - %s ", sample_files_path) + + # pull from sample files bucket + bucket_name = LLMWareConfig().get_config("llmware_sample_files_bucket") + remote_zip = bucket_name + ".zip" + local_zip = os.path.join(sample_files_path, bucket_name + ".zip") + + CloudBucketManager().pull_file_from_public_s3(remote_zip, local_zip, bucket_name) + shutil.unpack_archive(local_zip, sample_files_path, "zip") + os.remove(local_zip) + + return sample_files_path diff --git a/llmware/util.py b/llmware/util.py new file mode 100644 index 00000000..0ffe374c --- /dev/null +++ b/llmware/util.py @@ -0,0 +1,4957 @@ + +# Copyright 2023 llmware + +# Licensed under the Apache License, Version 2.0 (the "License"); you +# may not use this file except in compliance with the License. 
You +# may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. + + +import csv +from collections import Counter +import sys +import os +import random +import urllib.parse +import platform +import sysconfig +from pathlib import Path +from PIL import Image +import json +from zipfile import ZipFile, ZIP_DEFLATED +import numpy as np +import re +from tokenizers import Tokenizer +from word2number import w2n +from datetime import datetime +import time +from ctypes import * +import logging +import requests +import uuid + + +from wikipediaapi import Wikipedia, ExtractFormat +import yfinance + +from llmware.resources import CollectionRetrieval, CollectionWriter, PromptState, CloudBucketManager +from llmware.configs import LLMWareConfig +from llmware.exceptions import ModelNotFoundException, DependencyNotInstalledException, \ + FilePathDoesNotExistException, LibraryObjectNotFoundException + + +class Utilities: + + def __init__(self, library=None): + self.start = 0 + self.library = library + + def get_default_tokenizer(self): + + # gpt2 tokenizer is used in several places as a default tokenizer + + # check for llmware path & create if not already set up + if not os.path.exists(LLMWareConfig.get_llmware_path()): + + # if not explicitly set up by user, then create folder directory structure + LLMWareConfig.setup_llmware_workspace() + + # first, check if it is in the local repo + local_model_repo = LLMWareConfig.get_model_repo_path() + models = os.listdir(local_model_repo) + + if "gpt2" not in models: + + # if not found locally, then pull from global repo + + logging.info("update: gpt2 tokenizer used as default - not in local model repository, so pulling " + "from global repo - this may take a few seconds the first time to download.") + + files = CloudBucketManager().pull_single_model_from_llmware_public_repo(model_name="gpt2") + + # quick check to confirm that model is present + models = os.listdir(local_model_repo) + if "gpt2" not in models: + raise ModelNotFoundException("gpt2_tokenizer") + + tokenizer = Tokenizer.from_file(os.path.join(local_model_repo, "gpt2", "tokenizer.json")) + + return tokenizer + + def load_tokenizer_from_file(self, fp): + tokenizer = Tokenizer.from_file(fp) + return tokenizer + + def get_uuid(self): + # uses unique id creator from uuid library + return uuid.uuid4() + + @staticmethod + def file_save (cfile, file_path, file_name): + + max_csv_size = 20000 + csv.field_size_limit(max_csv_size) + + out_file = os.path.join(file_path, file_name) + + with open(out_file, 'w', newline='') as csvfile: + c = csv.writer(csvfile, dialect='excel', doublequote=False, delimiter=',',escapechar = ']') + + for z in range(0, len(cfile)): + # intercept a line too large here + if sys.getsizeof(cfile[z]) < max_csv_size: + c.writerow(cfile[z]) + else: + logging.error("error: CSV ERROR: Row exceeds MAX SIZE: %s %s", sys.getsizeof(cfile[z]) + ,cfile[z]) + + csvfile.close() + + return 0 + + @staticmethod + def file_load (in_path): + record_file = open(in_path, encoding='ISO-8859-1') + c = csv.reader(record_file, dialect='excel', doublequote=False, delimiter=',') + output = [] + for lines in c: + output.append(lines) + record_file.close() + + return output + + 
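+    # usage sketch (illustrative - paths and rows are hypothetical): file_save and file_load
+    # above form a simple csv round trip -
+    #   rows = [["doc", "page", "text"], ["agreement.pdf", 1, "First page text"]]
+    #   Utilities().file_save(rows, "/tmp/reports", "sample.csv")
+    #   rows_back = Utilities().file_load("/tmp/reports/sample.csv")
+    # note: file_save skips any row whose in-memory size exceeds max_csv_size (20,000 bytes)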
@staticmethod + def csv_save(rows, file_dir, file_name): + + full_path = Path(file_dir, file_name) + + with full_path.open('w', encoding='utf-8') as out: + writer = csv.writer(out) + try: + writer.writerows(rows) + except csv.Error as e: + logging.exception("Exception writing csv file") + return False + + return True + + @staticmethod + def get_top_bigrams (tokens, top_n): + + bigrams = [] + for z in range(1, len(tokens)): + entry = (tokens[z-1] + "_" + tokens[z]) + bigrams.append(entry) + + d = Counter(bigrams) + dc = d.most_common(top_n) + + return dc + + @staticmethod + def get_top_trigrams (tokens, top_n): + + trigrams = [] + for z in range(2 ,len(tokens)): + entry = (tokens[ z -2] + "_" + tokens[ z -1] + "_" + tokens[z]) + trigrams.append(entry) + + d = Counter(trigrams) + dc = d.most_common(top_n) + + return dc + + @staticmethod + def get_top_4grams (tokens, top_n): + + four_grams = [] + for z in range(3 ,len(tokens)): + entry = (tokens[ z -3 ]+ "_" + tokens[ z -2] + "_" + tokens[ z -1] + "_" + tokens[z]) + four_grams.append(entry) + + d = Counter(four_grams) + dc = d.most_common(top_n) + + return dc + + @staticmethod + def compare_timestamps (t1, t2, time_str="%a %b %d %H:%M:%S %Y"): + + t1_obj = datetime.strptime(t1, time_str) + t2_obj = datetime.strptime(t2, time_str) + + time_delta_obj = t1_obj - t2_obj + + days = time_delta_obj.days + seconds = time_delta_obj.seconds + + return time_delta_obj, days, seconds + + @staticmethod + def get_current_time_now (time_str="%a %b %e %H:%M:%S %Y"): + time_stamp = datetime.now().strftime(time_str) + return time_stamp + + @staticmethod + def get_time_string_standard(): + time_str_standard = "%a %b %e %H:%M:%S %Y" + return time_str_standard + + @staticmethod + def isfloat(num): + try: + float(num) + return True + except ValueError: + return False + + @staticmethod + def prep_filename_alt(filename_in, accepted_file_formats_list): + + success_code = 1 + + fn_toks = filename_in.split(".") + fn_base = fn_toks[0] + ext = fn_toks[-1] + + # only accept upload files with file extension in accepted_file_formats_list + if ext.lower() in accepted_file_formats_list and not filename_in.startswith("."): + + # prepend a random number to the front of the secure filename + + if len(fn_base) > 240: + # cap len of filename at 240 + filename_in = fn_base[0:240] + "." 
+ ext + + fn_out = str(random.randint(100000, 999999)) + "_" + filename_in + + else: + success_code = -1 + fn_out = filename_in + + return success_code, fn_out + + @staticmethod + def safe_url(string): + + try: + return urllib.parse.quote_plus(string) + except TypeError: + logging.exception("Error encoding string (%s)", string) + return "" + + @staticmethod + def get_stop_words_master_list(): + + stop_words = ["a", "able", "about","above","accordance","according", "accordingly","across","act","actually", + "added" ,"adj" ,"affected" ,"affecting" ,"affects" ,"after" ,"afterwards" ,"again" ,"against", + "ah","al" ,"all", "almost" ,"alone" ,"along" ,"already", "also" ,"although" ,"always" ,"am" , + "among" ,"amongst" ,"an","and","announce" ,"another" ,"any" ,"anybody" ,"anyhow" ,"anymore" , + "anyone" ,"anything" ,"anyway","anyways","anywhere" ,"apparently" ,"approximately" ,"are" , + "aren" ,"arent" ,"arise", "around", "as" ,"aside", "ask", "asked" ,"asking" ,"at" ,"auth", + "available" ,"away" ,"awfully" ,"b" ,"back", "basically" ,"be", "became" ,"because", + "become" ,"becomes", "becoming" ,"been", "before" ,"beforehand", "begin", "beginning" ,"beginnings", + "begins" ,"behind" ,"being" ,"believe" ,"below" ,"beside" ,"besides" ,"between" ,"beyond", "biol" + ,"both", "brief" ,"briefly" ,"but" ,"by" ,"c" ,"ca" ,"came" ,"can" ,"cannot" ,"can't" ,"cant" ,"cause" + ,"causes", "certain" ,"certainly" ,"co" ,"com" ,"come" ,"comes" ,"contain" ,"containing" ,"contains", + "could","couldnt", "d" ,"date" ,"did" ,"didnt" ,"didn't", "different" ,"do" ,"does" ,"doesn't", + "doesnt" ,"doing","done","don't" ,"dont" ,"down" ,"downwards" ,"due" ,"during" ,"e" ,"each" , + "ed","edu","effect","eg","e.g." ,"eight", "eighty" ,"either" ,"else" ,"elsewhere" ,"end" , + "ending" ,"enough" ,"especially" ,"et" ,"etal" ,"etc" ,"even","ever" ,"every" ,"everybody", + "everyone" ,"everything" ,"everywhere" ,"ex" ,"except" ,"f" ,"far" ,"few" ,"ff", "fifth", + "first" ,"five" ,"fix" ,"followed" ,"following" ,"follows" ,"for" ,"former" ,"formerly","forth", + "found" ,"four" ,"from" ,"further" ,"furthermore" ,"g" ,"gave" ,"generally" ,"get" ,"gets" ,"getting" + ,"give" ,"given", "gives" ,"giving" ,"go" ,"goes" ,"gone" ,"got" ,"gotten" ,"h" ,"had" ,"happens", + "hardly" ,"has","hasn't","have" ,"haven't" ,"having" ,"he" ,"hed" ,"hence" ,"her" ,"here", + "hereafter" ,"hereby" ,"herein","heres", "here's" ,"hereupon" ,"hers" ,"herself" ,"hes" ,"he's", + "hi" ,"hid" ,"him" ,"himself" ,"his" ,"hither" ,"home", "how" ,"howbeit" ,"however" ,"hundred", + "i" ,"id" ,"ie" ,"i.e." ,"if" ,"i'll" ,"ill" ,"im" ,"i'm" ,"immediate", "immediately" ,"importance", + "important" ,"in" ,"inc" ,"inc." ,"indeed" ,"index" ,"information","instead", "into", + "invention","inward" ,"is" ,"isn't" ,"isnt" ,"it" ,"itd" ,"it'll","its","it's" ,"itself" + ,"i've" ,"ive" ,"j", "just" ,"k" ,"keep" ,"keeps" ,"kept" ,"kg" ,"km" ,"know","known","knows", + "l","largely","last","lately", "later","latter","latterly","least","less","lest","let","lets", + "let's" ,"like" ,"liked","likely", "line" ,"little" ,"'ll" ,"look" ,"looking" ,"looks", + "ltd" ,"m" ,"made" ,"mainly" ,"make" ,"makes","many", "may" ,"maybe" ,"me" ,"mean" ,"means" , + "meantime" ,"meanwhile" ,"merely" ,"mg" ,"might" ,"million","miss", "ml" ,"more" ,"moreover", + "most" ,"mostly" ,"mr" ,"mr." ,"mrs" ,"mrs." ,"ms", "ms." 
,"much" ,"mug","must" ,"my" ,"myself", + "n" ,"na" ,"name" ,"namely" ,"nay" ,"nd" ,"near" ,"nearly" ,"necessarily" ,"necessary" ,"need" + ,"needs", "neither" ,"never""nevertheless" ,"new" ,"next" ,"nine" ,"ninety" ,"no" ,"nobody", + "non" ,"none","nonetheless","noone" ,"nor" ,"normally" ,"nos" ,"not" ,"note" ,"noted" , + "nothing" ,"now" ,"nowhere" ,"o" ,"obtain","obtained", "obviously" ,"of" ,"off" ,"often", + "oh" ,"ok" ,"okay" ,"old" ,"omit" ,"omitted" ,"on" ,"once" ,"one","ones","only" ,"onto" ,"or", + "ord" ,"other" ,"others" ,"otherwise" ,"ought" ,"our" ,"ours" ,"ourselves","out", + "outside" ,"over" ,"overall" ,"owing" ,"own" ,"p" ,"page" ,"pages" ,"part" ,"particular" + ,"particularly", "past" ,"per" ,"perhaps" ,"placed" ,"please" ,"plus" ,"poorly" ,"possible", + "possibly" ,"potentially","pp","predominantly" ,"present" ,"previously" ,"primarily","probably", + "promptly" ,"proud" ,"provide", "provides" ,"put" ,"q" ,"que" ,"quickly" ,"quite" ,"qv" , + "r" ,"ran" ,"rather" ,"rd" ,"re" ,"readily","really","recent" ,"recently" ,"ref" ,"refs", + "regarding" ,"regardless" ,"regards" ,"regard" ,"related","relative", "relatively" , + "research","respectively" ,"resulted" ,"resulting" ,"results" ,"right" ,"run" ,"s","said", + "same" ,"saw" ,"say" ,"saying" ,"says" ,"see" ,"seeing" ,"seem" ,"seemed","seeming","seems", + "seen" ,"self","selves" ,"sent" ,"seven" ,"several" ,"shall" ,"she" ,"shed" ,"she'll" ,"shes", + "she's" ,"should","shouldn't", "shouldnt" ,"show" ,"showed" ,"shown" ,"showns" ,"shows" , + "significant" ,"significantly" ,"similar", "similarly" ,"since" ,"six" ,"slightly" ,"so" , + "some" ,"somebody" ,"somehow" ,"someone" ,"somethan","something" ,"sometime" ,"sometimes" , + "somewhat" ,"somewhere" ,"soon" ,"sorry" ,"specifically","specified", "specify" , + "specifying" ,"still" ,"stop" ,"strongly" ,"sub" ,"substantially" ,"successfully" ,"such", + "sufficiently" ,"suggest" ,"sup" ,"sure" ,"t" ,"take" ,"taken" ,"taking" ,"talk" , + "talked" ,"td","tell" ,"tends" ,"th" ,"than", "thank" ,"thanks" ,"thanx" ,"that" ,"that'll" , + "thats" ,"that've" ,"the" ,"their" ,"theirs" ,"them", "themselves" ,"then" ,"thence" , + "there" ,"thereafter", "thereby" ,"thered" ,"therefore" ,"therein","there'll" ,"thereof", + "therere" ,"theres" ,"thereto" ,"thereupon" ,"there've" ,"these", "they", + "theyd" ,"they'll" ,"theyre" ,"they've" ,"think" ,"this" ,"those" ,"thou" ,"though" ,"thoughh" + ,"thousand", "throug" ,"through" ,"throughout" ,"thru" ,"thus" ,"til" ,"tip" ,"to" , + "together" ,"too" ,"took","toward","towards" ,"tr" ,"tried" ,"tries" ,"truly" ,"try" , + "trying" ,"ts" ,"twice" ,"two", "u" ,"un" ,"under", "unfortunately" ,"unless" ,"unlike" , + "unlikely" ,"until" ,"unto" ,"up" ,"upon" ,"ups" ,"us" ,"use","used","useful", + "usefully" ,"usefulness" ,"uses" ,"using" ,"usually" ,"v" ,"value" ,"various" ,"ve" ,"very" + ,"via","viz" ,"vol" ,"vols" ,"vs" ,"w" ,"want" ,"wants" ,"was" ,"wasnt" ,"way" , + "we" ,"wed" ,"welcome","well" ,"we'll" ,"went","were" ,"werent" ,"we've" ,"what" ,"whatever", + "what'll" ,"whats" ,"when" ,"whence" ,"whenever","where","whereafter", "whereas", + "whereby" ,"wherein" ,"wheres" ,"whereupon" ,"wherever" ,"whether" ,"which", + "while" ,"whim" ,"whither" ,"who" ,"whod" ,"whoever" ,"whole" ,"who'll" ,"whom","whomever","whos" + ,"whose", "why" ,"widely" ,"willing" ,"will" ,"wish" ,"with" ,"within" ,"without","wont", + "words" ,"world" ,"would" ,"wouldnt","www" ,"x" ,"xx" ,"xxx", "y" ,"yes" ,"yet" , + "you" ,"youd" ,"you'll" ,"your" ,"youre" ,"yours" 
,"yourself","yourselves" ,"you've" ,"z", + "zero" ,"xoxo", "ii", "iii", "iv" ,"ix" ,"vi" ,"vii" ,"viii" ,"", + "" ,"three" ,"ten" ,"view" ,"met" ,"follow" ,"consist" ,"lack" ,"lacks" ,"base" ,"based" ,"ago", + "addition" ,"additional" ,"depend" ,"depends" ,"include" ,"includes" ,"including" ,"continue" + ,"bring", "brings" ,"ahead" ,"add" ,"adds" ,"attribute" ,"attributes" ,"associated" ,"associate", "follow", + "happen" ,"happened" ,"happening" ,"single" ,"consider" ,"considered" ,"looked" ,"involve" + ,"involves", "involved" ,"thing" ,"things" ,"going", "brought", "lot"] + + return stop_words + + def load_stop_words_list (self, library_fp): + + stop_words = self.get_stop_words_master_list() + + s = open(os.path.join(library_fp, "stop_words_list.txt"), "w") + + for words in stop_words: + s.write((words + ",")) + s.close() + os.chmod((library_fp+ "stop_words_list.txt"), 0o777) + + return stop_words + + def remove_stop_words(self, token_list): + stop_words = self.get_stop_words_master_list() + + tokens_out = [] + for z in range(0, len(token_list)): + if token_list[z] not in stop_words: + tokens_out.append(token_list[z]) + + return tokens_out + + # used by CorpTokenizer + @staticmethod + def clean_list (token_list): + + punctuation = ("-" ,"," ,"'", "/" ,"(')", "'('" ,":" ,".", "?" ,"%", "[", "]" ,"(')'" ,"('('" ,"'–'") + clean_out = [] + for z in range(0 ,len(token_list)): + t = token_list[z] + clean_word = "" + for y in range(0 ,len(t)): + if t[y] in punctuation: + if len(clean_word) == len(t) -1: + # if last letter in word, then skip, no additional space added + char_out = "" + else: + char_out = "" + else: + char_out = t[y] + clean_word += char_out + + if clean_word != "": + clean_out.append(clean_word) + + return clean_out + + def get_sentences(self,lib, key_term, block_cursor): + + output = [] + sentences_only = [] + + abbrevs = "^Mr^Mrs^Ms^Jr^Sr" + regex_string = '((?=([^"]*"[^"]*")*[^"]*$)(?<=[^\d' + abbrevs + '])[.!?])' + + for block in block_cursor: + text_block_tmp = block["text"] + + # need to check if previous_block and/or next_block in results + previous_block = self.get_cursor_previous_block(lib, block["doc_ID"],block["block_ID"],block["master_index"]) + next_block = self.get_cursor_next_block(lib, block["doc_ID"],block["block_ID"],block["master_index"]) + + if previous_block: + previous_text = previous_block["text"] + # regex - split on .!? unless inside a " + last_sentence = list(re.split(regex_string, previous_text))[-1] + text_block_tmp = last_sentence + " " + text_block_tmp + + if next_block: + next_text = next_block["text"] + # regex - split on .!? 
unless inside a " + first_sentence = list(re.split(regex_string, next_text))[0] + + text_block_tmp = text_block_tmp + " " + first_sentence + + sentences = re.split(regex_string,text_block_tmp) + + leftover = "" + + for x in range(0,len(sentences)): + + if sentences[x]: + working_sentence = leftover + sentences[x] + leftover = "" + # if shorter than 10 chars, bundle with the next full sentence + # -- need more sophisticated regex to avoid splitting on 'Section 2.3' etc + + if len(working_sentence) > 10: + + matches_found = self.find_match(key_term, working_sentence) + if matches_found: + + if working_sentence not in sentences_only: + + if (x+1) < len(sentences): + if sentences[x+1]: + if len(sentences[x+1]) <= 10: + working_sentence += " " + sentences[x+1] + + new_row = {"sentence": working_sentence, "matches": matches_found, "block": block} + output.append(new_row) + sentences_only.append(working_sentence) + leftover = "" + + return output, sentences_only + + def get_sentences_fast_cap(self,lib, key_term_list, block_cursor,top_k=20): + + output = [] + sentences_only = [] + + abbrevs = "^Mr^Mrs^Ms^Jr^Sr" + regex_string = '((?=([^"]*"[^"]*")*[^"]*$)(?<=[^\d' + abbrevs + '])[.!?])' + + for block in block_cursor: + text_block_tmp = block["text"] + + # need to check if previous_block and/or next_block in results + previous_block = self.get_cursor_previous_block(lib, block["doc_ID"],block["block_ID"],block["master_index"]) + next_block = self.get_cursor_next_block(lib, block["doc_ID"],block["block_ID"],block["master_index"]) + + if previous_block: + previous_text = previous_block["text"] + last_sentence = list(re.split(regex_string, previous_text))[-1] + text_block_tmp = last_sentence + " " + text_block_tmp + + if next_block: + next_text = next_block["text"] + first_sentence = list(re.split(regex_string, next_text))[0] + + text_block_tmp = text_block_tmp + " " + first_sentence + + sentences = re.split(regex_string,text_block_tmp) + + leftover = "" + + for x in range(0,len(sentences)): + + if sentences[x]: + working_sentence = leftover + sentences[x] + leftover = "" + # if shorter than 10 chars, bundle with the next full sentence + # -- need more sophisticated regex to avoid splitting on 'Section 2.3' etc + + if len(working_sentence) > 10: + + matches_found = [] + for kt in key_term_list: + matches_found.append(self.find_match(kt, working_sentence)) + + if matches_found: + + if working_sentence not in sentences_only: + + if (x+1) < len(sentences): + if sentences[x+1]: + if len(sentences[x+1]) <= 10: + working_sentence += " " + sentences[x+1] + + new_row = {"sentence": working_sentence, "matches": matches_found, "block": block} + output.append(new_row) + sentences_only.append(working_sentence) + leftover = "" + if len(output) >= top_k: + break + + return output, sentences_only + + def sentence_splitter(self, sentence, key_word, marker_list): + + text = [] + completion = [] + # will split sentence either 'before' or 'after' the marker + # simplest pattern - split at marker + + for m in marker_list: + + # if key_word is at the start of the sentence, e.g., marker = 0, include in text ... 
+ if m < len(key_word): + text.append(sentence[0:m+len(key_word)]) + completion.append(sentence[m+len(key_word):]) + else: + text.append(sentence[0:m]) + completion.append(sentence[m:]) + + return text, completion + + def prep_custom_mlm_label (self, input_sentence,key_word_list, mask_token_value="", mlm_prob=0.15): + + label_id = [] + for x in input_sentence: + r = random.randint(1,100) + if r <= (mlm_prob * 100): + r2 = random.randint(1,10) + if r2 <= 10: + label_id.append(mask_token_value) + else: + # keep original value + label_id.append(x) + + return label_id + + def fast_search_dicts(self, query,output_dicts, text_key="text", remove_stop_words=True): + + # will return a subset of the output_dicts that have the key_terms + # no ranking or prioritization - "match" or "no-match" only + # designed primarily to filter in-memory sources and parser outputs + + matched_dicts = [] + + c = CorpTokenizer(remove_stop_words=remove_stop_words, remove_numbers=False, one_letter_removal=True, + remove_punctuation=True) + + key_terms = c.tokenize(query) + + for i, entries in enumerate(output_dicts): + + text_tokens = c.tokenize(entries[text_key]) + + match_found = 0 + + for j, toks in enumerate(text_tokens): + for key_term in key_terms: + if key_term.lower() == toks.lower(): + + if "page_num" not in entries: + + if "master_index" in entries: + page_num = entries["master_index"] + else: + page_num = 0 + + entries.update({"page_num": page_num}) + + if "query" not in entries: + entries.update({"query": key_term}) + + matched_dicts.append(entries) + match_found = 1 + break + + if match_found == 1: + break + + return matched_dicts + + def find_match(self, key_term, sentence): + + matches_found = [] + for x in range(0,len(sentence)): + match = 0 + if sentence[x].lower() == key_term[0].lower(): + match += 1 + if (x+len(key_term)) <= len(sentence): + for y in range(1,len(key_term)): + if key_term[y].lower() == sentence[x+y].lower(): + match += 1 + else: + match = -1 + break + + if match == len(key_term): + matches_found.append(x) + + return matches_found + + def package_answer(self, raw_query, text_core, answer_window, x): + + answer = [] + l = len(text_core) + + for t in range(0, l): + match = 0 + if text_core[t].lower() == raw_query[0].lower(): + if (t + len(raw_query)) < l: + for z in range(1, len(raw_query)): + + if text_core[t + z].lower() == raw_query[z].lower(): + match = z + else: + match = -1 + break + if match > 1: + + stop_slice = min(t + len(raw_query) + answer_window, t + l) + ans = text_core[t + len(raw_query) + 1:stop_slice] + doc = x['doc_ID'] + block = x['block_ID'] + page_num = x['master_index'] + fn = x['file_source'] + text_out = x['text'] + slice = t + len(raw_query) + 1 + answer.append((fn, doc, block, page_num, raw_query, slice, ans, text_out)) + + return answer + + def get_cursor_next_block(self, lib,doc_id, block_id, selected_page): + next_block = CollectionRetrieval(lib.collection).get_cursor_by_block(doc_id, block_id+1, selected_page) + return next_block + + def get_cursor_previous_block(self,lib, doc_id, block_id, selected_page): + previous_block = CollectionRetrieval(lib.collection).get_cursor_by_block(doc_id,block_id-1, selected_page) + return previous_block + + def split_context_row (self, context_row): + + entries_list = [] + entries_weights = [] + + for z in range(0,len(context_row)): + entries_list.append(context_row[z][0]) + entries_weights.append(int(context_row[z][1])) + + return entries_list, entries_weights + + # need to update / remove + def dataset_smart_packager(self, 
text_block, min_th=200, max_th=400): + + # best outcome is to split at the end of a sentence + # use simple regex command to split the sentence on end punctuation (e.g., '.', '!', '?') + + sentences = list(re.split('(?<=[.!?])', text_block)) + + # logging.info("update: dataset smart packager - len sentences: %s ", len(sentences)) + + if len(sentences) == 1 or len(sentences) == 0: + # easy case - text block ends with "." -> return the whole block + return text_block, "" + + if len(sentences) > 1: + # check if last sentence ends with exclamation mark - otherwise, return as remainder + last_sentence = sentences[-1] + if last_sentence.endswith(".") or last_sentence.endswith("!") or last_sentence.endswith("?"): + return text_block, "" + else: + # re-assemble the sentences (excluding the last fragment) + output_text = "" + remainder_text = "" + for x in range(0, len(sentences) - 1): + if len(output_text) + len(sentences[x]) < max_th: + output_text += sentences[x] + " " + else: + remainder_text += sentences[x] + " " + + remainder_text += last_sentence + + if len(output_text) < min_th: + # in this case, retain the text_block as "remainder" and keep going + return "", text_block + else: + # the assembled sentences are longer than the min threshold + # if the remainder is very short, then append to output + if len(remainder_text) > 20: + return output_text, remainder_text + output_text += " " + remainder_text + return output_text, "" + + # something has gone wrong unexpectedly if this is reached + return text_block, "" + + def replace_word_numbers(self, evidence): + evidence_toks = evidence.split(" ") + + word_numbers_lookup = {"zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5, "six": 6, + "seven": 7, "eight": 8, "nine": 9, "ten": 10, "eleven": 11, "twelve": 12, + "thirteen": 13, "fourteen": 14, "fifteen": 15, "sixteen": 16, "seventeen": 17, + "eighteen": 18, "nineteen": 19, "twenty": 20, "thirty": 30, "forty": 40, "fifty": 50, + "sixty": 60, "seventy": 70, "eighty": 80, "ninety": 90, "hundred": 100, + "thousand": 1000, "million": 1000000, "billion": 1000000000, "percent": 0.01} + + num_toks_in_progress = "" + text_with_numbers = "" + build_num = False + nums_in_text_list = [] + percent_flag = False + + # new - added on aug 26, 2023 + token_index_of_match_found = [] + + for i, toks in enumerate(evidence_toks): + + if toks in word_numbers_lookup or (build_num and toks in ["and", "plus"]): + build_num = True + if toks not in ["and", "plus", "percent", "percentage"]: + num_toks_in_progress += toks + " " + if toks in ["percent", "percentage"]: + percent_flag = True + + else: + # add any number in progress, if any + if build_num: + + if percent_flag: + try: + my_num = w2n.word_to_num(num_toks_in_progress) * 0.01 + except: + my_num = -9999.1234 + else: + try: + my_num = w2n.word_to_num(num_toks_in_progress) + except: + my_num = -9999.1234 + + if my_num != -9999.1234: + text_with_numbers += str(my_num) + " " + nums_in_text_list.append(my_num) + + # new add - aug 26 + token_index_of_match_found.append(i) + + build_num = False + percent_flag = False + num_toks_in_progress = "" + + # add next token + text_with_numbers += toks + " " + + logging.info("update: text_with_numbers output: %s ", text_with_numbers) + logging.info("update: nums found list: %s ", nums_in_text_list) + + return text_with_numbers, nums_in_text_list, token_index_of_match_found + + +class CorpTokenizer: + + def __init__(self, lower_case=True, remove_punctuation=True, remove_stop_words=True, + remove_numbers=True, 
one_letter_removal=False): + + self.lower_case = lower_case + self.remove_punctuation = remove_punctuation + self.remove_stop_words = remove_stop_words + self.remove_numbers = remove_numbers + self.one_letter_removal = one_letter_removal + + def tokenize(self, text): + + # start with basic whitespace tokenizing + text2 = text.split(" ") + + if self.remove_punctuation: + text2 = Utilities().clean_list(text2) + + if self.lower_case: + text_l = [] + for z in range(0, len(text2)): + text_l.append(str(text2[z]).lower()) + text2 = text_l + + if self.remove_stop_words: + text2 = Utilities().remove_stop_words(text2) + + if self.remove_numbers: + text_n = [] + for z in range(0, len(text2)): + if not str(text2[z]).isnumeric(): + text_n.append(text2[z]) + text2 = text_n + + if self.one_letter_removal: + text_out = [] + for z in range(0, len(text2)): + if len(text2[z]) > 1: + text_out.append(text2[z]) + text2 = text_out + + return text2 + + +class XlTable: + + def __init__(self, xl_table_block, account, library): + + self.table = xl_table_block["table"] + self.table_rows = self.table.split("") + self.row_count = len(self.table_rows) + self.new_table_blocks = [] + self.search_query = [] + self.search_tokens = [] + self.search_len = 0 + self.row_list = [] + self.col_list = [] + self.neighborhood = [] + + self.account = account + self.library = library + + # info about table block + self.file_source = xl_table_block["file_source"] + + self.sheet_num = 1 + + if "master_index" in xl_table_block: + self.sheet_num = xl_table_block["master_index"] + if "page_num" in xl_table_block: + self.sheet_num = xl_table_block["page_num"] + + self.batch_num = xl_table_block["coords_y"] + + # batch_num starts at 0 + if self.batch_num == 0: + self.first_batch = self.table + else: + self.first_batch = "" + + def get_first_batch(self): + + empty_result = "" + + if self.batch_num == 0: + return self.table + + else: + key_dict = {"file_source": self.file_source, "master_index": self.sheet_num, "coords_y": 0} + results = CollectionRetrieval(self.library.collection).filter_by_key_dict(key_dict) + + if results: + first_batch = list(results) + if len(first_batch) == 1: + return first_batch[0]["table"] + + return empty_result + + def get_xl_cell_contents (self, ind): + + # takes table block str, e.g., block["content1_core"] and index, e.g, "C6" + # ... 
and returns full string content from that cell + + index_with_brackets = "<" + ind + ">" + my_cell_str = "" + + found_search_term_in_cell = -1 + + for x in range(0, self.row_count): + tok = self.table_rows[x].lower().split(" ") + for y in range(0, len(tok)): + + if my_cell_str != "": + break + + if tok[y].lower() == index_with_brackets: + + if len(tok) > y + 1: + for z in range(y,len(tok)): + + if not tok[z] in ("","",""): + if tok[z].lower() in self.search_tokens: + + if tok[z].startswith("<") and tok[z].endswith(">"): + tok_display = "" + else: + tok_display = tok[z] + my_cell_str += " " + tok_display.strip("\n") + " " + " " + found_search_term_in_cell = 1 + else: + my_cell_str += tok[z].strip("\n") + " " + else: + break + + return my_cell_str, found_search_term_in_cell + + def get_xl_cell_contents_passed_table_str (self, ind, table_str): + + table_rows = table_str.split("") + + index_with_brackets = "<" + ind + ">" + my_cell_str = "" + + for x in range(0, len(table_rows)): + tok = table_rows[x].lower().split(" ") + for y in range(0, len(tok)): + + if my_cell_str != "": + break + + if tok[y].lower() == index_with_brackets: + + if len(tok) > y + 1: + for z in range(y + 1, len(tok)): + + if not tok[z].startswith("<"): + if tok[z].lower() in self.search_tokens: + my_cell_str += " " + tok[z].strip("\n") + " " + " " + else: + my_cell_str += tok[z].strip("\n") + " " + else: + break + + return my_cell_str + + def get_row_col_from_xl_index (self,ind): + + # unpacks a "C9" index into row = 3 & column = 9 + + col = [] + column = 0 + row = 0 + num_started = -1 + + for x in range(0,len(ind)): + + # found lower-case letter char + if (96 < ord(ind[x]) < 123) and num_started == -1: + col.append((ord(ind[x]) - 96)) + + if 47 < ord(ind[x]) < 58: + num_started = 1 + row = int(ind[x:]) + break + + if len(col) == 1: + column = col[0] + + if len(col) == 2: + column = (col[0] * 26) + col[1] + + if len(col) == 3: + column = (col[0] * 26 * 26) + (col[1] * 26) + col[2] + + if len(col) > 3: + column = 0 + + return column, row + + def convert_col_to_letter (self,col_num): + + # utility to convert column number back to letter, e.g., column 3 = "C" + + col_str = "" + if col_num < 27: + col_str += chr(col_num-1 + 65) + + if 26 < col_num < 53: + col_str += chr(65) + col_str += chr(65 + (col_num - 26)) + + if 52 < col_num < 79: + col_str += chr(66) + col_str += chr(65 + (col_num - 52)) + + if col_num > 79: + dummy = 0 + + return col_str + + def prep_xl_cell_neighborhood (self, col, row,context_window=3,header="yes"): + + new_table_str = "" + my_col = "" + col_list = [] + + if col > context_window: + for x in range(1,context_window+1): + col_list.append(self.convert_col_to_letter(col-x)) + else: + for x in range(1,col): + col_list.append(self.convert_col_to_letter(col-x)) + + for y in range(0,context_window+1): + col_tmp = self.convert_col_to_letter(col+y) + if y == 0: + my_col = col_tmp + + col_list.append(col_tmp) + + row_list = [] + + if row > context_window: + for x in range(1,context_window + 1): + row_list.append((row-x)) + else: + for x in range(1,row): + row_list.append((row-x)) + + for y in range(0,context_window + 1): + row_list.append((row+y)) + + if header == "yes": + header_rows = self.get_header_rows(my_col,row,col_list) + new_table_str += header_rows + + row_list = sorted(row_list) + col_list = sorted(col_list) + + self.row_list = row_list + self.col_list = col_list + + for x in range(0,len(row_list)): + new_row = " " + for y in range(0,len(col_list)): + my_index = col_list[y].lower() + str(row_list[x]) + 
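+                # A minimal sketch of the "C9"-style index round trip used throughout this
+                # class (grounded in get_row_col_from_xl_index and convert_col_to_letter
+                # above; the literal "c9" is only an illustrative cell reference):
+                #
+                #   col, row = self.get_row_col_from_xl_index("c9")    # -> (3, 9)
+                #   letter = self.convert_col_to_letter(col)           # -> "C"
+                #   rebuilt = letter.lower() + str(row)                # -> "c9"
+                #
+                # note that the lookup expects lower-case letters, since the ord() range
+                # test above is 96 < ord(ch) < 123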
self.neighborhood.append(my_index) + my_cell, search_term_found = self.get_xl_cell_contents(my_index) + + if search_term_found > 0: + new_row += '' + " " + my_index.upper() + " " + \ + my_cell + " " + else: + new_row += "" + " " + my_index.upper() + " " + my_cell + " " + + new_row += " " + + new_table_str += new_row + + return new_table_str + + def get_header_rows(self,my_col_str, my_row_num, column_list): + + first_batch = "" + row_list_first_batch = [] + + if my_row_num > 15: + stopper_row = 15 + else: + stopper_row = my_row_num - 1 + + if self.batch_num > 0: + stopper_row = 15 + first_batch = self.get_first_batch() + if first_batch: + row_list_first_batch = first_batch.split("") + + my_header_row = -1 + header_row = "" + + row_rank = [] + max_cols = 0 + max_col_num = 0 + my_row = [] + + for z in range(0,stopper_row): + + if self.batch_num == 0: + my_row = self.table_rows[z] + + else: + if len(row_list_first_batch) > z: + my_row = row_list_first_batch[z] + + else: + my_row = [] + + if my_row: + cols_in_row = my_row.split("") + else: + cols_in_row = [] + + if cols_in_row: + total_cols, alpha_cols, row_num = self.count_alpha_items_in_row(cols_in_row) + row_rank.append((alpha_cols,z, row_num)) + + r = sorted(row_rank, key=lambda x:x[0], reverse=True) + top_three = r[0:3] + top_three = sorted(top_three,key=lambda x:x[1]) + + if top_three: + + for t in range(0,len(top_three)): + my_header_row = top_three[t][2] + header_row += ' ' + + for y in sorted(column_list): + my_index_tmp = y + str(my_header_row) + if self.batch_num == 0: + my_cell, search_term_found = self.get_xl_cell_contents(my_index_tmp.lower()) + else: + my_cell = self.get_xl_cell_contents_passed_table_str(my_index_tmp.lower(),first_batch) + + header_tmp = " " + " " + my_index_tmp + " " + my_cell + " " + header_row += header_tmp + + header_row += " " + + return header_row + + def count_alpha_items_in_row(self,row_list): + + total_count = 0 + alpha_count = 0 + + row_num = 0 + + for z in range(0,len(row_list)): + alpha = self.check_if_alpha_string(row_list[z]) + if z < 3: + row_num = self.get_row_num(row_list[z]) + if alpha > 0: + alpha_count += 1 + total_count += 1 + + return total_count, alpha_count, row_num + + def get_row_num(self,row_cell): + + row_num_str = "" + row_num = 0 + + row_toks = row_cell.split(" ") + + for z in range(0,len(row_toks)): + + if row_num_str: + break + + if row_toks[z].startswith("<"): + if len(row_toks[z]) > 2: + if row_toks[z] not in ("","","", "","",""): + c = row_toks[z][1:-1] + row_num_str = "" + + for y in range(0,len(c)): + if 47 < ord(c[y]) < 58: + row_num_str += c[y] + try: + row_num = int(row_num_str) + except: + # print("must be error converting str: ", row_num_str) + row_num = 0 + + break + + return row_num + + def check_if_alpha_string(self,s): + + alpha = -1 + escape_on = -1 + + for x in range(0,len(s)): + + if ord(s[x]) == 60: + escape_on = 1 + + # simple test - look for any alpha character outside of < > + if escape_on == -1: + if (64 < ord(s[x]) < 91) or (96 < ord(s[x]) < 123): + alpha = 1 + break + + if ord(s[x]) == 62: + escape_on = -1 + + return alpha + + def get_index (self, search_query): + + self.search_query = search_query.lower() + self.search_tokens = self.search_query.split(" ") + + c = 0 + r = 0 + current_index = "" + + for x in range(0, self.row_count): + + tok = self.table_rows[x].lower().split(" ") + current_index = "" + + for y in range(0, len(tok)): + + if tok[y].startswith("<"): + if len(tok[y]) > 3: + if tok[y][1:2] not in ("td", "tr", "th"): + current_index = 
tok[y][1:-1] + + if search_query.lower() == tok[y]: + c, r = self.get_row_col_from_xl_index(current_index) + break + + return current_index, c, r + + def main_parse(self, search_query): + + new_table_blocks_out = [] + + self.search_query = search_query.lower() + self.search_tokens = self.search_query.split(" ") + self.search_len = len(self.search_tokens) + + for x in range(0, self.row_count): + + tok = self.table_rows[x].lower().split(" ") + + current_index = "" + + for y in range(0, len(tok)): + + if tok[y].startswith("<"): + if len(tok[y]) > 3: + if tok[y][1:2] not in ("td", "tr", "th"): + current_index = tok[y][1:-1] + + match = -1 + if self.search_tokens[0] == tok[y]: + match = 1 + + if self.search_len > 1 and len(tok) > (y + self.search_len - 1): + for s in range(1,self.search_len): + if self.search_tokens[s] != tok[y+s]: + match = -1 + break + + if match == 1: + if current_index not in self.neighborhood: + c, r = self.get_row_col_from_xl_index(current_index) + new_table_str = self.prep_xl_cell_neighborhood(c, r) + new_table_blocks_out.append(new_table_str) + + self.new_table_blocks = new_table_blocks_out + + return new_table_blocks_out + + +class WikiKnowledgeBase: + + def __init__(self): + + # importing here to suppress log warnings produced by urllib3 + import urllib3 + urllib3.disable_warnings() + + self.user_agent = "Examples/3.0" + + self.wiki = Wikipedia(user_agent=self.user_agent, extract_format=ExtractFormat.WIKI, verify=False) + self.wiki_search_api_url = 'http://en.wikipedia.org/w/api.php' + + def get_article(self, article_name): + + article_response = {"title": "", "summary": "", "text": ""} + + try: + page_py = self.wiki.page(article_name) + + if page_py.exists(): + + logging.info("update: page_py - %s - %s", page_py.title, page_py.summary) + logging.info("update: text - %s ", page_py.text) + + article_response = {"title": page_py.title, "summary": page_py.summary, "text": page_py.text} + + else: + logging.info("update: connected with Wikipedia - selected article does not exist - %s ", article_name) + + except: + logging.error("error: could not retrieve wikipedia article - please try again") + + return article_response + + def search_wikipedia(self, query, result_count=10, suggestion=False): + + # output result + output = [] + + # search params passed to the wikipedia api + search_params = {'list': 'search', 'srprop': '', 'srlimit': result_count, 'srsearch': query, + 'format': 'json', 'action': 'query'} + + if suggestion: search_params['srinfo'] = 'suggestion' + + headers = {'User-Agent': self.user_agent} + + try: + r = requests.get(self.wiki_search_api_url, params=search_params, headers=headers, verify=False) + + for i, title in enumerate(r.json()["query"]["search"]): + + logging.info("update: wiki results - %s - %s", i, title) + + new_entry = {"num": i, "title": title["title"], "pageid": title["pageid"]} + output.append(new_entry) + + except: + logging.error("error: could not connect with Wikipedia to retrieve search results") + + return output + + +class TextChunker: + + # simple class that can be inserted for OCR, Text or HTML + # class expects to be passed a big chunk of text, e.g., output from OCR or full read of text file + # --will chop up blocks out of the text + # --uses a "chisel" approach, so starts with 'max_block_size' and looks back to find sentence edges + # --in testing with a number of files, it results in avg block size ~500 with 90%+ ending on sentence or \n\r + + def __init__(self, text_chunk=None, max_char_size=600, look_back_char_range=300): + + 
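+        # A minimal usage sketch, assuming a plain-text source (the file name below is
+        # purely illustrative):
+        #
+        #   raw_text = open("ocr_output.txt", encoding="utf-8", errors="ignore").read()
+        #   chunker = TextChunker(text_chunk=raw_text, max_char_size=600, look_back_char_range=300)
+        #   chunks = chunker.convert_text_to_chunks()
+        #
+        # each chunk is at most max_char_size characters, and smooth_edge() pulls the cut
+        # point back (by up to look_back_char_range characters) to the nearest sentence or
+        # paragraph boundary where possible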
self.text_chunk = text_chunk + self.max_char_size = max_char_size + self.look_back_range = look_back_char_range + + self.chunks = [] + + self.avg_char_size = 0 + self.smallest_chunk = self.max_char_size + self.largest_chunk = 0 + self.chunks_ending_with_period = 0 + + def convert_text_to_chunks (self): + + starter = 0 + + while starter < len(self.text_chunk): + + if (starter + self.max_char_size) < len(self.text_chunk): + stopper = starter + self.max_char_size + else: + stopper = len(self.text_chunk) + + smooth_stop = self.smooth_edge(starter, stopper) + chunk = self.text_chunk[starter:smooth_stop] + + starter = smooth_stop + + # if very short chunk, then concatenate with the previous chunk + if len(chunk) < self.look_back_range: + if len(self.chunks) > 0: + self.chunks[-1] += chunk + else: + self.chunks += chunk + + else: + # general case - create next chunk + # chunk_pp = re.sub("[\n\r]", " ", chunk) + self.chunks.append(chunk) + + if len(chunk) < self.smallest_chunk: + self.smallest_chunk = len(chunk) + + if len(chunk) > self.largest_chunk: + self.largest_chunk = len(chunk) + + if len(chunk) > 0: + if ord(chunk[-1]) in [46,10,13]: + self.chunks_ending_with_period += 1 + + self.avg_char_size += len(chunk) + + return self.chunks + + def smooth_edge(self,starter,stopper): + + # default case is to return the whole text sample as single chunk + smooth_stop = stopper + + # look back is the full range that will be reviewed to find proper stopping point + if (stopper - self.look_back_range) > starter: + look_back = stopper - self.look_back_range + else: + look_back = starter + + # best case - look for a period + found_period = -1 + for x in range(stopper-1,look_back,-1): + + # found a period followed by white space marker (space, \n, \r) - best case + if ord(self.text_chunk[x]) == 46: + + # first confirm that '.' is followed by white space or is the end of the text + if x+1 == stopper or ord(self.text_chunk[x + 1]) in [32, 13, 10]: + + # exclude 'several edge cases where '.' is not a reliable sentence end + short_window = self.text_chunk[x-5:x-1] + + # (A) first edge case - "two periods close to each other", e.g., "x.y." + if "." not in short_window: + + # (B) second edge case - "period after number in list", e.g., "point 2." 
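+                        # (ord values 48-57 are the ASCII digits '0'-'9', so the test below
+                        #  simply checks whether the last character of short_window is a digit)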
+ if not 47 < ord(short_window[-1]) < 58: + + # (C) third edge case - common abbreviations + if short_window[:-2] != "Mr" and short_window[:3] != "Mrs" and short_window[:2] != "Dr": + + # if none of (A) - (B) - (C) or apply, then consider period valid stopping point + found_period = x + 1 + break + + # alternate solid stopper is presence of \n\n | \n\r | \r\r -> usually marks a section/para end + if ord(self.text_chunk[x]) in [10,13]: + if x+1 == stopper or ord(self.text_chunk[x+1]) in [10,13]: + found_period = x+1 + break + + # if found a period, then smooth stop is the char right after the period + if found_period > - 1: + smooth_stop = found_period + + else: + # if no period found, then next best case is to look for whitespace between words + for y in range(stopper - 1, look_back,-1): + + # look for a white space separator + if ord(self.text_chunk[y]) in [32, 13, 10]: + smooth_stop = y + break + + # if no period or white space found, then return the original stopper + + return smooth_stop + + +global_default_prompt_catalog = [ + + {"prompt_name": "just_the_facts", + "prompt_description": "Closed Context - read passage, answer question, stick to the facts.", + "run_order": ["blurb1", "$context", "blurb2", "$query", "instruction"], + "blurb1": "Please read the following text: ", + "blurb2": " Please answer the question: ", + "instruction": "In providing the answer, please only use facts contained in the text.", + "system_message": "You are a helpful assistant who speaks with facts and no wasted words.", + "user_vars": {}}, + + {"prompt_name": "answer_or_not_found", + "prompt_description": "Closed Context - read passage, answer question, provide 'Not Found' if no answer in text.", + "run_order": ["blurb1", "$context", "blurb2", "$query", "instruction"], + "blurb1": "Please read the following text: ", + "blurb2": " Please answer the question: ", + "instruction": "Please only use facts in the text. If the text does not provide the answer, then please " + "respond with: {{not_found_response}}", + "system_message": "You are a helpful assistant who speaks with facts and no wasted words.", + "user_vars": {"not_found_response": "'Not Found.'"}}, + + {"prompt_name": "number_or_none", + "prompt_description": "Closed Context - read passage, answer question, provide 'Not Found' if no answer in text.", + "run_order": ["blurb1", "$context", "blurb2", "$query","instruction"], + "blurb1" : "Please read the following text: ", + "blurb2" : " Please answer the question: ", + "instruction": "Please provide a specific number as an answer from the text. 
" + "If the text does not provide a specific numerical answer, then please respond " + "with: {{not_found_response}}", + "system_message": "You are a helpful assistant who speaks with facts and no wasted words.", + "user_vars": {"not_found_response": "'Not Found.'"}}, + + {"prompt_name": "summarize_with_bullets", + "prompt_description": "Basic summarization with open ended number of bullet points.", + "run_order": ["blurb1", "$context", "instruction"], + "blurb1": "Please read the following text: ", + "instruction": "Please summarize with bulletpoints.", + "system_message": "You are a helpful assistant who speaks with facts and no wasted words.", + "user_vars": {}}, + + {"prompt_name": "summarize_with_numbered_bullets", + "prompt_description": "Summarization with specified number of bullet points.", + "run_order": ["blurb1", "$context", "instruction"], + "blurb1": "Please read the following text: ", + "instruction": "Please summarize the text with approximately {{number_of_bulletpoints}} numbered bulletpoints.", + "system_message": "You are a helpful assistant who speaks with facts and no wasted words.", + "user_vars": {"number_of_bulletpoints": 5}}, + + {"prompt_name": "xsummary", + "prompt_description": "Xtreme summarization with specified number of words.", + "run_order": ["blurb1", "$context", "instruction"], + "blurb1": "Please read the following text: ", + "instruction": "Please summarize the text in no more than {{number_of_words}} words.", + "system_message": "You are a helpful assistant who speaks with facts and no wasted words.", + "user_vars": {"number_of_words": 25}}, + + {"prompt_name": "completion", + "prompt_description": "Open context text generation to complete starting point provided in prompt.", + "run_order": ["blurb1", "$query", "instruction"], + "blurb1": "Here is the starting point of a longer text: ", + "instruction": "Please complete this text in the style provided in the text.", + "system_message": "You are a helpful assistant who is a good creative writer.", + "user_vars": {}}, + + {"prompt_name": "dialog_summary", + "prompt_description": "General summarization of a conversation text with specified number of bullet points.", + "run_order": ["blurb1", "$context", "instruction"], + "blurb1": "Please read the following discussion between two parties: ", + "instruction": "Please summarize the key points from the conversation using less " + "than {{number_of_bulletpoints}} bulletpoints.", + "system_message": "You are a helpful assistant.", + "user_vars": {"number_of_bulletpoints": 10}}, + + {"prompt_name": "not_found_classifier", + "prompt_description": "Not Found Response classifier - used to ask a model to classify a particular response " + "as 'not found' - very useful in RAG applications.", + "run_order": ["blurb1", "blurb2", "$context", "instruction"], + "blurb1": "Here are several examples of a 'not found' response: " + "Not Found \n" + "The text does not provide an answer. \n" + "The answer is not clear. \n" + "Sorry, I could not find a definitive answer. \n" + "The answer is not provided in the information given. \n" + "The text does not specify the answer to this question. 
\n", + "blurb2": "Here is a new example: ", + "instruction": "Please respond 'Yes' or 'No' if this new example is a 'Not Found' response.", + "system_message": "You are a helpful assistant.", + "user_vars": {}}, + + {"prompt_name": "top_level_select", + "prompt_description": "Select the best answer among choices provided.", + "run_order": ["blurb1", "$query", "blurb2","$context", "instruction"], + "blurb1": "We are trying to answer the following question: ", + "blurb2": "Which of the following selections best answers the question?", + "instruction": "Please respond with the best answer among these selections. " + "If more than one answer is useful, please summarize with bulletpoints.", + "system_message": "You are a helpful assistant who speaks with facts and no wasted words.", + "user_vars": {}}, + + {"prompt_name": "answer_question_in_role", + "prompt_description": "Answer a question with a specific role or point of view.", + "run_order": ["blurb1", "$context", "blurb2", "$query", "instruction"], + "blurb1": "Please read the following text: ", + "blurb2": "Please answer the following question: ", + "instruction": "In providing an answer to the question, please assume the perspective of a {{role}} and " + "write in that style.", + "system_message": "You are a helpful assistant.", + "user_vars": {"role": "business analyst"}}, + + {"prompt_name": "editor_in_role", + "prompt_description": "Edit a passage with a specific role or point of view.", + "run_order": ["blurb1", "$context", "instruction"], + "blurb1": "Please read the following text: ", + "instruction": "Our task is to edit and improve the language of the text from the perspective of a business analyst.", + "system_message": "You are a helpful editor and writer who reads text and improves the writing.", + "user_vars": {"role": "business analyst"}}, + + {"prompt_name": "yes_no", + "prompt_description": "Answer a question with 'Yes' or 'No'.", + "run_order": ["blurb1", "$context", "blurb2", "$query", "instruction"], + "blurb1": "Please read the following text: ", + "blurb2": "Based on these materials, please answer the question: ", + "instruction": "Please answer this question with 'Yes' or 'No'. If the text does not provide an answer," + "then please respond with 'Not Found.'", + "system_message": "You are a helpful assistant who speaks with facts and no wasted words.", + "user_vars": {}}, + + {"prompt_name": "multiple_choice", + "prompt_description": "Answer a question using a set of pre-defined choices provided.", + "run_order": ["blurb1", "$context", "blurb2", "$query", "instruction"], + "blurb1": "Please read the following text: ", + "blurb2": "Based on these materials, please answer the question: ", + "instruction": "Please select from the choices provided. 
If the text does not provide an answer," + "then please respond with 'Not Found.'", + "system_message": "You are a helpful assistant who speaks with facts and no wasted words."}, + + {"prompt_name": "default_with_context", + "prompt_description": "Default simple prompt when a question and context are passed.", + "run_order": ["blurb1", "$context", "blurb2", "$query"], + "blurb1": "Please read the following text: ", + "blurb2": "Based on this text, please answer the question: ", + "instruction": "", + "system_message": "You are a helpful assistant who speaks with facts and no wasted words."}, + + {"prompt_name": "default_no_context", + "prompt_description": "Default simple prompt when only a question is passed.", + "run_order": ["blurb1","$query"], + "blurb1": "Please discuss the following: ", + # "blurb2": "Based on this text, please answer the question: ", + "instruction": "", + "system_message": "You are a helpful assistant who likes to answer questions."}, + + {"prompt_name": "summarize_with_bullets_w_query", + "prompt_description": "Summarization of a text with a specific question being posed.", + "run_order": ["blurb1", "$context", "blurb2","$query","instruction"], + "blurb1": "Please read the following text: ", + "blurb2": "Please read the following question: ", + "instruction": "Please summarize with bulletpoints an analysis of the question.", + "system_message": "You are a helpful assistant who speaks with facts and no wasted words."}, + + {"prompt_name": "summarize_with_references_w_query", + "prompt_description": "Summarization with text with guidance to provide reference to specific " + "information in the text passage.", + "run_order": ["blurb1", "$context", "blurb2", "$query", "instruction"], + "blurb1": "Please read the following text: ", + "blurb2": "Please read the following question: ", + "instruction": "Please provide an analysis of the question using information and specific clauses " + "in the text.", + "system_message": "You are a helpful assistant who speaks with facts and no wasted words."}, + + {"prompt_name": "write_poem", + "prompt_description": "Write a poem prompt - note: results may vary greatly by model.", + "run_order": ["instruction", "$query"], + "instruction": "Please write a poem using the following prompt: ", + "system_message": "You are a helpful assistant who is a creative writer and can rhyme words easily."}, + + {"prompt_name": "ten_words", + "prompt_description": "Xtreme summarization to answer question from a text in 10 words of less.", + "run_order": ["instruction", "$query", "$context"], + "blurb1": "Please read the following text: ", + "blurb2": "Please read the following question: ", + "instruction": "In no more than ten words, please give concise answer to the following question, using the " + "text as evidence to support", + "system_message": "You are a helpful assistant who speaks with facts and no wasted words."}, + + {"prompt_name": "explain_child", + "prompt_description": "Standard simplified answer prompt - note: results may vary greatly by model.", + "run_order": ["instruction", "$query", "$context"], + "instruction": "Please explain to a child the following question using the provided text: ", + "system_message": "You are a helpful assistant."}, + + {"prompt_name": "make_joke", + "prompt_description": "Standard joke prompt - note: results may vary greatly by model.", + "run_order": ["instruction", "$query"], + "instruction": "Please be funny and tell a joke on the subject of: ", + "system_message": "You are a helpful assistant with a 
good sense of humor."}, + + {"prompt_name": "tell_story", + "prompt_description": "Standard tell a story prompt - note: results may vary greatly by model.", + "run_order": ["instruction", "$query"], + "instruction": "Please write the start of a story on the topic of: ", + "system_message": "You are a helpful assistant."}, + + {"prompt_name": "write_headline", + "prompt_description": "Generate a headline from a question and context.", + "run_order": ["instruction", "$query", "$context"], + "instruction": "Please write the headline only in a few words in capitalization to answer the question below, " + "using the materials provided. ", + "system_message": "You are a helpful assistant."}, + + {"prompt_name": "facts_only", + "prompt_description": "Basic 'facts only' Q&A prompt.", + "run_order": ["blurb1", "$context", "blurb2", "$query", "instruction"], + "blurb1": "Please use the following materials- ", + "blurb2": "Please answer the following question - ", + "instruction": "In answering the question, please only use information contained in the provided materials.", + "system_message": "You are a helpful assistant."}, + + {"prompt_name": "top_bulletpoints", + "prompt_description": "Summarization with question and answer in 5 bullet points.", + "run_order": ["blurb1", "$context", "blurb2", "$query", "instruction"], + "blurb1": "Please read the text below - ", + "blurb2": "Please read the following question - ", + "instruction": "Please answer the question using the text, and write no more than 5 bulletpoints.", + "system_message": "You are a helpful assistant."}, + + {"prompt_name": "report_title", + "prompt_description": "Generate title of report given context passage.", + "run_order": ["instruction", "$context"], + "instruction": "Please write the title to a report with the following information: ", + "system_message": "You are a helpful assistant."}, + + {"prompt_name": "marketing_slogan", + "prompt_description": "Generate marketing style slogan given context passage.", + "run_order": ["blurb1", "$context", "blurb2", "$query", "instruction"], + "blurb1": "Please read the following materials- ", + "blurb2": "Please answer the following question - ", + "instruction": "Please write a marketing slogan for the following offering using the following information as " + "background source materials.", + "system_message": "You are a helpful assistant."}, + + {"prompt_name": "top_level_summary", + "prompt_description": "Summarization prompt intended for 'second-level' summaries of materials.", + "run_order": ["blurb1", "$context", "blurb2", "$query", "instruction"], + "blurb1": "Please read the following materials- ", + "blurb2": "Please answer the following question - ", + "instruction": "In answering the question, please write no more than five bulletpoints, and reference the most " + "important facts in the source materials.", + "system_message": "You are a helpful assistant."}, + +] + + +class PromptCatalog: + + def __init__(self): + + self.prompt_catalog = global_default_prompt_catalog + self.prompt_wrappers = ["alpaca", "human_bot", "chatgpt"] + self.prompt_list = self.list_all_prompts() + + def lookup_prompt(self, prompt_name): + + for prompts in self.prompt_catalog: + if prompts["prompt_name"] == prompt_name: + return prompts + + return None + + def get_all_prompts(self): + return self.prompt_catalog + + def list_all_prompts(self): + prompt_list = [] + for prompt in self.prompt_catalog: + if "prompt_name" in prompt: + prompt_list.append(prompt["prompt_name"]) + return prompt_list + + def 
parse_instruction_for_user_vars(self, prompt_card, inference_dict=None): + + # if no user vars key in prompt_card, then return instruction unchanged + + if "user_vars" not in prompt_card: + return prompt_card["instruction"] + + if not prompt_card["user_vars"]: + return prompt_card["instruction"] + + # if no inference_dict, then define as empty dictionary + if not inference_dict: + inference_dict = {} + + # in this case, will 'parameterize' and dynamically update instruction + tokens = prompt_card["instruction"].split(" ") + updated_instruction = "" + + for i, t in enumerate(tokens): + + if t.startswith("{{") and t.endswith("}}"): + + t_core = t[2:-2] + + # if value found for key in the inference dict, then apply as true 'user_vars' + if t_core in inference_dict: + new_inserted_token = inference_dict[t_core] + updated_instruction += str(new_inserted_token) + " " + else: + # apply default value found in the prompt card as back-up + if t_core in prompt_card["user_vars"]: + new_inserted_token = prompt_card["user_vars"][t_core] + updated_instruction += str(new_inserted_token) + " " + + else: + updated_instruction += t + " " + + logging.info(f"update: prompt catalog - constructed dynamic instruction - {updated_instruction}") + + return updated_instruction.strip() + + def build_core_prompt(self, prompt_card=None, prompt_name=None, separator="\n", query=None,context=None, + inference_dict=None): + + if not context: context = "" + if not query: query= "" + + if not prompt_card and not prompt_name: + # error - returning query + logging.error("error: no prompt selected in PromptCatalog().build_core_prompt") + prompt_dict = {"core_prompt": context+"\n"+query, "prompt_card": {}} + return prompt_dict + + if not prompt_card: + prompt_card = PromptCatalog().lookup_prompt(prompt_name) + + logging.info(f"update: prompt_card - {prompt_card}") + + core_prompt = "" + + if prompt_card: + for keys in prompt_card["run_order"]: + + if keys == "instruction": + # special handler + instruction = self.parse_instruction_for_user_vars(prompt_card,inference_dict=inference_dict) + core_prompt += instruction + separator + else: + if not keys.startswith("$"): + core_prompt += prompt_card[keys] + separator + else: + if keys == "$query": + core_prompt += query + separator + if keys == "$context": + core_prompt += context + separator + + # update instruction, if user_vars accepted in instruction + """ + if "instruction" in prompt_card: + prompt_card["instruction"] = self.parse_instruction_for_user_vars(prompt_card,inference_dict=inference_dict) + print("update: prompt_card instruction - ", prompt_card) + core_prompt += prompt_card["instruction"] + """ + + prompt_dict = {"core_prompt": core_prompt, "prompt_card": prompt_card} + + # print("update - core prompt built - ", core_prompt) + + logging.info(f"update: prompt created - {prompt_dict}") + + return prompt_dict + + def add_custom_prompt_card(self, prompt_name, run_order_list, prompt_dict, prompt_description=None): + + new_prompt_card = {"prompt_name": prompt_name, + "prompt_description": prompt_description, + "run_order": run_order_list} + + for keys, values in prompt_dict.items(): + new_prompt_card.update({keys:values}) + + self.prompt_catalog.append(new_prompt_card) + + return new_prompt_card + + def apply_prompt_wrapper(self, text, prompt_wrapper, separator="\n", instruction=None): + + output_text = text + + if prompt_wrapper not in self.prompt_wrappers: + + logging.info("update: selected wrapper - %s - could not be identified -" + "returning text prompt without any 
special format wrapping", prompt_wrapper) + + return output_text + + if prompt_wrapper == "chatgpt": + return self.wrap_chatgpt_sample(text, instruction) + + if prompt_wrapper == "human_bot": + return self.wrap_human_bot_sample(text) + + if prompt_wrapper == "alpaca": + return self.wrap_alpaca_sample(text, separator) + + return output_text + + def wrap_chatgpt_sample(self, text, instruction): + + if not instruction: + instruction = "You are a helpful assistant." + + new_sample = [{"role": "system", "content": instruction}, + {"role": "user", "content": text}] + + return new_sample + + def wrap_human_bot_sample(self, text, user_separator=": ", response_separator=": "): + content = user_separator + text + "\n" + response_separator + return content + + def wrap_alpaca_sample(self, text, separator="\n"): + content = "### Instruction: " + text + separator + "### Response: " + return content + + +# * C Utility functions * +# Load shared libraries based on current platform/architecture + +# Best ways we've found to detect machine architecture +system = platform.system().lower() +machine = os.uname().machine.lower() +file_ext = { "darwin": "dylib", "linux": "so", "windows": "dll" } + +# Default to known architectures if we encounter an unknown one +if system == 'darwin' and machine not in ['arm64','x86_64']: + machine = 'arm64' +if system == 'linux' and machine not in ['aarch64','x86_64']: + machine = 'x86_64' + +# Constuct the path to a specific lib folder. Eg. .../llmware/lib/darwin/x86_64 +machine_dependent_lib_path = os.path.join(LLMWareConfig.get_config("shared_lib_path"), system, machine) + +_path_graph = os.path.join(machine_dependent_lib_path, "libgraph_llmware." + file_ext[system]) + +_mod_utility = cdll.LoadLibrary(_path_graph) + +# * End - C Utility functions * + +""" +# * C Utility functions * +# Load shared libraries based on current platform/architecture + +system = platform.system().lower() +machine = sysconfig.get_platform().split("-")[-1].lower() +_path_graph = None + + +if system == "darwin" and machine == "x86_64": + _path_graph = os.path.join(LLMWareConfig.get_config("shared_lib_path"), "darwin", "x86_64", "libgraph_llmware.dylib") + +if system == "darwin" and machine in ["universal2", "arm64"]: + _path_graph = os.path.join(LLMWareConfig.get_config("shared_lib_path"), "darwin", "arm64", "libgraph_llmware.dylib") + +if system == "linux" and machine == "x86_64": + _path_graph = os.path.join(LLMWareConfig.get_config("shared_lib_path"), "x86_64", "libgraph_llmware.so") + +if system == "linux" and machine == "aarch64": + _path_graph = os.path.join(LLMWareConfig.get_config("shared_lib_path"), "aarch64", "libgraph_llmware.so") + +_mod_utility = cdll.LoadLibrary(_path_graph) + +# * End - C Utility functions * +""" + + +class Graph: + + def __init__(self, library): + + self.library = library + self.account_name = library.account_name + self.library_name = library.library_name + + # nlp analytics settings shifted from Library to Graph + self.bigram_count = 100 + + self.targets_len_max = 5000 + self.context_len_max = 10000 + + # expand vocab_len_max = 100000 + self.vocab_len_max = 50000 + + # new parameter - max size of BOW file before starting new one + self.bow_max = 10000000 + + self.bow_count = 0 + + # nltk.download('averaged_perceptron_tagger', quiet=True) + + self.pre_initialization_bow_data = {} + self.post_initialization_bow_data = {} + + # create stop words txt file in nlp path + self.stop_words = Utilities().load_stop_words_list(self.library.nlp_path) + + # new method - used to 
track 'counter' inside the bow files for incremental read/write/analysis + def bow_locator(self): + + # iterate thru bow_fp_list to find correct BOW + right split to start + dataset_fp = self.library.nlp_path + + ds_files = os.listdir(dataset_fp) + + bow_files = [] + for f in ds_files: + if f.startswith("bow"): + bow_files.append(f) + + bow_index = 0 + bow_byte_index = 0 + bow_tokens = 0 + no_bow = True + + if len(bow_files) > 0: + bow_files_sorted = sorted(bow_files, reverse=True) + top_bow_file = bow_files_sorted[0] + no_bow = False + try: + bow_index = int(top_bow_file.split(".")[0][3:]) + except: + logging.warning("warning - Graph - unexpected - could not identify bow index on bow file - %s ", top_bow_file) + bow_index = 0 + + fp = open(os.path.join(dataset_fp, top_bow_file), "r") + fp.seek(0, 2) + bow_byte_index = fp.tell() + fp.seek(0, 0) # rewind + bow_tokens = len(fp.read().split(",")) + fp.close() + + return bow_index, bow_byte_index, bow_tokens, bow_files, no_bow + + def build_graph(self): + + # Generates multiple valuable nlp artifacts in /nlp folder + # Primary objective is generation of co-occurrence matrix + + os.makedirs(self.library.nlp_path, exist_ok=True) + + # note: this function has been updated -> ~750 stop words + stop_words = Utilities().load_stop_words_list(self.library.nlp_path) + + # first major step -> build the BOW + + bow_index, bow_byte_index, bow_token_index, bow_files, no_bow = self.bow_locator() + + # save the 'pre_initialization bow data" + + self.pre_initialization_bow_data = {"bow_index": bow_index, "bow_byte_index": bow_byte_index, + "bow_token_index": bow_token_index, "bow_files": bow_files, + "no_bow": no_bow} + + logging.info(f"update: Graph().initialization - bow parameters at start: {self.pre_initialization_bow_data}") + + t0 = time.time() + + # no need to capture outputs directly from .bow_builder() method -> will pick indirectly thru .bow_locator() + _ = self.bow_builder() + + logging.info("update: initialization - Step 1- BOW processing - time - %s ", time.time() - t0) + + bow_index, bow_byte_index, bow_token_index, bow_files, no_bow = self.bow_locator() + + # get and save the 'post_initialization bow data" + + self.post_initialization_bow_data = {"bow_index": bow_index, "bow_byte_index": bow_byte_index, + "bow_token_index": bow_token_index, "bow_files": bow_files, + "no_bow": no_bow} + + logging.info("update: Graph().initialization - bow parameters post: %s ", self.post_initialization_bow_data) + + # second major step -> build the MCW + t1 = time.time() + vocab_len, targets_len, context_len, min_len = self.mcw_builder() + + logging.info("update: Graph().initialization - Step 2- MCW processing - time - %s ", time.time() - t1, vocab_len) + + # third major step -> build the BG + t3 = time.time() + + graph_output = self.build_graph_raw(vocab_len, targets_len, context_len, min_len) + + logging.info("update: Graph().initialization - Step 3 - Graph building - time - %s ", time.time() - t3) + + # extract key files from /nlp & create new dataset folder + # shifting from build_dataset to core initialization + dummy = self.bg_text_package() + + t4 = time.time() + + graph_summary = self.post_initialization_bow_data + bow_count = len(graph_summary["bow_files"]) + if bow_count == 0: + bow_total = 0 + else: + bow_total = (bow_count - 1) * self.bow_max + graph_summary["bow_token_index"] + + graph_summary.update({"bow_count": len(graph_summary["bow_files"])}) + graph_summary.update({"bow_total": bow_total}) + graph_summary.update({"unique_vocab": vocab_len}) + 
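+        # worked example of the bow_total calculation above, assuming the default
+        # bow_max of 10,000,000 tokens per BOW file: with 3 bow files where the last
+        # (partial) file holds 250,000 tokens,
+        #   bow_total = (3 - 1) * 10,000,000 + 250,000 = 20,250,000 tokens in the library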
graph_summary.update({"library_name": self.library_name}) + ts = str(Utilities().get_current_time_now()) + graph_summary.update({"time_stamp": ts}) + + # write to manifest.json for knowledge graph + json_dict = json.dumps(graph_summary,indent=2) + with open(os.path.join(self.library.nlp_path,"manifest.json"),"w") as outfile: + outfile.write(json_dict) + + return graph_summary + + def bow_builder(self): + + # key inputs for c functions + input_account_name = self.account_name + input_library_name = self.library_name + account_name = create_string_buffer(input_account_name.encode('ascii', 'ignore')) + library_name = create_string_buffer(input_library_name.encode('ascii', 'ignore')) + + input_db_path = LLMWareConfig.get_config("collection_db_uri") + + db_path_c = create_string_buffer(input_db_path.encode('ascii', 'ignore')) + + input_stop_words_fp = self.library.nlp_path + "stop_words_list.txt" + stop_words_c = create_string_buffer(input_stop_words_fp.encode('ascii', 'ignore')) + + # pass core_path -> will pick up {}.txt in c file + input_bow_fp = self.library.nlp_path + "bow" + bow_fp_c = create_string_buffer(input_bow_fp.encode('ascii', 'ignore')) + + input_text_field = "text" + text_field_c = create_string_buffer(input_text_field.encode('ascii', 'ignore')) + + teh = _mod_utility.text_extract_main_handler + teh.argtypes = (c_char_p, c_char_p, c_int, c_char_p, c_char_p, c_char_p, c_char_p, c_int, c_int) + teh.restype = c_int + + # note: key input - is there an existing bow already to build off ('a'), or start new ('w') ? + + if self.pre_initialization_bow_data["no_bow"]: + new_bow = 0 + else: + new_bow = 1 + + bow_index_current = self.pre_initialization_bow_data["bow_index"] + bow_len_remainder_only = self.pre_initialization_bow_data["bow_token_index"] + + new_bow_c = c_int(new_bow) + bow_index_current_c = c_int(bow_index_current) + + bow_len_current_c = c_int(bow_len_remainder_only) + + logging.info("update: Graph() bow_builder - calling on text_extract handler - bow vars - %s - %s ", bow_index_current, + bow_len_remainder_only) + + bow_count = teh(account_name, + library_name, + new_bow_c, + db_path_c, + stop_words_c, + bow_fp_c, + text_field_c, + bow_index_current_c, + bow_len_current_c) + + logging.info("update: Graph() - completed major C function step - utility BOW create - %s -", bow_count) + + return 0 + + def mcw_builder(self): + + # new utility function - builds most common words across library, based on multiple BOW files + dataset_fp = self.library.nlp_path + + # open bow0.txt as default start -> in most cases, this will be the only BOW + bow = open(dataset_fp + "bow0.txt", mode="r", encoding="utf-8", errors='ignore').read().split(",") + bow_len = len(bow) + + # hard-coded scaling principle - target most_common_words list = bow len / 300 + # experimenting with ratio + targets_len = bow_len // 300 + + # will need to set a floor for very small BOW + if targets_len < 100: + targets_len = 100 + + bow_files = self.post_initialization_bow_data["bow_files"] + + number_of_bow = len(bow_files) + + # run counter and most common on bow0.txt list + + co = Counter(bow) + mc = co.most_common() + + # build prune_count approximation + # this is the lowest entry on the target mcw list + # guiding assumption: in worst case, if each bow had an entry with this quantity... + # it would still be less than .... 
lowest number in the target + + if len(mc) > targets_len: + prune_count = mc[targets_len][1] // number_of_bow + + else: + # cap len of targets at the length of the most common words + # safety check for very small libraries + targets_len = len(mc) - 1 + prune_count = mc[targets_len][1] // number_of_bow + + mc_pruned = [] + + prune_count = 0 + + for z in range(0, len(mc)): + if mc[z][1] > prune_count: + mc_pruned.append((mc[z][0], mc[z][1])) + else: + break + + # this may be the end in default case if only one BOW + + mc_final = mc_pruned + + if len(bow_files) > 1: + + for z in range(1, len(bow_files)): + + bow_new = open(os.path.join(dataset_fp, "bow{}.txt".format(z)), mode="r", encoding="utf-8", + errors='ignore').read().split(",") + + # bow_new_len = len(bow_new) + c_tmp = Counter(bow_new) + mcw_new = c_tmp.most_common() + added_new = 0 + + for y in range(0, len(mcw_new)): + new_entry = (mcw_new[y][0], mcw_new[y][1]) + if mcw_new[y][1] > prune_count: + mc_pruned.append(new_entry) + added_new += 1 + else: + logging.info("update: mcw analysis - stopping at prune_count: %s %s %s ", y, prune_count, mcw_new[y]) + break + + mc_combined = sorted(mc_pruned, key=lambda x: x[0]) + + mc_final = [] + current_entry = mc_combined[0][0] + current_count = mc_combined[0][1] + + one_left = 0 + for w in range(1, len(mc_combined)): + + if mc_combined[w][0] == current_entry: + current_count += mc_combined[w][1] + one_left = 0 + else: + new_entry = (current_entry, current_count) + mc_final.append(new_entry) + current_entry = mc_combined[w][0] + current_count = mc_combined[w][1] + one_left = 1 + + if one_left == 1: + final_entry = (current_entry, current_count) + mc_final.append(final_entry) + + mc_final = sorted(mc_final, key=lambda x: x[1], reverse=True) + + mcw = open(os.path.join(dataset_fp,"most_common_words.txt"), 'w') + + # for vocab lookup, cap vocab at .vocab_len_max, e.g., 50,000 by default + logging.info("update: Graph() mcw_builder - vocab len: %s ", len(mc_final)) + + if len(mc_final) > self.vocab_len_max: + max_len = self.vocab_len_max + else: + max_len = len(mc_final) + + vocab_dict = {} + target_list = [] + + mcw_counter_out = [] + + new_entry_counter = 0 + for x in range(0, max_len): + new_entry = mc_final[x][0] + # strip out special markers in the BOW + if not new_entry.startswith("[") and not new_entry.startswith("<"): + mcw.write((new_entry + ",")) + new_dict_entry = {new_entry: new_entry_counter} + vocab_dict.update(new_dict_entry) + target_list.append(new_entry) + mcw_counter_out.append((new_entry, mc_final[x][1])) + new_entry_counter += 1 + mcw.close() + + # create bigrams list from the bow_list -> initialization (store in nlp) + + bigrams = self.get_bigrams(bow_files) + bi = open(os.path.join(dataset_fp,"bigrams.txt"), 'w') + for x in range(0, len(bigrams)): + bi.write((bigrams[x][0] + ",")) + bi.write((str(bigrams[x][1]) + ",")) + bi.close() + + json_dict = json.dumps(vocab_dict) + with open(os.path.join(dataset_fp,"vocab_lookup.json"), "w") as outfile: + outfile.write(json_dict) + + reverse_look_up_dict = {v: k for k, v in vocab_dict.items()} + rlu_json_dict = json.dumps(reverse_look_up_dict) + with open(os.path.join(dataset_fp,"token_lookup.json"), "w") as outfile: + outfile.write(rlu_json_dict) + + mcw_alt = open(os.path.join(dataset_fp,"mcw_counts.txt"), 'w') + + min_len = -1 + MIN_COUNT = 5 + + for x in range(0, len(mcw_counter_out)): + if mcw_counter_out[x][1] < MIN_COUNT and min_len == -1: + min_len = x - 1 + mcw_alt.write((mcw_counter_out[x][0] + ",")) + 
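+            # mcw_counts.txt ends up as a flat comma-separated stream of alternating
+            # word / count entries (illustrative content only), e.g. "agreement,117,party,42,",
+            # which retrieve_mcw_counts() below reads back two items at a time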
mcw_alt.write((str(mcw_counter_out[x][1]) + ",")) + mcw_alt.close() + + vocab_len = len(mc_final) + + if targets_len > vocab_len: + targets_len = vocab_len + + context_len = 2 * targets_len + + if context_len > vocab_len: + context_len = vocab_len + + if min_len == -1: + min_len = vocab_len + + return vocab_len, targets_len, context_len, min_len + + def build_graph_raw(self, vocab_len, targets_len, context_len, min_len): + + # default - targets_len_max = 5000 + if targets_len > self.targets_len_max: + targets_len = self.targets_len_max + + # default - context_len_max = 10000 + if context_len > self.context_len_max: + context_len = self.context_len_max + + # default - vocab len max = 50000 + if vocab_len > self.vocab_len_max: + vocab_len = self.vocab_len_max + + if min_len > vocab_len: + min_len = vocab_len + + # bow_len passed is the total size of all BOW files + # in simple case, bow_len = # of tokens in bow0.txt + # check if greater than 10M -> need to check multiple bow files + + # default bow_len_max = 10000000 + + bow_count = self.post_initialization_bow_data["bow_index"] + 1 + bow_len_remainder = self.post_initialization_bow_data["bow_token_index"] + + logging.info("update: build_graph_raw: bow len - %s %s: ", bow_count, bow_len_remainder) + + graph_handler = _mod_utility.graph_builder + + graph_handler.argtypes = (c_char_p, + c_char_p, + c_char_p, + c_char_p, + c_int, + c_int, + c_int, + c_int, + c_int, + c_char_p, + c_int, + c_int, + c_int) + + graph_handler.restype = c_int + + account_name = create_string_buffer(self.account_name.encode('ascii', 'ignore')) + library_name = create_string_buffer(self.library.library_name.encode('ascii', 'ignore')) + + input_bow_fp = self.library.nlp_path + "bow" + bow_fp_c = create_string_buffer(input_bow_fp.encode('ascii', 'ignore')) + + input_mcw_fp = self.library.nlp_path + "most_common_words.txt" + + mcw_fp_c = create_string_buffer(input_mcw_fp.encode('ascii', 'ignore')) + + graph_fp = self.library.nlp_path + "bg.txt" + graph_fp_c = create_string_buffer(graph_fp.encode('ascii', 'ignore')) + + # bow_len_remainder -> only the remainder from the last bow file + # in usual, simple case -> this is the len of bow0.txt + + bow_len_c = c_int(bow_len_remainder) + + # target len set at half of context len window + + mcw_context_len = context_len + # mcw_target_len = mcw_len // 2 + mcw_target_len = targets_len + + mcw_context_len_c = c_int(mcw_context_len) + mcw_target_len_c = c_int(mcw_target_len) + vocab_len_c = c_int(vocab_len) + # end - setting target/context mcw lens + + graph_index_c = c_int(0) + graph_max_size_c = c_int(1000000) + + bow_index = c_int(bow_count) + + min_len_c = c_int(min_len) + + # key parameters - account/library = find BOW + target most_common_words list + # parameters: min_counts, targets, window_size == 3 + + logging.info("update: Graph - initiating call to graph handler - %s - %s - %s - %s ", vocab_len, mcw_target_len, + mcw_context_len, min_len) + + # input to bow_handler: bow.txt & most_common_words.txt + # output to bow_handler: bg.txt + dummy = graph_handler(account_name, + library_name, + bow_fp_c, + mcw_fp_c, + bow_index, + bow_len_c, + mcw_target_len_c, + mcw_context_len_c, + vocab_len_c, + graph_fp_c, + graph_index_c, + graph_max_size_c, + min_len_c) + + logging.info("update: Graph() - completed graph build - output value is - %s ", dummy) + + return 0 + + def bg_text_package(self): + + # output + text_out = [] + + fp = os.path.join(self.library.nlp_path, "bg.txt") + + # defensive check - if file path does not exist, 
then build_graph + if not os.path.exists(fp): + self.build_graph() + + # once graph is built, this path should exist + try: + f = open(fp, encoding="utf-8", errors="ignore").read().split("\n") + + for z in range(0, len(f)): + entry_tokens = f[z].split(",") + entry = "" + entry += entry_tokens[0] + " " + new_tokens_added = 1 + for y in range(2, len(entry_tokens), 2): + if entry_tokens[y] != "": + entry += entry_tokens[y] + " " + new_tokens_added += 1 + if y > 100: + break + + if new_tokens_added > 7: + text_out.append(entry) + + except: + logging.error("error: Graph - could not identify correct file in nlp path") + + # write to file + g = open(os.path.join(self.library.nlp_path,"bg_text.txt"), "w") + for t in text_out: + g.write((t + "\n")) + g.close() + + return text_out + + def _get_top_bigrams_exclude_special_tokens(self, tokens, top_n): + + bigrams = [] + for z in range(1, len(tokens)): + + # skip special tokens in the BOW starting with "[" and "<" + if str(tokens[z - 1]).startswith("[") or str(tokens[z - 1]).startswith("<") or \ + str(tokens[z]).startswith("[") or str(tokens[z]).startswith("<"): + do_nothing = 0 + else: + # excluded the special tokens - capture bigram + + entry = (tokens[z - 1] + "_" + tokens[z]) + bigrams.append(entry) + + d = Counter(bigrams) + dc = d.most_common(top_n) + + return dc + + def get_bigrams(self, bow_list): + + top_bigrams_out = [] + + for x in bow_list: + + bow_fp = os.path.join(self.library.nlp_path,x) + + bow = open(bow_fp, mode="r", encoding="utf-8", errors='ignore').read().split(",") + + bigrams = self._get_top_bigrams_exclude_special_tokens(bow, self.bigram_count) + + for b in bigrams: + # floor for asserting bigram + if b[1] > 10: + top_bigrams_out.append(b) + + # prune size of bigrams list + if len(top_bigrams_out) > self.bigram_count: + top_bigrams_out = top_bigrams_out[0:self.bigram_count] + + bigrams_sorted = sorted(top_bigrams_out, key=lambda x: x[1], reverse=True) + + return bigrams_sorted + + def get_bow_list(self): + + ds_fp = self.library.nlp_path + files = os.listdir(ds_fp) + bow_list = [] + for x in files: + if str(x).startswith("bow"): + bow_list.append(x) + + if len(bow_list) > 1: + bow_list = sorted(bow_list) + last_bow = open(os.path.join(ds_fp,bow_list[-1]), "r").read().split(",") + bow_count = (len(bow_list) - 1) * self.bow_max + len(last_bow) + elif len(bow_list) == 1: + only_bow = open(os.path.join(ds_fp,bow_list[0]), "r").read().split(",") + bow_count = len(only_bow) + else: + bow_count = 0 + + return bow_count, bow_list + + def export_graph_to_visualize (self, graph_target_size): + + # exports graph elements in node/edge dataset, packaged for popular visualization libraries + # e.g., vis.Network (Javascript) + # e.g., networkX (Python) + + if self.library.get_knowledge_graph_status() != "yes": + self.build_graph() + + context_search = self.retrieve_knowledge_graph() + + # Step 1 - build full graph from context_search_table + + node_dataset = [] + edge_dataset = [] + + if len(context_search) > 2 * graph_target_size: + max_ct = 2 * graph_target_size + else: + max_ct = len(context_search) + + edge_counter = 0 + node_counter = 0 + + for z in range(0, max_ct): + t = context_search[z][0] + l = len(context_search[z][1]) + + new_node = {"id": t, "label": t, "shape": "dot", "size": 10} + if new_node not in node_dataset: + node_dataset.append(new_node) + node_counter += 1 + + if l > graph_target_size: + l = graph_target_size + + for y in range(0, l): + c = context_search[z][1][y][0] + w = context_search[z][1][y][1] + + # 
G_viz.add_edge(t,c,weight=w,title="") + + new_c_node = {"id": c, "label": c, "shape": "dot", "size": 10} + if new_c_node not in node_dataset: + node_dataset.append(new_c_node) + node_counter += 1 + + new_edge = {"from": t, "title": "", "to": c, "weight": w} + new_edge_rev = {"from": c, "title": "", "to": t, "weight": w} + if new_edge not in edge_dataset: + edge_dataset.append(new_edge) + edge_counter += 1 + + return node_dataset, edge_dataset + + def export_graph_with_query_to_visualize(self, graph_target_size, query): + + # runs a 'pseudo-query' on graph, and retrieves elements from graph 'neighborhood' for visualization + # exports graph elements in node/edge dataset, packaged for popular visualization libraries + # e.g., vis.Network (Javascript) + # e.g., networkX (Python) + + if self.library.get_knowledge_graph_status() != "yes": + self.build_graph() + + b = CorpTokenizer(one_letter_removal=True, remove_stop_words=True, remove_numbers=True) + + query_tokens = CorpTokenizer().tokenize(query) + + context_search = self.retrieve_knowledge_graph() + + if context_search is None or len(context_search) == 0: + logging.info("update: Graph - knowledge graph appears to be empty") + + # Step 0 - find targeted keyword in context_search + + node_dataset = [] + edge_dataset = [] + + # G = nx.Graph() + counter = 0 + red_nodes = [] + + for tokens in query_tokens: + for z in range(0, len(context_search)): + if tokens.lower() == context_search[z][0].lower(): + # G.add_node(context_search[z][0],color="red") + t = context_search[z][0] + new_node = {"color": "red", "id": t, "label": t, "shape": "dot", "size": 10} + if new_node not in node_dataset: + node_dataset.append(new_node) + red_nodes.append(new_node) + + if len(context_search[z][1]) > graph_target_size: + l = graph_target_size + else: + l = len(context_search[z][1]) + + logging.info("update: Graph - in targeted_build - found match: %s %s %s %s", len(context_search[z][1]), l, + tokens, new_node) + + for y in range(0, l): + c = context_search[z][1][y][0] + w = context_search[z][1][y][1] + + # G.add_edge(context_search[z][0],c,weight=w,title="") + + t = context_search[z][0] + + new_c_node = {"id": c, "label": c, "shape": "dot", "size": 10} + + if new_c_node not in node_dataset and c.lower() not in query_tokens: + logging.info("update: Graph - adding node: %s", new_c_node) + node_dataset.append(new_c_node) + + new_edge = {"from": t, "title": "", "to": c, "weight": w} + if new_edge not in edge_dataset: + edge_dataset.append(new_edge) + counter += 1 + + for x in range(0, len(context_search)): + if c.lower() == context_search[x][0].lower(): + if len(context_search[x][1]) > int(graph_target_size / 2): + l2 = int(graph_target_size / 2) + else: + l2 = len(context_search[x][1]) + + for y2 in range(0, l2): + c2 = context_search[x][1][y2][0] + w2 = context_search[x][1][y2][1] + + # G.add_edge(context_search[x][0],c2,weight=w2,title="") + + t = context_search[x][0] + + new_node = {"id": t, "label": t, "shape": "dot", "size": 10} + if new_node not in node_dataset and t.lower() not in query_tokens: + node_dataset.append(new_node) + + new_c_node = {"id": c2, "label": c2, "shape": "dot", "size": 10} + if new_c_node not in node_dataset and c2.lower() not in query_tokens: + node_dataset.append(new_c_node) + + new_edge = {"from": t, "title": "", "to": c2, "weight": w2} + if new_edge not in edge_dataset: + edge_dataset.append(new_edge) + + counter += 1 + + return red_nodes, node_dataset, edge_dataset + + def get_unique_vocab_len(self): + return 
len(self.get_unique_vocab_lookup()) + + def get_unique_vocab_lookup(self): + + if self.library.get_knowledge_graph_status() != "yes": + self.build_graph() + + j = json.load(open(os.path.join(self.library.nlp_path,"vocab_lookup.json"), "r")) + + return j + + def get_unique_vocab_reverse_lookup(self): + + if self.library.get_knowledge_graph_status() != "yes": + self.build_graph() + + j = json.load(open(os.path.join(self.library.nlp_path,"token_lookup.json"), "r")) + + return j + + def retrieve_knowledge_graph(self): + + ct = [] + + if not os.path.exists(os.path.join(self.library.nlp_path,"bg.txt")): + d = -1 + if d == -1: + # initialization failed - so contexts_np = [] + contexts_np = np.array([], dtype=object) + return contexts_np + + if os.path.exists(os.path.join(self.library.nlp_path,"bg.txt")): + + ct_raw = open(os.path.join(self.library.nlp_path,"bg.txt"), + mode='r', encoding='utf-8', errors='ignore').read().split(',') + + new_row = [] + target = ct_raw[0] + start = 0 + got_tuple = 0 + + for x in range(1, len(ct_raw)): + + if "" in ct_raw[x]: + full_row = (target, new_row) + ct.append(full_row) + start = 0 + target = ct_raw[x].split("\n")[-1] + # if x < len(ct_raw) - 2: target = ct_raw[x + 1] + + if start == 1: + if got_tuple == 0: + new_row.append((ct_raw[x], ct_raw[x + 1])) + got_tuple = 1 + else: + got_tuple = 0 + + if ct_raw[x] == "": + new_row = [] + start = 1 + + contexts_np = np.array(ct, dtype=object) + + return contexts_np + + def retrieve_mcw_counts(self): + + if self.library.get_knowledge_graph_status() != "yes": + + logging.info("update: to retrieve_mcw_counts, the knowledge graph must be created for this library. " + "This is a 'one-time' build, and depending upon the size of the library, may take a little " + "bit of time.") + + self.build_graph() + + try: + mcw = open(os.path.join(self.library.nlp_path,"mcw_counts.txt"), "r").read().split(",") + + except OSError: + logging.exception("error: Graph - opening mcw_counts file - path not found.") + return [], [] + + mcw_count_list = [] + mcw_names_only = [] + + for z in range(0, len(mcw), 2): + + if (z + 1) < len(mcw): + try: + new_entry = (mcw[z], int(mcw[z + 1])) + mcw_count_list.append(new_entry) + mcw_names_only.append(mcw[z]) + + except: + logging.error("error: Graph - unexpected mcw file issue - %s %s %s", z, mcw[z], mcw[z + 1]) + + return mcw_count_list, mcw_names_only + + def retrieve_bigrams(self): + + if self.library.get_knowledge_graph_status() != "yes": + self.build_graph() + + try: + bigrams = open(os.path.join(self.library.nlp_path,"bigrams.txt"), "r").read().split(",") + + except OSError: + logging.exception("error: Graph - unexpected error opening bigrams file.") + return [] + + bigram_pairs_list = [] + + for z in range(0, len(bigrams), 2): + + if (z + 1) < len(bigrams): + try: + bigs = bigrams[z].split("_") + new_entry = (bigrams[z], int(bigrams[z + 1]), bigs[0], bigs[1]) + bigram_pairs_list.append(new_entry) + + except: + logging.error("error: Graph - unexpected problem with bigram file" + "- %s %s %s ", z, bigrams[z], bigrams[z + 1]) + + return bigram_pairs_list + + def get_library_data_stats(self): + + library_stats = {} + + lib_card = self.library.get_library_card(self.library.library_name) + + # basic library counting data + doc_count = {"documents": lib_card["documents"]} + block_count = {"blocks": lib_card["blocks"]} + image_count = {"images": lib_card["images"]} + table_count = {"tables": lib_card["tables"]} + + library_stats.update(doc_count) + library_stats.update(block_count) + 
library_stats.update(image_count) + library_stats.update(table_count) + + # statistical analysis prepared during initialization + bigrams = self.retrieve_bigrams() + + if len(bigrams) > 50: + bigrams = bigrams[0:50] + + library_stats.update({"bigrams": bigrams}) + + mcw_list, mcw_names_only = self.retrieve_mcw_counts() + + if len(mcw_list) > 50: + mcw_list = mcw_list[0:50] + + library_stats.update({"mcw": mcw_list}) + + # repackage summary of bg + bg = self.retrieve_knowledge_graph() + + if len(bg) > 50: + bg = bg[0:50] + bg_out = [] + for t in bg: + + if len(t) > 1: + target = t[0] + context = t[1] + context_out = [] + if len(context) > 0: + if len(context) > 10: + context = context[0:10] + for y in range(0, len(context)): + context_out.append(context[y]) + new_row = {"target": target, "context": context_out} + bg_out.append(new_row) + + library_stats.update({"graph_top": bg_out}) + + # get BOW + unique vocab data from manifest.json in /nlp + + try: + data_manifest = json.load(open(os.path.join(self.library.nlp_path,"manifest.json"), "r")) + + except OSError: + logging.exception("error: Graph - could not open manifest file at path- %s ", self.library.nlp_path) + data_manifest = {} + + if "bow_count" in data_manifest: + library_stats.update({"bow_count": data_manifest["bow_count"]}) + + if "unique_vocab_len" in data_manifest: + library_stats.update({"unique_vocab_len": data_manifest["unique_vocab_len"]}) + + return library_stats + + def bow_adhoc_builder(self, sentence_list): + + bow_out = [] + b = CorpTokenizer(one_letter_removal=True, remove_stop_words=True, remove_numbers=True) + + for sentences in sentence_list: + tokens = b.tokenize(sentences) + for t in tokens: + bow_out.append(t) + + return bow_out + + def mcw_adhoc_builder(self, bow): + + c = Counter(bow) + mc = c.most_common() + + return mc + + def retrieve_mcw(self): + + if self.library.get_knowledge_graph_stats() != "yes": + self.build_graph() + + mcw = open(os.path.join(self.library.nlp_path,"mcw_counts.txt"), "r").read().split(",") + mcw_pairs_list = [] + + for z in range(0, len(mcw), 2): + + if (z + 1) < len(mcw): + new_entry = (mcw[z], mcw[z + 1]) + mcw_pairs_list.append(new_entry) + + return mcw_pairs_list + + def assemble_top_blocks(self, block_scores_list,doc_id, max_samples=3): + + blocks_to_get = min(max_samples, len(block_scores_list)) + bloks_out = "" + + for x in range(0,blocks_to_get): + + if len(block_scores_list[x]) == 2: + if block_scores_list[x][0].startswith("block_id="): + bid = int(block_scores_list[x][0][len("block_id="):]) + + filter_dict = {"doc_ID": int(doc_id), "block_ID": bid} + blok_qr = CollectionRetrieval(self.library.collection).filter_by_key_dict(filter_dict) + if blok_qr: + bloks_out += blok_qr[0]["text"] + "\n" + + return bloks_out + + def doc_graph_builder (self): + + # * note: this method loops through a lot of key analytical artifacts at a document level * + # * there are several commented out items which we will look to explore/add in future versions * + # * ... will also look to shift this to C + background process for performance ... 
* + + dataset_fp = self.library.nlp_path + + nlp_files = os.listdir(dataset_fp) + + my_bow_iter_list = [] + for files in nlp_files: + if files.startswith("bow") and files.endswith(".txt"): + my_bow_iter_list.append(files) + + my_bow_iter_list = sorted(my_bow_iter_list) + + doc_graph = [] + + bow_byte_index = 0 + + for b in range(0,len(my_bow_iter_list)): + + bow_file = my_bow_iter_list[b] + + bow_file_object = open(os.path.join(dataset_fp,bow_file), mode="r", encoding="utf-8",errors="ignore") + + if b == 0: + # skip ahead to the current byte index + bow_file_object.seek(bow_byte_index,0) + + bow = bow_file_object.read().split("<") + + last_found_block = 0 + doc_start = 1 + + for x in range(doc_start,len(bow)): + + entry = bow[x].split(",") + + if len(entry) > 1 and entry[0].startswith("doc_id"): + ct = [] + doc_bow = entry[1:] + doc_id_tmp = entry[0][7:-1] + c = Counter(doc_bow) + mc = c.most_common(20) + mc_updated = [] + + for y in range(0, len(mc)): + my_context_row = [] + + if not(mc[y][0].startswith("[") or mc[y][0].startswith("<")): + mc_updated.append(mc[y]) + + for z in range(0, len(doc_bow)): + if mc[y][0] == doc_bow[z]: + + if z - 3 >= 0: lb = 3 + else: lb = z + + if z + 4 < len(doc_bow): lf = 3 + else: lf = len(doc_bow) - z - 1 + + for a in range(z - lb, z): + if not doc_bow[a].startswith("["): + my_context_row.append(doc_bow[a]) + for b in range(z + 1, z + 1 + lf): + if not doc_bow[b].startswith("["): + my_context_row.append(doc_bow[b]) + + cs = Counter(my_context_row) + new_row = cs.most_common(10) + + o = (mc[y][0], new_row) + ct.append(o) + + for nr in new_row: + c = nr[0] + w = nr[1] + + blocks = bow[x].split("[") + + doc_id_confirm = blocks[0].split(",")[0] + if len(blocks) >= 1: + try: + first_block_in_doc = blocks[1].split(",")[0][:-1] + last_block_in_doc = blocks[-1].split(",")[0][:-1] + except: + logging.error("error: malformed BOW - need to investigate root cause") + first_block_in_doc = "block_id=" + str(last_found_block) + last_block_in_doc = "block_id=" + str(last_found_block) + else: + first_block_in_doc = "block_id=" + str(last_found_block) + last_block_in_doc = "block_id=" + str(last_found_block) + + last_found_block = last_block_in_doc + + block_scores = [] + for b in blocks: + score = 0 + elements = b.split(",") + block_id = elements[0][:-1] + tokens = elements[1:] + for t in tokens: + for a in range(0,len(mc)): + if t == mc[a][0]: + score += mc[a][1] + if score > 0: + new_entry = (block_id, score) + block_scores.append(new_entry) + + block_scores = sorted(block_scores, key=lambda j:j[1], reverse=True) + if len(block_scores) > 20: + block_scores = block_scores[0:20] + + d = {"doc_ID": doc_id_tmp, + "block_scores": block_scores, + "most_common_words": mc_updated, + "context_table": ct, + "first_block_in_doc": first_block_in_doc, + "last_block_in_doc": last_block_in_doc} + + doc_graph.append(d) + + # write to manifest.json for knowledge graph + json_dict = json.dumps(doc_graph,indent=1) + with open(self.library.nlp_path + "doc_graph.json","w") as outfile: + outfile.write(json_dict) + + return doc_graph + + def kg_query_counts(self, query): + + # 'queries' the knowledge graph to find related terms + + if self.library.get_knowledge_graph_status() != "yes": + + logging.info("update: use of this method requires a 'one-time' creation of knowledge graph on the " + "library, which is being created now - this may take some time depending upon the size " + "of the library %s", self.library) + + self.library.build_graph() + + bigram_list = 
Graph(self.library).retrieve_bigrams() + mcw_count, mcw_names_only = Graph(self.library).retrieve_mcw_counts() + context_search = Graph(self.library).retrieve_knowledge_graph() + query_tokens = CorpTokenizer().tokenize(query) + + count_dict = {} + + for tok in query_tokens: + + for j, entry in enumerate(mcw_count): + if tok == entry[0]: + count_dict.update({tok:entry[1]}) + break + + return count_dict + + def kg_query_related_bigrams(self, query): + + # 'queries' the knowledge graph to find related terms + + if self.library.get_knowledge_graph_status() != "yes": + + logging.info("update: use of this method requires a 'one-time' creation of knowledge graph on the " + "library, which is being created now - this may take some time depending upon the size " + "of the library %s", self.library) + + self.library.build_graph() + + enhanced_search_terms = [] + + bigram_list = Graph(self.library).retrieve_bigrams() + mcw_count, mcw_names_only = Graph(self.library).retrieve_mcw_counts() + context_search = Graph(self.library).retrieve_knowledge_graph() + query_tokens = CorpTokenizer().tokenize(query) + + output_dict = {} + count_dict = {} + + for tok in query_tokens: + for i, bigram in enumerate(bigram_list): + bigram_splitter = bigram[0].split("_") + if tok in bigram_splitter: + output_dict.update({bigram[0]: bigram[1]}) + + for j, entry in enumerate(mcw_count): + if tok == entry[0]: + count_dict.update({tok:entry[1]}) + break + + bigrams_out = {"bigrams": output_dict, "counts": count_dict} + + logging.info("update: Graph - bigrams out - %s ", bigrams_out) + + return bigrams_out + + def kg_query(self, query, th=10): + + # 'queries' the knowledge graph to find related terms + + if self.library.get_knowledge_graph_status() != "yes": + + logging.info("update: use of this method requires a 'one-time' creation of knowledge graph on the " + "library, which is being created now - this may take some time depending upon the size " + "of the library %s", self.library) + + self.library.build_graph() + + enhanced_search_terms = [] + + bigrams = Graph(self.library).retrieve_bigrams() + mcw_count = Graph(self.library).retrieve_mcw_counts() + + context_search = Graph(self.library).retrieve_knowledge_graph() + + query_tokens = CorpTokenizer().tokenize(query) + + output_dict = {} + + for z in range(0, len(query_tokens)): + + output_dict.update({query_tokens[z]: []}) + + for y in range(0, len(context_search)): + if query_tokens[z] == context_search[y][0]: + if context_search[y][1]: + for c in range(0, len(context_search[y][1])): + tmp_count = context_search[y][1][c][1] + + if int(tmp_count) > th: + g_entry = context_search[y][1][c][0] + + if g_entry not in output_dict[query_tokens[z]]: + output_dict[query_tokens[z]].append(g_entry) + + if g_entry not in enhanced_search_terms: + enhanced_search_terms.append(g_entry) + + if c > 3: + break + + return output_dict + + +class Datasets: + + def __init__(self, library=None, ds_folder=None, validation_split=0.1, testing_split=0.1, tokenizer=None): + + # loading a library object is required for most, but not all, of the dataset builds + # if no library passed, and it is required, then exception raised in the dataset builder method + + self.library = library + self.library_name = None + self.account_name = "llmware" + + if library: + self.library_name = library.library_name + self.account_name = library.account_name + + # set up path where dataset files will be created and stored + + if not ds_folder: + + if library: + # default preferred path - put /dataset folder archives in 
library path structure + self.work_folder = self.library.dataset_path + else: + # backup - will place in /tmp path + self.work_folder = LLMWareConfig().get_tmp_path() + else: + # will put in passed ds_folder path + self.work_folder = ds_folder + + # incorporate tokenizer + if tokenizer: + self.tokenizer = tokenizer + else: + self.tokenizer = Utilities().get_default_tokenizer() + + # these are char-level tests, so 'independent' of specific tokenization + self.text_sample_max_len = 512 + self.text_long_sample_max_len = 2048 + self.text_long_sample_min_len = 64 + self.text_short_sample_max_len = 128 + self.text_empty_min_threshold = 50 + + # base folder path for newly created dataset asset will start with .ds_base_name + self.ds_base_name = "dataset_" + self.ds_id_mode = "uuid" + + # after building dataset, this will be populated with the name of the current ds + self.current_ds_name = "" + + # separator configs + self.separator = "\n" + + self.file_batch_size = 50000 + + self.alpaca = {"intro_blurb": "Below is an instruction that describes a task. " + "Write a response that appropriately completes the request.", + "user_separator": " ### Instruction: ", + "response_separator": " ### Response: ", + "end_of_text_separator": "<|endoftext|>" + } + + self.human_bot = {"intro_blurb": "", + "user_separator": ": ", + "response_separator": "\n: ", + "end_of_text_separator": "<|endoftext|>" } + + self.chatgpt = {"system_instruction": "You are a helpful assistant who speaks with facts and no wasted words."} + + self.testing_split = testing_split + self.validation_split = validation_split + + self.training_sample_file_name_base = "training_samples" + self.testing_sample_file_name_base = "testing_samples" + self.validation_sample_file_name_base = "validation_samples" + + def token_counter(self, text_sample): + toks = self.tokenizer.encode(text_sample).ids + return len(toks) + + def tokenize_text(self, text_sample): + toks = self.tokenizer.encode(text_sample).ids + return toks + + def get_dataset_sample(self, ds_name, ds_path=None, sample_range=1000): + + # useful for testing to randomly sample an element from the dataset + # picks a sample randomly from the first training sample file + + if ds_path: + self.work_folder = ds_path + + ds_folder = os.path.join(self.work_folder,ds_name) + + first_training_file = self.training_sample_file_name_base + "_0.jsonl" + + if not os.path.exists(os.path.join(ds_folder, first_training_file)): + raise FilePathDoesNotExistException(os.path.join(ds_folder, first_training_file)) + + # picks from first training file + train_file = [] + my_file = open(os.path.join(ds_folder, first_training_file), 'r', encoding='utf-8') + for lines in my_file: + new_row = json.loads(lines) + train_file.append(new_row) + + if len(train_file) > sample_range: + r = random.randint(0, sample_range) + else: + r = random.randint(0, len(train_file) - 1) + + ds_sample = train_file[r] + + return ds_sample + + def issue_new_ds_id (self, custom_id=None, mode="uuid"): + + # issue new ds_id + ds_id = "default_new" + + if custom_id: + ds_id = custom_id + else: + + if mode == "time_stamp": + ds_id = str(Utilities().get_current_time_now()) + + elif mode == "uuid": + ds_id = str(Utilities().get_uuid()) + + elif mode == "random_number": + ds_id = str(random.randint(1000000, 9999999)) + + # create new dataset specific folder + self.current_ds_name = self.ds_base_name + ds_id + new_ds_folder = os.path.join(self.work_folder,self.current_ds_name) + if not os.path.exists(new_ds_folder): + os.mkdir(new_ds_folder) + + 
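+ # note (illustrative only): with the default ds_base_name "dataset_" and the default "uuid" mode, the folder
+ # created above lands under self.work_folder with a name like "dataset_<uuid>" - e.g. a hypothetical path of
+ # <library dataset_path>/dataset_1b2e.../ - and is where all training/validation/testing files get written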
return ds_id, new_ds_folder + + def package_chatgpt_sample(self, turn1, turn2, add_system_instruction=True): + + if "system_instruction" in self.chatgpt: + system_instruction = self.chatgpt["system_instruction"] + else: + system_instruction = "You are a helpful assistant." + + if add_system_instruction: + new_sample = [{"role": "system", "content": system_instruction}, + {"role": "user", "content": turn1}, + {"role": "assistant", "content": turn2}] + else: + # if no system instruction, then do not add + new_sample = [{"role": "user", "content": turn1}, {"role": "assistant", "content": turn2}] + + return new_sample + + def package_human_bot_sample(self, turn1, turn2): + + if "intro_blurb" in self.human_bot: + intro_blurb = self.human_bot["intro_blurb"] + if intro_blurb: + intro_blurb += self.separator + else: + intro_blurb = "" + + if "user_separator" in self.human_bot: + user_separator = self.human_bot["user_separator"] + else: + user_separator = ": " + + if "response_separator" in self.human_bot: + response_separator = self.human_bot["response_separator"] + else: + response_separator = "\n: " + + if "end_of_text" in self.human_bot: + end_of_text = self.human_bot["end_of_text"] + else: + end_of_text = "<|endoftext|>" + + content = intro_blurb + user_separator + turn1 + self.separator + response_separator + turn2 + end_of_text + + sample = {"text": content} + + return sample + + def package_alpaca_sample(self, instruction, response): + + if "intro_blurb" in self.alpaca: + intro_blurb = self.alpaca["intro_blurb"] + else: + intro_blurb = "Below is an instruction that describes a task. " \ + "Write a response that appropriately completes the request." + + if "user_separator" in self.alpaca: + user_separator = self.alpaca["user_separator"] + else: + user_separator = " ### Instruction: " + + if "response_separator" in self.alpaca: + response_separator = self.alpaca["response_separator"] + else: + response_separator = " ### Response: " + + if "end_of_text" in self.alpaca: + end_of_text = self.alpaca["end_of_text"] + else: + end_of_text = "<|endoftext|>" + + content = intro_blurb + self.separator + \ + user_separator + instruction + \ + response_separator + response + self.separator + end_of_text + + sample = {"text": content} + + return sample + + def build_text_ds (self, min_tokens=100, max_tokens=1000,query=None,filter_dict=None, qr=None, custom_id=None): + + # create specific folder for dataset artifacts inside library dataset path + ds_id, ds_folder = self.issue_new_ds_id(custom_id=custom_id) + + if not qr: + + # optional: by passing query and/or filter_dict, allows targeted 'subset' of library to be used + if not query and not filter_dict: + + # by default, will get only text and table entries, but no images (since text is duplicative) + filter_list = ["text", "table"] + + if self.library: + results = CollectionRetrieval(self.library.collection).filter_by_key_value_range("content_type",filter_list) + else: + raise LibraryObjectNotFoundException("no-library-loaded-in-Dataset-constructor") + + else: + + if self.library: + results = CollectionRetrieval(self.library.collection).\ + text_search_with_key_value_dict_filter(query, filter_dict) + else: + raise LibraryObjectNotFoundException("no-library-loaded-in-Dataset-constructor") + + else: + results = qr + + counter = 0 + batch_counter = 0 + output = [] + text_out = [] + batch_number = 0 + total_sample_count = 0 + training_sample_count = 0 + testing_sample_count = 0 + validation_sample_count = 0 + training_files_created = [] + 
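+ # note: the training/validation/testing "files_created" lists below collect the file names returned by
+ # save_tr_va_te_sets - samples are flushed to disk every self.file_batch_size samples (50,000 by default),
+ # producing training_samples_0.jsonl, training_samples_1.jsonl, ... one set per batch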
validation_files_created = [] + testing_files_created = [] + + text_sample = "" + current_doc = 0 + + results = sorted(results, key=lambda x:x["doc_ID"], reverse=False) + + for i, elements in enumerate(results): + + if i == 0: + current_doc = elements["doc_ID"] + text_sample = elements["text"] + + tok_count = self.token_counter(text_sample) + + # if in target range or if last sample in doc + if min_tokens <= tok_count <= max_tokens or elements["doc_ID"] != current_doc: + + # create sample + # replace in output doc_ID for file_source? "doc_ID" | current_doc + new_entry = {"sample_number": counter, "file_source": elements["file_source"], "text": text_sample} + output.append(new_entry) + text_out.append(text_sample) + counter += 1 + batch_counter += 1 + + # edge case for i==0 + if i == 0: + text_sample = "" + else: + # start fresh + text_sample = elements["text"] + current_doc = elements["doc_ID"] + else: + if tok_count <= min_tokens: + text_sample += " " + elements["text"] + tok_count = self.token_counter(text_sample) + + if tok_count >= max_tokens: + + while tok_count > max_tokens: + + tokens = self.tokenize_text(text_sample) + chopped = tokens[0:max_tokens] + remainder = tokens[max_tokens:] + remainder_text = self.tokenizer.decode(remainder) + chopped_text = self.tokenizer.decode(chopped) + + smooth_stop = self._smooth_stopper(chopped_text,200) + + new_text_sample = chopped_text[:smooth_stop] + new_remainder = chopped_text[smooth_stop:] + remainder_text + + # replacing doc_ID: current_doc + new_entry = {"sample_number": counter, "file_source": elements["file_source"], + "text": new_text_sample} + + output.append(new_entry) + text_out.append(text_sample) + counter += 1 + batch_counter += 1 + text_sample = new_remainder + tok_count = self.token_counter(text_sample) + + # pick up last entry, if any + if len(text_sample) > 0: + + # replacing "doc_ID" | current_doc + new_entry = {"sample_number": counter, "file_source": elements["file_source"], + "text": text_sample} + + output.append(new_entry) + text_out.append(text_sample) + counter += 1 + batch_counter += 1 + + # pick up last remaining sample, if any + if len(text_sample) > 0: + + # replacing "doc_ID" | current_doc + new_entry = {"sample_number": counter, "file_source": elements["file_source"], "text": text_sample} + output.append(new_entry) + text_out.append(text_sample) + counter += 1 + batch_counter += 1 + + if batch_counter >= self.file_batch_size: + # write samples to file + start new batch + + output, text_out, testing_set, validation_set, testing_text, validation_text = \ + self.test_validation_splitter(output, text_out) + + training_sample_count += len(output) + validation_sample_count += len(validation_set) + testing_sample_count += len(testing_set) + + tr, va, te = self.save_tr_va_te_sets(output, text_out, validation_set, validation_text, + testing_set, testing_text, ds_folder, batch_number) + + if tr: + for f in tr: + training_files_created.append(f) + + if va: + for f in va: + validation_files_created.append(f) + + if te: + for f in te: + testing_files_created.append(f) + + batch_number += 1 + total_sample_count += len(output) + total_sample_count += len(validation_set) + total_sample_count += len(testing_set) + output = [] + text_out = [] + batch_counter = 0 + + if len(output) > 0: + + output, text_out, testing_set, validation_set, testing_text, validation_text = \ + self.test_validation_splitter(output, text_out) + + training_sample_count += len(output) + validation_sample_count += len(validation_set) + testing_sample_count += 
len(testing_set) + + tr, va, te = self.save_tr_va_te_sets(output, text_out, validation_set, validation_text, + testing_set, testing_text, ds_folder, batch_number) + + if tr: + for f in tr: + training_files_created.append(f) + + if va: + for f in va: + validation_files_created.append(f) + + if te: + for f in te: + testing_files_created.append(f) + + total_sample_count += len(output) + total_sample_count += len(validation_set) + total_sample_count += len(testing_set) + + dataset_dict = {"ds_id": ds_id, + "training_samples": training_sample_count, + "training_files": training_files_created, + "validation_samples": validation_sample_count, + "validation_files": validation_files_created, + "testing_samples": testing_sample_count, + "testing_files": testing_files_created, + "batches": batch_number + 1, + "prompt_wrapping": "None", + "description": "Core unsupervised text chunk dataset useful for text embedding " + "fine-tuning and domain adaptation with token span size between " + "{} - {}".format(str(min_tokens),str(max_tokens)), + "features": ["text", "file_source", "sample_number"] + } + + # save dataset dict -> and put in ds folder + json_dict = json.dumps(dataset_dict,indent=2) + with open(os.path.join(ds_folder, "manifest.json"),"w") as outfile: + outfile.write(json_dict) + + return dataset_dict + + def build_gen_ds_headline_topic_prompter (self, prompt_wrapping="human_bot", custom_id=None, qr=None): + + # create specific folder for dataset artifacts inside library dataset path + ds_id, ds_folder = self.issue_new_ds_id(custom_id=custom_id) + + if not qr: + # basic filter to get all text and tables in collection + filter_list = ["text", "table"] + + if self.library: + results = CollectionRetrieval(self.library.collection).\ + filter_by_key_value_range("content_type", filter_list) + else: + raise LibraryObjectNotFoundException("no-library-loaded-in-Dataset-constructor") + + else: + results = qr + + total_sample_count = 0 + training_sample_count = 0 + validation_sample_count = 0 + testing_sample_count = 0 + batch_number = 0 + text_out = [] + training_files_created = [] + validation_files_created = [] + testing_files_created = [] + batch_counter = 0 + counter = 0 + output = [] + new_sample = None + features = [] + + for elements in results: + + text_long = elements["text"] + if not text_long: + text_long = elements["table"] + + text_short = elements["header_text"] + doc_id = elements["doc_ID"] + + # looking for samples that are 'organically' paired + if text_long and text_short: + + if len(text_long) > self.text_long_sample_min_len and len(text_short) > self.text_empty_min_threshold: + # need to additional checks if text_long is > max + + if prompt_wrapping == "human_bot": + + instruction = "Please write a paragraph based on the topic: " + new_sample = self.package_human_bot_sample(text_short,text_long) + features = ["text"] + + if prompt_wrapping == "alpaca": + + instruction = "Please write a paragraph based on the topic: " + text_short + response = text_long + new_sample = self.package_alpaca_sample(instruction,response) + features = ["text"] + + if prompt_wrapping == "chat_gpt": + + instruction = "Please write a paragraph based on the topic: " + text_short + new_sample = self.package_chatgpt_sample(instruction, text_long) + features = ["role", "text"] + + if prompt_wrapping == "dict" or not new_sample: + + new_sample = {"sample_number": counter, "file_source": elements["file_source"], + "text_long": text_long, + "text_short": text_short} + + features = ["sample_number", "file_source", 
"text_long", "text_short"] + + text_entry = text_long + self.separator + text_short + text_out.append(text_entry) + output.append(new_sample) + + counter += 1 + batch_counter += 1 + + if batch_counter >= self.file_batch_size: + + output, text_out, testing_set, validation_set, testing_text, validation_text = \ + self.test_validation_splitter(output, text_out) + + training_sample_count += len(output) + validation_sample_count += len(validation_set) + testing_sample_count += len(testing_set) + + tr, va, te = self.save_tr_va_te_sets(output, text_out, validation_set, validation_text, + testing_set, testing_text, ds_folder, batch_number) + + if tr: + for f in tr: + training_files_created.append(f) + + if va: + for f in va: + validation_files_created.append(f) + + if te: + for f in te: + testing_files_created.append(f) + + total_sample_count += batch_counter + batch_counter = 0 + output = [] + text_out = [] + batch_number += 1 + + if len(output) > 0: + + output, text_out, testing_set, validation_set, testing_text, validation_text = \ + self.test_validation_splitter(output, text_out) + + training_sample_count += len(output) + validation_sample_count += len(validation_set) + testing_sample_count += len(testing_set) + + tr, va, te = self.save_tr_va_te_sets(output, text_out, validation_set, validation_text, + testing_set, testing_text, ds_folder, batch_number) + + if tr: + for f in tr: training_files_created.append(f) + + if va: + for f in va: validation_files_created.append(f) + + if te: + for f in te: testing_files_created.append(f) + + total_sample_count += batch_counter + + # results.close() + + dataset_dict = {"ds_id": ds_id, + "training_samples": training_sample_count, + "training_files": training_files_created, + "validation_samples": validation_sample_count, + "validation_files": validation_files_created, + "testing_samples": testing_sample_count, + "testing_files": testing_files_created, + "batches": batch_number + 1, + "prompt_wrapping": prompt_wrapping, + "description": "Generative AI Dataset created in self-supervised extraction of 'headlines', " + "paired with longer neighboring text passages. In this dataset, the 'headline' " + "is used a prompter topic with the expected Generative output to be a longer " + "paragraph or text on the selected headline subject matter- assembled in format " + "{} for generative model fine-tuning".format(prompt_wrapping), + "features": features} + + # save dataset dict -> and put in ds folder + json_dict = json.dumps(dataset_dict,indent=2) + with open(os.path.join(ds_folder, "manifest.json"),"w") as outfile: + outfile.write(json_dict) + + return dataset_dict + + def build_gen_ds_headline_text_xsum(self, prompt_wrapping="human_bot", custom_id=None, qr=None): + + # create specific folder for dataset artifacts inside library dataset path + ds_id, ds_folder = self.issue_new_ds_id(custom_id=custom_id) + + if not qr: + filter_list = ["text"] # includes only text - should tables be excluded ? 
+ + if self.library: + results = CollectionRetrieval(self.library.collection).\ + filter_by_key_value_range("content_type", filter_list) + else: + raise LibraryObjectNotFoundException("no-library-loaded-in-Dataset-constructor") + + else: + results = qr + + total_sample_count = 0 + training_sample_count = 0 + validation_sample_count = 0 + testing_sample_count = 0 + batch_number = 0 + text_out = [] + training_files_created = [] + validation_files_created = [] + testing_files_created = [] + batch_counter = 0 + counter = 0 + output = [] + new_sample = None + features = [] + + for elements in results: + + text_long = elements["text"] + if not text_long: + text_long = elements["table"] + + text_short = elements["header_text"] + doc_id = elements["doc_ID"] + + # looking for samples that are 'organically' paired + if text_long and text_short: + + if len(text_long) > self.text_long_sample_min_len and len(text_short) > self.text_empty_min_threshold: + # need to additional checks if text_long is > max + + if prompt_wrapping == "human_bot": + instruction = "Please read the following passage, and provide a short summary.\n" + text_long + new_sample = self.package_human_bot_sample(instruction, text_short) + features = ["text"] + + if prompt_wrapping == "alpaca": + instruction = "Please read the following passage, and provide a short summary.\n" + text_long + response = text_short + new_sample = self.package_alpaca_sample(instruction, response) + features = ["text"] + + if prompt_wrapping == "chat_gpt": + instruction = "Please read the following passage, and provide a short summary.\n" + text_long + new_sample = self.package_chatgpt_sample(instruction, text_short) + features = ["role", "text"] + + if prompt_wrapping == "dict" or not new_sample: + new_sample = {"sample_number": counter, "file_source": elements["file_source"], + "text_long": text_long, + "text_short": text_short} + features = ["sample_number", "file_source", "text_long", "text_short"] + + text_entry = text_long + self.separator + text_short + text_out.append(text_entry) + output.append(new_sample) + + counter += 1 + batch_counter += 1 + + if batch_counter >= self.file_batch_size: + + output, text_out, testing_set, validation_set, testing_text, validation_text = \ + self.test_validation_splitter(output, text_out) + + training_sample_count += len(output) + validation_sample_count += len(validation_set) + testing_sample_count += len(testing_set) + + tr, va, te = self.save_tr_va_te_sets(output, text_out, validation_set, validation_text, + testing_set, testing_text, ds_folder, batch_number) + + if tr: + for f in tr: + training_files_created.append(f) + + if va: + for f in va: + validation_files_created.append(f) + + if te: + for f in te: + testing_files_created.append(f) + + total_sample_count += batch_counter + batch_counter = 0 + output = [] + text_out = [] + batch_number += 1 + + if len(output) > 0: + + output, text_out, testing_set, validation_set, testing_text, validation_text = \ + self.test_validation_splitter(output, text_out) + + training_sample_count += len(output) + validation_sample_count += len(validation_set) + testing_sample_count += len(testing_set) + + tr, va, te = self.save_tr_va_te_sets(output, text_out, validation_set, validation_text, + testing_set, testing_text, ds_folder, batch_number) + + if tr: + for f in tr: training_files_created.append(f) + + if va: + for f in va: validation_files_created.append(f) + + if te: + for f in te: testing_files_created.append(f) + + total_sample_count += batch_counter + + # results.close() 
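+ # illustrative only: with prompt_wrapping="human_bot", each training row written above is a single
+ # {"text": ...} sample assembled by package_human_bot_sample, roughly:
+ #   intro_blurb + user_separator + "Please read the following passage, and provide a short summary.\n"
+ #   + text_long + separator + response_separator + text_short + end_of_text
+ # with the separators taken from self.human_bot where present, else the fallbacks inside package_human_bot_sample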
+ + dataset_dict = {"ds_id": ds_id, + "training_samples": training_sample_count, + "training_files": training_files_created, + "validation_samples": validation_sample_count, + "validation_files": validation_files_created, + "testing_samples": testing_sample_count, + "testing_files": testing_files_created, + "batches": batch_number + 1, + "prompt_wrapping": prompt_wrapping, + "description": "Generative AI Dataset for 'XSUM' or extreme summarization, created in " + "self-supervised extraction of 'headlines' paired with neighboring text " + "passages, and assembled in {} format for generative model " + "fine-tuning.".format(prompt_wrapping), + "features": features} + + # save dataset dict -> and put in ds folder + json_dict = json.dumps(dataset_dict, indent=2) + with open(os.path.join(ds_folder, "manifest.json"), "w") as outfile: + outfile.write(json_dict) + + return dataset_dict + + def build_gen_dialog_ds (self, prompt_wrapping="human_bot", human_first=True, role_dict=None, + custom_id=None, qr=None): + + # create specific folder for dataset artifacts inside library dataset path + ds_id, ds_folder = self.issue_new_ds_id(custom_id=custom_id) + + if not qr: + + if self.library: + dialogs = CollectionRetrieval(self.library.collection).filter_by_key("dialog", "true") + else: + raise LibraryObjectNotFoundException("no-library-loaded-in-Dataset-constructor") + + dialogs = sorted(dialogs, key=lambda x:x["doc_ID"], reverse=False) + + if len(dialogs) == 0: + logging.error("error: Datasets builder - not able to identify text as dialog conversation turns") + return - 1 + else: + dialogs = qr + + # counters + output = [] + total_sample_count = 0 + training_sample_count = 0 + validation_sample_count = 0 + testing_sample_count = 0 + training_files_created = [] + validation_files_created = [] + testing_files_created = [] + text_out = [] + batch_number = 0 + batch_counter = 0 + + if len(dialogs) == 0: + logging.error("error: Datasets - no dialog transcripts found") + return -1 + + # pull the doc_id for the first document + current_doc = dialogs[0]["doc_ID"] + current_transcript = [] + current_speaker_list = [] + + for x in range(0,len(dialogs)): + + # bundle all of the conversational turns by document + if dialogs[x]["doc_ID"] == current_doc: + current_transcript.append(dialogs[x]) + if dialogs[x]["author_or_speaker"] not in current_speaker_list: + current_speaker_list.append(dialogs[x]["author_or_speaker"]) + + else: + # process transcript + + transcript_output, trans_text = self._conversation_builder(current_transcript, current_speaker_list, + prompt_wrapping="human_bot") + + output += transcript_output + text_out += trans_text + batch_counter = len(output) + + # reset + current_transcript = [dialogs[x]] + current_speaker_list = [dialogs[x]["author_or_speaker"]] + current_doc = dialogs[x]["doc_ID"] + + # need to confirm "dialog" & then transcript-by-transcript - assigning roles by different speakers + + if batch_counter >= self.file_batch_size: + + output, text_out, testing_set, validation_set, testing_text, validation_text = \ + self.test_validation_splitter(output, text_out) + + training_sample_count += len(output) + validation_sample_count += len(validation_set) + testing_sample_count += len(testing_set) + + tr, va, te = self.save_tr_va_te_sets(output, text_out, validation_set, validation_text, + testing_set, testing_text, ds_folder, batch_number) + + if tr: + for f in tr: + training_files_created.append(f) + + if va: + for f in va: + validation_files_created.append(f) + + if te: + for f in te: + 
testing_files_created.append(f) + + total_sample_count += batch_counter + batch_counter = 0 + output = [] + text_out = [] + batch_number += 1 + + if len(output) > 0: + + output, text_out, testing_set, validation_set, testing_text, validation_text = \ + self.test_validation_splitter(output, text_out) + + training_sample_count += len(output) + validation_sample_count += len(validation_set) + testing_sample_count += len(testing_set) + + tr, va, te = self.save_tr_va_te_sets(output, text_out, validation_set, validation_text, + testing_set, testing_text, ds_folder, batch_number) + + if tr: + for f in tr: training_files_created.append(f) + + if va: + for f in va: validation_files_created.append(f) + + if te: + for f in te: testing_files_created.append(f) + + total_sample_count += batch_counter + + # results.close() + + dataset_dict = {"ds_id": ds_id, + "training_samples": training_sample_count, + "training_files": training_files_created, + "validation_samples": validation_sample_count, + "validation_files": validation_files_created, + "testing_samples": testing_sample_count, + "testing_files": testing_files_created, + "batches": batch_number + 1, + "prompt_wrapping": prompt_wrapping, + "description": "Generative AI fine-tuning dataset, generated in self-supervised process using " + "dialog transcripts to re-create role-based dialog.", + "features": ["text"]} + + # save dataset dict -> and put in ds folder + json_dict = json.dumps(dataset_dict, indent=2) + with open(os.path.join(ds_folder, "manifest.json"), "w") as outfile: + outfile.write(json_dict) + + return dataset_dict + + def _conversation_builder(self, conversation_blocks, speaker_list, prompt_wrapping="chat_gpt"): + + # note: currently only supports a human_bot format, and assumes human is first speaker + + # inner loop that builds output from a list of conversational turns within a single transcript + dialog_turn = [] + first_speaker = "" + last_speaker = "" + running_convo = "" + output = [] + text_output = [] + + for i, convo in enumerate(conversation_blocks): + + if i == 0: + first_speaker = convo["author_or_speaker"] + running_convo = convo["text"] + dialog_turn.append([first_speaker, running_convo]) + last_speaker = convo["author_or_speaker"] + else: + # general case + if convo["author_or_speaker"] == last_speaker: + running_convo += convo["text"] + for j, speakers in enumerate(dialog_turn): + if speakers[0] == last_speaker: + dialog_turn[j] = [last_speaker, running_convo] + else: + # new speaker + if convo["author_or_speaker"] == first_speaker: + + # wrap up the convo thread + + # prepare output record + turns = [] + for k, convo_turns in enumerate(dialog_turn): + turns.append(convo_turns[1]) + + prompt_wrapping = "human_bot" + if prompt_wrapping == "human_bot": + sample = "" + p = ": " + for t in turns: + sample += p + t + "\n" + # alternate + if p == ": ": + p = ": " + else: + p = ": " + + sample_record = {"text": sample} + output.append(sample_record) + text_output.append(sample) + + # resets + dialog_turn = [] + dialog_turn.append([first_speaker, convo["text"]]) + running_text = convo["text"] + last_speaker = first_speaker + + else: + + running_convo = convo["text"] + last_speaker = convo["author_or_speaker"] + in_list = False + for s, speakers in enumerate(dialog_turn): + if last_speaker == speakers[0]: + dialog_turn[s] = [last_speaker, running_convo] + in_list = True + if not in_list: + dialog_turn.append([last_speaker,running_convo]) + + return output, text_output + + def build_gen_ds_from_prompt_history (self, 
prompt_wrapping="alpaca", custom_id=None): + + # create specific folder for dataset artifacts inside library dataset path + ds_id, ds_folder = self.issue_new_ds_id(custom_id=custom_id) + + ai_results = PromptState().full_history() + + # counters + batch_counter = 0 + counter = 0 + output = [] + total_sample_count = 0 + training_sample_count = 0 + validation_sample_count = 0 + testing_sample_count = 0 + training_files_created = [] + validation_files_created = [] + testing_files_created = [] + text_out = [] + batch_number = 0 + + for i, entries in enumerate(ai_results): + + prompt = str(entries["prompt"]) + evidence = str(entries["evidence"]) + ai_output = str(entries["llm_response"]) + instruction = str(entries["instruction"]) + sample = None + + if prompt_wrapping not in ["human_bot", "alpaca", "chat_gpt"]: + + prompt_wrapping = "human_bot" + + if prompt_wrapping == "human_bot": + + turn1 = evidence + "\n" + prompt + turn2 = ai_output + sample = self.package_human_bot_sample(turn1,turn2) + + if prompt_wrapping == "alpaca": + + instruction = evidence + "\n" + prompt + response = ai_output + sample = self.package_alpaca_sample(instruction,response) + + if prompt_wrapping == "chat_gpt": + + turn1 = evidence + "\n" + prompt + turn2 = ai_output + sample = self.package_chatgpt_sample(turn1,turn2) + + if sample: + + output.append(sample) + + text_agg = instruction + "\n" + prompt + "\n" + evidence + "\n" + ai_output + text_out.append(text_agg) + + if batch_counter >= self.file_batch_size: + + output, text_out, testing_set, validation_set, testing_text, validation_text = \ + self.test_validation_splitter(output, text_out) + + training_sample_count += len(output) + validation_sample_count += len(validation_set) + testing_sample_count += len(testing_set) + + tr, va, te = self.save_tr_va_te_sets(output, text_out, validation_set, validation_text, + testing_set, testing_text, ds_folder, batch_number) + + if tr: + for f in tr: + training_files_created.append(f) + + if va: + for f in va: + validation_files_created.append(f) + + if te: + for f in te: + testing_files_created.append(f) + + total_sample_count += batch_counter + batch_counter = 0 + output = [] + text_out = [] + batch_number += 1 + + if len(output) > 0: + + output, text_out, testing_set, validation_set, testing_text, validation_text = \ + self.test_validation_splitter(output, text_out) + + training_sample_count += len(output) + validation_sample_count += len(validation_set) + testing_sample_count += len(testing_set) + + tr, va, te = self.save_tr_va_te_sets(output, text_out, validation_set, validation_text, + testing_set, testing_text, ds_folder, batch_number) + + if tr: + for f in tr: training_files_created.append(f) + + if va: + for f in va: validation_files_created.append(f) + + if te: + for f in te: testing_files_created.append(f) + + total_sample_count += batch_counter + + # results.close() + + dataset_dict = {"ds_id": ds_id, + "training_samples": training_sample_count, + "training_files": training_files_created, + "validation_samples": validation_sample_count, + "validation_files": validation_files_created, + "testing_samples": testing_sample_count, + "testing_files": testing_files_created, + "batches": batch_number + 1, + "prompt_wrapping": prompt_wrapping, + "description": "Generative AI Dataset created self-supervised from AI audit log records that " + "capture all facets of generative AI inferences, and can re-packaged to enhance " + "fine-tuning.", + "features": ["text"]} + + # save dataset dict -> and put in ds folder + json_dict = 
json.dumps(dataset_dict, indent=2) + with open(os.path.join(ds_folder, "manifest.json"), "w") as outfile: + outfile.write(json_dict) + + return dataset_dict + + def build_visual_ds_image_labels (self, query=None, filter_dict=None, qr=None, custom_id=None): + + # create specific folder for dataset artifacts inside library dataset path + ds_id, ds_folder = self.issue_new_ds_id(custom_id=custom_id) + + if not qr: + + # optional: by passing query and/or filter_dict, allows targeted 'subset' of library to be used + if not query and not filter_dict: + + # by default, will get only text and table entries, but no images (since text is duplicative) + filter_list = ["image"] + + if self.library: + results = CollectionRetrieval(self.library.collection).filter_by_key_value_range("content_type", + filter_list) + else: + raise LibraryObjectNotFoundException("no-library-loaded-in-Dataset-constructor") + + else: + # 'assert' content_type == image in filter_dict to only retrieve images + filter_dict.update({"content_type": "image"}) + + if self.library: + results = CollectionRetrieval(self.library.collection). \ + text_search_with_key_value_dict_filter(query, filter_dict) + else: + raise LibraryObjectNotFoundException("no-library-loaded-in-Dataset-constructor") + + else: + results = qr + + batch_counter = 0 + counter = 0 + output = [] + total_sample_count = 0 + training_sample_count = 0 + validation_sample_count = 0 + testing_sample_count = 0 + training_files_created = [] + validation_files_created = [] + testing_files_created = [] + text_out = [] + batch_number = 0 + + for elements in results: + text_long = elements["text"] + text_short = elements["header_text"] + doc_id = elements["doc_ID"] + block_id = elements["block_ID"] + file_name = elements["external_files"] + + if text_long or text_short: + + if len(text_long) > self.text_empty_min_threshold or len(text_short) > self.text_empty_min_threshold: + + new_entry = {"sample_number": counter, "image_ref": file_name, "doc_ID": doc_id, + "block_ID": block_id, "text_long": text_long, "text_short": text_short} + + output.append(new_entry) + text_entry = text_long + self.separator + text_short + text_out.append(text_entry) + + counter += 1 + batch_counter += 1 + + if batch_counter >= self.file_batch_size: + + output, text_out, testing_set, validation_set, testing_text, validation_text = \ + self.test_validation_splitter(output, text_out) + + training_sample_count += len(output) + validation_sample_count += len(validation_set) + testing_sample_count += len(testing_set) + + tr, va, te = self.save_tr_va_te_sets(output, text_out, validation_set, validation_text, + testing_set, testing_text, ds_folder, batch_number) + + if tr: + for f in tr: training_files_created.append(f) + + if va: + for f in va: validation_files_created.append(f) + + if te: + for f in te: testing_files_created.append(f) + + total_sample_count += batch_counter + + batch_counter = 0 + output = [] + text_out = [] + batch_number += 1 + + if len(output) > 0: + + output, text_out, testing_set, validation_set, testing_text, validation_text = \ + self.test_validation_splitter(output, text_out) + + training_sample_count += len(output) + validation_sample_count += len(validation_set) + testing_sample_count += len(testing_set) + + tr, va, te = self.save_tr_va_te_sets(output, text_out, validation_set, validation_text, + testing_set, testing_text, ds_folder, batch_number) + + if tr: + for f in tr: training_files_created.append(f) + + if va: + for f in va: validation_files_created.append(f) + + if te: + for 
f in te: testing_files_created.append(f) + + total_sample_count += batch_counter + + # results.close() + + # need to package up images into zip folder + + dataset_dict = {"ds_id": ds_id, + "training_samples": training_sample_count, + "training_files": training_files_created, + "validation_samples": validation_sample_count, + "validation_files": validation_files_created, + "testing_samples": testing_sample_count, + "testing_files": testing_files_created, + "batches": batch_number + 1, + "description": "Generative Visual dataset, captured in self-supervised automated process " + "by associating nearby text with images for training visual description " + "generation.", + "features": ["sample_number","image_ref","doc_ID","block_ID","text_long","text_short"]} + + # save dataset dict -> and put in ds folder + json_dict = json.dumps(dataset_dict, indent=2) + with open(os.path.join(ds_folder, "manifest.json"), "w") as outfile: + outfile.write(json_dict) + + return dataset_dict + + def build_gen_ds_targeted_text_completion (self, prompt_wrapping="alpaca", + query=None, filter_dict=None, qr=None, custom_id=None): + + # create specific folder for dataset artifacts inside library dataset path + ds_id, ds_folder = self.issue_new_ds_id(custom_id=custom_id) + + if not qr: + + # optional: by passing query and/or filter_dict, allows targeted 'subset' of library to be used + if not query and not filter_dict: + + # by default, will get only text and table entries, but no images (since text is duplicative) + filter_list = ["text", "table"] + + if self.library: + results = CollectionRetrieval(self.library.collection).filter_by_key_value_range("content_type", + filter_list) + else: + raise LibraryObjectNotFoundException("no-library-loaded-in-Dataset-constructor") + else: + + if self.library: + results = CollectionRetrieval(self.library.collection).\ + text_search_with_key_value_dict_filter(query, filter_dict) + else: + raise LibraryObjectNotFoundException("no-library-loaded-in-Dataset-constructor") + + else: + results = qr + + batch_number = 0 + training_files_created = [] + validation_files_created = [] + testing_files_created = [] + + counter = 0 + batch_counter = 0 + training_sample_count = 0 + validation_sample_count = 0 + testing_sample_count = 0 + total_sample_count = 0 + text_sample = "" + current_doc = -1 + min_tokens = 100 + max_tokens = 1000 + new_sample = "" + text_out = [] + output = [] + + for i, elements in enumerate(results): + + if i == 0: + current_doc = elements["doc_ID"] + text_sample = elements["text"] + + tok_count = self.token_counter(text_sample) + + # if in target range or if last sample in doc + if min_tokens <= tok_count <= max_tokens or elements["doc_ID"] != current_doc: + + # split the sample + text_tokens = self.tokenize_text(text_sample) + tok_count = len(text_tokens) + r = random.randint(0, tok_count-1) + t1 = self.tokenizer.decode(text_tokens[0:r]) + t2 = self.tokenizer.decode(text_tokens[r:]) + + if prompt_wrapping == "human_bot": + instruction = "Please read the following text, and provide a completion.\n" + t1 + new_sample = self.package_human_bot_sample(instruction, t2) + features = ["text"] + + if prompt_wrapping == "alpaca": + instruction = "Please read the following text, and provide a completion.\n" + t1 + new_sample = self.package_alpaca_sample(instruction, t2) + features = ["text"] + + if prompt_wrapping == "chat_gpt": + instruction = "Please read the following text, and provide a completion.\n" + t1 + new_sample = self.package_chatgpt_sample(instruction, t2) + features 
= ["role", "text"] + + if prompt_wrapping == "dict" or not new_sample: + new_sample = {"sample_number": counter, "file_source": elements["file_source"], + "text": t1, + "completion": t2} + features = ["sample_number", "file_source", "text", "completion"] + + text_entry = t1 + self.separator + t2 + text_out.append(text_entry) + + output.append(new_sample) + text_out.append(text_sample) + counter += 1 + batch_counter += 1 + + # edge case for i==0 + if i == 0: + text_sample = "" + else: + # start fresh + text_sample = elements["text"] + current_doc = elements["doc_ID"] + else: + if tok_count <= min_tokens: + text_sample += " " + elements["text"] + tok_count = self.token_counter(text_sample) + + if tok_count >= max_tokens: + + while tok_count > max_tokens: + + tokens = self.tokenize_text(text_sample) + chopped = tokens[0:max_tokens] + remainder = tokens[max_tokens:] + remainder_text = self.tokenizer.decode(remainder) + chopped_text = self.tokenizer.decode(chopped) + + smooth_stop = self._smooth_stopper(chopped_text,200) + + new_text_sample = chopped_text[:smooth_stop] + new_remainder = chopped_text[smooth_stop:] + remainder_text + + # split the sample + text_tokens = self.tokenize_text(text_sample) + tok_count = len(text_tokens) + r = random.randint(0, tok_count - 1) + t1 = self.tokenizer.decode(text_tokens[0:r]) + t2 = self.tokenizer.decode(text_tokens[r:]) + + if prompt_wrapping == "human_bot": + instruction = "Please read the following text, and provide a completion.\n" + t1 + new_sample = self.package_human_bot_sample(instruction, t2) + features = ["text"] + + if prompt_wrapping == "alpaca": + instruction = "Please read the following text, and provide a completion.\n" + t1 + new_sample = self.package_alpaca_sample(instruction, t2) + features = ["text"] + + if prompt_wrapping == "chat_gpt": + instruction = "Please read the following text, and provide a completion.\n" + t1 + new_sample = self.package_chatgpt_sample(instruction, t2) + features = ["role", "text"] + + if prompt_wrapping == "dict" or not new_sample: + new_sample = {"sample_number": counter, "file_source": elements["file_source"], + "text": t1, + "completion": t2} + features = ["sample_number", "file_source", "text", "completion"] + + text_sample = t1 + "\n" + t2 + output.append(new_sample) + text_out.append(text_sample) + counter += 1 + batch_counter += 1 + text_sample = new_remainder + tok_count = self.token_counter(text_sample) + + # pick up last remaining sample, if any + if len(text_sample) > 0: + + # split the sample + text_tokens = self.tokenize_text(text_sample) + tok_count = len(text_tokens) + r = random.randint(0, tok_count - 1) + t1 = self.tokenizer.decode(text_tokens[0:r]) + t2 = self.tokenizer.decode(text_tokens[r:]) + + if prompt_wrapping == "human_bot": + instruction = "Please read the following text, and provide a completion.\n" + t1 + new_sample = self.package_human_bot_sample(instruction, t2) + features = ["text"] + + if prompt_wrapping == "alpaca": + instruction = "Please read the following text, and provide a completion.\n" + t1 + new_sample = self.package_alpaca_sample(instruction, t2) + features = ["text"] + + if prompt_wrapping == "chat_gpt": + instruction = "Please read the following text, and provide a completion.\n" + t1 + new_sample = self.package_chatgpt_sample(instruction, t2) + features = ["role", "text"] + + if prompt_wrapping == "dict" or not new_sample: + new_sample = {"sample_number": counter, "file_source": elements["file_source"], + "text": t1, + "completion": t2} + features = ["sample_number", 
"file_source", "text", "completion"] + + # replacing "doc_ID" | current_doc + text_sample = t1 + "\n" + t2 + output.append(new_sample) + text_out.append(text_sample) + counter += 1 + batch_counter += 1 + + if batch_counter >= self.file_batch_size: + # write samples to file + start new batch + + output, text_out, testing_set, validation_set, testing_text, validation_text = \ + self.test_validation_splitter(output, text_out) + + training_sample_count += len(output) + validation_sample_count += len(validation_set) + testing_sample_count += len(testing_set) + + tr, va, te = self.save_tr_va_te_sets(output, text_out, validation_set, validation_text, + testing_set, testing_text, ds_folder, batch_number) + + if tr: + for f in tr: + training_files_created.append(f) + + if va: + for f in va: + validation_files_created.append(f) + + if te: + for f in te: + testing_files_created.append(f) + + batch_number += 1 + total_sample_count += len(output) + total_sample_count += len(validation_set) + total_sample_count += len(testing_set) + output = [] + text_out = [] + batch_counter = 0 + + if len(output) > 0: + + output, text_out, testing_set, validation_set, testing_text, validation_text = \ + self.test_validation_splitter(output, text_out) + + training_sample_count += len(output) + validation_sample_count += len(validation_set) + testing_sample_count += len(testing_set) + + tr, va, te = self.save_tr_va_te_sets(output, text_out, validation_set, validation_text, + testing_set, testing_text, ds_folder, batch_number) + + if tr: + for f in tr: + training_files_created.append(f) + + if va: + for f in va: + validation_files_created.append(f) + + if te: + for f in te: + testing_files_created.append(f) + + total_sample_count += len(output) + total_sample_count += len(validation_set) + total_sample_count += len(testing_set) + + dataset_dict = {"ds_id": ds_id, + "training_samples": training_sample_count, + "training_files": training_files_created, + "validation_samples": validation_sample_count, + "validation_files": validation_files_created, + "testing_samples": testing_sample_count, + "testing_files": testing_files_created, + "batches": batch_number + 1, + "description": "Generative Text/Completion Dataset - splits selected sentences to " + "create an open-context 'what is the completion?' 
text gen dataset.", + "features": ["text"]} + + # save dataset dict -> and put in ds folder + json_dict = json.dumps(dataset_dict, indent=2) + with open(os.path.join(ds_folder, "manifest.json"), "w") as outfile: + outfile.write(json_dict) + + return dataset_dict + + def test_validation_splitter(self, output, text_out): + + # 100% training with no validation and testing split option + if self.validation_split == 0.0 and self.testing_split == 0.0: + return output, text_out, [], [], [], [] + + validation_count = int(self.validation_split * len(output)) + testing_count = int(self.testing_split * len(output)) + + output_new = [] + text_out_new = [] + testing_set = [] + validation_set = [] + testing_text = [] + validation_text = [] + + random_samples_list = [] + first_entry = random.randint(0, len(output) - 1) + random_samples_list.append(first_entry) + + for x in range(1, validation_count + testing_count): + i = first_entry + while i in random_samples_list: + i = random.randint(0, len(output) - 1) + random_samples_list.append(i) + + validation_adder = 0 + for x in range(0, len(output)): + if x not in random_samples_list: + # keep in training set + output_new.append(output[x]) + text_out_new.append(text_out[x]) + else: + # put in either validation or testing set + if validation_adder < validation_count: + # fill up validation first + validation_set.append(output[x]) + validation_text.append(text_out[x]) + validation_adder += 1 + else: + # once validation set filled, then build testing set + testing_set.append(output[x]) + testing_text.append(text_out[x]) + + return output_new, text_out_new, testing_set, validation_set, testing_text, validation_text + + def save_tr_va_te_sets(self, tr_output, tr_text, va_output, va_text, te_output, te_text, ds_folder, batch_number): + + training_files_created = [] + validation_files_created = [] + testing_files_created = [] + + # save training files + json_batch = self.training_sample_file_name_base + "_{}.jsonl".format(str(batch_number)) + with open(os.path.join(ds_folder,json_batch), "w") as outfile: + for i, sample_dict in enumerate(tr_output): + jsonl_row = json.dumps(sample_dict) + outfile.write(jsonl_row) + outfile.write("\n") + + outfile.close() + + training_files_created.append(json_batch) + + # save validation set + + if len(va_output) > 0: + + new_json_batch = self.validation_sample_file_name_base + "_{}.jsonl".format(str(batch_number)) + with open(os.path.join(ds_folder,new_json_batch), "w") as outfile: + for i, sample_dict in enumerate(va_output): + jsonl_row = json.dumps(sample_dict) + outfile.write(jsonl_row) + outfile.write("\n") + + outfile.close() + + validation_files_created.append(new_json_batch) + + # save testing set + + if len(te_output) > 0: + + new_json_batch = self.testing_sample_file_name_base + "_{}.jsonl".format(str(batch_number)) + with open(os.path.join(ds_folder,new_json_batch), "w") as outfile: + + for i, sample_dict in enumerate(te_output): + jsonl_row = json.dumps(sample_dict) + outfile.write(jsonl_row) + outfile.write("\n") + + outfile.close() + + testing_files_created.append(new_json_batch) + + # save text only version for easy access + new_txt_batch = self.training_sample_file_name_base + "_text_{}.txt".format(str(batch_number)) + t = open(os.path.join(ds_folder,new_txt_batch), 'w') + for x in range(0, len(tr_text)): + t.write((str(tr_text[x]) + "\n")) + t.close() + + training_files_created.append(new_txt_batch) + + # save validation text only version for easy access + + if len(va_text) > 0: + new_txt_batch = 
self.validation_sample_file_name_base + "_text_{}.txt".format(str(batch_number)) + t = open(os.path.join(ds_folder,new_txt_batch), 'w') + for x in range(0, len(va_text)): + t.write((str(va_text[x]) + "\n")) + t.close() + + validation_files_created.append(new_txt_batch) + + # save testing text only version for easy access + + if len(te_text) > 0: + new_txt_batch = self.testing_sample_file_name_base + "_text_{}.txt".format(str(batch_number)) + t = open(os.path.join(ds_folder,new_txt_batch), 'w') + for x in range(0, len(te_text)): + t.write((str(te_text[x]) + "\n")) + t.close() + + testing_files_created.append(new_txt_batch) + + return training_files_created, validation_files_created, testing_files_created + + # not connected yet - will evaluate further + def _create_image_zip(self, image_list, ds_path): + + zip_name = os.path.join(ds_path, "image.zip") + ds_folder = self.library.image_path + + with ZipFile(zip_name, 'w') as ZipF: + for f in image_list: + ZipF.write(ds_folder + f, f, compress_type=ZIP_DEFLATED) + + ZipF.close() + + return zip_name + + def _smooth_stopper(self, text_chunk, look_back_range): + + # default case is to return the whole text sample as single chunk + smooth_stop = len(text_chunk) + + # look back is the full range that will be reviewed to find proper stopping point + if len(text_chunk) > look_back_range: + look_back = len(text_chunk) - look_back_range + else: + look_back = 0 + + # best case - look for a period + found_period = -1 + for x in range(len(text_chunk)-1,look_back,-1): + + # found a period followed by white space marker (space, \n, \r) - best case + if ord(text_chunk[x]) == 46: + + # first confirm that '.' is followed by white space or is the end of the text + if x+1 == len(text_chunk) or ord(text_chunk[x + 1]) in [32, 13, 10]: + + # exclude several edge cases where '.' is not a reliable sentence end + # short_window holds the few characters immediately before the period + short_window = text_chunk[max(0, x-5):x] + + # (A) first edge case - "two periods close to each other", e.g., "x.y." + if "." not in short_window: + + # (B) second edge case - "period after number in list", e.g., "point 2."
+ if not 47 < ord(short_window[-1]) < 58: + + # (C) third edge case - common abbreviations immediately before the period + if short_window[-2:] != "Mr" and short_window[-3:] != "Mrs" and short_window[-2:] != "Dr": + + # if none of (A) - (B) - (C) apply, then consider the period a valid stopping point + found_period = x + 1 + break + + # alternate solid stopper is presence of \n\n | \n\r | \r\r -> usually marks a section/para end + if ord(text_chunk[x]) in [10,13]: + if x+1 == len(text_chunk) or ord(text_chunk[x+1]) in [10,13]: + found_period = x+1 + break + + # if found a period, then smooth stop is the char right after the period + if found_period > -1: + smooth_stop = found_period + + else: + # if no period found, then next best case is to look for whitespace between words + for y in range(len(text_chunk) - 1, look_back,-1): + + # look for a white space separator + if ord(text_chunk[y]) in [32, 13, 10]: + smooth_stop = y + break + + # if no period or white space found, then return the original stopper + + return smooth_stop + + +# simple API wrapper around the popular Yahoo Finance API - used in Prompt to pull in real-time info + +class YFinance: + + def __init__(self, ticker=None): + + """ + Widely used Yahoo Finance API - key object: + TickerObj = yahooFinance.Ticker("META") + print("All Info : ", TickerObj.info) + for keys, values in TickerObj.info.items(): + print("keys: ", keys, values) + + # display Company Sector + print("Company Sector : ", TickerObj.info['sector']) + + # display Price Earnings Ratio + print("Price Earnings Ratio : ", TickerObj.info['trailingPE']) + + # display Company Beta + print(" Company Beta : ", TickerObj.info['beta']) + print(" Financials : ", TickerObj.get_financials()) + """ + + self.company_info = None + + self.financial_summary_keys = ["shortName", "symbol","marketCap", "totalRevenue", "ebitda", "revenueGrowth", "grossMargins", + "freeCashflow", "priceToSalesTrailing12Months", "grossMargins","currency"] + + self.stock_summary_keys = ["shortName", "symbol", "exchange","bid", "ask", "fiftyTwoWeekLow", "fiftyTwoWeekHigh", "symbol", + "shortName", "longName", "currentPrice", "targetHighPrice", "targetLowPrice", + "returnOnAssets", "returnOnEquity", "trailingPE", "forwardPE", "volume", + "forwardEps", "pegRatio", "currency"] + + self.risk_summary_keys = ["shortName","symbol", "auditRisk", "boardRisk", "compensationRisk", "shareHolderRightsRisk", "overallRisk", + "shortName", "longBusinessSummary"] + + self.company_summary_keys = ["shortName", "longName", "symbol", "marketCap", "companyOfficers", "website", + "industry", "sector", "longBusinessSummary", "fullTimeEmployees"] + + self.keys = ["address1", "city", "state", "zip", "country", "phone","website","industry", + "industryDisp", "sector", "sectorDisp", "longBusinessSummary", "fullTimeEmployees", + "companyOfficers", "auditRisk", "boardRisk", "compensationRisk", "shareHolderRightsRisk", + "overallRisk", "previousClose", "open", "dayLow", "dayHigh", "regularMarketPreviousClose", + "regularMarketOpen", "regularMarketDayLow", "regularMarketDayHigh", "payoutRatio", "beta", + "trailingPE", "forwardPE", "volume", "regularMarketVolume", "averageVolume", + "averageVolume10days", "bid", "ask", "bidSize", "askSize", "marketCap", "fiftyTwoWeekLow", + "fiftyTwoWeekHigh", "priceToSalesTrailing12Months", "fiftyDayAverage", "twoHundredDayAverage", + "trailingAnnualDividendRate", "trailingAnnualDividendYield", "currency", "enterpriseValue", + "profitMargins", "floatShares", "sharesOutstanding", "sharesShort", "sharesShortPriorMonth", +
"sharesShortPreviousMonthDate", "dateShortInterest", "sharesPercentSharesOut", + "heldPercentInsiders", "heldPercentInstitutions", "shortRatio", "shortPercentOfFloat", + "impliedSharesOutstanding", "bookValue", "priceToBook", "lastFiscalYearEnd", + "nextFiscalYearEnd", "mostRecentQuarter", "earningsPerQuarterlyGrowth", "netIncomeToCommon", + "trailingEps", "forwardEps", "pegRatio", "enterpriseToRevenue", "enterpriseToEbitda", + "52WeekChange", "SandP52WeekChange", "exchange", "quoteType", "symbol", "underlyingSymbol", + "shortName", "longName", "currentPrice", "targetHighPrice", "targetLowPrice", "targetMeanPrice", + "targetMedianPrice", "recommendationMean", "recommendationKey", "numberOfAnalystOpinions", + "totalCash", "totalCashPerShare", "ebitda", "totalDebt", "quickRatio", "currentRatio", + "totalRevenue", "debtToEquity", "revenuePerShare", "returnOnAssets" "returnOnEquity", "grossProfits", + "freeCashflow", "operatingCashflow", "earningsGrowth", "revenueGrowth", "grossMargins", + "ebitdaMargins", "operatingMargins", "financialCurrency", "trailingPegRatio"] + + if ticker: + self.company_info = yfinance.Ticker(ticker) + else: + self.company_info = None + + def ticker(self, company_ticker): + company_info = yfinance.Ticker(company_ticker) + return company_info + + def get_company_summary(self, ticker=None): + output_info = {} + company_info = yfinance.Ticker(ticker).info + for targets in self.company_summary_keys: + for keys, values in company_info.items(): + if targets == keys: + output_info.update({targets: values}) + return output_info + + def get_financial_summary(self, ticker=None): + output_info = {} + company_info = yfinance.Ticker(ticker).info + for targets in self.financial_summary_keys: + for keys, values in company_info.items(): + if targets == keys: + output_info.update({targets: values}) + return output_info + + def get_stock_summary(self, ticker=None): + output_info = {} + company_info = yfinance.Ticker(ticker).info + for targets in self.stock_summary_keys: + for keys,values in company_info.items(): + if targets == keys: + output_info.update({targets: values}) + return output_info + diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..3eeaefbe --- /dev/null +++ b/setup.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python + +import os +import platform +import re +import sys +from setuptools import find_packages, setup +from setuptools.command.install import install +from setuptools.command.develop import develop +from setuptools.command.egg_info import egg_info + +def custom_install_command(): + + try: + if platform.system() == "Windows": + print("llmware is not yet supported on Windows, but it's on our roadmap. Check back soon!") + sys.exit(-1) + + if platform.system() == "Darwin": + if os.system('brew --version') != 0: + error_message="llmware needs Homebrew ('brew') to be installed to setup a few depencencies." 
+ error_message+="\nInstalling Homebrew is quick and easy: https://brew.sh" + sys.exit(error_message) + os.system('brew install mongo-c-driver libpng libzip libtiff zlib tesseract poppler') + return + + if platform.system() == "Linux": + if os.system('apt list') == 0: + os.system('apt update && apt install -y gcc libxml2 libmongoc-dev libzip4') + return + if os.system('yum help') == 0: + os.system('yum update && yum -y install gcc libxml2 libmongoc-dev libzip4') + return + except Exception as e: + print(e) + # Print the error and allow the install to continue if there was any problem + + +class CustomInstallCommand(install): + def run(self): + custom_install_command() + install.run(self) + +class CustomDevelopCommand(develop): + def run(self): + custom_install_command() + develop.run(self) + +class CustomEggInfoCommand(egg_info): + def run(self): + custom_install_command() + egg_info.run(self) + +VERSION_FILE = "llmware/__init__.py" +with open(VERSION_FILE) as version_file: + match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",version_file.read(), re.MULTILINE) + +if match: + version = match.group(1) +else: + raise RuntimeError(f"Unable to find version string in {VERSION_FILE}.") + +with open("README.md") as readme_file: + long_description = readme_file.read() + + +setup( + name="llmware", # Required + version=version, # Required + description="An enterprise-grade LLM-based development framework, tools, and fine-tuned models", # Optional + long_description=long_description, # Optional + long_description_content_type="text/markdown", # Optional + url="https://github.com/llmware-ai", + project_urls={ + 'Repository': 'https://github.com/llmware-ai/llmware', + }, + author="llmware", + author_email="support@aibloks.com", # Optional + classifiers=[ # Optional + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Topic :: Software Development", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + ], + keywords="ai,data,development", # Optional + packages=['llmware'], + package_data={'llmware': ['lib/**/*.so', 'lib/**/**/*.dylib', 'default_model_repo/**/*.*']}, + python_requires=">=3.9, <3.11", + zip_safe=True, + cmdclass={ + 'install': CustomInstallCommand, + 'develop': CustomDevelopCommand, + 'egg_info': CustomEggInfoCommand, + }, + install_requires=[ + 'ai21>=1.0.3', + 'anthropic>=0.3.11', + 'beautifulsoup4>=4.11.1', + 'boto3>=1.24.53', + 'cohere>=4.1.3', + 'faiss-cpu>=1.7.4', + 'google-cloud-aiplatform>=1.33.1', + 'lxml>=4.9.3', + 'numpy>=1.23.2', + 'openai>=0.27.7', + 'pdf2image>=1.16.0', + 'Pillow>=9.2.0', + 'pymilvus>=2.3.0', + 'pymongo>=4.5.0', + 'pytesseract>=0.3.10', + 'python-on-whales>=0.64.3', + 'scipy>=1.11.2', + 'tokenizers>=0.13.3', + 'torch>=1.13.1', + 'Werkzeug>=2.3.7', + 'word2number>=1.1', + 'Wikipedia-API>=0.6.0', + 'yfinance>=0.2.28' + ] +)
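Illustrative sketch: each training/validation/testing batch produced by save_tr_va_te_sets above is written as a .jsonl file with one JSON sample per line, alongside a plain-text mirror and a manifest.json. The snippet below shows one way to read a batch back, assuming the "dict" prompt wrapping; the folder path and file name are hypothetical, since the actual prefix comes from training_sample_file_name_base.

    import json
    import os

    ds_folder = "/path/to/dataset_folder"        # assumed dataset output folder
    batch_file = "training_samples_0.jsonl"      # hypothetical name; real prefix set by training_sample_file_name_base

    samples = []
    with open(os.path.join(ds_folder, batch_file), "r") as f:
        for line in f:
            # each line holds one sample dict, e.g. sample_number, file_source, text, completion
            samples.append(json.loads(line))

    print("loaded samples: ", len(samples))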
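Usage sketch for the YFinance wrapper defined above; the import path and the "MSFT" ticker are assumptions, and the calls require the yfinance package plus network access.

    from llmware.util import YFinance    # assumed module location for the wrapper

    yf_wrapper = YFinance()
    print("company summary  : ", yf_wrapper.get_company_summary(ticker="MSFT"))
    print("financial summary: ", yf_wrapper.get_financial_summary(ticker="MSFT"))
    print("stock summary    : ", yf_wrapper.get_stock_summary(ticker="MSFT"))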