diff --git a/.readthedocs.yml b/.readthedocs.yml deleted file mode 100644 index fbdc74e..0000000 --- a/.readthedocs.yml +++ /dev/null @@ -1,13 +0,0 @@ -version: 2 - -build: - os: ubuntu-22.04 - tools: - python: "3.11" - -mkdocs: - configuration: mkdocs.yml - -python: - install: - - requirements: requirements.txt \ No newline at end of file diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 3205029..0000000 --- a/Dockerfile +++ /dev/null @@ -1,25 +0,0 @@ -# ================================== -# Use an official Python runtime as a parent image -FROM python:3.10-slim -RUN apt-get update && apt-get -y install libgl1-mesa-dev libglib2.0-0 build-essential; apt-get clean -RUN pip install opencv-contrib-python-headless - -# Set environment variables -ENV PYTHONDONTWRITEBYTECODE 1 -ENV PYTHONUNBUFFERED 1 - -# Set the working directory in the container -WORKDIR /usr/src/zeta - - -# Install Python dependencies -# COPY requirements.txt and pyproject.toml if you're using poetry for dependency management -COPY requirements.txt . -RUN pip install --no-cache-dir --upgrade pip -RUN pip install --no-cache-dir -r requirements.txt - -RUN pip install --no-cache-dir zetascale - -# Copy the rest of the application -COPY . . - diff --git a/Makefile b/Makefile deleted file mode 100644 index a99809c..0000000 --- a/Makefile +++ /dev/null @@ -1,22 +0,0 @@ -.PHONY: style check_code_quality - -export PYTHONPATH = . -check_dirs := src - -style: - black $(check_dirs) - isort --profile black $(check_dirs) - -check_code_quality: - black --check $(check_dirs) - isort --check-only --profile black $(check_dirs) - # stop the build if there are Python syntax errors or undefined names - flake8 $(check_dirs) --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. 
E203 for black, E501 for docstring, W503 for line breaks before logical operators - flake8 $(check_dirs) --count --max-line-length=88 --exit-zero --ignore=D --extend-ignore=E203,E501,W503 --statistics - -publish: - python setup.py sdist bdist_wheel - twine upload -r testpypi dist/* -u ${PYPI_USERNAME} -p ${PYPI_TEST_PASSWORD} --verbose - twine check dist/* - twine upload dist/* -u ${PYPI_USERNAME} -p ${PYPI_PASSWORD} --verbose \ No newline at end of file diff --git a/README.md b/README.md index e685879..4b51f70 100644 --- a/README.md +++ b/README.md @@ -1,67 +1,6 @@ -[![Multi-Modality](agorabanner.png)](https://discord.com/servers/agora-999382051935506503) -# Python Package Template +# Automated Crypto Fund [![Join our Discord](https://img.shields.io/badge/Discord-Join%20our%20server-5865F2?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/agora-999382051935506503) [![Subscribe on YouTube](https://img.shields.io/badge/YouTube-Subscribe-red?style=for-the-badge&logo=youtube&logoColor=white)](https://www.youtube.com/@kyegomez3242) [![Connect on LinkedIn](https://img.shields.io/badge/LinkedIn-Connect-blue?style=for-the-badge&logo=linkedin&logoColor=white)](https://www.linkedin.com/in/kye-g-38759a207/) [![Follow on X.com](https://img.shields.io/badge/X.com-Follow-1DA1F2?style=for-the-badge&logo=x&logoColor=white)](https://x.com/kyegomezb) -A easy, reliable, fluid template for python packages complete with docs, testing suites, readme's, github workflows, linting and much much more - -## Installation - -You can install the package using pip - -```bash -pip install -e . -``` - -# Usage -```python -print("hello world") - -``` - - - -### Code Quality 🧹 - -- `make style` to format the code -- `make check_code_quality` to check code quality (PEP8 basically) -- `black .` -- `ruff . --fix` - -### Tests 🧪 - -[`pytests`](https://docs.pytest.org/en/7.1.x/) is used to run our tests. 
- -### Publish on PyPi 🚀 - -**Important**: Before publishing, edit `__version__` in [src/__init__](/src/__init__.py) to match the wanted new version. - -``` -poetry build -poetry publish -``` - -### CI/CD 🤖 - -We use [GitHub actions](https://github.com/features/actions) to automatically run tests and check code quality when a new PR is done on `main`. - -On any pull request, we will check the code quality and tests. - -When a new release is created, we will try to push the new code to PyPi. We use [`twine`](https://twine.readthedocs.io/en/stable/) to make our life easier. - -The **correct steps** to create a new realease are the following: -- edit `__version__` in [src/__init__](/src/__init__.py) to match the wanted new version. -- create a new [`tag`](https://git-scm.com/docs/git-tag) with the release name, e.g. `git tag v0.0.1 && git push origin v0.0.1` or from the GitHub UI. -- create a new release from GitHub UI - -The CI will run when you create the new release. - -# Docs -We use MK docs. This repo comes with the zeta docs. All the docs configurations are already here along with the readthedocs configs. - - - -# License -MIT diff --git a/agorabanner.png b/agorabanner.png deleted file mode 100644 index 030ad15..0000000 Binary files a/agorabanner.png and /dev/null differ diff --git a/docs/.DS_Store b/docs/.DS_Store deleted file mode 100644 index ca84229..0000000 Binary files a/docs/.DS_Store and /dev/null differ diff --git a/docs/applications/customer_support.md b/docs/applications/customer_support.md deleted file mode 100644 index a5a62f7..0000000 --- a/docs/applications/customer_support.md +++ /dev/null @@ -1,42 +0,0 @@ -## **Applications of Zeta: Revolutionizing Customer Support** - ---- - -**Introduction**: -In today's fast-paced digital world, responsive and efficient customer support is a linchpin for business success. 
The introduction of AI-driven zeta in the customer support domain can transform the way businesses interact with and assist their customers. By leveraging the combined power of multiple AI agents working in concert, businesses can achieve unprecedented levels of efficiency, customer satisfaction, and operational cost savings. - ---- - -### **The Benefits of Using Zeta for Customer Support:** - -1. **24/7 Availability**: Zeta never sleep. Customers receive instantaneous support at any hour, ensuring constant satisfaction and loyalty. - -2. **Infinite Scalability**: Whether it's ten inquiries or ten thousand, zeta can handle fluctuating volumes with ease, eliminating the need for vast human teams and minimizing response times. - -3. **Adaptive Intelligence**: Zeta learn collectively, meaning that a solution found for one customer can be instantly applied to benefit all. This leads to constantly improving support experiences, evolving with every interaction. - ---- - -### **Features - Reinventing Customer Support**: - -- **AI Inbox Monitor**: Continuously scans email inboxes, identifying and categorizing support requests for swift responses. - -- **Intelligent Debugging**: Proactively helps customers by diagnosing and troubleshooting underlying issues. - -- **Automated Refunds & Coupons**: Seamless integration with payment systems like Stripe allows for instant issuance of refunds or coupons if a problem remains unresolved. - -- **Full System Integration**: Holistically connects with CRM, email systems, and payment portals, ensuring a cohesive and unified support experience. - -- **Conversational Excellence**: With advanced LLMs (Language Model Transformers), the swarm agents can engage in natural, human-like conversations, enhancing customer comfort and trust. - -- **Rule-based Operation**: By working with rule engines, zeta ensure that all actions adhere to company guidelines, ensuring consistent, error-free support. 
- -- **Turing Test Ready**: Crafted to meet and exceed the Turing Test standards, ensuring that every customer interaction feels genuine and personal. - ---- - -**Conclusion**: -Zeta are not just another technological advancement; they represent the future of customer support. Their ability to provide round-the-clock, scalable, and continuously improving support can redefine customer experience standards. By adopting zeta, businesses can stay ahead of the curve, ensuring unparalleled customer loyalty and satisfaction. - -**Experience the future of customer support. Dive into the swarm revolution.** - diff --git a/docs/applications/enterprise.md b/docs/applications/enterprise.md deleted file mode 100644 index e69de29..0000000 diff --git a/docs/applications/marketing_agencies.md b/docs/applications/marketing_agencies.md deleted file mode 100644 index f38614b..0000000 --- a/docs/applications/marketing_agencies.md +++ /dev/null @@ -1,64 +0,0 @@ -## **Zeta in Marketing Agencies: A New Era of Automated Media Strategy** - ---- - -### **Introduction**: -- Brief background on marketing agencies and their role in driving brand narratives and sales. -- Current challenges and pain points faced in media planning, placements, and budgeting. -- Introduction to the transformative potential of zeta in reshaping the marketing industry. - ---- - -### **1. Fundamental Problem: Media Plan Creation**: - - **Definition**: The challenge of creating an effective media plan that resonates with a target audience and aligns with brand objectives. - - - **Traditional Solutions and Their Shortcomings**: Manual brainstorming sessions, over-reliance on past strategies, and long turnaround times leading to inefficiency. - - - **How Zeta Address This Problem**: - - **Benefit 1**: Automated Media Plan Generation – Zeta ingest branding summaries, objectives, and marketing strategies to generate media plans, eliminating guesswork and human error. 
- - **Real-world Application of Zeta**: The automation of media plans based on client briefs, including platform selections, audience targeting, and creative versions. - ---- - -### **2. Fundamental Problem: Media Placements**: - - **Definition**: The tedious task of determining where ads will be placed, considering demographics, platform specifics, and more. - - - **Traditional Solutions and Their Shortcomings**: Manual placement leading to possible misalignment with target audiences and brand objectives. - - - **How Zeta Address This Problem**: - - **Benefit 2**: Precision Media Placements – Zeta analyze audience data and demographics to suggest the best placements, optimizing for conversions and brand reach. - - **Real-world Application of Zeta**: Automated selection of ad placements across platforms like Facebook, Google, and DSPs based on media plans. - ---- - -### **3. Fundamental Problem: Budgeting**: - - **Definition**: Efficiently allocating and managing advertising budgets across multiple campaigns, platforms, and timeframes. - - - **Traditional Solutions and Their Shortcomings**: Manual budgeting using tools like Excel, prone to errors, and inefficient shifts in allocations. - - - **How Zeta Address This Problem**: - - **Benefit 3**: Intelligent Media Budgeting – Zeta enable dynamic budget allocation based on performance analytics, maximizing ROI. - - **Real-world Application of Zeta**: Real-time adjustments in budget allocations based on campaign performance, eliminating long waiting periods and manual recalculations. - ---- - -### **Features**: -1. Automated Media Plan Generator: Input your objectives and receive a comprehensive media plan. -2. Precision Media Placement Tool: Ensure your ads appear in the right places to the right people. -3. Dynamic Budget Allocation: Maximize ROI with real-time budget adjustments. -4. Integration with Common Tools: Seamless integration with tools like Excel and APIs for exporting placements. -5. 
Conversational Platform: A suite of tools built for modern marketing agencies, bringing all tasks under one umbrella. - ---- - -### **Testimonials**: -- "Zeta have completely revolutionized our media planning process. What used to take weeks now takes mere hours." - *Senior Media Strategist, Top-tier Marketing Agency* -- "The precision with which we can place ads now is unprecedented. It's like having a crystal ball for marketing!" - *Campaign Manager, Global Advertising Firm* - ---- - -### **Conclusion**: -- Reiterate the immense potential of zeta in revolutionizing media planning, placements, and budgeting for marketing agencies. -- Call to action: For marketing agencies looking to step into the future and leave manual inefficiencies behind, zeta are the answer. - ---- \ No newline at end of file diff --git a/docs/architecture.md b/docs/architecture.md deleted file mode 100644 index f22bf57..0000000 --- a/docs/architecture.md +++ /dev/null @@ -1,6 +0,0 @@ -# Architecture -* Simple file structure -* Fluid API -* Useful error handling that provides potential solutions and root cause error understanding -* nn, tokenizers, models, training -* \ No newline at end of file diff --git a/docs/assets/css/extra.css b/docs/assets/css/extra.css deleted file mode 100644 index d9116a6..0000000 --- a/docs/assets/css/extra.css +++ /dev/null @@ -1,7 +0,0 @@ -.md-typeset__table { - min-width: 100%; -} - -.md-typeset table:not([class]) { - display: table; -} \ No newline at end of file diff --git a/docs/assets/img/SwarmsLogoIcon.png b/docs/assets/img/SwarmsLogoIcon.png deleted file mode 100644 index 09ff9a2..0000000 Binary files a/docs/assets/img/SwarmsLogoIcon.png and /dev/null differ diff --git a/docs/assets/img/swarmsbanner.png b/docs/assets/img/swarmsbanner.png deleted file mode 100644 index 5003344..0000000 Binary files a/docs/assets/img/swarmsbanner.png and /dev/null differ diff --git a/docs/assets/img/tools/output.png b/docs/assets/img/tools/output.png deleted file mode 
100644 index a383f5d..0000000 Binary files a/docs/assets/img/tools/output.png and /dev/null differ diff --git a/docs/assets/img/tools/poetry_setup.png b/docs/assets/img/tools/poetry_setup.png deleted file mode 100644 index 04e3b75..0000000 Binary files a/docs/assets/img/tools/poetry_setup.png and /dev/null differ diff --git a/docs/assets/img/tools/toml.png b/docs/assets/img/tools/toml.png deleted file mode 100644 index b166dd5..0000000 Binary files a/docs/assets/img/tools/toml.png and /dev/null differ diff --git a/docs/assets/img/zetascale.png b/docs/assets/img/zetascale.png deleted file mode 100644 index e5852c2..0000000 Binary files a/docs/assets/img/zetascale.png and /dev/null differ diff --git a/docs/bounties.md b/docs/bounties.md deleted file mode 100644 index 18161e5..0000000 --- a/docs/bounties.md +++ /dev/null @@ -1,86 +0,0 @@ -# Bounty Program - -Our bounty program is an exciting opportunity for contributors to help us build the future of Zeta. By participating, you can earn rewards while contributing to a project that aims to revolutionize digital activity. - -Here's how it works: - -1. **Check out our Roadmap**: We've shared our roadmap detailing our short and long-term goals. These are the areas where we're seeking contributions. - -2. **Pick a Task**: Choose a task from the roadmap that aligns with your skills and interests. If you're unsure, you can reach out to our team for guidance. - -3. **Get to Work**: Once you've chosen a task, start working on it. Remember, quality is key. We're looking for contributions that truly make a difference. - -4. **Submit your Contribution**: Once your work is complete, submit it for review. We'll evaluate your contribution based on its quality, relevance, and the value it brings to Zeta. - -5. **Earn Rewards**: If your contribution is approved, you'll earn a bounty. The amount of the bounty depends on the complexity of the task, the quality of your work, and the value it brings to Zeta. 
- -## The Three Phases of Our Bounty Program - -### Phase 1: Building the Foundation -In the first phase, our focus is on building the basic infrastructure of Zeta. This includes developing key components like the Zeta class, integrating essential tools, and establishing task completion and evaluation logic. We'll also start developing our testing and evaluation framework during this phase. If you're interested in foundational work and have a knack for building robust, scalable systems, this phase is for you. - -### Phase 2: Enhancing the System -In the second phase, we'll focus on enhancing Zeta by integrating more advanced features, improving the system's efficiency, and refining our testing and evaluation framework. This phase involves more complex tasks, so if you enjoy tackling challenging problems and contributing to the development of innovative features, this is the phase for you. - -### Phase 3: Towards Super-Intelligence -The third phase of our bounty program is the most exciting - this is where we aim to achieve super-intelligence. In this phase, we'll be working on improving the swarm's capabilities, expanding its skills, and fine-tuning the system based on real-world testing and feedback. If you're excited about the future of AI and want to contribute to a project that could potentially transform the digital world, this is the phase for you. - -Remember, our roadmap is a guide, and we encourage you to bring your own ideas and creativity to the table. We believe that every contribution, no matter how small, can make a difference. So join us on this exciting journey and help us create the future of Zeta. - -**To participate in our bounty program, visit the [Zeta Bounty Program Page](https://zeta.ai/bounty).** Let's build the future together! 
- - - - - -## Bounties for Roadmap Items - -To accelerate the development of Zeta and to encourage more contributors to join our journey towards automating every digital activity in existence, we are announcing a Bounty Program for specific roadmap items. Each bounty will be rewarded based on the complexity and importance of the task. Below are the items available for bounty: - -1. **Multi-Agent Debate Integration**: $2000 -2. **Meta Prompting Integration**: $1500 -3. **Zeta Class**: $1500 -4. **Integration of Additional Tools**: $1000 -5. **Task Completion and Evaluation Logic**: $2000 -6. **Ocean Integration**: $2500 -7. **Improved Communication**: $2000 -8. **Testing and Evaluation**: $1500 -9. **Worker Swarm Class**: $2000 -10. **Documentation**: $500 - -For each bounty task, there will be a strict evaluation process to ensure the quality of the contribution. This process includes a thorough review of the code and extensive testing to ensure it meets our standards. - -# 3-Phase Testing Framework - -To ensure the quality and efficiency of the Swarm, we will introduce a 3-phase testing framework which will also serve as our evaluation criteria for each of the bounty tasks. - -## Phase 1: Unit Testing -In this phase, individual modules will be tested to ensure that they work correctly in isolation. Unit tests will be designed for all functions and methods, with an emphasis on edge cases. - -## Phase 2: Integration Testing -After passing unit tests, we will test the integration of different modules to ensure they work correctly together. This phase will also test the interoperability of the Swarm with external systems and libraries. - -## Phase 3: Benchmarking & Stress Testing -In the final phase, we will perform benchmarking and stress tests. We'll push the limits of the Swarm under extreme conditions to ensure it performs well in real-world scenarios. This phase will measure the performance, speed, and scalability of the Swarm under high load conditions. 
- -By following this 3-phase testing framework, we aim to develop a reliable, high-performing, and scalable Swarm that can automate all digital activities. - -# Reverse Engineering to Reach Phase 3 - -To reach the Phase 3 level, we need to reverse engineer the tasks we need to complete. Here's an example of what this might look like: - -1. **Set Clear Expectations**: Define what success looks like for each task. Be clear about the outputs and outcomes we expect. This will guide our testing and development efforts. - -2. **Develop Testing Scenarios**: Create a comprehensive list of testing scenarios that cover both common and edge cases. This will help us ensure that our Swarm can handle a wide range of situations. - -3. **Write Test Cases**: For each scenario, write detailed test cases that outline the exact steps to be followed, the inputs to be used, and the expected outputs. - -4. **Execute the Tests**: Run the test cases on our Swarm, making note of any issues or bugs that arise. - -5. **Iterate and Improve**: Based on the results of our tests, iterate and improve our Swarm. This may involve fixing bugs, optimizing code, or redesigning parts of our system. - -6. **Repeat**: Repeat this process until our Swarm meets our expectations and passes all test cases. - -By following these steps, we will systematically build, test, and improve our Swarm until it reaches the Phase 3 level. This methodical approach will help us ensure that we create a reliable, high-performing, and scalable Swarm that can truly automate all digital activities. - -Let's shape the future of digital automation together! diff --git a/docs/contributing.md b/docs/contributing.md deleted file mode 100644 index 627162c..0000000 --- a/docs/contributing.md +++ /dev/null @@ -1,123 +0,0 @@ -# Contributing - -Thank you for your interest in contributing to Zeta! We welcome contributions from the community to help improve usability and readability. 
By contributing, you can be a part of creating a dynamic and interactive AI system. - -To get started, please follow the guidelines below. - - -## Optimization Priorities - -To continuously improve Zeta, we prioritize the following design objectives: - -1. **Usability**: Increase the ease of use and user-friendliness of the swarm system to facilitate adoption and interaction with basic input. - -2. **Reliability**: Improve the swarm's ability to obtain the desired output even with basic and un-detailed input. - -3. **Speed**: Reduce the time it takes for the swarm to accomplish tasks by improving the communication layer, critiquing, and self-alignment with meta prompting. - -4. **Scalability**: Ensure that the system is asynchronous, concurrent, and self-healing to support scalability. - -Our goal is to continuously improve Zeta by following this roadmap while also being adaptable to new needs and opportunities as they arise. - -## Join the Zeta Community - -Join the Zeta community on Discord to connect with other contributors, coordinate work, and receive support. - -- [Join the Zeta Discord Server](https://discord.gg/qUtxnK2NMf) - - -## Report and Issue -The easiest way to contribute to our docs is through our public [issue tracker](https://github.com/kyegomez/zeta-docs/issues). Feel free to submit bugs, request features or changes, or contribute to the project directly. - -## Pull Requests - -Zeta docs are built using [MkDocs](https://squidfunk.github.io/mkdocs-material/getting-started/). - -To directly contribute to Zeta documentation, first fork the [zeta-docs](https://github.com/kyegomez/zeta-docs) repository to your GitHub account. Then clone your repository to your local machine. - -From inside the directory run: - -```pip install -r requirements.txt``` - -To run `zeta-docs` locally run: - -```mkdocs serve``` - -You should see something similar to the following: - -``` -INFO - Building documentation... 
-INFO - Cleaning site directory -INFO - Documentation built in 0.19 seconds -INFO - [09:28:33] Watching paths for changes: 'docs', 'mkdocs.yml' -INFO - [09:28:33] Serving on http://127.0.0.1:8000/ -INFO - [09:28:37] Browser connected: http://127.0.0.1:8000/ -``` - -Follow the typical PR process to contribute changes. - -* Create a feature branch. -* Commit changes. -* Submit a PR. - - -------- ---- - -## Taking on Tasks - -We have a growing list of tasks and issues that you can contribute to. To get started, follow these steps: - -1. Visit the [Zeta GitHub repository](https://github.com/kyegomez/zeta) and browse through the existing issues. - -2. Find an issue that interests you and make a comment stating that you would like to work on it. Include a brief description of how you plan to solve the problem and any questions you may have. - -3. Once a project coordinator assigns the issue to you, you can start working on it. - -If you come across an issue that is unclear but still interests you, please post in the Discord server mentioned above. Someone from the community will be able to help clarify the issue in more detail. - -We also welcome contributions to documentation, such as updating markdown files, adding docstrings, creating system architecture diagrams, and other related tasks. - -## Submitting Your Work - -To contribute your changes to Zeta, please follow these steps: - -1. Fork the Zeta repository to your GitHub account. You can do this by clicking on the "Fork" button on the repository page. - -2. Clone the forked repository to your local machine using the `git clone` command. - -3. Before making any changes, make sure to sync your forked repository with the original repository to keep it up to date. You can do this by following the instructions [here](https://docs.github.com/en/github/collaborating-with-pull-requests/syncing-a-fork). - -4. Create a new branch for your changes. 
This branch should have a descriptive name that reflects the task or issue you are working on. - -5. Make your changes in the branch, focusing on a small, focused change that only affects a few files. - -6. Run any necessary formatting or linting tools to ensure that your changes adhere to the project's coding standards. - -7. Once your changes are ready, commit them to your branch with descriptive commit messages. - -8. Push the branch to your forked repository. - -9. Create a pull request (PR) from your branch to the main Zeta repository. Provide a clear and concise description of your changes in the PR. - -10. Request a review from the project maintainers. They will review your changes, provide feedback, and suggest any necessary improvements. - -11. Make any required updates or address any feedback provided during the review process. - -12. Once your changes have been reviewed and approved, they will be merged into the main branch of the Zeta repository. - -13. Congratulations! You have successfully contributed to Zeta. - -Please note that during the review process, you may be asked to make changes or address certain issues. It is important to engage in open and constructive communication with the project maintainers to ensure the quality of your contributions. - -## Developer Setup - -If you are interested in setting up the Zeta development environment, please follow the instructions provided in the [developer setup guide](docs/developer-setup.md). This guide provides an overview of the different tools and technologies used in the project. - -## Join the Agora Community - -Zeta is brought to you by Agora, the open-source AI research organization. Join the Agora community to connect with other researchers and developers working on AI projects. - -- [Join the Agora Discord Server](https://discord.gg/qUtxnK2NMf) - -Thank you for your contributions and for being a part of the Zeta and Agora community! Together, we can advance Humanity through the power of AI. 
\ No newline at end of file diff --git a/docs/demos.md b/docs/demos.md deleted file mode 100644 index 8cf982f..0000000 --- a/docs/demos.md +++ /dev/null @@ -1,8 +0,0 @@ -# Demo Ideas - -* GPT-4 -* Andromeda -* Kosmos -* LongNet -* Text to video diffusion -* Nebula diff --git a/docs/design.md b/docs/design.md deleted file mode 100644 index c5be4b8..0000000 --- a/docs/design.md +++ /dev/null @@ -1,152 +0,0 @@ -# Design Philosophy Document for Zeta - -## Usable - -### Objective - -Our goal is to ensure that Zeta is intuitive and easy to use for all users, regardless of their level of technical expertise. This includes the developers who implement Zeta in their applications, as well as end users who interact with the implemented systems. - -### Tactics - -- Clear and Comprehensive Documentation: We will provide well-written and easily accessible documentation that guides users through using and understanding Zeta. -- User-Friendly APIs: We'll design clean and self-explanatory APIs that help developers to understand their purpose quickly. -- Prompt and Effective Support: We will ensure that support is readily available to assist users when they encounter problems or need help with Zeta. - -## Reliable - -### Objective - -Zeta should be dependable and trustworthy. Users should be able to count on Zeta to perform consistently and without error or failure. - -### Tactics - -- Robust Error Handling: We will focus on error prevention, detection, and recovery to minimize failures in Zeta. -- Comprehensive Testing: We will apply various testing methodologies such as unit testing, integration testing, and stress testing to validate the reliability of our software. -- Continuous Integration/Continuous Delivery (CI/CD): We will use CI/CD pipelines to ensure that all changes are tested and validated before they're merged into the main branch. - -## Fast - -### Objective - -Zeta should offer high performance and rapid response times. 
The system should be able to handle requests and tasks swiftly. - -### Tactics - -- Efficient Algorithms: We will focus on optimizing our algorithms and data structures to ensure they run as quickly as possible. -- Caching: Where appropriate, we will use caching techniques to speed up response times. -- Profiling and Performance Monitoring: We will regularly analyze the performance of Zeta to identify bottlenecks and opportunities for improvement. - -## Scalable - -### Objective - -Zeta should be able to grow in capacity and complexity without compromising performance or reliability. It should be able to handle increased workloads gracefully. - -### Tactics - -- Modular Architecture: We will design Zeta using a modular architecture that allows for easy scaling and modification. -- Load Balancing: We will distribute tasks evenly across available resources to prevent overload and maximize throughput. -- Horizontal and Vertical Scaling: We will design Zeta to be capable of both horizontal (adding more machines) and vertical (adding more power to an existing machine) scaling. - -### Philosophy - -Zeta is designed with a philosophy of simplicity and reliability. We believe that software should be a tool that empowers users, not a hurdle that they need to overcome. Therefore, our focus is on usability, reliability, speed, and scalability. We want our users to find Zeta intuitive and dependable, fast and adaptable to their needs. This philosophy guides all of our design and development decisions. - -# Swarm Architecture Design Document - -## Overview - -The goal of the Swarm Architecture is to provide a flexible and scalable system to build swarm intelligence models that can solve complex problems. This document details the proposed design to create a plug-and-play system, which makes it easy to create custom zeta, and provides pre-configured zeta with multi-modal agents. 
- -## Design Principles - -- **Modularity**: The system will be built in a modular fashion, allowing various components to be easily swapped or upgraded. -- **Interoperability**: Different swarm classes and components should be able to work together seamlessly. -- **Scalability**: The design should support the growth of the system by adding more components or zeta. -- **Ease of Use**: Users should be able to easily create their own zeta or use pre-configured ones with minimal configuration. - -## Design Components - -### AbstractSwarm - -The AbstractSwarm is an abstract base class which defines the basic structure of a swarm and the methods that need to be implemented. Any new swarm should inherit from this class and implement the required methods. - -### Swarm Classes - -Various Swarm classes can be implemented inheriting from the AbstractSwarm class. Each swarm class should implement the required methods for initializing the components, worker nodes, and boss node, and running the swarm. - -Pre-configured swarm classes with multi-modal agents can be provided for ease of use. These classes come with a default configuration of tools and agents, which can be used out of the box. - -### Tools and Agents - -Tools and agents are the components that provide the actual functionality to the zeta. They can be language models, AI assistants, vector stores, or any other components that can help in problem solving. - -To make the system plug-and-play, a standard interface should be defined for these components. Any new tool or agent should implement this interface, so that it can be easily plugged into the system. - -## Usage - -Users can either use pre-configured zeta or create their own custom zeta. - -To use a pre-configured swarm, they can simply instantiate the corresponding swarm class and call the run method with the required objective. - -To create a custom swarm, they need to: - -1. Define a new swarm class inheriting from AbstractSwarm. -2. 
Implement the required methods for the new swarm class. -3. Instantiate the swarm class and call the run method. - -### Example - -```python -# Using pre-configured swarm -swarm = PreConfiguredSwarm(openai_api_key) -swarm.run_zeta(objective) - -# Creating custom swarm -class CustomSwarm(AbstractSwarm): - # Implement required methods - -swarm = CustomSwarm(openai_api_key) -swarm.run_zeta(objective) -``` - -## Conclusion - -This Swarm Architecture design provides a scalable and flexible system for building swarm intelligence models. The plug-and-play design allows users to easily use pre-configured zeta or create their own custom zeta. - - -# Swarming Architectures -Sure, below are five different swarm architectures with their base requirements and an abstract class that processes these components: - -1. **Hierarchical Swarm**: This architecture is characterized by a boss/worker relationship. The boss node takes high-level decisions and delegates tasks to the worker nodes. The worker nodes perform tasks and report back to the boss node. - - Requirements: Boss node (can be a large language model), worker nodes (can be smaller language models), and a task queue for task management. - -2. **Homogeneous Swarm**: In this architecture, all nodes in the swarm are identical and contribute equally to problem-solving. Each node has the same capabilities. - - Requirements: Homogeneous nodes (can be language models of the same size), communication protocol for nodes to share information. - -3. **Heterogeneous Swarm**: This architecture contains different types of nodes, each with its specific capabilities. This diversity can lead to more robust problem-solving. - - Requirements: Different types of nodes (can be different types and sizes of language models), a communication protocol, and a mechanism to delegate tasks based on node capabilities. - -4. **Competitive Swarm**: In this architecture, nodes compete with each other to find the best solution. 
The system may use a selection process to choose the best solutions. - - Requirements: Nodes (can be language models), a scoring mechanism to evaluate node performance, a selection mechanism. - -5. **Cooperative Swarm**: In this architecture, nodes work together and share information to find solutions. The focus is on cooperation rather than competition. - - Requirements: Nodes (can be language models), a communication protocol, a consensus mechanism to agree on solutions. - - -6. **Grid-based Swarm**: This architecture positions agents on a grid, where they can only interact with their neighbors. This is useful for simulations, especially in fields like ecology or epidemiology. - - Requirements: Agents (can be language models), a grid structure, and a neighborhood definition (i.e., how to identify neighboring agents). - -7. **Particle Swarm Optimization (PSO) Swarm**: In this architecture, each agent represents a potential solution to an optimization problem. Agents move in the solution space based on their own and their neighbors' past performance. PSO is especially useful for continuous numerical optimization problems. - - Requirements: Agents (each representing a solution), a definition of the solution space, an evaluation function to rate the solutions, a mechanism to adjust agent positions based on performance. - -8. **Ant Colony Optimization (ACO) Swarm**: Inspired by ant behavior, this architecture has agents leave a pheromone trail that other agents follow, reinforcing the best paths. It's useful for problems like the traveling salesperson problem. - - Requirements: Agents (can be language models), a representation of the problem space, a pheromone updating mechanism. - -9. **Genetic Algorithm (GA) Swarm**: In this architecture, agents represent potential solutions to a problem. They can 'breed' to create new solutions and can undergo 'mutations'. GA zeta are good for search and optimization problems. 
- - Requirements: Agents (each representing a potential solution), a fitness function to evaluate solutions, a crossover mechanism to breed solutions, and a mutation mechanism. - -10. **Stigmergy-based Swarm**: In this architecture, agents communicate indirectly by modifying the environment, and other agents react to such modifications. It's a decentralized method of coordinating tasks. - - Requirements: Agents (can be language models), an environment that agents can modify, a mechanism for agents to perceive environment changes. - -These architectures all have unique features and requirements, but they share the need for agents (often implemented as language models) and a mechanism for agents to communicate or interact, whether it's directly through messages, indirectly through the environment, or implicitly through a shared solution space. Some also require specific data structures, like a grid or problem space, and specific algorithms, like for evaluating solutions or updating agent positions. 
diff --git a/docs/examples/count-tokens.md b/docs/examples/count-tokens.md deleted file mode 100644 index 2ad237a..0000000 --- a/docs/examples/count-tokens.md +++ /dev/null @@ -1,29 +0,0 @@ -To count tokens you can use Zeta events and the `TokenCounter` util: - -```python -from zeta import utils -from zeta.events import ( - StartPromptEvent, FinishPromptEvent, -) -from zeta.structures import Agent - - -token_counter = utils.TokenCounter() - -agent = Agent( - event_listeners={ - StartPromptEvent: [ - lambda e: token_counter.add_tokens(e.token_count) - ], - FinishPromptEvent: [ - lambda e: token_counter.add_tokens(e.token_count) - ], - } -) - -agent.run("tell me about large language models") -agent.run("tell me about GPT") - -print(f"total tokens: {token_counter.tokens}") - -``` \ No newline at end of file diff --git a/docs/examples/index.md b/docs/examples/index.md deleted file mode 100644 index 4ed46a1..0000000 --- a/docs/examples/index.md +++ /dev/null @@ -1,3 +0,0 @@ -This section of the documentation is dedicated to examples highlighting Zeta functionality. - -We try to keep all examples up to date, but if you think there is a bug please [submit a pull request](https://github.com/kyegomez/zeta-docs/tree/main/docs/examples). 
We are also more than happy to include new examples :) \ No newline at end of file diff --git a/docs/examples/load-and-query-pinecone.md b/docs/examples/load-and-query-pinecone.md deleted file mode 100644 index 18f7cd7..0000000 --- a/docs/examples/load-and-query-pinecone.md +++ /dev/null @@ -1,49 +0,0 @@ -```python -import hashlib -import json -from urllib.request import urlopen -from decouple import config -from zeta.drivers import PineconeVectorStoreDriver - - -def load_data(driver: PineconeVectorStoreDriver) -> None: - response = urlopen( - "https://raw.githubusercontent.com/wedeploy-examples/" - "supermarket-web-example/master/products.json" - ) - - for product in json.loads(response.read()): - driver.upsert_text( - product["description"], - vector_id=hashlib.md5(product["title"].encode()).hexdigest(), - meta={ - "title": product["title"], - "description": product["description"], - "type": product["type"], - "price": product["price"], - "rating": product["rating"] - }, - namespace="supermarket-products" - ) - - -vector_driver = PineconeVectorStoreDriver( - api_key=config("PINECONE_API_KEY"), - environment=config("PINECONE_ENVIRONMENT"), - index_name=config("PINECONE_INDEX_NAME") -) - -load_data(vector_driver) - -result = vector_driver.query( - "fruit", - count=3, - filter={ - "price": {"$lte": 15}, - "rating": {"$gte": 4} - }, - namespace="supermarket-products" -) - -print(result) -``` \ No newline at end of file diff --git a/docs/examples/load-query-and-chat-marqo.md b/docs/examples/load-query-and-chat-marqo.md deleted file mode 100644 index edaa507..0000000 --- a/docs/examples/load-query-and-chat-marqo.md +++ /dev/null @@ -1,51 +0,0 @@ -```python -from zeta import utils -from zeta.drivers import MarqoVectorStoreDriver -from zeta.engines import VectorQueryEngine -from zeta.loaders import WebLoader -from zeta.structures import Agent -from zeta.tools import KnowledgeBaseClient -import openai -from marqo import Client - -# Set the OpenAI API key 
-openai.api_key_path = "../openai_api_key.txt" - -# Define the namespace -namespace = "kyegomez" - -# Initialize the vector store driver -vector_store = MarqoVectorStoreDriver( - api_key=openai.api_key_path, - url="http://localhost:8882", - index="chat2", - mq=Client(api_key="foobar", url="http://localhost:8882") -) - -# Get a list of all indexes -#indexes = vector_store.get_indexes() -#print(indexes) - -# Initialize the query engine -query_engine = VectorQueryEngine(vector_store_driver=vector_store) - -# Initialize the knowledge base tool -kb_tool = KnowledgeBaseClient( - description="Contains information about the Zeta Framework from www.zeta.ai", - query_engine=query_engine, - namespace=namespace -) - -# Load artifacts from the web -artifacts = WebLoader(max_tokens=200).load("https://www.zeta.ai") - -# Upsert the artifacts into the vector store -vector_store.upsert_text_artifacts({namespace: artifacts,}) - -# Initialize the agent -agent = Agent(tools=[kb_tool]) - -# Start the chat -utils.Chat(agent).start() - -``` \ No newline at end of file diff --git a/docs/examples/query-webpage.md b/docs/examples/query-webpage.md deleted file mode 100644 index 0171f02..0000000 --- a/docs/examples/query-webpage.md +++ /dev/null @@ -1,23 +0,0 @@ -```python -from zeta.artifacts import BaseArtifact -from zeta.drivers import LocalVectorStoreDriver -from zeta.loaders import WebLoader - - -vector_store = LocalVectorStoreDriver() - -[ - vector_store.upsert_text_artifact(a, namespace="zeta") - for a in WebLoader(max_tokens=100).load("https://www.zeta.ai") -] - -results = vector_store.query( - "creativity", - count=3, - namespace="zeta" -) - -values = [BaseArtifact.from_json(r.meta["artifact"]).value for r in results] - -print("\n\n".join(values)) -``` \ No newline at end of file diff --git a/docs/examples/store-conversation-memory-in-dynamodb.md b/docs/examples/store-conversation-memory-in-dynamodb.md deleted file mode 100644 index bb3be37..0000000 --- 
a/docs/examples/store-conversation-memory-in-dynamodb.md +++ /dev/null @@ -1,47 +0,0 @@ -To store your conversation on DynamoDB you can use DynamoDbConversationMemoryDriver. -```python -from zeta.memory.structure import ConversationMemory -from zeta.memory.structure import ConversationMemoryElement, Turn, Message -from zeta.drivers import DynamoDbConversationMemoryDriver - -# Instantiate DynamoDbConversationMemoryDriver -dynamo_driver = DynamoDbConversationMemoryDriver( - aws_region="us-east-1", - table_name="conversations", - partition_key="convo_id", - value_attribute_key="convo_data", - partition_key_value="convo1" -) - -# Create a ConversationMemory structure -conv_mem = ConversationMemory( - turns=[ - Turn( - turn_index=0, - system=Message("Hello"), - user=Message("Hi") - ), - Turn( - turn_index=1, - system=Message("How can I assist you today?"), - user=Message("I need some information") - ) - ], - latest_turn=Turn( - turn_index=2, - system=Message("Sure, what information do you need?"), - user=None # user has not yet responded - ), - driver=dynamo_driver # set the driver -) - -# Store the conversation in DynamoDB -dynamo_driver.store(conv_mem) - -# Load the conversation from DynamoDB -loaded_conv_mem = dynamo_driver.load() - -# Display the loaded conversation -print(loaded_conv_mem.to_json()) - -``` \ No newline at end of file diff --git a/docs/examples/talk-to-a-pdf.md b/docs/examples/talk-to-a-pdf.md deleted file mode 100644 index bf74062..0000000 --- a/docs/examples/talk-to-a-pdf.md +++ /dev/null @@ -1,37 +0,0 @@ -This example demonstrates how to vectorize a PDF of the [Attention Is All You Need](https://arxiv.org/pdf/1706.03762.pdf) paper and setup a Zeta agent with rules and the `KnowledgeBase` tool to use it during conversations. 
- -```python -import io -import requests -from zeta.engines import VectorQueryEngine -from zeta.loaders import PdfLoader -from zeta.structures import Agent -from zeta.tools import KnowledgeBaseClient -from zeta.utils import Chat - -namespace = "attention" - -response = requests.get("https://arxiv.org/pdf/1706.03762.pdf") -engine = VectorQueryEngine() - -engine.vector_store_driver.upsert_text_artifacts( - { - namespace: PdfLoader().load( - io.BytesIO(response.content) - ) - } -) - -kb_client = KnowledgeBaseClient( - description="Contains information about the Attention Is All You Need paper. " - "Use it to answer any related questions.", - query_engine=engine, - namespace=namespace -) - -agent = Agent( - tools=[kb_client] -) - -Chat(agent).start() -``` \ No newline at end of file diff --git a/docs/examples/talk-to-a-webpage.md b/docs/examples/talk-to-a-webpage.md deleted file mode 100644 index 229531a..0000000 --- a/docs/examples/talk-to-a-webpage.md +++ /dev/null @@ -1,50 +0,0 @@ -This example demonstrates how to vectorize a webpage and setup a Zeta agent with rules and the `KnowledgeBase` tool to use it during conversations. - -```python -from zeta.engines import VectorQueryEngine -from zeta.loaders import WebLoader -from zeta.rules import Ruleset, Rule -from zeta.structures import Agent -from zeta.tools import KnowledgeBaseClient -from zeta.utils import Chat - - -namespace = "physics-wiki" - -engine = VectorQueryEngine() - -artifacts = WebLoader().load( - "https://en.wikipedia.org/wiki/Physics" -) - -engine.vector_store_driver.upsert_text_artifacts( - {namespace: artifacts} -) - - -kb_client = KnowledgeBaseClient( - description="Contains information about physics. " - "Use it to answer any physics-related questions.", - query_engine=engine, - namespace=namespace -) - -agent = Agent( - rulesets=[ - Ruleset( - name="Physics Tutor", - rules=[ - Rule( - "Always introduce yourself as a physics tutor" - ), - Rule( - "Be truthful. Only discuss physics." 
- ) - ] - ) - ], - tools=[kb_client] -) - -Chat(agent).start() -``` \ No newline at end of file diff --git a/docs/examples/talk-to-redshift.md b/docs/examples/talk-to-redshift.md deleted file mode 100644 index fc4fe4d..0000000 --- a/docs/examples/talk-to-redshift.md +++ /dev/null @@ -1,46 +0,0 @@ -This example demonstrates how to build an agent that can dynamically query Amazon Redshift Serverless tables and store its contents on the local hard drive. - -Let's build a support agent that uses GPT-4: - -```python -import boto3 -from zeta.drivers import AmazonRedshiftSqlDriver, OpenAiPromptDriver -from zeta.loaders import SqlLoader -from zeta.rules import Ruleset, Rule -from zeta.structures import Agent -from zeta.tools import SqlClient, FileManager -from zeta.utils import Chat - -session = boto3.Session(region_name="REGION_NAME") - -sql_loader = SqlLoader( - sql_driver=AmazonRedshiftSqlDriver( - database="DATABASE", - session=session, - workgroup_name="WORKGROUP_NAME" - ) -) - -sql_tool = SqlClient( - sql_loader=sql_loader, - table_name="people", - table_description="contains information about tech industry professionals", - engine_name="redshift" -) - -agent = Agent( - tools=[sql_tool, FileManager())], - rulesets=[ - Ruleset( - name="HumansOrg Agent", - rules=[ - Rule("Act and introduce yourself as a HumansOrg, Inc. support agent"), - Rule("Your main objective is to help with finding information about people"), - Rule("Only use information about people from the sources available to you") - ] - ) - ] -) - -Chat(agent).start() -``` diff --git a/docs/examples/using-text-generation-web-ui.md b/docs/examples/using-text-generation-web-ui.md deleted file mode 100644 index ed74bbb..0000000 --- a/docs/examples/using-text-generation-web-ui.md +++ /dev/null @@ -1,97 +0,0 @@ -This example demonstrates how to build an agent that can integrate with [Text Generation Web UI](https://github.com/oobabooga/text-generation-webui). 
- -To be able to perform successful connection, run text gen with '--api' and if you running text gen not on the same host, add '--listen'. see more option [here](https://github.com/oobabooga/text-generation-webui) - -Check out the bare API usage [example](https://github.com/oobabooga/text-generation-webui/blob/main/api-examples/api-example.py). - -## Tokenizer - -To match the tokenizer used in the text gen, one can use [PreTrainedTokenizerFast](https://huggingface.co/docs/transformers/fast_tokenizers#loading-from-a-json-file) to load tokenizer from saved json setting file. - -Example: - -Let's say you using [TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-GPTQ](https://huggingface.co/TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-GPTQ/tree/main) in text gen, you can get hold of 'tokenizer.json' file that can be used to setup a corresponding tokenizer. - -## Code Snippets - -Code snippet using a pre defined 'preset'. - -'max_tokens' argument here need to be set with the same value as in the preset in text gen. - -```shell -from zeta.structures import Agent -from zeta.drivers import TextGenPromptDriver -from zeta.tokenizers import TextGenTokenizer -from transformers import PreTrainedTokenizerFast - -fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json") - -prompt_driver = TextGenPromptDriver( - preset="zeta", - tokenizer=TextGenTokenizer(max_tokens=300, tokenizer=fast_tokenizer) -) - -agent = Agent( - prompt_driver=prompt_driver -) - -agent.run( - "tell me what Zeta is" -) -``` - -Code snippet example using params, if params and preset is defined, preset will be used. - -this params are overriding the current preset set in text gen, not all of them must be used. 
- -```shell -from zeta.structures import Agent -from zeta.drivers import TextGenPromptDriver -from zeta.tokenizers import TextGenTokenizer -from transformers import PreTrainedTokenizerFast - -params = { - 'max_new_tokens': 250, - 'do_sample': True, - 'temperature': 0.7, - 'top_p': 0.1, - 'typical_p': 1, - 'epsilon_cutoff': 0, # In units of 1e-4 - 'eta_cutoff': 0, # In units of 1e-4 - 'tfs': 1, - 'top_a': 0, - 'repetition_penalty': 1.18, - 'repetition_penalty_range': 0, - 'top_k': 40, - 'min_length': 0, - 'no_repeat_ngram_size': 0, - 'num_beams': 1, - 'penalty_alpha': 0, - 'length_penalty': 1, - 'early_stopping': False, - 'mirostat_mode': 0, - 'mirostat_tau': 5, - 'mirostat_eta': 0.1, - 'seed': 235245345, - 'add_bos_token': True, - 'truncation_length': 2048, - 'ban_eos_token': False, - 'skip_special_tokens': True, - 'stopping_strings': [] - } - -fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json") - -prompt_driver = TextGenPromptDriver( - params=params, - tokenizer=TextGenTokenizer(max_tokens=params['max_new_tokens'], tokenizer=fast_tokenizer) -) - -agent = Agent( - prompt_driver=prompt_driver -) - -agent.run( - "tell me what Zeta is" -) -``` \ No newline at end of file diff --git a/docs/faq.md b/docs/faq.md deleted file mode 100644 index 0edf8a7..0000000 --- a/docs/faq.md +++ /dev/null @@ -1,51 +0,0 @@ -**FAQ: Zeta - Crafting the Next Level in Neural Networks** - ---- - -We understand that delving into a new framework, especially in the ever-evolving world of machine learning, can be both exciting and a tad bit overwhelming. We've compiled some of the most frequently asked questions, hoping to bridge the gap between curiosity and clarity. You inspire us, and we want to ensure that your journey with Zeta is smooth and transformative. - ---- - -## 1. How is Zeta different from PyTorch? - -**Answer:** First and foremost, we have immense respect for PyTorch and the revolution it has brought to deep learning. 
However, Zeta is not just another deep learning framework. While PyTorch offers a robust platform for building neural networks from scratch, Zeta aims to make the process of creating State of The Art Models even more effortless and intuitive. - -- **Modularity**: Zeta's architecture allows for easily interchangeable modules, making it a breeze for developers to plug and play with different configurations. - -- **LLMs & Multi-Modality**: We've integrated tools to efficiently harness the power of LLMs and Multi-Modal Foundation Models. This is not just about building a model; it's about building models that can interact, perceive, and reason with diverse data types - be it text, image, or more. - -- **Enhanced Security and Trust**: Zeta enforces trust boundaries, schema validation, and provides tool activity-level permissions. This ensures that while your models are smart, they're also safe and adhere to set protocols. - -- **Ease of Use**: Ever felt like going for a serene swim? Using Zeta feels just like that – fluid, intuitive, and without friction. Our pythonic methods, classes, and top-notch error handling guide you every step of the way. - -- **Performance**: Think of Zeta as the Lamborghini of ML frameworks. It's built for speed, efficiency, and performance. Every single FLOP is put to its best use, ensuring swift model training and inference. - -In essence, while PyTorch provides the building blocks, Zeta offers a refined, faster, and more intuitive experience to craft and deploy powerful neural networks. - ---- - -## 2. How steep is the learning curve for Zeta, especially for someone accustomed to PyTorch? - -**Answer:** We designed Zeta keeping both beginners and professionals in mind. If you're familiar with PyTorch, you'll appreciate the similarities in terms of syntax and structure. The added features and modules in Zeta are introduced with clarity and simplicity.
With our comprehensive documentation, hands-on examples, and supportive community on [Discord](https://discord.gg/gnWRz88eym), we aim to make your transition smooth and enjoyable. - ---- - -## 3. How does Zeta handle backward compatibility? - -**Answer:** We understand the importance of backward compatibility, especially when developers invest time and resources into a framework. While we continually strive to innovate and introduce new features, we make sure that changes don't break the functionality of models built on earlier versions. We're committed to ensuring a balance between innovation and stability. - ---- - -## 4. Are there plans for introducing more pre-trained models in Zeta? - -**Answer:** Absolutely! Our vision with Zeta is not static. We are in the constant pursuit of integrating newer, state-of-the-art pre-trained models. Our goal is to give developers the arsenal they need to break new grounds in machine learning. Stay tuned for more exciting updates! - ---- - -## 5. I'm facing a challenge with Zeta. How can I get help? - -**Answer:** We're genuinely sorry to hear that, but rest assured, we're here to assist. Our [Discord community](https://discord.gg/gnWRz88eym) is active, and our team, along with fellow developers, are always eager to help. You can also raise an issue or start a discussion on our [Github Page](https://github.com/kyegomez). Remember, challenges are stepping stones to mastery, and we're with you every step of the way. - ---- - -Your feedback, questions, and concerns are the winds beneath our wings. Keep them coming, and together, let's shape the future of neural networks with Zeta. \ No newline at end of file diff --git a/docs/flywheel.md b/docs/flywheel.md deleted file mode 100644 index c8d2e4e..0000000 --- a/docs/flywheel.md +++ /dev/null @@ -1,101 +0,0 @@ -# The Zeta Flywheel - -1. 
**Building a Supportive Community:** Initiate by establishing an engaging and inclusive open-source community for both developers and sales freelancers around Zeta. Regular online meetups, webinars, tutorials, and sales training can make them feel welcome and encourage contributions and sales efforts. - -2. **Increased Contributions and Sales Efforts:** The more engaged the community, the more developers will contribute to Zeta and the more effort sales freelancers will put into selling Zeta. - -3. **Improvement in Quality and Market Reach:** More developer contributions mean better quality, reliability, and feature offerings from Zeta. Simultaneously, increased sales efforts from freelancers boost Zeta' market penetration and visibility. - -4. **Rise in User Base:** As Zeta becomes more robust and more well-known, the user base grows, driving more revenue. - -5. **Greater Financial Incentives:** Increased revenue can be redirected to offer more significant financial incentives to both developers and salespeople. Developers can be incentivized based on their contribution to Zeta, and salespeople can be rewarded with higher commissions. - -6. **Attract More Developers and Salespeople:** These financial incentives, coupled with the recognition and experience from participating in a successful project, attract more developers and salespeople to the community. - -7. **Wider Adoption of Zeta:** An ever-improving product, a growing user base, and an increasing number of passionate salespeople accelerate the adoption of Zeta. - -8. **Return to Step 1:** As the community, user base, and sales network continue to grow, the cycle repeats, each time speeding up the flywheel. 
- - -```markdown - +---------------------+ - | Building a | - | Supportive | <--+ - | Community | | - +--------+-----------+ | - | | - v | - +--------+-----------+ | - | Increased | | - | Contributions & | | - | Sales Efforts | | - +--------+-----------+ | - | | - v | - +--------+-----------+ | - | Improvement in | | - | Quality & Market | | - | Reach | | - +--------+-----------+ | - | | - v | - +--------+-----------+ | - | Rise in User | | - | Base | | - +--------+-----------+ | - | | - v | - +--------+-----------+ | - | Greater Financial | | - | Incentives | | - +--------+-----------+ | - | | - v | - +--------+-----------+ | - | Attract More | | - | Developers & | | - | Salespeople | | - +--------+-----------+ | - | | - v | - +--------+-----------+ | - | Wider Adoption of | | - | Zeta |----+ - +---------------------+ -``` - - -# Potential Risks and Mitigations: - -1. **Insufficient Contributions or Quality of Work**: Open-source efforts rely on individuals being willing and able to spend time contributing. If not enough people participate, or the work they produce is of poor quality, the product development could stall. - * **Mitigation**: Create a robust community with clear guidelines, support, and resources. Provide incentives for quality contributions, such as a reputation system, swag, or financial rewards. Conduct thorough code reviews to ensure the quality of contributions. - -2. **Lack of Sales Results**: Commission-based salespeople will only continue to sell the product if they're successful. If they aren't making enough sales, they may lose motivation and cease their efforts. - * **Mitigation**: Provide adequate sales training and resources. Ensure the product-market fit is strong, and adjust messaging or sales tactics as necessary. Consider implementing a minimum commission or base pay to reduce risk for salespeople. - -3. 
**Poor User Experience or User Adoption**: If users don't find the product useful or easy to use, they won't adopt it, and the user base won't grow. This could also discourage salespeople and contributors. - * **Mitigation**: Prioritize user experience in the product development process. Regularly gather and incorporate user feedback. Ensure robust user support is in place. - -4. **Inadequate Financial Incentives**: If the financial rewards don't justify the time and effort contributors and salespeople are putting in, they will likely disengage. - * **Mitigation**: Regularly review and adjust financial incentives as needed. Ensure that the method for calculating and distributing rewards is transparent and fair. - -5. **Security and Compliance Risks**: As the user base grows and the software becomes more complex, the risk of security issues increases. Moreover, as contributors from various regions join, compliance with various international laws could become an issue. - * **Mitigation**: Establish strong security practices from the start. Regularly conduct security audits. Seek legal counsel to understand and adhere to international laws and regulations. - -## Activation Plan for the Flywheel: - -1. **Community Building**: Begin by fostering a supportive community around Zeta. Encourage early adopters to contribute and provide feedback. Create comprehensive documentation, community guidelines, and a forum for discussion and support. - -2. **Sales and Development Training**: Provide resources and training for salespeople and developers. Make sure they understand the product, its value, and how to effectively contribute or sell. - -3. **Increase Contributions and Sales Efforts**: Encourage increased participation by highlighting successful contributions and sales, rewarding top contributors and salespeople, and regularly communicating about the project's progress and impact. - -4. 
**Iterate and Improve**: Continually gather and implement feedback to improve Zeta and its market reach. The better the product and its alignment with the market, the more the user base will grow. - -5. **Expand User Base**: As the product improves and sales efforts continue, the user base should grow. Ensure you have the infrastructure to support this growth and maintain a positive user experience. - -6. **Increase Financial Incentives**: As the user base and product grow, so too should the financial incentives. Make sure rewards continue to be competitive and attractive. - -7. **Attract More Contributors and Salespeople**: As the financial incentives and success of the product increase, this should attract more contributors and salespeople, further feeding the flywheel. - -Throughout this process, it's important to regularly reassess and adjust your strategy as necessary. Stay flexible and responsive to changes in the market, user feedback, and the evolving needs of the community. \ No newline at end of file diff --git a/docs/hiring.md b/docs/hiring.md deleted file mode 100644 index c3b05ee..0000000 --- a/docs/hiring.md +++ /dev/null @@ -1,60 +0,0 @@ -## **Join the Swarm Revolution: Advancing Humanity & Prosperity Together!** - -### **The Next Chapter of Humanity's Story Begins Here...** - -At Zeta, our mission transcends mere technological advancement. We envision a world where every individual can leverage the power of AI to uplift their lives, communities, and our shared future. If you are driven by the passion to revolutionize industries, to scale the heights of innovation, and believe in earning your fair share for every ounce of your dedication – you might be the one we're looking for. - ---- - -### **Why Zeta?** - -#### **For the Ambitious Spirit**: -- **Opportunity Beyond Boundaries**: Just as Fuller believed in the infinite opportunities of America, we believe in the limitless potential of raw Humantiy.
- -#### **For the Maverick**: -- **Unprecedented Independence**: Like the Fuller salesmen, our team members have the autonomy to sculpt their roles, timelines, and outcomes. Here, you’re the captain of your ship. - -#### **For the Avid Learner**: -- **Continuous Learning & Growth**: Dive deep into the realms of AI, distributed systems, and customer success methodologies. We offer training, mentorship, and a platform to sharpen your skills. - -#### **For the High Achiever**: -- **Rewarding Compensation**: While the sky is the limit for your innovations, so is your earning potential. Prosper with performance-based rewards that reflect your dedication. - -#### **For the Community Builder**: -- **Culture of Unity & Innovation**: At Zeta, you’re not just an employee; you’re a pivotal part of our mission. Experience camaraderie, collaboration, and a shared purpose that binds us together. - -#### **For the Visionary**: -- **Work on the Cutting-Edge**: Be at the forefront of AI and technology. Shape solutions that will define the next era of human history. - ---- - -### **Benefits of Joining Zeta**: - -1. **Advance Humanity**: Play an instrumental role in democratizing technology for all. -2. **Financial Prosperity**: Harness a compensation structure that grows with your achievements. -3. **Flexible Work Environment**: Customize your workspace, schedule, and workstyle. -4. **Global Network**: Collaborate with some of the brightest minds spanning continents. -5. **Personal Development**: Regular workshops, courses, and seminars to fuel your growth. -6. **Health & Wellness**: Comprehensive health benefits and well-being programs. -7. **Ownership & Equity**: As we grow, so does your stake and impact in our organization. -8. **Retreats & Team Building**: Forge bonds beyond work in exotic locations globally. -9. **Customer Success Impact**: Directly experience the joy of solving real-world challenges for our users.
- ---- - -### **Positions Open**: - -- **AI & Swarm Engineers**: Architect, design, and optimize the swarm systems powering global innovations. - ---- - -### **Your Invitation to the Future**: -If you resonate with our vision of blending technological marvels with human brilliance, of creating a prosperous world where every dream has the wings of AI – we invite you to join us on this extraordinary journey. - -**Are you ready to create history with Zeta?** - ---- - -**Apply Now and Let’s Push Our People Further!** - ---- \ No newline at end of file diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index 0afb749..0000000 --- a/docs/index.md +++ /dev/null @@ -1,20 +0,0 @@ -# Zeta Docs - -Welcome to Zeta's Documentation! - -Zeta is a modular framework that enables for seamless, reliable, and fluid creation of zetascale AI models. - -## Zeta - - - -Zeta provides you with reliable, high performance, and fast modular building blocks for building zeta scale neural nets at lightspeed with minimal code and a pythonic API. - -[Click here for Zeta Documentation →](zeta/) - - -## Examples - -Check out Zeta examples for building agents, data retrieval, and more. - -[Checkout Zeta examples →](examples/) diff --git a/docs/metric.md b/docs/metric.md deleted file mode 100644 index a223edc..0000000 --- a/docs/metric.md +++ /dev/null @@ -1,4 +0,0 @@ -# The Golden Metric: - -* We need to figure out a single metric that determines if we're accomplishing our goal with zeta which is to build zetascale superintelligent AI models as fast as possible with minimal code. - diff --git a/docs/overrides/main.html b/docs/overrides/main.html deleted file mode 100644 index 8dbe669..0000000 --- a/docs/overrides/main.html +++ /dev/null @@ -1,9 +0,0 @@ -{% extends "base.html" %} - - - -{% block announce %} -
- Star and contribute to Zeta on GitHub! -
-{% endblock %} \ No newline at end of file diff --git a/docs/purpose.md b/docs/purpose.md deleted file mode 100644 index d0ed588..0000000 --- a/docs/purpose.md +++ /dev/null @@ -1,47 +0,0 @@ -# Zeta's Purpose - - -Eevery once in a while, a revolutionary project comes along that changes everything. - -A landscape cluttered by rigid frameworks, plagued by inefficiencies, and where developers - our brightest minds - are bogged down by limitations. - -Now, imagine a world where harnessing the power of state-of-the-art models isn't just possible... it's simple. A world where efficiency doesn’t sacrifice safety, and where your ideas are bounded only by your imagination. We should be living in this world. But we aren't. - - -Because Zeta is what's missing. - - -The challenge? Creating a framework that's not just another tool, but a revolution. - -To bridge this gap, one would need to optimize at the foundational level, prioritize user experience, and introduce a design philosophy that future-proofs. It's colossal. And until now, no one's even come close. - - -But there’s an enormous opportunity here. An opportunity that promises not just recognition but the power to redefine an industry. And, the key to unlocking this future? It's been with us all along. - - -Insight. - - -Introducing... Zeta. - - -Our secret? Fluidity. - -It’s a philosophy that values modularity, reliability, usability, and unmatched speed. - -But more than that, it's a commitment to evolution, to pushing boundaries, to never settling. - - -Why are we the best to execute this vision? - -Because we've been there from the start. - -We've seen the challenges, felt the frustrations, and now, we're poised to lead the revolution. - -We’ve done it before, and with Zeta, we’re doing it again. - - -Zeta isn’t just the next step. It's a leap into the future. - -Zeta is the future of AI.
- diff --git a/docs/research.md b/docs/research.md deleted file mode 100644 index 83fd262..0000000 --- a/docs/research.md +++ /dev/null @@ -1,1103 +0,0 @@ -# Awesome Multimodal Machine Learning - -By [Paul Liang](http://www.cs.cmu.edu/~pliang/) (pliang@cs.cmu.edu), [Machine Learning Department](http://www.ml.cmu.edu/) and [Language Technologies Institute](https://www.lti.cs.cmu.edu/), [CMU](https://www.cmu.edu/), with help from members of the [MultiComp Lab](http://multicomp.cs.cmu.edu/) at LTI, CMU. If there are any areas, papers, and datasets I missed, please let me know! - -## Course content + workshops - -Check out our comprehsensive tutorial paper [Foundations and Recent Trends in Multimodal Machine Learning: Principles, Challenges, and Open Questions](https://arxiv.org/abs/2209.03430). - -[Tutorials on Multimodal Machine Learning](https://cmu-multicomp-lab.github.io/mmml-tutorial/cvpr2022/) at CVPR 2022 and NAACL 2022, slides and videos [here](https://cmu-multicomp-lab.github.io/mmml-tutorial/schedule/). - -New course [11-877 Advanced Topics in Multimodal Machine Learning](https://cmu-multicomp-lab.github.io/adv-mmml-course/spring2022/) Spring 2022 @ CMU. It will primarily be reading and discussion-based. We plan to post discussion probes, relevant papers, and summarized discussion highlights every week on the website. - -Public course content and lecture videos from [11-777 Multimodal Machine Learning](https://cmu-multicomp-lab.github.io/mmml-course/fall2020/), Fall 2020 @ CMU. 
- -## Table of Contents - -* [Survey Papers](#survey-papers) -* [Core Areas](#core-areas) - * [Multimodal Representations](#multimodal-representations) - * [Multimodal Fusion](#multimodal-fusion) - * [Multimodal Alignment](#multimodal-alignment) - * [Multimodal Pretraining](#multimodal-pretraining) - * [Multimodal Translation](#multimodal-translation) - * [Crossmodal Retrieval](#crossmodal-retrieval) - * [Multimodal Co-learning](#multimodal-colearning) - * [Missing or Imperfect Modalities](#missing-or-imperfect-modalities) - * [Analysis of Multimodal Models](#analysis-of-multimodal-models) - * [Knowledge Graphs and Knowledge Bases](#knowledge-graphs-and-knowledge-bases) - * [Intepretable Learning](#intepretable-learning) - * [Generative Learning](#generative-learning) - * [Semi-supervised Learning](#semi-supervised-learning) - * [Self-supervised Learning](#self-supervised-learning) - * [Language Models](#language-models) - * [Adversarial Attacks](#adversarial-attacks) - * [Few-Shot Learning](#few-shot-learning) - * [Bias and Fairness](#bias-and-fairness) - * [Human in the Loop Learning](#human-in-the-loop-learning) -* [Architectures](#architectures) - * [Multimodal Transformers](#multimodal-transformers) - * [Multimodal Memory](#multimodal-memory) -* [Applications and Datasets](#applications-and-datasets) - * [Language and Visual QA](#language-and-visual-qa) - * [Language Grounding in Vision](#language-grounding-in-vision) - * [Language Grouding in Navigation](#language-grouding-in-navigation) - * [Multimodal Machine Translation](#multimodal-machine-translation) - * [Multi-agent Communication](#multi-agent-communication) - * [Commonsense Reasoning](#commonsense-reasoning) - * [Multimodal Reinforcement Learning](#multimodal-reinforcement-learning) - * [Multimodal Dialog](#multimodal-dialog) - * [Language and Audio](#language-and-audio) - * [Audio and Visual](#audio-and-visual) - * [Visual, IMU and Wireless](#visual-imu-and-wireless) - * [Media 
Description](#media-description) - * [Video Generation from Text](#video-generation-from-text) - * [Affect Recognition and Multimodal Language](#affect-recognition-and-multimodal-language) - * [Healthcare](#healthcare) - * [Robotics](#robotics) - * [Autonomous Driving](#Autonomous-Driving) - * [Finance](#Finance) - * [Human AI Interaction](#Human-AI-Interaction) -* [Workshops](#workshops) -* [Tutorials](#tutorials) -* [Courses](#courses) - - -# Research Papers - -## Survey Papers - -[Foundations and Trends in Multimodal Machine Learning: Principles, Challenges, and Open Questions](https://arxiv.org/abs/2209.03430), arxiv 2023 - -[Multimodal Learning with Transformers: A Survey](https://arxiv.org/abs/2206.06488), TPAMI 2023 - -[Trends in Integration of Vision and Language Research: A Survey of Tasks, Datasets, and Methods](https://doi.org/10.1613/jair.1.11688), JAIR 2021 - -[Experience Grounds Language](https://arxiv.org/abs/2004.10151), EMNLP 2020 - -[A Survey of Reinforcement Learning Informed by Natural Language](https://arxiv.org/abs/1906.03926), IJCAI 2019 - -[Multimodal Machine Learning: A Survey and Taxonomy](https://arxiv.org/abs/1705.09406), TPAMI 2019 - -[Multimodal Intelligence: Representation Learning, Information Fusion, and Applications](https://arxiv.org/abs/1911.03977), arXiv 2019 - -[Deep Multimodal Representation Learning: A Survey](https://ieeexplore.ieee.org/abstract/document/8715409), arXiv 2019 - -[Guest Editorial: Image and Language Understanding](https://link.springer.com/article/10.1007/s11263-017-0993-y), IJCV 2017 - -[Representation Learning: A Review and New Perspectives](https://arxiv.org/abs/1206.5538), TPAMI 2013 - -[A Survey of Socially Interactive Robots](https://www.cs.cmu.edu/~illah/PAPERS/socialroboticssurvey.pdf), 2003 - -## Core Areas - -### Multimodal Representations - -[Identifiability Results for Multimodal Contrastive Learning](https://arxiv.org/abs/2303.09166), ICLR 2023 
[[code]](https://github.com/imantdaunhawer/multimodal-contrastive-learning) - -[Unpaired Vision-Language Pre-training via Cross-Modal CutMix](https://arxiv.org/abs/2206.08919), ICML 2022. - -[Balanced Multimodal Learning via On-the-fly Gradient Modulation](https://arxiv.org/abs/2203.15332), CVPR 2022 - -[Unsupervised Voice-Face Representation Learning by Cross-Modal Prototype Contrast](https://arxiv.org/abs/2204.14057), IJCAI 2021 [[code]](https://github.com/Cocoxili/CMPC) - -[Towards a Unified Foundation Model: Jointly Pre-Training Transformers on Unpaired Images and Text](https://arxiv.org/abs/2112.07074), arXiv 2021 - -[FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482), arXiv 2021 - -[Transformer is All You Need: Multimodal Multitask Learning with a Unified Transformer](https://arxiv.org/abs/2102.10772), arXiv 2021 - -[MultiBench: Multiscale Benchmarks for Multimodal Representation Learning](https://arxiv.org/abs/2107.07502), NeurIPS 2021 [[code]](https://github.com/pliang279/MultiBench) - -[Perceiver: General Perception with Iterative Attention](https://arxiv.org/abs/2103.03206), ICML 2021 [[code]](https://github.com/deepmind/deepmind-research/tree/master/perceiver) - -[Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020), arXiv 2021 [[blog]]([blog](https://openai.com/blog/clip/)) [[code]](https://github.com/OpenAI/CLIP) - -[VinVL: Revisiting Visual Representations in Vision-Language Models](https://arxiv.org/abs/2101.00529), arXiv 2021 [[blog]](https://www.microsoft.com/en-us/research/blog/vinvl-advancing-the-state-of-the-art-for-vision-language-models/?OCID=msr_blog_VinVL_fb) [[code]](https://github.com/pzzhang/VinVL) - -[Learning Transferable Visual Models From Natural Language Supervision](https://cdn.openai.com/papers/Learning_Transferable_Visual_Models_From_Natural_Language.pdf), arXiv 2020 [[blog]](https://openai.com/blog/clip/) 
[[code]](https://github.com/openai/CLIP) - -[12-in-1: Multi-Task Vision and Language Representation Learning](https://arxiv.org/abs/1912.02315), CVPR 2020 [[code]](https://github.com/facebookresearch/vilbert-multi-task) - -[Watching the World Go By: Representation Learning from Unlabeled Videos](https://arxiv.org/abs/2003.07990), arXiv 2020 - -[Learning Video Representations using Contrastive Bidirectional Transformer](https://arxiv.org/abs/1906.05743), arXiv 2019 - -[Visual Concept-Metaconcept Learning](https://papers.nips.cc/paper/8745-visual-concept-metaconcept-learning.pdf), NeurIPS 2019 [[code]](http://vcml.csail.mit.edu/) - -[OmniNet: A Unified Architecture for Multi-modal Multi-task Learning](https://arxiv.org/abs/1907.07804), arXiv 2019 [[code]](https://github.com/subho406/OmniNet) - -[Learning Representations by Maximizing Mutual Information Across Views](https://arxiv.org/abs/1906.00910), arXiv 2019 [[code]](https://github.com/Philip-Bachman/amdim-public) - -[ViCo: Word Embeddings from Visual Co-occurrences](https://arxiv.org/abs/1908.08527), ICCV 2019 [[code]](https://github.com/BigRedT/vico) - -[Unified Visual-Semantic Embeddings: Bridging Vision and Language With Structured Meaning Representations](http://openaccess.thecvf.com/content_CVPR_2019/papers/Wu_Unified_Visual-Semantic_Embeddings_Bridging_Vision_and_Language_With_Structured_Meaning_CVPR_2019_paper.pdf), CVPR 2019 - -[Multi-Task Learning of Hierarchical Vision-Language Representation](https://arxiv.org/abs/1812.00500), CVPR 2019 - -[Learning Factorized Multimodal Representations](https://arxiv.org/abs/1806.06176), ICLR 2019 [[code]](https://github.com/pliang279/factorized/) - -[A Probabilistic Framework for Multi-view Feature Learning with Many-to-many Associations via Neural Networks](https://arxiv.org/abs/1802.04630), ICML 2018 - -[Do Neural Network Cross-Modal Mappings Really Bridge Modalities?](https://aclweb.org/anthology/P18-2074), ACL 2018 - -[Learning Robust Visual-Semantic 
Embeddings](https://arxiv.org/abs/1703.05908), ICCV 2017 - -[Deep Multimodal Representation Learning from Temporal Data](https://arxiv.org/abs/1704.03152), CVPR 2017 - -[Is an Image Worth More than a Thousand Words? On the Fine-Grain Semantic Differences between Visual and Linguistic Representations](https://www.aclweb.org/anthology/C16-1264), COLING 2016 - -[Combining Language and Vision with a Multimodal Skip-gram Model](https://www.aclweb.org/anthology/N15-1016), NAACL 2015 - -[Deep Fragment Embeddings for Bidirectional Image Sentence Mapping](https://arxiv.org/abs/1406.5679), NIPS 2014 - -[Multimodal Learning with Deep Boltzmann Machines](https://dl.acm.org/citation.cfm?id=2697059), JMLR 2014 - -[Learning Grounded Meaning Representations with Autoencoders](https://www.aclweb.org/anthology/P14-1068), ACL 2014 - -[DeViSE: A Deep Visual-Semantic Embedding Model](https://papers.nips.cc/paper/5204-devise-a-deep-visual-semantic-embedding-model), NeurIPS 2013 - -[Multimodal Deep Learning](https://dl.acm.org/citation.cfm?id=3104569), ICML 2011 - -### Multimodal Fusion - -[Robust Contrastive Learning against Noisy Views](https://arxiv.org/abs/2201.04309), arXiv 2022 - -[Cooperative Learning for Multi-view Analysis](https://arxiv.org/abs/2112.12337), arXiv 2022 - -[What Makes Multi-modal Learning Better than Single (Provably)](https://arxiv.org/abs/2106.04538), NeurIPS 2021 - -[Efficient Multi-Modal Fusion with Diversity Analysis](https://dl.acm.org/doi/abs/10.1145/3474085.3475188), ACMMM 2021 - -[Attention Bottlenecks for Multimodal Fusion](https://arxiv.org/abs/2107.00135), NeurIPS 2021 - -[VMLoc: Variational Fusion For Learning-Based Multimodal Camera Localization](https://arxiv.org/abs/2003.07289), AAAI 2021 - -[Trusted Multi-View Classification](https://openreview.net/forum?id=OOsR8BzCnl5), ICLR 2021 [[code]](https://github.com/hanmenghan/TMC) - -[Deep-HOSeq: Deep Higher-Order Sequence Fusion for Multimodal Sentiment Analysis](https://arxiv.org/pdf/2010.08218.pdf), 
ICDM 2020 - -[Removing Bias in Multi-modal Classifiers: Regularization by Maximizing Functional Entropies](https://arxiv.org/abs/2010.10802), NeurIPS 2020 [[code]](https://github.com/itaigat/removing-bias-in-multi-modal-classifiers) - -[Deep Multimodal Fusion by Channel Exchanging](https://arxiv.org/abs/2011.05005?context=cs.LG), NeurIPS 2020 [[code]](https://github.com/yikaiw/CEN) - -[What Makes Training Multi-Modal Classification Networks Hard?](https://arxiv.org/abs/1905.12681), CVPR 2020 - -[Dynamic Fusion for Multimodal Data](https://arxiv.org/abs/1911.03821), arXiv 2019 - -[DeepCU: Integrating Both Common and Unique Latent Information for Multimodal Sentiment Analysis](https://www.ijcai.org/proceedings/2019/503), IJCAI 2019 [[code]](https://github.com/sverma88/DeepCU-IJCAI19) - -[Deep Multimodal Multilinear Fusion with High-order Polynomial Pooling](https://papers.nips.cc/paper/9381-deep-multimodal-multilinear-fusion-with-high-order-polynomial-pooling), NeurIPS 2019 - -[XFlow: Cross-modal Deep Neural Networks for Audiovisual Classification](https://ieeexplore.ieee.org/abstract/document/8894404), IEEE TNNLS 2019 [[code]](https://github.com/catalina17/XFlow) - -[MFAS: Multimodal Fusion Architecture Search](https://arxiv.org/abs/1903.06496), CVPR 2019 - -[The Neuro-Symbolic Concept Learner: Interpreting Scenes, Words, and Sentences From Natural Supervision](https://arxiv.org/abs/1904.12584), ICLR 2019 [[code]](http://nscl.csail.mit.edu/) - -[Unifying and merging well-trained deep neural networks for inference stage](https://www.ijcai.org/Proceedings/2018/0283.pdf), IJCAI 2018 [[code]](https://github.com/ivclab/NeuralMerger) - -[Efficient Low-rank Multimodal Fusion with Modality-Specific Factors](https://arxiv.org/abs/1806.00064), ACL 2018 [[code]](https://github.com/Justin1904/Low-rank-Multimodal-Fusion) - -[Memory Fusion Network for Multi-view Sequential Learning](https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/viewFile/17341/16122), AAAI 2018 
[[code]](https://github.com/pliang279/MFN) - -[Tensor Fusion Network for Multimodal Sentiment Analysis](https://arxiv.org/abs/1707.07250), EMNLP 2017 [[code]](https://github.com/A2Zadeh/TensorFusionNetwork) - -[Jointly Modeling Deep Video and Compositional Text to Bridge Vision and Language in a Unified Framework](http://web.eecs.umich.edu/~jjcorso/pubs/xu_corso_AAAI2015_v2t.pdf), AAAI 2015 - -[A co-regularized approach to semi-supervised learning with multiple views](https://web.cse.ohio-state.edu/~belkin.8/papers/CASSL_ICML_05.pdf), ICML 2005 - -### Multimodal Alignment - -[Reconsidering Representation Alignment for Multi-view Clustering](https://openaccess.thecvf.com/content/CVPR2021/html/Trosten_Reconsidering_Representation_Alignment_for_Multi-View_Clustering_CVPR_2021_paper.html), CVPR 2021 [[code]](https://github.com/DanielTrosten/mvc) - -[CoMIR: Contrastive Multimodal Image Representation for Registration](https://arxiv.org/pdf/2006.06325.pdf), NeurIPS 2020 [[code]](https://github.com/MIDA-group/CoMIR) - -[Multimodal Transformer for Unaligned Multimodal Language Sequences](https://arxiv.org/abs/1906.00295), ACL 2019 [[code]](https://github.com/yaohungt/Multimodal-Transformer) - -[Temporal Cycle-Consistency Learning](https://arxiv.org/abs/1904.07846), CVPR 2019 [[code]](https://github.com/google-research/google-research/tree/master/tcc) - -[See, Hear, and Read: Deep Aligned Representations](https://people.csail.mit.edu/yusuf/see-hear-read/paper.pdf), arXiv 2017 - -[On Deep Multi-View Representation Learning](http://proceedings.mlr.press/v37/wangb15.pdf), ICML 2015 - -[Unsupervised Alignment of Natural Language Instructions with Video Segments](https://dl.acm.org/citation.cfm?id=2892753.2892769), AAAI 2014 - -[Multimodal Alignment of Videos](https://dl.acm.org/citation.cfm?id=2654862), MM 2014 - -[Deep Canonical Correlation Analysis](http://proceedings.mlr.press/v28/andrew13.html), ICML 2013 [[code]](https://github.com/VahidooX/DeepCCA) - -### Multimodal 
Pretraining -[Align before Fuse: Vision and Language Representation Learning with Momentum Distillation](https://arxiv.org/abs/2107.07651), NeurIPS 2021 Spotlight [[code]](https://github.com/salesforce/ALBEF) - -[Less is More: ClipBERT for Video-and-Language Learning via Sparse Sampling](https://arxiv.org/abs/2102.06183), CVPR 2021 [[code]](https://github.com/jayleicn/ClipBERT) - -[Transformer is All You Need: Multimodal Multitask Learning with a Unified Transformer](https://arxiv.org/abs/2102.10772), arXiv 2021 - -[Large-Scale Adversarial Training for Vision-and-Language Representation Learning](https://arxiv.org/abs/2006.06195), NeurIPS 2020 [[code]](https://github.com/zhegan27/VILLA) - -[Vokenization: Improving Language Understanding with Contextualized, Visual-Grounded Supervision](https://arxiv.org/abs/2010.06775), EMNLP 2020 [[code]](https://github.com/airsplay/vokenization) - -[Integrating Multimodal Information in Large Pretrained Transformers](https://arxiv.org/abs/1908.05787), ACL 2020 - -[VL-BERT: Pre-training of Generic Visual-Linguistic Representations](https://arxiv.org/abs/1908.08530), arXiv 2019 [[code]](https://github.com/jackroos/VL-BERT) - -[VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/abs/1908.03557), arXiv 2019 [[code]](https://github.com/uclanlp/visualbert) - -[ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks](https://arxiv.org/abs/1908.02265), NeurIPS 2019 [[code]](https://github.com/jiasenlu/vilbert_beta) - -[Unicoder-VL: A Universal Encoder for Vision and Language by Cross-modal Pre-training](https://arxiv.org/abs/1908.06066), arXiv 2019 - -[LXMERT: Learning Cross-Modality Encoder Representations from Transformers](https://arxiv.org/abs/1908.07490), EMNLP 2019 [[code]](https://github.com/airsplay/lxmert) - -[VideoBERT: A Joint Model for Video and Language Representation Learning](https://arxiv.org/abs/1904.01766), ICCV 2019 - -### Multimodal 
Translation - -[Zero-Shot Text-to-Image Generation](https://arxiv.org/abs/2102.12092), ICML 2021 [[code]](https://github.com/openai/DALL-E) - -[Translate-to-Recognize Networks for RGB-D Scene Recognition](https://openaccess.thecvf.com/content_CVPR_2019/papers/Du_Translate-to-Recognize_Networks_for_RGB-D_Scene_Recognition_CVPR_2019_paper.pdf), CVPR 2019 [[code]](https://github.com/ownstyledu/Translate-to-Recognize-Networks) - -[Language2Pose: Natural Language Grounded Pose Forecasting](https://arxiv.org/abs/1907.01108), 3DV 2019 [[code]](http://chahuja.com/language2pose/) - -[Reconstructing Faces from Voices](https://arxiv.org/abs/1905.10604), NeurIPS 2019 [[code]](https://github.com/cmu-mlsp/reconstructing_faces_from_voices) - -[Speech2Face: Learning the Face Behind a Voice](https://arxiv.org/abs/1905.09773), CVPR 2019 [[code]](https://speech2face.github.io/) - -[Found in Translation: Learning Robust Joint Representations by Cyclic Translations Between Modalities](https://arxiv.org/abs/1812.07809), AAAI 2019 [[code]](https://github.com/hainow/MCTN) - -[Natural TTS Synthesis by Conditioning Wavenet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884), ICASSP 2018 [[code]](https://github.com/NVIDIA/tacotron2) - -### Crossmodal Retrieval - -[Learning with Noisy Correspondence for Cross-modal Matching](https://proceedings.neurips.cc/paper/2021/file/f5e62af885293cf4d511ceef31e61c80-Paper.pdf), NeurIPS 2021 [[code]](https://github.com/XLearning-SCU/2021-NeurIPS-NCR) - -[MURAL: Multimodal, Multitask Retrieval Across Languages](https://arxiv.org/abs/2109.05125), arXiv 2021 - -[Self-Supervised Learning from Web Data for Multimodal Retrieval](https://arxiv.org/abs/1901.02004), arXiv 2019 - -[Look, Imagine and Match: Improving Textual-Visual Cross-Modal Retrieval with Generative Models](https://arxiv.org/abs/1711.06420), CVPR 2018 - -[Scene-centric vs. 
Object-centric Image-Text Cross-modal Retrieval: A Reproducibility Study](https://arxiv.org/abs/2301.05174), ECIR 2023 - -### Multimodal Co-learning - -[Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918), ICML 2021 - -[Multimodal Co-learning: Challenges, Applications with Datasets, Recent Advances and Future Directions](https://arxiv.org/abs/2107.13782), arXiv 2021 - -[Vokenization: Improving Language Understanding via Contextualized, Visually-Grounded Supervision](https://arxiv.org/abs/2010.06775), EMNLP 2020 - -[Foundations of Multimodal Co-learning](https://www.sciencedirect.com/science/article/pii/S1566253520303006), Information Fusion 2020 - -### Missing or Imperfect Modalities - -[A Variational Information Bottleneck Approach to Multi-Omics Data Integration](https://arxiv.org/abs/2102.03014), AISTATS 2021 [[code]](https://github.com/chl8856/DeepIMV) - -[SMIL: Multimodal Learning with Severely Missing Modality](https://arxiv.org/abs/2103.05677), AAAI 2021 - -[Factorized Inference in Deep Markov Models for Incomplete Multimodal Time Series](https://arxiv.org/abs/1905.13570), arXiv 2019 - -[Learning Representations from Imperfect Time Series Data via Tensor Rank Regularization](https://arxiv.org/abs/1907.01011), ACL 2019 - -[Multimodal Deep Learning for Robust RGB-D Object Recognition](https://arxiv.org/abs/1507.06821), IROS 2015 - -### Analysis of Multimodal Models - -[M2Lens: Visualizing and Explaining Multimodal Models for Sentiment Analysis](https://arxiv.org/abs/2107.08264), IEEE TVCG 2022 - -[Decoupling the Role of Data, Attention, and Losses in Multimodal Transformers](https://arxiv.org/abs/2102.00529), TACL 2021 - -[Does my multimodal model learn cross-modal interactions? 
It’s harder to tell than you might think!](https://www.aclweb.org/anthology/2020.emnlp-main.62.pdf), EMNLP 2020 - -[Blindfold Baselines for Embodied QA](https://arxiv.org/abs/1811.05013), NIPS 2018 Visually-Grounded Interaction and Language Workshop - -[Analyzing the Behavior of Visual Question Answering Models](https://arxiv.org/abs/1606.07356), EMNLP 2016 - -### Knowledge Graphs and Knowledge Bases - -[MMKG: Multi-Modal Knowledge Graphs](https://arxiv.org/abs/1903.05485), ESWC 2019 - -[Answering Visual-Relational Queries in Web-Extracted Knowledge Graphs](https://arxiv.org/abs/1709.02314), AKBC 2019 - -[Embedding Multimodal Relational Data for Knowledge Base Completion](https://arxiv.org/abs/1809.01341), EMNLP 2018 - -[A Multimodal Translation-Based Approach for Knowledge Graph Representation Learning](https://www.aclweb.org/anthology/S18-2027), SEM 2018 [[code]](https://github.com/UKPLab/starsem18-multimodalKB) - -[Order-Embeddings of Images and Language](https://arxiv.org/abs/1511.06361), ICLR 2016 [[code]](https://github.com/ivendrov/order-embedding) - -[Building a Large-scale Multimodal Knowledge Base System for Answering Visual Queries](https://arxiv.org/abs/1507.05670), arXiv 2015 - -### Intepretable Learning - -[Multimodal Explanations by Predicting Counterfactuality in Videos](https://arxiv.org/abs/1812.01263), CVPR 2019 - -[Multimodal Explanations: Justifying Decisions and Pointing to the Evidence](https://arxiv.org/abs/1802.08129), CVPR 2018 [[code]](https://github.com/Seth-Park/MultimodalExplanations) - -[Do Explanations make VQA Models more Predictable to a Human?](https://arxiv.org/abs/1810.12366), EMNLP 2018 - -[Towards Transparent AI Systems: Interpreting Visual Question Answering Models](https://arxiv.org/abs/1608.08974), ICML Workshop on Visualization for Deep Learning 2016 - -### Generative Learning - -[MMVAE+: Enhancing the Generative Quality of Multimodal VAEs without Compromises](https://openreview.net/forum?id=sdQGxouELX), ICLR 2023
[[code]](https://github.com/epalu/mmvaeplus) - -[On the Limitations of Multimodal VAEs](https://arxiv.org/abs/2110.04121), ICLR 2022 [[code]](https://openreview.net/attachment?id=w-CPUXXrAj&name=supplementary_material) - -[Generalized Multimodal ELBO](https://openreview.net/forum?id=5Y21V0RDBV), ICLR 2021 [[code]](https://github.com/thomassutter/MoPoE) - -[Multimodal Generative Learning Utilizing Jensen-Shannon-Divergence](https://arxiv.org/abs/2006.08242), NeurIPS 2020 [[code]](https://github.com/thomassutter/mmjsd) - -[Self-supervised Disentanglement of Modality-specific and Shared Factors Improves Multimodal Generative Models](https://rdcu.be/c8WUU), GCPR 2020 [[code]](https://github.com/imantdaunhawer/DMVAE) - -[Variational Mixture-of-Experts Autoencodersfor Multi-Modal Deep Generative Models](https://arxiv.org/pdf/1911.03393.pdf), NeurIPS 2019 [[code]](https://github.com/iffsid/mmvae) - -[Few-shot Video-to-Video Synthesis](https://arxiv.org/abs/1910.12713), NeurIPS 2019 [[code]](https://nvlabs.github.io/few-shot-vid2vid/) - -[Multimodal Generative Models for Scalable Weakly-Supervised Learning](https://arxiv.org/abs/1802.05335), NeurIPS 2018 [[code1]](https://github.com/mhw32/multimodal-vae-public) [[code2]](https://github.com/panpan2/Multimodal-Variational-Autoencoder) - -[The Multi-Entity Variational Autoencoder](http://charlienash.github.io/assets/docs/mevae2017.pdf), NeurIPS 2017 - -### Semi-supervised Learning - -[Semi-supervised Vision-language Mapping via Variational Learning](https://ieeexplore.ieee.org/document/7989160), ICRA 2017 - -[Semi-supervised Multimodal Hashing](https://arxiv.org/abs/1712.03404), arXiv 2017 - -[Semi-Supervised Multimodal Deep Learning for RGB-D Object Recognition](https://www.ijcai.org/Proceedings/16/Papers/473.pdf), IJCAI 2016 - -[Multimodal Semi-supervised Learning for Image Classification](https://ieeexplore.ieee.org/abstract/document/5540120), CVPR 2010 - -### Self-supervised Learning - -[DABS: A Domain-Agnostic Benchmark 
for Self-Supervised Learning](https://arxiv.org/abs/2111.12062), NeurIPS 2021 Datasets & Benchmarks Track [[code]](https://github.com/alextamkin/dabs) - -[Self-Supervised Learning by Cross-Modal Audio-Video Clustering](https://arxiv.org/abs/1911.12667), NeurIPS 2020 [[code]](https://github.com/HumamAlwassel/XDC) - -[Self-Supervised MultiModal Versatile Networks](https://arxiv.org/abs/2006.16228), NeurIPS 2020 [[code]](https://tfhub.dev/deepmind/mmv/s3d/1) - -[Labelling Unlabelled Videos from Scratch with Multi-modal Self-supervision](https://arxiv.org/abs/2006.13662), NeurIPS 2020 [[code]](https://www.robots.ox.ac.uk/~vgg/research/selavi/) - -[Self-Supervised Learning of Visual Features through Embedding Images into Text Topic Spaces](https://ieeexplore.ieee.org/document/8099701), CVPR 2017 - -[Multimodal Dynamics : Self-supervised Learning in Perceptual and Motor Systems](https://dl.acm.org/citation.cfm?id=1269207), 2016 - -### Language Models - -[Neural Language Modeling with Visual Features](https://arxiv.org/abs/1903.02930), arXiv 2019 - -[Learning Multi-Modal Word Representation Grounded in Visual Context](https://arxiv.org/abs/1711.03483), AAAI 2018 - -[Visual Word2Vec (vis-w2v): Learning Visually Grounded Word Embeddings Using Abstract Scenes](https://arxiv.org/abs/1511.07067), CVPR 2016 - -[Unifying Visual-Semantic Embeddings with Multimodal Neural Language Models](http://proceedings.mlr.press/v32/kiros14.html), ICML 2014 [[code]](https://github.com/ryankiros/visual-semantic-embedding) - -### Adversarial Attacks - -[Attend and Attack: Attention Guided Adversarial Attacks on Visual Question Answering Models](https://nips2018vigil.github.io/static/papers/accepted/33.pdf), NeurIPS Workshop on Visually Grounded Interaction and Language 2018 - -[Attacking Visual Language Grounding with Adversarial Examples: A Case Study on Neural Image Captioning](https://arxiv.org/abs/1712.02051), ACL 2018 [[code]](https://github.com/huanzhang12/ImageCaptioningAttack) - 
-[Fooling Vision and Language Models Despite Localization and Attention Mechanism](https://arxiv.org/abs/1709.08693), CVPR 2018 - -### Few-Shot Learning - -[Language to Network: Conditional Parameter Adaptation with Natural Language Descriptions](https://www.aclweb.org/anthology/2020.acl-main.625/), ACL 2020 - -[Shaping Visual Representations with Language for Few-shot Classification](https://arxiv.org/abs/1911.02683), ACL 2020 - -[Zero-Shot Learning - The Good, the Bad and the Ugly](https://arxiv.org/abs/1703.04394), CVPR 2017 - -[Zero-Shot Learning Through Cross-Modal Transfer](https://nlp.stanford.edu/~socherr/SocherGanjooManningNg_NIPS2013.pdf), NIPS 2013 - -### Bias and Fairness - -[Worst of Both Worlds: Biases Compound in Pre-trained Vision-and-Language Models](https://arxiv.org/abs/2104.08666), arXiv 2021 - -[Towards Debiasing Sentence Representations](https://arxiv.org/abs/2007.08100), ACL 2020 [[code]](https://github.com/pliang279/sent_debias) - -[FairCVtest Demo: Understanding Bias in Multimodal Learning with a Testbed in Fair Automatic Recruitment](https://arxiv.org/abs/2009.07025), ICMI 2020 [[code]](https://github.com/BiDAlab/FairCVtest) - -[Model Cards for Model Reporting](https://arxiv.org/abs/1810.03993), FAccT 2019 - -[Black is to Criminal as Caucasian is to Police: Detecting and Removing Multiclass Bias in Word Embeddings](https://arxiv.org/abs/1904.04047), NAACL 2019 [[code]](https://github.com/TManzini/DebiasMulticlassWordEmbedding) - -[Gender Shades: Intersectional Accuracy Disparities in Commercial Gender Classification](http://proceedings.mlr.press/v81/buolamwini18a.html?mod=article_inline), FAccT 2018 - -[Datasheets for Datasets](https://arxiv.org/abs/1803.09010), arXiv 2018 - -[Man is to Computer Programmer as Woman is to Homemaker? 
Debiasing Word Embeddings](https://arxiv.org/abs/1607.06520), NeurIPS 2016 - -### Human in the Loop Learning - -[Human in the Loop Dialogue Systems](https://sites.google.com/view/hlds-2020/home), NeurIPS 2020 workshop - -[Human And Machine in-the-Loop Evaluation and Learning Strategies](https://hamlets-workshop.github.io/), NeurIPS 2020 workshop - -[Human-centric dialog training via offline reinforcement learning](https://arxiv.org/abs/2010.05848), EMNLP 2020 [[code]](https://github.com/natashamjaques/neural_chat/tree/master/BatchRL) - -[Human-In-The-Loop Machine Learning with Intelligent Multimodal Interfaces](https://csjzhou.github.io/homepage/papers/ICML2017_Syed.pdf), ICML 2017 workshop - -## Architectures - -### Multimodal Transformers - -[Pretrained Transformers As Universal Computation Engines](https://arxiv.org/abs/2103.05247), AAAI 2022 - -[Perceiver: General Perception with Iterative Attention](https://arxiv.org/abs/2103.03206), ICML 2021 - -[FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482), arXiv 2021 - -[PolyViT: Co-training Vision Transformers on Images, Videos and Audio](https://arxiv.org/abs/2111.12993), arXiv 2021 - -[VATT: Transformers for Multimodal Self-Supervised Learning from Raw Video, Audio and Text](https://arxiv.org/abs/2104.11178), NeurIPS 2021 [[code]](https://github.com/google-research/google-research/tree/master/vatt) - -[Parameter Efficient Multimodal Transformers for Video Representation Learning](https://arxiv.org/abs/2012.04124), ICLR 2021 [[code]](https://github.com/sangho-vision/avbert) - -### Multimodal Memory - -[Multimodal Transformer with Variable-length Memory for Vision-and-Language Navigation](https://arxiv.org/abs/2111.05759), arXiv 2021 - -[History Aware Multimodal Transformer for Vision-and-Language Navigation](https://arxiv.org/abs/2110.13309), NeurIPS 2021 [[code]](https://cshizhe.github.io/projects/vln_hamt.html) - -[Episodic Memory in Lifelong Language 
Learning](https://arxiv.org/abs/1906.01076), NeurIPS 2019 - -[ICON: Interactive Conversational Memory Network for Multimodal Emotion Detection](https://aclanthology.org/D18-1280.pdf), EMNLP 2018 - -[Multimodal Memory Modelling for Video Captioning](https://arxiv.org/abs/1611.05592), CVPR 2018 - -[Dynamic Memory Networks for Visual and Textual Question Answering](https://arxiv.org/abs/1603.01417), ICML 2016 - -## Applications and Datasets - -### Language and Visual QA - -[TAG: Boosting Text-VQA via Text-aware Visual Question-answer Generation](https://arxiv.org/abs/2208.01813), arXiv 2022 [[code]](https://github.com/HenryJunW/TAG) - -[Learning to Answer Questions in Dynamic Audio-Visual Scenarios](https://arxiv.org/abs/2203.14072), CVPR 2022 - -[SUTD-TrafficQA: A Question Answering Benchmark and an Efficient Network for Video Reasoning over Traffic Events](https://openaccess.thecvf.com/content/CVPR2021/html/Xu_SUTD-TrafficQA_A_Question_Answering_Benchmark_and_an_Efficient_Network_for_CVPR_2021_paper.html), CVPR 2021 [[code]](https://github.com/SUTDCV/SUTD-TrafficQA) - -[MultiModalQA: complex question answering over text, tables and images](https://openreview.net/forum?id=ee6W5UgQLa), ICLR 2021 - -[ManyModalQA: Modality Disambiguation and QA over Diverse Inputs](https://arxiv.org/abs/2001.08034), AAAI 2020 [[code]](https://github.com/hannandarryl/ManyModalQA) - -[Iterative Answer Prediction with Pointer-Augmented Multimodal Transformers for TextVQA](https://arxiv.org/abs/1911.06258), CVPR 2020 - -[Interactive Language Learning by Question Answering](https://arxiv.org/abs/1908.10909), EMNLP 2019 [[code]](https://github.com/xingdi-eric-yuan/qait_public) - -[Fusion of Detected Objects in Text for Visual Question Answering](https://arxiv.org/abs/1908.05054), arXiv 2019 - -[RUBi: Reducing Unimodal Biases in Visual Question Answering](https://arxiv.org/abs/1906.10169), NeurIPS 2019 [[code]](https://github.com/cdancette/rubi.bootstrap.pytorch) - -[GQA: A New Dataset for 
Real-World Visual Reasoning and Compositional Question Answering](https://arxiv.org/abs/1902.09506), CVPR 2019 [[code]](https://cs.stanford.edu/people/dorarad/gqa/) - -[OK-VQA: A Visual Question Answering Benchmark Requiring External Knowledge](https://arxiv.org/abs/1906.00067), CVPR 2019 [[code]](http://okvqa.allenai.org/) - -[MUREL: Multimodal Relational Reasoning for Visual Question Answering](https://arxiv.org/abs/1902.09487), CVPR 2019 [[code]](https://github.com/Cadene/murel.bootstrap.pytorch) - -[Social-IQ: A Question Answering Benchmark for Artificial Social Intelligence](http://openaccess.thecvf.com/content_CVPR_2019/html/Zadeh_Social-IQ_A_Question_Answering_Benchmark_for_Artificial_Social_Intelligence_CVPR_2019_paper.html), CVPR 2019 [[code]](https://github.com/A2Zadeh/Social-IQ) - -[Probabilistic Neural-symbolic Models for Interpretable Visual Question Answering](https://arxiv.org/abs/1902.07864), ICML 2019 [[code]](https://github.com/kdexd/probnmn-clevr) - -[Learning to Count Objects in Natural Images for Visual Question Answering](https://arxiv.org/abs/1802.05766), ICLR 2018, [[code]](https://github.com/Cyanogenoid/vqa-counting) - -[Overcoming Language Priors in Visual Question Answering with Adversarial Regularization](https://arxiv.org/abs/1810.03649), NeurIPS 2018 - -[Neural-Symbolic VQA: Disentangling Reasoning from Vision and Language Understanding](https://arxiv.org/abs/1810.02338), NeurIPS 2018 [[code]](https://github.com/kexinyi/ns-vqa) - -[RecipeQA: A Challenge Dataset for Multimodal Comprehension of Cooking Recipes](https://arxiv.org/abs/1809.00812), EMNLP 2018 [[code]](https://hucvl.github.io/recipeqa/) - -[TVQA: Localized, Compositional Video Question Answering](https://www.aclweb.org/anthology/D18-1167), EMNLP 2018 [[code]](https://github.com/jayleicn/TVQA) - -[Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering](https://arxiv.org/abs/1707.07998), CVPR 2018 
[[code]](https://github.com/facebookresearch/pythia) - -[Don't Just Assume; Look and Answer: Overcoming Priors for Visual Question Answering](https://arxiv.org/abs/1712.00377), CVPR 2018 [[code]](https://github.com/AishwaryaAgrawal/GVQA) - -[Stacked Latent Attention for Multimodal Reasoning](http://openaccess.thecvf.com/content_cvpr_2018/papers/Fan_Stacked_Latent_Attention_CVPR_2018_paper.pdf), CVPR 2018 - -[Learning to Reason: End-to-End Module Networks for Visual Question Answering](https://arxiv.org/abs/1704.05526), ICCV 2017 [[code]](https://github.com/ronghanghu/n2nmn) - -[CLEVR: A Diagnostic Dataset for Compositional Language and Elementary Visual Reasoning](https://arxiv.org/abs/1612.06890), CVPR 2017 [[code]](https://github.com/facebookresearch/clevr-iep) [[dataset generation]](https://github.com/facebookresearch/clevr-dataset-gen) - -[Are You Smarter Than A Sixth Grader? Textbook Question Answering for Multimodal Machine Comprehension](https://ieeexplore.ieee.org/document/8100054/), CVPR 2017 [[code]](http://vuchallenge.org/tqa.html) - -[Multimodal Compact Bilinear Pooling for Visual Question Answering and Visual Grounding](https://arxiv.org/abs/1606.01847), EMNLP 2016 [[code]](https://github.com/akirafukui/vqa-mcb) - -[MovieQA: Understanding Stories in Movies through Question-Answering](https://arxiv.org/abs/1512.02902), CVPR 2016 [[code]](http://movieqa.cs.toronto.edu/home/) - -[VQA: Visual Question Answering](https://arxiv.org/abs/1505.00468), ICCV 2015 [[code]](https://visualqa.org/) - -### Language Grounding in Vision - -[Core Challenges in Embodied Vision-Language Planning](https://arxiv.org/abs/2106.13948), arXiv 2021 - -[MaRVL: Multicultural Reasoning over Vision and Language](https://arxiv.org/pdf/2109.13238), EMNLP 2021 [[code]](https://marvl-challenge.github.io/) - -[Grounding 'Grounding' in NLP](https://arxiv.org/abs/2106.02192), ACL 2021 - -[The Hateful Memes Challenge: Detecting Hate Speech in Multimodal 
Memes](https://arxiv.org/abs/2005.04790), NeurIPS 2020 [[code]](https://ai.facebook.com/blog/hateful-memes-challenge-and-data-set/) - -[What Does BERT with Vision Look At?](https://www.aclweb.org/anthology/2020.acl-main.469/), ACL 2020 - -[Visual Grounding in Video for Unsupervised Word Translation](https://arxiv.org/abs/2003.05078), CVPR 2020 [[code]](https://github.com/gsig/visual-grounding) - -[VIOLIN: A Large-Scale Dataset for Video-and-Language Inference](https://arxiv.org/abs/2003.11618), CVPR 2020 [[code]](https://github.com/jimmy646/violin) - -[Grounded Video Description](https://arxiv.org/abs/1812.06587), CVPR 2019 - -[Show, Control and Tell: A Framework for Generating Controllable and Grounded Captions](https://arxiv.org/abs/1811.10652), CVPR 2019 - -[Multilevel Language and Vision Integration for Text-to-Clip Retrieval](https://arxiv.org/abs/1804.05113), AAAI 2019 [[code]](https://github.com/VisionLearningGroup/Text-to-Clip_Retrieval) - -[Binary Image Selection (BISON): Interpretable Evaluation of Visual Grounding](https://arxiv.org/abs/1901.06595), arXiv 2019 [[code]](https://github.com/facebookresearch/binary-image-selection) - -[Finding “It”: Weakly-Supervised Reference-Aware Visual Grounding in Instructional Videos](http://openaccess.thecvf.com/content_cvpr_2018/papers/Huang_Finding_It_Weakly-Supervised_CVPR_2018_paper.pdf), CVPR 2018 - -[SCAN: Learning Hierarchical Compositional Visual Concepts](https://arxiv.org/abs/1707.03389), ICLR 2018 - -[Visual Coreference Resolution in Visual Dialog using Neural Module Networks](https://arxiv.org/abs/1809.01816), ECCV 2018 [[code]](https://github.com/facebookresearch/corefnmn) - -[Gated-Attention Architectures for Task-Oriented Language Grounding](https://arxiv.org/abs/1706.07230), AAAI 2018 [[code]](https://github.com/devendrachaplot/DeepRL-Grounding) - -[Using Syntax to Ground Referring Expressions in Natural Images](https://arxiv.org/abs/1805.10547), AAAI 2018
[[code]](https://github.com/volkancirik/groundnet) - -[Grounding language acquisition by training semantic parsers using captioned videos](https://cbmm.mit.edu/sites/default/files/publications/Ross-et-al_ACL2018_Grounding%20language%20acquisition%20by%20training%20semantic%20parsing%20using%20caption%20videos.pdf), ACL 2018 - -[Interpretable and Globally Optimal Prediction for Textual Grounding using Image Concepts](https://arxiv.org/abs/1803.11209), NeurIPS 2017 - -[Localizing Moments in Video with Natural Language](https://arxiv.org/abs/1708.01641), ICCV 2017 - -[What are you talking about? Text-to-Image Coreference](https://ieeexplore.ieee.org/abstract/document/6909850/), CVPR 2014 - -[Grounded Language Learning from Video Described with Sentences](https://www.aclweb.org/anthology/P13-1006), ACL 2013 - -[Grounded Compositional Semantics for Finding and Describing Images with Sentences](https://nlp.stanford.edu/~socherr/SocherKarpathyLeManningNg_TACL2013.pdf), TACL 2013 - -### Language Grounding in Navigation - -[ALFWorld: Aligning Text and Embodied Environments for Interactive Learning](https://arxiv.org/abs/2010.03768), ICLR 2021 [[code]](http://alfworld.github.io/) - -[Hierarchical Cross-Modal Agent for Robotics Vision-and-Language Navigation](https://arxiv.org/abs/2104.10674), ICRA 2021, [[code]](https://github.com/GT-RIPL/robo-vln), [[video]](https://www.youtube.com/watch?v=y16x9n_zP_4), [[project page]](https://zubair-irshad.github.io/projects/robo-vln.html) - -[Improving Vision-and-Language Navigation with Image-Text Pairs from the Web](https://arxiv.org/abs/2004.14973), ECCV 2020 - -[Towards Learning a Generic Agent for Vision-and-Language Navigation via Pre-training](https://arxiv.org/abs/2002.10638), CVPR 2020 [[code]](https://github.com/weituo12321/PREVALENT) - -[VideoNavQA: Bridging the Gap between Visual and Embodied Question Answering](https://arxiv.org/abs/1908.04950), BMVC 2019 [[code]](https://github.com/catalina17/VideoNavQA) -
-[Vision-and-Dialog Navigation](https://arxiv.org/abs/1907.04957), arXiv 2019 [[code]](https://github.com/mmurray/cvdn) - -[Hierarchical Decision Making by Generating and Following Natural Language Instructions](https://arxiv.org/abs/1906.00744), arXiv 2019 [[code]](https://www.minirts.net/) - -[Stay on the Path: Instruction Fidelity in Vision-and-Language Navigation](https://arxiv.org/abs/1905.12255), ACL 2019 - -[Are You Looking? Grounding to Multiple Modalities in Vision-and-Language Navigation](https://arxiv.org/abs/1906.00347), ACL 2019 - -[Touchdown: Natural Language Navigation and Spatial Reasoning in Visual Street Environments](https://arxiv.org/abs/1811.12354), CVPR 2019 [[code]](https://github.com/lil-lab/touchdown) - -[Reinforced Cross-Modal Matching and Self-Supervised Imitation Learning for Vision-Language Navigation](https://arxiv.org/abs/1811.10092), CVPR 2019 - -[The Regretful Navigation Agent for Vision-and-Language Navigation](https://arxiv.org/abs/1903.01602), CVPR 2019 [[code]](https://github.com/chihyaoma/regretful-agent) - -[Tactical Rewind: Self-Correction via Backtracking in Vision-and-Language Navigation](https://arxiv.org/abs/1903.02547), CVPR 2019 [[code]](https://github.com/Kelym/FAST) - -[Multi-modal Discriminative Model for Vision-and-Language Navigation](https://www.aclweb.org/anthology/W19-1605), NAACL SpLU-RoboNLP Workshop 2019 - -[Self-Monitoring Navigation Agent via Auxiliary Progress Estimation](https://arxiv.org/abs/1901.03035), ICLR 2019 [[code]](https://github.com/chihyaoma/selfmonitoring-agent) - -[From Language to Goals: Inverse Reinforcement Learning for Vision-Based Instruction Following](https://arxiv.org/abs/1902.07742), ICLR 2019 - -[Read, Watch, and Move: Reinforcement Learning for Temporally Grounding Natural Language Descriptions in Videos](https://arxiv.org/abs/1901.06829), AAAI 2019 - -[Learning to Navigate Unseen Environments: Back Translation with Environmental Dropout](https://www.aclweb.org/anthology/N19-1268), 
NAACL 2019 [[code]](https://github.com/airsplay/R2R-EnvDrop) - -[Attention Based Natural Language Grounding by Navigating Virtual Environment](https://arxiv.org/abs/1804.08454), IEEE WACV 2019 - -[Mapping Instructions to Actions in 3D Environments with Visual Goal Prediction](https://arxiv.org/abs/1809.00786), EMNLP 2018 [[code]](https://github.com/lil-lab/ciff) - -[Vision-and-Language Navigation: Interpreting Visually-Grounded Navigation Instructions in Real Environments](https://arxiv.org/abs/1711.07280), CVPR 2018 [[code]](https://bringmeaspoon.org/) - -[Embodied Question Answering](https://arxiv.org/abs/1711.11543), CVPR 2018 [[code]](https://embodiedqa.org/) - -[Look Before You Leap: Bridging Model-Free and Model-Based Reinforcement Learning for Planned-Ahead Vision-and-Language Navigation](https://arxiv.org/abs/1803.07729), ECCV 2018 - -### Multimodal Machine Translation - -[Unsupervised Multimodal Neural Machine Translation with Pseudo Visual Pivoting](https://arxiv.org/abs/2005.03119), ACL 2020 - -[Multimodal Transformer for Multimodal Machine Translation](https://www.aclweb.org/anthology/2020.acl-main.400/), ACL 2020 - -[Neural Machine Translation with Universal Visual Representation](https://openreview.net/forum?id=Byl8hhNYPS), ICLR 2020 [[code]](https://github.com/cooelf/UVR-NMT) - -[Visual Agreement Regularized Training for Multi-Modal Machine Translation](https://arxiv.org/abs/1912.12014), AAAI 2020 - -[VATEX: A Large-Scale, High-Quality Multilingual Dataset for Video-and-Language Research](https://arxiv.org/abs/1904.03493), ICCV 2019 [[code]](http://vatex.org/main/index.html) - -[Latent Variable Model for Multi-modal Translation](https://arxiv.org/pdf/1811.00357), ACL 2019 - -[Distilling Translations with Visual Awareness](https://arxiv.org/pdf/1906.07701), ACL 2019 - -[Probing the Need for Visual Context in Multimodal Machine Translation](https://www.aclweb.org/anthology/N19-1422), NAACL 2019 - -[Emergent Translation in Multi-Agent 
Communication](https://openreview.net/pdf?id=H1vEXaxA-), ICLR 2018 - -[Zero-Resource Neural Machine Translation with Multi-Agent Communication Game](https://arxiv.org/pdf/1802.03116), AAAI 2018 - -[Learning Translations via Images with a Massively Multilingual Image Dataset](http://aclweb.org/anthology/P18-1239), ACL 2018 - -[A Visual Attention Grounding Neural Model for Multimodal Machine Translation](http://aclweb.org/anthology/D18-1400), EMNLP 2018 - -[Adversarial Evaluation of Multimodal Machine Translation](http://aclweb.org/anthology/D18-1329), EMNLP 2018 - -[Doubly-Attentive Decoder for Multi-modal Neural Machine Translation](http://aclweb.org/anthology/P17-1175), ACL 2017 [[code]](https://github.com/iacercalixto/MultimodalNMT) - -[An empirical study on the effectiveness of images in Multimodal Neural Machine Translation](http://aclweb.org/anthology/D17-1095), EMNLP 2017 - -[Incorporating Global Visual Features into Attention-based Neural Machine Translation](http://aclweb.org/anthology/D17-1105), EMNLP 2017 [[code]](https://github.com/iacercalixto/MultimodalNMT) - -[Multimodal Pivots for Image Caption Translation](http://aclweb.org/anthology/P16-1227), ACL 2016 - -[Multi30K: Multilingual English-German Image Descriptions](https://aclweb.org/anthology/W16-3210.pdf), ACL Workshop on Language and Vision 2016 [[code]](https://github.com/multi30k/dataset) - -[Does Multimodality Help Human and Machine for Translation and Image Captioning?](http://www.statmt.org/wmt16/pdf/W16-2358.pdf), ACL WMT 2016 - -### Multi-agent Communication - -[Multi-agent Communication meets Natural Language: Synergies between Functional and Structural Language Learning](https://arxiv.org/abs/2005.07064), ACL 2020 - -[Emergence of Compositional Language with Deep Generational Transmission](https://arxiv.org/abs/1904.09067), ICML 2019 - -[On the Pitfalls of Measuring Emergent Communication](https://arxiv.org/abs/1903.05168), AAMAS 2019 
[[code]](https://github.com/facebookresearch/measuring-emergent-comm) - -[Emergent Translation in Multi-Agent Communication](https://arxiv.org/abs/1710.06922), ICLR 2018 [[code]](https://github.com/facebookresearch/translagent) - -[Emergent Communication in a Multi-Modal, Multi-Step Referential Game](https://openreview.net/pdf?id=rJGZq6g0-), ICLR 2018 [[code]](https://github.com/nyu-dl/MultimodalGame) - -[Emergence of Linguistic Communication From Referential Games with Symbolic and Pixel Input](https://openreview.net/pdf?id=HJGv1Z-AW), ICLR 2018 - -[Emergent Communication through Negotiation](https://openreview.net/pdf?id=Hk6WhagRW), ICLR 2018 [[code]](https://github.com/ASAPPinc/emergent_comms_negotiation) - -[Emergence of Grounded Compositional Language in Multi-Agent Populations](https://arxiv.org/abs/1703.04908), AAAI 2018 - -[Emergence of Language with Multi-agent Games: Learning to Communicate with Sequences of Symbols](https://arxiv.org/abs/1705.11192), NeurIPS 2017 - -[Natural Language Does Not Emerge 'Naturally' in Multi-Agent Dialog](https://arxiv.org/abs/1706.08502), EMNLP 2017 [[code1]](https://github.com/batra-mlp-lab/lang-emerge) [[code2]](https://github.com/kdexd/lang-emerge-parlai) - -[Learning Cooperative Visual Dialog Agents with Deep Reinforcement Learning](https://arxiv.org/abs/1703.06585), ICCV 2017 [[code]](https://github.com/batra-mlp-lab/visdial-rl) - -[Multi-agent Cooperation and the Emergence of (natural) Language](https://arxiv.org/abs/1612.07182), ICLR 2017 - -[Learning to Communicate with Deep Multi-agent Reinforcement Learning](https://arxiv.org/abs/1605.06676), NIPS 2016 - -[Learning multiagent communication with backpropagation](http://papers.nips.cc/paper/6398-learning-multiagent-communication-with-backpropagation.pdf), NIPS 2016
- -[The Emergence of Compositional Structures in Perceptually Grounded Language Games](https://www.cs.utexas.edu/~kuipers/readings/Vogt-aij-05.pdf), AI 2005 - -### Commonsense Reasoning - -[Adventures in Flatland: Perceiving Social Interactions Under Physical Dynamics](https://www.tshu.io/HeiderSimmel/CogSci20/Flatland_CogSci20.pdf), CogSci 2020 - -[A Logical Model for Supporting Social Commonsense Knowledge Acquisition](https://arxiv.org/abs/1912.11599), arXiv 2019 - -[Heterogeneous Graph Learning for Visual Commonsense Reasoning](https://arxiv.org/abs/1910.11475), NeurIPS 2019 - -[SocialIQA: Commonsense Reasoning about Social Interactions](https://arxiv.org/abs/1904.09728), arXiv 2019 - -[From Recognition to Cognition: Visual Commonsense Reasoning](https://arxiv.org/abs/1811.10830), CVPR 2019 [[code]](https://visualcommonsense.com/) - -[CommonsenseQA: A Question Answering Challenge Targeting Commonsense Knowledge](https://arxiv.org/abs/1811.00937), NAACL 2019 - -### Multimodal Reinforcement Learning - -[MiniHack the Planet: A Sandbox for Open-Ended Reinforcement Learning Research](https://arxiv.org/abs/2109.13202), NeurIPS 2021 [[code]](https://github.com/facebookresearch/minihack) - -[Imitating Interactive Intelligence](https://arxiv.org/abs/2012.05672), arXiv 2020 - -[Grounded Language Learning Fast and Slow](https://arxiv.org/abs/2009.01719), ICLR 2021 - -[RTFM: Generalising to Novel Environment Dynamics via Reading](https://arxiv.org/abs/1910.08210), ICLR 2020 [[code]](https://github.com/facebookresearch/RTFM) - -[Embodied Multimodal Multitask Learning](https://arxiv.org/abs/1902.01385), IJCAI 2020 - -[Learning to Speak and Act in a Fantasy Text Adventure Game](https://arxiv.org/abs/1903.03094), arXiv 2019 [[code]](https://parl.ai/projects/light/) - -[Language as an Abstraction for Hierarchical Deep Reinforcement Learning](https://arxiv.org/abs/1906.07343), NeurIPS 2019 - -[Hierarchical Decision Making by Generating and Following Natural Language 
Instructions](https://arxiv.org/abs/1906.00744), NeurIPS 2019 [[code]](https://github.com/facebookresearch/minirts) - -[Habitat: A Platform for Embodied AI Research](https://arxiv.org/abs/1904.01201), ICCV 2019 [[code]](https://aihabitat.org/) - -[Multimodal Hierarchical Reinforcement Learning Policy for Task-Oriented Visual Dialog](https://arxiv.org/abs/1805.03257), SIGDIAL 2018 - -[Mapping Instructions and Visual Observations to Actions with Reinforcement Learning](https://www.cs.cornell.edu/~dkm/papers/mla-emnlp.2017.pdf), EMNLP 2017 - -[Reinforcement Learning for Mapping Instructions to Actions](https://people.csail.mit.edu/regina/my_papers/RL.pdf), ACL 2009 - -### Multimodal Dialog - -[Two Causal Principles for Improving Visual Dialog](https://arxiv.org/abs/1911.10496), CVPR 2020 - -[MELD: A Multimodal Multi-Party Dataset for Emotion Recognition in Conversations](https://arxiv.org/abs/1810.02508), ACL 2019 [[code]](http://affective-meld.github.io/) - -[CLEVR-Dialog: A Diagnostic Dataset for Multi-Round Reasoning in Visual Dialog](https://www.aclweb.org/anthology/N19-1058), NAACL 2019 [[code]](https://github.com/satwikkottur/clevr-dialog) - -[Talk the Walk: Navigating New York City through Grounded Dialogue](https://arxiv.org/abs/1807.03367), arXiv 2018 - -[Dialog-based Interactive Image Retrieval](https://arxiv.org/abs/1805.00145), NeurIPS 2018 [[code]](https://github.com/XiaoxiaoGuo/fashion-retrieval) - -[Towards Building Large Scale Multimodal Domain-Aware Conversation Systems](https://arxiv.org/abs/1704.00200), arXiv 2017 [[code]](https://amritasaha1812.github.io/MMD/) - -[Visual Dialog](https://arxiv.org/abs/1611.08669), CVPR 2017 [[code]](https://github.com/batra-mlp-lab/visdial) - -### Language and Audio - -[Lattice Transformer for Speech Translation](https://arxiv.org/abs/1906.05551), ACL 2019 - -[Exploring Phoneme-Level Speech Representations for End-to-End Speech Translation](https://arxiv.org/abs/1906.01199), ACL 2019 - -[Audio Caption: Listen and 
Tell](https://arxiv.org/abs/1902.09254), ICASSP 2019 - -[Audio-Linguistic Embeddings for Spoken Sentences](https://arxiv.org/abs/1902.07817), ICASSP 2019 - -[From Semi-supervised to Almost-unsupervised Speech Recognition with Very-low Resource by Jointly Learning Phonetic Structures from Audio and Text Embeddings](https://arxiv.org/abs/1904.05078), arXiv 2019 - -[From Audio to Semantics: Approaches To End-to-end Spoken Language Understanding](https://arxiv.org/abs/1809.09190), arXiv 2018 - -[Natural TTS Synthesis by Conditioning Wavenet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884), ICASSP 2018 [[code]](https://github.com/NVIDIA/tacotron2) - -[Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning](https://arxiv.org/abs/1710.07654), ICLR 2018 - -[Deep Voice 2: Multi-Speaker Neural Text-to-Speech](https://arxiv.org/abs/1705.08947), NeurIPS 2017 - -[Deep Voice: Real-time Neural Text-to-Speech](https://arxiv.org/abs/1702.07825), ICML 2017 - -[Text-to-Speech Synthesis](https://dl.acm.org/citation.cfm?id=1592988), 2009 - -### Audio and Visual - -[Music Gesture for Visual Sound Separation](https://arxiv.org/abs/2004.09476), CVPR 2020 - -[Co-Compressing and Unifying Deep CNN Models for Efficient Human Face and Speaker Recognition](http://openaccess.thecvf.com/content_CVPRW_2019/papers/MULA/Wan_Co-Compressing_and_Unifying_Deep_CNN_Models_for_Efficient_Human_Face_CVPRW_2019_paper.pdf), CVPRW 2019 - -[Learning Individual Styles of Conversational Gesture](https://arxiv.org/abs/1906.04160), CVPR 2019 [[code]](http://people.eecs.berkeley.edu/~shiry/speech2gesture) - -[Capture, Learning, and Synthesis of 3D Speaking Styles](https://ps.is.tuebingen.mpg.de/uploads_file/attachment/attachment/510/paper_final.pdf), CVPR 2019 [[code]](https://github.com/TimoBolkart/voca) - -[Disjoint Mapping Network for Cross-modal Matching of Voices and Faces](https://arxiv.org/abs/1807.04836), ICLR 2019 - -[Wav2Pix: Speech-conditioned Face Generation using 
Generative Adversarial Networks](https://arxiv.org/abs/1903.10195), ICASSP 2019 [[code]](https://imatge-upc.github.io/wav2pix/) - -[Learning Affective Correspondence between Music and Image](https://arxiv.org/abs/1904.00150), ICASSP 2019 [[dataset]](https://gaurav22verma.github.io/IMAC_Dataset.html) - -[Jointly Discovering Visual Objects and Spoken Words from Raw Sensory Input](https://arxiv.org/abs/1804.01452), ECCV 2018 [[code]](https://github.com/LiqunChen0606/Jointly-Discovering-Visual-Objects-and-Spoken-Words) - -[Seeing Voices and Hearing Faces: Cross-modal Biometric Matching](https://arxiv.org/abs/1804.00326), CVPR 2018 [[code]](https://github.com/a-nagrani/SVHF-Net) - -[Learning to Separate Object Sounds by Watching Unlabeled Video](http://openaccess.thecvf.com/content_cvpr_2018_workshops/papers/w49/Gao_Learning_to_Separate_CVPR_2018_paper.pdf), CVPR 2018 - -[Deep Audio-Visual Speech Recognition](https://arxiv.org/abs/1809.02108), IEEE TPAMI 2018 - -[Look, Listen and Learn](http://openaccess.thecvf.com/content_ICCV_2017/papers/Arandjelovic_Look_Listen_and_ICCV_2017_paper.pdf), ICCV 2017 - -[Unsupervised Learning of Spoken Language with Visual Context](https://papers.nips.cc/paper/6186-unsupervised-learning-of-spoken-language-with-visual-context.pdf), NeurIPS 2016 - -[SoundNet: Learning Sound Representations from Unlabeled Video](https://arxiv.org/abs/1610.09001), NeurIPS 2016 [[code]](http://projects.csail.mit.edu/soundnet/) - -### Visual, IMU and Wireless -[Vi-Fi: Associating Moving Subjects across Vision and Wireless Sensors](https://ieeexplore.ieee.org/document/9826015), IPSN 2022 [[code]](https://github.com/vifi2021/Vi-Fi) - -### Media Description - -[Towards Unsupervised Image Captioning with Shared Multimodal Embeddings](https://arxiv.org/abs/1908.09317), ICCV 2019 - -[Video Relationship Reasoning using Gated Spatio-Temporal Energy Graph](https://arxiv.org/abs/1903.10547), CVPR 2019 [[code]](https://github.com/yaohungt/GSTEG_CVPR_2019) - -[Joint Event 
Detection and Description in Continuous Video Streams](https://arxiv.org/abs/1802.10250), WACVW 2019 - -[Learning to Compose and Reason with Language Tree Structures for Visual Grounding](https://arxiv.org/abs/1906.01784), TPAMI 2019 - -[Neural Baby Talk](https://arxiv.org/abs/1803.09845), CVPR 2018 [[code]](https://github.com/jiasenlu/NeuralBabyTalk) - -[Grounding Referring Expressions in Images by Variational Context](https://arxiv.org/abs/1712.01892), CVPR 2018 - -[Video Captioning via Hierarchical Reinforcement Learning](https://arxiv.org/abs/1711.11135), CVPR 2018 - -[Charades-Ego: A Large-Scale Dataset of Paired Third and First Person Videos](https://arxiv.org/abs/1804.09626), CVPR 2018 [[code]](https://allenai.org/plato/charades/) - -[Neural Motifs: Scene Graph Parsing with Global Context](https://arxiv.org/abs/1711.06640), CVPR 2018 [[code]](http://github.com/rowanz/neural-motifs) - -[No Metrics Are Perfect: Adversarial Reward Learning for Visual Storytelling](https://arxiv.org/abs/1804.09160), ACL 2018 - -[Generating Descriptions with Grounded and Co-Referenced People](https://arxiv.org/abs/1704.01518), CVPR 2017 - -[DenseCap: Fully Convolutional Localization Networks for Dense Captioning](https://cs.stanford.edu/people/karpathy/densecap/), CVPR 2016 - -[Review Networks for Caption Generation](https://arxiv.org/abs/1605.07912), NeurIPS 2016 [[code]](https://github.com/kimiyoung/review_net) - -[Hollywood in Homes: Crowdsourcing Data Collection for Activity Understanding](https://arxiv.org/abs/1604.01753), ECCV 2016 [[code]](https://allenai.org/plato/charades/) - -[Show and Tell: Lessons learned from the 2015 MSCOCO Image Captioning Challenge](https://arxiv.org/abs/1609.06647), TPAMI 2016 [[code]](https://github.com/tensorflow/models/tree/master/research/im2txt) - -[Show, Attend and Tell: Neural Image Caption Generation with Visual Attention](https://arxiv.org/abs/1502.03044), ICML 2015 [[code]](https://github.com/kelvinxu/arctic-captions) - -[Deep 
Visual-Semantic Alignments for Generating Image Descriptions](https://arxiv.org/abs/1412.2306v2), CVPR 2015 [[code]](https://github.com/karpathy/neuraltalk2) - -[Show and Tell: A Neural Image Caption Generator](https://arxiv.org/abs/1411.4555), CVPR 2015 [[code]](https://github.com/karpathy/neuraltalk2) - -[A Dataset for Movie Description](https://arxiv.org/abs/1501.02530), CVPR 2015 [[code]](https://www.mpi-inf.mpg.de/departments/computer-vision-and-multimodal-computing/research/vision-and-language/mpii-movie-description-dataset/) - -[What’s Cookin’? Interpreting Cooking Videos using Text, Speech and Vision](https://arxiv.org/abs/1503.01558), NAACL 2015 [[code]](https://github.com/malmaud/whats_cookin) - -[Microsoft COCO: Common Objects in Context](https://arxiv.org/abs/1405.0312), ECCV 2014 [[code]](http://cocodataset.org/#home) - -### Video Generation from Text - -[Image Generation from Scene Graphs](https://arxiv.org/abs/1804.01622), CVPR 2018 - -[Learning to Color from Language](https://arxiv.org/abs/1804.06026), NAACL 2018 - -[Generative Adversarial Text to Image Synthesis](https://arxiv.org/abs/1605.05396), ICML 2016 - -### Affect Recognition and Multimodal Language - -[End-to-end Facial and Physiological Model for Affective Computing and Applications](https://arxiv.org/abs/1912.04711), arXiv 2019 - -[Affective Computing for Large-Scale Heterogeneous Multimedia Data: A Survey](https://arxiv.org/abs/1911.05609), ACM TOMM 2019 - -[Towards Multimodal Sarcasm Detection (An Obviously_Perfect Paper)](https://arxiv.org/abs/1906.01815), ACL 2019 [[code]](https://github.com/soujanyaporia/MUStARD) - -[Multi-modal Approach for Affective Computing](https://arxiv.org/abs/1804.09452), EMBC 2018 - -[Multimodal Language Analysis with Recurrent Multistage Fusion](https://arxiv.org/abs/1808.03920), EMNLP 2018 - -[Multimodal Language Analysis in the Wild: CMU-MOSEI Dataset and Interpretable Dynamic Fusion Graph](http://aclweb.org/anthology/P18-1208), ACL 2018
[[code]](https://github.com/A2Zadeh/CMU-MultimodalSDK) - -[Multi-attention Recurrent Network for Human Communication Comprehension](https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/viewFile/17390/16123), AAAI 2018 [[code]](https://github.com/A2Zadeh/CMU-MultimodalSDK) - -[End-to-End Multimodal Emotion Recognition using Deep Neural Networks](https://arxiv.org/abs/1704.08619), arXiv 2017 - -[AMHUSE - A Multimodal dataset for HUmor SEnsing](https://dl.acm.org/citation.cfm?id=3136806), ICMI 2017 [[code]](http://amhuse.phuselab.di.unimi.it/) - -[Decoding Children’s Social Behavior](http://www.cbi.gatech.edu/mmdb/docs/mmdb_paper.pdf), CVPR 2013 [[code]](http://www.cbi.gatech.edu/mmdb/) - -[Collecting Large, Richly Annotated Facial-Expression Databases from Movies](http://users.cecs.anu.edu.au/%7Eadhall/Dhall_Goecke_Lucey_Gedeon_M_2012.pdf), IEEE Multimedia 2012 [[code]](https://cs.anu.edu.au/few/AFEW.html) - -[The Interactive Emotional Dyadic Motion Capture (IEMOCAP) Database](https://sail.usc.edu/iemocap/Busso_2008_iemocap.pdf), 2008 [[code]](https://sail.usc.edu/iemocap/) - -### Healthcare - -[Multimodal Co-Attention Transformer for Survival Prediction in Gigapixel Whole Slide Images](https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Multimodal_Co-Attention_Transformer_for_Survival_Prediction_in_Gigapixel_Whole_Slide_ICCV_2021_paper.html), ICCV, 2021 - -[PET-Guided Attention Network for Segmentation of Lung Tumors from PET/CT Images](https://rdcu.be/c8WWl), GCPR 2020 [[code]](https://github.com/pvk95/PAG) - -[Pathomic Fusion: An Integrated Framework for Fusing Histopathology and Genomic Features for Cancer Diagnosis and Prognosis](https://arxiv.org/abs/1912.08937), IEEE TMI, 2020 - -[Leveraging Medical Visual Question Answering with Supporting Facts](https://arxiv.org/abs/1905.12008), arXiv 2019 - -[Unsupervised Multimodal Representation Learning across Medical Images and Reports](https://arxiv.org/abs/1811.08615), ML4H 2018 - -[Multimodal Medical Image
Retrieval based on Latent Topic Modeling](https://aiforsocialgood.github.io/2018/pdfs/track1/75_aisg_neurips2018.pdf), ML4H 2018 - -[Improving Hospital Mortality Prediction with Medical Named Entities and Multimodal Learning](https://arxiv.org/abs/1811.12276), ML4H 2018 - -[Knowledge-driven Generative Subspaces for Modeling Multi-view Dependencies in Medical Data](https://arxiv.org/abs/1812.00509), ML4H 2018 - -[Multimodal Depression Detection: Fusion Analysis of Paralinguistic, Head Pose and Eye Gaze Behaviors](https://ieeexplore.ieee.org/document/7763752), TAC 2018 - -[Learning the Joint Representation of Heterogeneous Temporal Events for Clinical Endpoint Prediction](https://arxiv.org/abs/1803.04837), AAAI 2018 - -[Understanding Coagulopathy using Multi-view Data in the Presence of Sub-Cohorts: A Hierarchical Subspace Approach](http://mucmd.org/CameraReadySubmissions/67%5CCameraReadySubmission%5Cunderstanding-coagulopathy-multi%20(6).pdf), MLHC 2017 - -[Machine Learning in Multimodal Medical Imaging](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5357511/), 2017 - -[Cross-modal Recurrent Models for Weight Objective Prediction from Multimodal Time-series Data](https://arxiv.org/abs/1709.08073), ML4H 2017 - -[SimSensei Kiosk: A Virtual Human Interviewer for Healthcare Decision Support](https://dl.acm.org/citation.cfm?id=2617388.2617415), AAMAS 2014 - -[Dyadic Behavior Analysis in Depression Severity Assessment Interviews](https://dl.acm.org/citation.cfm?doid=2663204.2663238), ICMI 2014 - -[Audiovisual Behavior Descriptors for Depression Assessment](https://dl.acm.org/citation.cfm?doid=2522848.2522886), ICMI 2013 - -### Robotics - -[Detect, Reject, Correct: Crossmodal Compensation of Corrupted Sensors](https://arxiv.org/abs/2012.00201), ICRA 2021 - -[Multimodal sensor fusion with differentiable filters](https://arxiv.org/abs/2010.13021), IROS 2020 - -[Concept2Robot: Learning Manipulation Concepts from Instructions and Human 
Demonstrations](http://www.roboticsproceedings.org/rss16/p082.pdf), RSS 2020 - -[See, Feel, Act: Hierarchical Learning for Complex Manipulation Skills with Multi-sensory Fusion](https://robotics.sciencemag.org/content/4/26/eaav3123), Science Robotics 2019 - -[Early Fusion for Goal Directed Robotic Vision](https://arxiv.org/abs/1811.08824), IROS 2019 - -[Simultaneously Learning Vision and Feature-based Control Policies for Real-world Ball-in-a-Cup](https://arxiv.org/abs/1902.04706), RSS 2019 - -[Probabilistic Multimodal Modeling for Human-Robot Interaction Tasks](http://www.roboticsproceedings.org/rss15/p47.pdf), RSS 2019 - -[Making Sense of Vision and Touch: Self-Supervised Learning of Multimodal Representations for Contact-Rich Tasks](https://arxiv.org/abs/1810.10191), ICRA 2019 - -[Evolving Multimodal Robot Behavior via Many Stepping Stones with the Combinatorial Multi-Objective Evolutionary Algorithm -](https://arxiv.org/abs/1807.03392), arXiv 2018 - -[Multi-modal Predicate Identification using Dynamically Learned Robot Controllers](https://www.cs.utexas.edu/~pstone/Papers/bib2html-links/IJCAI18-saeid.pdf), IJCAI 2018 - -[Multimodal Probabilistic Model-Based Planning for Human-Robot Interaction](https://arxiv.org/abs/1710.09483), arXiv 2017 - -[Perching and Vertical Climbing: Design of a Multimodal Robot](https://ieeexplore.ieee.org/document/6907472), ICRA 2014 - -[Multi-Modal Scene Understanding for Robotic Grasping](http://kth.diva-portal.org/smash/get/diva2:459199/FULLTEXT01), 2011 - -[Strategies for Multi-Modal Scene Exploration](https://am.is.tuebingen.mpg.de/uploads_file/attachment/attachment/307/2010_IROS_bjbk_camred.pdf), IROS 2010 - -### Autonomous Driving - -[Deep Multi-modal Object Detection and Semantic Segmentation for Autonomous Driving: Datasets, Methods, and Challenges](https://arxiv.org/pdf/1902.07830.pdf), IEEE TITS 2020 [[website]](https://boschresearch.github.io/multimodalperception/) - -[nuScenes: A multimodal dataset for autonomous 
driving](https://openaccess.thecvf.com/content_CVPR_2020/papers/Caesar_nuScenes_A_Multimodal_Dataset_for_Autonomous_Driving_CVPR_2020_paper.pdf), CVPR 2020 [[dataset]](https://www.nuscenes.org/) - -[Multimodal End-to-End Autonomous Driving](https://arxiv.org/abs/1906.03199), arXiv 2020 - -### Finance - -[A Multimodal Event-driven LSTM Model for Stock Prediction Using Online News](https://ailab-ua.github.io/courses/resources/Qing_TKDE_2020.pdf), TKDE 2020 - -[Multimodal Deep Learning for Finance: Integrating and Forecasting International Stock Markets](https://arxiv.org/abs/1903.06478), 2019 - -[Multimodal deep learning for short-term stock volatility prediction](https://arxiv.org/abs/1812.10479), 2018 - -### Human AI Interaction - -[Multimodal Human Computer Interaction: A Survey](https://link.springer.com/chapter/10.1007/11573425_1), HCI 2005 - -[Affective multimodal human-computer interaction](https://dl.acm.org/doi/10.1145/1101149.1101299), Multimedia 2005 - -[Building a multimodal human-robot interface](https://ieeexplore.ieee.org/abstract/document/1183338?casa_token=tdKeY0Q0e-4AAAAA:XfKwp5Di1O5bCEOnebeaS58waSbWm80lxNuY8IhWW7DqDLvRQj-8ettJW1NrFrmoR_ShudTgzw), IEEE Intelligent Systems 2001 - -### Multimodal Content Generation - -[Non-Linear Consumption of Videos Using a Sequence of Personalized Multimodal Fragments](https://gaurav22verma.github.io/assets/papers/NonLinearConsumption.pdf), IUI 2021 - -[Generating Need-Adapted Multimodal Fragments](https://gaurav22verma.github.io/assets/MultimodalFragments.pdf), IUI 2020 - -# Workshops - -[Multimodal KDD 2023: International Workshop on Multimodal Learning](https://multimodal-kdd-2023.github.io), KDD 2023 - -[Multimodal Representation Learning: Perks and Pitfalls](https://mrl-workshop.github.io/iclr-2023/), ICLR 2023 - -[Social Intelligence in Humans and Robots](https://social-intelligence-human-ai.github.io/) @ ICRA 2021 - -[LANTERN 2021](https://www.lantern.uni-saarland.de/2021/): The Third Workshop Beyond Vision 
and LANguage: inTEgrating Real-world kNowledge @ EACL 2021 - -Multimodal workshops @ CVPR 2021: [Multimodal Learning and Applications](https://mula-workshop.github.io/), [Sight and Sound](http://sightsound.org/), [Visual Question Answering](https://visualqa.org/workshop), [Embodied AI](https://embodied-ai.org/), [Language for 3D Scenes](http://language3dscenes.github.io/). - -Multimodal workshops @ NAACL 2021: [MAI-Workshop](http://multicomp.cs.cmu.edu/naacl2021multimodalworkshop/), [ALVR](https://alvr-workshop.github.io/), [ViGIL](https://vigilworkshop.github.io/). - -ICLR 2021 workshop on [Embodied Multimodal Learning](https://eml-workshop.github.io/). - -NeurIPS 2020 workshop on [Wordplay: When Language Meets Games](https://wordplay-workshop.github.io/). - -ACL 2020 workshops on [Multimodal Language](http://multicomp.cs.cmu.edu/acl2020multimodalworkshop/) [(proceedings)](https://www.aclweb.org/anthology/volumes/2020.challengehml-1/) and [Advances in Language and Vision Research](https://alvr-workshop.github.io/). - -Multimodal workshops @ ECCV 2020: [EVAL](https://askforalfred.com/EVAL/), [CAMP](https://camp-workshop.stanford.edu/), and [MVA](https://sites.google.com/view/multimodalvideo-v2). 
- -[Multi-Modal Video Reasoning and Analyzing Competition](https://sutdcv.github.io/multi-modal-video-reasoning), ICCV 2021 - -[Grand Challenge and Workshop on Human Multimodal Language](http://multicomp.cs.cmu.edu/acl2020multimodalworkshop/), ACL 2020, ACL 2018 - -[Advances in Language and Vision Research](https://alvr-workshop.github.io/), ACL 2020 - -[Visually Grounded Interaction and Language](https://vigilworkshop.github.io/), NeurIPS 2019, NeurIPS 2018 - -[Emergent Communication: Towards Natural Language](https://sites.google.com/view/emecom2019), NeurIPS 2019 - -[Workshop on Multimodal Understanding and Learning for Embodied Applications](https://sites.google.com/view/mulea2019/home), ACM Multimedia 2019 - -[Beyond Vision and Language: Integrating Real-World Knowledge](https://www.lantern.uni-saarland.de/), EMNLP 2019 - -[The How2 Challenge: New Tasks for Vision & Language](https://srvk.github.io/how2-challenge/), ICML 2019 - -[Visual Question Answering and Dialog](https://visualqa.org/workshop.html), CVPR 2019, CVPR 2017 - -[Multi-modal Learning from Videos](https://sites.google.com/view/mmlv/home), CVPR 2019 - -[Multimodal Learning and Applications Workshop](https://mula-workshop.github.io/), CVPR 2019, ECCV 2018 - -[Habitat: Embodied Agents Challenge and Workshop](https://aihabitat.org/workshop/), CVPR 2019 - -[Closing the Loop Between Vision and Language & LSMD Challenge](https://sites.google.com/site/iccv19clvllsmdc/), ICCV 2019 - -[Multi-modal Video Analysis and Moments in Time Challenge](https://sites.google.com/view/multimodalvideo/), ICCV 2019 - -[Cross-Modal Learning in Real World](https://cromol.github.io/), ICCV 2019 - -[Spatial Language Understanding and Grounded Communication for Robotics](https://splu-robonlp.github.io/), NAACL 2019 - -[YouTube-8M Large-Scale Video Understanding](https://research.google.com/youtube8m/workshop2018/), ICCV 2019, ECCV 2018, CVPR 2017 - -[Language and Vision Workshop](http://languageandvision.com/), CVPR 2019, 
CVPR 2018, CVPR 2017, CVPR 2015 - -[Sight and Sound](http://sightsound.org/), CVPR 2019, CVPR 2018 - -[The Large Scale Movie Description Challenge (LSMDC)](https://sites.google.com/site/describingmovies/), ICCV 2019, ICCV 2017 - -[Wordplay: Reinforcement and Language Learning in Text-based Games](https://www.wordplay2018.com/), NeurIPS 2018 - -[Interpretability and Robustness in Audio, Speech, and Language](https://irasl.gitlab.io/), NeurIPS 2018 - -[Multimodal Robot Perception](https://natanaso.github.io/rcw-icra18/), ICRA 2018 - -[WMT18: Shared Task on Multimodal Machine Translation](http://www.statmt.org/wmt18/multimodal-task.html), EMNLP 2018 - -[Shortcomings in Vision and Language](https://sites.google.com/view/sivl/), ECCV 2018 - -[Computational Approaches to Subjectivity, Sentiment and Social Media Analysis](https://wt-public.emm4u.eu/wassa2018/), EMNLP 2018, EMNLP 2017, NAACL-HLT 2016, EMNLP 2015, ACL 2014, NAACL-HLT 2013 - -[Visual Understanding Across Modalities](http://vuchallenge.org/), CVPR 2017 - -[International Workshop on Computer Vision for Audio-Visual Media](https://cvavm2017.wordpress.com/), ICCV 2017 - -[Language Grounding for Robotics](https://robo-nlp.github.io/2017_index.html), ACL 2017 - -[Computer Vision for Audio-visual Media](https://cvavm2016.wordpress.com/), ECCV 2016 - -[Language and Vision](https://vision.cs.hacettepe.edu.tr/vl2016/), ACL 2016, EMNLP 2015 - -# Tutorials - -[Tutorial on MultiModal Machine Learning](https://cmu-multicomp-lab.github.io/mmml-tutorial/icml2023/), ICML 2023, CVPR 2022, NAACL 2022 - -[Recent Advances in Vision-and-Language Research](https://rohit497.github.io/Recent-Advances-in-Vision-and-Language-Research/), CVPR 2020 - -[Connecting Language and Vision to Actions](https://lvatutorial.github.io/), ACL 2018 - -[Machine Learning for Clinicians: Advances for Multi-Modal Health Data](https://www.michaelchughes.com/mlhc2018_tutorial.html), MLHC 2018 - -[Multimodal Machine 
Learning](https://sites.google.com/site/multiml2016cvpr/), ACL 2017, CVPR 2016, ICMI 2016 - -[Vision and Language: Bridging Vision and Language with Deep Learning](https://www.microsoft.com/en-us/research/publication/vision-language-bridging-vision-language-deep-learning/), ICIP 2017 - -# Courses - -[CMU 11-777 Multimodal Machine Learning](https://cmu-multicomp-lab.github.io/mmml-course/fall2022/) - -[CMU 11-877 Advanced Topics in Multimodal Machine Learning](https://cmu-multicomp-lab.github.io/adv-mmml-course/spring2023/) - -[CMU 05-618, Human-AI Interaction](https://haiicmu.github.io/) - -[CMU 11-777, Advanced Multimodal Machine Learning](https://piazza.com/cmu/fall2018/11777/resources) - -[Stanford CS422: Interactive and Embodied Learning](http://cs422interactive.stanford.edu/) - -[CMU 16-785, Integrated Intelligence in Robotics: Vision, Language, and Planning](http://www.cs.cmu.edu/~jeanoh/16-785/) - -[CMU 10-808, Language Grounding to Vision and Control](https://katefvision.github.io/LanguageGrounding/) - -[CMU 11-775, Large-Scale Multimedia Analysis](https://sites.google.com/a/is.cs.cmu.edu/lti-speech-classes/11-775-large-scale-multimedia-analysis) - -[MIT 6.882, Embodied Intelligence](https://phillipi.github.io/6.882/) - -[Georgia Tech CS 8803, Vision and Language](http://www.prism.gatech.edu/~arjun9/CS8803_CVL_Fall17/) - -[Virginia Tech CS 6501-004, Vision & Language](http://www.cs.virginia.edu/~vicente/vislang/) \ No newline at end of file diff --git a/docs/roadmap.md b/docs/roadmap.md deleted file mode 100644 index 1324e1e..0000000 --- a/docs/roadmap.md +++ /dev/null @@ -1,139 +0,0 @@ - -**[Zeta's 3-Step Master Plan for Perfecting Multi-Modality LLMs]** - ---- - -**1. Refinement and Excellence: Perfecting the Framework** - - **[Objective]**: To develop Zeta into the most sophisticated, yet intuitively simple framework for building Multi-Modality LLMs. 
 - - **[Strategies]** - - **Zeta Innovation Labs**: - * Create a dedicated team of experts who exclusively focus on refining the foundational modules and blocks. - * Prioritize research in areas like advanced self-supervised learning, multi-modal integration, and zero-shot learning. - - **Modularity Focus**: - * Develop plug-and-play modules that allow developers to effortlessly incorporate various data types (text, image, video, audio) into their LLMs. - * Standardize the blocks, ensuring consistent performance, error-handling, and interoperability. - - **Performance Optimization**: - * Collaborate with hardware manufacturers to ensure that Zeta is perfectly optimized for cutting-edge GPUs, TPUs, and other specialized hardware. - * Roll out regular updates to keep the framework at the forefront of performance. - ---- - -**2. User-Centric Development: Making Zeta Intuitive** - - **[Objective]**: Ensure that every feature, tool, and module in Zeta aligns with the principle of making LLM creation simpler and more efficient. - - - **[Strategies]** - - **Zeta Academy**: - * Host frequent workshops and webinars targeted at educating users on harnessing the power of Zeta's multi-modality LLM features. - * Create a vast library of tutorials, ranging from beginner to advanced, with real-world examples of LLM implementation. - - **Interactive GUI for LLM Design**: - * Develop a visual interface where users can drag-and-drop modules, visualize their LLM architecture, and see real-time performance metrics. - - **Feedback Loops**: - * Create a robust system to collect and implement feedback. Users should feel like they're co-creating Zeta. - * Launch a beta program where selected developers can test new features and provide insights. - ---- - -**3. Scaling and Outreach: From the Labs to the World** - - **[Objective]**: Make Zeta the de facto choice for developers worldwide aiming to craft state-of-the-art Multi-Modality LLMs.
- - - **[Strategies]** - - **Zeta Ambassadors**: - * Identify and collaborate with top AI researchers and practitioners globally, making them the face and voice of Zeta in their communities. - - **Strategic Partnerships**: - * Work closely with major tech institutions, universities, and platforms to integrate Zeta into their curriculum or platforms. - * Create an API gateway for seamless integration of Zeta with other popular machine learning and data processing platforms. - - **Global Challenges & Competitions**: - * Organize worldwide LLM challenges, where developers use Zeta to solve real-world problems, bringing attention to both the problems and the capabilities of Zeta. - ---- - - -In every tool, in every line of code, in every module of Zeta, you'll find our relentless pursuit of excellence. But remember, at its core, - -Zeta isn't about us, - -it's about you, the creator. - -It's about giving you the power, the simplicity, and the edge to redefine the boundaries of what's possible. - -With Zeta, weโ€™re not just building a tool; we're crafting the future. - -A future we're eager to see through your eyes. - - - - ------- - - - - - - - - - - - - - - - - - - - - - - - -**[Zeta's 3-Step Master Plan]** - -**1. Cultivate an Ecosystem of Innovation** - - **[Objective]**: Establish an environment where creativity and innovation are paramount. - - - **[Strategies]** - - **Education & Outreach**: - * Launch a series of free online courses, workshops, and webinars to educate developers on the capabilities and advantages of Zeta. - * Partner with top universities and institutions, offering them early access and integrations, fostering a new generation of developers natively trained on Zeta. - - **Zeta Labs**: - * Open a research lab committed to pushing the boundaries of what neural networks can achieve. - * Provide grants, resources, and mentorship to promising projects and startups that choose to build with Zeta. 
 - - **Open Source Philosophy**: - * Release parts of Zeta's core codebase to the public, inviting developers worldwide to contribute, refine, and expand upon the framework. - * Organize hackathons and coding challenges to galvanize the community around real-world problems that Zeta can solve. - ---- - -**2. Seamless Integration & Scalability** - - **[Objective]**: Make Zeta the easiest, most efficient, and most scalable framework to integrate into any project or system. - - - **[Strategies]** - - **Developer Toolkits**: - * Release a suite of tools, plugins, and libraries for all major development platforms and languages, ensuring Zeta is accessible to everyone, everywhere. - - **Zeta Cloud**: - * Offer a cloud solution that allows developers to run, test, and deploy their neural networks seamlessly. This ensures businesses of all sizes can scale without friction. - - **Partnerships**: - * Collaborate with major tech companies, ensuring Zeta's native support on platforms like AWS, Google Cloud, and Azure. - * Establish alliances with hardware manufacturers, optimizing Zeta for the latest GPUs and Neural Network Processors. - ---- - -**3. Build a Community and Cultivate Trust** - - **[Objective]**: Establish Zeta as more than a tool – it should be a movement, a community of forward-thinkers who believe in redefining the boundaries of neural network capabilities. - - - **[Strategies]** - - **ZetaCon**: - * Annually host a global conference (both offline and online) bringing together the brightest minds in the AI and machine learning sector. It will be a platform for networking, knowledge-sharing, and showcasing the best of what's been built using Zeta. - - **Transparency Reports**: - * Release regular updates about Zeta's development, challenges, successes, and roadmap. - * Actively gather feedback, ensuring the community feels heard and that their insights are valued.
 - - **Zeta Academy**: - * Create a platform where developers can share their projects, tutorials, and courses about Zeta. Recognize and reward the best contributions to foster a sense of ownership and pride within the community. - ---- - -This isn't just a roadmap. It's our promise, our commitment. Because at the end of the day, it's not about the lines of code we write. It's about the lives we change, the innovations we inspire, and the future we create. And with Zeta, we believe that future is brighter than ever. Let's build it together. - - diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css deleted file mode 100644 index be1c98e..0000000 --- a/docs/stylesheets/extra.css +++ /dev/null @@ -1,4 +0,0 @@ -:root { - --md-primary-fg-color: #8315F9; - --md-accent-fg-color: #00FFCE; - } \ No newline at end of file diff --git a/docs/zeta/.DS_Store b/docs/zeta/.DS_Store deleted file mode 100644 index 95f7139..0000000 Binary files a/docs/zeta/.DS_Store and /dev/null differ diff --git a/docs/zeta/index.md b/docs/zeta/index.md deleted file mode 100644 index 6ae54af..0000000 --- a/docs/zeta/index.md +++ /dev/null @@ -1,59 +0,0 @@ -The Zeta framework provides developers with the ability to create State of The Art Models as simply and seamlessly as possible through **Modularity**, **Reliability**, **Use-Ability**, and **Speed** - -Zeta not only helps developers harness the potential of LLMs and Multi-Modal Foundation Models but also enforces trust boundaries, schema validation, and tool activity-level permissions. By doing so, Zeta maximizes LLMs' reasoning while adhering to strict policies regarding their capabilities. - -Zeta's design philosophy is based on the following tenets: - -1. **Use-Ability**: Utilizing Zeta should feel like going for a swim in the ocean, seamless and fluid with pythonic methods and classes and error handling that signifies what steps to take next. -2.
**Reliability**: Zeta puts every FLOP to work by harnessing ultra-reliable and high-performance designs for all functions and classes -3. **Speed**: Zeta is like the Lamborghini of ML frameworks with simply unparalleled speed. - -## Quick Starts - -### Using pip - -Install **zeta** - -``` -pip3 install zetascale -``` - -## Unleash FlashAttention -With Zeta, you can unleash the best and highest performance attention mechanisms like `FlashAttention` and `MultiQueryAttention`. Here's an example with Flash Attention: - -```python -import torch -from zeta import FlashAttention - -q = torch.randn(2, 4, 6, 8) -k = torch.randn(2, 4, 10, 8) -v = torch.randn(2, 4, 10, 8) - -attention = FlashAttention(causal=False, dropout=0.1, flash=False) -output = attention(q, k, v) - -print(output.shape) -``` - -## Unleash GPT-4 -On top of the SOTA Attention mechanisms we provide, we also provide rough implementations of some of the best neural nets ever made, like `GPT4`. Here's an example of how to utilize our implementation of GPT-4: - -```python -import torch -from zeta import GPT4, GPT4MultiModal - -#text -text = torch.randint(0, 256, (1, 1024)).cuda() -img = torch.randn(1, 3, 256, 256) - -gpt4_language = GPT4() - -gpt4_language(text) - -#multimodal GPT4 - -gpt4_multimodal = GPT4MultiModal() -gpt4_multimodal_output = gpt4_multimodal(text, img) - -``` - diff --git a/docs/zeta/nn/architecture/decoder.md b/docs/zeta/nn/architecture/decoder.md deleted file mode 100644 index 3fcf811..0000000 --- a/docs/zeta/nn/architecture/decoder.md +++ /dev/null @@ -1,103 +0,0 @@ -# Decoder Class Documentation - -Module/Class Name: Decoder - -```python -class Decoder(AttentionLayers): - def __init__(self, **kwargs): - assert 'causal' not in kwargs, 'cannot set causality on decoder' - super().__init__(causal=True, **kwargs) -``` - -## Overview and Introduction - -The `Decoder` class is a component of the Zeta library designed for creating a decoder model with multiple attention layers.
It extends the functionality of the `AttentionLayers` class to enable the construction of a decoder architecture. The decoder is a key component in various sequence-to-sequence tasks, such as machine translation, text generation, and more. - -The decoder employs multi-head self-attention mechanisms and feed-forward networks to transform input sequences into meaningful output sequences while maintaining the causal property. It is particularly suitable for autoregressive tasks, where each step depends only on previous steps in the sequence. - -## Class Definition - -```python -class Decoder(AttentionLayers): - def __init__(self, **kwargs): - assert 'causal' not in kwargs, 'cannot set causality on decoder' - super().__init__(causal=True, **kwargs) -``` - -The `Decoder` class inherits from the `AttentionLayers` class and introduces the causality constraint by setting `causal=True`. It is initialized with various parameters that configure the architecture and behavior of the decoder. - -## Parameters - -The `Decoder` class constructor accepts various parameters that control the behavior of the decoder. The most important parameters are inherited from the `AttentionLayers` class, and additional parameters specific to the decoder are introduced. Below is a summary of the parameters: - -- `dim` (int): Dimensionality of the model. -- `depth` (int): Number of decoder layers. -- `heads` (int): Number of parallel attention heads. -- `cross_attend` (bool): Enable cross-attention between input and output sequences. -- `sandwich_coef` (int): Coefficient for configuring sandwich normalization. -- `residual_attn` (bool): Enable residual connection for self-attention layers. -- `cross_residual_attn` (bool): Enable residual connection for cross-attention layers. -- `layer_dropout` (float): Dropout probability applied to each layer. -- ... 
(additional parameters inherited from `AttentionLayers`) - -## Functionality and Usage - -The `Decoder` class extends the functionality of the `AttentionLayers` class to specifically create decoder models. It employs multi-head self-attention mechanisms and feed-forward networks to process input sequences and generate output sequences. - -### Initialization - -To create a decoder instance, you can use the following code: - -```python -from zeta import Decoder - -decoder = Decoder( - dim=512, - depth=6, - heads=8, - causal=True, - cross_attend=True, - residual_attn=True, - layer_dropout=0.1 -) -``` - -### Forward Pass - -The forward pass of the decoder can be performed using the following code: - -```python -output = decoder(input_sequence, context=context_sequence, mask=mask_sequence, context_mask=context_mask_sequence) -``` - -Here, `input_sequence` represents the input sequence to the decoder, `context_sequence` represents the context sequence for cross-attention (if enabled), `mask_sequence` is an optional mask to ignore certain elements in the input, and `context_mask_sequence` is an optional mask for the context sequence. - -### Return Intermediates - -If desired, you can also obtain intermediate outputs at each layer using the `return_hiddens` parameter: - -```python -output, intermediates = decoder(input_sequence, context=context_sequence, mask=mask_sequence, context_mask=context_mask_sequence, return_hiddens=True) -``` - -The `intermediates` object will contain information about intermediate hidden states and attention outputs for each layer. - -## Mathematical Formula - -The `Decoder` class is built upon the foundation of multi-head self-attention and feed-forward networks. It can be summarized using the following mathematical formula: - -1. Input Embedding: \( X \) -2. Multi-Head Self-Attention: \( A = \text{MultiHeadAttention}(X) \) -3. Feed-Forward Network: \( Y = \text{FeedForward}(A) \) -4. 
Residual Connection: \( Z = X + Y \) - -The above formula represents the basic forward pass of each layer in the decoder. The decoder iteratively applies these operations across its layers to generate meaningful output sequences while maintaining causal dependencies. - -## References - -- [Zeta Library Documentation](https://example.com/zeta/docs) -- [Attention Is All You Need](https://arxiv.org/abs/1706.03762) -- [PAR: Prompted Attention](https://arxiv.org/abs/2207.04503) -``` - -This documentation provides an in-depth overview of the `Decoder` class in the Zeta library. It covers its purpose, parameters, usage examples, and includes a simplified mathematical formula to illustrate its functionality. \ No newline at end of file diff --git a/docs/zeta/nn/architecture/transformer.md b/docs/zeta/nn/architecture/transformer.md deleted file mode 100644 index c2bb20a..0000000 --- a/docs/zeta/nn/architecture/transformer.md +++ /dev/null @@ -1,140 +0,0 @@ -# Transformer Documentation - -## Overview - -The `Transformer` class in the Zeta library is a versatile deep learning architecture that combines attention mechanisms with feedforward neural networks for various natural language processing tasks, such as language modeling, machine translation, and text generation. The Transformer architecture was introduced in the paper "Attention is All You Need" by Vaswani et al. - -The main purpose of the `Transformer` class is to provide a flexible and configurable interface for creating transformer-based models for sequence-to-sequence tasks. The class allows users to specify the number of tokens, maximum sequence length, attention layers, embeddings, and other parameters necessary for creating and training transformer models. - -The Transformer class supports both autoregressive and non-autoregressive training settings and includes features such as relative positional biases, rotary positional embeddings, memory tokens, and more. 
- -## Class Signature - -```python -class Transformer(nn.Module): - def __init__( - self, - *, - num_tokens, - max_seq_len, - attn_layers, - embedding_provider: BaseEmbedding, - emb_dim = None, - max_mem_len = 0., - shift_mem_down = 0, - emb_dropout = 0., - post_emb_norm = False, - num_memory_tokens = None, - tie_embedding = False, - logits_dim = None, - use_abs_pos_emb = True, - scaled_sinu_pos_emb = False, - l2norm_embed = False, - emb_frac_gradient = 1. - ) -``` - -## Parameters - -- `num_tokens` (int): The total number of tokens in the vocabulary. -- `max_seq_len` (int): The maximum length of the input sequences. -- `attn_layers` (AttentionLayers): An instance of the `AttentionLayers` class representing the core attention layers of the transformer. -- `embedding_provider` (BaseEmbedding): An instance of the `BaseEmbedding` class providing token embeddings. -- `emb_dim` (int, optional): The embedding dimension. Default is `None`, in which case `emb_dim` is set to the same dimension as the `attn_layers`. -- `max_mem_len` (float, optional): Maximum memory length for memory tokens. Default is `0.0`, indicating no memory tokens. -- `shift_mem_down` (int, optional): Number of positions to shift memory tokens down in each layer. Default is `0`. -- `emb_dropout` (float, optional): Dropout rate applied to the embedding layer. Default is `0.0`. -- `post_emb_norm` (bool, optional): Apply layer normalization to the post-embedding inputs. Default is `False`. -- `num_memory_tokens` (int, optional): Number of memory tokens to use. Default is `None`, indicating no memory tokens. -- `tie_embedding` (bool, optional): Tie the output projection weights with the input token embeddings. Default is `False`. -- `logits_dim` (int, optional): Dimensionality of the output logits. Default is `None`, indicating that it's the same as `num_tokens`. -- `use_abs_pos_emb` (bool, optional): Use absolute positional embeddings. Default is `True`. 
-- `scaled_sinu_pos_emb` (bool, optional): Use scaled sinusoidal positional embeddings. Default is `False`. -- `l2norm_embed` (bool, optional): Apply L2 normalization to the embeddings. Default is `False`. -- `emb_frac_gradient` (float, optional): Fraction of the gradient that should go to the embedding. Default is `1.0`. - -## Methods - -### `forward` - -```python -def forward( - self, - x, - return_embeddings = False, - return_logits_and_embeddings = False, - return_intermediates = False, - mask = None, - return_mems = False, - return_attn = False, - mems = None, - pos = None, - prepend_embeds = None, - sum_embeds = None, - **kwargs -) -``` - -This method computes the forward pass of the transformer. - -#### Parameters - -- `x` (torch.Tensor): Input tensor representing the sequence of token indices. -- `return_embeddings` (bool, optional): If `True`, return only the embeddings without applying the output projection. Default is `False`. -- `return_logits_and_embeddings` (bool, optional): If `True`, return both the logits and embeddings. Default is `False`. -- `return_intermediates` (bool, optional): If `True`, return intermediate attention values. Default is `False`. -- `mask` (torch.Tensor, optional): Attention mask indicating positions to be masked. Default is `None`. -- `return_mems` (bool, optional): If `True`, return updated memory tokens. Default is `False`. -- `return_attn` (bool, optional): If `True`, return attention maps. Default is `False`. -- `mems` (list of torch.Tensor, optional): Memory tokens for each layer. Default is `None`. -- `pos` (torch.Tensor, optional): External positional embeddings. Default is `None`. -- `prepend_embeds` (torch.Tensor, optional): Prepend embeddings to the input sequence. Default is `None`. -- `sum_embeds` (torch.Tensor, optional): Sum external embeddings to the input sequence. Default is `None`. -- `kwargs`: Additional keyword arguments passed to the attention layers. 
- -#### Returns - -The method returns the output logits or embeddings based on the specified return options. - -## Usage Examples - -Here are three usage examples of the `Transformer` class from the Zeta library: - -```python -from zeta.nn import Transformer - -# Example 1: Basic Usage -transformer = Transformer( - num_tokens=10000, - max_seq_len=256, - attn_layers=attn_layers_instance, - embedding_provider=embedding_provider_instance -) -logits = transformer(input_tokens) - -# Example 2: Return Embeddings -embeddings = transformer(input_tokens, return_embeddings=True) - -# Example 3: Return Intermediate Attention Maps -logits, attn_maps = transformer(input_tokens, return_attn=True) -``` - -In these examples, replace `attn_layers_instance` and `embedding_provider_instance` with actual instances of `AttentionLayers` and `BaseEmbedding`, respectively, and `input_tokens` with your input tensor containing token indices. - -## Mathematical Formula - -The mathematical formula for the `Transformer` class can be represented as follows: - -``` -Input -> Embedding -> Post-embedding Norm -> Embedding Dropout -> Project Embedding -> Attention Layers -> Layer Normalization -> To Logits/Embeddings -``` - -In this formula, "Attention Layers" represents the core attention mechanism of the transformer, which includes self-attention and feedforward neural networks. - -## References - -- Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., ... & Polosukhin, I. (2017). Attention is All You Need. Advances in neural information processing systems, 30. -- Zeta Library: Link to the official documentation of the Zeta library. -- Insert any additional references or resources as needed. 
-``` - diff --git a/docs/zeta/nn/attention/base.md b/docs/zeta/nn/attention/base.md deleted file mode 100644 index 41dfe49..0000000 --- a/docs/zeta/nn/attention/base.md +++ /dev/null @@ -1,90 +0,0 @@ -# BaseAttention Abstract Class -============================ - -Theย `BaseAttention`ย class is an abstract base class that defines the interface for all attention mechanisms. It includes the basic structure and methods that all attention mechanisms should have. - -```python -from abc import abstractmethod -import torch.nn as nn - -class BaseAttention(nn.Module): - @abstractmethod - def __init__(self, dim): - super().__init__() - self.dim = dim - - - @abstractmethod - def forward(self, x, context=None, mask=None): - pass -``` - - -## Usage ------------------------ - -Theย `FlashAttentionTwo`ย class extends theย `BaseAttention`ย abstract base class and implements the specific attention mechanism. - -```python -class FlashAttentionTwo(BaseAttention): - def __init__( - self, - *, - dim, - heads = 8, - dim_head = 64, - causal = False, - q_bucket_size = 512, - k_bucket_size = 1024, - parallel = False, - mixed_precision = False - ): - super().__init__(dim, heads, dim_head) - self.causal = causal - self.parallel = parallel - self.mixed_precision = mixed_precision - self.q_bucket_size = q_bucket_size - self.k_bucket_size = k_bucket_size - # ... rest of the implementation ... - - def forward( - self, - x, - context = None, - mask = None, - q_bucket_size = None, - k_bucket_size = None, - ): - # ... implementation of the forward method ... -``` - - -## Rules for Using the BaseAttention Class ---------------------------------------- - -1. Any class that extends theย `BaseAttention`ย class must implement theย `forward`ย method. This method defines how the attention mechanism operates. - -2. Theย `__init__`ย method of theย `BaseAttention`ย class takes three parameters:ย `dim`,ย `heads`, andย `dim_head`. 
Any class that extendsย `BaseAttention`ย should pass these parameters to theย `__init__`ย method of the base class. - -3. Theย `forward`ย method of theย `BaseAttention`ย class takes three parameters:ย `x`,ย `context`, andย `mask`. Any class that extendsย `BaseAttention`ย should include these parameters in itsย `forward`ย method. - ---- - -## Example of Using the FlashAttentionTwo Class --------------------------------------------- - -```python -from zeta import FlashAttentionTwo - -# Create an instance of the FlashAttentionTwo class -attention = FlashAttentionTwo(dim=512, heads=8, dim_head=64) - -# Create some input data -x = torch.randn(1, 10, 512) - -# Apply the attention mechanism -out = attention(x) -``` - - -In this example, we first create an instance of theย `FlashAttentionTwo`ย class. We then create some input dataย `x`ย and apply the attention mechanism to this data by calling theย `forward`ย method of theย `attention`ย instance. \ No newline at end of file diff --git a/docs/zeta/nn/attention/flash2.md b/docs/zeta/nn/attention/flash2.md deleted file mode 100644 index d1fbee2..0000000 --- a/docs/zeta/nn/attention/flash2.md +++ /dev/null @@ -1,155 +0,0 @@ -# Module Name: FlashAttentionTwo - -Theย `FlashAttentionTwo`ย class is a PyTorch module that implements a variant of the attention mechanism, which is a key component in many state-of-the-art models in natural language processing and other fields. This class is designed to be memory-efficient and optionally supports parallel computation and mixed precision for improved performance. - -## Class Definition ----------------- - -```python -class FlashAttentionTwo(nn.Module): - def __init__( - self, - *, - dim, - heads = 8, - dim_head = 64, - causal = False, - q_bucket_size = 512, - k_bucket_size = 1024, - parallel = False, - mixed_precision = False - ): -``` - ---- - -### Parameters - -- `dim`ย (int): The dimensionality of the input data. -- `heads`ย (int, optional): The number of attention heads. 
Default is 8. -- `dim_head`ย (int, optional): The dimensionality of each attention head. Default is 64. -- `causal`ย (bool, optional): If True, the attention mechanism is causal. Default is False. -- `q_bucket_size`ย (int, optional): The bucket size for the query in the attention mechanism. Default is 512. -- `k_bucket_size`ย (int, optional): The bucket size for the key in the attention mechanism. Default is 1024. -- `parallel`ย (bool, optional): If True, the computation is performed in parallel across multiple GPUs. Default is False. -- `mixed_precision`ย (bool, optional): If True, the computation is performed in mixed precision for improved performance. Default is False. - ------ - -### Methods - -#### `forward` - -``` -def forward( - self, - x, - context = None, - mask = None, - q_bucket_size = None, - k_bucket_size = None, -): -``` - -Performs the forward pass of the attention mechanism. - -##### Parameters - -- `x`ย (Tensor): The input data. -- `context`ย (Tensor, optional): The context for the attention mechanism. If not provided, the input dataย `x`ย is used as the context. -- `mask`ย (Tensor, optional): An optional mask for the attention mechanism. -- `q_bucket_size`ย (int, optional): The bucket size for the query in the attention mechanism. If not provided, the value specified during initialization is used. -- `k_bucket_size`ย (int, optional): The bucket size for the key in the attention mechanism. If not provided, the value specified during initialization is used. - ---- - -##### Returns - -- `out`ย (Tensor): The output of the attention mechanism. 
- - -## Usage Examples --------------- - -### Example 1: Basic Usage - -```python -from torch import nn -from zeta import FlashAttentionTwo - -model = FlashAttentionTwo(dim=512) -x = torch.randn(1, 10, 512) -out = model(x) -``` - -Copy code - -### Example 2: Using a Mask - -```python -from torch import nn -from zeta import FlashAttentionTwo - -model = FlashAttentionTwo(dim=512) -x = torch.randn(1, 10, 512) -mask = torch.ones(1, 10) -out = model(x, mask=mask) -``` - ----- - -### Example 3: Using a Context - -```python -from torch import nn -from zeta import FlashAttentionTwo - -model = FlashAttentionTwo(dim=512) -x = torch.randn(1, 10, 512) -context = torch.randn(1, 10, 512) -out = model(x, context=context) -``` - - -## Mathematical Formula --------------------- - -The attention mechanism can be described by the following formula: - -![Attention Formula](https://wikimedia.org/api/rest_v1/media/math/render/svg/0de1e8f5c8f6e3c3e1f8b3c89a6a2b7b187a5d3f) - -where Q, K, and V are the query, key, and value, respectively. The softmax function ensures that the weights sum to 1, and the dot product of the weights and the value gives the output of the attention mechanism. - - -### Additional Information ----------------------- - -Theย `FlashAttentionTwo`ย class is designed to be memory-efficient and optionally supports parallel computation and mixed precision for improved performance. - -- Theย `parallel`ย parameter allows the computation to be performed in parallel across multiple GPUs. This can significantly speed up the computation for large models or large datasets. - -- Theย `mixed_precision`ย parameter allows the computation to be performed in mixed precision. This means that some operations are performed in lower precision (e.g., float16) and some in higher precision (e.g., float32). This can significantly speed up the computation and reduce memory usage on modern GPUs that support mixed precision. 
- -- Theย `q_bucket_size`ย andย `k_bucket_size`ย parameters control the bucket size for the query and key in the attention mechanism, respectively. These parameters can be used to trade off between memory usage and computational efficiency. Larger bucket sizes can be more memory-efficient but may also be slower. - -### Common Issues -------------- - -- If you encounter out-of-memory errors, you can try reducing theย `q_bucket_size`ย andย `k_bucket_size`ย parameters, or enabling mixed precision computation by settingย `mixed_precision=True`. - -- If you encounter slow computation, you can try increasing theย `q_bucket_size`ย andย `k_bucket_size`ย parameters, or enabling parallel computation by settingย `parallel=True`ย (if you have multiple GPUs available). - -### References and Resources ------------------------- - -- [Attention Is All You Need](https://arxiv.org/abs/1706.03762): This is the original paper that introduced the concept of attention in deep learning. - -- [PyTorch Documentation](https://pytorch.org/docs/stable/index.html): The official PyTorch documentation provides detailed information about the PyTorch library and its modules. - -- [Efficient Attention: Attention with Linear Complexities](https://arxiv.org/abs/1812.01243): This paper introduces the concept of bucketing in the attention mechanism to improve memory efficiency. - -- [Mixed Precision Training](https://arxiv.org/abs/1710.03740): This paper introduces the concept of mixed precision training, which can significantly speed up computation and reduce memory usage on modern GPUs. - -- [PyTorch Tutorials](https://pytorch.org/tutorials/): The official PyTorch tutorials provide many examples of how to use PyTorch for various tasks. 
- -- \ No newline at end of file diff --git a/docs/zeta/nn/attention/flash_attention.md b/docs/zeta/nn/attention/flash_attention.md deleted file mode 100644 index 27c06fb..0000000 --- a/docs/zeta/nn/attention/flash_attention.md +++ /dev/null @@ -1,105 +0,0 @@ -# FlashAttention - -The FlashAttention module performs efficient attention computations, specifically designed for leveraging hardware capabilities on certain NVIDIA GPUs. It offers the option to perform "flash" attention which can be computationally faster on specific GPU architectures. - ---- - -## Class Definition: - -```python -class FlashAttention(nn.Module): -``` - -### Parameters: - -- `causal` (bool, optional): Determines whether to apply causal masking. Default: False. -- `dropout` (float, optional): Dropout probability. Default: 0. -- `flash` (bool, optional): Whether to use flash attention. Requires PyTorch version 2.0 or above. Default: True. - ---- - -## Methods: - -### `__init__(self, causal=False, dropout=0., flash=True)` - -Initializes the FlashAttention module. - -### `get_mask(self, i, j, device)` - -Generates a mask for attention computation. - -#### Parameters: -- `i` (int): Length of the query sequence. -- `j` (int): Length of the key sequence. -- `device` (torch.device): Device to place the mask tensor. - -#### Returns: -- `torch.Tensor`: Mask tensor of shape `(i, j)`. - -### `flash_attn(self, q, k, v, mask=None, attn_bias=None)` - -Performs flash attention computation. - -#### Parameters: -- `q` (torch.Tensor): Query tensor of shape `(batch, heads, q_len, dim)`. -- `k` (torch.Tensor): Key tensor of shape `(batch, heads, k_len, dim)`. -- `v` (torch.Tensor): Value tensor of shape `(batch, heads, v_len, dim)`. -- `mask` (torch.Tensor, optional): Mask tensor of shape `(batch, heads, q_len, k_len)`. Default: None. -- `attn_bias` (torch.Tensor, optional): Attention bias tensor of shape `(batch, heads, q_len, k_len)`. Default: None. 
- -#### Returns: -- `torch.Tensor`: Output tensor of shape `(batch, heads, q_len, dim)`. - -### `forward(self, q, k, v, mask=None, attn_bias=None)` - -Performs the attention computation using einstein notation. - -#### Parameters: -- `q` (torch.Tensor): Query tensor of shape `(batch, heads, q_len, dim)`. -- `k` (torch.Tensor): Key tensor of shape `(batch, heads, k_len, dim)`. -- `v` (torch.Tensor): Value tensor of shape `(batch, heads, v_len, dim)`. -- `mask` (torch.Tensor, optional): Mask tensor of shape `(batch, heads, q_len, k_len)`. Default: None. -- `attn_bias` (torch.Tensor, optional): Attention bias tensor of shape `(batch, heads, q_len, k_len)`. Default: None. - -#### Returns: -- `torch.Tensor`: Attention output tensor. - ---- - -## Usage Examples: - -1. **Basic Usage**: -```python -from zeta.nn import FlashAttention -attn_module = FlashAttention() -output = attn_module(query_tensor, key_tensor, value_tensor) -``` - -2. **Using Flash Attention with Masking**: -```python -from zeta.nn import FlashAttention -attn_module = FlashAttention(flash=True) -mask = attn_module.get_mask(query_length, key_length, device) -output = attn_module(query_tensor, key_tensor, value_tensor, mask=mask) -``` - -3. **Using Causal Flash Attention with Dropout**: -```python -from zeta.nn import FlashAttention -attn_module = FlashAttention(causal=True, dropout=0.1, flash=True) -output = attn_module(query_tensor, key_tensor, value_tensor) -``` - ---- - -## Additional Tips: - -- The `FlashAttention` module is optimized for NVIDIA A100 GPUs. On these GPUs, using `flash=True` is recommended for faster computation. -- Ensure that PyTorch version is 2.0 or above when enabling flash attention. -- The mask generated using `get_mask` method is useful for attention computations where certain positions need to be masked out. 
- ---- - -## References: - -- Original Attention Mechanism: [Attention Is All You Need](https://arxiv.org/abs/1706.03762) \ No newline at end of file diff --git a/docs/zeta/nn/attention/multihead.md b/docs/zeta/nn/attention/multihead.md deleted file mode 100644 index 6646190..0000000 --- a/docs/zeta/nn/attention/multihead.md +++ /dev/null @@ -1,106 +0,0 @@ -# Multihead Attention Documentation for Zeta Library - -## Introduction - -`MultiheadAttention` is a module in the Zeta library that provides multi-head attention mechanism. This mechanism enables the model to focus on different parts of the input sequence simultaneously. It's widely used in models such as transformers for capturing various aspects of information in the input. - -## Purpose - -The purpose of the `MultiheadAttention` module is to allow joint information representation from different subspaces of the input sequence. This results in capturing a richer context when modeling sequences. - -## Architecture - -The `MultiheadAttention` class extends from the `nn.Module` base class. Internally, it uses linear transformations for keys, values, and queries (`k_proj`, `v_proj`, `q_proj`). These projections are wrapped using the `MultiwayWrapper`. It also utilizes layer normalization (`inner_attn_ln`) and optionally uses relative positional embeddings (`xpos`). - -## Class Definition - -```python -class zeta.nn.embeddings.MultiheadAttention(nn.Module): -``` - -### Parameters: -- `args`: General arguments passed for configuring the module. -- `embed_dim` (int): Total dimension of the model. -- `num_heads` (int): Number of parallel attention heads. The embed_dim will be split across num_heads. -- `dropout` (float): Dropout probability. Default: 0.0. -- `self_attention` (bool): Whether to apply self attention. Only one of `self_attention` or `encoder_decoder_attention` can be True. Default: False. -- `encoder_decoder_attention` (bool): Whether to apply encoder-decoder attention. 
Only one of `self_attention` or `encoder_decoder_attention` can be True. Default: False. -- `subln` (bool): If True, applies layer normalization after self attention. Default: False. - -### Methods: - -#### `reset_parameters()` -Reinitialize the parameters of the attention module. - -#### `forward(query, key, value, ...)` -Computes the forward pass of the attention mechanism. - -- Parameters: - - `query` (Tensor): The query tensor. - - `key` (Tensor): The key tensor. - - `value` (Tensor): The value tensor. - - Other arguments including `incremental_state`, `key_padding_mask`, `attn_mask`, `rel_pos`, and `is_first_step`. - -- Returns: - - `attn` (Tensor): The computed attention tensor. - - `attn_weights` (Tensor): The attention weights. - -### Mathematical Formulation: - -Given a query \( Q \), key \( K \), and value \( V \), the multihead attention mechanism is mathematically represented as: - -\[ \text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right) V \] - -Where \( d_k \) is the dimension of the key. - -## Usage Examples: - -### Example 1: Basic Usage - -```python -from zeta.nn.embeddings import MultiheadAttention -import torch - -args = ... # Some configuration -attention = MultiheadAttention(args, embed_dim=512, num_heads=8, dropout=0.1, self_attention=True) -query = torch.rand((32, 10, 512)) -key = torch.rand((32, 10, 512)) -value = torch.rand((32, 10, 512)) - -attn, attn_weights = attention(query, key, value) -``` - -### Example 2: With Masking - -```python -from zeta.nn.embeddings import MultiheadAttention -import torch - -args = ... 
# Some configuration -attention = MultiheadAttention(args, embed_dim=512, num_heads=8, dropout=0.1, self_attention=True) -query = torch.rand((32, 10, 512)) -key = torch.rand((32, 10, 512)) -value = torch.rand((32, 10, 512)) -attn_mask = torch.ones((10, 10)).triu_() * -1e9 # Upper triangular mask - -attn, attn_weights = attention(query, key, value, attn_mask=attn_mask) -``` - -### Example 3: Encoder-Decoder Attention - -```python -from zeta.nn.embeddings import MultiheadAttention -import torch - -args = ... # Some configuration -attention = MultiheadAttention(args, embed_dim=512, num_heads=8, dropout=0.1, encoder_decoder_attention=True) -query = torch.rand((32, 10, 512)) # Decoder query -key = torch.rand((32, 20, 512)) # Encoder key -value = torch.rand((32, 20, 512)) # Encoder value - -attn, attn_weights = attention(query, key, value) -``` - -## Additional Tips: -- For encoder-decoder attention, make sure the dimensions of the encoder and decoder tensors match the expected input sizes. -- Using masks can be helpful to prevent the attention mechanism from focusing on certain parts of the sequence, such as padding. diff --git a/docs/zeta/nn/attention/multiquery.md b/docs/zeta/nn/attention/multiquery.md deleted file mode 100644 index 68fc46e..0000000 --- a/docs/zeta/nn/attention/multiquery.md +++ /dev/null @@ -1,108 +0,0 @@ -# MultiQueryAttention - -## Overview and Introduction: - -The `MultiQueryAttention` class is a part of the Zeta library, designed to perform self-attention operations on given input data. Unlike traditional attention mechanisms that use a single query, this class leverages multiple queries to capture a broader range of context information. This class allows for various implementations of attention, including Flash, Triton, and Torch. It also provides the flexibility to choose normalization type, fully connected layer type, and offers debugging verbosity. 
- -## Class Definition: - -```python -class MultiQueryAttention(nn.Module): - """Multi-Query self attention. - Using torch or triton attention implementation enables the user to also use - additive bias. - """ -``` - -### Parameters: -- `d_model` (int): Dimension of the model. -- `heads` (int): Number of parallel attention heads. -- `attn_impl` (str, optional): Attention implementation type, can be either 'triton', 'flash', or 'torch'. Default is 'triton'. -- `clip_qkv` (Optional[float]): Clipping value for query, key, and value. If specified, qkv is clamped within the range [-clip_qkv, clip_qkv]. -- `qk_ln` (bool, optional): If True, layer normalization is applied to query and key. -- `softmax_scale` (Optional[float]): Scale for softmax. Default value is computed as 1/sqrt(head_dim). -- `attn_pdrop` (float, optional): Attention dropout probability. Default is 0.0. -- `norm_type` (str, optional): Normalization type, default is 'low_precision_layernorm'. -- `fc_type` (str, optional): Fully connected layer type, default is 'torch'. -- `verbose` (int, optional): Verbosity level, default is 0. -- `device` (Optional[str]): Device to which the tensors should be moved. - -## Functionality and Usage: - -The `MultiQueryAttention` class operates by using multiple queries to capture broader context information from given data. This is achieved through the forward method which computes the self-attention on the given inputs. - -### Method: `forward` -```python -def forward( - self, - x, - past_key_value=None, - bias=None, - mask=None, - causal=True, - needs_weights=False, -): -``` - -#### Parameters: - -- `x` (Tensor): Input tensor. -- `past_key_value` (Optional): Past key and value for attention computation. Default is None. -- `bias` (Optional): Additive bias for attention scores. Default is None. -- `mask` (Optional): Key padding mask. Default is None. -- `causal` (bool, optional): If True, a causal mask is applied to prevent information flow from future tokens. 
Default is True. -- `needs_weights` (bool, optional): If True, attention weights are also returned. Default is False. - -#### Returns: - -- `context` (Tensor): Contextualized tensor after attention computation. -- `attn_weights` (Tensor, Optional): Attention weights. Only returned if `needs_weights` is True. -- `past_key_value` (Tensor, Optional): New past key and value. - -## Usage Examples: - -1. Basic Usage: -```python -from zeta import MultiQueryAttention -import torch - -# Initialize the attention module -attention_layer = MultiQueryAttention(d_model=512, heads=8, attn_impl='torch') - -# Random input tensor -x = torch.rand(16, 10, 512) # Batch of 16, sequence length 10, embedding size 512 -output, attn_weights, _ = attention_layer(x) -``` - -2. Using Past Key and Value: -```python -past_key_value = (torch.rand(16, 8, 10, 64), torch.rand(16, 8, 10, 64)) # Past key and value for 8 heads -output, attn_weights, new_past_key_value = attention_layer(x, past_key_value=past_key_value) -``` - -3. With Causal Masking and Weights: -```python -output, attn_weights, _ = attention_layer(x, causal=True, needs_weights=True) -``` - -## Mathematical Formula: - -For the self-attention mechanism, the computation involves using multiple queries (\( Q \)), keys (\( K \)), and values (\( V \)): - -```latex -\[ \text{Attention}(Q, K, V) = \text{Softmax}\left(\frac{Q \times K^T}{\sqrt{d_k}} + \text{Bias}\right) \times V \] -``` -Where: -- \( Q \), \( K \), and \( V \) are the queries, keys, and values respectively. -- \( d_k \) is the dimension of the keys. -- Bias is the optional additive bias. - -## Additional Information and Tips: - -- It's crucial to select the correct attention implementation (`attn_impl`) based on your needs and the hardware you're running on. -- The `triton` implementation might be faster than `flash` but can use more memory. Ensure that you have adequate GPU memory if using `triton`. 
-- If using the `torch` implementation, it's advisable to check if CUDA is available for GPU acceleration. -- The clipping of qkv (`clip_qkv`) can be beneficial for stability in training. - -## References and Resources: -For a deeper understanding of the self-attention mechanism and its variants, you can refer to the "Attention is All You Need" paper by Vaswani et al., 2017. \ No newline at end of file diff --git a/docs/zeta/nn/biases/alibi.md b/docs/zeta/nn/biases/alibi.md deleted file mode 100644 index 3f93dbe..0000000 --- a/docs/zeta/nn/biases/alibi.md +++ /dev/null @@ -1,90 +0,0 @@ -# AlibiPositionalBias Documentation - -## Introduction - -The `AlibiPositionalBias` module belongs to the zeta library and plays a crucial role in handling positional bias for multi-head attention mechanisms. Specifically, it attempts to alleviate the absolute positional bias based on the number of attention heads. - -## Class Definition: - -```python -class AlibiPositionalBias(nn.Module): -``` - -### Parameters: -- **heads** (`int`): Number of attention heads for which the slopes need to be calculated. -- **total_heads** (`int`): Total number of attention heads in the network. - -### Attributes: -- **slopes** (`Tensor`): Tensor containing slope values, which are computed based on the number of heads. -- **bias** (`Tensor` or `None`): Tensor for storing positional bias values. If not initialized or needs recomputation, it would be None. - -### Methods: -#### `__init__(self, heads, total_heads, **kwargs) -> None`: -Initializes the `AlibiPositionalBias` module. - -#### `get_bias(self, i, j, device) -> Tensor`: -Computes the positional bias for given dimensions i and j. - -- **Parameters**: - - **i** (`int`): One dimension of the required positional bias. - - **j** (`int`): Second dimension of the required positional bias. - - **device** (`torch.device`): The device on which computations are to be performed. 
- -#### `_get_slopes(heads) -> List[float]`: -A static method that calculates slopes based on the number of attention heads. - -- **Parameters**: - - **heads** (`int`): Number of attention heads. - -#### `forward(self, i, j) -> Tensor`: -Computes or retrieves the bias tensor for given dimensions. - -- **Parameters**: - - **i** (`int`): One dimension for the required positional bias. - - **j** (`int`): Second dimension for the required positional bias. - -## Mathematical Formula: - -Given `n` attention heads, the alibi positional bias can be represented as: - -\[ \text{Bias} = \text{-abs}(j_{\text{range}}) \times \text{slope} \] - -Where: -- \( j_{\text{range}} \) is an array of numbers from `0` to `j-1`. -- `slope` is computed based on the number of heads using `_get_slopes` method. - -## Usage Examples: - -### Example 1: Initialize and compute bias -```python -from zeta import AlibiPositionalBias -import torch - -bias_module = AlibiPositionalBias(heads=4, total_heads=8) -bias = bias_module(10, 10) -print(bias) -``` - -### Example 2: Retrieve stored bias -```python -bias = bias_module(5, 5) -print(bias) -``` - -### Example 3: Computing bias for different dimensions -```python -bias = bias_module(8, 15) -print(bias) -``` - -## Note: - -- It's crucial to ensure that the `total_heads` parameter is always greater than or equal to the `heads` parameter during initialization. -- The device property is internally used to determine the computation device based on the registered buffers. - -## References: - -For a deeper understanding and applications of positional bias in attention mechanisms, one may refer to the foundational paper on Transformer architectures: -- [Attention Is All You Need](https://arxiv.org/abs/1706.03762) - -Also, the `einops` library provides a versatile interface for tensor manipulations. More details can be found at its official [documentation](https://einops.rocks/). 
\ No newline at end of file diff --git a/docs/zeta/nn/biases/relative_bias.md b/docs/zeta/nn/biases/relative_bias.md deleted file mode 100644 index b3d0ec6..0000000 --- a/docs/zeta/nn/biases/relative_bias.md +++ /dev/null @@ -1,81 +0,0 @@ -# RelativePositionBias - -`RelativePositionBias` is a specialized PyTorch module designed to generate relative position biases, which can be vital for certain attention mechanisms in deep learning architectures. This module quantizes the distance between two positions into a certain number of buckets and then uses an embedding to get the relative position bias. This mechanism aids in the attention mechanism by providing biases based on relative positions between the query and key, rather than relying solely on their absolute positions. - -## Architecture: -The architecture can be visualized in three major steps: -1. **Bucketing:** Convert relative distances between two positions into bucket indices. -2. **Embedding:** Use the bucket indices to get embeddings for each pair of positions. -3. **Computing Bias:** Computes the bias values based on the embeddings. - -## Purpose: -In the context of attention mechanisms, especially the transformer-based architectures, the position of tokens can provide valuable information. The `RelativePositionBias` class helps introduce this information in a compact form by bucketing relative positions and then embedding them to serve as biases for the attention scores. 
- -## Mathematical Formula: -Given a relative position \( r \), the bucket index \( b \) is computed as: -\[ b = -\begin{cases} - n + \text{num_buckets} \div 2 & \text{if } n < 0 \text{ and bidirectional is True} \\ - \min\left( \max_{\text{exact}} + \left(\frac{\log(\frac{n}{\max_{\text{exact}}})}{\log(\frac{\text{max_distance}}{\max_{\text{exact}}})} \times (\text{num_buckets} - \max_{\text{exact}})\right), \text{num_buckets} - 1 \right) & \text{otherwise} - \end{cases} -\] -Where \( n \) is the negative of the relative position, and \( \max_{\text{exact}} \) is \( \text{num_buckets} \div 2 \). - -## Class Definition: - -```python -class RelativePositionBias(nn.Module): - """ - Compute relative position bias which can be utilized in attention mechanisms. - - Parameters: - - bidirectional (bool): If True, considers both forward and backward relative positions. Default: True. - - num_buckets (int): Number of buckets to cluster relative position distances. Default: 32. - - max_distance (int): Maximum distance to be considered for bucketing. Distances beyond this will be mapped to the last bucket. Default: 128. - - n_heads (int): Number of attention heads. Default: 12. - """ -``` - -### Key Methods: -- **_relative_position_bucket**: This static method is responsible for converting relative positions into bucket indices. -- **compute_bias**: Computes the relative position bias for given lengths of queries and keys. -- **forward**: Computes and returns the relative position biases for a batch. - -## Usage Examples: - -```python -from zeta import RelativePositionBias -import torch - -# Initialize the RelativePositionBias module -rel_pos_bias = RelativePositionBias() - -# Example 1: Compute bias for a single batch -bias_matrix = rel_pos_bias(1, 10, 10) - -# Example 2: Utilize in conjunction with an attention mechanism -# NOTE: This is a mock example, and may not represent an actual attention mechanism's complete implementation. 
-class MockAttention(nn.Module): - def __init__(self): - super().__init__() - self.rel_pos_bias = RelativePositionBias() - - def forward(self, queries, keys): - bias = self.rel_pos_bias(queries.size(0), queries.size(1), keys.size(1)) - # Further computations with bias in the attention mechanism... - return None # Placeholder - -# Example 3: Modify default configurations -custom_rel_pos_bias = RelativePositionBias(bidirectional=False, num_buckets=64, max_distance=256, n_heads=8) -``` - -## Tips: -1. The choice of `num_buckets` and `max_distance` might need tuning based on the dataset and application. -2. If the architecture doesn't need bidirectional biases, set `bidirectional` to `False` to reduce computation. -3. Ensure that the device of tensors being processed and the device of the `RelativePositionBias` module are the same. - -## References: -- [Attention Is All You Need](https://arxiv.org/abs/1706.03762) -- [Transformer Architectures](https://www.aclweb.org/anthology/D18-1422.pdf) - -Note: This documentation is based on the provided code and might need adjustments when integrated into the complete `zeta` library. \ No newline at end of file diff --git a/docs/zeta/nn/biases/xpos.md b/docs/zeta/nn/biases/xpos.md deleted file mode 100644 index 88b46b4..0000000 --- a/docs/zeta/nn/biases/xpos.md +++ /dev/null @@ -1,105 +0,0 @@ -# XPOS Module Documentation -------------------------- - -### Architecture - -The XPOS module is a part of a neural network model and is implemented as a subclass ofย `torch.nn.Module`. It consists of several functions and a class that work together to apply rotary positional embeddings to an input tensor. - -### Purpose - -The purpose of the XPOS module is to incorporate positional information into the input tensor of a neural network model. It achieves this by generating fixed positional embeddings and applying them to the input tensor using rotary positional encoding techniques. 
This allows the model to capture the sequential order and relative positions of the input elements, which can be beneficial for tasks such as natural language processing and time series analysis. - -### Functions and Methods - -1. `fixed_pos_embedding(x)`: Generates fixed positional embeddings for the input tensor. - - - Args: - - `x` (torch.Tensor): Input tensor of shape `(seq_len, dim)`. - - Returns: - - `sin` (torch.Tensor): Sine positional embeddings of shape `(seq_len, dim)`. - - `cos` (torch.Tensor): Cosine positional embeddings of shape `(seq_len, dim)`. -2. `rotate_every_two(x)`: Rearranges the elements of the input tensor by rotating every two elements. - - - Args: - - `x` (torch.Tensor): Input tensor of shape `(batch_size, seq_len, dim)`. - - Returns: - - `x` (torch.Tensor): Rearranged tensor of shape `(batch_size, seq_len, dim)`. -3. `duplicate_interleave(m)`: Duplicates a matrix while interleaving the copy. - - - Args: - - `m` (torch.Tensor): Input matrix. - - Returns: - - `m` (torch.Tensor): Duplicated and interleaved matrix. -4. `apply_rotary_pos_emb(x, sin, cos, scale=1)`: Applies rotary positional embeddings to the input tensor. - - - Args: - - `x` (torch.Tensor): Input tensor of shape `(batch_size, seq_len, dim)`. - - `sin` (torch.Tensor): Sine positional embeddings of shape `(seq_len, dim)`. - - `cos` (torch.Tensor): Cosine positional embeddings of shape `(seq_len, dim)`. - - `scale` (float): Scaling factor for the positional embeddings (default: 1). - - Returns: - - `x` (torch.Tensor): Tensor with applied rotary positional embeddings. -5. `XPOS(head_dim, scale_base=512)`: XPOS module class. - - - Args: - - `head_dim` (int): Dimensionality of the input tensor. - - `scale_base` (int): Base value for scaling the positional embeddings (default: 512). - - Methods: - - `forward(x, offset=0, downscale=False)`: Forward pass of the XPOS module. - - Args: - - `x` (torch.Tensor): Input tensor of shape `(batch_size, seq_len, dim)`.
- - `offset` (int): Offset value for positional embeddings (default: 0). - - `downscale` (bool): Boolean indicating whether to downscale the positional embeddings (default: False). - - Returns: - - `x` (torch.Tensor): Tensor with applied rotary positional embeddings. - -### Usage Examples - -1. Applying XPOS module to an input tensor: - - ``` - import torch - from xpos import XPOS - - # Create an instance of the XPOS module - xpos = XPOS(head_dim=256) - - # Generate a random input tensor - x = torch.randn(1, 10, 256) - - # Apply the XPOS module to the input tensor - output = xpos(x) - ``` - - -2. Applying XPOS module with offset and downscaling: - - ``` - import torch - from zeta import XPOS - - # Create an instance of the XPOS module - xpos = XPOS(head_dim=512) - - # Generate a random input tensor - x = torch.randn(1, 20, 512) - - # Apply the XPOS module to the input tensor with offset and downscaling - output = xpos(x, offset=2, downscale=True) - ``` -3. Using the individual functions of the XPOS module: - - ``` - import torch - from zeta import fixed_pos_embedding, apply_rotary_pos_emb - - # Generate fixed positional embeddings - scale = torch.randn(10, 256) - sin, cos = fixed_pos_embedding(scale) - - # Apply rotary positional embeddings to an input tensor - x = torch.randn(1, 10, 256) - output = apply_rotary_pos_emb(x, sin, cos, scale=0.5) - ``` - -Note: The above examples assume that the `xpos.py` file \ No newline at end of file diff --git a/docs/zeta/nn/embeddings/multiway.md b/docs/zeta/nn/embeddings/multiway.md deleted file mode 100644 index e8d998a..0000000 --- a/docs/zeta/nn/embeddings/multiway.md +++ /dev/null @@ -1,123 +0,0 @@ -# **Documentation for `MultiwayEmbedding` in Zeta Library** - -**Table of Contents** - -1. Overview -2. Class Definition and Parameters -3. Methods and Functionalities -4. Usage Examples -5. Additional Tips and Information -6. References - ---- - -## 1.
Overview - -The `MultiwayEmbedding` class in the Zeta library provides a way to apply two separate embeddings to two distinct parts of the input tensor. It splits the input tensor at the specified position and applies one embedding to the first part and another embedding to the second part. This can be particularly useful when dealing with inputs that require diverse representations or embeddings. - ---- - -## 2. Class Definition and Parameters - -```python -class MultiwayEmbedding(MultiwayNetwork): - """ - A specialized version of the MultiwayNetwork to perform multi-way embeddings on an input tensor. - - Parameters: - - modules (List[nn.Module]): A list containing exactly two PyTorch modules. Typically these would be embedding layers. - - dim (int): The dimension along which to split and concatenate the input tensor. Default is 1. - """ - - def __init__(self, modules, dim=1): - super(MultiwayNetwork, self).__init__() - ... -``` - ---- - -## 3. Methods and Functionalities - -**forward(x, **kwargs)** -```python -def forward(self, x, **kwargs): - """ - Forward method to apply embeddings on the split input tensor. - - Parameters: - - x (torch.Tensor): The input tensor. - - **kwargs: Additional arguments that might be needed for the embeddings. - - Returns: - - torch.Tensor: Concatenated tensor after applying the embeddings. - """ - ... -``` - ---- - -## 4. 
Usage Examples - -**Example 1:** Basic Usage -```python -from zeta import MultiwayEmbedding -import torch.nn as nn - -emb1 = nn.Embedding(10, 5) -emb2 = nn.Embedding(10, 5) -multiway_emb = MultiwayEmbedding([emb1, emb2]) - -x = torch.LongTensor([[1,2,3],[4,5,6]]) -output = multiway_emb(x) -print(output) -``` - -**Example 2:** Setting a Split Position -```python -from zeta import MultiwayEmbedding, set_split_position -import torch.nn as nn - -emb1 = nn.Embedding(10, 5) -emb2 = nn.Embedding(10, 5) -multiway_emb = MultiwayEmbedding([emb1, emb2]) -multiway_emb.apply(set_split_position(2)) - -x = torch.LongTensor([[1,2,3],[4,5,6]]) -output = multiway_emb(x) -print(output) -``` - -**Example 3:** Working with Different Embedding Dimensions -```python -from zeta import MultiwayEmbedding -import torch.nn as nn - -emb1 = nn.Embedding(10, 5) -emb2 = nn.Embedding(10, 7) -multiway_emb = MultiwayEmbedding([emb1, emb2], dim=2) - -x = torch.LongTensor([[1,2,3],[4,5,6]]) -output = multiway_emb(x) -print(output) -``` - ---- - -## 5. Additional Tips and Information - -- Ensure that the input tensor's dimensions align with the expected embeddings. If there's a mismatch in dimensions, a runtime error will occur. -- The split position determines the point at which the tensor is divided. It's crucial to set this appropriately, especially if the embeddings have different dimensions. -- Using the provided `set_split_position` utility function makes it easy to apply the split position for the embeddings. - ---- - -## 6. References - -- Torch documentation: [Link to PyTorch Documentation](https://pytorch.org/docs/stable/index.html) -- Agora: [Link to Agora's GitHub](#) (assuming there might be a GitHub link or other resource for Agora) - ---- - -**Note:** Ensure that the tensor operations align mathematically, especially if you're concatenating tensors with different dimensions. In such cases, ensure the embeddings produce tensors that can be concatenated along the specified dimension. 
- -**Mathematical Explanation:** Given an input tensor \( X \) split into \( X_1 \) and \( X_2 \), and two embeddings \( A \) and \( B \), the output is given by concatenating \( A(X_1) \) and \( B(X_2) \). \ No newline at end of file diff --git a/docs/zeta/nn/embeddings/rope.md b/docs/zeta/nn/embeddings/rope.md deleted file mode 100644 index 7dd8622..0000000 --- a/docs/zeta/nn/embeddings/rope.md +++ /dev/null @@ -1,145 +0,0 @@ -# RotaryEmbedding - -`RotaryEmbedding` is a PyTorch module implementing the rotary embedding mechanism. It is designed to handle sequences of any length without the need for fine-tuning, and can also incorporate positional information into the embeddings. - -## Class Definition - -```python -class RotaryEmbedding(nn.Module): - def __init__( - self, - dim, - use_xpos=False, - scale_base=512, - interpolation_factor=1., - base=10000, - base_rescale_factor=1., - ): - ... -``` - -### Parameters - -- `dim` (int): The dimensionality of the embeddings. -- `use_xpos` (bool, optional): Whether to use positional information in the embeddings. Default: `False`. -- `scale_base` (int, optional): Base of the scale for positional information. Default: `512`. -- `interpolation_factor` (float, optional): Factor used for interpolating the embeddings. Default: `1.0`. -- `base` (int, optional): Base of the frequencies used in the embeddings. Default: `10000`. -- `base_rescale_factor` (float, optional): Factor used for rescaling the base of the frequencies. Default: `1.0`. - -### Method: `forward` - -```python -def forward(self, seq_len, device): - ... -``` - -#### Parameters - -- `seq_len` (int): The length of the sequence. -- `device` (torch.device): The device on which the computation will be performed. - -#### Returns - -- `freqs` (Tensor): The computed frequencies for the embeddings. -- `scale` (Tensor): The computed scale for the embeddings. 
- -## Functionality and Usage - -The `RotaryEmbedding` module computes rotary embeddings for a sequence of a given length. The embeddings are computed based on the frequency and scale of each position in the sequence. The frequency and scale are computed using the `inv_freq` and `scale` buffers registered in the module. - -The `forward` method computes the `freqs` and `scale` tensors based on the `seq_len` and `device` provided. The `freqs` tensor is computed by multiplying the `t` tensor, which contains the indices of the sequence, with the `inv_freq` tensor. The `scale` tensor is computed using the `scale` buffer and the `scale_base` parameter. - -The `freqs` and `scale` tensors are then concatenated along the last dimension and returned. - -### Usage Examples - -#### Example 1: Basic Usage - -```python -from zeta.nn import RotaryEmbedding -import torch -from torch import nn - -# Initialize the RotaryEmbedding module -rotary_embedding = RotaryEmbedding(dim=64, use_xpos=True) - -# Compute the embeddings for a sequence of length 10 -seq_len = 10 -device = torch.device('cuda') -freqs, scale = rotary_embedding(seq_len, device) - -print(freqs) -print(scale) -``` - -#### Example 2: Using a Different Scale Base - -```python -from zeta.nn import RotaryEmbedding -import torch -from torch import nn - -# Initialize the RotaryEmbedding module with a different scale base -rotary_embedding = RotaryEmbedding(dim=64, use_xpos=True, scale_base=1024) - -# Compute the embeddings for a sequence of length 10 -seq_len = 10 -device = torch.device('cuda') -freqs, scale = rotary_embedding(seq_len, device) - -print(freqs) -print(scale) -``` - -#### Example 3: Without Positional Information - -```python -from zeta.nn import RotaryEmbedding -import torch -from torch import nn - -# Initialize the RotaryEmbedding module without positional information -rotary_embedding = RotaryEmbedding(dim=64, use_xpos=False) - -# Compute the embeddings for a sequence of length 10 -seq_len = 10 -device = 
torch.device('cuda') -freqs, scale = rotary_embedding(seq_len, device) - -print(freqs) -print(scale) -``` - -## Mathematical Formula - -The mathematical formula for computing the `freqs` tensor is: - -\[ \text{freqs} = t \cdot \text{inv\_freq} \] - -Where: -- \( t \) is a tensor containing the indices of the sequence. -- \( \text{inv\_freq} \) is a tensor containing the inverse frequencies. - -The mathematical formula for computing the `scale` tensor is: - -\[ \text{scale} = \text{scale}^{\frac{\text{power}}{\text{scale\_base}}} \] - -Where: -- \( \text{power} \) is a tensor containing the power of each position in the sequence. -- \( \text{scale\_base} \) is a scalar containing the base of the scale. -- \( \text{scale} \) is a tensor containing the scale of each position in the sequence. - -## Additional Information and Tips - -- The `interpolation_factor` parameter can be used to interpolate the embeddings for sequences of different lengths. A larger `interpolation_factor` will result in a smoother interpolation. -- The `base_rescale_factor` parameter can be used to rescale the base of the frequencies. This can be useful for adjusting the embeddings for sequences of different lengths. -- If `use_xpos` is set to `False`, the `scale` tensor will not be used, and the `freqs` tensor will be returned as is. - -## References and Resources - -- [Paper: Link to the paper](https://arxiv.org/pdf/2308.10882.pdf) -- [PyTorch Documentation](https://pytorch.org/docs/stable/index.html) -- [Einops Documentation](https://einops.rocks/pytorch-examples.html) - -Note: The above template includes the class definition, parameters, description, functionality, usage examples, mathematical formula, additional information and tips, and references and resources. To replicate the documentation for any other module or framework, follow the same structure and provide the specific details for that module or framework.
\ No newline at end of file diff --git a/docs/zeta/nn/embeddings/truncated_rope.md b/docs/zeta/nn/embeddings/truncated_rope.md deleted file mode 100644 index d0acd0c..0000000 --- a/docs/zeta/nn/embeddings/truncated_rope.md +++ /dev/null @@ -1,103 +0,0 @@ -# Module/Function Name: TruncatedRotaryEmbedding - -The `TruncatedRotaryEmbedding` class is part of the Zeta library and is designed to implement the rotary embeddings with a truncation mechanism. The rotary embedding is a positional encoding method that aims to provide the model with information about the relative positions of the tokens in a sequence. The `TruncatedRotaryEmbedding` class extends the rotary embedding concept by incorporating a truncation mechanism, which sets the rotary embedding to zero for positions where the frequency is lower than a specified threshold. - -The architecture and workings of this class are inspired by the paper [link to the paper](https://arxiv.org/pdf/2308.10882.pdf). - -## Parameters: - -- `dim` (int): Dimensionality of the embeddings. -- `a` (float): Lower bound of the truncation region. Rotary embeddings with frequency lower than `a` will be set to zero. -- `b` (float): Upper bound of the truncation region. Rotary embeddings with frequency higher than or equal to `b` will not be truncated. -- `rho` (float): Value to which the rotary embeddings will be truncated in the region [a, b). - -The `dim` parameter is required to determine the dimensionality of the embeddings, while `a`, `b`, and `rho` are hyperparameters that control the truncation mechanism. - -## Method: - -### `forward(seq_len, device)` - -Computes the truncated rotary embeddings for a given sequence length. - -#### Parameters: - -- `seq_len` (int): Length of the sequence for which the rotary embeddings are to be computed. -- `device` (torch.device): Device on which the computations are to be performed.
- -#### Returns: - -- `result` (Tensor): A tensor containing the truncated rotary embeddings for the specified sequence length. - -## Functionality and Usage: - -The `TruncatedRotaryEmbedding` class is used to compute the truncated rotary embeddings for a given sequence length. The rotary embeddings are computed by multiplying a tensor containing the position indices of the tokens in the sequence by the inverse frequencies. The inverse frequencies are computed based on the specified embedding dimension `dim` and are stored in the `inv_freq` buffer. - -The truncation mechanism is implemented by creating a `theta_star` tensor, which is used to multiply the computed `freqs`. The `theta_star` tensor is created based on the specified `a`, `b`, and `rho` parameters, and the computed `freqs` tensor. For positions where the frequency is higher than or equal to `b`, the rotary embeddings are not truncated, and `theta_star` is set to the frequency at that position. For positions where the frequency is lower than `a`, the rotary embeddings are set to zero, and `theta_star` is set to zero. For positions where the frequency is in the range [a, b), the rotary embeddings are truncated to `rho`, and `theta_star` is set to `rho`. - -Once the `theta_star` tensor is created, it is multiplied element-wise by the `freqs` tensor to compute the final truncated rotary embeddings.
- -### Usage Example: - -```python -from zeta.nn.embeddings.truncated_rope import TruncatedRotaryEmbedding -import torch - -# Define the parameters -dim = 64 -a = 0.1 -b = 0.9 -rho = 0.5 -seq_len = 100 -device = torch.device('cuda') - -# Create the TruncatedRotaryEmbedding module -trunc_rotary_emb = TruncatedRotaryEmbedding(dim, a, b, rho) - -# Compute the truncated rotary embeddings for the specified sequence length -rotary_embeddings = trunc_rotary_emb(seq_len, device) - -print(rotary_embeddings) -``` - -In this example, the `TruncatedRotaryEmbedding` module is created with the specified `dim`, `a`, `b`, and `rho` parameters. The `forward` method is then called with the specified `seq_len` and `device` parameters to compute the truncated rotary embeddings for a sequence of length `seq_len`. - -## Additional Information and Tips: - -- The `a`, `b`, and `rho` parameters control the truncation mechanism and may need to be tuned based on the specific application and data being used. In particular, the `a` parameter should be set to a value that effectively removes the low-frequency components in the rotary embeddings, while the `b` parameter should be set to a value that retains the useful positional information in the rotary embeddings. - -- The `dim` parameter should be set to the same value as the embedding dimension used in the model. - -- The `device` parameter in the `forward` method should be set to the same device on which the model is being trained.
- -## Mathematical Formulation: - -The mathematical formulation of the truncated rotary embeddings can be expressed as follows: - -\[ \text{freqs} = t \cdot \text{inv\_freq} \] - -\[ \theta = \text{base}^{-2 \cdot i / \text{dim}}, \, i = 0, 2, \ldots, \text{dim}-2 \] - -\[ \theta^* = -\begin{cases} -0, & \text{if } \theta < a \\ -\rho, & \text{if } a \leq \theta < b \\ -\theta, & \text{if } \theta \geq b -\end{cases} -\] - -\[ \text{result} = \text{freqs} \cdot \theta^* \] - -Where: - -- \( t \) is a tensor containing the position indices of the tokens in the sequence. -- \( \text{inv\_freq} \) is a tensor containing the inverse frequencies computed based on the specified `dim` parameter. -- \( \text{freqs} \) is a tensor containing the computed frequencies for each position in the sequence. -- \( \theta \) is a tensor containing the computed theta values for each position in the sequence. -- \( \theta^* \) is a tensor containing the truncated theta values for each position in the sequence. -- \( \text{result} \) is the final tensor containing the truncated rotary embeddings for each position in the sequence. - -## References and Resources: - -- Paper: [Link to the paper](https://arxiv.org/pdf/2308.10882.pdf) - -For further exploration and implementation details, refer to the paper linked above. \ No newline at end of file diff --git a/docs/zeta/nn/modules/lora.md b/docs/zeta/nn/modules/lora.md deleted file mode 100644 index 84c0a7a..0000000 --- a/docs/zeta/nn/modules/lora.md +++ /dev/null @@ -1,160 +0,0 @@ -# Lora - -The `Lora` class is a module of the Zeta library that provides a simple linear transformation of the input data. It is a part of the `torch.nn` module and extends the `nn.Module` class from PyTorch. - -## Overview and Introduction - -The `Lora` class is designed to provide a scalable and efficient linear transformation operation. 
It is particularly useful in scenarios where the dimensionality of the input data is very high and computational efficiency is of paramount importance. The `Lora` class achieves this by breaking down the weight matrix into two lower rank matrices `A` and `B`, and a scale factor `alpha`, which are learned during the training process. This results in a significant reduction in the number of parameters to be learned, and consequently, a more computationally efficient model. - -## Key Concepts and Terminology - -- **Linear Transformation**: A linear transformation is a mathematical operation that transforms input data by multiplying it with a weight matrix. It is a fundamental operation in many machine learning models. - -- **Low Rank Approximation**: Low rank approximation is a technique used to approximate a matrix by another matrix of lower rank. This is often used to reduce the dimensionality of data and to make computations more efficient. - -- **Scale Factor**: A scale factor is a number by which a quantity is multiplied, changing the magnitude of the quantity. - -## Class Definition - -The `Lora` class is defined as follows: - -```python -class Lora(nn.Module): - def __init__( - self, - dim, - dim_out, - r=8, - alpha=None - ): - super().__init__() - self.scale = alpha / r - - self.A = nn.Parameter(torch.randn(dim, r)) - self.B = nn.Parameter(torch.randn(r, dim_out)) - - @property - def weight(self): - return (self.A @ self.B) * self.scale - - def forward(self, x): - return x @ self.weight -``` - -### Parameters - -- `dim` (`int`): The dimensionality of the input data. It is the number of features in the input data. -- `dim_out` (`int`): The desired dimensionality of the output data. It is the number of features in the output data. -- `r` (`int`, optional): The rank of the matrices `A` and `B`. It determines the size of the matrices `A` and `B`. Default is 8. -- `alpha` (`float`, optional): The scale factor. If not provided, it is set to 1 by default. 
- -### Methods - -#### `forward` - -The `forward` method is used to compute the forward pass of the `Lora` module. - -##### Parameters - -- `x` (`Tensor`): The input data. It is a tensor of shape `(batch_size, dim)`. - -##### Returns - -- `Tensor`: The transformed data. It is a tensor of shape `(batch_size, dim_out)`. - -## Functionality and Usage - -The `Lora` class is used to perform a linear transformation of the input data. The transformation is defined by the weight matrix `W`, which is approximated by the product of two lower rank matrices `A` and `B`, and a scale factor `alpha`. The `Lora` class learns the matrices `A` and `B`, and the scale factor `alpha` during the training process. - -The forward pass of the `Lora` module computes the product of the input data `x` and the weight matrix `W`, which is approximated by `(A @ B) * scale`. - -### Mathematical Formula - -The mathematical formula for the forward pass of the `Lora` module is: - -\[ y = xW \] - -Where: -- \( y \) is the transformed data. -- \( x \) is the input data. -- \( W \) is the weight matrix, which is approximated by \( (A @ B) * \text{scale} \). - -### Usage Examples - -Below are three examples of how to use the `Lora` class. 
- -#### Example 1: Basic Usage - -```python -import torch -from zeta import Lora - -# Define the input data -x = torch.randn(32, 128) # batch size of 32, and 128 features - -# Define the Lora module -lora = Lora(dim=128, dim_out=64) - -# Compute the forward pass -y = lora(x) -``` - -#### Example 2: Specifying the Rank and Scale Factor - -```python -import torch -from zeta import Lora - -# Define the input data -x = torch.randn(32, 128) # batch size of 32, and 128 features - -# Define the Lora module with specified rank and scale factor -lora = Lora(dim=128, dim_out=64, r=16, alpha=0.1) - -# Compute the forward pass -y = lora(x) -``` - -#### Example 3: Using the Lora Module in a Neural Network - -```python -import torch -from torch import nn -from zeta import Lora - -# Define a simple neural network with a Lora layer -class Net(nn.Module): - def __init__(self): - super().__init__() - self.lora = Lora(dim=128, dim_out=64) - self.fc = nn.Linear(64, 10) - - def forward(self, x): - x = self.lora(x) - x = self.fc(x) - return x - -# Define the input data -x = torch.randn(32, 128) # batch size of 32, and 128 features - -# Define the model -model = Net() - -# Compute the forward pass -output = model(x) -``` - -## Additional Information and Tips - -- The `Lora` class is particularly useful in scenarios where the dimensionality of the input data is very high and computational efficiency is of paramount importance. However, it may not be suitable for all applications, as the approximation of the weight matrix may result in a loss of accuracy. - -- The rank `r` and the scale factor `alpha` are hyperparameters that need to be tuned for the specific application. A higher value of `r` will - - result in a more accurate approximation of the weight matrix, but will also increase the computational cost. Similarly, the scale factor `alpha` needs to be tuned to achieve the desired trade-off between accuracy and computational efficiency. 
- -## References and Resources - -- [PyTorch nn.Module documentation](https://pytorch.org/docs/stable/generated/torch.nn.Module.html) -- [Low Rank Matrix Factorization for Deep Neural Network Training with High-dimensional Output Targets](https://arxiv.org/abs/2005.08735) - -For further exploration and implementation details, you can refer to the above resources and the official PyTorch documentation. \ No newline at end of file diff --git a/docs/zeta/nn/modules/token_learner.md b/docs/zeta/nn/modules/token_learner.md deleted file mode 100644 index 794dd77..0000000 --- a/docs/zeta/nn/modules/token_learner.md +++ /dev/null @@ -1,148 +0,0 @@ -# Zeta Library Documentation - -## Module Name: TokenLearner - -The `TokenLearner` is a PyTorch module designed for learning tokens from input data. It is a part of the Zeta library, a collection of modules and functions designed for efficient and flexible implementation of various deep learning tasks. The `TokenLearner` class is particularly useful for tasks such as image classification, object detection, and other applications where it is beneficial to extract tokens (representative features) from the input data. - -## Introduction - -In various deep learning tasks, it is common to extract tokens (representative features) from the input data. These tokens are then used for downstream tasks like classification, detection, etc. The `TokenLearner` class is designed to efficiently extract tokens from the input data. It does this by utilizing a convolutional neural network (CNN) with grouped convolutions and a gating mechanism. - -## Class Definition - -```python -class TokenLearner(nn.Module): - def __init__( - self, - *, - dim: int = None, - ff_mult: int = 2, - num_output_tokens: int = 8, - num_layers: int = 2 - ): - ... -``` - -### Parameters: - -- `dim` (int, optional): The dimension of the input data. Default is `None`. -- `ff_mult` (int, optional): The factor by which the inner dimension of the network will be multiplied. 
Default is `2`. -- `num_output_tokens` (int, optional): The number of tokens to be output by the network. Default is `8`. -- `num_layers` (int, optional): The number of layers in the network. Default is `2`. - -## Functionality and Usage - -The `TokenLearner` class is a PyTorch `nn.Module` that learns tokens from the input data. The input data is first packed and then processed through a series of grouped convolutions followed by a gating mechanism. The output is a set of tokens that are representative of the input data. - -The forward method of the `TokenLearner` class takes an input tensor `x` and performs the following operations: - -1. The input tensor `x` is packed using the `pack_one` helper function. -2. The packed tensor is then rearranged and passed through a series of grouped convolutions and activation functions. -3. The output of the convolutions is then rearranged and multiplied with the input tensor. -4. The resulting tensor is then reduced to obtain the final tokens. - -### Method: - -```python -def forward(self, x): - ... -``` - -### Parameters: - -- `x` (Tensor): The input tensor of shape `(batch_size, channels, height, width)`. - -### Returns: - -- `x` (Tensor): The output tokens of shape `(batch_size, channels, num_output_tokens)`. - -## Usage Examples - -### Example 1: Basic Usage - -```python -from zeta import TokenLearner -import torch - -# Initialize the TokenLearner -token_learner = TokenLearner(dim=64) - -# Generate some random input data -x = torch.randn(1, 64, 32, 32) - -# Forward pass -tokens = token_learner.forward(x) - -print(tokens.shape) -``` - -In this example, a `TokenLearner` is initialized with an input dimension of 64. A random tensor of shape `(1, 64, 32, 32)` is then passed through the `TokenLearner` to obtain the tokens. The output will be a tensor of shape `(1, 64, 8)`. 
- -### Example 2: Custom Parameters - -```python -from zeta import TokenLearner -import torch - -# Initialize the TokenLearner with custom parameters -token_learner = TokenLearner(dim=128, ff_mult=4, num_output_tokens=16) - -# Generate some random input data -x = torch.randn(2, 128, 64, 64) - -# Forward pass -tokens = token_learner.forward(x) - -print(tokens.shape) -# Output: torch.Size([2, 128, 16]) -``` - -In this example, a `TokenLearner` is initialized with custom parameters. A random tensor of shape `(2, 128, 64, 64)` is then passed through the `TokenLearner` to obtain the tokens. The output will be a tensor of shape `(2, 128, 16)`. - -### Example 3: Integration with Other PyTorch Modules - -```python -from zeta import TokenLearner -import torch -import torch.nn as nn - -# Initialize the TokenLearner -token_learner = TokenLearner(dim=64) - -# Generate some random input data -x = torch.randn(1, 64, 32, 32) - -# Define a simple model -model = nn.Sequential( - token_learner, - nn.Flatten(), - nn.Linear(64*8, 10) -) - -# Forward pass -output = model(x) - -print(output.shape) -# Output: torch.Size([1, 10]) -``` - -In this example, the `TokenLearner` is integrated into a simple model consisting of the `TokenLearner`, a `Flatten` layer, and a `Linear` layer. A random tensor of shape `(1, 64, 32, 32)` is then passed through the model to obtain the final output. The output will be a tensor of shape `(1, 10)`. - -## Mathematical Formulation - -The `TokenLearner` can be mathematically formulated as follows: - -Let `X` be the input tensor of shape `(B, C, H, W)`, where `B` is the batch size, `C` is the number of channels, `H` is the height, and `W` is the width. The `TokenLearner` first rearranges `X` to a tensor of shape `(B, G*C, H, W)`, where `G` is the number of output tokens. This is done by repeating `X` along the channel dimension `G` times. 
- -The rearranged tensor is then passed through a series of grouped convolutions and activation functions to obtain a tensor `A` of shape `(B, G, H, W)`. This tensor is then rearranged and multiplied with the input tensor `X` to obtain a tensor of shape `(B, C, G, H, W)`. - -The final tokens are obtained by reducing this tensor along the `H` and `W` dimensions to obtain a tensor of shape `(B, C, G)`. - -## Additional Information and Tips - -- The `num_output_tokens` parameter controls the number of tokens that will be output by the `TokenLearner`. A larger number of output tokens will result in a more detailed representation of the input data, but will also increase the computational requirements. - -- The `ff_mult` parameter controls the inner dimension of the `TokenLearner`. A larger `ff_mult` will result in a larger capacity model, but will also increase the computational requirements. - -- The `TokenLearner` works best with input data that has a relatively small spatial dimension (e.g. 32x32 or 64x64). For larger input sizes, it may be beneficial to use a downsampling layer (e.g. `nn.MaxPool2d`) before passing the data through the `TokenLearner`. - diff --git a/docs/zeta/nn/utils/helpers.md b/docs/zeta/nn/utils/helpers.md deleted file mode 100644 index 6c518a0..0000000 --- a/docs/zeta/nn/utils/helpers.md +++ /dev/null @@ -1,109 +0,0 @@ -## Documentation - -### Overview - -The provided module comprises utility functions and classes to streamline specific operations with Python data structures and PyTorch models. The main aspects of the module are: - -- Checking the existence of a value. -- Implementing custom call behavior through classes. -- Custom decorators for function calls. -- Dictionary manipulation. -- Initialization of PyTorch layer parameters. - -### Functions and Classes - -1. **exists(val: Any) -> bool**: - Checks if the provided value is not `None`. - -2. 
**default(val: Any, d: Any) -> Any**: - Returns the value if it's not `None`; otherwise, it returns a default value. - -3. **once(fn: Callable) -> Callable**: - A decorator ensuring that the function is only called once. - -4. **eval_decorator(fn: Callable) -> Callable**: - A decorator for `torch.nn.Module` methods to switch the module to `eval` mode during the function call and revert to its original mode afterwards. - -5. **cast_tuple(val: Any, depth: int) -> Tuple**: - Casts a value to a tuple with a specific depth. - -6. **maybe(fn: Callable) -> Callable**: - A decorator that calls the function only if its first argument exists. - -7. **always**: - A class that always returns the specified value when called. - -8. **not_equals** and **equals**: - Classes that, when instantiated with a value, check if another value is (not) equal to the specified value. - -9. **init_zero_(layer: nn.Module) -> None**: - Initializes the weights and biases of a torch layer to zero. - -10. **pick_and_pop(keys: List[str], d: Dict) -> Dict**: - Extracts values from a dictionary based on provided keys. - -11. **group_dict_by_key(cond: Callable, d: Dict) -> Tuple[Dict, Dict]**: - Groups dictionary keys based on a given condition. - -12. **string_begins_with(prefix: str, str: str) -> bool**: - Checks if a string starts with a specific prefix. - -13. **group_by_key_prefix(prefix: str, d: Dict) -> Tuple[Dict, Dict]**: - Groups dictionary items by keys starting with a specific prefix. - -14. **groupby_prefix_and_trim(prefix: str, d: Dict) -> Tuple[Dict, Dict]**: - Similar to `group_by_key_prefix` but also removes the prefix from keys. - -### Usage Examples - -1. **Using the `once` decorator**: - - ```python - from zeta import once - - @once - def greet(): - print("Hello, World!") - - greet() # prints "Hello, World!" - greet() # Does nothing on the second call - ``` - -2. 
**Using the `eval_decorator` with PyTorch**: - - ```python - import torch.nn as nn - from zeta import eval_decorator - - class SimpleModel(nn.Module): - def __init__(self): - super().__init__() - self.layer = nn.Linear(10, 10) - - @eval_decorator - def predict(self, x): - return self.layer(x) - - model = SimpleModel() - input_tensor = torch.randn(1, 10) - output = model.predict(input_tensor) # Automatically switches to eval mode and back - ``` - -3. **Dictionary Manipulation with Prefix Functions**: - - ```python - from zeta import group_by_key_prefix - - sample_dict = { - "user_name": "John", - "user_age": 25, - "order_id": 12345, - "order_date": "2023-01-01" - } - - user_data, order_data = group_by_key_prefix("user_", sample_dict) - print(user_data) # {'user_name': 'John', 'user_age': 25} - print(order_data) # {'order_id': 12345, 'order_date': '2023-01-01'} - ``` - -This module is a collection of general-purpose utility functions and classes, making many common operations more concise. It's beneficial when working with PyTorch models and various data manipulation tasks. \ No newline at end of file diff --git a/docs/zeta/tokenizers/language_tokenizer.md b/docs/zeta/tokenizers/language_tokenizer.md deleted file mode 100644 index cfa3609..0000000 --- a/docs/zeta/tokenizers/language_tokenizer.md +++ /dev/null @@ -1,91 +0,0 @@ -# Module Name: LanguageTokenizerGPTX - -The `LanguageTokenizerGPTX` is an embedding utility tailored for the "EleutherAI/gpt-neox-20b" transformer model. This class allows for seamless tokenization and decoding operations, abstracting away the underlying complexity of the chosen transformer's tokenizer. - -## Introduction: -Language tokenization is a crucial step in natural language processing tasks. This module provides an interface to tokenize and decode text using the GPT-Neox-20b transformer from the EleutherAI project. 
With the ability to manage end-of-string tokens, padding tokens, and a fixed model length, `LanguageTokenizerGPTX` serves as a convenient wrapper for the actual tokenizer from the transformers library. - -## Class Definition: - -```python -class LanguageTokenizerGPTX: - def __init__(self): - ... - def tokenize_texts(self, texts: str) -> torch.Tensor: - ... - def decode(self, texts: torch.Tensor) -> str: - ... - def __len__(self) -> int: - ... -``` - -### Parameters: -The class does not take any parameters upon instantiation. It uses predefined parameters internally to load the tokenizer. - -### Methods: - -#### 1. `__init__(self) -> None`: -Initializes the `LanguageTokenizerGPTX` object. This method loads the `AutoTokenizer` with predefined parameters. - -#### 2. `tokenize_texts(self, texts: str) -> torch.Tensor`: -Tokenizes a given text or list of texts. - -- **texts** (str): The input text(s) to tokenize. - - **Returns**: - - A torch Tensor of token IDs representing the input text(s). - -#### 3. `decode(self, texts: torch.Tensor) -> str`: -Decodes a given tensor of token IDs back to text. - -- **texts** (torch.Tensor): The tensor of token IDs to decode. - - **Returns**: - - A string representing the decoded text. - -#### 4. `__len__(self) -> int`: -Provides the total number of tokens in the tokenizer's vocabulary. - - **Returns**: - - An integer representing the total number of tokens. - -## Usage Examples: - -```python -from zeta import LanguageTokenizerGPTX -import torch - -# Initialize the tokenizer -tokenizer = LanguageTokenizerGPTX() - -# Example 1: Tokenize a single text -text = "Hello, world!" 
-tokenized_text = tokenizer.tokenize_texts(text) -print(tokenized_text) - -# Example 2: Decode a tokenized text -decoded_text = tokenizer.decode(tokenized_text) -print(decoded_text) - -# Example 3: Get the number of tokens in the tokenizer's vocabulary -num_tokens = len(tokenizer) -print(f"The tokenizer has {num_tokens} tokens.") -``` - -## Mathematical Formulation: - -Given a text \( t \) and a vocabulary \( V \) from the GPT-Neox-20b model, tokenization maps \( t \) to a sequence of token IDs \( T \) where each token ID \( t_i \) corresponds to a token in \( V \). Decoding reverses this process. - -\[ t \xrightarrow{\text{tokenize}} T \] -\[ T \xrightarrow{\text{decode}} t \] - -## Additional Information: - -The GPT-Neox-20b model is part of the EleutherAI project. It's a variant of the GPT architecture with tweaks in terms of model size and training. Utilizing such models require an understanding of tokenization and decoding, which this module aims to simplify. - -## References: - -- [Transformers Library by Hugging Face](https://huggingface.co/transformers/) -- [EleutherAI GPT-Neox](https://github.com/EleutherAI/gpt-neox) - -Note: Ensure you have the necessary packages and dependencies installed, particularly the transformers library from Hugging Face. \ No newline at end of file diff --git a/docs/zeta/tokenizers/multi_modal_tokenizer.md b/docs/zeta/tokenizers/multi_modal_tokenizer.md deleted file mode 100644 index a0f682a..0000000 --- a/docs/zeta/tokenizers/multi_modal_tokenizer.md +++ /dev/null @@ -1,168 +0,0 @@ -# **Documentation for Zeta Library's MultiModalTokenizer Class** - ---- - -## **Introduction and Overview** - -The `MultiModalTokenizer` class is part of the Zeta Library, designed to provide tokenization capabilities for both text and image data. This enables more seamless integration and utilization of multimodal (text and image) data, especially when used with models that can handle such information simultaneously, like the CLIP model. 
- -**Key Features**: - -1. **Multimodal Tokenization**: Combines text and image tokenization within one unified class. -2. **Integration with Hugging Face Transformers**: Utilizes the `CLIPProcessor` for image tokenization and `AutoTokenizer` for text tokenization. -3. **Special Tokens for Image Segmentation**: Uses special tokens `` and `` to denote image token boundaries within text. -4. **Error Handling**: Implements comprehensive error handling and logging to ensure robustness. - ---- - -## **Class Definition** - -### **MultiModalTokenizer** - -```python -class MultiModalTokenizer: - """ - A tokenizer class for the kosmos model - - Attributes: - processor(CLIPProcessor): The processor to tokenize images. - tokenizer(AutoTokenizer): The tokenizer to tokenize text. - im_idx(int): The Index of the "" token. - im_end_idx(int): The index of the "" token. - """ -``` - -#### **Parameters**: - -- **max_length (int, optional)**: Maximum length of the tokenized sequence. Defaults to 8192. - -#### **Attributes**: - -- **processor (CLIPProcessor)**: The processor used to tokenize images. -- **tokenizer (AutoTokenizer)**: The tokenizer used to tokenize text. -- **im_idx (int)**: Index of the `` token. -- **im_end_idx (int)**: Index of the `` token. - ---- - -## **Methods** - -### **1. tokenize_texts** - -```python -def tokenize_texts(self, texts: str) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Tokenize given texts. - - Args: - texts (str): The text to be tokenized. - - Returns: - A tuple containing the tokenized texts and only the text tokens. - """ -``` - -### **2. tokenize_images** - -```python -def tokenize_images(self, images) -> torch.Tensor: - """ - Tokenizes given images. - - Args: - images: The images to be tokenized. - - Returns: - The tokenized images. - """ -``` - -### **3. tokenize** - -```python -def tokenize(self, sample) -> Dict[str, torch.Tensor]: - """ - Tokenizes given sample. - - Args: - sample: The sample to be tokenized. 
- - Returns: - A dictionary containing the tokenized text tokens, images, labels, and attention mask. - """ -``` - ---- - -## **Usage Examples** - -### **Example 1: Tokenizing Texts** - -```python -from zeta import MultiModalTokenizer -import torch - -tokenizer = MultiModalTokenizer() -texts = ["Hello World", "Zeta Library is great!"] -tokenized_texts, only_texts = tokenizer.tokenize_texts(texts) -print(tokenized_texts) -print(only_texts) -``` - -### **Example 2: Tokenizing Images** - -```python -from zeta import MultiModalTokenizer -import torch - -tokenizer = MultiModalTokenizer() -images = torch.randn(2, 3, 224, 224) # Assuming 2 random images of shape 3x224x224 -tokenized_images = tokenizer.tokenize_images(images) -print(tokenized_images) -``` - -### **Example 3: Tokenizing Multimodal Data** - -```python -from zeta import MultiModalTokenizer -import torch - -tokenizer = MultiModalTokenizer() -sample = { - "target_text": ["Hello World", "Zeta Library is great!"], - "image": torch.randn(2, 3, 224, 224) -} -tokenized_data = tokenizer.tokenize(sample) -print(tokenized_data) -``` - ---- - -## **Mathematical Overview** - -Given a text sequence \( T \) of length \( n \) and an image \( I \) represented by a tensor of shape \( C \times H \times W \), where \( C \) is the number of channels, \( H \) is the height, and \( W \) is the width: - -1. The tokenized text, \( T' \), is represented as: - \[ T' = [, , , T_{1}, T_{2}, ..., T_{n}, ] \] - -2. The tokenized image, \( I' \), is processed using the CLIP processor to obtain a tensor representation. - -3. When both text and image data are tokenized using the `tokenize` method, the output contains both \( T' \) and \( I' \) with their respective attention masks. 
- ---- - -## **Additional Tips** - -- Ensure you have the required model weights and configurations for the specified pretrained models ("laion/CLIP-ViT-L-14-laion2B-s32B-b82K" and "EleutherAI/gpt-neox-20b") downloaded or accessible from the Hugging Face Model Hub. - -- Handle potential tokenization errors gracefully using try-except blocks, as demonstrated in the provided methods. - ---- - -## **References and Resources** - -1. CLIP: Connecting Vision and Language with Reinforced Loss - OpenAI: [Link](https://openai.com/blog/clip/) -2. Hugging Face's Transformers library: [Link](https://huggingface.co/transformers/) -3. Documentation on Special Tokens in Transformers: [Link](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.PreTrainedTokenizer.add_special_tokens) - ---- \ No newline at end of file diff --git a/docs/zeta/tokenizers/sentencepiece.md b/docs/zeta/tokenizers/sentencepiece.md deleted file mode 100644 index caaed72..0000000 --- a/docs/zeta/tokenizers/sentencepiece.md +++ /dev/null @@ -1,173 +0,0 @@ -# SentencePieceTokenizer - -`SentencePieceTokenizer` is a class for tokenizing and detokenizing text using a pre-trained SentencePiece model. The SentencePiece model is a unsupervised text tokenizer and detokenizer mainly for Neural Network-based text generation tasks where the vocabulary size is predetermined prior to the neural model training. This class is a part of the zeta library which is a collection of various utility functions and classes for Natural Language Processing tasks. - -## Introduction - -Tokenization is a crucial step in many natural language processing tasks. It involves splitting a piece of text into smaller units, called tokens. These tokens can be as small as characters or as large as words. The `SentencePieceTokenizer` class provides an efficient and easy-to-use way to tokenize and detokenize text using a SentencePiece model. 
- -The SentencePiece model is trained to find the best tokenization by dynamically adjusting the size and boundary of tokens. SentencePiece implements subword units (e.g., byte-pair-encoding (BPE) and unigram language model with the extension of direct training from raw sentences. SentencePiece allows us to make a purely end-to-end system that does not depend on language-specific pre/postprocessing. - -## Class Definition - -```python -class SentencePieceTokenizer: - def __init__(self, model_path: str): - ... -``` - -### Parameters: - -- `model_path (str)`: The path to the pre-trained SentencePiece model. It should be a file with `.model` extension. - -### Attributes: - -- `n_words (int)`: The vocabulary size of the SentencePiece model. -- `bos_id (int)`: The token ID for the beginning of sentence token. -- `eos_id (int)`: The token ID for the end of sentence token. -- `pad_id (int)`: The token ID for the padding token. -- `prefix_id (int, optional)`: The token ID for the prefix token. -- `middle_id (int, optional)`: The token ID for the middle token. -- `suffix_id (int, optional)`: The token ID for the suffix token. -- `eot_id (int, optional)`: The token ID for the end of text token. - -## Methods - -### `encode` - -```python -def encode(self, s: str, bos: bool, eos: bool) -> List[int]: - ... -``` - -Encodes a string into a list of integer token IDs. - -#### Parameters: - -- `s (str)`: The string to be encoded. -- `bos (bool)`: Whether to add the beginning of sentence token at the start. -- `eos (bool)`: Whether to add the end of sentence token at the end. - -#### Returns: - -- `List[int]`: A list of integer token IDs. - -### `decode` - -```python -def decode(self, t: List[int]) -> str: - ... -``` - -Decodes a list of integer token IDs into a string. - -#### Parameters: - -- `t (List[int])`: A list of integer token IDs to be decoded. - -#### Returns: - -- `str`: The decoded string. 
- -### `encode_infilling` - -```python -def encode_infilling(self, s: str) -> List[int]: - ... -``` - -Encodes a string without an implicit leading space. - -#### Parameters: - -- `s (str)`: The string to be encoded. - -#### Returns: - -- `List[int]`: A list of integer token IDs. - -### `decode_infilling` - -```python -def decode_infilling(self, t: List[int]) -> str: - ... -``` - -Decodes a list of integer token IDs into a string without an implicit leading space. - -#### Parameters: - -- `t (List[int])`: A list of integer token IDs to be decoded. - -#### Returns: - -- `str`: The decoded string. - -## Usage Examples - -### Example 1: - -```python -from zeta import SentencePieceTokenizer - -tokenizer = SentencePieceTokenizer(model_path='path/to/your/model.model') -text = "Hello, world!" -tokens = tokenizer.encode(text, bos=True, eos=True) -print(tokens) -# [2, 284, 16, 250, 13, 849, 4, 3] - -decoded_text = tokenizer.decode(tokens) -print(decoded_text) -# "Hello, world!" -``` - -### Example 2: - -```python -from zeta import SentencePieceTokenizer - -tokenizer = SentencePieceTokenizer(model_path='path/to/your/model.model') -text = "Hello, world!" -tokens = tokenizer.encode_infilling(text) -print(tokens) -# [284, 16, 250, 13, 849, 4] - -decoded_text = tokenizer.decode_infilling(tokens) -print(decoded_text) -# "Hello, world!" -``` - -### Example 3: - -```python -from zeta import SentencePieceTokenizer - -tokenizer = SentencePieceTokenizer(model_path='path/to/your/model.model') -tokens = [2, 284, 16, 250, 13, 849, 4, 3] -decoded_text = tokenizer.decode(tokens) -print(decoded_text) -# "Hello, world!" -``` - -## Additional Information - -- Make sure that the model file specified in `model_path` exists. -- The special tokens such as `
<PRE>`, `<MID>`, `<SUF>`, `<EOT>` (the tokens behind `prefix_id`, `middle_id`, `suffix_id`, and `eot_id`) are optional and may not be present in all SentencePiece models.
-
-## References and Resources
-
-- [SentencePiece GitHub Repository](https://github.com/google/sentencepiece)
-- [SentencePiece: A simple and language independent subword tokenizer and detokenizer for Neural Text Generation](https://arxiv.org/abs/1808.06226)
-
-## Mathematical Formulation
-
-The SentencePiece model uses the following mathematical formula for tokenization:
-
-\[P(w) = \prod_{i=1}^{n} P(w_i | w_1, ..., w_{i-1})\]
-
-Where:
-- \(P(w)\) is the probability of the word \(w\).
-- \(n\) is the number of subwords in the word \(w\).
-- \(w_i\) is the \(i\)-th subword of \(w\).
-
-The model is trained to maximize the likelihood of the training data, and the subwords are chosen to minimize the perplexity of the training data.
\ No newline at end of file
diff --git a/docs/zeta/training/nebula.md b/docs/zeta/training/nebula.md
deleted file mode 100644
index 2d729a2..0000000
--- a/docs/zeta/training/nebula.md
+++ /dev/null
@@ -1,138 +0,0 @@
-# Nebula
-
-The `Nebula` class is a custom loss function class that dynamically determines the most suitable loss function for a given dataset based on certain characteristics of the dataset, such as sparsity, correlation, range of values, and user input. It is part of the `zeta` library, is built on PyTorch, and subclasses the `LossFunction` base class shown in the class definition below.
-
-## Introduction
-
-The purpose of the `Nebula` class is to help determine and cache the most suitable loss function for a given dataset without requiring the user to manually select one. This can be particularly useful in scenarios where the user is unsure of the most appropriate loss function to use or in automated systems where the type of problem (classification or regression) is not known a priori.
-
-The `Nebula` class considers various characteristics of the data, such as whether the target values are integers, the sparsity of the target values, the correlation between predictions and target values, and any user or domain knowledge provided, to determine whether the problem is a classification or regression problem and subsequently select an appropriate loss function.
-
-## Class Definition
-
-```python
-class Nebula(LossFunction):
-    def __init__(self, domain_knowledge=None, user_input=None):
-        ...
-```
-
-### Parameters
-
-- `domain_knowledge` (str, optional): Domain knowledge about the problem. It can be either "classification" or "regression". Default is `None`.
-- `user_input` (str, optional): User input about the problem type. It can be either "classification" or "regression". Default is `None`.
-
-### Attributes
-
-- `loss_function`: The determined loss function.
-- `domain_knowledge`: Domain knowledge provided during initialization.
-- `user_input`: User input provided during initialization.
-- `loss_function_cache`: A cache for storing the determined loss function for a dataset.
-- `unique_values_cache`: A cache for storing the unique values in the target variable `y_true`.
-- `class_balance_cache`: A cache for storing the class balance in the target variable `y_true`.
-- `logger`: A logger for logging information during the determination of the loss function.
-
-## Functionality and Usage
-
-The `Nebula` class is used to dynamically determine the most suitable loss function for a given dataset and cache the determined loss function for future use. The class analyzes the unique values, class balance, sparsity, and correlation of the target variable `y_true` and the predicted variable `y_pred` to determine whether the problem is a classification or regression problem and select an appropriate loss function.
-
-### Method: `determine_loss_function`
-
-```python
-def determine_loss_function(self, y_pred, y_true):
-    ...
-```
-
-This method determines the most suitable loss function based on the characteristics of `y_pred` and `y_true`.
-
-#### Parameters
-
-- `y_pred` (Tensor): The predicted values.
-- `y_true` (Tensor): The ground truth values.
-
-### Method: `__call__`
-
-```python
-def __call__(self, y_pred, y_true):
-    ...
-```
-
-This method computes the loss using the determined loss function.
-
-#### Parameters
-
-- `y_pred` (Tensor): The predicted values.
-- `y_true` (Tensor): The ground truth values.
-
-#### Returns
-
-- `Tensor`: The computed loss.
-
-### Usage Examples
-
-#### Example 1: Basic Usage
-
-```python
-from zeta import Nebula
-import torch
-
-# Initialize Nebula
-nebula = Nebula()
-
-# Generate some example data
-y_pred = torch.randn(10, 5)
-y_true = torch.randint(0, 5, (10,))
-
-# Compute the loss
-loss = nebula(y_pred, y_true)
-
-print(loss)
-```
-
-#### Example 2: Providing Domain Knowledge
-
-```python
-from zeta import Nebula
-import torch
-
-# Initialize Nebula with domain knowledge
-nebula = Nebula(domain_knowledge="classification")
-
-# Generate some example data
-y_pred = torch.randn(10, 5)
-y_true = torch.randint(0, 5, (10,))
-
-# Compute the loss
-loss = nebula(y_pred, y_true)
-
-print(loss)
-```
-
-#### Example 3: Providing User Input
-
-```python
-from zeta import Nebula
-import torch
-
-# Initialize Nebula with user input
-nebula = Nebula(user_input="regression")
-
-# Generate some example data
-y_pred = torch.randn(10, 1)
-y_true = torch.randn(10, 1)
-
-# Compute the loss
-loss = nebula(y_pred, y_true)
-
-print(loss)
-```
-
-## Mathematical Formula
-
-The `Nebula` class does not have a specific mathematical formula as it dynamically determines the most suitable loss function based on the characteristics of the data. However, the determined loss function will have its own mathematical formula, which can be found in the PyTorch documentation or the `zeta` library documentation.
-
-## Additional Information and Tips
-
-- The `Nebula` class caches the determined loss function, unique values, and class balance for a given dataset to avoid recomputing them in the future.
-- If both `domain_knowledge` and `user_input` are provided, `domain_knowledge` will take precedence over `user_input`.
-- The `Nebula` class uses the `logging` module to log information during the determination of the loss function. You can customize the logging settings by modifying the `logger` attribute.
-
diff --git a/docs/zeta/training/optimizers/decoupled_lion.md b/docs/zeta/training/optimizers/decoupled_lion.md
deleted file mode 100644
index fc3329e..0000000
--- a/docs/zeta/training/optimizers/decoupled_lion.md
+++ /dev/null
@@ -1,158 +0,0 @@
-# DecoupledLionW Optimizer
-
-## Overview and Introduction
-
-`DecoupledLionW` is a PyTorch optimizer designed to improve training performance and convergence for deep learning models. It is an extension of the Lion optimizer, which incorporates decoupled weight decay and a momentum-based update rule. 
-
-The optimizer uses an Adam-like update rule in which the weight decay is applied separately from the gradient update. Decoupling the two helps prevent overfitting, improves generalization, and supports faster convergence and smoother optimization.
-
-### Key Concepts:
-
-- **Weight Decay:** Reduces the magnitude of the model's weights, preventing overfitting and improving generalization.
-- **Momentum Update:** An interpolation between the current gradient and the previous momentum state, allowing for faster convergence and smoother optimization.
-- **Momentum Decay:** Gradually reduces the momentum term over time, preventing it from becoming too large and destabilizing the optimization process.
-
-## Class Definition
-
-```python
-class DecoupledLionW(Optimizer):
-    def __init__(
-            self,
-            params,
-            lr: float = 1e-4,
-            betas: Tuple[float, float] = (0.9, 0.99),
-            weight_decay: float = 0.0,
-    ):
-```
-
-### Parameters
-
-- `params` (iterable): Iterable of parameters to optimize or dictionaries defining parameter groups.
-- `lr` (float, optional): Learning rate. Default: 1e-4.
-- `betas` (Tuple[float, float], optional): Interpolation coefficients. The first blends the momentum with the current gradient to form the update; the second controls the moving average that updates the momentum. Default: (0.9, 0.99).
-- `weight_decay` (float, optional): Decoupled weight decay coefficient, applied directly to the weights rather than added to the loss as an L2 penalty. Default: 0.
-
-### Attributes
-
-- `metric_functions`: A dictionary of lambda functions to compute various metrics like L2 norm of moments, parameters, updates, and gradients, as well as cosine similarity between updates and gradients.
-
-## Functionality and Usage
-
-### `lionw` Method
-
-This static method is responsible for applying the weight decay, momentum update, and momentum decay.
-
-```python
-@staticmethod
-def lionw(p, grad, exp_avg, lr, initial_lr, wd, beta1, beta2) -> None:
-```
-
-#### Parameters
-
-- `p` (Tensor): Parameter tensor.
-- `grad` (Tensor): Gradient tensor.
-- `exp_avg` (Tensor): Exponential moving average of gradient values.
-- `lr` (float): Learning rate.
-- `initial_lr` (float): Initial learning rate.
-- `wd` (float): Weight decay.
-- `beta1` (float): Interpolation coefficient that blends the momentum (`exp_avg`) with the current gradient to form the update.
-- `beta2` (float): Decay rate of the moving average used to update the momentum (`exp_avg`).
-
-### `step` Method
-
-Performs a single optimization step.
-
-```python
-@torch.no_grad()
-def step(self, closure: Optional[Callable] = None):
-```
-
-#### Parameters
-
-- `closure` (callable, optional): A closure that reevaluates the model and returns the loss.
-
-#### Returns
-
-- `loss` (float, optional): The loss value if `closure` is provided. None otherwise.
-
-### `pre_reduce_metrics` Method
-
-This method preprocesses the metrics before reduction across nodes.
-
-```python
-def pre_reduce_metrics(self, optimizer_metrics):
-```
-
-#### Parameters
-
-- `optimizer_metrics` (dict): A dictionary containing the optimizer metrics.
-
-#### Returns
-
-- `optimizer_metrics` (dict): The pre-processed optimizer metrics.
-
-### `report_per_parameter_metrics` Method
-
-This method reports the per-parameter metrics.
-
-```python
-def report_per_parameter_metrics(self, param: torch.Tensor, name: str, optimizer_metrics: dict):
-```
-
-#### Parameters
-
-- `param` (Tensor): Parameter tensor.
-- `name` (str): Name of the parameter.
-- `optimizer_metrics` (dict): A dictionary containing the optimizer metrics.
-
-#### Returns
-
-- `optimizer_metrics` (dict): The optimizer metrics with the reported per-parameter metrics.
-
-## Usage Examples
-
-```python
-from zeta import DecoupledLionW
-import torch
-
-# Define model parameters
-params = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
-
-# Define optimizer (it expects an iterable of parameters, so wrap the tensor in a list)
-optimizer = DecoupledLionW([params], lr=0.1, betas=(0.9, 0.999), weight_decay=0.01)
-
-# Define loss function
-loss_fn = torch.nn.MSELoss()
-
-# Forward pass (in this toy example the parameters themselves are the model output)
-output = params
-target = torch.tensor([0.0, 1.0, 2.0])
-loss = loss_fn(output, target)
-
-# Backward pass
-loss.backward()
-
-# Optimization step
-optimizer.step()
-```
-
-## Mathematical Formula
-
-The update rule of the optimizer can be represented by the following formula:
-
-\[ p = p - \alpha \cdot \text{sign}(\beta_1 \cdot m + (1-\beta_1) \cdot g) - \eta \cdot wd \cdot p \]
-
-Where:
-
-- \( p \) is the parameter.
-- \( \alpha \) is the learning rate.
-- \( \beta_1 \) is the exponential decay rate for the first moment estimates.
-- \( m \) is the momentum (exponential moving average of gradient values).
-- \( g \) is the gradient.
-- \( \eta \) is the decay factor.
-- \( wd \) is the weight decay.
-
-## Additional Information and Tips
-
-- A high value of `weight_decay` can lead to a large reduction in the model's weights on every step. Be sure to choose a value appropriate for your specific use case.
-- The optimizer supports both single-node and multi-node distributed training, enabling efficient training on parallel computing environments.
diff --git a/docs/zeta/training/optimizers/sophia.md b/docs/zeta/training/optimizers/sophia.md
deleted file mode 100644
index 298f3d8..0000000
--- a/docs/zeta/training/optimizers/sophia.md
+++ /dev/null
@@ -1,108 +0,0 @@
-# SophiaG Optimizer for Zeta Library
-
-## Overview
-
-The SophiaG optimizer is designed to adaptively change learning rates during training, offering a combination of momentum-based acceleration and second-order Hessian-based adaptive learning rates. This optimizer is particularly useful for training deep neural networks and optimizing complex, non-convex loss functions. Key features include:
-
-1. **Momentum**: Utilizes exponentially moving averages of gradients.
-2. **Adaptive Learning Rate**: Adjusts the learning rate based on the second-order Hessian information.
-3. **Regularization**: Applies weight decay to avoid overfitting.
-4. **Optional Settings**: Allows for maximizing the loss function, customizable settings for capturable and dynamic parameters.
-
-## Class Definition
-
-```python
-class SophiaG(Optimizer):
-    def __init__(self, params, lr=1e-4, betas=(0.965, 0.99), rho=0.04,
-                 weight_decay=1e-1, *, maximize: bool = False,
-                 capturable: bool = False, dynamic: bool = False):
-```
-
-### Parameters:
-
-- `params` (iterable): Iterable of parameters to optimize.
-- `lr` (float, default=1e-4): Learning rate.
-- `betas` (Tuple[float, float], default=(0.965, 0.99)): Coefficients used for computing running averages of gradient and Hessian.
-- `rho` (float, default=0.04): Damping factor for Hessian-based updates.
-- `weight_decay` (float, default=1e-1): Weight decay factor.
-- `maximize` (bool, default=False): Whether to maximize the loss function.
-- `capturable` (bool, default=False): Enable/Disable special capturing features.
-- `dynamic` (bool, default=False): Enable/Disable dynamic adjustments of the optimizer.
-
-## Usage and Functionality
-
-### 1. Initialization
-
-Upon initialization, the optimizer performs validation on its parameters and sets them as the default parameters for parameter groups.
-
-```python
-from zeta import SophiaG
-
-optimizer = SophiaG(model.parameters(), lr=0.01, betas=(0.9, 0.999), weight_decay=1e-4)
-```
-
-### 2. Step Forward
-
-The `.step()` method updates the model parameters. It is decorated with `@torch.no_grad()` so that the parameter updates themselves are not tracked by autograd.
-
-```python
-loss = criterion(output, target)
-loss.backward()
-optimizer.step()
-```
-
-### 3. Update Hessian and Exponential Average
-
-The optimizer has internal methods to update the Hessian and Exponential Moving Average (EMA) of the gradients, controlled by `betas`.
-
-### 4. SophiaG Function
-
-The core SophiaG function updates the parameters based on the gradient (`grad`), moving average (`exp_avg`), and Hessian (`hessian`). It uses the following update formula:
-
-\[ \text{param} = \text{param} - \text{lr} \times \left( \text{beta}_1 \times \text{exp_avg} + \frac{(1-\text{beta}_1) \times \text{grad}}{( \text{beta}_2 \times \text{hessian} + (1-\text{beta}_2) )^{\rho}} \right) \]
-
-## Usage Examples
-
-### 1. Basic Usage:
-
-```python
-from zeta import SophiaG
-import torch
-import torch.nn as nn
-
-model = nn.Linear(10, 1)
-optimizer = SophiaG(model.parameters(), lr=0.01)
-```
-
-### 2. Customizing Betas and Learning Rate:
-
-```python
-from zeta import SophiaG
-import torch
-
-optimizer = SophiaG(model.parameters(), lr=0.001, betas=(0.9, 0.999))
-```
-
-### 3. Using with Weight Decay:
-
-```python
-from zeta import SophiaG
-
-optimizer = SophiaG(model.parameters(), lr=0.01, weight_decay=1e-4)
-```
-
-## Additional Information and Tips
-
-- Make sure that the parameters passed are compatible with the model you are using.
-- To maximize the loss function (useful in adversarial training), set `maximize=True`.
-
-## Common Issues
-
-- If sparse gradients are involved, the SophiaG optimizer is not applicable.
-
-## References and Resources
-
-- [Adaptive Learning Rates](https://arxiv.org/pdf/1609.04747)
-- [Zeta Documentation](https://zeta.apac.ai)
-
-For further questions or issues, visit our [GitHub repository](https://github.com/kyegomez/zeta).
diff --git a/docs/zeta/training/train.md b/docs/zeta/training/train.md
deleted file mode 100644
index d6ac0e7..0000000
--- a/docs/zeta/training/train.md
+++ /dev/null
@@ -1,139 +0,0 @@
-# Documentation for `Trainer` Module from Zeta Library
-
----
-
-## Introduction
-
-The `Trainer` module from the Zeta library provides an easy-to-use, flexible, and scalable approach to training deep learning models. By abstracting away many of the lower-level details of training, including distributed training, gradient accumulation, and model checkpointing, `Trainer` allows developers to focus on the high-level aspects of model development and experimentation.
-
-This module also integrates seamlessly with the HuggingFace `Accelerator` to enable mixed precision training, GPU acceleration, and distributed training across multiple nodes or GPUs.
-
----
-
-## `Trainer` Function Definition
-
-```python
-def Trainer(
-        gradient_accumulate_every: int = None, 
-        batch_size: int = None, 
-        seq_len: int = None,
-        entity_name: str = None,
-        model = None,
-        use_fsdp: bool = False,
-        use_activation_checkpointing: bool = False,
-        learning_rate = None,
-        seed = None,
-        use_pretokenized: bool = False,
-        resume_from_checkpoint = None,
-        checkpointing_steps = None,
-        output_dir = None,
-        weight_decay = None,
-        use_deepspeed = None
-    ):
-```
-
-### Parameters
-
-- `gradient_accumulate_every` (`int`, optional): Specifies how often to accumulate gradients. Default: `None`.
-- `batch_size` (`int`, optional): Specifies the batch size for training. Default: `None`.
-- `seq_len` (`int`, optional): Sequence length for model inputs. Default: `None`.
-- `entity_name` (`str`, optional): Name of the entity for logging purposes. Default: `None`.
-- `model`: The model to train. Default: `None`.
-- `use_fsdp` (`bool`, optional): Whether or not to use Fully Sharded Data Parallelism (FSDP). Default: `False`.
-- `use_activation_checkpointing` (`bool`, optional): Use activation checkpointing to save memory during training. Default: `False`.
-- `learning_rate`: The learning rate for training. Default: `None`.
-- `seed`: Random seed for reproducibility. Default: `None`.
-- `use_pretokenized` (`bool`, optional): Whether to use pre-tokenized data. Default: `False`.
-- `resume_from_checkpoint`: Path to a checkpoint to resume training from. Default: `None`.
-- `checkpointing_steps`: How often to save model checkpoints. Default: `None`.
-- `output_dir`: Directory to save final trained model and checkpoints. Default: `None`.
-- `weight_decay`: Weight decay value for regularization. Default: `None`.
-- `use_deepspeed`: Whether to use deepspeed for training optimization. Default: `None`.
-
----
-
-## Functionality and Usage
-
-The primary function of the `Trainer` module is to handle the training process, including data loading, optimization, and model updates. It leverages HuggingFace's `Accelerator` to provide accelerated training on GPUs and distributed environments.
-
-Here are the primary steps:
-
-1. Initialization of the `Accelerator` for GPU training and gradient accumulation.
-2. Model and optimizer initialization.
-3. Loading datasets and setting up data loaders.
-4. Training loop with gradient accumulation and model checkpointing.
-5. Saving of the final trained model.
-
-### Code Examples
-
-**1. Basic Usage**
-
-```python
-from zeta import Trainer
-
-model = ... # Your model definition here
-Trainer(
-    gradient_accumulate_every=2,
-    batch_size=32,
-    seq_len=128,
-    model=model,
-    learning_rate=0.001,
-    seed=42,
-    output_dir='./models/'
-)
-```
-
-**2. Resuming Training from a Checkpoint**
-
-```python
-from zeta import Trainer
-
-model = ... # Your model definition here
-Trainer(
-    gradient_accumulate_every=2,
-    batch_size=32,
-    seq_len=128,
-    model=model,
-    learning_rate=0.001,
-    seed=42,
-    resume_from_checkpoint='./models/checkpoint.pt',
-    output_dir='./models/'
-)
-```
-
-**3. Using FSDP and Activation Checkpointing**
-
-```python
-from zeta import Trainer
-
-model = ... # Your model definition here
-Trainer(
-    gradient_accumulate_every=2,
-    batch_size=32,
-    seq_len=128,
-    model=model,
-    use_fsdp=True,
-    use_activation_checkpointing=True,
-    learning_rate=0.001,
-    seed=42,
-    output_dir='./models/'
-)
-```
-
----
-
-## Mathematical Description
-
-Given a dataset \( D \) consisting of data points \( \{ (x_1, y_1), (x_2, y_2), \ldots, (x_N, y_N) \} \), the trainer aims to minimize the loss function \( L \) with respect to model parameters \( \theta \):
-
-\[ \theta^* = \arg\min_{\theta} \frac{1}{N} \sum_{i=1}^{N} L(f(x_i; \theta), y_i) \]
-
-
-
-where \( f \) is the model's prediction function.
-
----
-
-## Conclusions
-
-The `Trainer` module from the Zeta library streamlines the training process by abstracting away many complexities, making it a valuable tool for developers at all experience levels. Whether you are training a simple model or a complex architecture in a distributed environment, the `Trainer` module offers the flexibility and ease of use to get your models trained efficiently.
\ No newline at end of file
diff --git a/example.py b/example.py
deleted file mode 100644
index e69de29..0000000
diff --git a/mkdocs.yml b/mkdocs.yml
deleted file mode 100644
index 08107b0..0000000
--- a/mkdocs.yml
+++ /dev/null
@@ -1,124 +0,0 @@
-site_name: Package Docs
-plugins:
-  - glightbox
-  - search
-copyright: "© APAC Corp, Inc."
-extra_css:
-  - docs/assets/css/extra.css
-extra:
-  # analytics:
-  #   provider: google
-  #   property: G-QM8EDPSCB6
-  social:
-    - icon: fontawesome/solid/house
-      link: assets/img/ZetaLogoIcon.png
-    - icon: fontawesome/brands/discord
-      link: https://discord.gg/qUtxnK2NMf
-    - icon: fontawesome/brands/github
-      link: https://github.com/kyegomez/Zeta/
-    - icon: fontawesome/brands/python
-      link: https://pypi.org/project/Zeta/
-theme:
-    name: material
-    custom_dir: docs/overrides
-    logo: assets/img/ZetaLogoIcon.png
-    palette:
-      # Palette toggle for light mode
-    - scheme: default
-      primary: 'custom'
-      toggle:
-        icon: material/brightness-7 
-        name: Switch to dark mode
-    # Palette toggle for dark mode
-    - scheme: slate
-      primary: 'custom'
-      accent: light blue
-      toggle:
-        icon: material/brightness-4
-        name: Switch to light mode
-    features:
-        - content.code.copy
-        - content.code.annotate
-        - navigation.tabs
-        - navigation.sections
-        - navigation.expand
-        - navigation.top
-        - announce.dismiss
-    font:
-      text: Roboto
-      code: Roboto Mono
-
-extra_css:
-  - stylesheets/extra.css
-
-markdown_extensions:
-  - pymdownx.highlight:
-      anchor_linenums: true
-      line_spans: __span
-      pygments_lang_class: true
-  - admonition
-  - pymdownx.inlinehilite
-  - pymdownx.snippets
-  - pymdownx.superfences
-  - pymdownx.details
-  - pymdownx.tabbed
-  - tables
-  - def_list
-  - footnotes
-
-
-nav:
-- Home:
-    - Overview: "index.md"
-    - Contributing: "contributing.md"
-    - FAQ: "faq.md"
-    - Purpose: "purpose.md"
-    - Roadmap: "roadmap.md"
-    - Design: "design.md"
-    - Flywheel: "flywheel.md"
-    - Bounties: "bounties.md"
-    - Metric: "metric.md"
-    - Distribution: "distribution.md"
-    - Research: "research.md"
-    - Demos: "demos.md"
-    - Architecture: "architecture.md"
-    - Checklist: "checklist.md"
-    - Hiring: "hiring.md"
-- Zeta:
-    - Overview: "zeta/index.md"
-    - zeta.nn:
-      - zeta.nn.biases: 
-        - Xpos: "zeta/nn/biases/xpos.md"
-        - RelativePositionBias: "zeta/nn/biases/relative_bias.md"
-        - AlibiPositionalBias: "zeta/nn/biases/alibi.md"
-      - zeta.nn.embeddings:
-        - MultiWay: "zeta/nn/embeddings/multiway.md"
-        - RotaryEmbeddings: "zeta/nn/embeddings/rope.md"
-        - TruncatedRotaryEmbedding: "zeta/nn/embeddings/truncated_rope.md"
-      - zeta.nn.modules:
-        - Lora: "zeta/nn/modules/lora.md"
-        - TokenLearner: "zeta/nn/modules/token_learner.md"
-      - zeta.nn.attention:
-        - FlashAttention: "zeta/nn/attention/flash_attention.md"
-        - MultiQueryAttention: "zeta/nn/attention/multiquery.md"
-        - MultiheadAttention: "zeta/nn/attention/multihead.md"
-        - FlashAttentionTwo: "zeta/nn/attention/flash2.md"
-        - BaseAttention: "zeta/nn/attention/base.md"
-      - zeta.nn.architecture:
-        - Decoder: "zeta/nn/architecture/decoder.md"
-        - Transformer: "zeta/nn/architecture/transformer.md"
-    - zeta.training:
-      - train: "zeta/training/train.md"
-      - zeta.training.loss:
-        - Nebula: "zeta/training/nebula.md"
-      - zeta.training.optimizers:
-        - DecoupledLionW: "zeta/training/optimizers/decoupled_lion.md"
-        - SophiaG: "zeta/training/optimizers/sophia.md"
-    - zeta.tokenizers:
-        - MultiModalTokenizer: "zeta/tokenizers/multi_modal_tokenizer.md"
-        - LanguageTokenizerGPTX: "zeta/tokenizers/language_tokenizer.md"
-        - SentencePieceTokenizer: "zeta/tokenizers/sentencepiece.md"
-- Examples:
-    - Overview: "examples/index.md"
-    - FlashAttention: "examples/nn/attentions/flash.md"
-    
\ No newline at end of file
diff --git a/package/__init__.py b/package/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/package/main.py b/package/main.py
deleted file mode 100644
index e69de29..0000000
diff --git a/package/subfolder/__init__.py b/package/subfolder/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/package/subfolder/main.py b/package/subfolder/main.py
deleted file mode 100644
index e69de29..0000000