-
Notifications
You must be signed in to change notification settings - Fork 39
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add set() method and clear() method to VLite class
- Loading branch information
Showing
6 changed files
with
228 additions
and
65 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# CI workflow: run the unit-test suite on every push.
name: Run Unit Tests

on: [push]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.8
        uses: actions/setup-python@v2
        with:
          python-version: '3.8'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          # Add any other dependencies here
          # NOTE(review): no project requirements are installed here — the test
          # step below will fail on missing imports unless the suite is
          # stdlib-only; confirm whether `pip install -r requirements.txt`
          # (or `pip install .`) is needed.
      - name: Run tests
        run: python ./tests/unit.py
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
from random_word import RandomWords | ||
import tiktoken | ||
|
||
def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Return the number of tokens *string* encodes to under *encoding_name*.

    Args:
        string: Text to tokenize.
        encoding_name: tiktoken encoding name (default ``cl100k_base``).

    Returns:
        The token count of the encoded text.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    return len(tokenizer.encode(string))
|
||
def generate_string_of_length(target_tokens: int) -> str:
    """Build a space-separated string of random words whose token count
    (as measured by ``num_tokens_from_string``) is as close to
    *target_tokens* as possible without exceeding it.

    Args:
        target_tokens: Token budget for the returned string.

    Returns:
        A random-word string measuring at most ``target_tokens`` tokens
        (empty if the budget is 0 or negative).
    """
    r = RandomWords()
    words: list[str] = []
    current_tokens = 0

    # Greedily collect random words until the approximate budget is full.
    while current_tokens < target_tokens:
        word = r.get_random_word()
        if not word:
            # BUG FIX: random_word occasionally returns None, which the
            # original passed straight into the tokenizer and crashed on.
            continue
        word_tokens = num_tokens_from_string(word)
        if current_tokens + word_tokens > target_tokens:
            break
        words.append(word)
        current_tokens += word_tokens + 1  # +1 approximates the joining space

    generated_string = " ".join(words)

    # BUG FIX: the per-word estimate drifts from reality (the last word has
    # no trailing space, and "space + word" need not tokenize to exactly
    # word_tokens + 1), and the original trim loop could pop from an empty
    # list / drive its counter negative. Re-measure the actual string and
    # drop words from the end until it truly fits the budget.
    while words and num_tokens_from_string(generated_string) > target_tokens:
        words.pop()
        generated_string = " ".join(words)

    return generated_string
|
||
def _report(sample: str, target: int) -> None:
    """Print a generated sample followed by its measured token count."""
    print(f"String of {target} tokens:\n{sample}")
    print(f"Actual token count: {num_tokens_from_string(sample)}")


# Generate and report a string of 512 tokens.
string_512_tokens = generate_string_of_length(512)
_report(string_512_tokens, 512)

print("\n" + "-" * 50 + "\n")

# Generate and report a string of 8192 tokens.
string_8192_tokens = generate_string_of_length(8192)
_report(string_8192_tokens, 8192)
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
import unittest | ||
import numpy as np | ||
from vlite.main import VLite | ||
import os | ||
from vlite.utils import process_pdf | ||
import cProfile | ||
from pstats import Stats | ||
import matplotlib.pyplot as plt | ||
import time | ||
|
||
class TestVLite(unittest.TestCase):
    """Timed unit tests for the VLite vector store.

    Each test records its wall-clock duration in the class-level
    ``test_times`` dict, which ``tearDownClass`` prints as a summary.
    """

    # Shared timing registry, populated by the individual tests.
    test_times = {}

    def setUp(self):
        # Fresh collection per test; tearDown removes its backing file.
        self.vlite = VLite("vlite-unit")

    def tearDown(self):
        # Remove the on-disk collection so tests stay independent.
        if os.path.exists('vlite-unit'):
            print("[+] Removing vlite")
            os.remove('vlite-unit')

    def test_add__text(self):
        """Add a single short text and verify the collection count."""
        start_time = time.time()
        text = "This is a test text."
        metadata = {"source": "test"}
        self.vlite.add(text, metadata=metadata)
        self.assertEqual(self.vlite.count(), 1)
        end_time = time.time()
        TestVLite.test_times["add_single_text"] = end_time - start_time
        print(f"Count of texts in the collection: {self.vlite.count()}")

    def test_add_texts(self):
        """Add a ~512-token inline text plus an ~8192-token text from disk."""
        start_time = time.time()
        text_512tokens = "underreckoning fleckiness hairstane paradigmatic eligibility sublevate xviii achylia reremice flung outpurl questing gilia unosmotic unsuckled plecopterid excludable phenazine fricando unfledgedness spiritsome incircle desmogenous subclavate redbug semihoral district chrysocolla protocoled servius readings propolises javali dujan stickman attendee hambone obtusipennate tightropes monitorially signaletics diestrums preassigning spriggy yestermorning margaritic tankfuls aseptify linearity hilasmic twinning tokonoma seminormalness cerebrospinant refroid doghouse kochab dacryocystalgia saltbushes newcomer provoker berberid platycoria overpersuaded reoverflow constrainable headless forgivably syzygal purled reese polyglottonic decennary embronze pluripotent equivocally myoblasts thymelaeaceous confervae perverted preanticipate mammalogical desalinizing tackets misappearance subflexuose concludence effluviums runtish gras cuckolded hemostasia coatroom chelidon policizer trichinised frontstall impositions unta outrance scholium fibrochondritis furcates fleaweed housefront helipads hemachate snift appellativeness knobwood superinclination tsures haberdasheries unparliamented reexecution nontangential waddied desolated subdistinctively undiscernibleness swishiest dextral progs koprino bruisingly unloanably bardash uncuckoldedunderreckoning fleckiness hairstane paradigmatic eligibility sublevate xviii achylia reremice flung outpurl questing gilia unosmotic unsuckled plecopterid excludable phenazine fricando unfledgedness spiritsome incircle desmogenous subclavate redbug semihoral district chrysocolla spriggy yestermorning margaritic tankfuls aseptify linearity hilasmic twinning tokonoma seminormalness cerebrospinant refroequivocally myoblasts thymelaeaceous confervae perverted preantiest dextral progs koprino bruisingly unloanably bardash uncuckolded"
        metadata = {"source": "test_512tokens"}
        self.vlite.add(text_512tokens, metadata=metadata)
        with open("data/text-8192tokens.txt", "r") as file:
            text_8192tokens = file.read()
        metadata = {"source": "test_8192tokens"}
        self.vlite.add(text_8192tokens, metadata=metadata)
        end_time = time.time()
        TestVLite.test_times["add_multiple_texts"] = end_time - start_time
        print(f"Count of texts in the collection: {self.vlite.count()}")

    def test_add_pdf(self):
        """Time the PDF-processing path.

        NOTE(review): process_pdf's result is discarded and nothing is added
        to self.vlite here, and no assertion is made — this only measures
        runtime. Confirm whether the processed chunks were meant to be added.
        """
        start_time = time.time()
        process_pdf('data/gpt-4.pdf')
        end_time = time.time()
        TestVLite.test_times["add_pdf"] = end_time - start_time
        # time to add 71067 tokens from the GPT-4 paper
        print(f"Time to add 71067 tokens: {TestVLite.test_times['add_pdf']} seconds")

    def test_retrieve(self):
        """Run a batch of queries through retrieve() and print similarities.

        NOTE(review): as with test_add_pdf, process_pdf's output is never
        added to self.vlite, so retrieval runs against an empty collection —
        confirm intent.
        """
        queries = [
            "What is the architecture of GPT-4?",
            "How does GPT-4 handle contextual understanding?",
            "What are the key improvements in GPT-4 over GPT-3?",
            "How many parameters does GPT-4 have?",
            "What are the limitations of GPT-4?",
            "What datasets were used to train GPT-4?",
            "How does GPT-4 handle longer context?",
            "What is the computational requirement for training GPT-4?",
            "What techniques were used to train GPT-4?",
            "What is the impact of GPT-4 on natural language processing?",
            "What are the use cases demonstrated in the GPT-4 paper?",
            "What are the evaluation metrics used in GPT-4's paper?",
            "What kind of ethical considerations are discussed in the GPT-4 paper?",
            "How does the GPT-4 handle tokenization?",
            "What are the novel contributions of the GPT-4 model?"
        ]
        process_pdf('data/gpt-4.pdf')
        start_time = time.time()
        # BUG FIX: was `self.queries`, which does not exist (queries is a
        # local variable) and raised AttributeError on every run.
        for query in queries:
            _, top_sims, _ = self.vlite.retrieve(query)
            print(f"Top similarities for query '{query}': {top_sims}")
        end_time = time.time()
        TestVLite.test_times["retrieve"] = end_time - start_time

    def test_delete(self):
        """Add one text, then time deleting it by index."""
        self.vlite.add("This is a test text.")
        start_time = time.time()
        self.vlite.delete(0)
        end_time = time.time()
        TestVLite.test_times["delete"] = end_time - start_time
        print(f"Count of texts in the collection: {self.vlite.count()}")

    def test_update(self):
        """Add one text, then time updating it in place."""
        self.vlite.add("This is a test text.")
        start_time = time.time()
        self.vlite.update(0, "This is an updated text.")
        end_time = time.time()
        TestVLite.test_times["update"] = end_time - start_time
        print(f"Count of texts in the collection: {self.vlite.count()}")

    @classmethod
    def tearDownClass(cls):
        # Print the per-test timing summary collected across the suite.
        print("\nTest times:")
        for test_name, test_time in cls.test_times.items():
            print(f"{test_name}: {test_time:.4f} seconds")
|
||
# Run the suite with verbose per-test output when executed directly.
if __name__ == '__main__':
    unittest.main(verbosity=2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters