Skip to content

Commit

Permalink
Revert "Update sources in with_openai (#21405)" (#21506)
Browse files Browse the repository at this point in the history
  • Loading branch information
schrockn authored Apr 30, 2024
1 parent 9bc32ed commit 9d97ffa
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 46 deletions.
11 changes: 1 addition & 10 deletions examples/with_openai/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,6 @@
setup(
name="with_openai",
packages=find_packages(exclude=["with_openai_tests"]),
-install_requires=[
-"dagster",
-"dagster-openai",
-"dagster-cloud",
-"faiss-cpu==1.8.0",
-"filelock",
-"langchain==0.1.11",
-"langchain-community==0.0.34",
-"langchain-openai==0.1.3",
-],
+install_requires=["dagster", "dagster-openai", "langchain==0.1.11"],
extras_require={"dev": ["dagster-webserver", "pytest"]},
)
30 changes: 7 additions & 23 deletions examples/with_openai/with_openai/assets.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,7 @@
from .utils import get_github_docs

docs_partitions_def = StaticPartitionsDefinition(
-[
-"about",
-"community",
-"concepts",
-"dagster-plus",
-"deployment",
-"getting-started",
-"guides",
-"integrations",
-"tutorial",
-]
+["concepts", "dagster-cloud", "deployment", "guides", "integrations"]
)


Expand Down Expand Up @@ -73,24 +63,18 @@ class OpenAIConfig(Config):
question: str


-@asset(compute_kind="OpenAI", deps=[search_index])
-def completion(
-context: AssetExecutionContext,
-openai: OpenAIResource,
-config: OpenAIConfig,
-):
+@asset(compute_kind="OpenAI")
+def completion(context: AssetExecutionContext, openai: OpenAIResource, config: OpenAIConfig):
with open(SEARCH_INDEX_FILE, "rb") as f:
serialized_search_index = pickle.load(f)
search_index = FAISS.deserialize_from_bytes(serialized_search_index, OpenAIEmbeddings())
with openai.get_client(context) as client:
prompt = stuff_prompt.PROMPT
model = ChatOpenAI(client=client.chat.completions, model=config.model, temperature=0)
-summaries = " ".join(
-[
-SUMMARY_TEMPLATE.format(content=doc.page_content, source=doc.metadata["source"])
-for doc in search_index.similarity_search(config.question, k=4)
-]
-)
+summaries = [
+SUMMARY_TEMPLATE.format(content=doc.page_content, source=doc.metadata["source"])
+for doc in search_index.similarity_search(config.question, k=4)
+]
output_parser = StrOutputParser()
chain = prompt | model | output_parser
context.log.info(chain.invoke({"summaries": summaries, "question": config.question}))
Expand Down
30 changes: 17 additions & 13 deletions examples/with_openai/with_openai/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
-import io
import os
import pathlib
+import subprocess
import tempfile
-import zipfile

import requests
from langchain.docstore.document import Document
Expand All @@ -19,17 +18,22 @@ def get_wiki_data(title, first_paragraph_only):
)


-def get_github_docs(repo_owner, repo_name, category, archive_name="master"):
+def get_github_docs(repo_owner, repo_name, category):
with tempfile.TemporaryDirectory() as d:
-# The archive name can be a branch, tag or commit.
-r = requests.get(f"https://github.com/{repo_owner}/{repo_name}/archive/{archive_name}.zip")
-z = zipfile.ZipFile(io.BytesIO(r.content))
-z.extractall(d)
-root_path = pathlib.Path(os.path.join(d, f"{repo_name}-{archive_name}"))
-docs_path = root_path.joinpath("docs/content", category)
-markdown_files = list(docs_path.glob("*.md*")) + list(docs_path.glob("*/*.md*"))
-for markdown_file in markdown_files:
+subprocess.check_call(
+f"git clone --depth 1 https://github.com/{repo_owner}/{repo_name}.git .",
+cwd=d,
+shell=True,
+)
+git_sha = (
+subprocess.check_output("git rev-parse HEAD", shell=True, cwd=d).decode("utf-8").strip()
+)
+docs_path = pathlib.Path(os.path.join(d, "docs/content", category))
+markdown_files = list(docs_path.glob("*/*.md")) + list(docs_path.glob("*/*.mdx"))
+for index, markdown_file in enumerate(markdown_files):
with open(markdown_file, "r") as f:
-relative_path = markdown_file.relative_to(root_path)
-github_url = f"https://github.com/{repo_owner}/{repo_name}/blob/{archive_name}/{relative_path}"
+relative_path = markdown_file.relative_to(docs_path)
+github_url = (
+f"https://github.com/{repo_owner}/{repo_name}/blob/{git_sha}/{relative_path}"
+)
yield Document(page_content=f.read(), metadata={"source": github_url})

0 comments on commit 9d97ffa

Please sign in to comment.