Revert "Update sources in with_openai (#21405)" (#21506)

## Summary & Motivation

Reverts #21405, which appears to have broken this pyright-related test: https://buildkite.com/dagster/dagster-dagster/builds/81462#018f2a9c-eced-4b36-815e-64128eb6729d

## How I Tested These Changes

BK (Buildkite). Specifically `dagster-dbt dbt15-core 3.11`: [Before](https://buildkite.com/dagster/dagster-dagster/builds/81462#018f2a9c-edc3-4050-9932-8974578742b9), [After](https://buildkite.com/dagster/dagster-dagster/builds/81526#018f2c5e-c215-4058-9c49-5d9b116ce06f)
dagster-io · Apr 30, 2024 · 9d97ffa · 9d97ffa
1 parent 9bc32ed
commit 9d97ffa
Show file tree

Hide file tree

Showing 3 changed files with 25 additions and 46 deletions.
diff --git a/examples/with_openai/setup.py b/examples/with_openai/setup.py
@@ -3,15 +3,6 @@
 setup(
     name="with_openai",
     packages=find_packages(exclude=["with_openai_tests"]),
-    install_requires=[
-        "dagster",
-        "dagster-openai",
-        "dagster-cloud",
-        "faiss-cpu==1.8.0",
-        "filelock",
-        "langchain==0.1.11",
-        "langchain-community==0.0.34",
-        "langchain-openai==0.1.3",
-    ],
+    install_requires=["dagster", "dagster-openai", "langchain==0.1.11"],
     extras_require={"dev": ["dagster-webserver", "pytest"]},
 )
diff --git a/examples/with_openai/with_openai/assets.py b/examples/with_openai/with_openai/assets.py
@@ -22,17 +22,7 @@
 from .utils import get_github_docs
 
 docs_partitions_def = StaticPartitionsDefinition(
-    [
-        "about",
-        "community",
-        "concepts",
-        "dagster-plus",
-        "deployment",
-        "getting-started",
-        "guides",
-        "integrations",
-        "tutorial",
-    ]
+    ["concepts", "dagster-cloud", "deployment", "guides", "integrations"]
 )
 
 
@@ -73,24 +63,18 @@ class OpenAIConfig(Config):
     question: str
 
 
-@asset(compute_kind="OpenAI", deps=[search_index])
-def completion(
-    context: AssetExecutionContext,
-    openai: OpenAIResource,
-    config: OpenAIConfig,
-):
+@asset(compute_kind="OpenAI")
+def completion(context: AssetExecutionContext, openai: OpenAIResource, config: OpenAIConfig):
     with open(SEARCH_INDEX_FILE, "rb") as f:
         serialized_search_index = pickle.load(f)
     search_index = FAISS.deserialize_from_bytes(serialized_search_index, OpenAIEmbeddings())
     with openai.get_client(context) as client:
         prompt = stuff_prompt.PROMPT
         model = ChatOpenAI(client=client.chat.completions, model=config.model, temperature=0)
-        summaries = " ".join(
-            [
-                SUMMARY_TEMPLATE.format(content=doc.page_content, source=doc.metadata["source"])
-                for doc in search_index.similarity_search(config.question, k=4)
-            ]
-        )
+        summaries = [
+            SUMMARY_TEMPLATE.format(content=doc.page_content, source=doc.metadata["source"])
+            for doc in search_index.similarity_search(config.question, k=4)
+        ]
         output_parser = StrOutputParser()
         chain = prompt | model | output_parser
         context.log.info(chain.invoke({"summaries": summaries, "question": config.question}))

diff --git a/examples/with_openai/with_openai/utils.py b/examples/with_openai/with_openai/utils.py
@@ -1,8 +1,7 @@
-import io
 import os
 import pathlib
+import subprocess
 import tempfile
-import zipfile
 
 import requests
 from langchain.docstore.document import Document
@@ -19,17 +18,22 @@ def get_wiki_data(title, first_paragraph_only):
     )
 
 
-def get_github_docs(repo_owner, repo_name, category, archive_name="master"):
+def get_github_docs(repo_owner, repo_name, category):
     with tempfile.TemporaryDirectory() as d:
-        # The archive name can be a branch, tag or commit.
-        r = requests.get(f"https://github.com/{repo_owner}/{repo_name}/archive/{archive_name}.zip")
-        z = zipfile.ZipFile(io.BytesIO(r.content))
-        z.extractall(d)
-        root_path = pathlib.Path(os.path.join(d, f"{repo_name}-{archive_name}"))
-        docs_path = root_path.joinpath("docs/content", category)
-        markdown_files = list(docs_path.glob("*.md*")) + list(docs_path.glob("*/*.md*"))
-        for markdown_file in markdown_files:
+        subprocess.check_call(
+            f"git clone --depth 1 https://github.com/{repo_owner}/{repo_name}.git .",
+            cwd=d,
+            shell=True,
+        )
+        git_sha = (
+            subprocess.check_output("git rev-parse HEAD", shell=True, cwd=d).decode("utf-8").strip()
+        )
+        docs_path = pathlib.Path(os.path.join(d, "docs/content", category))
+        markdown_files = list(docs_path.glob("*/*.md")) + list(docs_path.glob("*/*.mdx"))
+        for index, markdown_file in enumerate(markdown_files):
             with open(markdown_file, "r") as f:
-                relative_path = markdown_file.relative_to(root_path)
-                github_url = f"https://github.com/{repo_owner}/{repo_name}/blob/{archive_name}/{relative_path}"
+                relative_path = markdown_file.relative_to(docs_path)
+                github_url = (
+                    f"https://github.com/{repo_owner}/{repo_name}/blob/{git_sha}/{relative_path}"
+                )
                 yield Document(page_content=f.read(), metadata={"source": github_url})