Commit: use realtime instead of batch

wjayesh committed Jan 9, 2025
1 parent 467c640 commit dcd4447
Showing 5 changed files with 80 additions and 220 deletions.
60 changes: 0 additions & 60 deletions .github/workflows/docs_summarization_check.yml

This file was deleted.

31 changes: 11 additions & 20 deletions .github/workflows/docs_summarization_submit.yml
@@ -1,19 +1,16 @@
-name: Submit Docs Summarization
+name: Summarize and Upload Docs
 
 on:
-  workflow_run:
-    workflows: ["release-prepare"]
-    types:
-      - completed
+  push:
+    branches:
+      - release/**
 
 jobs:
-  submit-batch:
+  summarize-and-upload:
     runs-on: ubuntu-latest
-    if: ${{ github.event.workflow_run.conclusion == 'success' }}
     permissions:
       contents: read
       id-token: write
-      actions: write
 
     steps:
       - uses: actions/checkout@v3
@@ -26,7 +23,8 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install openai pathlib repomix
+          pip install openai pathlib huggingface_hub
+          npm install -g repomix
 
       - name: Generate repomix outputs
         run: |
@@ -54,23 +52,16 @@ jobs:
           rm user-guide.txt getting-started.txt
 
       - name: Upload repomix outputs
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: repomix-outputs
           path: repomix-outputs
           retention-days: 5
 
-      - name: Submit batch job
+      - name: Summarize and upload to HuggingFace
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-        id: submit
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
           python scripts/summarize_docs.py
-          echo "batch_id=$(cat batch_id.txt)" >> $GITHUB_OUTPUT
-
-      - name: Upload batch ID
-        uses: actions/upload-artifact@v3
-        with:
-          name: batch-id-${{ steps.submit.outputs.batch_id }}
-          path: batch_id.txt
-          retention-days: 5
+          python scripts/upload_to_huggingface.py
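
With the batch submit/check pair gone, the whole pipeline reduces to two synchronous script invocations. As a rough local dry run (an illustration, not part of this commit; it assumes real OPENAI_API_KEY and HF_TOKEN values are exported and the repository root is the working directory), something like this exercises the same path:

    # Hypothetical local dry run of the summarize-and-upload step.
    # Assumes OPENAI_API_KEY and HF_TOKEN are already set in the environment
    # and that both scripts exist under scripts/, as in this commit.
    import subprocess

    for script in ["scripts/summarize_docs.py", "scripts/upload_to_huggingface.py"]:
        # check=True mirrors CI behavior: a non-zero exit aborts the run.
        subprocess.run(["python", script], check=True)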
62 changes: 0 additions & 62 deletions scripts/check_batch_output.py

This file was deleted.

122 changes: 44 additions & 78 deletions scripts/summarize_docs.py
@@ -4,7 +4,6 @@
 from openai import OpenAI
 from pathlib import Path
 from typing import List, Dict
-import time
 
 # Initialize OpenAI client
 client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
@@ -24,95 +23,62 @@ def extract_content_blocks(md_content: str) -> str:
 
     return processed_content
 
-def prepare_batch_requests(md_files: List[Path]) -> List[Dict]:
-    """Prepares batch requests for each markdown file."""
-    batch_requests = []
-
-    for i, file_path in enumerate(md_files):
-        try:
-            with open(file_path, 'r', encoding='utf-8') as f:
-                content = f.read()
-
-            processed_content = extract_content_blocks(content)
-
-            file_path_str_with_no_slashes = str(file_path).replace("/", "_")
-
-            # Prepare the request for this file
-            request = {
-                "custom_id": f"file-{i}-{file_path_str_with_no_slashes}",
-                "method": "POST",
-                "url": "/v1/chat/completions",
-                "body": {
-                    "model": "gpt-4o-mini",
-                    "messages": [
-                        {
-                            "role": "system",
-                            "content": "You are a technical documentation summarizer."
-                        },
-                        {
-                            "role": "user",
-                            "content": f"""Please summarize the following documentation text for another LLM to be able to answer questions about it with enough detail.
-                            Keep all important technical information and key points while removing redundancy and verbose explanations.
-                            Make it concise but ensure NO critical information is lost and some details that you think are important are kept.
-                            Make the code shorter where possible keeping only the most important parts while preserving syntax and accuracy:
-                            {processed_content}"""
-                        }
-                    ],
-                    "temperature": 0.3,
-                    "max_tokens": 2000
-                }
-            }
-            batch_requests.append(request)
-
-        except Exception as e:
-            print(f"Error processing {file_path}: {e}")
-
-    return batch_requests
-
-def submit_batch_job(batch_requests: List[Dict]) -> str:
-    """Submits batch job to OpenAI and returns batch ID."""
-    # Create batch input file
-    batch_file_path = "batch_input.jsonl"
-    with open(batch_file_path, "w") as f:
-        for request in batch_requests:
-            f.write(json.dumps(request) + "\n")
-
-    # Upload the file
-    with open(batch_file_path, "rb") as f:
-        batch_input_file = client.files.create(
-            file=f,
-            purpose="batch"
-        )
-
-    # Create the batch
-    batch = client.batches.create(
-        input_file_id=batch_input_file.id,
-        endpoint="/v1/chat/completions",
-        completion_window="24h",
-        metadata={
-            "description": "ZenML docs summarization"
-        }
-    )
-
-    # Store batch ID for later use
-    with open("batch_id.txt", "w") as f:
-        f.write(batch.id)
-
-    print(f"Batch job submitted with ID: {batch.id}")
-    return batch.id
+def summarize_content(content: str, file_path: str) -> str:
+    """Summarizes content using OpenAI API."""
+    try:
+        response = client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a technical documentation summarizer."
+                },
+                {
+                    "role": "user",
+                    "content": f"""Please summarize the following documentation text for another LLM to be able to answer questions about it with enough detail.
+                    Keep all important technical information and key points while removing redundancy and verbose explanations.
+                    Make it concise but ensure NO critical information is lost and some details that you think are important are kept.
+                    Make the code shorter where possible keeping only the most important parts while preserving syntax and accuracy:
+                    {content}"""
+                }
+            ],
+            temperature=0.3,
+            max_tokens=2000
+        )
+        return response.choices[0].message.content
+    except Exception as e:
+        print(f"Error summarizing {file_path}: {e}")
+        return ""
 
 def main():
     docs_dir = "docs/book"
+    output_file = "summarized_docs.txt"
 
     # Get markdown files
     exclude_files = ["toc.md"]
     md_files = list(Path(docs_dir).rglob("*.md"))
     md_files = [file for file in md_files if file.name not in exclude_files]
 
-    # Prepare and submit batch job
-    batch_requests = prepare_batch_requests(md_files)
-    batch_id = submit_batch_job(batch_requests)
+    # Process each file and write summaries
+    with open(output_file, "w", encoding="utf-8") as out_f:
+        for file_path in md_files:
+            try:
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    content = f.read()
+
+                processed_content = extract_content_blocks(content)
+                summary = summarize_content(processed_content, str(file_path))
+
+                if summary:
+                    out_f.write(f"=== File: {file_path} ===\n\n")
+                    out_f.write(summary)
+                    out_f.write("\n\n" + "="*50 + "\n\n")
+
+                print(f"Processed: {file_path}")
+
+            except Exception as e:
+                print(f"Error processing {file_path}: {e}")
 
 if __name__ == "__main__":
     main()
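
Moving from the Batch API to realtime calls also gives up the 24h completion window's built-in queuing, so transient failures now surface per request. A thin retry wrapper (a sketch only, not in this commit; it reuses summarize_content from the script above, which returns "" on error) could absorb rate-limit hiccups:

    import time

    def summarize_with_retry(content: str, file_path: str, retries: int = 3) -> str:
        # Hypothetical helper, not part of the commit: retry the realtime
        # call with exponential backoff (1s, 2s, 4s) before giving up.
        for attempt in range(retries):
            summary = summarize_content(content, file_path)
            if summary:
                return summary
            time.sleep(2 ** attempt)
        return ""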
25 changes: 25 additions & 0 deletions scripts/upload_to_huggingface.py
@@ -0,0 +1,25 @@
+from huggingface_hub import HfApi
+import os
+
+def upload_to_huggingface():
+    api = HfApi(token=os.environ["HF_TOKEN"])
+
+    # Upload OpenAI summary
+    api.upload_file(
+        path_or_fileobj="summarized_docs.txt",
+        path_in_repo="how-to-guides.txt",
+        repo_id="zenml/llms.txt",
+        repo_type="dataset"
+    )
+
+    # Upload repomix outputs
+    for filename in ["component-guide.txt", "basics.txt", "llms-full.txt"]:
+        api.upload_file(
+            path_or_fileobj=f"repomix-outputs/{filename}",
+            path_in_repo=filename,
+            repo_id="zenml/llms.txt",
+            repo_type="dataset"
+        )
+
+if __name__ == "__main__":
+    upload_to_huggingface()
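
A quick way to verify the upload landed (an illustrative check, not part of the commit) is to pull one file back from the dataset repo with huggingface_hub's hf_hub_download:

    from huggingface_hub import hf_hub_download

    # Hypothetical sanity check: download one uploaded file from the
    # dataset repo and preview it. Requires read access to zenml/llms.txt.
    path = hf_hub_download(
        repo_id="zenml/llms.txt",
        filename="how-to-guides.txt",
        repo_type="dataset",
    )
    with open(path, encoding="utf-8") as f:
        print(f.read()[:500])  # preview the first 500 characters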
