Commit: use realtime instead of batch

wjayesh committed Jan 9, 2025
1 parent 467c640 commit dcd4447
Showing 5 changed files with 80 additions and 220 deletions.
60 changes: 0 additions & 60 deletions .github/workflows/docs_summarization_check.yml

This file was deleted.

31 changes: 11 additions & 20 deletions .github/workflows/docs_summarization_submit.yml
@@ -1,19 +1,16 @@
-name: Submit Docs Summarization
+name: Summarize and Upload Docs
 
 on:
-  workflow_run:
-    workflows: ["release-prepare"]
-    types:
-      - completed
+  push:
+    branches:
+      - release/**
 
 jobs:
-  submit-batch:
+  summarize-and-upload:
     runs-on: ubuntu-latest
-    if: ${{ github.event.workflow_run.conclusion == 'success' }}
     permissions:
       contents: read
       id-token: write
-      actions: write
 
     steps:
       - uses: actions/checkout@v3
@@ -26,7 +23,8 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install openai pathlib repomix
+          pip install openai pathlib huggingface_hub
+          npm install -g repomix
 
       - name: Generate repomix outputs
         run: |
@@ -54,23 +52,16 @@ jobs:
           rm user-guide.txt getting-started.txt
 
       - name: Upload repomix outputs
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: repomix-outputs
           path: repomix-outputs
           retention-days: 5
 
-      - name: Submit batch job
+      - name: Summarize and upload to HuggingFace
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-        id: submit
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
           python scripts/summarize_docs.py
-          echo "batch_id=$(cat batch_id.txt)" >> $GITHUB_OUTPUT
-
-      - name: Upload batch ID
-        uses: actions/upload-artifact@v3
-        with:
-          name: batch-id-${{ steps.submit.outputs.batch_id }}
-          path: batch_id.txt
-          retention-days: 5
+          python scripts/upload_to_huggingface.py
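
With the batch submit/check pair gone, the whole pipeline reduces to two synchronous script invocations. As a rough local dry run (an illustration, not part of this commit; it assumes real OPENAI_API_KEY and HF_TOKEN values are exported and the repository root is the working directory), something like this exercises the same path:

    # Hypothetical local dry run of the summarize-and-upload step.
    # Assumes OPENAI_API_KEY and HF_TOKEN are already set in the environment
    # and that both scripts exist under scripts/, as in this commit.
    import subprocess

    for script in ["scripts/summarize_docs.py", "scripts/upload_to_huggingface.py"]:
        # check=True mirrors CI behavior: a non-zero exit aborts the run.
        subprocess.run(["python", script], check=True)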
62 changes: 0 additions & 62 deletions scripts/check_batch_output.py

This file was deleted.

122 changes: 44 additions & 78 deletions scripts/summarize_docs.py
@@ -4,7 +4,6 @@
 from openai import OpenAI
 from pathlib import Path
 from typing import List, Dict
-import time
 
 # Initialize OpenAI client
 client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
@@ -24,95 +23,62 @@ def extract_content_blocks(md_content: str) -> str:
 
     return processed_content
 
-def prepare_batch_requests(md_files: List[Path]) -> List[Dict]:
-    """Prepares batch requests for each markdown file."""
-    batch_requests = []
-
-    for i, file_path in enumerate(md_files):
-        try:
-            with open(file_path, 'r', encoding='utf-8') as f:
-                content = f.read()
-
-            processed_content = extract_content_blocks(content)
-
-            file_path_str_with_no_slashes = str(file_path).replace("/", "_")
-
-            # Prepare the request for this file
-            request = {
-                "custom_id": f"file-{i}-{file_path_str_with_no_slashes}",
-                "method": "POST",
-                "url": "/v1/chat/completions",
-                "body": {
-                    "model": "gpt-4o-mini",
-                    "messages": [
-                        {
-                            "role": "system",
-                            "content": "You are a technical documentation summarizer."
-                        },
-                        {
-                            "role": "user",
-                            "content": f"""Please summarize the following documentation text for another LLM to be able to answer questions about it with enough detail.
-                            Keep all important technical information and key points while removing redundancy and verbose explanations.
-                            Make it concise but ensure NO critical information is lost and some details that you think are important are kept.
-                            Make the code shorter where possible keeping only the most important parts while preserving syntax and accuracy:
-                            {processed_content}"""
-                        }
-                    ],
-                    "temperature": 0.3,
-                    "max_tokens": 2000
-                }
-            }
-            batch_requests.append(request)
-
-        except Exception as e:
-            print(f"Error processing {file_path}: {e}")
-
-    return batch_requests
-
-def submit_batch_job(batch_requests: List[Dict]) -> str:
-    """Submits batch job to OpenAI and returns batch ID."""
-    # Create batch input file
-    batch_file_path = "batch_input.jsonl"
-    with open(batch_file_path, "w") as f:
-        for request in batch_requests:
-            f.write(json.dumps(request) + "\n")
-
-    # Upload the file
-    with open(batch_file_path, "rb") as f:
-        batch_input_file = client.files.create(
-            file=f,
-            purpose="batch"
-        )
-
-    # Create the batch
-    batch = client.batches.create(
-        input_file_id=batch_input_file.id,
-        endpoint="/v1/chat/completions",
-        completion_window="24h",
-        metadata={
-            "description": "ZenML docs summarization"
-        }
-    )
-
-    # Store batch ID for later use
-    with open("batch_id.txt", "w") as f:
-        f.write(batch.id)
-
-    print(f"Batch job submitted with ID: {batch.id}")
-    return batch.id
+def summarize_content(content: str, file_path: str) -> str:
+    """Summarizes content using OpenAI API."""
+    try:
+        response = client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a technical documentation summarizer."
+                },
+                {
+                    "role": "user",
+                    "content": f"""Please summarize the following documentation text for another LLM to be able to answer questions about it with enough detail.
+                    Keep all important technical information and key points while removing redundancy and verbose explanations.
+                    Make it concise but ensure NO critical information is lost and some details that you think are important are kept.
+                    Make the code shorter where possible keeping only the most important parts while preserving syntax and accuracy:
+                    {content}"""
+                }
+            ],
+            temperature=0.3,
+            max_tokens=2000
+        )
+        return response.choices[0].message.content
+    except Exception as e:
+        print(f"Error summarizing {file_path}: {e}")
+        return ""
 
 def main():
     docs_dir = "docs/book"
+    output_file = "summarized_docs.txt"
 
     # Get markdown files
     exclude_files = ["toc.md"]
     md_files = list(Path(docs_dir).rglob("*.md"))
     md_files = [file for file in md_files if file.name not in exclude_files]
 
-    # Prepare and submit batch job
-    batch_requests = prepare_batch_requests(md_files)
-    batch_id = submit_batch_job(batch_requests)
+    # Process each file and write summaries
+    with open(output_file, "w", encoding="utf-8") as out_f:
+        for file_path in md_files:
+            try:
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    content = f.read()
+
+                processed_content = extract_content_blocks(content)
+                summary = summarize_content(processed_content, str(file_path))
+
+                if summary:
+                    out_f.write(f"=== File: {file_path} ===\n\n")
+                    out_f.write(summary)
+                    out_f.write("\n\n" + "="*50 + "\n\n")
+
+                print(f"Processed: {file_path}")
+
+            except Exception as e:
+                print(f"Error processing {file_path}: {e}")
 
 if __name__ == "__main__":
     main()
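
Moving from the Batch API to realtime calls also gives up the 24h completion window's built-in queuing, so transient failures now surface per request. A thin retry wrapper (a sketch only, not in this commit; it reuses summarize_content from the script above, which returns "" on error) could absorb rate-limit hiccups:

    import time

    def summarize_with_retry(content: str, file_path: str, retries: int = 3) -> str:
        # Hypothetical helper, not part of the commit: retry the realtime
        # call with exponential backoff (1s, 2s, 4s) before giving up.
        for attempt in range(retries):
            summary = summarize_content(content, file_path)
            if summary:
                return summary
            time.sleep(2 ** attempt)
        return ""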
25 changes: 25 additions & 0 deletions scripts/upload_to_huggingface.py
@@ -0,0 +1,25 @@
+from huggingface_hub import HfApi
+import os
+
+def upload_to_huggingface():
+    api = HfApi(token=os.environ["HF_TOKEN"])
+
+    # Upload OpenAI summary
+    api.upload_file(
+        path_or_fileobj="summarized_docs.txt",
+        path_in_repo="how-to-guides.txt",
+        repo_id="zenml/llms.txt",
+        repo_type="dataset"
+    )
+
+    # Upload repomix outputs
+    for filename in ["component-guide.txt", "basics.txt", "llms-full.txt"]:
+        api.upload_file(
+            path_or_fileobj=f"repomix-outputs/{filename}",
+            path_in_repo=filename,
+            repo_id="zenml/llms.txt",
+            repo_type="dataset"
+        )
+
+if __name__ == "__main__":
+    upload_to_huggingface()
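
A quick way to verify the upload landed (an illustrative check, not part of the commit) is to pull one file back from the dataset repo with huggingface_hub's hf_hub_download:

    from huggingface_hub import hf_hub_download

    # Hypothetical sanity check: download one uploaded file from the
    # dataset repo and preview it. Requires read access to zenml/llms.txt.
    path = hf_hub_download(
        repo_id="zenml/llms.txt",
        filename="how-to-guides.txt",
        repo_type="dataset",
    )
    with open(path, encoding="utf-8") as f:
        print(f.read()[:500])  # preview the first 500 characters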
