feat: add github actions workflows for scraperv2
Adds a reusable workflow for scraping sources, plus:
- a caller workflow for BitcoinTranscripts
- minor fixes identified while testing the workflow
- removal of the old bitcointranscripts workflow
kouloumos committed Jan 9, 2025
1 parent 949c3c1 commit 2112dad
Showing 9 changed files with 123 additions and 54 deletions.
32 changes: 7 additions & 25 deletions .github/workflows/bitcointranscripts.yml
@@ -1,32 +1,14 @@
name: Bitcoin Transcripts
name: BitcoinTranscripts

on:
schedule:
- cron: '0 13 * * 3' # every Wednesday at 1pm UTC
workflow_dispatch:
repository_dispatch:

jobs:
fetch:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: 3.9
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Fetch data
run: |
mkdir /tmp/data
python bitcointranscripts/main.py
env:
ES_ENGINE: ${{ secrets.ES_ENGINE }}
ES_URL: ${{ secrets.ES_URL }}
ES_TOKEN: ${{ secrets.ES_TOKEN }}
DATA_DIR: /tmp/data
CLOUD_ID: ${{ secrets.CLOUD_ID }}
USER_PASSWORD: ${{ secrets.USER_PASSWORD }}
USERNAME: ${{ secrets.USERNAME }}
INDEX: ${{ secrets.INDEX }}
bitcointranscripts:
uses: ./.github/workflows/scrape-source.yml
with:
source: bitcointranscripts
secrets: inherit
39 changes: 39 additions & 0 deletions .github/workflows/scrape-source.yml
@@ -0,0 +1,39 @@
name: Scrape Source

on:
workflow_call:
inputs:
source:
required: true
type: string

jobs:
scrape:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.11"

- name: Get latest release
id: latest_release
uses: pozetroninc/[email protected]
with:
repository: bitcoinsearch/scraper
token: ${{ secrets.GITHUB_TOKEN }}

- name: Install release
run: |
# Get the wheel filename from the release assets
WHEEL_URL=$(curl -s https://api.github.com/repos/bitcoinsearch/scraper/releases/latest | jq -r '.assets[] | select(.name | endswith(".whl")) | .browser_download_url')
pip install $WHEEL_URL
- name: Run scraper
run: scraper scrape --source ${{ inputs.source }}
env:
CLOUD_ID: ${{ secrets.CLOUD_ID }}
API_KEY: ${{ secrets.API_KEY }}
INDEX: ${{ secrets.INDEX }}
2 changes: 1 addition & 1 deletion README.md
@@ -15,7 +15,7 @@ Weekly
- Bitcoin Talk Forum ([cron](.github/workflows/bitcointalk.yml), [source](bitcointalk))
- only the [Development & Technical Discussion Board](https://bitcointalk.org/index.php?board=6.0)
- only for specific authors
- [Bitcoin Transcript](https://btctranscripts.com/) ([cron](.github/workflows/bitcointranscripts.yml), [source](bitcointranscripts))
- [Bitcoin Transcript](https://btctranscripts.com/) ([cron](.github/workflows/bitcointranscripts.yml), [source](scraper/scrapers/bitcointranscripts.py))
- [Bitcoin Optech](https://bitcoinops.org/) ([cron](.github/workflows/bitcoinops.yml), [source](bitcoinops))

Additionally, for on-demand scraping tasks, we utilize a Scrapybot, details of which can be found in the [Scrapybot section](#scrapybot) below.
46 changes: 41 additions & 5 deletions scraper/commands/elastic.py
@@ -100,14 +100,50 @@ async def cleanup():
click.echo(f"Index {index_name} does not exist")
return

# Define query based on the cleanup type
if test_docs_only:
await output.cleanup_test_documents(index_name)
click.echo(f"Cleaned up test documents from index {index_name}")
query = {"query": {"term": {"test_document": True}}}
operation_desc = "test documents"
else:
output.es.delete_by_query(
index=index_name, body={"query": {"match_all": {}}}
query = {"query": {"match_all": {}}}
operation_desc = "documents"

try:
# First count how many documents will be affected
count_result = output.es.count(index=index_name, body=query)
doc_count = count_result["count"]

# Ask for confirmation
if not click.confirm(
f"\nWarning: {doc_count} {operation_desc} will be deleted from index '{index_name}'. Do you want to continue?"
):
click.echo("Operation cancelled")
return

# Proceed with deletion
delete_result = output.es.delete_by_query(
index=index_name, body=query
)
click.echo(f"Removed all documents from index {index_name}")

# Print detailed deletion results
click.echo("\nDeletion Results:")
click.echo(
f"Total {operation_desc} deleted: {delete_result['deleted']}"
)
click.echo(f"Total batches: {delete_result['batches']}")
click.echo(f"Documents that failed: {delete_result['failures']}")
click.echo(f"Time taken: {delete_result['took']}ms")

if delete_result.get("failures"):
click.echo("\nFailures encountered:")
for failure in delete_result["failures"]:
click.echo(f"Document ID: {failure['_id']}")
click.echo(f"Error: {failure.get('error')}")
click.echo("---")

except Exception as e:
click.echo(f"Error during cleanup: {e}", err=True)
raise click.ClickException(str(e))

return run_in_reactor(cleanup())

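For reference, a minimal standalone sketch of the same count-confirm-delete flow against Elasticsearch. The endpoint, index name, and confirmation prompt here are placeholders for illustration, not values from this repository.

```python
# Hedged sketch only: assumes a reachable local cluster and a throwaway index.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")   # placeholder endpoint
index_name = "scraper-test"                   # placeholder index
query = {"query": {"term": {"test_document": True}}}

# Count first, then ask before deleting, mirroring the cleanup command above.
doc_count = es.count(index=index_name, body=query)["count"]
if doc_count and input(f"Delete {doc_count} test documents from '{index_name}'? [y/N] ") == "y":
    result = es.delete_by_query(index=index_name, body=query)
    # delete_by_query reports totals, batching, failures and timing, as surfaced above.
    print(result["deleted"], result["batches"], result["took"], result["failures"])
```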
15 changes: 1 addition & 14 deletions scraper/outputs/elasticsearch_output.py
Expand Up @@ -100,7 +100,7 @@ async def _query_runs(

query = {
"query": {"bool": {"must": must_clauses}},
"sort": [{"timestamp": {"order": "desc"}}],
"sort": [{"finished_at": {"order": "desc"}}],
"size": size,
}

@@ -128,19 +128,6 @@ async def get_recent_runs(
"""Get the most recent runs for a source."""
return await self._query_runs(source=source, size=limit)

async def cleanup_test_documents(self, index_name: str):
"""Remove all test documents from the specified index."""
query = {"query": {"term": {"test_document": True}}}
try:
result = self.es.delete_by_query(index=index_name, body=query)
logger.info(
f"Cleaned up {result['deleted']} test documents from index {index_name}"
)
except Exception as e:
logger.error(f"Error cleaning up test documents: {e}")
logger.exception("Full traceback:")
raise

async def create_index_with_mapping(self, index_name: str, mapping: dict):
"""
Create an index with a specific mapping.
24 changes: 22 additions & 2 deletions scraper/processors/topic_extractor_processor.py
@@ -1,6 +1,9 @@
import json
from pathlib import Path
from typing import List
from loguru import logger

from scraper.config import get_project_root
from scraper.models import ScrapedDocument
from .base_processor import BaseProcessor
from scraper.registry import processor_registry
@@ -12,8 +15,25 @@ def __init__(self):
self.topics_list = self.load_topics()

def load_topics(self) -> List[str]:
with open("scraper/processors/topics_list.json", "r") as f:
return json.load(f)["topics"]
topics_path = Path(get_project_root()) / "processors" / "topics_list.json"
try:
with open(topics_path, "r") as f:
return json.load(f)["topics"]
except FileNotFoundError:
logger.warning(
f"Topics file not found at {topics_path}. Using empty topics list."
)
return []
except json.JSONDecodeError:
logger.error(
f"Invalid JSON in topics file: {topics_path}. Using empty topics list."
)
return []
except KeyError:
logger.error(
f"Missing 'topics' key in topics file: {topics_path}. Using empty topics list."
)
return []

async def process(self, document: ScrapedDocument) -> ScrapedDocument:
# Placeholder logic - replace with actual topic extraction
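The loader implies a JSON file carrying a single "topics" array. A small hedged sketch (made-up topic names, temporary path) of that layout and the same fallback-to-empty behaviour:

```python
import json
from pathlib import Path
from typing import List

# Write a throwaway topics file in the layout implied by the loader above.
example = {"topics": ["taproot", "lightning", "mempool"]}  # placeholder topic names
path = Path("/tmp/topics_list.json")
path.write_text(json.dumps(example))

def load_topics(topics_path: Path) -> List[str]:
    # Mirrors the processor's defensive loading: any failure yields an empty list.
    try:
        return json.loads(topics_path.read_text())["topics"]
    except (FileNotFoundError, json.JSONDecodeError, KeyError):
        return []

print(load_topics(path))             # ['taproot', 'lightning', 'mempool']
print(load_topics(Path("/no/file"))) # []
```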
10 changes: 8 additions & 2 deletions scraper/scrapers/github.py
@@ -243,10 +243,16 @@ def customize_document(
return document_data

def generate_id(self, file_path: str) -> str:
# Override this method to customize ID generation
"""
Override this method in subclasses to customize ID generation.
"""
# Since file_path is relative (e.g. 'tabconf/2022/file.zh.md'),
# we can safely use directory structure in ID generation
dir_path = os.path.dirname(file_path)
file_name = os.path.basename(file_path)
# Keep language suffix (e.g. .zh) but remove final extension (.md)
name_without_extension = os.path.splitext(file_name)[0]
return f"{self.config.name.lower()}-{slugify(name_without_extension)}"
return f"{self.config.name.lower()}-{slugify(dir_path)}-{slugify(name_without_extension)}"

def get_title(self, metadata: Dict[str, Any], body: str) -> str:
# First, check if there's a title in the metadata
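For illustration, a hedged walk-through of what the updated ID scheme produces for a nested transcript path. The source name, file path, and helper name below are made up; the exact slug depends on the project's slugify helper shown in scraper/utils.py further down.

```python
# Hypothetical composition of the new document ID (assumed inputs; assumes the
# scraper package from this repo is installed, e.g. from the release wheel).
import os
from scraper.utils import slugify

def compose_id(source_name: str, file_path: str) -> str:
    # Same shape as the updated GithubScraper.generate_id:
    # <source>-<slugified directory path>-<slugified file name minus final extension>
    dir_path = os.path.dirname(file_path)
    name_without_extension = os.path.splitext(os.path.basename(file_path))[0]
    return f"{source_name.lower()}-{slugify(dir_path)}-{slugify(name_without_extension)}"

print(compose_id("BitcoinTranscripts", "tabconf/2022/opening-keynote.zh.md"))
# expected: something like "bitcointranscripts-tabconf-2022-opening-keynotezh"
# (the '.zh' marker survives as 'zh' because slugify strips the dot)
```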
4 changes: 0 additions & 4 deletions scraper/sources.yaml
@@ -17,10 +17,6 @@ github:
- name: BitcoinTranscripts
domain: https://btctranscripts.com
url: https://github.com/bitcointranscripts/bitcointranscripts.git
processors:
- summarization
- topic_extractor
- vector_embeddings
- name: PR-Review-Club
domain: https://bitcoincore.reviews/
url: https://github.com/bitcoin-core-review-club/website.git
5 changes: 4 additions & 1 deletion scraper/utils.py
@@ -1,3 +1,4 @@
import os
import re
import unicodedata
from typing import Tuple
@@ -9,7 +10,7 @@ def slugify(value: str) -> str:
"""
Convert a string to a URL-friendly slug.
- Normalize to ASCII
- Replace spaces and underscores with hyphens
- Replace spaces, underscores and directory separators with hyphens
- Remove characters that aren't alphanumerics, underscores, or hyphens
- Convert to lowercase
- Strip leading and trailing hyphens
@@ -18,6 +19,8 @@
value = (
unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode("ascii")
)
# Replace directory separators with hyphens
value = value.replace(os.sep, "-")

# Replace spaces and underscores with hyphens, remove invalid characters
value = re.sub(r"[_\s]+", "-", value) # Replace spaces and underscores
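A minimal, self-contained reconstruction of the slugify behaviour described above, showing how directory separators now fold into the slug. The lines hidden behind the collapsed diff context (invalid-character removal, lowercasing, hyphen stripping) are assumed from the docstring, so treat this as an approximation rather than the committed implementation.

```python
# Approximate reimplementation for illustration only; the collapsed tail of the
# real function is assumed from its docstring.
import os
import re
import unicodedata

def slugify(value: str) -> str:
    value = unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode("ascii")
    value = value.replace(os.sep, "-")          # new: directory separators become hyphens
    value = re.sub(r"[_\s]+", "-", value)       # spaces and underscores become hyphens
    value = re.sub(r"[^\w\-]", "", value)       # drop anything else that's not allowed
    return value.lower().strip("-")

print(slugify("tabconf/2022"))         # tabconf-2022
print(slugify("Some Talk_Title!.md"))  # some-talk-titlemd
```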
