From d89cb174754d53b520537e6adced973c02a30e23 Mon Sep 17 00:00:00 2001 From: kouloumos Date: Thu, 2 Jan 2025 12:52:42 +0200 Subject: [PATCH] feat(github): add `checkout_commit` option for testing specific repo states Add ability to test GitHub scrapers with specific historical commit states via new `checkout_commit` config option. Includes proper Git state handling and reset logic to prevent errors on subsequent runs. Updates documentation with clear examples of testing specific repository states. --- scraper/README.md | 52 +++++++++++++++++++++++++------------- scraper/models/source.py | 3 +++ scraper/scrapers/github.py | 25 +++++++++++++++++- 3 files changed, 61 insertions(+), 19 deletions(-) diff --git a/scraper/README.md b/scraper/README.md index 835cf46..bde76cf 100644 --- a/scraper/README.md +++ b/scraper/README.md @@ -267,27 +267,43 @@ This runs the scraper and outputs the extracted content to a JSON file. ### Test Resources -To test with specific content, add `test_resources` to your source configuration: -To use this feature: +To test with specific content: -```yaml -github: - - name: ExampleRepo - # ... source configuration ... - test_resources: - - path/to/test/file.md - -web: - - name: ExampleSite - # ... source configuration ... - test_resources: - - https://example.com/example-post - - https://example.com/example-post2 -``` +1. **Test Resources** + + Add `test_resources` to your source configuration: + + ```yaml + github: + - name: ExampleRepo + # ... source configuration ... + test_resources: + - path/to/test/file.md + + web: + - name: ExampleSite + # ... source configuration ... + test_resources: + - https://example.com/example-post + - https://example.com/example-post2 + ``` + + The scraper will only process the specified test resources instead of scraping the entire source. + +2. 
**Test with Specific Commit State** + + For GitHub sources, specify a `checkout_commit` to test with the repository state at a particular point in time: + + ```yaml + github: + - name: ExampleRepo + # ... source configuration ... + checkout_commit: abc123def456 # Test with repo state at this specific commit + ``` -The scraper will only process the specified test resources instead of scraping the entire source. + This allows testing the scraper against a specific historical state of the repository. -This is useful for debugging, testing new processors, or verifying behavior with specific content. Remove or comment out `test_resources` to scrape the entire source. +This is useful for debugging, testing new processors, or verifying behavior with specific content. Remove or comment out these test options to scrape the entire source. ### Testing with Elasticsearch diff --git a/scraper/models/source.py b/scraper/models/source.py index 9191ed2..c3fdaeb 100644 --- a/scraper/models/source.py +++ b/scraper/models/source.py @@ -23,6 +23,9 @@ class SourceConfig(BaseModel): test_resources: Optional[List[str]] = [] processors: List[str] = [] analyzer_config: Optional[AnalyzerConfig] = None + checkout_commit: Optional[ + str + ] = None # Specific commit hash to checkout for testing __all__ = ["SourceConfig", "AnalyzerConfig"] diff --git a/scraper/scrapers/github.py b/scraper/scrapers/github.py index 22b2883..6bf1911 100644 --- a/scraper/scrapers/github.py +++ b/scraper/scrapers/github.py @@ -44,6 +44,16 @@ async def scrape(self): last_commit_hash = last_run.last_commit_hash if last_run else None repo = self.clone_or_pull_repo() + # If checkout_commit is specified, use that specific commit state + if self.config.checkout_commit: + try: + repo.git.checkout(self.config.checkout_commit) + except Exception as e: + logger.error( + f"Failed to checkout commit {self.config.checkout_commit}: {e}" + ) + raise + self.current_commit_hash = repo.head.commit.hexsha # Handle test mode vs full mode 
@@ -51,7 +61,9 @@ async def scrape(self): logger.info(f"Running in test mode with resources: {self.test_resources}") files_to_process = self.test_resources else: - logger.info("Running in full mode") + logger.info( + f"Running in full mode: {last_commit_hash} -> {self.current_commit_hash}" + ) files_to_process = self.get_changed_files(repo, last_commit_hash) # Process files @@ -81,6 +93,17 @@ def clone_or_pull_repo(self) -> Repo: if os.path.exists(self.repo_path): logger.info(f"Updating existing repo at path: {self.repo_path}") repo = Repo(self.repo_path) + + # Reset any changes and check out the default branch before pulling + repo.git.reset("--hard") + + # Get default branch name (usually main or master) + default_branch = repo.git.symbolic_ref("refs/remotes/origin/HEAD").split( + "/" + )[-1] + repo.git.checkout(default_branch) + + # Now pull the latest changes origin = repo.remotes.origin origin.pull() else: