From d89cb174754d53b520537e6adced973c02a30e23 Mon Sep 17 00:00:00 2001
From: kouloumos <kouloumosa@gmail.com>
Date: Thu, 2 Jan 2025 12:52:42 +0200
Subject: [PATCH] feat(github): add `checkout_commit` option for testing
 specific repo states

Add the ability to test GitHub scrapers against a specific historical commit
via a new `checkout_commit` config option. Before pulling, an existing clone
is now hard-reset and switched back to its default branch, so that a previous
commit checkout does not cause errors on subsequent runs. Update the
documentation with clear examples of testing specific repository states.
---
 scraper/README.md          | 52 +++++++++++++++++++++++++-------------
 scraper/models/source.py   |  3 +++
 scraper/scrapers/github.py | 25 +++++++++++++++++-
 3 files changed, 61 insertions(+), 19 deletions(-)

diff --git a/scraper/README.md b/scraper/README.md
index 835cf46..bde76cf 100644
--- a/scraper/README.md
+++ b/scraper/README.md
@@ -267,27 +267,43 @@ This runs the scraper and outputs the extracted content to a JSON file.
 
 ### Test Resources
 
-To test with specific content, add `test_resources` to your source configuration:
-To use this feature:
+To test with specific content:
 
-```yaml
-github:
-  - name: ExampleRepo
-    # ... source configuration ...
-    test_resources:
-      - path/to/test/file.md
-
-web:
-  - name: ExampleSite
-    # ... source configuration ...
-    test_resources:
-      - https://example.com/example-post
-      - https://example.com/example-post2
-```
+1. **Test Resources**
+
+   Add `test_resources` to your source configuration:
+
+   ```yaml
+   github:
+     - name: ExampleRepo
+       # ... source configuration ...
+       test_resources:
+         - path/to/test/file.md
+
+   web:
+     - name: ExampleSite
+       # ... source configuration ...
+       test_resources:
+         - https://example.com/example-post
+         - https://example.com/example-post2
+   ```
+
+   The scraper will only process the specified test resources instead of scraping the entire source.
+
+2. **Test with Specific Commit State**
+
+   For GitHub sources, set `checkout_commit` to a commit hash to run the scraper against the repository as it was at that commit:
+
+   ```yaml
+   github:
+     - name: ExampleRepo
+       # ... source configuration ...
+       checkout_commit: abc123def456 # Test with repo state at this specific commit
+   ```
 
-The scraper will only process the specified test resources instead of scraping the entire source.
+   This allows testing the scraper against a specific historical state of the repository.
 
-This is useful for debugging, testing new processors, or verifying behavior with specific content. Remove or comment out `test_resources` to scrape the entire source.
+This is useful for debugging, testing new processors, or verifying behavior with specific content. Remove or comment out these test options to scrape the entire source at its latest state.
 
 ### Testing with Elasticsearch
 
diff --git a/scraper/models/source.py b/scraper/models/source.py
index 9191ed2..c3fdaeb 100644
--- a/scraper/models/source.py
+++ b/scraper/models/source.py
@@ -23,6 +23,9 @@ class SourceConfig(BaseModel):
     test_resources: Optional[List[str]] = []
     processors: List[str] = []
     analyzer_config: Optional[AnalyzerConfig] = None
+    checkout_commit: Optional[
+        str
+    ] = None  # Specific commit hash to checkout for testing
 
 
 __all__ = ["SourceConfig", "AnalyzerConfig"]
diff --git a/scraper/scrapers/github.py b/scraper/scrapers/github.py
index 22b2883..6bf1911 100644
--- a/scraper/scrapers/github.py
+++ b/scraper/scrapers/github.py
@@ -44,6 +44,16 @@ async def scrape(self):
         last_commit_hash = last_run.last_commit_hash if last_run else None
 
         repo = self.clone_or_pull_repo()
+        # If checkout_commit is specified, use that specific commit state
+        if self.config.checkout_commit:
+            try:
+                repo.git.checkout(self.config.checkout_commit)
+            except Exception as e:
+                logger.error(
+                    f"Failed to checkout commit {self.config.checkout_commit}: {e}"
+                )
+                raise
+
         self.current_commit_hash = repo.head.commit.hexsha
 
         # Handle test mode vs full mode
@@ -51,7 +61,9 @@ async def scrape(self):
             logger.info(f"Running in test mode with resources: {self.test_resources}")
             files_to_process = self.test_resources
         else:
-            logger.info("Running in full mode")
+            logger.info(
+                f"Running in full mode: {last_commit_hash} -> {self.current_commit_hash}"
+            )
             files_to_process = self.get_changed_files(repo, last_commit_hash)
 
         # Process files
@@ -81,6 +93,17 @@ def clone_or_pull_repo(self) -> Repo:
         if os.path.exists(self.repo_path):
             logger.info(f"Updating existing repo at path: {self.repo_path}")
             repo = Repo(self.repo_path)
+
+            # Discard local changes and switch back to the default branch before pulling
+            repo.git.reset("--hard")
+
+            # Get default branch name (usually main or master)
+            default_branch = repo.git.symbolic_ref("refs/remotes/origin/HEAD").split(
+                "/"
+            )[-1]
+            repo.git.checkout(default_branch)
+
+            # Now pull the latest changes
             origin = repo.remotes.origin
             origin.pull()
         else: