Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scrape reviewer count and publish on website #101

Draft
wants to merge 23 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
4ff5674
feat: Add initial Python script
GeckoEidechse Aug 26, 2024
82a8095
Merge remote-tracking branch 'origin/main' into feat/scrape-reviewer-…
GeckoEidechse Oct 4, 2024
e927b4c
refactor: Rename variable
GeckoEidechse Oct 6, 2024
3007523
feat: Store time of review in dict
GeckoEidechse Oct 6, 2024
281117b
style: Autoformat
GeckoEidechse Oct 6, 2024
8ce3e83
fix: Invert condition for checking review body
GeckoEidechse Oct 10, 2024
1c0d764
feat: Add initial check for reviews with non-informative content
GeckoEidechse Oct 10, 2024
c02be0d
feat: Add functions for summing review counts and sorting by reviewer…
GeckoEidechse Oct 10, 2024
2ac3b56
feat: Addtionally generate monthly and weekly reviewer stats
GeckoEidechse Oct 10, 2024
12ee962
style: Autoformat
GeckoEidechse Oct 10, 2024
31a56b4
feat: Add initial empty TypeScript files for holding review stats
GeckoEidechse Oct 10, 2024
1f54eb2
Merge branch 'main' into feat/scrape-reviewer-count
GeckoEidechse Oct 10, 2024
343074a
feat: Initial rendering of review count
GeckoEidechse Oct 11, 2024
f86b028
temp: commit stats to make it easier to work with current version
GeckoEidechse Oct 11, 2024
1b957f3
Increase minimum review character length to be considered for counting
GeckoEidechse Oct 14, 2024
9224e06
Split LGTM and text length check
GeckoEidechse Oct 14, 2024
b9b9c0b
feat: Render stats as table
GeckoEidechse Oct 14, 2024
17d15ff
Include weekly and total reviews
GeckoEidechse Oct 14, 2024
418c930
Annotate category type
GeckoEidechse Oct 14, 2024
aeeb1eb
Fill out subtext
GeckoEidechse Oct 14, 2024
300e159
Switch headings to H2
GeckoEidechse Oct 14, 2024
cfe1f01
Require at least 2 reviews to show up in table
GeckoEidechse Oct 14, 2024
c8f9fa3
temp: Add updated stats
GeckoEidechse Oct 14, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
224 changes: 224 additions & 0 deletions scripts/scrape-for-reviews.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
import datetime
import sys
from collections import defaultdict
from pathlib import Path

import requests

# Optional GitHub token, taken from the first command-line argument.
# Supplying one avoids the rate limit; running without one is supported too.
github_token = sys.argv[1] if len(sys.argv) > 1 else None

# Replace with the GitHub organization name
github_org = "R2Northstar"

# Base URL for GitHub API
base_url = f"https://api.github.com/orgs/{github_org}"

# Headers for authentication.
# Only send an Authorization header when a token was actually supplied.
# Formatting a missing token would send the literal value "token None",
# i.e. a bogus credential, instead of making an anonymous request.
headers = {"Authorization": f"token {github_token}"} if github_token else {}


def get_repos():
    """Return every repository of the organization as one flat list.

    The listing is requested 100 entries at a time, page after page, until
    the API answers with an empty page. An HTTP error status is raised by
    `raise_for_status()`.
    """
    collected = []
    page = 0

    more_pages = True
    while more_pages:
        page += 1
        reply = requests.get(
            f"{base_url}/repos?per_page=100&page={page}", headers=headers
        )
        reply.raise_for_status()
        batch = reply.json()

        # An empty page marks the end of the listing.
        more_pages = bool(batch)
        if more_pages:
            collected.extend(batch)

    return collected


def get_pull_requests(repo_name):
    """Return all pull requests (any state) of `repo_name` as one flat list.

    Pages of 100 are requested until an empty page comes back. The repository
    name and each page number are printed as progress output. An HTTP error
    status is raised by `raise_for_status()`.
    """
    print(f"{repo_name=}")
    collected = []
    page = 1

    while True:
        print(f"{page=}")
        reply = requests.get(
            f"https://api.github.com/repos/{github_org}/{repo_name}/pulls?state=all&per_page=100&page={page}",
            headers=headers,
        )
        reply.raise_for_status()
        batch = reply.json()

        # Nothing left on this page: the listing is exhausted.
        if not batch:
            return collected

        collected += batch
        page += 1


def get_reviews_for_pr(repo_name, pr_number):
    """Fetch all reviews for a given pull request, handling pagination.

    Args:
        repo_name: Name of the repository inside the organization.
        pr_number: Number of the pull request.

    Returns:
        A list with the review objects of every page, in API order.

    Raises:
        Whatever `raise_for_status()` raises when a request fails.
    """
    per_page = 100
    reviews = []
    page = 1

    while True:
        url = (
            f"https://api.github.com/repos/{github_org}/{repo_name}"
            f"/pulls/{pr_number}/reviews?per_page={per_page}&page={page}"
        )
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        page_reviews = response.json()

        reviews.extend(page_reviews)

        # A page that is not full is the last one. Stopping here (rather than
        # waiting for an empty page) keeps the common case at a single request
        # per pull request, which matters for the rate limit.
        if len(page_reviews) < per_page:
            break

        page += 1

    return reviews


# Fetch all repositories in the organization
repos = get_repos()

# Maps each reviewer's login to the list of submission timestamps of their
# counted reviews. The per-reviewer counts are derived later from the length
# of each list.
review_dict = defaultdict(list)


def is_trivial_review(review_text: str, min_review_length: int = 30) -> bool:
    """Decide whether a review should be discarded for not being extensive enough.

    A review is considered trivial when any of the following holds:

    * it has no text at all (empty string or `None`),
    * it contains "lgtm" anywhere, in any letter case,
    * its text, ignoring surrounding whitespace, is shorter than
      `min_review_length` characters.

    Args:
        review_text: Body of the review. `None` is tolerated and treated
            like an empty body.
        min_review_length: Minimum number of characters a review needs in
            order to be counted.

    Returns:
        True if the review is trivial and should be skipped, False otherwise.
    """
    if not review_text:
        return True

    # Padding with blank lines or spaces must not help a review over the bar.
    text = review_text.strip()

    if "lgtm" in text.lower():
        return True

    return len(text) < min_review_length


# Walk every pull request of every repository and record, per reviewer, the
# submission time of each review that is substantial enough to count.
for repo in repos:
    repo_name = repo["name"]
    prs = get_pull_requests(repo_name)

    for pr in prs:
        pr_number = pr["number"]
        reviews = get_reviews_for_pr(repo_name, pr_number)

        for review in reviews:
            # Skip reviews that cannot be attributed to a login.
            if not review["user"] or not review["user"]["login"]:
                continue

            body = review.get("body")
            if not body:
                # Current object is comment on a review not an actual review, skip.
                # A missing/None body is treated the same way as an empty one.
                continue
            if is_trivial_review(body):
                continue

            submitted_at = review.get("submitted_at")
            if not submitted_at:
                # Without a submission time the review cannot be placed in a
                # timeframe (presumably a not-yet-submitted review), so skip it.
                continue

            user = review["user"]["login"]
            # Rewrite the trailing "Z" as an explicit UTC offset before parsing,
            # which yields timezone-aware datetimes.
            review_dict[user].append(
                datetime.datetime.fromisoformat(submitted_at.replace("Z", "+00:00"))
            )


def filter_by_timeframe(reviews_dict, weeks=1):
    """
    Keep only the reviews submitted within the last `weeks` weeks.

    `reviews_dict` maps a reviewer to a list of timezone-aware review times.
    The result is a new `defaultdict(list)` with the same reviewer order;
    reviewers left without any review in the window are omitted.
    """
    reference_time = datetime.datetime.now(datetime.timezone.utc)
    recent_reviews = defaultdict(list)

    for reviewer, review_times in reviews_dict.items():
        kept = []
        for review_time in review_times:
            if reference_time - review_time < datetime.timedelta(weeks=weeks):
                kept.append(review_time)

        # Reviewers with nothing inside the window do not get an entry.
        if kept:
            recent_reviews[reviewer] = kept

    return recent_reviews


def sum_up_reviews(reviews_dict):
    """Map each reviewer to the number of entries in their review list."""
    totals = {}
    for reviewer, review_times in reviews_dict.items():
        totals[reviewer] = len(review_times)
    return totals


def sort_alphabetically(reviews_dict):
    """Return the dict's (reviewer, value) pairs sorted by reviewer name, ignoring case."""

    def lowercase_name(pair):
        return pair[0].lower()

    pairs = list(reviews_dict.items())
    pairs.sort(key=lowercase_name)
    return pairs


# Generate TypeScript code
def generate_typescript_code(sorted_review_counts, timeframe="total"):
    """Render reviewer counts as the source text of a TypeScript module.

    The module declares the `ReviewCount` interface and exports a constant
    named `review_counts_<timeframe>` holding one object per reviewer.

    `sorted_review_counts` is an iterable of `(user, count)` pairs; entries
    are emitted in the order given. `timeframe` becomes the suffix of the
    exported constant's name. The generated source is returned as a string.
    """
    # NOTE(review): the emitted lines carry no leading indentation, matching
    # the template text as it is visible here -- confirm if pretty-printed
    # output is expected.
    preamble = (
        "// Auto-generated from Python script\n"
        "\n"
        "export interface ReviewCount {\n"
        "url?: string;\n"
        "name: string;\n"
        "count: number;\n"
        "}\n"
        f"export const review_counts_{timeframe}: ReviewCount[] = ["
    )

    # Each entry opens with a newline, so the list needs no separator.
    entries = [
        (
            "\n"
            "{\n"
            f'url: "https://github.com/{user}",\n'
            f'name: "{user}",\n'
            f"count: {count},\n"
            "},"
        )
        for user, count in sorted_review_counts
    ]

    closing = "\n]\n"
    return preamble + "".join(entries) + closing


# Resolve the output directory relative to this script instead of the current
# working directory, so the files land in <repo>/src/data no matter where the
# script is launched from.
data_dir = Path(__file__).resolve().parent.parent / "src" / "data"

# (output file name, timeframe label, reviews to include)
outputs = [
    # Total stats
    ("reviewer-count.ts", "total", review_dict),
    # Monthly stats
    ("reviewer-count-monthly.ts", "monthly", filter_by_timeframe(review_dict, weeks=4)),
    # Weekly stats
    ("reviewer-count-weekly.ts", "weekly", filter_by_timeframe(review_dict, weeks=1)),
]

for file_name, timeframe, reviews in outputs:
    # Build the complete file content before opening the file, so a failure
    # during generation does not leave a truncated file behind.
    typescript_code = generate_typescript_code(
        sort_alphabetically(sum_up_reviews(reviews)), timeframe
    )
    with open(data_dir / file_name, "w", encoding="utf-8") as f:
        f.write(typescript_code)
60 changes: 60 additions & 0 deletions src/components/Reviewers.astro
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
---
// Renders three tables (weekly, monthly, total) listing reviewers and their
// review counts, each ordered from most to fewest reviews.
import { review_counts_total } from "../data/reviewer-count";
import { review_counts_monthly } from "../data/reviewer-count-monthly";
import { review_counts_weekly } from "../data/reviewer-count-weekly";

// Array.prototype.sort() sorts in place. Sort copies so the imported
// module-level arrays are not mutated by rendering this component.
const weekly = [...review_counts_weekly].sort((a, b) => b.count - a.count);
const monthly = [...review_counts_monthly].sort((a, b) => b.count - a.count);
const total = review_counts_total
  .filter((c) => c.count > 1) // Require at least 2 reviews to reduce table size
  .sort((a, b) => b.count - a.count); // filter() already returned a new array
---

<div class="TODO">
  <h2>Weekly review count</h2>
  <table>
    <tr>
      <th>Reviewer</th>
      <th># of reviews</th>
    </tr>
    {
      weekly.map((reviewer) => (
        <tr>
          <td>{reviewer.name}</td>
          <td>{reviewer.count}</td>
        </tr>
      ))
    }
  </table>
  <h2>Monthly review count</h2>
  <table>
    <tr>
      <th>Reviewer</th>
      <th># of reviews</th>
    </tr>
    {
      monthly.map((reviewer) => (
        <tr>
          <td>{reviewer.name}</td>
          <td>{reviewer.count}</td>
        </tr>
      ))
    }
  </table>
  <h2>Total review count</h2>
  <table>
    <tr>
      <th>Reviewer</th>
      <th># of reviews</th>
    </tr>
    {
      total.map((reviewer) => (
        <tr>
          <td>{reviewer.name}</td>
          <td>{reviewer.count}</td>
        </tr>
      ))
    }
  </table>
</div>
44 changes: 44 additions & 0 deletions src/data/reviewer-count-monthly.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Auto-generated from Python script
// (scripts/scrape-for-reviews.py); manual edits are overwritten when the
// script is run again.

// One row of the reviewer statistics.
export interface ReviewCount {
// Link to the reviewer's GitHub profile (optional).
url?: string;
// GitHub login of the reviewer.
name: string;
// Number of counted reviews by this reviewer.
count: number;
}
// Review counts for the monthly timeframe, ordered by reviewer name
// (case-insensitive).
export const review_counts_monthly: ReviewCount[] = [
{
url: "https://github.com/Alystrasz",
name: "Alystrasz",
count: 6,
},
{
url: "https://github.com/catornot",
name: "catornot",
count: 1,
},
{
url: "https://github.com/GeckoEidechse",
name: "GeckoEidechse",
count: 19,
},
{
url: "https://github.com/JMM889901",
name: "JMM889901",
count: 3,
},
{
url: "https://github.com/NachosChipeados",
name: "NachosChipeados",
count: 2,
},
{
url: "https://github.com/uniboi",
name: "uniboi",
count: 1,
},
{
url: "https://github.com/Zanieon",
name: "Zanieon",
count: 1,
},
]
24 changes: 24 additions & 0 deletions src/data/reviewer-count-weekly.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Auto-generated from Python script
// (scripts/scrape-for-reviews.py); manual edits are overwritten when the
// script is run again.

// One row of the reviewer statistics.
export interface ReviewCount {
// Link to the reviewer's GitHub profile (optional).
url?: string;
// GitHub login of the reviewer.
name: string;
// Number of counted reviews by this reviewer.
count: number;
}
// Review counts for the weekly timeframe, ordered by reviewer name
// (case-insensitive).
export const review_counts_weekly: ReviewCount[] = [
{
url: "https://github.com/GeckoEidechse",
name: "GeckoEidechse",
count: 11,
},
{
url: "https://github.com/JMM889901",
name: "JMM889901",
count: 2,
},
{
url: "https://github.com/NachosChipeados",
name: "NachosChipeados",
count: 1,
},
]
Loading