"""search_github.py

Search the GitHub API for NEAR-related repositories that are not yet
tracked, excluding repos that have already been manually reviewed.
"""
import csv
import os
import time
from datetime import date, datetime, timedelta

import requests
# GitHub API URLs for searching repositories and code
repo_url = "https://api.github.com/search/repositories"
code_url = "https://api.github.com/search/code"

# GitHub API key, read from the environment
api_key = os.getenv('GITHUB_API_KEY')
headers = {'Authorization': f'token {api_key}'}

# Working directory for the input/output CSV files
directory = os.getcwd()

# Timeframe we want to search over
last_x_days = 30

# Current date and the earliest push date to include
curr_date = str(date.today())
min_date = (datetime.now() - timedelta(days=last_x_days)).strftime('%Y-%m-%d')
# Flatten the list if it's a list of lists; otherwise return it unchanged
def flatten_list(lst):
    if lst and isinstance(lst[0], list):
        return [item for sublist in lst for item in sublist]
    return lst
# Load list of tracked EC repos:
def load_ec_tracked_repos(curr_date):
    with open(f'{directory}/all_near_repos_{curr_date}.csv', 'r') as f:
        reader = csv.reader(f)
        repos = list(reader)
    ec_tracked_repos = flatten_list(repos)
    return set(ec_tracked_repos)
# Load list of previously audited repos that are not NEAR related.
# This list grows over time and prevents repeated manual review of the same repo.
def load_reviewed_repos_to_exclude():
    with open(f'{directory}/checked_repos_to_exclude.csv', 'r') as f:
        reader = csv.reader(f)
        repos = list(reader)
    checked_repos = flatten_list(repos)
    return set(checked_repos)
# Add the repo to new_repos if it has not been seen in any of the lists yet
def check_if_repo_loaded(repo, loaded_repos, checked_repos, new_repos, query_type):
    # Determine repo_name based on the query type (repo vs. code search results)
    repo_name = repo['full_name'] if query_type == "repo" else repo['repository']['full_name']
    # Add it only if it is not in 1) the EC list, 2) the manually reviewed list, or 3) new_repos
    if repo_name not in loaded_repos and repo_name not in checked_repos and repo_name not in new_repos:
        new_repos.append(repo_name)
# Check every item in a page of search results
def check_and_load_repos(data, loaded_repos, checked_repos, new_repos, query_type):
    for repo in data['items']:
        check_if_repo_loaded(repo, loaded_repos, checked_repos, new_repos, query_type)
# Make a request to the GitHub API, retrying once on a non-200 response
def make_request(url, parameters, headers):
    response = requests.get(url, params=parameters, headers=headers)
    if response.status_code == 200:
        return response
    # Simple one-shot retry; raise if the second attempt also fails
    time.sleep(2)
    response = requests.get(url, params=parameters, headers=headers)
    response.raise_for_status()
    return response
def search_repos():
    # Load the list of repositories from the CSV file
    ec_tracked_repos = load_ec_tracked_repos(curr_date)
    # Load the list of already checked repositories from another CSV file
    checked_repos = load_reviewed_repos_to_exclude()
    # Initialize an empty list for new repositories to review
    new_repos = []
    # Search queries as a list of dictionaries
    queries = [
        {"type": "repo", "query": "near-wallet -owner:near"},
        {"type": "repo", "query": "near lake -owner:near"},
        {"type": "repo", "query": "near indexer -owner:near"},
        {"type": "repo", "query": "near rpc -owner:near"},
{"type":"repo","query":"content:README.md near protocol -owner:near"},
{"type":"repo","query":"\"near-sdk\" stars:1..150"},
{"type":"repo","query":"\"near-cli\" stars:1..150"},
{"type":"repo","query":"\"py-near\" -owner:near"},
{"type":"code","query":"py-near -owner:near"},
{"type":"code","query":"near-sdk -owner:near"},
{"type":"code","query":"near-api-js -owner:near"},
]
    for q in queries:
        # Set the URL and parameters based on the type of query
        if q["type"] == "repo":
            url = repo_url
            push = f" pushed:>{min_date}"  # Only repo searches support a pushed-date qualifier
            per_page = 100
        else:  # code search
            url = code_url
            push = ""
            per_page = 100  # The search API caps per_page at 100
        # Loop through pages of results
        page_num = 1
        while True:
            parameters = {"q": f"{q['query']}{push}", "page": page_num, "per_page": per_page}
            print(parameters)
            response = make_request(url, parameters, headers)
            data = response.json()
            check_and_load_repos(data, ec_tracked_repos, checked_repos, new_repos, q["type"])
            if q["type"] == "code":
                time.sleep(6)  # code search is rate limited to 10 requests per minute
            if 'next' in response.links:
                page_num += 1
                print(page_num)
            else:
                break
    # Dedupe new repos
    new_repos = list(set(new_repos))
    # Write new repos to a CSV file
    with open(f'{directory}/new_repos_{curr_date}.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        for repo in new_repos:
            writer.writerow([repo])
# Manually review the list of repos in new_repos:
# - Repos that are not NEAR related go into checked_repos_to_exclude.csv
# - Repos that are NEAR related go into near.toml via a PR
# - Any repos identified as organizations should also be added
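
# Entry point. The original file does not show how search_repos() is invoked,
# so this guard is an assumption about how the script is meant to be run:
#     GITHUB_API_KEY=<token> python search_github.py
if __name__ == "__main__":
    search_repos()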