-
Notifications
You must be signed in to change notification settings - Fork 0
/
copyright.py
214 lines (187 loc) · 7.45 KB
/
copyright.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
#!/usr/bin/env python
#
# This file is part of ...
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
"""Copyright file generator.
This script runs "git log" to determine the authors of commits to a repository
or set of paths within a repository and then generates a COPYRIGHT file
corresponding to those authors and the dates of their commits.
"""
import collections
import os
import re
import subprocess
import sys
# NOTE(review): the "[email protected]" strings below look like placeholders
# left behind by e-mail redaction rather than real addresses, and several
# of the lists had lost their closing brackets (and possibly further
# entries) along with them.  The brackets are restored here so that the
# module parses; the actual addresses still need to be recovered from the
# upstream copy of this file before the mapping can be trusted.
INSTITUTIONS = {
    "Association of Universities for Research in Astronomy":
        ["noao.edu", "lsst.org", "LSST.org", "[email protected]"],
    "California Institute of Technology":
        ["caltech.edu"],
    "The Board of Trustees of the Leland Stanford Junior "
    "University, through SLAC National Accelerator Laboratory":
        ["slac.stanford.edu", "[email protected]",
         "[email protected]"],  # !!!
    "The Regents of the University of California":
        ["ucdavis.edu",
         "[email protected]"],  # !!!
    "The Trustees of Princeton University":
        ["princeton.edu", "[email protected]"],
    "The University of Tokyo":
        ["[email protected]"],
    "University of Illinois Board of Trustees":
        ["illinois.edu", "[email protected]"],
    "University of Washington":
        ["uw.edu", "washington.edu", "[email protected]"],
}
"""Dictionary of institutions claiming copyright.

Key is the official copyright text for the institution.  Value is a list of
suffixes for emails belonging to that institution; an author email is
attributed to the first institution that has a suffix the email ends with.
"""
# Compared by the script section of this file against the line counts
# parsed from each commit's "git log --shortstat" summary line.
THRESHOLD = 3
"""Significance threshold for copyright, in number of lines.
A commit is deemed significant for copyright purposes if it
adds, deletes, or changes at least this number of lines.
"""
def find_institution(email, year, month):
    """Map an author's email and commit date to a copyright holder.

    Commits dated before 2014-08 (the start of construction) are all
    attributed to LSST Corporation.  One address is special-cased because
    its owner, John Swinbank, moved from Princeton to UW at the end of
    2017-09.  Any other address is matched by suffix against the entries
    of ``INSTITUTIONS``.

    Parameters
    ----------
    email : `str`
        Email of the commit author (not committer).
    year : `int`
        Year that the commit was made.
    month : `int`
        Month that the commit was made.

    Returns
    -------
    institution : `str`
        Copyright text for the institution which claims copyright to the
        author's work, or ``email`` itself if no institution matches.
    """
    # Tuples compare element by element, so this orders by year first and
    # by month only when the years are equal.
    commit_date = (year, month)

    if commit_date < (2014, 8):
        return "LSST Corporation"

    if email == "[email protected]":
        if commit_date < (2017, 10):
            return "The Trustees of Princeton University"
        return "University of Washington"

    for institution, suffixes in INSTITUTIONS.items():
        if any(email.endswith(suffix) for suffix in suffixes):
            return institution

    # No institution claims this address; fall back to the address itself.
    return email
def format_year_range(min_year, max_year):
    """Render an inclusive span of years as text.

    Parameters
    ----------
    min_year : `int`
        Low end of the range (inclusive).
    max_year : `int`
        High end of the range (inclusive).

    Returns
    -------
    text : `str`
        The single year when both ends are equal, otherwise the two ends
        joined by a hyphen (``"2014-2017"``).
    """
    if min_year != max_year:
        return "%d-%d" % (min_year, max_year)
    # Both ends coincide, so a single year is enough.
    return str(min_year)
def stringify_year_set(years):
    """Convert a set of years into a list of year ranges.

    Runs of consecutive years are collapsed into a single ``first-last``
    range; isolated years are listed on their own.  For example,
    ``{2014, 2015, 2016, 2018}`` becomes ``"2014-2016, 2018"``.

    Parameters
    ----------
    years : `set` of `int`
        Set of years.  May be empty; any iterable of `int` is accepted and
        repeated years are counted once.

    Returns
    -------
    text : `str`
        Comma-separated year ranges in ascending order, or an empty string
        if ``years`` is empty.
    """
    # De-duplicate first so that a repeated year cannot start a new range.
    year_list = sorted(set(years))
    if not year_list:
        # Nothing to format; indexing the first element would raise.
        return ""

    min_year = max_year = year_list[0]
    year_ranges = []
    for year in year_list[1:]:
        if year == max_year + 1:
            # Still inside a run of consecutive years: extend it.
            max_year = year
        else:
            # Gap found: emit the finished run and start a new one.
            year_ranges.append(format_year_range(min_year, max_year))
            min_year = max_year = year
    # Emit the run that was still open when the input ended.
    year_ranges.append(format_year_range(min_year, max_year))
    return ", ".join(year_ranges)
if __name__ == "__main__":
    # Script entry point: run "git log", decide which commits are
    # significant, and print one "Copyright <years> <holder>" line per
    # copyright holder.

    # Log format includes the author date in ISO-like format, author email,
    # and full commit hash.
    git_cmd = ["git", "log", "--pretty=format:%ai %ae %H", "--shortstat"]
    # Append any other git arguments from the command line.
    git_cmd += sys.argv[1:]
    # Request text output: on Python 3 check_output() otherwise returns
    # bytes, which cannot be split on or matched against str patterns.
    log = subprocess.check_output(git_cmd, universal_newlines=True)

    # Read in the full hashes of any commits deemed not copyrightable.
    # A set keeps the per-commit membership test constant-time.
    insignificant = set()
    if os.path.isfile(".non-copyright"):
        with open(".non-copyright", "r") as f:
            for line in f:
                # Allow for comments by only taking hashes at the beginning
                # of a line.  Text that follows the hash or appears on lines
                # that don't begin with a hex character is ignored.
                m = re.search(r"^([\da-f]{40})", line)
                if m:
                    insignificant.add(m.group(1))

    # One pretty-formatted line per commit.
    commit_re = re.compile(
        r"^(\d{4})-(\d\d)-\d\d [\d:]+ [+-]\d{4} (\S+) ([\da-f]+)$")
    # One (or zero, e.g. for merges) shortstat line per commit.
    shortstat_re = re.compile(
        r"^ \d+ file.+?, (\d+) .+?(?:, (\d+) del.*)?$")

    copyrights = collections.defaultdict(set)
    # (year, month, email, commit_hash) of the most recent commit header,
    # or None until the first header has been seen.
    current = None
    for line in log.split("\n"):
        m = commit_re.search(line)
        if m:
            current = (int(m.group(1)), int(m.group(2)),
                       m.group(3), m.group(4))
            continue

        m = shortstat_re.search(line)
        if m is None or current is None:
            # Neither a header nor a shortstat line, or a shortstat line
            # with no preceding header to attribute it to.
            continue

        # Test for commit significance: either captured line count must
        # reach the threshold.  The second group is absent when the
        # shortstat line reports only one kind of change.
        significant = any(
            count is not None and int(count) >= THRESHOLD
            for count in m.groups())
        if not significant:
            continue

        year, month, email, commit_hash = current
        if commit_hash not in insignificant:
            institution = find_institution(email, year, month)
            copyrights[institution].add(year)

    for institution in copyrights:
        print("Copyright {} {}".format(
            stringify_year_set(copyrights[institution]), institution))