Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Working version, needs improvement [WIP] #39

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
22 changes: 19 additions & 3 deletions licensedb/internal/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,23 @@ func tfidf(freq int, docfreq int, ndocs int) float32 {
}

func (db *database) QuerySourceFile(text string) map[string]float32 {
// TO DO: implement this function
placeholder := map[string]float32{}
return
candidates := map[string]float32{}
append := func(others map[string]float32) {
for key, val := range others {
if candidates[key] < val {
candidates[key] = val
}
}
}
append(db.QueryLicenseText(string(text)))
if len(candidates) == 0 {
// TO DO: split license-comments from description-comments.
Copy link
Author

@mariabg mariabg Jul 30, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since all of the files in the project are scanned for licenses, I thought about doing this split here for performance's sake. What do you think? Should it be done earlier to improve the percentages in the output? If the license is clear, as in this project, it is found without problems.

}
if db.debug {
for key, val := range candidates {
println("NLP", key, val)
}
}
db.addURLMatches(candidates, text)
return candidates
}
101 changes: 52 additions & 49 deletions licensedb/internal/investigation.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,31 +65,31 @@ var (
"^(%s)$", strings.Join(licenseFileNames, "|")))

commentSyntaxes = map[string]*regexp.Regexp {
// "ANTLR": regexp.MustCompile(``),
"C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*\t*\r*\n*)*\*\/)`),
"C++": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*\t*\r*\n*)*\*\/)`),
"C#": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*\t*\r*\n*)*\*\/)`),
"CSS": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
"Go": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
// "HTML": regexp.MustCompile(``),
"Haskel": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\{-(.*\t*\r*\n*)*\-\})`),
"Java": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
"Javascript": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
"Matlab": regexp.MustCompile(`(%\{(.*\s+.*)*%\})`),
"Objective-C": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
// "Perl": regexp.MustCompile(``),
"PHP": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
"Python": regexp.MustCompile("(#.*\t?\r?\n?)|(```.*```)"),
// "Ruby": regexp.MustCompile(``),
"Rust": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
// "R": regexp.MustCompile(``),
// "Shell": regexp.MustCompile(``),
"Swift": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
// "SAS": regexp.MustCompile(``),
"Scala": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
"SQL": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\/\*(.*\t*\r*\n*)*\*\/)`),
// "Visual Basic": regexp.MustCompile(``),
// "yml": regexp.MustCompile(``),
"ANTLR": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"C++": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"C#": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"CSS": regexp.MustCompile(`\/\*(.*?\t?\r?\n?)+?\*\/`),
"Go": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"HTML": regexp.MustCompile(`<\!--(.*?\t?\r?\n?)+?-->`),
"Haskel": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\{-(.*?\t?\r?\n?)+?\-\})`),
"Java": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"JavaScript": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"Matlab": regexp.MustCompile(`(%.*\t?\r?\n?)|(%\{(.?\t?\r?\n?)+?%\})`),
"Objective-C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"Perl": regexp.MustCompile(`(#.*\t?\r?\n?)|(=begin(.*?\t?\r?\n?)+?=cut)`),
"PHP": regexp.MustCompile(`(#.*\t?\r?\n?)|(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"Python": regexp.MustCompile("('''(.?\t?\r?\n?)+?''')|(#.*\t?\r?\n?)|(\"\"\"(.?\t?\r?\n?)+?\"\"\")"),
"Ruby": regexp.MustCompile(`(#.*\t?\r?\n?)|(=begin(.*?\t?\r?\n?)+?=end)`),
"Rust": regexp.MustCompile(`\/\*(.*?\t?\r?\n?)+?\*\/`),
"R": regexp.MustCompile(`#.*\t?\r?\n?`),
"Shell": regexp.MustCompile(`#.*\t?\r?\n?`),
"Swift": regexp.MustCompile(`(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"SAS": regexp.MustCompile(`(\*(.*?\t?\r?\n?)+?;)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"Scala": regexp.MustCompile(`(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"SQL": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"TypeScript": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"YAML": regexp.MustCompile(`#.*\t?\r?\n?`),
}
)

Expand Down Expand Up @@ -192,17 +192,16 @@ func IsLicenseDirectory(fileName string) bool {
func ExtractSourceFiles(files []string, fs filer.Filer) [][]byte {
candidates := [][]byte{}
langs := []string{}
var empty []byte
for _, file := range files {
lang, safe := enry.GetLanguage(file)
if safe == true {
text, err := fs.ReadFile(file)
if err == nil {
lang := enry.GetLanguage(file, empty)
bzz marked this conversation as resolved.
Show resolved Hide resolved
langs = append(langs, lang)
text, err := fs.ReadFile(file)
if err == nil {
if preprocessor, exists := filePreprocessors[paths.Ext(file)]; exists {
text = preprocessor(text)
}
candidates = append(candidates, text)
if preprocessor, exists := filePreprocessors[paths.Ext(file)]; exists {
text = preprocessor(text)
}
candidates = append(candidates, text)
}
}
if len(candidates) > 0 {
Expand All @@ -214,34 +213,38 @@ func ExtractSourceFiles(files []string, fs filer.Filer) [][]byte {
// ExtractHeaderComments searches source code files for header comments and outputs the license text found in them.
func ExtractHeaderComments(candidates [][]byte, langs []string) [][]byte {
comments := [][]byte{}
for key, candidate := range candidates {
candidateLang := langs[key]
candidateHeader := candidate[:1024]
var unsupportedTypes string
for i, candidate := range candidates {
candidateLang := langs[i]
candidateHeader := candidate
if len(candidateHeader) > 1024 {
candidateHeader = candidate[:1024]
}
if reg, exists := commentSyntaxes[candidateLang]; exists {
if candidateHeader != nil {
if match := reg.FindAllStringSubmatch(string(candidateHeader), -1); match != nil {
var matchText string
for _, m := range match {
var tempText string
for _, k := range m {
tempText += string(k)
}
matchText += string(tempText)
}
comments = append(comments, []byte(matchText))
if match := reg.FindAllString(string(candidateHeader), -1); match != nil {
var matchText string
for _, m := range match {
matchText += m
}
comments = append(comments, []byte(matchText))
}
} else {
fmt.Println("Found a", candidateLang, "file from which is currently unsorported. Please open an issue on Github or contribute to the project by adding support to it.")
match, _ := regexp.Match(candidateLang, []byte(unsupportedTypes))
if match == false {
unsupportedTypes += candidateLang + ", "
}
}
}
if len(unsupportedTypes) > 0 {
unsupportedTypes = unsupportedTypes[:len(unsupportedTypes)-2]
fmt.Println("The following file types were not investigated for licenses on the comments:", unsupportedTypes + ". ")
}
return comments
}

// InvestigateHeaderComments scans the header comments for licensing information and outputs the
// probable names using NER.
func InvestigateHeaderComments(texts [][]byte, fs filer.Filer) map[string]float32 {
// TO DO: split license-comments from description-comments.
maxLicenses := map[string]float32{}
for _, text := range texts {
candidates := InvestigateHeaderComment(text)
Expand Down
19 changes: 18 additions & 1 deletion licensedb/licensedb.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,9 @@ func Detect(fs filer.Filer) (map[string]float32, error) {
}
}
// Plan C: look for licence texts in source code files with comments at header
candidates = internal.ExtractSourceFiles(fileNames, fs)
var extendedFileNames []string
extendedFileNames = extractAllSubfiles(fs, extendedFileNames, "")
candidates = internal.ExtractSourceFiles(extendedFileNames, fs)
if len(candidates) > 0 {
licenses = internal.InvestigateHeaderComments(candidates, fs)
}
Expand All @@ -59,3 +61,18 @@ func Detect(fs filer.Filer) (map[string]float32, error) {
}
return licenses, nil
}

// extractAllSubfiles recursively lists the directory at path through fs and
// returns fileNames extended with the joined path of every entry that is not
// a directory. Sub-directories are descended into as they are encountered,
// so results follow the order in which fs.ReadDir reports entries.
//
// A directory that fs.ReadDir fails to read contributes nothing: the error
// is dropped and fileNames is returned as accumulated so far.
func extractAllSubfiles(fs filer.Filer, fileNames []string, path string) []string {
	entries, err := fs.ReadDir(path)
	if err != nil {
		// Best-effort walk: skip directories that cannot be listed.
		return fileNames
	}
	for _, entry := range entries {
		entryPath := paths.Join(path, entry.Name)
		if !entry.IsDir {
			fileNames = append(fileNames, entryPath)
			continue
		}
		// Recurse, threading the accumulated slice through the call.
		fileNames = extractAllSubfiles(fs, fileNames, entryPath)
	}
	return fileNames
}