diff --git a/cmd/license-detector/main.go b/cmd/license-detector/main.go index 8994cfd..ca5755f 100644 --- a/cmd/license-detector/main.go +++ b/cmd/license-detector/main.go @@ -103,7 +103,7 @@ func process(arg string) ([]match, error) { return nil, err } - ls, err := licensedb.Detect(resolvedFiler) + ls, _, err := licensedb.Detect(resolvedFiler) if err != nil { return nil, err } diff --git a/licensedb/dataset_test.go b/licensedb/dataset_test.go index f7702a7..945dcba 100644 --- a/licensedb/dataset_test.go +++ b/licensedb/dataset_test.go @@ -23,7 +23,7 @@ func TestDataset(t *testing.T) { for _, project := range projects { go func(project filer.File) { defer wg.Done() - myLicenses, _ := Detect(filer.NestFiler(rootFiler, project.Name)) + myLicenses, _, _ := Detect(filer.NestFiler(rootFiler, project.Name)) if len(myLicenses) > 0 { mutex.Lock() licenses[project.Name] = myLicenses diff --git a/licensedb/internal/db.go b/licensedb/internal/db.go index 93b6c03..74d8c4d 100644 --- a/licensedb/internal/db.go +++ b/licensedb/internal/db.go @@ -460,3 +460,28 @@ func tfidf(freq int, docfreq int, ndocs int) float32 { } return weight } + +func (db *database) QuerySourceFile(text string) map[string]float32 { + candidates := map[string]float32{} + append := func(others map[string]float32) { + for key, val := range others { + if candidates[key] < val { + candidates[key] = val + } + } + } + append(db.QueryLicenseText(string(text))) + // if len(candidates) == 0 { + // append(investigateSourceFile(text, db.nameSubstrings, db.nameSubstringSizes)) + // if len(candidates) == 0 { + // append(investigateSourceFile(text, db.nameShortSubstrings, db.nameShortSubstringSizes)) + // } + // } + if db.debug { + for key, val := range candidates { + println("NLP", key, val) + } + } + db.addURLMatches(candidates, text) + return candidates +} diff --git a/licensedb/internal/investigation.go b/licensedb/internal/investigation.go index f59d53a..2749956 100644 --- a/licensedb/internal/investigation.go +++ b/licensedb/internal/investigation.go @@ -10,6 +10,7 @@ import ( "gopkg.in/src-d/go-license-detector.v2/licensedb/filer" "gopkg.in/src-d/go-license-detector.v2/licensedb/internal/processors" + "gopkg.in/src-d/enry.v1" ) var ( @@ -62,6 +63,36 @@ var ( licenseDirectoryRe = regexp.MustCompile(fmt.Sprintf( "^(%s)$", strings.Join(licenseFileNames, "|"))) + + commentSyntaxesRe = map[string]*regexp.Regexp { + "ANTLR": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "C++": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "C#": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "CSS": regexp.MustCompile(`\/\*(.*?\t?\r?\n?)+?\*\/`), + "Go": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "HTML": regexp.MustCompile(`<\!--(.*?\t?\r?\n?)+?-->`), + "Haskel": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\{-(.*?\t?\r?\n?)+?\-\})`), + "Java": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "JavaScript": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "Matlab": regexp.MustCompile(`(%.*\t?\r?\n?)|(%\{(.?\t?\r?\n?)+?%\})`), + "Objective-C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "Perl": regexp.MustCompile(`(#.*\t?\r?\n?)|(=begin(.*?\t?\r?\n?)+?=cut)`), + "PHP": regexp.MustCompile(`(#.*\t?\r?\n?)|(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "Python": regexp.MustCompile("('''(.?\t?\r?\n?)+?''')|(#.*\t?\r?\n?)|(\"\"\"(.?\t?\r?\n?)+?\"\"\")"), + "Ruby": regexp.MustCompile(`(#.*\t?\r?\n?)|(=begin(.*?\t?\r?\n?)+?=end)`), + "Rust": regexp.MustCompile(`\/\*(.*?\t?\r?\n?)+?\*\/`), + "R": regexp.MustCompile(`#.*\t?\r?\n?`), + "Shell": regexp.MustCompile(`#.*\t?\r?\n?`), + "Swift": regexp.MustCompile(`(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "SAS": regexp.MustCompile(`(\*(.*?\t?\r?\n?)+?;)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "Scala": regexp.MustCompile(`(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "SQL": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "TypeScript": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "YAML": regexp.MustCompile(`#.*\t?\r?\n?`), + } + + cleanCommentsRe = regexp.MustCompile(`#|\*|\/|=begin|=cut|=end`) ) // ExtractLicenseFiles returns the list of possible license texts. @@ -157,3 +188,86 @@ func InvestigateReadmeText(text []byte, fs filer.Filer) map[string]float32 { func IsLicenseDirectory(fileName string) bool { return licenseDirectoryRe.MatchString(strings.ToLower(fileName)) } + +// ExtractSourceFiles searches for source code files and their returns header comments, when available. +// Enry is used to get possible valuable files. +func ExtractSourceFiles(files []string, fs filer.Filer) ([][]byte, []string) { + candidates := [][]byte{} + fileNames := []string{} + langs := []string{} + commentsFileName := []string{} + for _, file := range files { + text, err := fs.ReadFile(file) + if err == nil { + lang := enry.GetLanguage(file, text) + langs = append(langs, lang) + candidates = append(candidates, text) + fileNames = append(fileNames, file) + } + } + if len(candidates) > 0 { + candidates, commentsFileName = ExtractHeaderComments(candidates, langs, fileNames) + } + return candidates, commentsFileName +} + +// ExtractHeaderComments searches in source code files for header comments and outputs license text on them them. +func ExtractHeaderComments(candidates [][]byte, langs []string, fileNames []string) ([][]byte, []string) { + comments := [][]byte{} + commentsFileName := []string{} + var unsupportedTypes string + for i, candidate := range candidates { + candidateLang := langs[i] + if reg, exists := commentSyntaxesRe[candidateLang]; exists { + candidateHeader := candidate + if len(candidateHeader) > 1024 { + candidateHeader = candidate[:1024] + } + if match := reg.FindAllString(string(candidateHeader), -1); match != nil { + commentsFileName = append(commentsFileName, fileNames[i]) + var matchText string + for _, m := range match { + matchText += cleanCommentsRe.ReplaceAllString(m, "") + } + comments = append(comments, []byte(matchText)) + } + } else { + match, _ := regexp.Match(candidateLang, []byte(unsupportedTypes)) + if match == false { + unsupportedTypes += candidateLang + ", " + } + } + } + if len(unsupportedTypes) > 0 { + unsupportedTypes = unsupportedTypes[:len(unsupportedTypes)-2] + fmt.Println("The following file types were not investigated for licenses on the comments:", unsupportedTypes + ". ") + } + return comments, commentsFileName +} + +// InvestigateHeaderComments scans the header comments for licensing information and outputs the +// probable names using NER. +func InvestigateHeaderComments(texts [][]byte, fs filer.Filer, commentsFileName []string) (map[string]float32, []string) { + maxLicenses := map[string]float32{} + licensesFileNames := []string{} + // TO DO: output max license per file, not files with licenses + licenses found + for i, text := range texts { + candidates := InvestigateHeaderComment(text) + if len(candidates) > 0 { + licensesFileNames = append(licensesFileNames, commentsFileName[i]) + for name, sim := range candidates { + maxSim := maxLicenses[name] + if sim > maxSim { + maxLicenses[name] = sim + } + } + } + } + return maxLicenses, licensesFileNames +} + +// InvestigateHeaderComment scans the header comments for licensing information and outputs probable +// names found with Named Entity Recognition from NLP. +func InvestigateHeaderComment(text []byte) map[string]float32 { + return globalLicenseDatabase().QuerySourceFile(string(text)) +} diff --git a/licensedb/internal/nlp.go b/licensedb/internal/nlp.go index 7e015b4..c340389 100644 --- a/licensedb/internal/nlp.go +++ b/licensedb/internal/nlp.go @@ -143,3 +143,14 @@ func splitLicenseName(name string) []substring { }) return result } + +func investigateSourceFile( + text string, licenseNameParts map[string][]substring, + licenseNameSizes map[string]int) map[string]float32 { + // TO DO: split license-comments from description-comments + // ===== + // ---- + // \n\n\n + // import + return map[string]float32{} + } diff --git a/licensedb/licensedb.go b/licensedb/licensedb.go index f524884..5296931 100644 --- a/licensedb/licensedb.go +++ b/licensedb/licensedb.go @@ -15,10 +15,10 @@ var ( // Detect returns the most probable reference licenses matched for the given // file tree. Each match has the confidence assigned, from 0 to 1, 1 means 100% confident. -func Detect(fs filer.Filer) (map[string]float32, error) { +func Detect(fs filer.Filer) (map[string]float32, []string, error) { files, err := fs.ReadDir("") if err != nil { - return nil, err + return nil, nil, err } fileNames := []string{} for _, file := range files { @@ -39,16 +39,43 @@ func Detect(fs filer.Filer) (map[string]float32, error) { candidates := internal.ExtractLicenseFiles(fileNames, fs) licenses := internal.InvestigateLicenseTexts(candidates) if len(licenses) > 0 { - return licenses, nil + return licenses, nil, nil } // Plan B: take the README, find the section about the license and apply NER candidates = internal.ExtractReadmeFiles(fileNames, fs) - if len(candidates) == 0 { - return nil, ErrNoLicenseFound + if len(candidates) > 0 { + licenses = internal.InvestigateReadmeTexts(candidates, fs) + if len(licenses) > 0 { + return licenses, nil, nil + } + } + + // Plan C: look for licence texts in source code files with comments at header + extendedFileNames := []string{} + commentsFileName := []string{} + licensesFileNames := []string{} + extendedFileNames = extractAllSubfiles(fs, extendedFileNames, "") + candidates, commentsFileName = internal.ExtractSourceFiles(extendedFileNames, fs) + if len(candidates) > 0 { + licenses, licensesFileNames = internal.InvestigateHeaderComments(candidates, fs, commentsFileName) } - licenses = internal.InvestigateReadmeTexts(candidates, fs) if len(licenses) == 0 { - return nil, ErrNoLicenseFound + return nil, nil, ErrNoLicenseFound + } + return licenses, licensesFileNames, nil +} + +func extractAllSubfiles(fs filer.Filer, fileNames []string, path string) []string { + files, err := fs.ReadDir(path) + if err == nil { + for _, subfile := range files { + currentPath := paths.Join(path, subfile.Name) + if subfile.IsDir { + fileNames = extractAllSubfiles(fs, fileNames, currentPath) + } else { + fileNames = append(fileNames, currentPath) + } + } } - return licenses, nil + return fileNames }