From 56f225734fe1a143283f87b5d33e51535a5f0550 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mar=C3=ADa=20Benavente?= Date: Tue, 17 Jul 2018 10:01:10 +0200 Subject: [PATCH 1/9] Find source code files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: María Benavente --- licensedb/internal/investigation.go | 55 +++++++++++++++++++++++++++++ licensedb/licensedb.go | 13 +++++-- 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/licensedb/internal/investigation.go b/licensedb/internal/investigation.go index f59d53a..d59e944 100644 --- a/licensedb/internal/investigation.go +++ b/licensedb/internal/investigation.go @@ -10,6 +10,7 @@ import ( "gopkg.in/src-d/go-license-detector.v2/licensedb/filer" "gopkg.in/src-d/go-license-detector.v2/licensedb/internal/processors" + "gopkg.in/src-d/enry.v1" ) var ( @@ -157,3 +158,57 @@ func InvestigateReadmeText(text []byte, fs filer.Filer) map[string]float32 { func IsLicenseDirectory(fileName string) bool { return licenseDirectoryRe.MatchString(strings.ToLower(fileName)) } + +// ExtractSourceFiles searches for source code files and their returns header comments, when available. +// Enry is used to get possible valuable files. +func ExtractSourceFiles(files []string, fs filer.Filer) [][]byte { + candidates := [][]byte{} + langs := []string{} + for _, file := range files { + lang, safe := enry.GetLanguageByExtension(file) + if safe == true { + langs = append(langs, lang) + text, err := fs.ReadFile(file) + if err == nil { + if preprocessor, exists := filePreprocessors[paths.Ext(file)]; exists { + text = preprocessor(text) + } + candidates = append(candidates, text) + } + } + } + if len(candidates) > 0 { + candidates = ExtractHeaderComments(candidates, langs) + } + return candidates +} + +// ExtractHeaderComments searches in source code files for header comments and outputs license text on them them. 
+func ExtractHeaderComments(candidates [][]byte, lang []string) [][]byte { + // TO DO: split code from comments, preferably only header comments + comments := [][]byte{} + return comments +} + +// InvestigateHeaderComments scans the header comments for licensing information and outputs the +// probable names using NER. +func InvestigateHeaderComments(texts [][]byte, fs filer.Filer) map[string]float32 { + // TO DO: split license-comments from description-comments. + maxLicenses := map[string]float32{} + for _, text := range texts { + candidates := InvestigateHeaderComment(text) + for name, sim := range candidates { + maxSim := maxLicenses[name] + if sim > maxSim { + maxLicenses[name] = sim + } + } + } + return maxLicenses +} + +// InvestigateHeaderComment scans the header comments for licensing information and outputs probable +// names found with Named Entity Recognition from NLP. +func InvestigateHeaderComment(text []byte) map[string]float32 { + return globalLicenseDatabase().QueryLicenseText(string(text)) +} diff --git a/licensedb/licensedb.go b/licensedb/licensedb.go index f524884..09a921e 100644 --- a/licensedb/licensedb.go +++ b/licensedb/licensedb.go @@ -43,10 +43,17 @@ func Detect(fs filer.Filer) (map[string]float32, error) { } // Plan B: take the README, find the section about the license and apply NER candidates = internal.ExtractReadmeFiles(fileNames, fs) - if len(candidates) == 0 { - return nil, ErrNoLicenseFound + if len(candidates) > 0 { + licenses = internal.InvestigateReadmeTexts(candidates, fs) + if len(licenses) > 0 { + return licenses, nil + } + } + // Plan C: look for licence texts in source code files with comments at header + candidates = internal.ExtractSourceFiles(fileNames, fs) + if len(candidates) > 0 { + licenses = internal.InvestigateHeaderComments(candidates, fs) } - licenses = internal.InvestigateReadmeTexts(candidates, fs) if len(licenses) == 0 { return nil, ErrNoLicenseFound } From 9f98da678852e54dccaf6440eda0d32442f93e12 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Mar=C3=ADa=20Benavente?= Date: Wed, 18 Jul 2018 10:31:43 +0200 Subject: [PATCH 2/9] use enry's GetLanguage() instead of GetLanguageByExtension() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: María Benavente --- licensedb/internal/investigation.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/licensedb/internal/investigation.go b/licensedb/internal/investigation.go index d59e944..0adaa48 100644 --- a/licensedb/internal/investigation.go +++ b/licensedb/internal/investigation.go @@ -165,7 +165,7 @@ func ExtractSourceFiles(files []string, fs filer.Filer) [][]byte { candidates := [][]byte{} langs := []string{} for _, file := range files { - lang, safe := enry.GetLanguageByExtension(file) + lang, safe := enry.GetLanguage(file) if safe == true { langs = append(langs, lang) text, err := fs.ReadFile(file) From 48cf528dcfe8f5afa4d63848a11e8f16746b7d2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mar=C3=ADa=20Benavente?= Date: Fri, 20 Jul 2018 18:35:01 +0200 Subject: [PATCH 3/9] Parse header comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: María Benavente --- licensedb/internal/db.go | 6 ++++ licensedb/internal/investigation.go | 54 +++++++++++++++++++++++++++-- 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/licensedb/internal/db.go b/licensedb/internal/db.go index 93b6c03..a51d2f4 100644 --- a/licensedb/internal/db.go +++ b/licensedb/internal/db.go @@ -460,3 +460,9 @@ func tfidf(freq int, docfreq int, ndocs int) float32 { } return weight } + +func func (db *database) QuerySourceFile(text string) map[string]float32 { + // TO DO: implement this function + placeholder := map[string]float32{} + return +} diff --git a/licensedb/internal/investigation.go b/licensedb/internal/investigation.go index 0adaa48..6e98050 100644 --- a/licensedb/internal/investigation.go +++ 
b/licensedb/internal/investigation.go @@ -63,6 +63,34 @@ var ( licenseDirectoryRe = regexp.MustCompile(fmt.Sprintf( "^(%s)$", strings.Join(licenseFileNames, "|"))) + + commentSyntaxes = map[string]*regexp.Regexp { + // "ANTLR": regexp.MustCompile(``), + "C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*\t*\r*\n*)*\*\/)`), + "C++": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*\t*\r*\n*)*\*\/)`), + "C#": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*\t*\r*\n*)*\*\/)`), + "CSS": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`), + "Go": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`), + // "HTML": regexp.MustCompile(``), + "Haskel": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\{-(.*\t*\r*\n*)*\-\})`), + "Java": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`), + "Javascript": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`), + "Matlab": regexp.MustCompile(`(%\{(.*\s+.*)*%\})`), + "Objective-C": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`), + // "Perl": regexp.MustCompile(``), + "PHP": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`), + "Python": regexp.MustCompile("(#.*\t?\r?\n?)|(```.*```)"), + // "Ruby": regexp.MustCompile(``), + "Rust": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`), + // "R": regexp.MustCompile(``), + // "Shell": regexp.MustCompile(``), + "Swift": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`), + // "SAS": regexp.MustCompile(``), + "Scala": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`), + "SQL": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\/\*(.*\t*\r*\n*)*\*\/)`), + // "Visual Basic": regexp.MustCompile(``), + // "yml": regexp.MustCompile(``), + } ) // ExtractLicenseFiles returns the list of possible license texts. @@ -184,9 +212,29 @@ func ExtractSourceFiles(files []string, fs filer.Filer) [][]byte { } // ExtractHeaderComments searches in source code files for header comments and outputs license text on them them. 
-func ExtractHeaderComments(candidates [][]byte, lang []string) [][]byte { - // TO DO: split code from comments, preferably only header comments +func ExtractHeaderComments(candidates [][]byte, langs []string) [][]byte { comments := [][]byte{} + for key, candidate := range candidates { + candidateLang := langs[key] + candidateHeader := candidate[:1024] + if reg, exists := commentSyntaxes[candidateLang]; exists { + if candidateHeader != nil { + if match := reg.FindAllStringSubmatch(string(candidateHeader), -1); match != nil { + var matchText string + for _, m := range match { + var tempText string + for _, k := range m { + tempText += string(k) + } + matchText += string(tempText) + } + comments = append(comments, []byte(matchText)) + } + } + } else { + fmt.Println("Found a", candidateLang, "file from which is currently unsorported. Please open an issue on Github or contribute to the project by adding support to it.") + } + } return comments } @@ -210,5 +258,5 @@ func InvestigateHeaderComments(texts [][]byte, fs filer.Filer) map[string]float3 // InvestigateHeaderComment scans the header comments for licensing information and outputs probable // names found with Named Entity Recognition from NLP. 
func InvestigateHeaderComment(text []byte) map[string]float32 { - return globalLicenseDatabase().QueryLicenseText(string(text)) + return globalLicenseDatabase().QuerySourceFile(string(text)) } From 901ab7e4c8732f7b757355dc3970aacf4e5b823a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mar=C3=ADa=20Benavente?= Date: Fri, 20 Jul 2018 18:36:14 +0200 Subject: [PATCH 4/9] Fix typo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: María Benavente --- licensedb/internal/db.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/licensedb/internal/db.go b/licensedb/internal/db.go index a51d2f4..d18291e 100644 --- a/licensedb/internal/db.go +++ b/licensedb/internal/db.go @@ -461,7 +461,7 @@ func tfidf(freq int, docfreq int, ndocs int) float32 { return weight } -func func (db *database) QuerySourceFile(text string) map[string]float32 { +func (db *database) QuerySourceFile(text string) map[string]float32 { // TO DO: implement this function placeholder := map[string]float32{} return From e691fdc47fe952c719e6ee06a41ecc45be7ba1c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mar=C3=ADa=20Benavente?= Date: Mon, 30 Jul 2018 18:26:36 +0200 Subject: [PATCH 5/9] Scan-individual-files implementation working (but needs improvement) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: María Benavente --- licensedb/internal/db.go | 22 +++++- licensedb/internal/investigation.go | 101 ++++++++++++++-------------- licensedb/licensedb.go | 19 +++++- 3 files changed, 89 insertions(+), 53 deletions(-) diff --git a/licensedb/internal/db.go b/licensedb/internal/db.go index d18291e..7a3c988 100644 --- a/licensedb/internal/db.go +++ b/licensedb/internal/db.go @@ -462,7 +462,23 @@ func tfidf(freq int, docfreq int, ndocs int) float32 { } func (db *database) QuerySourceFile(text string) map[string]float32 { - // TO DO: implement this function - placeholder := map[string]float32{} - return + 
candidates := map[string]float32{} + append := func(others map[string]float32) { + for key, val := range others { + if candidates[key] < val { + candidates[key] = val + } + } + } + append(db.QueryLicenseText(string(text))) + if len(candidates) == 0 { + // TO DO: split license-comments from description-comments. + } + if db.debug { + for key, val := range candidates { + println("NLP", key, val) + } + } + db.addURLMatches(candidates, text) + return candidates } diff --git a/licensedb/internal/investigation.go b/licensedb/internal/investigation.go index 6e98050..fd17ef6 100644 --- a/licensedb/internal/investigation.go +++ b/licensedb/internal/investigation.go @@ -65,31 +65,31 @@ var ( "^(%s)$", strings.Join(licenseFileNames, "|"))) commentSyntaxes = map[string]*regexp.Regexp { - // "ANTLR": regexp.MustCompile(``), - "C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*\t*\r*\n*)*\*\/)`), - "C++": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*\t*\r*\n*)*\*\/)`), - "C#": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*\t*\r*\n*)*\*\/)`), - "CSS": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`), - "Go": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`), - // "HTML": regexp.MustCompile(``), - "Haskel": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\{-(.*\t*\r*\n*)*\-\})`), - "Java": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`), - "Javascript": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`), - "Matlab": regexp.MustCompile(`(%\{(.*\s+.*)*%\})`), - "Objective-C": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`), - // "Perl": regexp.MustCompile(``), - "PHP": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`), - "Python": regexp.MustCompile("(#.*\t?\r?\n?)|(```.*```)"), - // "Ruby": regexp.MustCompile(``), - "Rust": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`), - // "R": regexp.MustCompile(``), - // "Shell": regexp.MustCompile(``), - "Swift": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`), - // "SAS": regexp.MustCompile(``), - "Scala": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`), - 
"SQL": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\/\*(.*\t*\r*\n*)*\*\/)`), - // "Visual Basic": regexp.MustCompile(``), - // "yml": regexp.MustCompile(``), + "ANTLR": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "C++": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "C#": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "CSS": regexp.MustCompile(`\/\*(.*?\t?\r?\n?)+?\*\/`), + "Go": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "HTML": regexp.MustCompile(`<\!--(.*?\t?\r?\n?)+?-->`), + "Haskel": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\{-(.*?\t?\r?\n?)+?\-\})`), + "Java": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "JavaScript": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "Matlab": regexp.MustCompile(`(%.*\t?\r?\n?)|(%\{(.?\t?\r?\n?)+?%\})`), + "Objective-C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "Perl": regexp.MustCompile(`(#.*\t?\r?\n?)|(=begin(.*?\t?\r?\n?)+?=cut)`), + "PHP": regexp.MustCompile(`(#.*\t?\r?\n?)|(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "Python": regexp.MustCompile("('''(.?\t?\r?\n?)+?''')|(#.*\t?\r?\n?)|(\"\"\"(.?\t?\r?\n?)+?\"\"\")"), + "Ruby": regexp.MustCompile(`(#.*\t?\r?\n?)|(=begin(.*?\t?\r?\n?)+?=end)`), + "Rust": regexp.MustCompile(`\/\*(.*?\t?\r?\n?)+?\*\/`), + "R": regexp.MustCompile(`#.*\t?\r?\n?`), + "Shell": regexp.MustCompile(`#.*\t?\r?\n?`), + "Swift": regexp.MustCompile(`(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "SAS": regexp.MustCompile(`(\*(.*?\t?\r?\n?)+?;)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "Scala": regexp.MustCompile(`(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "SQL": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "TypeScript": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), + "YAML": regexp.MustCompile(`#.*\t?\r?\n?`), } ) @@ -192,17 +192,16 @@ 
func IsLicenseDirectory(fileName string) bool { func ExtractSourceFiles(files []string, fs filer.Filer) [][]byte { candidates := [][]byte{} langs := []string{} + var empty []byte for _, file := range files { - lang, safe := enry.GetLanguage(file) - if safe == true { + text, err := fs.ReadFile(file) + if err == nil { + lang := enry.GetLanguage(file, empty) langs = append(langs, lang) - text, err := fs.ReadFile(file) - if err == nil { - if preprocessor, exists := filePreprocessors[paths.Ext(file)]; exists { - text = preprocessor(text) - } - candidates = append(candidates, text) + if preprocessor, exists := filePreprocessors[paths.Ext(file)]; exists { + text = preprocessor(text) } + candidates = append(candidates, text) } } if len(candidates) > 0 { @@ -214,34 +213,38 @@ func ExtractSourceFiles(files []string, fs filer.Filer) [][]byte { // ExtractHeaderComments searches in source code files for header comments and outputs license text on them them. func ExtractHeaderComments(candidates [][]byte, langs []string) [][]byte { comments := [][]byte{} - for key, candidate := range candidates { - candidateLang := langs[key] - candidateHeader := candidate[:1024] + var unsupportedTypes string + for i, candidate := range candidates { + candidateLang := langs[i] + candidateHeader := candidate + if len(candidateHeader) > 1024 { + candidateHeader = candidate[:1024] + } if reg, exists := commentSyntaxes[candidateLang]; exists { - if candidateHeader != nil { - if match := reg.FindAllStringSubmatch(string(candidateHeader), -1); match != nil { - var matchText string - for _, m := range match { - var tempText string - for _, k := range m { - tempText += string(k) - } - matchText += string(tempText) - } - comments = append(comments, []byte(matchText)) + if match := reg.FindAllString(string(candidateHeader), -1); match != nil { + var matchText string + for _, m := range match { + matchText += m } + comments = append(comments, []byte(matchText)) } } else { - fmt.Println("Found a", 
candidateLang, "file from which is currently unsorported. Please open an issue on Github or contribute to the project by adding support to it.") + match, _ := regexp.Match(candidateLang, []byte(unsupportedTypes)) + if match == false { + unsupportedTypes += candidateLang + ", " + } } } + if len(unsupportedTypes) > 0 { + unsupportedTypes = unsupportedTypes[:len(unsupportedTypes)-2] + fmt.Println("The following file types were not investigated for licenses on the comments:", unsupportedTypes + ". ") + } return comments } // InvestigateHeaderComments scans the header comments for licensing information and outputs the // probable names using NER. func InvestigateHeaderComments(texts [][]byte, fs filer.Filer) map[string]float32 { - // TO DO: split license-comments from description-comments. maxLicenses := map[string]float32{} for _, text := range texts { candidates := InvestigateHeaderComment(text) diff --git a/licensedb/licensedb.go b/licensedb/licensedb.go index 09a921e..0afd404 100644 --- a/licensedb/licensedb.go +++ b/licensedb/licensedb.go @@ -50,7 +50,9 @@ func Detect(fs filer.Filer) (map[string]float32, error) { } } // Plan C: look for licence texts in source code files with comments at header - candidates = internal.ExtractSourceFiles(fileNames, fs) + var extendedFileNames []string + extendedFileNames = extractAllSubfiles(fs, extendedFileNames, "") + candidates = internal.ExtractSourceFiles(extendedFileNames, fs) if len(candidates) > 0 { licenses = internal.InvestigateHeaderComments(candidates, fs) } @@ -59,3 +61,18 @@ func Detect(fs filer.Filer) (map[string]float32, error) { } return licenses, nil } + +func extractAllSubfiles(fs filer.Filer, fileNames []string, path string) []string { + files, err := fs.ReadDir(path) + if err == nil { + for _, subfile := range files { + currentPath := paths.Join(path, subfile.Name) + if subfile.IsDir { + fileNames = extractAllSubfiles(fs, fileNames, currentPath) + } else { + fileNames = append(fileNames, currentPath) + } + } + } 
+ return fileNames +} From 1c941d3a2837c44df094d306f63c9de8e3533dbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mar=C3=ADa=20Benavente?= Date: Mon, 30 Jul 2018 20:24:42 +0200 Subject: [PATCH 6/9] change var name for consistency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: María Benavente --- licensedb/internal/investigation.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/licensedb/internal/investigation.go b/licensedb/internal/investigation.go index fd17ef6..2a45637 100644 --- a/licensedb/internal/investigation.go +++ b/licensedb/internal/investigation.go @@ -64,7 +64,7 @@ var ( licenseDirectoryRe = regexp.MustCompile(fmt.Sprintf( "^(%s)$", strings.Join(licenseFileNames, "|"))) - commentSyntaxes = map[string]*regexp.Regexp { + commentSyntaxesRe = map[string]*regexp.Regexp { "ANTLR": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), "C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), "C++": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), @@ -220,7 +220,7 @@ func ExtractHeaderComments(candidates [][]byte, langs []string) [][]byte { if len(candidateHeader) > 1024 { candidateHeader = candidate[:1024] } - if reg, exists := commentSyntaxes[candidateLang]; exists { + if reg, exists := commentSyntaxesRe[candidateLang]; exists { if match := reg.FindAllString(string(candidateHeader), -1); match != nil { var matchText string for _, m := range match { From 8061b5a0445e955323fa9a134764be19f633563e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mar=C3=ADa=20Benavente?= Date: Mon, 29 Oct 2018 16:44:40 +0100 Subject: [PATCH 7/9] add investigateSourceFile() structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: María Benavente --- licensedb/internal/db.go | 9 ++++++--- licensedb/internal/nlp.go | 11 +++++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git 
a/licensedb/internal/db.go b/licensedb/internal/db.go index 7a3c988..74d8c4d 100644 --- a/licensedb/internal/db.go +++ b/licensedb/internal/db.go @@ -471,9 +471,12 @@ func (db *database) QuerySourceFile(text string) map[string]float32 { } } append(db.QueryLicenseText(string(text))) - if len(candidates) == 0 { - // TO DO: split license-comments from description-comments. - } + // if len(candidates) == 0 { + // append(investigateSourceFile(text, db.nameSubstrings, db.nameSubstringSizes)) + // if len(candidates) == 0 { + // append(investigateSourceFile(text, db.nameShortSubstrings, db.nameShortSubstringSizes)) + // } + // } if db.debug { for key, val := range candidates { println("NLP", key, val) diff --git a/licensedb/internal/nlp.go b/licensedb/internal/nlp.go index 7e015b4..c340389 100644 --- a/licensedb/internal/nlp.go +++ b/licensedb/internal/nlp.go @@ -143,3 +143,14 @@ func splitLicenseName(name string) []substring { }) return result } + +func investigateSourceFile( + text string, licenseNameParts map[string][]substring, + licenseNameSizes map[string]int) map[string]float32 { + // TO DO: split license-comments from description-comments + // ===== + // ---- + // \n\n\n + // import + return map[string]float32{} + } From 7d2ccc0c366e172cdb75517bc7d56c6cd55eaff8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mar=C3=ADa=20Benavente?= Date: Mon, 29 Oct 2018 16:46:32 +0100 Subject: [PATCH 8/9] add support for future printing of the full report for all the files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: María Benavente --- cmd/license-detector/main.go | 2 +- licensedb/dataset_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/license-detector/main.go b/cmd/license-detector/main.go index 8994cfd..ca5755f 100644 --- a/cmd/license-detector/main.go +++ b/cmd/license-detector/main.go @@ -103,7 +103,7 @@ func process(arg string) ([]match, error) { return nil, err } - ls, err := 
licensedb.Detect(resolvedFiler) + ls, _, err := licensedb.Detect(resolvedFiler) if err != nil { return nil, err } diff --git a/licensedb/dataset_test.go b/licensedb/dataset_test.go index f7702a7..945dcba 100644 --- a/licensedb/dataset_test.go +++ b/licensedb/dataset_test.go @@ -23,7 +23,7 @@ func TestDataset(t *testing.T) { for _, project := range projects { go func(project filer.File) { defer wg.Done() - myLicenses, _ := Detect(filer.NestFiler(rootFiler, project.Name)) + myLicenses, _, _ := Detect(filer.NestFiler(rootFiler, project.Name)) if len(myLicenses) > 0 { mutex.Lock() licenses[project.Name] = myLicenses From f8395c150e838dc5d379e493c42fea3e083106da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mar=C3=ADa=20Benavente?= Date: Mon, 29 Oct 2018 16:48:57 +0100 Subject: [PATCH 9/9] minor changes + getting filenames where licenses were found MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: María Benavente --- licensedb/internal/investigation.go | 52 +++++++++++++++++------------ licensedb/licensedb.go | 21 +++++++----- 2 files changed, 42 insertions(+), 31 deletions(-) diff --git a/licensedb/internal/investigation.go b/licensedb/internal/investigation.go index 2a45637..2749956 100644 --- a/licensedb/internal/investigation.go +++ b/licensedb/internal/investigation.go @@ -91,6 +91,8 @@ var ( "TypeScript": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`), "YAML": regexp.MustCompile(`#.*\t?\r?\n?`), } + + cleanCommentsRe = regexp.MustCompile(`#|\*|\/|=begin|=cut|=end`) ) // ExtractLicenseFiles returns the list of possible license texts. @@ -189,42 +191,43 @@ func IsLicenseDirectory(fileName string) bool { // ExtractSourceFiles searches for source code files and their returns header comments, when available. // Enry is used to get possible valuable files. 
-func ExtractSourceFiles(files []string, fs filer.Filer) [][]byte { +func ExtractSourceFiles(files []string, fs filer.Filer) ([][]byte, []string) { candidates := [][]byte{} + fileNames := []string{} langs := []string{} - var empty []byte + commentsFileName := []string{} for _, file := range files { text, err := fs.ReadFile(file) if err == nil { - lang := enry.GetLanguage(file, empty) + lang := enry.GetLanguage(file, text) langs = append(langs, lang) - if preprocessor, exists := filePreprocessors[paths.Ext(file)]; exists { - text = preprocessor(text) - } candidates = append(candidates, text) + fileNames = append(fileNames, file) } } if len(candidates) > 0 { - candidates = ExtractHeaderComments(candidates, langs) + candidates, commentsFileName = ExtractHeaderComments(candidates, langs, fileNames) } - return candidates + return candidates, commentsFileName } // ExtractHeaderComments searches in source code files for header comments and outputs license text on them them. -func ExtractHeaderComments(candidates [][]byte, langs []string) [][]byte { +func ExtractHeaderComments(candidates [][]byte, langs []string, fileNames []string) ([][]byte, []string) { comments := [][]byte{} + commentsFileName := []string{} var unsupportedTypes string for i, candidate := range candidates { candidateLang := langs[i] - candidateHeader := candidate - if len(candidateHeader) > 1024 { - candidateHeader = candidate[:1024] - } if reg, exists := commentSyntaxesRe[candidateLang]; exists { + candidateHeader := candidate + if len(candidateHeader) > 1024 { + candidateHeader = candidate[:1024] + } if match := reg.FindAllString(string(candidateHeader), -1); match != nil { + commentsFileName = append(commentsFileName, fileNames[i]) var matchText string for _, m := range match { - matchText += m + matchText += cleanCommentsRe.ReplaceAllString(m, "") } comments = append(comments, []byte(matchText)) } @@ -239,23 +242,28 @@ func ExtractHeaderComments(candidates [][]byte, langs []string) [][]byte { 
unsupportedTypes = unsupportedTypes[:len(unsupportedTypes)-2] fmt.Println("The following file types were not investigated for licenses on the comments:", unsupportedTypes + ". ") } - return comments + return comments, commentsFileName } // InvestigateHeaderComments scans the header comments for licensing information and outputs the // probable names using NER. -func InvestigateHeaderComments(texts [][]byte, fs filer.Filer) map[string]float32 { +func InvestigateHeaderComments(texts [][]byte, fs filer.Filer, commentsFileName []string) (map[string]float32, []string) { maxLicenses := map[string]float32{} - for _, text := range texts { + licensesFileNames := []string{} + // TO DO: output max license per file, not files with licenses + licenses found + for i, text := range texts { candidates := InvestigateHeaderComment(text) - for name, sim := range candidates { - maxSim := maxLicenses[name] - if sim > maxSim { - maxLicenses[name] = sim + if len(candidates) > 0 { + licensesFileNames = append(licensesFileNames, commentsFileName[i]) + for name, sim := range candidates { + maxSim := maxLicenses[name] + if sim > maxSim { + maxLicenses[name] = sim + } } } } - return maxLicenses + return maxLicenses, licensesFileNames } // InvestigateHeaderComment scans the header comments for licensing information and outputs probable diff --git a/licensedb/licensedb.go b/licensedb/licensedb.go index 0afd404..5296931 100644 --- a/licensedb/licensedb.go +++ b/licensedb/licensedb.go @@ -15,10 +15,10 @@ var ( // Detect returns the most probable reference licenses matched for the given // file tree. Each match has the confidence assigned, from 0 to 1, 1 means 100% confident. 
-func Detect(fs filer.Filer) (map[string]float32, error) { +func Detect(fs filer.Filer) (map[string]float32, []string, error) { files, err := fs.ReadDir("") if err != nil { - return nil, err + return nil, nil, err } fileNames := []string{} for _, file := range files { @@ -39,27 +39,30 @@ func Detect(fs filer.Filer) (map[string]float32, error) { candidates := internal.ExtractLicenseFiles(fileNames, fs) licenses := internal.InvestigateLicenseTexts(candidates) if len(licenses) > 0 { - return licenses, nil + return licenses, nil, nil } // Plan B: take the README, find the section about the license and apply NER candidates = internal.ExtractReadmeFiles(fileNames, fs) if len(candidates) > 0 { licenses = internal.InvestigateReadmeTexts(candidates, fs) if len(licenses) > 0 { - return licenses, nil + return licenses, nil, nil } } + // Plan C: look for licence texts in source code files with comments at header - var extendedFileNames []string + extendedFileNames := []string{} + commentsFileName := []string{} + licensesFileNames := []string{} extendedFileNames = extractAllSubfiles(fs, extendedFileNames, "") - candidates = internal.ExtractSourceFiles(extendedFileNames, fs) + candidates, commentsFileName = internal.ExtractSourceFiles(extendedFileNames, fs) if len(candidates) > 0 { - licenses = internal.InvestigateHeaderComments(candidates, fs) + licenses, licensesFileNames = internal.InvestigateHeaderComments(candidates, fs, commentsFileName) } if len(licenses) == 0 { - return nil, ErrNoLicenseFound + return nil, nil, ErrNoLicenseFound } - return licenses, nil + return licenses, licensesFileNames, nil } func extractAllSubfiles(fs filer.Filer, fileNames []string, path string) []string {