From 56f225734fe1a143283f87b5d33e51535a5f0550 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mar=C3=ADa=20Benavente?= <mariabenaventeg@gmail.com>
Date: Tue, 17 Jul 2018 10:01:10 +0200
Subject: [PATCH 1/9] Find source code files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: María Benavente <mariabenaventeg@gmail.com>
---
 licensedb/internal/investigation.go | 55 +++++++++++++++++++++++++++++
 licensedb/licensedb.go              | 13 +++++--
 2 files changed, 65 insertions(+), 3 deletions(-)

diff --git a/licensedb/internal/investigation.go b/licensedb/internal/investigation.go
index f59d53a..d59e944 100644
--- a/licensedb/internal/investigation.go
+++ b/licensedb/internal/investigation.go
@@ -10,6 +10,7 @@ import (
 
 	"gopkg.in/src-d/go-license-detector.v2/licensedb/filer"
 	"gopkg.in/src-d/go-license-detector.v2/licensedb/internal/processors"
+	"gopkg.in/src-d/enry.v1"
 )
 
 var (
@@ -157,3 +158,57 @@ func InvestigateReadmeText(text []byte, fs filer.Filer) map[string]float32 {
 func IsLicenseDirectory(fileName string) bool {
 	return licenseDirectoryRe.MatchString(strings.ToLower(fileName))
 }
+
+// ExtractSourceFiles searches for source code files and their returns header comments, when available.
+// Enry is used to get possible valuable files.
+func ExtractSourceFiles(files []string, fs filer.Filer) [][]byte {
+	candidates := [][]byte{}
+	langs := []string{}
+	for _, file := range files {
+		lang, safe := enry.GetLanguageByExtension(file)
+		if safe == true {
+			langs = append(langs, lang)
+			text, err := fs.ReadFile(file)
+			if err == nil {
+				if preprocessor, exists := filePreprocessors[paths.Ext(file)]; exists {
+					text = preprocessor(text)
+				}
+				candidates = append(candidates, text)
+			}
+		}
+	}
+	if len(candidates) > 0 {
+		candidates = ExtractHeaderComments(candidates, langs)
+	}
+	return candidates
+}
+
+// ExtractHeaderComments searches in source code files for header comments and outputs license text on them them.
+func ExtractHeaderComments(candidates [][]byte, lang []string) [][]byte {
+	// TO DO: split code from comments, preferably only header comments
+	comments := [][]byte{}
+	return comments
+}
+
+// InvestigateHeaderComments scans the header comments for licensing information and outputs the
+// probable names using NER.
+func InvestigateHeaderComments(texts [][]byte, fs filer.Filer) map[string]float32 {
+	// TO DO: split license-comments from description-comments.
+	maxLicenses := map[string]float32{}
+	for _, text := range texts {
+		candidates := InvestigateHeaderComment(text)
+		for name, sim := range candidates {
+			maxSim := maxLicenses[name]
+			if sim > maxSim {
+				maxLicenses[name] = sim
+			}
+		}
+	}
+	return maxLicenses
+}
+
+// InvestigateHeaderComment scans the header comments for licensing information and outputs probable
+// names found with Named Entity Recognition from NLP.
+func InvestigateHeaderComment(text []byte) map[string]float32 {
+	return globalLicenseDatabase().QueryLicenseText(string(text))
+}
diff --git a/licensedb/licensedb.go b/licensedb/licensedb.go
index f524884..09a921e 100644
--- a/licensedb/licensedb.go
+++ b/licensedb/licensedb.go
@@ -43,10 +43,17 @@ func Detect(fs filer.Filer) (map[string]float32, error) {
 	}
 	// Plan B: take the README, find the section about the license and apply NER
 	candidates = internal.ExtractReadmeFiles(fileNames, fs)
-	if len(candidates) == 0 {
-		return nil, ErrNoLicenseFound
+	if len(candidates) > 0 {
+		licenses = internal.InvestigateReadmeTexts(candidates, fs)
+		if len(licenses) > 0 {
+			return licenses, nil
+		}
+	}
+	// Plan C: look for licence texts in source code files with comments at header
+	candidates = internal.ExtractSourceFiles(fileNames, fs)
+	if len(candidates) > 0 {
+		licenses = internal.InvestigateHeaderComments(candidates, fs)
 	}
-	licenses = internal.InvestigateReadmeTexts(candidates, fs)
 	if len(licenses) == 0 {
 		return nil, ErrNoLicenseFound
 	}

From 9f98da678852e54dccaf6440eda0d32442f93e12 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mar=C3=ADa=20Benavente?= <mariabenaventeg@gmail.com>
Date: Wed, 18 Jul 2018 10:31:43 +0200
Subject: [PATCH 2/9] use enry's GetLanguage() instead of
 GetLanguageByExtension()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: María Benavente <mariabenaventeg@gmail.com>
---
 licensedb/internal/investigation.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/licensedb/internal/investigation.go b/licensedb/internal/investigation.go
index d59e944..0adaa48 100644
--- a/licensedb/internal/investigation.go
+++ b/licensedb/internal/investigation.go
@@ -165,7 +165,7 @@ func ExtractSourceFiles(files []string, fs filer.Filer) [][]byte {
 	candidates := [][]byte{}
 	langs := []string{}
 	for _, file := range files {
-		lang, safe := enry.GetLanguageByExtension(file)
+		lang, safe := enry.GetLanguage(file)
 		if safe == true {
 			langs = append(langs, lang)
 			text, err := fs.ReadFile(file)

From 48cf528dcfe8f5afa4d63848a11e8f16746b7d2b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mar=C3=ADa=20Benavente?= <mariabenaventeg@gmail.com>
Date: Fri, 20 Jul 2018 18:35:01 +0200
Subject: [PATCH 3/9] Parse header comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: María Benavente <mariabenaventeg@gmail.com>
---
 licensedb/internal/db.go            |  6 ++++
 licensedb/internal/investigation.go | 54 +++++++++++++++++++++++++++--
 2 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/licensedb/internal/db.go b/licensedb/internal/db.go
index 93b6c03..a51d2f4 100644
--- a/licensedb/internal/db.go
+++ b/licensedb/internal/db.go
@@ -460,3 +460,9 @@ func tfidf(freq int, docfreq int, ndocs int) float32 {
 	}
 	return weight
 }
+
+func func (db *database) QuerySourceFile(text string) map[string]float32 {
+	// TO DO: implement this function
+	placeholder := map[string]float32{}
+	return
+}
diff --git a/licensedb/internal/investigation.go b/licensedb/internal/investigation.go
index 0adaa48..6e98050 100644
--- a/licensedb/internal/investigation.go
+++ b/licensedb/internal/investigation.go
@@ -63,6 +63,34 @@ var (
 
 	licenseDirectoryRe = regexp.MustCompile(fmt.Sprintf(
 		"^(%s)$", strings.Join(licenseFileNames, "|")))
+
+	commentSyntaxes = map[string]*regexp.Regexp {
+		// "ANTLR": regexp.MustCompile(``),
+		"C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*\t*\r*\n*)*\*\/)`),
+		"C++": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*\t*\r*\n*)*\*\/)`),
+		"C#": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*\t*\r*\n*)*\*\/)`),
+		"CSS": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
+		"Go": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
+		// "HTML": regexp.MustCompile(``),
+		"Haskel": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\{-(.*\t*\r*\n*)*\-\})`),
+		"Java": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
+		"Javascript": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
+		"Matlab": regexp.MustCompile(`(%\{(.*\s+.*)*%\})`),
+		"Objective-C": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
+		// "Perl": regexp.MustCompile(``),
+		"PHP": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
+		"Python": regexp.MustCompile("(#.*\t?\r?\n?)|(```.*```)"),
+		// "Ruby": regexp.MustCompile(``),
+		"Rust": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
+		// "R": regexp.MustCompile(``),
+		// "Shell": regexp.MustCompile(``),
+		"Swift": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
+		// "SAS": regexp.MustCompile(``),
+		"Scala": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
+		"SQL": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\/\*(.*\t*\r*\n*)*\*\/)`),
+		// "Visual Basic": regexp.MustCompile(``),
+		// "yml": regexp.MustCompile(``),
+	}
 )
 
 // ExtractLicenseFiles returns the list of possible license texts.
@@ -184,9 +212,29 @@ func ExtractSourceFiles(files []string, fs filer.Filer) [][]byte {
 }
 
 // ExtractHeaderComments searches in source code files for header comments and outputs license text on them them.
-func ExtractHeaderComments(candidates [][]byte, lang []string) [][]byte {
-	// TO DO: split code from comments, preferably only header comments
+func ExtractHeaderComments(candidates [][]byte, langs []string) [][]byte {
 	comments := [][]byte{}
+	for key, candidate := range candidates {
+		candidateLang := langs[key]
+		candidateHeader := candidate[:1024]
+		if reg, exists := commentSyntaxes[candidateLang]; exists {
+			if candidateHeader != nil {
+				if match := reg.FindAllStringSubmatch(string(candidateHeader), -1); match != nil {
+						var matchText string
+						for _, m := range match {
+							var tempText string
+							for _, k := range m {
+								tempText += string(k)
+							}
+							matchText += string(tempText)
+						}
+						comments = append(comments, []byte(matchText))
+				}
+			}
+		} else {
+				fmt.Println("Found a", candidateLang, "file from which is currently unsorported. Please open an issue on Github or contribute to the project by adding support to it.")
+		}
+	}
 	return comments
 }
 
@@ -210,5 +258,5 @@ func InvestigateHeaderComments(texts [][]byte, fs filer.Filer) map[string]float3
 // InvestigateHeaderComment scans the header comments for licensing information and outputs probable
 // names found with Named Entity Recognition from NLP.
 func InvestigateHeaderComment(text []byte) map[string]float32 {
-	return globalLicenseDatabase().QueryLicenseText(string(text))
+	return globalLicenseDatabase().QuerySourceFile(string(text))
 }

From 901ab7e4c8732f7b757355dc3970aacf4e5b823a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mar=C3=ADa=20Benavente?= <mariabenaventeg@gmail.com>
Date: Fri, 20 Jul 2018 18:36:14 +0200
Subject: [PATCH 4/9] Fix typo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: María Benavente <mariabenaventeg@gmail.com>
---
 licensedb/internal/db.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/licensedb/internal/db.go b/licensedb/internal/db.go
index a51d2f4..d18291e 100644
--- a/licensedb/internal/db.go
+++ b/licensedb/internal/db.go
@@ -461,7 +461,7 @@ func tfidf(freq int, docfreq int, ndocs int) float32 {
 	return weight
 }
 
-func func (db *database) QuerySourceFile(text string) map[string]float32 {
+func (db *database) QuerySourceFile(text string) map[string]float32 {
 	// TO DO: implement this function
 	placeholder := map[string]float32{}
 	return

From e691fdc47fe952c719e6ee06a41ecc45be7ba1c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mar=C3=ADa=20Benavente?= <mariabenaventeg@gmail.com>
Date: Mon, 30 Jul 2018 18:26:36 +0200
Subject: [PATCH 5/9] Scan-individual-files implementation working (but needs
 improvement)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: María Benavente <mariabenaventeg@gmail.com>
---
 licensedb/internal/db.go            |  22 +++++-
 licensedb/internal/investigation.go | 101 ++++++++++++++--------------
 licensedb/licensedb.go              |  19 +++++-
 3 files changed, 89 insertions(+), 53 deletions(-)

diff --git a/licensedb/internal/db.go b/licensedb/internal/db.go
index d18291e..7a3c988 100644
--- a/licensedb/internal/db.go
+++ b/licensedb/internal/db.go
@@ -462,7 +462,23 @@ func tfidf(freq int, docfreq int, ndocs int) float32 {
 }
 
 func (db *database) QuerySourceFile(text string) map[string]float32 {
-	// TO DO: implement this function
-	placeholder := map[string]float32{}
-	return
+	candidates := map[string]float32{}
+	append := func(others map[string]float32) {
+		for key, val := range others {
+			if candidates[key] < val {
+				candidates[key] = val
+			}
+		}
+	}
+	append(db.QueryLicenseText(string(text)))
+	if len(candidates) == 0 {
+		// TO DO: split license-comments from description-comments.
+	}
+	if db.debug {
+		for key, val := range candidates {
+			println("NLP", key, val)
+		}
+	}
+	db.addURLMatches(candidates, text)
+	return candidates
 }
diff --git a/licensedb/internal/investigation.go b/licensedb/internal/investigation.go
index 6e98050..fd17ef6 100644
--- a/licensedb/internal/investigation.go
+++ b/licensedb/internal/investigation.go
@@ -65,31 +65,31 @@ var (
 		"^(%s)$", strings.Join(licenseFileNames, "|")))
 
 	commentSyntaxes = map[string]*regexp.Regexp {
-		// "ANTLR": regexp.MustCompile(``),
-		"C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*\t*\r*\n*)*\*\/)`),
-		"C++": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*\t*\r*\n*)*\*\/)`),
-		"C#": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*\t*\r*\n*)*\*\/)`),
-		"CSS": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
-		"Go": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
-		// "HTML": regexp.MustCompile(``),
-		"Haskel": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\{-(.*\t*\r*\n*)*\-\})`),
-		"Java": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
-		"Javascript": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
-		"Matlab": regexp.MustCompile(`(%\{(.*\s+.*)*%\})`),
-		"Objective-C": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
-		// "Perl": regexp.MustCompile(``),
-		"PHP": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
-		"Python": regexp.MustCompile("(#.*\t?\r?\n?)|(```.*```)"),
-		// "Ruby": regexp.MustCompile(``),
-		"Rust": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
-		// "R": regexp.MustCompile(``),
-		// "Shell": regexp.MustCompile(``),
-		"Swift": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
-		// "SAS": regexp.MustCompile(``),
-		"Scala": regexp.MustCompile(`(\/\*(.*\t*\r*\n*)*\*\/)`),
-		"SQL": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\/\*(.*\t*\r*\n*)*\*\/)`),
-		// "Visual Basic": regexp.MustCompile(``),
-		// "yml": regexp.MustCompile(``),
+		"ANTLR": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"C++": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"C#": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"CSS": regexp.MustCompile(`\/\*(.*?\t?\r?\n?)+?\*\/`),
+		"Go": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"HTML": regexp.MustCompile(`<\!--(.*?\t?\r?\n?)+?-->`),
+		"Haskel": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\{-(.*?\t?\r?\n?)+?\-\})`),
+		"Java": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"JavaScript": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"Matlab": regexp.MustCompile(`(%.*\t?\r?\n?)|(%\{(.?\t?\r?\n?)+?%\})`),
+		"Objective-C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"Perl": regexp.MustCompile(`(#.*\t?\r?\n?)|(=begin(.*?\t?\r?\n?)+?=cut)`),
+		"PHP": regexp.MustCompile(`(#.*\t?\r?\n?)|(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"Python": regexp.MustCompile("('''(.?\t?\r?\n?)+?''')|(#.*\t?\r?\n?)|(\"\"\"(.?\t?\r?\n?)+?\"\"\")"),
+		"Ruby": regexp.MustCompile(`(#.*\t?\r?\n?)|(=begin(.*?\t?\r?\n?)+?=end)`),
+		"Rust": regexp.MustCompile(`\/\*(.*?\t?\r?\n?)+?\*\/`),
+		"R": regexp.MustCompile(`#.*\t?\r?\n?`),
+		"Shell": regexp.MustCompile(`#.*\t?\r?\n?`),
+		"Swift": regexp.MustCompile(`(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"SAS": regexp.MustCompile(`(\*(.*?\t?\r?\n?)+?;)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"Scala": regexp.MustCompile(`(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"SQL": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"TypeScript": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"YAML": regexp.MustCompile(`#.*\t?\r?\n?`),
 	}
 )
 
@@ -192,17 +192,16 @@ func IsLicenseDirectory(fileName string) bool {
 func ExtractSourceFiles(files []string, fs filer.Filer) [][]byte {
 	candidates := [][]byte{}
 	langs := []string{}
+	var empty []byte
 	for _, file := range files {
-		lang, safe := enry.GetLanguage(file)
-		if safe == true {
+		text, err := fs.ReadFile(file)
+		if err == nil {
+			lang := enry.GetLanguage(file, empty)
 			langs = append(langs, lang)
-			text, err := fs.ReadFile(file)
-			if err == nil {
-				if preprocessor, exists := filePreprocessors[paths.Ext(file)]; exists {
-					text = preprocessor(text)
-				}
-				candidates = append(candidates, text)
+			if preprocessor, exists := filePreprocessors[paths.Ext(file)]; exists {
+				text = preprocessor(text)
 			}
+			candidates = append(candidates, text)
 		}
 	}
 	if len(candidates) > 0 {
@@ -214,34 +213,38 @@ func ExtractSourceFiles(files []string, fs filer.Filer) [][]byte {
 // ExtractHeaderComments searches in source code files for header comments and outputs license text on them them.
 func ExtractHeaderComments(candidates [][]byte, langs []string) [][]byte {
 	comments := [][]byte{}
-	for key, candidate := range candidates {
-		candidateLang := langs[key]
-		candidateHeader := candidate[:1024]
+	var unsupportedTypes string
+	for i, candidate := range candidates {
+		candidateLang := langs[i]
+		candidateHeader := candidate
+		if len(candidateHeader) > 1024 {
+			candidateHeader = candidate[:1024]
+		}
 		if reg, exists := commentSyntaxes[candidateLang]; exists {
-			if candidateHeader != nil {
-				if match := reg.FindAllStringSubmatch(string(candidateHeader), -1); match != nil {
-						var matchText string
-						for _, m := range match {
-							var tempText string
-							for _, k := range m {
-								tempText += string(k)
-							}
-							matchText += string(tempText)
-						}
-						comments = append(comments, []byte(matchText))
+			if match := reg.FindAllString(string(candidateHeader), -1); match != nil {
+				var matchText string
+				for _, m := range match {
+					matchText += m
 				}
+				comments = append(comments, []byte(matchText))
 			}
 		} else {
-				fmt.Println("Found a", candidateLang, "file from which is currently unsorported. Please open an issue on Github or contribute to the project by adding support to it.")
+			match, _ := regexp.Match(candidateLang, []byte(unsupportedTypes))
+			if match == false {
+				unsupportedTypes += candidateLang + ", "
+			}
 		}
 	}
+	if len(unsupportedTypes) > 0 {
+		unsupportedTypes = unsupportedTypes[:len(unsupportedTypes)-2]
+		fmt.Println("The following file types were not investigated for licenses on the comments:", unsupportedTypes + ". ")
+	}
 	return comments
 }
 
 // InvestigateHeaderComments scans the header comments for licensing information and outputs the
 // probable names using NER.
 func InvestigateHeaderComments(texts [][]byte, fs filer.Filer) map[string]float32 {
-	// TO DO: split license-comments from description-comments.
 	maxLicenses := map[string]float32{}
 	for _, text := range texts {
 		candidates := InvestigateHeaderComment(text)
diff --git a/licensedb/licensedb.go b/licensedb/licensedb.go
index 09a921e..0afd404 100644
--- a/licensedb/licensedb.go
+++ b/licensedb/licensedb.go
@@ -50,7 +50,9 @@ func Detect(fs filer.Filer) (map[string]float32, error) {
 		}
 	}
 	// Plan C: look for licence texts in source code files with comments at header
-	candidates = internal.ExtractSourceFiles(fileNames, fs)
+	var extendedFileNames []string
+	extendedFileNames = extractAllSubfiles(fs, extendedFileNames, "")
+	candidates = internal.ExtractSourceFiles(extendedFileNames, fs)
 	if len(candidates) > 0 {
 		licenses = internal.InvestigateHeaderComments(candidates, fs)
 	}
@@ -59,3 +61,18 @@ func Detect(fs filer.Filer) (map[string]float32, error) {
 	}
 	return licenses, nil
 }
+
+func extractAllSubfiles(fs filer.Filer, fileNames []string, path string) []string {
+	files, err := fs.ReadDir(path)
+	if err == nil {
+		for _, subfile := range files {
+			currentPath := paths.Join(path, subfile.Name)
+			if subfile.IsDir {
+				fileNames = extractAllSubfiles(fs, fileNames, currentPath)
+			} else {
+				fileNames = append(fileNames, currentPath)
+			}
+		}
+	}
+	return fileNames
+}

From 1c941d3a2837c44df094d306f63c9de8e3533dbc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mar=C3=ADa=20Benavente?= <mariabenaventeg@gmail.com>
Date: Mon, 30 Jul 2018 20:24:42 +0200
Subject: [PATCH 6/9] change var name for consistency
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: María Benavente <mariabenaventeg@gmail.com>
---
 licensedb/internal/investigation.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/licensedb/internal/investigation.go b/licensedb/internal/investigation.go
index fd17ef6..2a45637 100644
--- a/licensedb/internal/investigation.go
+++ b/licensedb/internal/investigation.go
@@ -64,7 +64,7 @@ var (
 	licenseDirectoryRe = regexp.MustCompile(fmt.Sprintf(
 		"^(%s)$", strings.Join(licenseFileNames, "|")))
 
-	commentSyntaxes = map[string]*regexp.Regexp {
+	commentSyntaxesRe = map[string]*regexp.Regexp {
 		"ANTLR": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
 		"C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
 		"C++": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
@@ -220,7 +220,7 @@ func ExtractHeaderComments(candidates [][]byte, langs []string) [][]byte {
 		if len(candidateHeader) > 1024 {
 			candidateHeader = candidate[:1024]
 		}
-		if reg, exists := commentSyntaxes[candidateLang]; exists {
+		if reg, exists := commentSyntaxesRe[candidateLang]; exists {
 			if match := reg.FindAllString(string(candidateHeader), -1); match != nil {
 				var matchText string
 				for _, m := range match {

From 8061b5a0445e955323fa9a134764be19f633563e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mar=C3=ADa=20Benavente?= <mariabenaventeg@gmail.com>
Date: Mon, 29 Oct 2018 16:44:40 +0100
Subject: [PATCH 7/9] add investigateSourceFile() structure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: María Benavente <mariabenaventeg@gmail.com>
---
 licensedb/internal/db.go  |  9 ++++++---
 licensedb/internal/nlp.go | 11 +++++++++++
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/licensedb/internal/db.go b/licensedb/internal/db.go
index 7a3c988..74d8c4d 100644
--- a/licensedb/internal/db.go
+++ b/licensedb/internal/db.go
@@ -471,9 +471,12 @@ func (db *database) QuerySourceFile(text string) map[string]float32 {
 		}
 	}
 	append(db.QueryLicenseText(string(text)))
-	if len(candidates) == 0 {
-		// TO DO: split license-comments from description-comments.
-	}
+	// if len(candidates) == 0 {
+	// 	append(investigateSourceFile(text, db.nameSubstrings, db.nameSubstringSizes))
+	// 	if len(candidates) == 0 {
+	// 		append(investigateSourceFile(text, db.nameShortSubstrings, db.nameShortSubstringSizes))
+	// 	}
+	// }
 	if db.debug {
 		for key, val := range candidates {
 			println("NLP", key, val)
diff --git a/licensedb/internal/nlp.go b/licensedb/internal/nlp.go
index 7e015b4..c340389 100644
--- a/licensedb/internal/nlp.go
+++ b/licensedb/internal/nlp.go
@@ -143,3 +143,14 @@ func splitLicenseName(name string) []substring {
 	})
 	return result
 }
+
+func investigateSourceFile(
+	text string, licenseNameParts map[string][]substring,
+	licenseNameSizes map[string]int) map[string]float32 {
+	// TO DO: split license-comments from description-comments
+			// =====
+			// ----
+			// \n\n\n
+			// import
+	return map[string]float32{}
+	}

From 7d2ccc0c366e172cdb75517bc7d56c6cd55eaff8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mar=C3=ADa=20Benavente?= <mariabenaventeg@gmail.com>
Date: Mon, 29 Oct 2018 16:46:32 +0100
Subject: [PATCH 8/9] add support for future printing of the full report
 output for all the files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: María Benavente <mariabenaventeg@gmail.com>
---
 cmd/license-detector/main.go | 2 +-
 licensedb/dataset_test.go    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmd/license-detector/main.go b/cmd/license-detector/main.go
index 8994cfd..ca5755f 100644
--- a/cmd/license-detector/main.go
+++ b/cmd/license-detector/main.go
@@ -103,7 +103,7 @@ func process(arg string) ([]match, error) {
 		return nil, err
 	}
 
-	ls, err := licensedb.Detect(resolvedFiler)
+	ls, _, err := licensedb.Detect(resolvedFiler)
 	if err != nil {
 		return nil, err
 	}
diff --git a/licensedb/dataset_test.go b/licensedb/dataset_test.go
index f7702a7..945dcba 100644
--- a/licensedb/dataset_test.go
+++ b/licensedb/dataset_test.go
@@ -23,7 +23,7 @@ func TestDataset(t *testing.T) {
 	for _, project := range projects {
 		go func(project filer.File) {
 			defer wg.Done()
-			myLicenses, _ := Detect(filer.NestFiler(rootFiler, project.Name))
+			myLicenses, _, _ := Detect(filer.NestFiler(rootFiler, project.Name))
 			if len(myLicenses) > 0 {
 				mutex.Lock()
 				licenses[project.Name] = myLicenses

From f8395c150e838dc5d379e493c42fea3e083106da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mar=C3=ADa=20Benavente?= <mariabenaventeg@gmail.com>
Date: Mon, 29 Oct 2018 16:48:57 +0100
Subject: [PATCH 9/9] minor changes + getting filenames where licenses were
 found
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: María Benavente <mariabenaventeg@gmail.com>
---
 licensedb/internal/investigation.go | 52 +++++++++++++++++------------
 licensedb/licensedb.go              | 21 +++++++-----
 2 files changed, 42 insertions(+), 31 deletions(-)

diff --git a/licensedb/internal/investigation.go b/licensedb/internal/investigation.go
index 2a45637..2749956 100644
--- a/licensedb/internal/investigation.go
+++ b/licensedb/internal/investigation.go
@@ -91,6 +91,8 @@ var (
 		"TypeScript": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
 		"YAML": regexp.MustCompile(`#.*\t?\r?\n?`),
 	}
+
+	cleanCommentsRe = regexp.MustCompile(`#|\*|\/|=begin|=cut|=end`)
 )
 
 // ExtractLicenseFiles returns the list of possible license texts.
@@ -189,42 +191,43 @@ func IsLicenseDirectory(fileName string) bool {
 
 // ExtractSourceFiles searches for source code files and their returns header comments, when available.
 // Enry is used to get possible valuable files.
-func ExtractSourceFiles(files []string, fs filer.Filer) [][]byte {
+func ExtractSourceFiles(files []string, fs filer.Filer) ([][]byte, []string) {
 	candidates := [][]byte{}
+	fileNames := []string{}
 	langs := []string{}
-	var empty []byte
+	commentsFileName := []string{}
 	for _, file := range files {
 		text, err := fs.ReadFile(file)
 		if err == nil {
-			lang := enry.GetLanguage(file, empty)
+			lang := enry.GetLanguage(file, text)
 			langs = append(langs, lang)
-			if preprocessor, exists := filePreprocessors[paths.Ext(file)]; exists {
-				text = preprocessor(text)
-			}
 			candidates = append(candidates, text)
+			fileNames = append(fileNames, file)
 		}
 	}
 	if len(candidates) > 0 {
-		candidates = ExtractHeaderComments(candidates, langs)
+		candidates, commentsFileName = ExtractHeaderComments(candidates, langs, fileNames)
 	}
-	return candidates
+	return candidates, commentsFileName
 }
 
 // ExtractHeaderComments searches in source code files for header comments and outputs license text on them them.
-func ExtractHeaderComments(candidates [][]byte, langs []string) [][]byte {
+func ExtractHeaderComments(candidates [][]byte, langs []string, fileNames []string) ([][]byte, []string) {
 	comments := [][]byte{}
+	commentsFileName := []string{}
 	var unsupportedTypes string
 	for i, candidate := range candidates {
 		candidateLang := langs[i]
-		candidateHeader := candidate
-		if len(candidateHeader) > 1024 {
-			candidateHeader = candidate[:1024]
-		}
 		if reg, exists := commentSyntaxesRe[candidateLang]; exists {
+			candidateHeader := candidate
+			if len(candidateHeader) > 1024 {
+				candidateHeader = candidate[:1024]
+			}
 			if match := reg.FindAllString(string(candidateHeader), -1); match != nil {
+				commentsFileName = append(commentsFileName, fileNames[i])
 				var matchText string
 				for _, m := range match {
-					matchText += m
+					matchText += cleanCommentsRe.ReplaceAllString(m, "")
 				}
 				comments = append(comments, []byte(matchText))
 			}
@@ -239,23 +242,28 @@ func ExtractHeaderComments(candidates [][]byte, langs []string) [][]byte {
 		unsupportedTypes = unsupportedTypes[:len(unsupportedTypes)-2]
 		fmt.Println("The following file types were not investigated for licenses on the comments:", unsupportedTypes + ". ")
 	}
-	return comments
+	return comments, commentsFileName
 }
 
 // InvestigateHeaderComments scans the header comments for licensing information and outputs the
 // probable names using NER.
-func InvestigateHeaderComments(texts [][]byte, fs filer.Filer) map[string]float32 {
+func InvestigateHeaderComments(texts [][]byte, fs filer.Filer, commentsFileName []string) (map[string]float32, []string) {
 	maxLicenses := map[string]float32{}
-	for _, text := range texts {
+	licensesFileNames := []string{}
+	// TO DO: output max license per file, not files with licenses + licenses found
+	for i, text := range texts {
 		candidates := InvestigateHeaderComment(text)
-		for name, sim := range candidates {
-			maxSim := maxLicenses[name]
-			if sim > maxSim {
-				maxLicenses[name] = sim
+		if len(candidates) > 0 {
+			licensesFileNames = append(licensesFileNames, commentsFileName[i])
+			for name, sim := range candidates {
+				maxSim := maxLicenses[name]
+				if sim > maxSim {
+					maxLicenses[name] = sim
+				}
 			}
 		}
 	}
-	return maxLicenses
+	return maxLicenses, licensesFileNames
 }
 
 // InvestigateHeaderComment scans the header comments for licensing information and outputs probable
diff --git a/licensedb/licensedb.go b/licensedb/licensedb.go
index 0afd404..5296931 100644
--- a/licensedb/licensedb.go
+++ b/licensedb/licensedb.go
@@ -15,10 +15,10 @@ var (
 
 // Detect returns the most probable reference licenses matched for the given
 // file tree. Each match has the confidence assigned, from 0 to 1, 1 means 100% confident.
-func Detect(fs filer.Filer) (map[string]float32, error) {
+func Detect(fs filer.Filer) (map[string]float32, []string, error) {
 	files, err := fs.ReadDir("")
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}
 	fileNames := []string{}
 	for _, file := range files {
@@ -39,27 +39,30 @@ func Detect(fs filer.Filer) (map[string]float32, error) {
 	candidates := internal.ExtractLicenseFiles(fileNames, fs)
 	licenses := internal.InvestigateLicenseTexts(candidates)
 	if len(licenses) > 0 {
-		return licenses, nil
+		return licenses, nil, nil
 	}
 	// Plan B: take the README, find the section about the license and apply NER
 	candidates = internal.ExtractReadmeFiles(fileNames, fs)
 	if len(candidates) > 0 {
 		licenses = internal.InvestigateReadmeTexts(candidates, fs)
 		if len(licenses) > 0 {
-			return licenses, nil
+			return licenses, nil, nil
 		}
 	}
+
 	// Plan C: look for licence texts in source code files with comments at header
-	var extendedFileNames []string
+	extendedFileNames := []string{}
+	commentsFileName := []string{}
+	licensesFileNames := []string{}
 	extendedFileNames = extractAllSubfiles(fs, extendedFileNames, "")
-	candidates = internal.ExtractSourceFiles(extendedFileNames, fs)
+	candidates, commentsFileName = internal.ExtractSourceFiles(extendedFileNames, fs)
 	if len(candidates) > 0 {
-		licenses = internal.InvestigateHeaderComments(candidates, fs)
+		licenses, licensesFileNames = internal.InvestigateHeaderComments(candidates, fs, commentsFileName)
 	}
 	if len(licenses) == 0 {
-		return nil, ErrNoLicenseFound
+		return nil, nil, ErrNoLicenseFound
 	}
-	return licenses, nil
+	return licenses, licensesFileNames, nil
 }
 
 func extractAllSubfiles(fs filer.Filer, fileNames []string, path string) []string {