From abf0b8809237bb87e2ed201c296bb9abf2b8ffda Mon Sep 17 00:00:00 2001 From: Mark Date: Fri, 28 May 2021 16:50:40 +0200 Subject: [PATCH 1/2] Add tag filter and json one line --- converter/converter.go | 21 +++++++++++++---- converter/json.go | 53 ++++++++++++++++++++++++++++-------------- main.go | 2 ++ 3 files changed, 54 insertions(+), 22 deletions(-) diff --git a/converter/converter.go b/converter/converter.go index 07b631e..a7cc94d 100755 --- a/converter/converter.go +++ b/converter/converter.go @@ -19,6 +19,8 @@ type Config struct { SourcePath string StoreToDir string SkipHTMLDecoding bool + FilterByTagId string + JsonOneLine bool } const ( @@ -101,7 +103,14 @@ func Convert(cfg Config) (err error) { } else { cfg.StoreToDir = sourcePathResolved } - + var tags []string + if cfg.FilterByTagId != "" { + tags = strings.Fields(cfg.FilterByTagId) + log.Printf("Filter tags containing: %s", tags) + } + if !(cfg.JsonOneLine) { + log.Printf("Write one json obj per line instead of array") + } log.Printf("Total %d file(s) to convert", len(sourceFiles)) var wg sync.WaitGroup @@ -112,7 +121,7 @@ func Convert(cfg Config) (err error) { fmt.Sprintf("%s.%s", typeName, cfg.ResultFormat)) wg.Add(1) log.Printf("[%s] Converting is started", typeName) - go convertXMLFile(&wg, typeName, sf, resultFile) + go convertXMLFile(&wg, typeName, sf, resultFile, tags, cfg.JsonOneLine) } wg.Wait() @@ -120,7 +129,7 @@ func Convert(cfg Config) (err error) { return } -func convertXMLFile(wg *sync.WaitGroup, typeName string, xmlFilePath string, resultFilePath string) { +func convertXMLFile(wg *sync.WaitGroup, typeName string, xmlFilePath string, resultFilePath string, tags []string, jsonOneline bool) { xmlFile, err := os.Open(xmlFilePath) if err != nil { log.Printf("[%s] Error: %s", typeName, err) @@ -138,9 +147,13 @@ func convertXMLFile(wg *sync.WaitGroup, typeName string, xmlFilePath string, res var total, converted int64 switch converterConfig.ResultFormat { case "csv": + if len(tags) != 0 { + log.Printf("Tag filter for csv not supported") + return + } total, converted, err = convertToCSV(typeName, xmlFile, resultFile, converterConfig) case "json": - total, converted, err = convertToJSON(typeName, xmlFile, resultFile, converterConfig) + total, converted, err = convertToJSON(typeName, xmlFile, resultFile, converterConfig, tags, jsonOneline) } if err != nil { diff --git a/converter/json.go b/converter/json.go index f18eda7..a900287 100644 --- a/converter/json.go +++ b/converter/json.go @@ -4,27 +4,31 @@ import ( "bufio" "bytes" "encoding/json" + "github.com/SkobelevIgor/stackexchange-xml-converter/encoders" "log" "os" - - "github.com/SkobelevIgor/stackexchange-xml-converter/encoders" + "regexp" + "strings" ) // WriteBufferSize bytes (8MB) const WriteBufferSize = 8388608 -func convertToJSON(typeName string, xmlFile *os.File, jsonFile *os.File, cfg Config) (total int64, converted int64, err error) { - +func convertToJSON(typeName string, xmlFile *os.File, jsonFile *os.File, cfg Config, tags []string, oneLine bool) (total int64, converted int64, err error) { iterator := NewIterator(xmlFile) + log.Printf(jsonFile.Name()) w := bufio.NewWriterSize(jsonFile, WriteBufferSize) defer w.Flush() - - w.WriteByte('[') - + if oneLine == false { + log.Printf("oneline not set") + w.WriteByte('[') + } var iErr error for iterator.Next() { if total > 0 && iErr == nil { - w.WriteByte(',') + if oneLine == false { + w.WriteByte(',') + } } total++ encoder, _ := encoders.NewEncoder(typeName) @@ -33,28 +37,41 @@ func convertToJSON(typeName string, xmlFile *os.File, jsonFile *os.File, cfg Con log.Printf("[%s] Error: %s", typeName, iErr) continue } - if cfg.SkipHTMLDecoding { encoder.EscapeFields() } - ji, iErr := marshal(&encoder) if iErr != nil { log.Printf("[%s] Error: %s", typeName, iErr) continue } - _, iErr = w.Write(ji) - if iErr != nil { - log.Printf("[%s] Error: %s", typeName, iErr) - continue + // We might want to exclude this post + // if we filter for tags this needs to be initialized with false + var ignorePost bool + ignorePost = len(tags) > 0 + if ignorePost { + // Look for tags in post + re := regexp.MustCompile(`"Tags":"(<.+>)+"`) + tagsVar := re.FindString(string(ji)) + // check if we the post is tagged with a label we are intested in + for i := 0; i < len(tags) && ignorePost; i++ { + ignorePost = !(strings.Contains(tagsVar, "<" + tags[i] + ">")) + } + } + if !(ignorePost) { + // log.Printf("test: %s", string(ji)) + _, iErr = w.Write(ji) + if iErr != nil { + log.Printf("[%s] Error: %s", typeName, iErr) + continue + } } - converted++ } - - w.WriteByte(']') - + if oneLine == false { + w.WriteByte(']') + } return } diff --git a/main.go b/main.go index 3778950..247fae1 100755 --- a/main.go +++ b/main.go @@ -14,6 +14,8 @@ func main() { flag.StringVar(&cfg.SourcePath, "source-path", "", "Path to XML file(s)") flag.StringVar(&cfg.StoreToDir, "store-to-dir", "", "Path where to store CSV file(s)") flag.BoolVar(&cfg.SkipHTMLDecoding, "skip-html-decoding", false, "Path where to store CSV file(s)") + flag.StringVar(&cfg.FilterByTagId, "filter-by-tag-id", "", "Filter for tags, space sperated list") + flag.BoolVar(&cfg.JsonOneLine, "json-one-line", false, "Save json file as one object per line") flag.Parse() var err error From ce3c5d2009a465289df1768a60707ad4ee4d0392 Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 24 May 2022 18:32:21 +0200 Subject: [PATCH 2/2] Add option to filter exact matches only --- converter/converter.go | 10 +++++++--- converter/json.go | 13 ++++++------- main.go | 1 + 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/converter/converter.go b/converter/converter.go index a7cc94d..b818325 100755 --- a/converter/converter.go +++ b/converter/converter.go @@ -20,6 +20,7 @@ type Config struct { StoreToDir string SkipHTMLDecoding bool FilterByTagId string + FilterExactMatch bool JsonOneLine bool } @@ -108,6 +109,9 @@ func Convert(cfg Config) (err error) { tags = strings.Fields(cfg.FilterByTagId) log.Printf("Filter tags containing: %s", tags) } + if !(cfg.FilterExactMatch) { + log.Printf("Filter tags have to match exactly") + } if !(cfg.JsonOneLine) { log.Printf("Write one json obj per line instead of array") } @@ -121,7 +125,7 @@ func Convert(cfg Config) (err error) { fmt.Sprintf("%s.%s", typeName, cfg.ResultFormat)) wg.Add(1) log.Printf("[%s] Converting is started", typeName) - go convertXMLFile(&wg, typeName, sf, resultFile, tags, cfg.JsonOneLine) + go convertXMLFile(&wg, typeName, sf, resultFile, tags, cfg.JsonOneLine, !(cfg.FilterExactMatch)) } wg.Wait() @@ -129,7 +133,7 @@ func Convert(cfg Config) (err error) { return } -func convertXMLFile(wg *sync.WaitGroup, typeName string, xmlFilePath string, resultFilePath string, tags []string, jsonOneline bool) { +func convertXMLFile(wg *sync.WaitGroup, typeName string, xmlFilePath string, resultFilePath string, tags []string, jsonOneline bool, filterExactMatch bool) { xmlFile, err := os.Open(xmlFilePath) if err != nil { log.Printf("[%s] Error: %s", typeName, err) @@ -153,7 +157,7 @@ func convertXMLFile(wg *sync.WaitGroup, typeName string, xmlFilePath string, res } total, converted, err = convertToCSV(typeName, xmlFile, resultFile, converterConfig) case "json": - total, converted, err = convertToJSON(typeName, xmlFile, resultFile, converterConfig, tags, jsonOneline) + total, converted, err = convertToJSON(typeName, xmlFile, resultFile, converterConfig, tags, jsonOneline, filterExactMatch) } if err != nil { diff --git a/converter/json.go b/converter/json.go index a900287..0d7772c 100644 --- a/converter/json.go +++ b/converter/json.go @@ -14,15 +14,10 @@ import ( // WriteBufferSize bytes (8MB) const WriteBufferSize = 8388608 -func convertToJSON(typeName string, xmlFile *os.File, jsonFile *os.File, cfg Config, tags []string, oneLine bool) (total int64, converted int64, err error) { +func convertToJSON(typeName string, xmlFile *os.File, jsonFile *os.File, cfg Config, tags []string, oneLine bool, filterExactMatch bool) (total int64, converted int64, err error) { iterator := NewIterator(xmlFile) - log.Printf(jsonFile.Name()) w := bufio.NewWriterSize(jsonFile, WriteBufferSize) defer w.Flush() - if oneLine == false { - log.Printf("oneline not set") - w.WriteByte('[') - } var iErr error for iterator.Next() { if total > 0 && iErr == nil { @@ -56,7 +51,11 @@ func convertToJSON(typeName string, xmlFile *os.File, jsonFile *os.File, cfg Con tagsVar := re.FindString(string(ji)) // check if we the post is tagged with a label we are intested in for i := 0; i < len(tags) && ignorePost; i++ { - ignorePost = !(strings.Contains(tagsVar, "<" + tags[i] + ">")) + if filterExactMatch { + ignorePost = !(strings.Contains(tagsVar, "<" + tags[i] + ">")) + } else { + ignorePost = !(strings.Contains(tagsVar, tags[i])) + } } } if !(ignorePost) { diff --git a/main.go b/main.go index 247fae1..89f837d 100755 --- a/main.go +++ b/main.go @@ -15,6 +15,7 @@ func main() { flag.StringVar(&cfg.StoreToDir, "store-to-dir", "", "Path where to store CSV file(s)") flag.BoolVar(&cfg.SkipHTMLDecoding, "skip-html-decoding", false, "Path where to store CSV file(s)") flag.StringVar(&cfg.FilterByTagId, "filter-by-tag-id", "", "Filter for tags, space sperated list") + flag.BoolVar(&cfg.FilterExactMatch, "filter-no-exact-match", false, "Match tags that contain the keywords specified by filter-by-tag-id instead of matching by exact matches only") flag.BoolVar(&cfg.JsonOneLine, "json-one-line", false, "Save json file as one object per line") flag.Parse()