diff --git a/converter/converter.go b/converter/converter.go index 07b631e..b818325 100755 --- a/converter/converter.go +++ b/converter/converter.go @@ -19,6 +19,9 @@ type Config struct { SourcePath string StoreToDir string SkipHTMLDecoding bool + FilterByTagId string + FilterExactMatch bool + JsonOneLine bool } const ( @@ -101,7 +104,17 @@ func Convert(cfg Config) (err error) { } else { cfg.StoreToDir = sourcePathResolved } - + var tags []string + if cfg.FilterByTagId != "" { + tags = strings.Fields(cfg.FilterByTagId) + log.Printf("Filter tags containing: %s", tags) + } + if !(cfg.FilterExactMatch) { + log.Printf("Filter tags have to match exactly") + } + if !(cfg.JsonOneLine) { + log.Printf("Write one json obj per line instead of array") + } log.Printf("Total %d file(s) to convert", len(sourceFiles)) var wg sync.WaitGroup @@ -112,7 +125,7 @@ func Convert(cfg Config) (err error) { fmt.Sprintf("%s.%s", typeName, cfg.ResultFormat)) wg.Add(1) log.Printf("[%s] Converting is started", typeName) - go convertXMLFile(&wg, typeName, sf, resultFile) + go convertXMLFile(&wg, typeName, sf, resultFile, tags, cfg.JsonOneLine, !(cfg.FilterExactMatch)) } wg.Wait() @@ -120,7 +133,7 @@ func Convert(cfg Config) (err error) { return } -func convertXMLFile(wg *sync.WaitGroup, typeName string, xmlFilePath string, resultFilePath string) { +func convertXMLFile(wg *sync.WaitGroup, typeName string, xmlFilePath string, resultFilePath string, tags []string, jsonOneline bool, filterExactMatch bool) { xmlFile, err := os.Open(xmlFilePath) if err != nil { log.Printf("[%s] Error: %s", typeName, err) @@ -138,9 +151,13 @@ func convertXMLFile(wg *sync.WaitGroup, typeName string, xmlFilePath string, res var total, converted int64 switch converterConfig.ResultFormat { case "csv": + if len(tags) != 0 { + log.Printf("Tag filter for csv not supported") + return + } total, converted, err = convertToCSV(typeName, xmlFile, resultFile, converterConfig) case "json": - total, converted, err = 
convertToJSON(typeName, xmlFile, resultFile, converterConfig) + total, converted, err = convertToJSON(typeName, xmlFile, resultFile, converterConfig, tags, jsonOneline, filterExactMatch) } if err != nil { diff --git a/converter/json.go b/converter/json.go index f18eda7..0d7772c 100644 --- a/converter/json.go +++ b/converter/json.go @@ -4,27 +4,26 @@ import ( "bufio" "bytes" "encoding/json" + "github.com/SkobelevIgor/stackexchange-xml-converter/encoders" "log" "os" - - "github.com/SkobelevIgor/stackexchange-xml-converter/encoders" + "regexp" + "strings" ) // WriteBufferSize bytes (8MB) const WriteBufferSize = 8388608 -func convertToJSON(typeName string, xmlFile *os.File, jsonFile *os.File, cfg Config) (total int64, converted int64, err error) { - +func convertToJSON(typeName string, xmlFile *os.File, jsonFile *os.File, cfg Config, tags []string, oneLine bool, filterExactMatch bool) (total int64, converted int64, err error) { iterator := NewIterator(xmlFile) w := bufio.NewWriterSize(jsonFile, WriteBufferSize) defer w.Flush() - - w.WriteByte('[') - var iErr error for iterator.Next() { if total > 0 && iErr == nil { - w.WriteByte(',') + if oneLine == false { + w.WriteByte(',') + } } total++ encoder, _ := encoders.NewEncoder(typeName) @@ -33,28 +32,45 @@ func convertToJSON(typeName string, xmlFile *os.File, jsonFile *os.File, cfg Con log.Printf("[%s] Error: %s", typeName, iErr) continue } - if cfg.SkipHTMLDecoding { encoder.EscapeFields() } - ji, iErr := marshal(&encoder) if iErr != nil { log.Printf("[%s] Error: %s", typeName, iErr) continue } - _, iErr = w.Write(ji) - if iErr != nil { - log.Printf("[%s] Error: %s", typeName, iErr) - continue + // We might want to exclude this post + // if we filter for tags this needs to be initialized with false + var ignorePost bool + ignorePost = len(tags) > 0 + if ignorePost { + // Look for tags in post + re := regexp.MustCompile(`"Tags":"(<.+>)+"`) + tagsVar := re.FindString(string(ji)) + // check if we the post is tagged with a 
label we are interested in + for i := 0; i < len(tags) && ignorePost; i++ { + if filterExactMatch { + ignorePost = !(strings.Contains(tagsVar, "<" + tags[i] + ">")) + } else { + ignorePost = !(strings.Contains(tagsVar, tags[i])) + } + } + } + if !(ignorePost) { + // log.Printf("test: %s", string(ji)) + _, iErr = w.Write(ji) + if iErr != nil { + log.Printf("[%s] Error: %s", typeName, iErr) + continue + } } - converted++ } - - w.WriteByte(']') - + if oneLine == false { + w.WriteByte(']') + } return } diff --git a/main.go b/main.go index 3778950..89f837d 100755 --- a/main.go +++ b/main.go @@ -14,6 +14,9 @@ func main() { flag.StringVar(&cfg.SourcePath, "source-path", "", "Path to XML file(s)") flag.StringVar(&cfg.StoreToDir, "store-to-dir", "", "Path where to store CSV file(s)") flag.BoolVar(&cfg.SkipHTMLDecoding, "skip-html-decoding", false, "Path where to store CSV file(s)") + flag.StringVar(&cfg.FilterByTagId, "filter-by-tag-id", "", "Filter for tags, space sperated list") + flag.BoolVar(&cfg.FilterExactMatch, "filter-no-exact-match", false, "Match tags that contain the keywords specified by filter-by-tag-id instead of matching by exact matches only") + flag.BoolVar(&cfg.JsonOneLine, "json-one-line", false, "Save json file as one object per line") flag.Parse() var err error