Skip to content

Feature: Create json file line by line and filter using tags #3

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 21 additions & 4 deletions converter/converter.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ type Config struct {
SourcePath string
StoreToDir string
SkipHTMLDecoding bool
FilterByTagId string
FilterExactMatch bool
JsonOneLine bool
}

const (
Expand Down Expand Up @@ -101,7 +104,17 @@ func Convert(cfg Config) (err error) {
} else {
cfg.StoreToDir = sourcePathResolved
}

var tags []string
if cfg.FilterByTagId != "" {
tags = strings.Fields(cfg.FilterByTagId)
log.Printf("Filter tags containing: %s", tags)
}
if !(cfg.FilterExactMatch) {
log.Printf("Filter tags have to match exactly")
}
if !(cfg.JsonOneLine) {
log.Printf("Write one json obj per line instead of array")
}
log.Printf("Total %d file(s) to convert", len(sourceFiles))

var wg sync.WaitGroup
Expand All @@ -112,15 +125,15 @@ func Convert(cfg Config) (err error) {
fmt.Sprintf("%s.%s", typeName, cfg.ResultFormat))
wg.Add(1)
log.Printf("[%s] Converting is started", typeName)
go convertXMLFile(&wg, typeName, sf, resultFile)
go convertXMLFile(&wg, typeName, sf, resultFile, tags, cfg.JsonOneLine, !(cfg.FilterExactMatch))
}

wg.Wait()

return
}

func convertXMLFile(wg *sync.WaitGroup, typeName string, xmlFilePath string, resultFilePath string) {
func convertXMLFile(wg *sync.WaitGroup, typeName string, xmlFilePath string, resultFilePath string, tags []string, jsonOneline bool, filterExactMatch bool) {
xmlFile, err := os.Open(xmlFilePath)
if err != nil {
log.Printf("[%s] Error: %s", typeName, err)
Expand All @@ -138,9 +151,13 @@ func convertXMLFile(wg *sync.WaitGroup, typeName string, xmlFilePath string, res
var total, converted int64
switch converterConfig.ResultFormat {
case "csv":
if len(tags) != 0 {
log.Printf("Tag filter for csv not supported")
return
}
total, converted, err = convertToCSV(typeName, xmlFile, resultFile, converterConfig)
case "json":
total, converted, err = convertToJSON(typeName, xmlFile, resultFile, converterConfig)
total, converted, err = convertToJSON(typeName, xmlFile, resultFile, converterConfig, tags, jsonOneline, filterExactMatch)
}

if err != nil {
Expand Down
52 changes: 34 additions & 18 deletions converter/json.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,26 @@ import (
"bufio"
"bytes"
"encoding/json"
"github.com/SkobelevIgor/stackexchange-xml-converter/encoders"
"log"
"os"

"github.com/SkobelevIgor/stackexchange-xml-converter/encoders"
"regexp"
"strings"
)

// WriteBufferSize is the size in bytes (8 MiB) of the buffered output writer.
const WriteBufferSize = 8388608

func convertToJSON(typeName string, xmlFile *os.File, jsonFile *os.File, cfg Config) (total int64, converted int64, err error) {

func convertToJSON(typeName string, xmlFile *os.File, jsonFile *os.File, cfg Config, tags []string, oneLine bool, filterExactMatch bool) (total int64, converted int64, err error) {
iterator := NewIterator(xmlFile)
w := bufio.NewWriterSize(jsonFile, WriteBufferSize)
defer w.Flush()

w.WriteByte('[')

var iErr error
for iterator.Next() {
if total > 0 && iErr == nil {
w.WriteByte(',')
if oneLine == false {
w.WriteByte(',')
}
}
total++
encoder, _ := encoders.NewEncoder(typeName)
Expand All @@ -33,28 +32,45 @@ func convertToJSON(typeName string, xmlFile *os.File, jsonFile *os.File, cfg Con
log.Printf("[%s] Error: %s", typeName, iErr)
continue
}

if cfg.SkipHTMLDecoding {
encoder.EscapeFields()
}

ji, iErr := marshal(&encoder)
if iErr != nil {
log.Printf("[%s] Error: %s", typeName, iErr)
continue
}

_, iErr = w.Write(ji)
if iErr != nil {
log.Printf("[%s] Error: %s", typeName, iErr)
continue
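// We might want to exclude this post.
// When filtering by tags, start with true (excluded) and flip to false as soon as
// one of the requested tags matches; without a tag filter the post is never excluded.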
var ignorePost bool
ignorePost = len(tags) > 0
if ignorePost {
// Look for tags in post
re := regexp.MustCompile(`"Tags":"(<.+>)+"`)
tagsVar := re.FindString(string(ji))
// check whether the post is tagged with a label we are interested in
for i := 0; i < len(tags) && ignorePost; i++ {
if filterExactMatch {
ignorePost = !(strings.Contains(tagsVar, "<" + tags[i] + ">"))
} else {
ignorePost = !(strings.Contains(tagsVar, tags[i]))
}
}
}
if !(ignorePost) {
// log.Printf("test: %s", string(ji))
_, iErr = w.Write(ji)
if iErr != nil {
log.Printf("[%s] Error: %s", typeName, iErr)
continue
}
}

converted++
}

w.WriteByte(']')

if oneLine == false {
w.WriteByte(']')
}
return
}

Expand Down
3 changes: 3 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ func main() {
flag.StringVar(&cfg.SourcePath, "source-path", "", "Path to XML file(s)")
flag.StringVar(&cfg.StoreToDir, "store-to-dir", "", "Path where to store CSV file(s)")
flag.BoolVar(&cfg.SkipHTMLDecoding, "skip-html-decoding", false, "Path where to store CSV file(s)")
flag.StringVar(&cfg.FilterByTagId, "filter-by-tag-id", "", "Filter for tags, space sperated list")
flag.BoolVar(&cfg.FilterExactMatch, "filter-no-exact-match", false, "Match tags that contain the keywords specified by filter-by-tag-id instead of matching by exact matches only")
flag.BoolVar(&cfg.JsonOneLine, "json-one-line", false, "Save json file as one object per line")
flag.Parse()

var err error
Expand Down