From df7b1bad646e6f5ca0ab55bb5d3d3c7a9b42393a Mon Sep 17 00:00:00 2001 From: "daniil.khasanov" Date: Wed, 23 Jul 2025 19:22:03 +0300 Subject: [PATCH 01/15] Changed string template --- CHANGELOG.md | 4 + doc/ru/usage.md | 12 +- go.mod | 3 + go.sum | 85 +++++++++ internal/generator/models/common.go | 45 +++++ internal/generator/models/common_test.go | 163 ++++++++++++++++++ .../usecase/general/generator/generator.go | 10 +- .../general/generator/value/datetime.go | 2 +- .../usecase/general/generator/value/enum.go | 2 +- .../usecase/general/generator/value/float.go | 2 +- .../general/generator/value/integer.go | 2 +- .../general/generator/value/interfaces.go | 2 +- .../usecase/general/generator/value/string.go | 33 +++- .../usecase/general/generator/value/uuid.go | 2 +- internal/generator/usecase/general/task.go | 29 +++- .../usecase/general/test/unit_test.go | 10 +- 16 files changed, 375 insertions(+), 31 deletions(-) create mode 100644 internal/generator/models/common_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 450f3eb..0d93338 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,3 +38,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Data partitioning - Ability to continue generation - Availability to ignore some models for generation + +### Changed + +- String templates replaced with jinja like diff --git a/doc/ru/usage.md b/doc/ru/usage.md index b88f2ec..499af96 100644 --- a/doc/ru/usage.md +++ b/doc/ru/usage.md @@ -164,8 +164,10 @@ open_ai: - `min_length`: Минимальная длина строки. По умолчанию `1`. - `max_length`: Максимальная длина строки. По умолчанию `32`. - `logical_type`: Логический тип строки. Поддерживаемые значения: `first_name`, `last_name`, `phone`, `text`. -- `template`: Шаблон для генерации строки. Символ `A` - любая большая буква, символ `a` - любая маленькая буква, - символ `0` - любая цифра, символ `#` - любой символ. Остальные символы остаются как есть. 
+- `template`: Jinja-подобный шаблон для генерации строки. Позволяет использовать любые поля генерируемой модели и + задавать паттерн строки с помощью функции `pattern`, где символ `A` - любая большая буква, символ `a` - любая маленькая буква, + символ `0` - любая цифра, символ `#` - любой символ, а остальные символы остаются как есть. + Также поддерживается использование фильтров, таких как `upper` и `lower`. - `locale`: Локаль для генерации строк. Поддерживаемые значения: `ru`, `en`. По умолчанию `en`. - `without_large_letters`: Флаг, указывающий, исключать ли большие буквы из строки. - `without_small_letters`: Флаг, указывающий, исключать ли маленькие буквы из строки. @@ -311,9 +313,13 @@ models: - name: passport type: string type_params: - template: AA 00 000 000 + template: "{{ pattern('AA 00 000 000') }}" distinct_percentage: 1 ordered: true + - name: email + type: string + type_params: + template: "{{ first_name_en | lower }}.{{ id }}@example.com" - name: rating type: float type_params: diff --git a/go.mod b/go.mod index ae4ded1..df8996a 100644 --- a/go.mod +++ b/go.mod @@ -36,7 +36,9 @@ require ( github.com/charmbracelet/x/term v0.2.1 // indirect github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/emirpasic/gods v1.18.1 // indirect github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect + github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3 // indirect github.com/goccy/go-json v0.10.5 // indirect github.com/golang/snappy v0.0.4 // indirect github.com/google/flatbuffers v25.2.10+incompatible // indirect @@ -58,6 +60,7 @@ require ( github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect github.com/muesli/cancelreader v0.2.2 // indirect github.com/muesli/termenv v0.15.3-0.20240618155329-98d742f6907a // indirect + github.com/otaviokr/topological-sort v1.1.0 // indirect github.com/pierrec/lz4/v4 v4.1.22 // 
indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/rivo/uniseg v0.4.7 // indirect diff --git a/go.sum b/go.sum index a590230..70d4c68 100644 --- a/go.sum +++ b/go.sum @@ -37,26 +37,49 @@ github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46t github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc= +github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ= github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4= github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM= github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE= +github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3 h1:fmFk0Wt3bBxxwZnu48jqMdaOR/IZ4vdtJFuaFV8MpIE= +github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3/go.mod h1:bJWSKrZyQvfTnb2OudyUjurSG4/edverV7n82+K3JiM= +github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= +github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= github.com/go-logr/logr v1.4.2 
h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE= +github.com/go-test/deep v1.1.0 h1:WOcxcdHcvdgThNXjw0t76K42FXTU7HpNQWHpA2HHNlg= +github.com/go-test/deep v1.1.0/go.mod h1:5C2ZWiW0ErCdrYzpqxLbTX7MG14M9iiw8DgHncVwcsE= github.com/goccy/go-json v0.10.5 h1:Fq85nIqj+gXn/S5ahsiTlK3TmC85qgirsdTP/+DeaC4= github.com/goccy/go-json v0.10.5/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= +github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= +github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= +github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= +github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= +github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/flatbuffers v25.2.10+incompatible h1:F3vclr7C3HpB1k9mxCGRMXq6FdUalZ6H/pNX4FP1v0Q= 
github.com/google/flatbuffers v25.2.10+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= +github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= @@ -65,6 +88,9 @@ github.com/hashicorp/go-hclog v1.6.3 h1:Qr2kF+eVWjTiYmU7Y31tYlP1h0q/X3Nl3tPGdaB1 github.com/hashicorp/go-hclog v1.6.3/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVHBcfoyhpF5M= github.com/hashicorp/go-retryablehttp v0.7.7 h1:C8hUCYzor8PIfXHa4UrZkU4VvK8o9ISHxT2Q8+VepXU= github.com/hashicorp/go-retryablehttp v0.7.7/go.mod h1:pkQpWZeYWskR+D1tR2O5OcBFOxfA7DoAO6xtkuQnHTk= +github.com/heimdalr/dag v1.5.0 h1:hqVtijvY776P5OKP3QbdVBRt3Xxq6BYopz3XgklsGvo= +github.com/heimdalr/dag v1.5.0/go.mod h1:lthekrHl01dddmzqyBQ1YZbi7XcVGGzjFo0jIky5knc= +github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/ilyakaznacheev/cleanenv v1.5.0 h1:0VNZXggJE2OYdXE87bfSSwGxeiGt9moSR2lOrsHHvr4= github.com/ilyakaznacheev/cleanenv v1.5.0/go.mod h1:a5aDzaJrLCQZsazHol1w8InnDcOX0OColm64SlIi6gk= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= @@ -113,11 +139,23 @@ github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELU github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo= 
github.com/muesli/termenv v0.15.3-0.20240618155329-98d742f6907a h1:2MaM6YC3mGu54x+RKAA6JiFFHlHDY1UbkxqppT7wYOg= github.com/muesli/termenv v0.15.3-0.20240618155329-98d742f6907a/go.mod h1:hxSnBBYLK21Vtq/PHd0S2FYCxBXzBua8ov5s1RobyRQ= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= +github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= +github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= +github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/ginkgo v1.12.1/go.mod h1:zj2OWP4+oCPe1qIXoGWkgMRwljMUYCdkwsT2108oapk= +github.com/onsi/ginkgo v1.16.1/go.mod h1:CObGmKUOKaSC0RjmoAK7tKyn4Azo5P2IWuoMnvwxz1E= +github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= +github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= +github.com/onsi/gomega v1.11.0/go.mod h1:azGKhqFUon9Vuj0YmTfLSmx0FUwqXYSTl5re8lQLTUg= +github.com/otaviokr/topological-sort v1.1.0 h1:BrWj/bLOo9aZFUi0YN2/s4P/GRe2PSmb8cyX4w1ysNg= +github.com/otaviokr/topological-sort v1.1.0/go.mod h1:77ZaKUg7Ir1nL6DPwEIQFm9iH2OS5xxVWvzZ8xPTCFg= github.com/pierrec/lz4/v4 v4.1.22 h1:cKFw6uJDK+/gfw5BcDL0JL5aBsAFdsIT18eRtLj7VIU= github.com/pierrec/lz4/v4 v4.1.22/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod 
h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= @@ -134,8 +172,10 @@ github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= +github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw= @@ -146,6 +186,7 @@ github.com/vbauerster/mpb/v8 v8.8.3 h1:dTOByGoqwaTJYPubhVz3lO5O6MK553XVgUo33LdnN github.com/vbauerster/mpb/v8 v8.8.3/go.mod h1:JfCCrtcMsJwP6ZwMn9e5LMnNyp3TVNpUWWkN+nd4EWk= github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU= github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= @@ -162,29 +203,60 @@ go.opentelemetry.io/otel/sdk/metric v1.34.0 h1:5CeK9ujjbFVL5c1PhLuStg1wxA7vQv7ce go.opentelemetry.io/otel/sdk/metric v1.34.0/go.mod h1:jQ/r8Ze28zRKoNRdkjCZxfs6YvBTG1+YIqyFVFYec5w= 
go.opentelemetry.io/otel/trace v1.34.0 h1:+ouXS2V8Rd4hp4580a8q23bg0azF2nI8cqLYnC8mh/k= go.opentelemetry.io/otel/trace v1.34.0/go.mod h1:Svm7lSjQD7kG7KJ/MUHPVXSDGz2OX4h0M2jHBhmSfRE= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM= golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U= golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 h1:e66Fs6Z+fZTbFBAxKfP3PALWBtpfqks2bwGcexMxgtk= golang.org/x/exp v0.0.0-20240909161429-701f63a606c0/go.mod h1:2TbTHSBQa924w8M6Xs1QcRcFwyucIwBGpK1p2f1YFFY= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.25.0 h1:n7a+ZbQKQA/Ysbyb0/6IbB1H/X41mKgbhfv7AfG/44w= golang.org/x/mod v0.25.0/go.mod h1:IXM97Txy2VM4PJ3gI61r1YEk/gAj6zAHN3AdZt6S9Ww= +golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200520004742-59133d7f0dd7/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod 
h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181122145206-62eef0e2fa9b/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190904154756-749cb33beabd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191005200804-aed5e4c7ecf9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191120155948-bd437916bb0e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210112080510-489259a85091/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= golang.org/x/sys 
v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M= golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA= golang.org/x/time v0.8.0 h1:9i3RxcPv3PZnitoVGMPDKZSq1xW1gK1Xy3ArNOGZfEg= golang.org/x/time v0.8.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20201224043029-2b0845dc783e/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.33.0 h1:4qz2S3zmRxbGIhDIAgjxvFutSvH5EfnsYrRBj0UI0bc= golang.org/x/tools v0.33.0/go.mod h1:CIJMaWEY88juyUfo7UbgPqbC8rU2OqfAV1h2Qp0oMYI= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da h1:noIWHXmPHxILtqtCOPIhSt0ABwskkZKjD3bXGnZGpNY= golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90= gonum.org/v1/gonum v0.15.1 h1:FNy7N6OUZVUaWG9pTiD+jlhdQ3lMP+/LcTpJ6+a8sQ0= @@ -193,11 +265,24 @@ google.golang.org/genproto/googleapis/rpc v0.0.0-20250115164207-1a7da9e5054f h1: google.golang.org/genproto/googleapis/rpc v0.0.0-20250115164207-1a7da9e5054f/go.mod h1:+2Yz8+CLJbIfL9z73EW45avw8Lmge3xVElCP9zEKi50= 
google.golang.org/grpc v1.71.0 h1:kF77BGdPTQ4/JZWMlb9VpJ5pa25aqvVqogsxNHHdeBg= google.golang.org/grpc v1.71.0/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= +google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= +google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= +google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= +google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= +google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= +google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.36.5 h1:tPhr+woSbjfYvY6/GPufUoYizxw1cF/yFoxJ2fmpwlM= google.golang.org/protobuf v1.36.5/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20200902074654-038fdea0a05b/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.1 
h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= olympos.io/encoding/edn v0.0.0-20201019073823-d3554ca0b0a3 h1:slmdOY3vp8a7KQbHkL+FLbvbkgMqmXojpFUO/jENuqQ= diff --git a/internal/generator/models/common.go b/internal/generator/models/common.go index 635e84b..5814a2d 100644 --- a/internal/generator/models/common.go +++ b/internal/generator/models/common.go @@ -2,10 +2,12 @@ package models import ( "encoding/json" + "github.com/otaviokr/topological-sort/toposort" "io" "os" "path/filepath" "reflect" + "regexp" "strings" "github.com/ilyakaznacheev/cleanenv" @@ -119,3 +121,46 @@ func parseErrsToString(errs []error) string { return sb.String() } + +func TopologicalSort(columns []*Column) ([]string, error) { + graph := make(map[string][]string) + for _, c := range columns { + graph[c.Name] = make([]string, 0) + + if c.Type != "string" { + continue + } + + for _, r := range c.Ranges { + if r.StringParams.Template == "" { + continue + } + + graph[c.Name] = extractValuesFromTemplate(r.StringParams.Template) + } + } + + sortedVertexes, err := toposort.ReverseTarjan(graph) + if err != nil { + return nil, err + } + + return sortedVertexes, nil +} + +func extractValuesFromTemplate(template string) []string { + re := regexp.MustCompile(`{{\s*([^}]+)\s*}}`) + matches := re.FindAllStringSubmatch(template, -1) + + var values []string + for _, match := range matches { + expr := match[1] + + parts := regexp.MustCompile(`\s*\|\s*|\s+`).Split(expr, -1) + if len(parts) > 0 && parts[0] != "" && !strings.Contains(parts[0], "(") { + values = append(values, parts[0]) + } + } + + return values +} diff --git a/internal/generator/models/common_test.go b/internal/generator/models/common_test.go new file mode 100644 index 0000000..eb12dea --- /dev/null +++ b/internal/generator/models/common_test.go @@ -0,0 +1,163 @@ +package models + +import ( + "github.com/stretchr/testify/require" + "testing" +) + +func 
TestExtractValuesFromTemplate(t *testing.T) { + type testCase struct { + name string + template string + expected []string + } + + testCases := []testCase{ + { + name: "Empty template", + template: "", + expected: nil, + }, + { + name: "Valid template", + template: "{{ foo }}.{{boo}}", + expected: []string{"foo", "boo"}, + }, + { + name: "Template with filters", + template: "{{ foo | upper | lower }}", + expected: []string{"foo"}, + }, + { + name: "Template with functions", + template: "{{ upper('foo') | lower }}@{{ boo }}", + expected: []string{"boo"}, + }, + { + name: "Invalid template", + template: "{_{ foo }}", + expected: nil, + }, + } + + testFunc := func(t *testing.T, tc testCase) { + t.Helper() + + actual := extractValuesFromTemplate(tc.template) + require.Equal(t, tc.expected, actual) + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { testFunc(t, tc) }) + } +} + +func TestTopologicalSort(t *testing.T) { + type testCase struct { + name string + columns []*Column + wantErr bool + expected []string + } + + testCases := []testCase{ + { + name: "Empty columns", + columns: []*Column{}, + wantErr: false, + expected: []string{}, + }, + { + name: "Columns with dependencies", + columns: []*Column{ + { + Name: "1", + Type: "string", + Ranges: []*Params{ + { + StringParams: &ColumnStringParams{ + Template: "{{ 3 }}", + }, + }, + }, + }, + { + Name: "2", + Type: "string", + Ranges: []*Params{ + { + StringParams: &ColumnStringParams{ + Template: "{{ 4 }}", + }, + }, + }, + }, + { + Name: "3", + Type: "string", + Ranges: []*Params{ + { + StringParams: &ColumnStringParams{ + Template: "{{ 2 }}", + }, + }, + }, + }, + { + Name: "4", + Type: "string", + Ranges: []*Params{ + { + StringParams: &ColumnStringParams{ + Template: "", + }, + }, + }, + }, + }, + wantErr: false, + expected: []string{"4", "2", "3", "1"}, + }, + { + name: "Columns with cycle dependencies", + columns: []*Column{ + { + Name: "1", + Type: "string", + Ranges: []*Params{ + { + 
StringParams: &ColumnStringParams{ + Template: "{{ 2 }}", + }, + }, + }, + }, + { + Name: "2", + Type: "string", + Ranges: []*Params{ + { + StringParams: &ColumnStringParams{ + Template: "{{ 1 }}", + }, + }, + }, + }, + }, + wantErr: true, + expected: nil, + }, + } + + testFunc := func(t *testing.T, tc testCase) { + t.Helper() + + actual, err := TopologicalSort(tc.columns) + require.Equal(t, tc.wantErr, err != nil) + require.Equal(t, tc.expected, actual) + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { testFunc(t, tc) }) + } +} diff --git a/internal/generator/usecase/general/generator/generator.go b/internal/generator/usecase/general/generator/generator.go index 7e548fc..254db2e 100644 --- a/internal/generator/usecase/general/generator/generator.go +++ b/internal/generator/usecase/general/generator/generator.go @@ -200,7 +200,7 @@ type valueID struct { type BatchGenerator struct { numbers []valueID nextNumber int - valuer func(number valueID) (any, error) + valuer func(number valueID, generatedValues map[string]any) (any, error) } func (cg *ColumnGenerator) NewBatchGenerator(batchSize uint64) *BatchGenerator { @@ -226,14 +226,14 @@ func (cg *ColumnGenerator) NewBatchGenerator(batchSize uint64) *BatchGenerator { } } - valuer := func(id valueID) (any, error) { + valuer := func(id valueID, generatedValues map[string]any) (any, error) { vg := cg.rangeGenerators[id.generatorIndex] if vg.nullPercentage > 0 && fastRandomFloat(cg.dataColumnSeed+uint64(id.number)) < vg.nullPercentage { return nil, nil //nolint:nilnil } - return vg.generator.Value(id.number) + return vg.generator.Value(id.number, generatedValues) } return &BatchGenerator{ @@ -243,8 +243,8 @@ func (cg *ColumnGenerator) NewBatchGenerator(batchSize uint64) *BatchGenerator { } // Value returns random value for described column. 
-func (g *BatchGenerator) Value() (any, error) { - res, err := g.valuer(g.numbers[g.nextNumber]) +func (g *BatchGenerator) Value(generatedValues map[string]any) (any, error) { + res, err := g.valuer(g.numbers[g.nextNumber], generatedValues) g.nextNumber++ g.nextNumber %= len(g.numbers) diff --git a/internal/generator/usecase/general/generator/value/datetime.go b/internal/generator/usecase/general/generator/value/datetime.go index 84b1cab..600e89b 100644 --- a/internal/generator/usecase/general/generator/value/datetime.go +++ b/internal/generator/usecase/general/generator/value/datetime.go @@ -26,7 +26,7 @@ func (g *DateTimeGenerator) SetTotalCount(totalValuesCount uint64) error { } // Value returns n-th date from range. -func (g *DateTimeGenerator) Value(number float64) (any, error) { +func (g *DateTimeGenerator) Value(number float64, _ map[string]any) (any, error) { fromSec := g.From.Unix() toSec := g.To.Unix() diff --git a/internal/generator/usecase/general/generator/value/enum.go b/internal/generator/usecase/general/generator/value/enum.go index 8c8e4f5..18d4413 100644 --- a/internal/generator/usecase/general/generator/value/enum.go +++ b/internal/generator/usecase/general/generator/value/enum.go @@ -31,7 +31,7 @@ func (g *EnumGenerator) SetTotalCount(totalValuesCount uint64) error { return nil } -func (g *EnumGenerator) Value(number float64) (any, error) { +func (g *EnumGenerator) Value(number float64, _ map[string]any) (any, error) { idx := int(math.Floor(number)) / g.rowsPerValue return g.Values[idx], nil diff --git a/internal/generator/usecase/general/generator/value/float.go b/internal/generator/usecase/general/generator/value/float.go index 0b68655..c1903a4 100644 --- a/internal/generator/usecase/general/generator/value/float.go +++ b/internal/generator/usecase/general/generator/value/float.go @@ -26,7 +26,7 @@ func (g *FloatGenerator) SetTotalCount(totalValuesCount uint64) error { } // Value returns n-th float number from range. 
-func (g *FloatGenerator) Value(number float64) (any, error) { +func (g *FloatGenerator) Value(number float64, _ map[string]any) (any, error) { value := orderedFloat64(g.From, g.To, number, g.totalValuesCount) if g.BitWidth == 32 { //nolint:mnd diff --git a/internal/generator/usecase/general/generator/value/integer.go b/internal/generator/usecase/general/generator/value/integer.go index 6956e3d..c83b1ba 100644 --- a/internal/generator/usecase/general/generator/value/integer.go +++ b/internal/generator/usecase/general/generator/value/integer.go @@ -22,7 +22,7 @@ func (g *IntegerGenerator) SetTotalCount(totalValuesCount uint64) error { } // Value returns n-th integer number from range. -func (g *IntegerGenerator) Value(number float64) (any, error) { +func (g *IntegerGenerator) Value(number float64, _ map[string]any) (any, error) { value := orderedInt64(g.From, g.To, number, g.totalValuesCount) switch g.BitWidth { diff --git a/internal/generator/usecase/general/generator/value/interfaces.go b/internal/generator/usecase/general/generator/value/interfaces.go index c67e5da..0c4c196 100644 --- a/internal/generator/usecase/general/generator/value/interfaces.go +++ b/internal/generator/usecase/general/generator/value/interfaces.go @@ -7,7 +7,7 @@ type Generator interface { // SetTotalCount method should remember count of rows to generate SetTotalCount(totalValuesCount uint64) error // Value method should return ordered unique value by number - Value(number float64) (any, error) + Value(number float64, generatedValues map[string]any) (any, error) // ValuesCount method should return the number of possible values to generate ValuesCount() float64 } diff --git a/internal/generator/usecase/general/generator/value/string.go b/internal/generator/usecase/general/generator/value/string.go index fc1d7c5..a48a848 100644 --- a/internal/generator/usecase/general/generator/value/string.go +++ b/internal/generator/usecase/general/generator/value/string.go @@ -1,6 +1,7 @@ package value 
import ( + "github.com/flosch/pongo2" "math" "math/big" "slices" @@ -21,6 +22,7 @@ type StringGenerator struct { *models.ColumnStringParams totalValuesCount uint64 localeModule locale.LocalModule + template *pongo2.Template charset []rune countByPrefix []float64 sumByPrefix []float64 @@ -29,6 +31,15 @@ type StringGenerator struct { //nolint:cyclop func (g *StringGenerator) Prepare() error { + if g.Template != "" { + template, err := pongo2.FromString(g.Template) + if err != nil { + return err + } + + g.template = template + } + switch g.Locale { case "ru": g.localeModule = ru.NewLocaleModule(g.LogicalType, g.MinLength, g.MaxLength) @@ -171,8 +182,22 @@ func (g *StringGenerator) calculateCompletions(length int) []int64 { } // templateString returns n-th string by template. -func (g *StringGenerator) templateString(number float64) string { - val := []rune(g.Template) +func (g *StringGenerator) templateString(number float64, generatedValues map[string]any) (string, error) { + generatedValues["pattern"] = func(pattern string) string { + return g.patternString(number, pattern) + } + + val, err := g.template.Execute(generatedValues) + if err != nil { + return "", err + } + + return val, nil +} + +// patternString returns n-th string by pattern. +func (g *StringGenerator) patternString(number float64, pattern string) string { + val := []rune(pattern) index := number / float64(g.totalValuesCount) for i := range val { @@ -410,9 +435,9 @@ func (g *StringGenerator) simpleString(number float64) string { } // Value returns n-th string from range. 
-func (g *StringGenerator) Value(number float64) (any, error) { +func (g *StringGenerator) Value(number float64, row map[string]any) (any, error) { if g.Template != "" { - return g.templateString(number), nil + return g.templateString(number, row) } switch g.LogicalType { diff --git a/internal/generator/usecase/general/generator/value/uuid.go b/internal/generator/usecase/general/generator/value/uuid.go index 7bf922b..ca32580 100644 --- a/internal/generator/usecase/general/generator/value/uuid.go +++ b/internal/generator/usecase/general/generator/value/uuid.go @@ -25,7 +25,7 @@ func (g *UUIDGenerator) SetTotalCount(totalValuesCount uint64) error { } // Value returns n-th UUID from range. -func (g *UUIDGenerator) Value(number float64) (any, error) { +func (g *UUIDGenerator) Value(number float64, _ map[string]any) (any, error) { res := uuid.UUID{} index := number / float64(g.totalValuesCount) diff --git a/internal/generator/usecase/general/task.go b/internal/generator/usecase/general/task.go index 2bf435e..6067e08 100644 --- a/internal/generator/usecase/general/task.go +++ b/internal/generator/usecase/general/task.go @@ -221,7 +221,7 @@ func (t *Task) generateAndSaveValues(ctx context.Context) error { generators = append(generators, t.generators[columnKey].NewBatchGenerator(rowsCount)) } - pool.Submit(ctx, outputSyncer.WorkerSyncer(), modelName, generators, rowsCount) + pool.Submit(ctx, outputSyncer.WorkerSyncer(), model, generators, rowsCount) } }() } @@ -255,7 +255,7 @@ func (t *Task) skipRows() { // generateAndSaveBatch function generate batch of values for selected column and send it to output. 
func (t *Task) generateAndSaveBatch( ctx context.Context, outputSync *common.WorkerSyncer, - modelName string, generators []*generator.BatchGenerator, count uint64, + model *models.Model, generators []*generator.BatchGenerator, count uint64, ) error { defer outputSync.Done(ctx) @@ -266,29 +266,42 @@ func (t *Task) generateAndSaveBatch( } } - for g, gen := range generators { - for i := range count { + sortedColumn, err := models.TopologicalSort(model.Columns) + if err != nil { + return err + } + + originIndexes := make(map[string]int, len(model.Columns)) + for index, column := range model.Columns { + originIndexes[column.Name] = index + } + + for i := range count { + generatedValues := make(map[string]any) + + for _, columnName := range sortedColumn { if common.CtxClosed(ctx) { return &common.ContextCancelError{} } - value, err := gen.Value() + value, err := generators[originIndexes[columnName]].Value(generatedValues) if err != nil { return errors.WithMessage(err, "failed to get or generate value") } - batch[i].Values[g] = value + generatedValues[columnName] = value + batch[i].Values[originIndexes[columnName]] = value } } outputSync.WaitPrevious(ctx) - err := t.output.HandleRowsBatch(ctx, modelName, batch) + err = t.output.HandleRowsBatch(ctx, model.Name, batch) if err != nil { return errors.WithMessage(err, "failed to save batch to output") } - t.progress.Add(modelName, count) + t.progress.Add(model.Name, count) return nil } diff --git a/internal/generator/usecase/general/test/unit_test.go b/internal/generator/usecase/general/test/unit_test.go index 6efad38..0a2a7ad 100644 --- a/internal/generator/usecase/general/test/unit_test.go +++ b/internal/generator/usecase/general/test/unit_test.go @@ -437,9 +437,9 @@ func TestString(t *testing.T) { {&models.ColumnStringParams{LogicalType: models.LastNameType, MinLength: 4, MaxLength: 7}, 4, 7}, {&models.ColumnStringParams{LogicalType: models.PhoneType, MinLength: 10, MaxLength: 10}, 10, 10}, 
{&models.ColumnStringParams{MinLength: 100, MaxLength: 100}, 100, 100}, - {&models.ColumnStringParams{Template: "AAaa00##", Locale: "en"}, 8, 8}, - {&models.ColumnStringParams{Template: "AAaa00##", Locale: "ru"}, 8, 8}, - {&models.ColumnStringParams{Template: "0123456789012345678901234567890123456789"}, 40, 40}, + {&models.ColumnStringParams{Template: "{{ pattern('AAaa00##') }}", Locale: "en"}, 8, 8}, + {&models.ColumnStringParams{Template: "{{ pattern('AAaa00##') }}", Locale: "ru"}, 8, 8}, + {&models.ColumnStringParams{Template: "{{ pattern('0123456789012345678901234567890123456789') }}"}, 40, 40}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 3, MaxLength: 5}, 3, 5}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 254, MaxLength: 256}, 254, 256}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 510, MaxLength: 512}, 510, 512}, @@ -449,7 +449,7 @@ func TestString(t *testing.T) { } for _, testCase := range testCases { - column := &models.Column{Type: "string", Ranges: []*models.Params{{TypeParams: testCase.typeParams}}} + column := &models.Column{Name: "test", Type: "string", Ranges: []*models.Params{{TypeParams: testCase.typeParams}}} handled := checkType(t, column, "") strValue, ok := handled[0].Values[0].(string) @@ -599,7 +599,7 @@ func TestIdempotence(t *testing.T) { Name: "passport", Type: "string", Ranges: []*models.Params{{TypeParams: &models.ColumnStringParams{ - Template: "AA 00 000 000", + Template: "{{ pattern('AA 00 000 000') }}", }, NullPercentage: 0.5}}, }, From 771a45f68142e6ee7c32f2d11e067a03d27cbe8e Mon Sep 17 00:00:00 2001 From: "daniil.khasanov" Date: Thu, 24 Jul 2025 18:48:01 +0300 Subject: [PATCH 02/15] Updated the calculation of the number of possible values to generation --- go.mod | 5 +- go.sum | 13 ++-- internal/generator/common/utils.go | 21 +++++ internal/generator/common/utils_test.go | 47 +++++++++++ internal/generator/models/common.go | 33 ++------ 
internal/generator/models/common_test.go | 52 +------------ internal/generator/models/generator_model.go | 12 ++- internal/generator/models/models_test.go | 3 + .../usecase/general/generator/generator.go | 37 +++++---- .../general/generator/value/datetime.go | 2 +- .../usecase/general/generator/value/enum.go | 2 +- .../usecase/general/generator/value/float.go | 2 +- .../general/generator/value/integer.go | 2 +- .../general/generator/value/interfaces.go | 2 +- .../usecase/general/generator/value/string.go | 78 +++++++++++++++---- .../usecase/general/generator/value/uuid.go | 2 +- internal/generator/usecase/general/task.go | 19 +++-- .../usecase/general/test/unit_test.go | 58 +++++++++++--- 18 files changed, 244 insertions(+), 146 deletions(-) diff --git a/go.mod b/go.mod index df8996a..ad9d088 100644 --- a/go.mod +++ b/go.mod @@ -5,12 +5,14 @@ go 1.23.8 require ( github.com/apache/arrow-go/v18 v18.2.0 github.com/charmbracelet/huh/spinner v0.0.0-20250203114958-f07ae1af69ae + github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3 github.com/google/uuid v1.6.0 github.com/hashicorp/go-retryablehttp v0.7.7 github.com/ilyakaznacheev/cleanenv v1.5.0 github.com/labstack/echo/v4 v4.13.3 github.com/manifoldco/promptui v0.9.0 github.com/moby/term v0.5.2 + github.com/otaviokr/topological-sort v1.1.0 github.com/pkg/errors v0.9.1 github.com/sashabaranov/go-openai v1.36.1 github.com/spf13/afero v1.12.0 @@ -36,9 +38,7 @@ require ( github.com/charmbracelet/x/term v0.2.1 // indirect github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect - github.com/emirpasic/gods v1.18.1 // indirect github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect - github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3 // indirect github.com/goccy/go-json v0.10.5 // indirect github.com/golang/snappy v0.0.4 // indirect github.com/google/flatbuffers v25.2.10+incompatible // indirect @@ 
-60,7 +60,6 @@ require ( github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect github.com/muesli/cancelreader v0.2.2 // indirect github.com/muesli/termenv v0.15.3-0.20240618155329-98d742f6907a // indirect - github.com/otaviokr/topological-sort v1.1.0 // indirect github.com/pierrec/lz4/v4 v4.1.22 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/rivo/uniseg v0.4.7 // indirect diff --git a/go.sum b/go.sum index 70d4c68..8488c13 100644 --- a/go.sum +++ b/go.sum @@ -41,8 +41,6 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc= -github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ= github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4= github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM= github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= @@ -50,14 +48,13 @@ github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4Nij github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3 h1:fmFk0Wt3bBxxwZnu48jqMdaOR/IZ4vdtJFuaFV8MpIE= github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3/go.mod h1:bJWSKrZyQvfTnb2OudyUjurSG4/edverV7n82+K3JiM= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= +github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4= github.com/fsnotify/fsnotify v1.4.9/go.mod 
h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE= -github.com/go-test/deep v1.1.0 h1:WOcxcdHcvdgThNXjw0t76K42FXTU7HpNQWHpA2HHNlg= -github.com/go-test/deep v1.1.0/go.mod h1:5C2ZWiW0ErCdrYzpqxLbTX7MG14M9iiw8DgHncVwcsE= github.com/goccy/go-json v0.10.5 h1:Fq85nIqj+gXn/S5ahsiTlK3TmC85qgirsdTP/+DeaC4= github.com/goccy/go-json v0.10.5/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= @@ -79,7 +76,6 @@ github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= @@ -88,8 +84,6 @@ github.com/hashicorp/go-hclog v1.6.3 h1:Qr2kF+eVWjTiYmU7Y31tYlP1h0q/X3Nl3tPGdaB1 github.com/hashicorp/go-hclog v1.6.3/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVHBcfoyhpF5M= github.com/hashicorp/go-retryablehttp v0.7.7 h1:C8hUCYzor8PIfXHa4UrZkU4VvK8o9ISHxT2Q8+VepXU= github.com/hashicorp/go-retryablehttp v0.7.7/go.mod h1:pkQpWZeYWskR+D1tR2O5OcBFOxfA7DoAO6xtkuQnHTk= 
-github.com/heimdalr/dag v1.5.0 h1:hqVtijvY776P5OKP3QbdVBRt3Xxq6BYopz3XgklsGvo= -github.com/heimdalr/dag v1.5.0/go.mod h1:lthekrHl01dddmzqyBQ1YZbi7XcVGGzjFo0jIky5knc= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/ilyakaznacheev/cleanenv v1.5.0 h1:0VNZXggJE2OYdXE87bfSSwGxeiGt9moSR2lOrsHHvr4= github.com/ilyakaznacheev/cleanenv v1.5.0/go.mod h1:a5aDzaJrLCQZsazHol1w8InnDcOX0OColm64SlIi6gk= @@ -141,12 +135,15 @@ github.com/muesli/termenv v0.15.3-0.20240618155329-98d742f6907a h1:2MaM6YC3mGu54 github.com/muesli/termenv v0.15.3-0.20240618155329-98d742f6907a/go.mod h1:hxSnBBYLK21Vtq/PHd0S2FYCxBXzBua8ov5s1RobyRQ= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= +github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.12.1/go.mod h1:zj2OWP4+oCPe1qIXoGWkgMRwljMUYCdkwsT2108oapk= +github.com/onsi/ginkgo v1.16.1 h1:foqVmeWDD6yYpK+Yz3fHyNIxFYNxswxqNFjSKe+vI54= github.com/onsi/ginkgo v1.16.1/go.mod h1:CObGmKUOKaSC0RjmoAK7tKyn4Azo5P2IWuoMnvwxz1E= github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= +github.com/onsi/gomega v1.11.0 h1:+CqWgvj0OZycCaqclBD1pxKHAU+tOkHmQIWvDHq2aug= github.com/onsi/gomega v1.11.0/go.mod h1:azGKhqFUon9Vuj0YmTfLSmx0FUwqXYSTl5re8lQLTUg= github.com/otaviokr/topological-sort v1.1.0 h1:BrWj/bLOo9aZFUi0YN2/s4P/GRe2PSmb8cyX4w1ysNg= github.com/otaviokr/topological-sort v1.1.0/go.mod h1:77ZaKUg7Ir1nL6DPwEIQFm9iH2OS5xxVWvzZ8xPTCFg= @@ -278,10 +275,12 @@ gopkg.in/check.v1 v1.0.0-20200902074654-038fdea0a05b/go.mod h1:Co6ibVJAznAaIkqp8 
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/generator/common/utils.go b/internal/generator/common/utils.go index 38f95d0..a58f0c4 100644 --- a/internal/generator/common/utils.go +++ b/internal/generator/common/utils.go @@ -7,6 +7,7 @@ import ( "fmt" "os" "reflect" + "regexp" "slices" "strconv" "strings" @@ -357,3 +358,23 @@ func CtxClosed(ctx context.Context) bool { return false } } + +func ExtractValuesFromTemplate(template string) []string { + re := regexp.MustCompile(`{{\s*([^\s|(){}]+)[^}]*}}`) + matches := re.FindAllStringSubmatch(template, -1) + + values := make([]string, 0, len(matches)) + + for _, match := range matches { + expr := match[0] + val := match[1] + + if strings.Contains(expr, "(") && strings.Contains(expr, ")") { + continue + } + + values = append(values, val) + } + + return values +} diff --git a/internal/generator/common/utils_test.go b/internal/generator/common/utils_test.go index 96b5529..34dabf2 100644 --- 
a/internal/generator/common/utils_test.go +++ b/internal/generator/common/utils_test.go @@ -697,3 +697,50 @@ func TestWalkWithFilter(t *testing.T) { t.Run(tc.name, func(t *testing.T) { testFunc(t, tc) }) } } + +func TestExtractValuesFromTemplate(t *testing.T) { + type testCase struct { + name string + template string + expected []string + } + + testCases := []testCase{ + { + name: "Empty template", + template: "", + expected: []string{}, + }, + { + name: "Valid template", + template: "{{ foo }}.{{boo}}", + expected: []string{"foo", "boo"}, + }, + { + name: "Template with filters", + template: "{{ foo | upper | lower }}", + expected: []string{"foo"}, + }, + { + name: "Template with functions", + template: "{{ upper('foo') | lower }}@{{ boo }}", + expected: []string{"boo"}, + }, + { + name: "Invalid template", + template: "{_{ foo }}", + expected: []string{}, + }, + } + + testFunc := func(t *testing.T, tc testCase) { + t.Helper() + + actual := ExtractValuesFromTemplate(tc.template) + require.Equal(t, tc.expected, actual) + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { testFunc(t, tc) }) + } +} diff --git a/internal/generator/models/common.go b/internal/generator/models/common.go index 5814a2d..5f82f8e 100644 --- a/internal/generator/models/common.go +++ b/internal/generator/models/common.go @@ -2,15 +2,15 @@ package models import ( "encoding/json" - "github.com/otaviokr/topological-sort/toposort" + "github.com/tarantool/sdvg/internal/generator/common" "io" "os" "path/filepath" "reflect" - "regexp" "strings" "github.com/ilyakaznacheev/cleanenv" + "github.com/otaviokr/topological-sort/toposort" "github.com/pkg/errors" "gopkg.in/yaml.v3" ) @@ -122,45 +122,24 @@ func parseErrsToString(errs []error) string { return sb.String() } -func TopologicalSort(columns []*Column) ([]string, error) { +func topologicalSort(columns []*Column) ([]string, error) { graph := make(map[string][]string) for _, c := range columns { graph[c.Name] = make([]string, 0) 
- if c.Type != "string" { - continue - } - for _, r := range c.Ranges { - if r.StringParams.Template == "" { + if r.StringParams == nil || r.StringParams.Template == "" { continue } - graph[c.Name] = extractValuesFromTemplate(r.StringParams.Template) + graph[c.Name] = common.ExtractValuesFromTemplate(r.StringParams.Template) } } sortedVertexes, err := toposort.ReverseTarjan(graph) if err != nil { - return nil, err + return nil, errors.New(err.Error()) } return sortedVertexes, nil } - -func extractValuesFromTemplate(template string) []string { - re := regexp.MustCompile(`{{\s*([^}]+)\s*}}`) - matches := re.FindAllStringSubmatch(template, -1) - - var values []string - for _, match := range matches { - expr := match[1] - - parts := regexp.MustCompile(`\s*\|\s*|\s+`).Split(expr, -1) - if len(parts) > 0 && parts[0] != "" && !strings.Contains(parts[0], "(") { - values = append(values, parts[0]) - } - } - - return values -} diff --git a/internal/generator/models/common_test.go b/internal/generator/models/common_test.go index eb12dea..314bb87 100644 --- a/internal/generator/models/common_test.go +++ b/internal/generator/models/common_test.go @@ -1,56 +1,10 @@ package models import ( - "github.com/stretchr/testify/require" "testing" -) - -func TestExtractValuesFromTemplate(t *testing.T) { - type testCase struct { - name string - template string - expected []string - } - - testCases := []testCase{ - { - name: "Empty template", - template: "", - expected: nil, - }, - { - name: "Valid template", - template: "{{ foo }}.{{boo}}", - expected: []string{"foo", "boo"}, - }, - { - name: "Template with filters", - template: "{{ foo | upper | lower }}", - expected: []string{"foo"}, - }, - { - name: "Template with functions", - template: "{{ upper('foo') | lower }}@{{ boo }}", - expected: []string{"boo"}, - }, - { - name: "Invalid template", - template: "{_{ foo }}", - expected: nil, - }, - } - testFunc := func(t *testing.T, tc testCase) { - t.Helper() - - actual := 
extractValuesFromTemplate(tc.template) - require.Equal(t, tc.expected, actual) - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { testFunc(t, tc) }) - } -} + "github.com/stretchr/testify/require" +) func TestTopologicalSort(t *testing.T) { type testCase struct { @@ -152,7 +106,7 @@ func TestTopologicalSort(t *testing.T) { testFunc := func(t *testing.T, tc testCase) { t.Helper() - actual, err := TopologicalSort(tc.columns) + actual, err := topologicalSort(tc.columns) require.Equal(t, tc.wantErr, err != nil) require.Equal(t, tc.expected, actual) } diff --git a/internal/generator/models/generator_model.go b/internal/generator/models/generator_model.go index e3c451f..88aaf16 100644 --- a/internal/generator/models/generator_model.go +++ b/internal/generator/models/generator_model.go @@ -30,8 +30,9 @@ type Model struct { RowsPerFile uint64 `backup:"true" json:"rows_per_file" yaml:"rows_per_file"` ModelDir string `backup:"true" json:"model_dir" yaml:"model_dir"` // The columns from the partitioning key with PartitionColumn.WriteToOutput == false, must be at the end of slice. - Columns []*Column `backup:"true" json:"columns" yaml:"columns"` - PartitionColumns []*PartitionColumn `backup:"true" json:"partition_columns" yaml:"partition_columns"` + Columns []*Column `backup:"true" json:"columns" yaml:"columns"` + ColumnsTopologicalOrder []string + PartitionColumns []*PartitionColumn `backup:"true" json:"partition_columns" yaml:"partition_columns"` } // PartitionColumn type is used to describe partition parameters for column. 
@@ -80,6 +81,13 @@ func (m *Model) Parse() error { m.shiftColumnsToEnd(nonWriteableColumns) + sortedColumns, err := topologicalSort(m.Columns) + if err != nil { + return errors.WithMessage(err, "failed to sorting columns by dependencies") + } + + m.ColumnsTopologicalOrder = sortedColumns + return nil } diff --git a/internal/generator/models/models_test.go b/internal/generator/models/models_test.go index ffb908a..5df79da 100644 --- a/internal/generator/models/models_test.go +++ b/internal/generator/models/models_test.go @@ -1197,6 +1197,9 @@ parquet params: expectedModel := tc.expected.Models[modelName] gotModel := cfg.Models[modelName] + // skip ColumnsTopologicalOrder check + expectedModel.ColumnsTopologicalOrder = gotModel.ColumnsTopologicalOrder + for columnName := range expectedModel.Columns { expectedColumn := expectedModel.Columns[columnName] gotColumn := gotModel.Columns[columnName] diff --git a/internal/generator/usecase/general/generator/generator.go b/internal/generator/usecase/general/generator/generator.go index 254db2e..2dc8c27 100644 --- a/internal/generator/usecase/general/generator/generator.go +++ b/internal/generator/usecase/general/generator/generator.go @@ -11,12 +11,13 @@ import ( ) type rangeGenerator struct { - numFrom uint64 - numTo uint64 - sequencer sequencer - dataRandomFactor float64 - generator value.Generator - nullPercentage float64 + numFrom uint64 + numTo uint64 + distinctValuesCount uint64 + sequencer sequencer + dataRandomFactor float64 + generator value.Generator + nullPercentage float64 } type ColumnGenerator struct { @@ -28,7 +29,7 @@ type ColumnGenerator struct { } func NewColumnGenerator( - baseSeed uint64, + baseSeed uint64, distinctValuesCountByColumn map[string]uint64, modelName string, model *models.Model, column *models.Column, dataModelName string, dataModel *models.Model, dataColumn *models.Column, ) (*ColumnGenerator, error) { @@ -54,7 +55,7 @@ func NewColumnGenerator( rangeRowsCount := 
uint64(math.Ceil(float64(rowsCount) * dataRange.RangePercentage)) gen, err := newRangeGenerator( - column, columnSeed, + column, columnSeed, distinctValuesCountByColumn, dataModel, dataColumn, dataColumnSeed, dataRange, rangeRowsOffset, rangeRowsCount, ) @@ -67,7 +68,6 @@ func NewColumnGenerator( } rangeGenerators = append(rangeGenerators, gen) - rangeRowsOffset += rangeRowsCount } @@ -94,7 +94,7 @@ func (cg *ColumnGenerator) SkipRows(count uint64) { //nolint:cyclop func newRangeGenerator( - column *models.Column, columnSeed uint64, + column *models.Column, columnSeed uint64, distinctValuesCountByColumn map[string]uint64, dataModel *models.Model, dataColumn *models.Column, dataColumnSeed uint64, dataRange *models.Params, rangeRowsOffset, rangeRowsCount uint64, ) (*rangeGenerator, error) { @@ -140,7 +140,7 @@ func newRangeGenerator( distinctValuesCount = dataRange.DistinctCount } - generatorValuesCount := valueGenerator.ValuesCount() + generatorValuesCount := valueGenerator.ValuesCount(distinctValuesCountByColumn) if float64(distinctValuesCount) > generatorValuesCount { if dataRange.DistinctPercentage != 0 || dataRange.DistinctCount != 0 { @@ -150,6 +150,8 @@ func newRangeGenerator( distinctValuesCount = uint64(generatorValuesCount) } + distinctValuesCountByColumn[column.Name] += distinctValuesCount + rangeOrdered := dataRange.Ordered orderSeed := dataColumnSeed @@ -173,12 +175,13 @@ func newRangeGenerator( dataRandomFactor := 1 - float64(distinctValuesCount)/generatorValuesCount return &rangeGenerator{ - numFrom: rangeRowsOffset, - numTo: rangeRowsOffset + rangeRowsCount, - dataRandomFactor: dataRandomFactor, - generator: valueGenerator, - sequencer: rangeSequencer, - nullPercentage: dataRange.NullPercentage, + numFrom: rangeRowsOffset, + numTo: rangeRowsOffset + rangeRowsCount, + distinctValuesCount: distinctValuesCount, + dataRandomFactor: dataRandomFactor, + generator: valueGenerator, + sequencer: rangeSequencer, + nullPercentage: dataRange.NullPercentage, }, 
nil } diff --git a/internal/generator/usecase/general/generator/value/datetime.go b/internal/generator/usecase/general/generator/value/datetime.go index 600e89b..8970354 100644 --- a/internal/generator/usecase/general/generator/value/datetime.go +++ b/internal/generator/usecase/general/generator/value/datetime.go @@ -49,7 +49,7 @@ func (g *DateTimeGenerator) Value(number float64, _ map[string]any) (any, error) return value, nil } -func (g *DateTimeGenerator) ValuesCount() float64 { +func (g *DateTimeGenerator) ValuesCount(_ map[string]uint64) float64 { fromSec := g.From.Unix() toSec := g.To.Unix() diff --git a/internal/generator/usecase/general/generator/value/enum.go b/internal/generator/usecase/general/generator/value/enum.go index 18d4413..e989b68 100644 --- a/internal/generator/usecase/general/generator/value/enum.go +++ b/internal/generator/usecase/general/generator/value/enum.go @@ -37,6 +37,6 @@ func (g *EnumGenerator) Value(number float64, _ map[string]any) (any, error) { return g.Values[idx], nil } -func (g *EnumGenerator) ValuesCount() float64 { +func (g *EnumGenerator) ValuesCount(_ map[string]uint64) float64 { return float64(len(g.Values)) } diff --git a/internal/generator/usecase/general/generator/value/float.go b/internal/generator/usecase/general/generator/value/float.go index c1903a4..79ba0c5 100644 --- a/internal/generator/usecase/general/generator/value/float.go +++ b/internal/generator/usecase/general/generator/value/float.go @@ -36,6 +36,6 @@ func (g *FloatGenerator) Value(number float64, _ map[string]any) (any, error) { return value, nil } -func (g *FloatGenerator) ValuesCount() float64 { +func (g *FloatGenerator) ValuesCount(_ map[string]uint64) float64 { return math.Inf(1) } diff --git a/internal/generator/usecase/general/generator/value/integer.go b/internal/generator/usecase/general/generator/value/integer.go index c83b1ba..5aefbc1 100644 --- a/internal/generator/usecase/general/generator/value/integer.go +++ 
b/internal/generator/usecase/general/generator/value/integer.go @@ -37,6 +37,6 @@ func (g *IntegerGenerator) Value(number float64, _ map[string]any) (any, error) } } -func (g *IntegerGenerator) ValuesCount() float64 { +func (g *IntegerGenerator) ValuesCount(_ map[string]uint64) float64 { return float64(uint64(g.To-g.From)) + 1 } diff --git a/internal/generator/usecase/general/generator/value/interfaces.go b/internal/generator/usecase/general/generator/value/interfaces.go index 0c4c196..5094611 100644 --- a/internal/generator/usecase/general/generator/value/interfaces.go +++ b/internal/generator/usecase/general/generator/value/interfaces.go @@ -9,5 +9,5 @@ type Generator interface { // Value method should return ordered unique value by number Value(number float64, generatedValues map[string]any) (any, error) // ValuesCount method should return the number of possible values to generate - ValuesCount() float64 + ValuesCount(distinctValuesCountByColumn map[string]uint64) float64 } diff --git a/internal/generator/usecase/general/generator/value/string.go b/internal/generator/usecase/general/generator/value/string.go index a48a848..69eb01d 100644 --- a/internal/generator/usecase/general/generator/value/string.go +++ b/internal/generator/usecase/general/generator/value/string.go @@ -1,19 +1,25 @@ package value import ( - "github.com/flosch/pongo2" "math" "math/big" + "regexp" "slices" "strings" + "github.com/flosch/pongo2" "github.com/pkg/errors" + "github.com/tarantool/sdvg/internal/generator/common" "github.com/tarantool/sdvg/internal/generator/models" "github.com/tarantool/sdvg/internal/generator/usecase/general/locale" "github.com/tarantool/sdvg/internal/generator/usecase/general/locale/en" "github.com/tarantool/sdvg/internal/generator/usecase/general/locale/ru" ) +var ( + rePatternVal = regexp.MustCompile(`pattern\((?:'([^']*)'|"([^"]*)")\)`) +) + // Verify interface compliance in compile time. 
var _ Generator = (*StringGenerator)(nil) @@ -34,7 +40,7 @@ func (g *StringGenerator) Prepare() error { if g.Template != "" { template, err := pongo2.FromString(g.Template) if err != nil { - return err + return errors.Errorf("failed to parse template: %s", err.Error()) } g.template = template @@ -183,13 +189,13 @@ func (g *StringGenerator) calculateCompletions(length int) []int64 { // templateString returns n-th string by template. func (g *StringGenerator) templateString(number float64, generatedValues map[string]any) (string, error) { - generatedValues["pattern"] = func(pattern string) string { - return g.patternString(number, pattern) + generatedValues["pattern"] = func(pattern string) *pongo2.Value { + return pongo2.AsSafeValue(g.patternString(number, pattern)) } val, err := g.template.Execute(generatedValues) if err != nil { - return "", err + return "", errors.New(err.Error()) } return val, nil @@ -437,7 +443,12 @@ func (g *StringGenerator) simpleString(number float64) string { // Value returns n-th string from range. 
func (g *StringGenerator) Value(number float64, row map[string]any) (any, error) { if g.Template != "" { - return g.templateString(number, row) + val, err := g.templateString(number, row) + if err != nil { + return nil, errors.WithMessage(err, "failed to template string") + } + + return val, nil } switch g.LogicalType { @@ -455,15 +466,9 @@ func (g *StringGenerator) Value(number float64, row map[string]any) (any, error) } //nolint:cyclop -func (g *StringGenerator) ValuesCount() float64 { +func (g *StringGenerator) ValuesCount(distinctValuesCountByColumn map[string]uint64) float64 { if g.Template != "" { - totalCount := float64(0) - totalCount += math.Pow(float64(len(g.localeModule.LargeLetters())), float64(strings.Count(g.Template, "A"))) - totalCount += math.Pow(float64(len(g.localeModule.SmallLetters())), float64(strings.Count(g.Template, "a"))) - totalCount += math.Pow(float64(len(locale.Numbers)), float64(strings.Count(g.Template, "0"))) - totalCount += math.Pow(float64(len(locale.SpecialChars)), float64(strings.Count(g.Template, "#"))) - - return totalCount + return g.templateCardinality(distinctValuesCountByColumn) } switch g.LogicalType { @@ -501,3 +506,48 @@ func (g *StringGenerator) ValuesCount() float64 { return totalCount } + +func (g *StringGenerator) templateCardinality(distinctValuesCountByColumn map[string]uint64) float64 { + total := 1.0 + + patternValMatches := rePatternVal.FindAllStringSubmatch(g.Template, -1) + for _, match := range patternValMatches { + pattern := match[1] + if pattern == "" { + pattern = match[2] + } + + total *= g.patternCardinality(pattern) + } + + columns := common.ExtractValuesFromTemplate(g.Template) + for _, column := range columns { + if count, ok := distinctValuesCountByColumn[column]; ok && count > 0 { + total *= float64(count) + } + } + + return total +} + +func (g *StringGenerator) patternCardinality(pattern string) float64 { + total := 1.0 + + if count := strings.Count(pattern, "A"); count > 0 { + total *= 
math.Pow(float64(len(g.localeModule.LargeLetters())), float64(count)) + } + + if count := strings.Count(pattern, "a"); count > 0 { + total *= math.Pow(float64(len(g.localeModule.SmallLetters())), float64(count)) + } + + if count := strings.Count(pattern, "0"); count > 0 { + total *= math.Pow(float64(len(locale.Numbers)), float64(count)) + } + + if count := strings.Count(pattern, "#"); count > 0 { + total *= math.Pow(float64(len(locale.SpecialChars)), float64(count)) + } + + return total +} diff --git a/internal/generator/usecase/general/generator/value/uuid.go b/internal/generator/usecase/general/generator/value/uuid.go index ca32580..914e503 100644 --- a/internal/generator/usecase/general/generator/value/uuid.go +++ b/internal/generator/usecase/general/generator/value/uuid.go @@ -43,6 +43,6 @@ func (g *UUIDGenerator) Value(number float64, _ map[string]any) (any, error) { return res, nil } -func (g *UUIDGenerator) ValuesCount() float64 { +func (g *UUIDGenerator) ValuesCount(_ map[string]uint64) float64 { return float64(1<<(128-10) - 1) //nolint:mnd } diff --git a/internal/generator/usecase/general/task.go b/internal/generator/usecase/general/task.go index 6067e08..cae4282 100644 --- a/internal/generator/usecase/general/task.go +++ b/internal/generator/usecase/general/task.go @@ -84,6 +84,8 @@ func newGenerators(cfg *models.GenerationConfig) (map[string]*generator.ColumnGe generators := make(map[string]*generator.ColumnGenerator) for modelName, model := range cfg.Models { + distinctValuesCountByColumn := make(map[string]uint64) + for _, column := range model.Columns { dataModelName := modelName dataModel := model @@ -98,7 +100,7 @@ func newGenerators(cfg *models.GenerationConfig) (map[string]*generator.ColumnGe columnKey := common.GetKey(modelName, column.Name) gen, err := generator.NewColumnGenerator( - cfg.RandomSeed, + cfg.RandomSeed, distinctValuesCountByColumn, modelName, model, column, dataModelName, dataModel, dataColumn, ) @@ -266,11 +268,6 @@ func (t *Task) 
generateAndSaveBatch( } } - sortedColumn, err := models.TopologicalSort(model.Columns) - if err != nil { - return err - } - originIndexes := make(map[string]int, len(model.Columns)) for index, column := range model.Columns { originIndexes[column.Name] = index @@ -279,24 +276,26 @@ func (t *Task) generateAndSaveBatch( for i := range count { generatedValues := make(map[string]any) - for _, columnName := range sortedColumn { + for _, columnName := range model.ColumnsTopologicalOrder { if common.CtxClosed(ctx) { return &common.ContextCancelError{} } - value, err := generators[originIndexes[columnName]].Value(generatedValues) + idx := originIndexes[columnName] + + value, err := generators[idx].Value(generatedValues) if err != nil { return errors.WithMessage(err, "failed to get or generate value") } generatedValues[columnName] = value - batch[i].Values[originIndexes[columnName]] = value + batch[i].Values[idx] = value } } outputSync.WaitPrevious(ctx) - err = t.output.HandleRowsBatch(ctx, model.Name, batch) + err := t.output.HandleRowsBatch(ctx, model.Name, batch) if err != nil { return errors.WithMessage(err, "failed to save batch to output") } diff --git a/internal/generator/usecase/general/test/unit_test.go b/internal/generator/usecase/general/test/unit_test.go index 0a2a7ad..91a1bcd 100644 --- a/internal/generator/usecase/general/test/unit_test.go +++ b/internal/generator/usecase/general/test/unit_test.go @@ -310,7 +310,11 @@ func TestInteger(t *testing.T) { } for _, testCase := range checkTypeCases { - column := &models.Column{Type: "integer", Ranges: []*models.Params{{TypeParams: testCase.typeParams}}} + column := &models.Column{ + Name: "integers", + Type: "integer", + Ranges: []*models.Params{{TypeParams: testCase.typeParams}}, + } checkType(t, column, testCase.expected) checkOrdered(t, column) @@ -357,7 +361,11 @@ func TestInteger(t *testing.T) { } for _, testCase := range checkValueCases { - column := &models.Column{Type: "integer", Ranges: 
[]*models.Params{{TypeParams: testCase.typeParams}}} + column := &models.Column{ + Name: "integers", + Type: "integer", + Ranges: []*models.Params{{TypeParams: testCase.typeParams}}, + } checkValue(t, column, testCase.expected) } @@ -382,7 +390,11 @@ func TestFloat(t *testing.T) { } for _, testCase := range checkTypeCases { - column := &models.Column{Type: "float", Ranges: []*models.Params{{TypeParams: testCase.typeParams}}} + column := &models.Column{ + Name: "floats", + Type: "float", + Ranges: []*models.Params{{TypeParams: testCase.typeParams}}, + } checkType(t, column, testCase.expected) checkOrdered(t, column) @@ -413,7 +425,11 @@ func TestFloat(t *testing.T) { } for _, testCase := range checkValueCases { - column := &models.Column{Type: "float", Ranges: []*models.Params{{TypeParams: testCase.typeParams}}} + column := &models.Column{ + Name: "floats", + Type: "float", + Ranges: []*models.Params{{TypeParams: testCase.typeParams}}, + } checkValue(t, column, testCase.expected) } @@ -449,7 +465,11 @@ func TestString(t *testing.T) { } for _, testCase := range testCases { - column := &models.Column{Name: "test", Type: "string", Ranges: []*models.Params{{TypeParams: testCase.typeParams}}} + column := &models.Column{ + Name: "strings", + Type: "string", + Ranges: []*models.Params{{TypeParams: testCase.typeParams}}, + } handled := checkType(t, column, "") strValue, ok := handled[0].Values[0].(string) @@ -466,7 +486,7 @@ func TestString(t *testing.T) { } func TestUUID(t *testing.T) { - column := &models.Column{Type: "uuid"} + column := &models.Column{Name: "uuids", Type: "uuid"} checkType(t, column, uuid.UUID{}) checkDistinct(t, column) checkForeignKeyCases(t, column) @@ -486,7 +506,11 @@ func TestDateTime(t *testing.T) { } for _, testCase := range checkTypeCases { - column := &models.Column{Type: "datetime", Ranges: []*models.Params{{TypeParams: testCase.typeParams}}} + column := &models.Column{ + Name: "datetimes", + Type: "datetime", + Ranges: 
[]*models.Params{{TypeParams: testCase.typeParams}}, + } checkType(t, column, testCase.expected) checkOrdered(t, column) @@ -503,7 +527,11 @@ func TestDateTime(t *testing.T) { } for _, testCase := range checkValueCases { - column := &models.Column{Type: "datetime", Ranges: []*models.Params{{TypeParams: testCase.typeParams}}} + column := &models.Column{ + Name: "datetimes", + Type: "datetime", + Ranges: []*models.Params{{TypeParams: testCase.typeParams}}, + } checkValue(t, column, testCase.expected) } @@ -736,7 +764,11 @@ func TestEnum(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - column := &models.Column{Type: tc.dataType, Ranges: []*models.Params{{Values: tc.values}}} + column := &models.Column{ + Name: "enums", + Type: tc.dataType, + Ranges: []*models.Params{{Values: tc.values}}, + } cfg := oneColumnCfg(t, column) cfg.Models[UnitDefaultColumnName].RowsCount = tc.rowsCount @@ -745,7 +777,11 @@ func TestEnum(t *testing.T) { handledDataRows := generateFunc(t, cfg)[UnitDefaultColumnName] require.Len(t, handledDataRows, len(tc.expected)) - columnOrdered := &models.Column{Type: tc.dataType, Ranges: []*models.Params{{Values: tc.values, Ordered: true}}} + columnOrdered := &models.Column{ + Name: "enums", + Type: tc.dataType, + Ranges: []*models.Params{{Values: tc.values, Ordered: true}}, + } cfg = oneColumnCfg(t, columnOrdered) cfg.Models[UnitDefaultColumnName].RowsCount = tc.rowsCount @@ -914,7 +950,7 @@ func TestRanges(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - column := &models.Column{Type: tc.dataType, Ranges: tc.ranges} + column := &models.Column{Name: "ranges", Type: tc.dataType, Ranges: tc.ranges} cfg := oneColumnCfg(t, column) cfg.Models[UnitDefaultColumnName].RowsCount = UnitDefaultRowsCount From 1f14c6a9f451aeb98505056da73805dd94f47f9b Mon Sep 17 00:00:00 2001 From: reversetm Date: Sat, 26 Jul 2025 16:48:36 +0300 Subject: [PATCH 03/15] Fixed error of calculation of possible values 
for generation of string template, updated usage.md, updated CHANGELOG, and improve and columns are sorted at the point of use. --- CHANGELOG.md | 13 +- config/models.yml | 2 +- doc/ru/usage.md | 16 +- internal/generator/models/common.go | 6 +- internal/generator/models/common_test.go | 2 +- internal/generator/models/generator_model.go | 12 +- internal/generator/models/models_test.go | 3 - .../usecase/general/generator/generator.go | 26 +- internal/generator/usecase/general/task.go | 52 ++- .../usecase/general/test/unit_test.go | 349 ++++++++++++++++-- 10 files changed, 405 insertions(+), 76 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d93338..6fd4b0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [latest](https://github.com/tarantool/sdvg/compare/0.0.1..master) +### Changed + +- String templates replaced with jinja like + +### Breaking changes + +- The old version of string template in `type_params` of `string` type is no longer supported, + instead you should use `{{ pattern('pattern_expression') }}` + ## [0.0.1](https://github.com/tarantool/sdvg/compare/36d0930..0.0.1) - 2025-07-21 ### Added @@ -38,7 +47,3 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Data partitioning - Ability to continue generation - Availability to ignore some models for generation - -### Changed - -- String templates replaced with jinja like diff --git a/config/models.yml b/config/models.yml index b3b22d9..7bc502c 100644 --- a/config/models.yml +++ b/config/models.yml @@ -62,7 +62,7 @@ models: - name: passport type: string type_params: - template: AA 00 000 000 + template: "{{ pattern('AA 00 000 000') }}" distinct_percentage: 1 ordered: true - name: created diff --git a/doc/ru/usage.md b/doc/ru/usage.md index 499af96..f3ef70f 100644 --- a/doc/ru/usage.md +++ b/doc/ru/usage.md @@ -165,8 +165,8 @@ open_ai: - `max_length`: Максимальная длина 
строки. По умолчанию `32`. - `logical_type`: Логический тип строки. Поддерживаемые значения: `first_name`, `last_name`, `phone`, `text`. - `template`: Jinja-подобный шаблон для генерации строки. Позволяет использовать любые поля генерируемой модели и - задавать паттерн строки с помощью функции `pattern`, где символ `A` - любая большая буква, символ `a` - любая маленькая буква, - символ `0` - любая цифра, символ `#` - любой символ, а остальные символы остаются как есть. + задавать паттерн строки с помощью функции `pattern`. Информация о фильтрах и функциях, доступных в шаблонных + строках описана [здесь](#фильтры-и-функции-используемые-в-шаблонных-строках). Также поддерживается использование фильтров, таких как `upper` и `lower`. - `locale`: Локаль для генерации строк. Поддерживаемые значения: `ru`, `en`. По умолчанию `en`. - `without_large_letters`: Флаг, указывающий, исключать ли большие буквы из строки. @@ -248,6 +248,18 @@ open_ai: Подобна структуре для формата `http`, за исключением того, что поле `format_template` неизменяемое и всегда равняется значению по умолчанию. +#### Фильтры и функции, используемые в шаблонных строках + +Шаблонные строки реализованы с использованием библиотеки `pongo2`, ознакомиться +со всеми доступными фильтрами и функциями можно в репозитории [pongo2](https://github.com/flosch/pongo2). + +Вдобавок к ним была добавлена 1 функция: + +- pattern: позволяет создать паттерн строки при помощи специальных символов. + Символ `A` - любая большая буква, символ `a` - любая маленькая буква, + символ `0` - любая цифра, символ `#` - любой символ, а остальные символы остаются как есть. + Функция доступна только в поле `template` типа данных `string`. 
+ #### Примеры конфигурации генерации данных Пример конфигурации модели данных: diff --git a/internal/generator/models/common.go b/internal/generator/models/common.go index 5f82f8e..75142cc 100644 --- a/internal/generator/models/common.go +++ b/internal/generator/models/common.go @@ -2,7 +2,6 @@ package models import ( "encoding/json" - "github.com/tarantool/sdvg/internal/generator/common" "io" "os" "path/filepath" @@ -12,6 +11,7 @@ import ( "github.com/ilyakaznacheev/cleanenv" "github.com/otaviokr/topological-sort/toposort" "github.com/pkg/errors" + "github.com/tarantool/sdvg/internal/generator/common" "gopkg.in/yaml.v3" ) @@ -122,8 +122,8 @@ func parseErrsToString(errs []error) string { return sb.String() } -func topologicalSort(columns []*Column) ([]string, error) { - graph := make(map[string][]string) +func TopologicalSort(columns []*Column) ([]string, error) { + graph := make(map[string][]string, len(columns)) for _, c := range columns { graph[c.Name] = make([]string, 0) diff --git a/internal/generator/models/common_test.go b/internal/generator/models/common_test.go index 314bb87..b6f2403 100644 --- a/internal/generator/models/common_test.go +++ b/internal/generator/models/common_test.go @@ -106,7 +106,7 @@ func TestTopologicalSort(t *testing.T) { testFunc := func(t *testing.T, tc testCase) { t.Helper() - actual, err := topologicalSort(tc.columns) + actual, err := TopologicalSort(tc.columns) require.Equal(t, tc.wantErr, err != nil) require.Equal(t, tc.expected, actual) } diff --git a/internal/generator/models/generator_model.go b/internal/generator/models/generator_model.go index 88aaf16..e3c451f 100644 --- a/internal/generator/models/generator_model.go +++ b/internal/generator/models/generator_model.go @@ -30,9 +30,8 @@ type Model struct { RowsPerFile uint64 `backup:"true" json:"rows_per_file" yaml:"rows_per_file"` ModelDir string `backup:"true" json:"model_dir" yaml:"model_dir"` // The columns from the partitioning key with PartitionColumn.WriteToOutput == 
false, must be at the end of slice. - Columns []*Column `backup:"true" json:"columns" yaml:"columns"` - ColumnsTopologicalOrder []string - PartitionColumns []*PartitionColumn `backup:"true" json:"partition_columns" yaml:"partition_columns"` + Columns []*Column `backup:"true" json:"columns" yaml:"columns"` + PartitionColumns []*PartitionColumn `backup:"true" json:"partition_columns" yaml:"partition_columns"` } // PartitionColumn type is used to describe partition parameters for column. @@ -81,13 +80,6 @@ func (m *Model) Parse() error { m.shiftColumnsToEnd(nonWriteableColumns) - sortedColumns, err := topologicalSort(m.Columns) - if err != nil { - return errors.WithMessage(err, "failed to sorting columns by dependencies") - } - - m.ColumnsTopologicalOrder = sortedColumns - return nil } diff --git a/internal/generator/models/models_test.go b/internal/generator/models/models_test.go index 5df79da..ffb908a 100644 --- a/internal/generator/models/models_test.go +++ b/internal/generator/models/models_test.go @@ -1197,9 +1197,6 @@ parquet params: expectedModel := tc.expected.Models[modelName] gotModel := cfg.Models[modelName] - // skip ColumnsTopologicalOrder check - expectedModel.ColumnsTopologicalOrder = gotModel.ColumnsTopologicalOrder - for columnName := range expectedModel.Columns { expectedColumn := expectedModel.Columns[columnName] gotColumn := gotModel.Columns[columnName] diff --git a/internal/generator/usecase/general/generator/generator.go b/internal/generator/usecase/general/generator/generator.go index 2dc8c27..e78da6f 100644 --- a/internal/generator/usecase/general/generator/generator.go +++ b/internal/generator/usecase/general/generator/generator.go @@ -11,13 +11,12 @@ import ( ) type rangeGenerator struct { - numFrom uint64 - numTo uint64 - distinctValuesCount uint64 - sequencer sequencer - dataRandomFactor float64 - generator value.Generator - nullPercentage float64 + numFrom uint64 + numTo uint64 + sequencer sequencer + dataRandomFactor float64 + generator 
value.Generator + nullPercentage float64 } type ColumnGenerator struct { @@ -175,13 +174,12 @@ func newRangeGenerator( dataRandomFactor := 1 - float64(distinctValuesCount)/generatorValuesCount return &rangeGenerator{ - numFrom: rangeRowsOffset, - numTo: rangeRowsOffset + rangeRowsCount, - distinctValuesCount: distinctValuesCount, - dataRandomFactor: dataRandomFactor, - generator: valueGenerator, - sequencer: rangeSequencer, - nullPercentage: dataRange.NullPercentage, + numFrom: rangeRowsOffset, + numTo: rangeRowsOffset + rangeRowsCount, + dataRandomFactor: dataRandomFactor, + generator: valueGenerator, + sequencer: rangeSequencer, + nullPercentage: dataRange.NullPercentage, }, nil } diff --git a/internal/generator/usecase/general/task.go b/internal/generator/usecase/general/task.go index cae4282..6b71f1e 100644 --- a/internal/generator/usecase/general/task.go +++ b/internal/generator/usecase/general/task.go @@ -84,9 +84,21 @@ func newGenerators(cfg *models.GenerationConfig) (map[string]*generator.ColumnGe generators := make(map[string]*generator.ColumnGenerator) for modelName, model := range cfg.Models { - distinctValuesCountByColumn := make(map[string]uint64) + distinctValuesCountByColumn := make(map[string]uint64, len(model.Columns)) + + sortedColumns, err := models.TopologicalSort(model.Columns) + if err != nil { + return nil, errors.WithMessagef(err, "failed to sorting columns by dependencies for model %q", modelName) + } + + originIndexes := make(map[string]int, len(model.Columns)) + for index, column := range model.Columns { + originIndexes[column.Name] = index + } + + for _, columnName := range sortedColumns { + column := model.Columns[originIndexes[columnName]] - for _, column := range model.Columns { dataModelName := modelName dataModel := model dataColumn := column @@ -173,6 +185,8 @@ func (t *Task) WaitError() error { } // generateAndSaveValues function generates values for all model. 
+// +//nolint:cyclop func (t *Task) generateAndSaveValues(ctx context.Context) error { var err error @@ -203,6 +217,16 @@ func (t *Task) generateAndSaveValues(ctx context.Context) error { continue } + columnsTopologicalOrder, err := models.TopologicalSort(model.Columns) + if err != nil { + return errors.WithMessagef(err, "failed to sorting columns by dependencies for model %q", modelName) + } + + originColumnsIndexes := make(map[string]int, len(model.Columns)) + for index, column := range model.Columns { + originColumnsIndexes[column.Name] = index + } + pool.Add(1) go func() { @@ -223,7 +247,11 @@ func (t *Task) generateAndSaveValues(ctx context.Context) error { generators = append(generators, t.generators[columnKey].NewBatchGenerator(rowsCount)) } - pool.Submit(ctx, outputSyncer.WorkerSyncer(), model, generators, rowsCount) + pool.Submit( + ctx, outputSyncer.WorkerSyncer(), + modelName, columnsTopologicalOrder, originColumnsIndexes, + generators, rowsCount, + ) } }() } @@ -257,7 +285,8 @@ func (t *Task) skipRows() { // generateAndSaveBatch function generate batch of values for selected column and send it to output. 
func (t *Task) generateAndSaveBatch( ctx context.Context, outputSync *common.WorkerSyncer, - model *models.Model, generators []*generator.BatchGenerator, count uint64, + modelName string, columnsTopologicalOrder []string, originColumnsIndexes map[string]int, + generators []*generator.BatchGenerator, count uint64, ) error { defer outputSync.Done(ctx) @@ -268,20 +297,15 @@ func (t *Task) generateAndSaveBatch( } } - originIndexes := make(map[string]int, len(model.Columns)) - for index, column := range model.Columns { - originIndexes[column.Name] = index - } - for i := range count { - generatedValues := make(map[string]any) + generatedValues := make(map[string]any, len(originColumnsIndexes)) - for _, columnName := range model.ColumnsTopologicalOrder { + for _, columnName := range columnsTopologicalOrder { if common.CtxClosed(ctx) { return &common.ContextCancelError{} } - idx := originIndexes[columnName] + idx := originColumnsIndexes[columnName] value, err := generators[idx].Value(generatedValues) if err != nil { @@ -295,12 +319,12 @@ func (t *Task) generateAndSaveBatch( outputSync.WaitPrevious(ctx) - err := t.output.HandleRowsBatch(ctx, model.Name, batch) + err := t.output.HandleRowsBatch(ctx, modelName, batch) if err != nil { return errors.WithMessage(err, "failed to save batch to output") } - t.progress.Add(model.Name, count) + t.progress.Add(modelName, count) return nil } diff --git a/internal/generator/usecase/general/test/unit_test.go b/internal/generator/usecase/general/test/unit_test.go index 91a1bcd..cae5433 100644 --- a/internal/generator/usecase/general/test/unit_test.go +++ b/internal/generator/usecase/general/test/unit_test.go @@ -16,6 +16,7 @@ import ( outputMock "github.com/tarantool/sdvg/internal/generator/output/mock" "github.com/tarantool/sdvg/internal/generator/usecase" usecaseGeneral "github.com/tarantool/sdvg/internal/generator/usecase/general" + "github.com/tarantool/sdvg/internal/generator/usecase/general/generator/value" ) const ( @@ -69,12 
+70,12 @@ func deepColumnCopy(c *models.Column) *models.Column { func toString(t *testing.T, anyValue any) string { t.Helper() - value, err := json.Marshal(anyValue) + val, err := json.Marshal(anyValue) if err != nil { - t.Fatalf("Failed to json marshal of %v: %s", value, err) + t.Fatalf("Failed to json marshal of %v: %s", val, err) } - return string(value) + return string(val) } func getCfg(t *testing.T, model map[string]*models.Model) models.GenerationConfig { @@ -192,13 +193,59 @@ func checkDistinct(t *testing.T, column *models.Column) { for i := range UnitDefaultRowsCount { require.Len(t, handled[i].Values, 1, "column: %+v\n handled: %+v", column, handled) - value := toString(t, handled[i].Values[0]) - _, alreadyHas := uniqueMap[value] - require.False(t, alreadyHas, "value: %+v\nmap: %+v", value, uniqueMap) - uniqueMap[value] = true + val := toString(t, handled[i].Values[0]) + _, alreadyHas := uniqueMap[val] + require.False(t, alreadyHas, "value: %+v\nmap: %+v", val, uniqueMap) + uniqueMap[val] = true } } +func checkValuesCount( + t *testing.T, + gen value.Generator, + valuesCountByColumn map[string]uint64, expectedValueCount float64, +) { + t.Helper() + + require.NoError(t, gen.Prepare()) + + valuesCount := gen.ValuesCount(valuesCountByColumn) + require.Equal(t, uint64(expectedValueCount), uint64(valuesCount)) +} + +func checkPossibleToGenerate(t *testing.T, columns []*models.Column, rowsCount uint64, wantErr bool) { + t.Helper() + + copyColumns := make([]*models.Column, 0, len(columns)) + for _, column := range columns { + copyColumns = append(copyColumns, deepColumnCopy(column)) + } + + cfg := getCfg(t, map[string]*models.Model{ + "test": { + RowsCount: rowsCount, + Columns: copyColumns, + }, + }) + + outputHandler := func(_ context.Context, _ string, _ []*models.DataRow) error { return nil } + + out := outputMock.NewOutput(outputHandler) + uc := usecaseGeneral.NewUseCase(usecaseGeneral.UseCaseConfig{}) + + taskID, err := uc.CreateTask( + 
context.Background(), + usecase.TaskConfig{ + GenerationConfig: &cfg, + Output: out, + }, + ) + + require.Equal(t, wantErr, err != nil) + err = uc.WaitResult(taskID) + require.Equal(t, wantErr, err != nil) +} + func checkForeignKey(t *testing.T, column *models.Column, nullPercentage float64, foreignOrdered bool) { t.Helper() @@ -214,6 +261,7 @@ func checkForeignKey(t *testing.T, column *models.Column, nullPercentage float64 "foreign": { RowsCount: UnitDefaultRowsCount * 2, Columns: []*models.Column{{ + Name: "foreign_key", ForeignKey: "orig.test", Params: &models.Params{Ordered: foreignOrdered}, }}, @@ -237,10 +285,10 @@ func checkForeignKey(t *testing.T, column *models.Column, nullPercentage float64 continue } - value := toString(t, origHandled[i].Values[0]) - _, alreadyHas := origMap[value] - require.False(t, alreadyHas, "value: %+v\nmap: %+v", value, origMap) - origMap[value] = true + val := toString(t, origHandled[i].Values[0]) + _, alreadyHas := origMap[val] + require.False(t, alreadyHas, "value: %+v\nmap: %+v", val, origMap) + origMap[val] = true } for i := range UnitDefaultRowsCount * 2 { @@ -266,9 +314,9 @@ func checkForeignKey(t *testing.T, column *models.Column, nullPercentage float64 } } - value := toString(t, foreignHandled[i].Values[0]) - _, alreadyHas := origMap[value] - require.True(t, alreadyHas, "value: %+v (#%d)\nmap: %+v", value, i, origMap) + val := toString(t, foreignHandled[i].Values[0]) + _, alreadyHas := origMap[val] + require.True(t, alreadyHas, "value: %+v (#%d)\nmap: %+v", val, i, origMap) } } @@ -369,6 +417,21 @@ func TestInteger(t *testing.T) { checkValue(t, column, testCase.expected) } + + checkValuesCountCases := []struct { + typeParams *models.ColumnIntegerParams + expected float64 + }{ + {&models.ColumnIntegerParams{From: 1, To: 5}, 5}, + {&models.ColumnIntegerParams{From: 100, To: 1000}, 901}, + {&models.ColumnIntegerParams{From: 1, To: 1}, 1}, + {&models.ColumnIntegerParams{From: 123, To: 654}, 532}, + } + + for _, testCase := 
range checkValuesCountCases { + generator := &value.IntegerGenerator{ColumnIntegerParams: testCase.typeParams} + checkValuesCount(t, generator, nil, testCase.expected) + } } func TestFloat(t *testing.T) { @@ -433,6 +496,21 @@ func TestFloat(t *testing.T) { checkValue(t, column, testCase.expected) } + + checkValuesCountCases := []struct { + typeParams *models.ColumnFloatParams + expected float64 + }{ + {&models.ColumnFloatParams{From: 1.021, To: 5.554433}, math.Inf(1)}, + {&models.ColumnFloatParams{From: 195.2345, To: 1000}, math.Inf(1)}, + {&models.ColumnFloatParams{From: 0.12345, To: 1}, math.Inf(1)}, + {&models.ColumnFloatParams{From: 123, To: 654}, math.Inf(1)}, + } + + for _, testCase := range checkValuesCountCases { + generator := &value.FloatGenerator{ColumnFloatParams: testCase.typeParams} + checkValuesCount(t, generator, nil, testCase.expected) + } } func TestString(t *testing.T) { @@ -483,6 +561,189 @@ func TestString(t *testing.T) { checkDistinct(t, column) checkForeignKeyCases(t, column) } + + checkValuesCountCases := []struct { + typeParams *models.ColumnStringParams + distinctValuesCountByColumn map[string]uint64 + expected float64 + }{ + { + &models.ColumnStringParams{ + MinLength: 1, + MaxLength: 1, + Locale: "en", + WithoutNumbers: true, + WithoutSpecialChars: true, + }, + nil, + 52, + }, + { + &models.ColumnStringParams{ + MinLength: 1, + MaxLength: 1, + Locale: "ru", + WithoutNumbers: true, + WithoutSpecialChars: true, + }, + nil, + 66.0, + }, + { + &models.ColumnStringParams{ + MinLength: 3, + MaxLength: 7, + Locale: "en", + WithoutNumbers: true, + WithoutSpecialChars: true, + }, + nil, + 1048229968448, + }, + { + &models.ColumnStringParams{ + MinLength: 2, + MaxLength: 9, + Locale: "ru", + WithoutNumbers: true, + WithoutSpecialChars: true, + }, + nil, + 24128259706319868, + }, + { + &models.ColumnStringParams{ + MinLength: 10, + MaxLength: 24, + Locale: "en", + WithoutLargeLetters: true, + WithoutSmallLetters: true, + WithoutSpecialChars: true, 
+ }, + nil, + 1111111111111110000000000, + }, + { + &models.ColumnStringParams{ + MinLength: 1, + MaxLength: 8, + Locale: "en", + WithoutLargeLetters: true, + WithoutSmallLetters: true, + WithoutNumbers: true, + }, + nil, + 81870575520, + }, + { + &models.ColumnStringParams{ + MinLength: 10, + MaxLength: 15, + Locale: "en", + }, + nil, + 88394150280794134360488281250, + }, + { + &models.ColumnStringParams{ + MinLength: 10, + MaxLength: 15, + Locale: "ru", + }, + nil, + 868834460299970670989801640300, + }, + { + &models.ColumnStringParams{ + Locale: "en", + Template: "{{ field }}", + }, + map[string]uint64{ + "field": 11, + }, + 11, + }, + { + &models.ColumnStringParams{ + Locale: "en", + Template: "{{ pattern('A00') }}", + }, + nil, + 2600, + }, + { + &models.ColumnStringParams{ + Locale: "ru", + Template: "{{ field }}{{ pattern('a0#') }}", + }, + map[string]uint64{ + "field": 10, + }, + 75900, + }, + } + + for _, testCase := range checkValuesCountCases { + generator := &value.StringGenerator{ColumnStringParams: testCase.typeParams} + checkValuesCount(t, generator, testCase.distinctValuesCountByColumn, testCase.expected) + } + + idColumn := &models.Column{ + Name: "id", + Type: "integer", + Ranges: []*models.Params{ + { + TypeParams: &models.ColumnIntegerParams{ + FromPtr: int64Ptr(1), + ToPtr: int64Ptr(5), + }, + }, + }, + } + + emailColumn := &models.Column{ + Name: "email", + Type: "string", + Ranges: []*models.Params{ + { + TypeParams: &models.ColumnStringParams{ + Template: "{{ id }}.{{ pattern('00') }}@example.com", + }, + DistinctPercentage: 1, + }, + }, + } + + checkPossibleToGenerateCases := []struct { + columns []*models.Column + rowsCount uint64 + wantErr bool + }{ + { + columns: []*models.Column{idColumn, emailColumn}, + rowsCount: 500, + wantErr: false, + }, + { + columns: []*models.Column{emailColumn, idColumn}, + rowsCount: 500, + wantErr: false, + }, + { + columns: []*models.Column{idColumn, emailColumn}, + rowsCount: 501, + wantErr: true, + }, + { 
+ columns: []*models.Column{emailColumn, idColumn}, + rowsCount: 501, + wantErr: true, + }, + } + + for _, testCase := range checkPossibleToGenerateCases { + checkPossibleToGenerate(t, testCase.columns, testCase.rowsCount, testCase.wantErr) + } } func TestUUID(t *testing.T) { @@ -490,6 +751,7 @@ func TestUUID(t *testing.T) { checkType(t, column, uuid.UUID{}) checkDistinct(t, column) checkForeignKeyCases(t, column) + checkValuesCount(t, &value.UUIDGenerator{}, nil, float64(1<<(128-10)-1)) } func TestDateTime(t *testing.T) { @@ -535,6 +797,45 @@ func TestDateTime(t *testing.T) { checkValue(t, column, testCase.expected) } + + checkValuesCountCases := []struct { + typeParams *models.ColumnDateTimeParams + expected float64 + }{ + { + &models.ColumnDateTimeParams{ + From: time.Date(2025, 7, 25, 10, 0, 0, 0, time.UTC), + To: time.Date(2025, 7, 25, 10, 0, 0, 0, time.UTC), + }, + 1, + }, + { + &models.ColumnDateTimeParams{ + From: time.Date(2025, 7, 25, 10, 0, 0, 500_000_000, time.UTC), + To: time.Date(2025, 7, 25, 10, 0, 5, 500_000_000, time.UTC), + }, + 6, + }, + { + &models.ColumnDateTimeParams{ + From: time.Date(2025, 7, 25, 10, 0, 0, 900_000_000, time.UTC), + To: time.Date(2025, 7, 25, 10, 0, 1, 100_000_000, time.UTC), + }, + 400_000_002, + }, + { + &models.ColumnDateTimeParams{ + From: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), + To: time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC), + }, + 31_536_001, + }, + } + + for _, testCase := range checkValuesCountCases { + generator := &value.DateTimeGenerator{ColumnDateTimeParams: testCase.typeParams} + checkValuesCount(t, generator, nil, testCase.expected) + } } func TestIdempotence(t *testing.T) { @@ -791,8 +1092,8 @@ func TestEnum(t *testing.T) { require.Len(t, handledDataRows, len(tc.expected)) for i := range handledDataRows { - value := handledDataRows[i].Values[0] - require.Equal(t, tc.expected[i], value) + val := handledDataRows[i].Values[0] + require.Equal(t, tc.expected[i], val) } }) } @@ -965,9 +1266,9 @@ func 
TestRanges(t *testing.T) { } for i := range handledDataRows { - value := handledDataRows[i].Values[0] + val := handledDataRows[i].Values[0] - rangeIdx, err := mapValueToRange(tc.dataType, value, tc.ranges) + rangeIdx, err := mapValueToRange(tc.dataType, val, tc.ranges) require.NoError(t, err) expectedValuesAmountPerRange[rangeIdx]-- @@ -991,13 +1292,13 @@ func mapValueToRange(columnType string, value any, ranges []*models.Params) (int switch columnType { case "integer": - switch value := value.(type) { + switch val := value.(type) { case int32: - if int32(r.IntegerParams.From) <= value && value <= int32(r.IntegerParams.To) { + if int32(r.IntegerParams.From) <= val && val <= int32(r.IntegerParams.To) { return idx, nil } case int64: - if r.IntegerParams.From <= value && value <= r.IntegerParams.To { + if r.IntegerParams.From <= val && val <= r.IntegerParams.To { return idx, nil } } @@ -1020,13 +1321,13 @@ func mapValueToRange(columnType string, value any, ranges []*models.Params) (int return idx, nil } case "float": - switch value := value.(type) { + switch val := value.(type) { case float32: - if float32(r.FloatParams.From) <= value && value <= float32(r.FloatParams.To) { + if float32(r.FloatParams.From) <= val && val <= float32(r.FloatParams.To) { return idx, nil } case float64: - if r.FloatParams.From <= value && value <= r.FloatParams.To { + if r.FloatParams.From <= val && val <= r.FloatParams.To { return idx, nil } } From 3fe331cfb07ca0c861fd1b2b8f3934fbbf5919d6 Mon Sep 17 00:00:00 2001 From: reversetm Date: Mon, 28 Jul 2025 18:11:30 +0300 Subject: [PATCH 04/15] Improved performance --- CHANGELOG.md | 2 +- doc/ru/usage.md | 7 +- internal/generator/common/utils.go | 35 ++++++ internal/generator/common/utils_test.go | 74 +++++++++++ internal/generator/models/common.go | 24 ---- internal/generator/models/common_test.go | 117 ------------------ .../usecase/general/generator/generator.go | 14 ++- .../general/generator/value/interfaces.go | 2 +- 
.../usecase/general/generator/value/string.go | 14 ++- internal/generator/usecase/general/task.go | 48 +++++-- 10 files changed, 168 insertions(+), 169 deletions(-) delete mode 100644 internal/generator/models/common_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 6fd4b0f..eb57aaf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Breaking changes - The old version of string template in `type_params` of `string` type is no longer supported, - instead you should use `{{ pattern('pattern_expression') }}` + `{{ pattern('pattern_expression') }}` should be used instead. ## [0.0.1](https://github.com/tarantool/sdvg/compare/36d0930..0.0.1) - 2025-07-21 diff --git a/doc/ru/usage.md b/doc/ru/usage.md index f3ef70f..ea6ed53 100644 --- a/doc/ru/usage.md +++ b/doc/ru/usage.md @@ -166,8 +166,7 @@ open_ai: - `logical_type`: Логический тип строки. Поддерживаемые значения: `first_name`, `last_name`, `phone`, `text`. - `template`: Jinja-подобный шаблон для генерации строки. Позволяет использовать любые поля генерируемой модели и задавать паттерн строки с помощью функции `pattern`. Информация о фильтрах и функциях, доступных в шаблонных - строках описана [здесь](#фильтры-и-функции-используемые-в-шаблонных-строках). - Также поддерживается использование фильтров, таких как `upper` и `lower`. + строках описана в конце данного раздела. - `locale`: Локаль для генерации строк. Поддерживаемые значения: `ru`, `en`. По умолчанию `en`. - `without_large_letters`: Флаг, указывающий, исключать ли большие буквы из строки. - `without_small_letters`: Флаг, указывающий, исключать ли маленькие буквы из строки. @@ -248,7 +247,7 @@ open_ai: Подобна структуре для формата `http`, за исключением того, что поле `format_template` неизменяемое и всегда равняется значению по умолчанию. 
-#### Фильтры и функции, используемые в шаблонных строках +Фильтры и функции, используемые в шаблонных строках Шаблонные строки реализованы с использованием библиотеки `pongo2`, ознакомиться со всеми доступными фильтрами и функциями можно в репозитории [pongo2](https://github.com/flosch/pongo2). @@ -258,7 +257,7 @@ open_ai: - pattern: позволяет создать паттерн строки при помощи специальных символов. Символ `A` - любая большая буква, символ `a` - любая маленькая буква, символ `0` - любая цифра, символ `#` - любой символ, а остальные символы остаются как есть. - Функция доступна только в поле `template` типа данных `string`. + Функция доступна только в поле `template` типа данных `string`. #### Примеры конфигурации генерации данных diff --git a/internal/generator/common/utils.go b/internal/generator/common/utils.go index a58f0c4..49f2087 100644 --- a/internal/generator/common/utils.go +++ b/internal/generator/common/utils.go @@ -14,6 +14,7 @@ import ( "time" "github.com/google/uuid" + "github.com/otaviokr/topological-sort/toposort" "github.com/pkg/errors" "gopkg.in/yaml.v3" ) @@ -378,3 +379,37 @@ func ExtractValuesFromTemplate(template string) []string { return values } + +// TopologicalSort sorts the given items in topological order using the provided +// function to extract node name and dependencies. +// Returns the sorted node names, a flag indicating if any dependencies exist, +// and an error if a cycle is detected. 
+func TopologicalSort[T any](items []T, nodeFunc func(T) (string, []string)) ([]string, bool, error) { + var ( + graph = make(map[string][]string, len(items)) + sortedVertexes = make([]string, 0, len(items)) + hasDependencies bool + err error + ) + + for _, item := range items { + name, dependencies := nodeFunc(item) + if len(dependencies) > 0 { + hasDependencies = true + } + + sortedVertexes = append(sortedVertexes, name) + graph[name] = dependencies + } + + if !hasDependencies { + return sortedVertexes, false, nil + } + + sortedVertexes, err = toposort.ReverseTarjan(graph) + if err != nil { + return nil, false, errors.New(err.Error()) + } + + return sortedVertexes, hasDependencies, nil +} diff --git a/internal/generator/common/utils_test.go b/internal/generator/common/utils_test.go index 34dabf2..5c4a406 100644 --- a/internal/generator/common/utils_test.go +++ b/internal/generator/common/utils_test.go @@ -744,3 +744,77 @@ func TestExtractValuesFromTemplate(t *testing.T) { t.Run(tc.name, func(t *testing.T) { testFunc(t, tc) }) } } + +func TestTopologicalSort(t *testing.T) { + type node struct { + name string + deps []string + } + + type testCase struct { + name string + items []node + wantErr bool + wantDependencies bool + expected []string + } + + testCases := []testCase{ + { + name: "Empty items", + items: []node{}, + wantErr: false, + wantDependencies: false, + expected: []string{}, + }, + { + name: "Items with dependencies", + items: []node{ + {name: "1", deps: []string{"3"}}, + {name: "2", deps: []string{"4"}}, + {name: "3", deps: []string{"2"}}, + {name: "4", deps: []string{}}, + }, + wantErr: false, + wantDependencies: true, + expected: []string{"4", "2", "3", "1"}, + }, + { + name: "Items without dependencies", + items: []node{ + {name: "1", deps: []string{}}, + {name: "2", deps: []string{}}, + {name: "3", deps: []string{}}, + }, + wantErr: false, + wantDependencies: false, + expected: []string{"1", "2", "3"}, + }, + { + name: "Items with cycle 
dependencies", + items: []node{ + {name: "1", deps: []string{"2"}}, + {name: "2", deps: []string{"1"}}, + }, + wantErr: true, + wantDependencies: false, + expected: nil, + }, + } + + testFunc := func(t *testing.T, tc testCase) { + t.Helper() + + actual, hasDependencies, err := TopologicalSort(tc.items, func(node node) (string, []string) { + return node.name, node.deps + }) + + require.Equal(t, tc.wantErr, err != nil) + require.Equal(t, tc.wantDependencies, hasDependencies) + require.Equal(t, tc.expected, actual) + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { testFunc(t, tc) }) + } +} diff --git a/internal/generator/models/common.go b/internal/generator/models/common.go index 75142cc..635e84b 100644 --- a/internal/generator/models/common.go +++ b/internal/generator/models/common.go @@ -9,9 +9,7 @@ import ( "strings" "github.com/ilyakaznacheev/cleanenv" - "github.com/otaviokr/topological-sort/toposort" "github.com/pkg/errors" - "github.com/tarantool/sdvg/internal/generator/common" "gopkg.in/yaml.v3" ) @@ -121,25 +119,3 @@ func parseErrsToString(errs []error) string { return sb.String() } - -func TopologicalSort(columns []*Column) ([]string, error) { - graph := make(map[string][]string, len(columns)) - for _, c := range columns { - graph[c.Name] = make([]string, 0) - - for _, r := range c.Ranges { - if r.StringParams == nil || r.StringParams.Template == "" { - continue - } - - graph[c.Name] = common.ExtractValuesFromTemplate(r.StringParams.Template) - } - } - - sortedVertexes, err := toposort.ReverseTarjan(graph) - if err != nil { - return nil, errors.New(err.Error()) - } - - return sortedVertexes, nil -} diff --git a/internal/generator/models/common_test.go b/internal/generator/models/common_test.go deleted file mode 100644 index b6f2403..0000000 --- a/internal/generator/models/common_test.go +++ /dev/null @@ -1,117 +0,0 @@ -package models - -import ( - "testing" - - "github.com/stretchr/testify/require" -) - -func TestTopologicalSort(t 
*testing.T) { - type testCase struct { - name string - columns []*Column - wantErr bool - expected []string - } - - testCases := []testCase{ - { - name: "Empty columns", - columns: []*Column{}, - wantErr: false, - expected: []string{}, - }, - { - name: "Columns with dependencies", - columns: []*Column{ - { - Name: "1", - Type: "string", - Ranges: []*Params{ - { - StringParams: &ColumnStringParams{ - Template: "{{ 3 }}", - }, - }, - }, - }, - { - Name: "2", - Type: "string", - Ranges: []*Params{ - { - StringParams: &ColumnStringParams{ - Template: "{{ 4 }}", - }, - }, - }, - }, - { - Name: "3", - Type: "string", - Ranges: []*Params{ - { - StringParams: &ColumnStringParams{ - Template: "{{ 2 }}", - }, - }, - }, - }, - { - Name: "4", - Type: "string", - Ranges: []*Params{ - { - StringParams: &ColumnStringParams{ - Template: "", - }, - }, - }, - }, - }, - wantErr: false, - expected: []string{"4", "2", "3", "1"}, - }, - { - name: "Columns with cycle dependencies", - columns: []*Column{ - { - Name: "1", - Type: "string", - Ranges: []*Params{ - { - StringParams: &ColumnStringParams{ - Template: "{{ 2 }}", - }, - }, - }, - }, - { - Name: "2", - Type: "string", - Ranges: []*Params{ - { - StringParams: &ColumnStringParams{ - Template: "{{ 1 }}", - }, - }, - }, - }, - }, - wantErr: true, - expected: nil, - }, - } - - testFunc := func(t *testing.T, tc testCase) { - t.Helper() - - actual, err := TopologicalSort(tc.columns) - require.Equal(t, tc.wantErr, err != nil) - require.Equal(t, tc.expected, actual) - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { testFunc(t, tc) }) - } -} diff --git a/internal/generator/usecase/general/generator/generator.go b/internal/generator/usecase/general/generator/generator.go index e78da6f..ea52da4 100644 --- a/internal/generator/usecase/general/generator/generator.go +++ b/internal/generator/usecase/general/generator/generator.go @@ -149,7 +149,9 @@ func newRangeGenerator( distinctValuesCount = 
uint64(generatorValuesCount) } - distinctValuesCountByColumn[column.Name] += distinctValuesCount + if distinctValuesCountByColumn != nil { + distinctValuesCountByColumn[column.Name] += distinctValuesCount + } rangeOrdered := dataRange.Ordered orderSeed := dataColumnSeed @@ -201,7 +203,7 @@ type valueID struct { type BatchGenerator struct { numbers []valueID nextNumber int - valuer func(number valueID, generatedValues map[string]any) (any, error) + valuer func(number valueID, rowValues map[string]any) (any, error) } func (cg *ColumnGenerator) NewBatchGenerator(batchSize uint64) *BatchGenerator { @@ -227,14 +229,14 @@ func (cg *ColumnGenerator) NewBatchGenerator(batchSize uint64) *BatchGenerator { } } - valuer := func(id valueID, generatedValues map[string]any) (any, error) { + valuer := func(id valueID, rowValues map[string]any) (any, error) { vg := cg.rangeGenerators[id.generatorIndex] if vg.nullPercentage > 0 && fastRandomFloat(cg.dataColumnSeed+uint64(id.number)) < vg.nullPercentage { return nil, nil //nolint:nilnil } - return vg.generator.Value(id.number, generatedValues) + return vg.generator.Value(id.number, rowValues) } return &BatchGenerator{ @@ -244,8 +246,8 @@ func (cg *ColumnGenerator) NewBatchGenerator(batchSize uint64) *BatchGenerator { } // Value returns random value for described column. 
-func (g *BatchGenerator) Value(generatedValues map[string]any) (any, error) { - res, err := g.valuer(g.numbers[g.nextNumber], generatedValues) +func (g *BatchGenerator) Value(rowValues map[string]any) (any, error) { + res, err := g.valuer(g.numbers[g.nextNumber], rowValues) g.nextNumber++ g.nextNumber %= len(g.numbers) diff --git a/internal/generator/usecase/general/generator/value/interfaces.go b/internal/generator/usecase/general/generator/value/interfaces.go index 5094611..7200317 100644 --- a/internal/generator/usecase/general/generator/value/interfaces.go +++ b/internal/generator/usecase/general/generator/value/interfaces.go @@ -7,7 +7,7 @@ type Generator interface { // SetTotalCount method should remember count of rows to generate SetTotalCount(totalValuesCount uint64) error // Value method should return ordered unique value by number - Value(number float64, generatedValues map[string]any) (any, error) + Value(number float64, rowValues map[string]any) (any, error) // ValuesCount method should return the number of possible values to generate ValuesCount(distinctValuesCountByColumn map[string]uint64) float64 } diff --git a/internal/generator/usecase/general/generator/value/string.go b/internal/generator/usecase/general/generator/value/string.go index 69eb01d..0012b09 100644 --- a/internal/generator/usecase/general/generator/value/string.go +++ b/internal/generator/usecase/general/generator/value/string.go @@ -188,12 +188,16 @@ func (g *StringGenerator) calculateCompletions(length int) []int64 { } // templateString returns n-th string by template. 
-func (g *StringGenerator) templateString(number float64, generatedValues map[string]any) (string, error) { - generatedValues["pattern"] = func(pattern string) *pongo2.Value { +func (g *StringGenerator) templateString(number float64, rowValues map[string]any) (string, error) { + if rowValues == nil { + rowValues = make(map[string]any) + } + + rowValues["pattern"] = func(pattern string) *pongo2.Value { return pongo2.AsSafeValue(g.patternString(number, pattern)) } - val, err := g.template.Execute(generatedValues) + val, err := g.template.Execute(rowValues) if err != nil { return "", errors.New(err.Error()) } @@ -441,9 +445,9 @@ func (g *StringGenerator) simpleString(number float64) string { } // Value returns n-th string from range. -func (g *StringGenerator) Value(number float64, row map[string]any) (any, error) { +func (g *StringGenerator) Value(number float64, rowValues map[string]any) (any, error) { if g.Template != "" { - val, err := g.templateString(number, row) + val, err := g.templateString(number, rowValues) if err != nil { return nil, errors.WithMessage(err, "failed to template string") } diff --git a/internal/generator/usecase/general/task.go b/internal/generator/usecase/general/task.go index 6b71f1e..b3bf3a4 100644 --- a/internal/generator/usecase/general/task.go +++ b/internal/generator/usecase/general/task.go @@ -84,9 +84,7 @@ func newGenerators(cfg *models.GenerationConfig) (map[string]*generator.ColumnGe generators := make(map[string]*generator.ColumnGenerator) for modelName, model := range cfg.Models { - distinctValuesCountByColumn := make(map[string]uint64, len(model.Columns)) - - sortedColumns, err := models.TopologicalSort(model.Columns) + columnsTopologicalOrder, hasDependencies, err := columnsTopologicalSort(model.Columns) if err != nil { return nil, errors.WithMessagef(err, "failed to sorting columns by dependencies for model %q", modelName) } @@ -96,7 +94,12 @@ func newGenerators(cfg *models.GenerationConfig) (map[string]*generator.ColumnGe 
originIndexes[column.Name] = index } - for _, columnName := range sortedColumns { + var distinctValuesCountByColumn map[string]uint64 + if hasDependencies { + distinctValuesCountByColumn = make(map[string]uint64, len(model.Columns)) + } + + for _, columnName := range columnsTopologicalOrder { column := model.Columns[originIndexes[columnName]] dataModelName := modelName @@ -127,6 +130,23 @@ func newGenerators(cfg *models.GenerationConfig) (map[string]*generator.ColumnGe return generators, nil } +func columnsTopologicalSort(columns []*models.Column) ([]string, bool, error) { + return common.TopologicalSort( + columns, + func(c *models.Column) (string, []string) { + var deps []string + + for _, r := range c.Ranges { + if r.StringParams != nil && r.StringParams.Template != "" { + deps = append(deps, common.ExtractValuesFromTemplate(r.StringParams.Template)...) + } + } + + return c.Name, deps + }, + ) +} + // RunTask function generates unique values and then all values for selected model. func (t *Task) RunTask(ctx context.Context, callback func()) { started := make(chan struct{}) @@ -217,7 +237,7 @@ func (t *Task) generateAndSaveValues(ctx context.Context) error { continue } - columnsTopologicalOrder, err := models.TopologicalSort(model.Columns) + columnsTopologicalOrder, hasDependencies, err := columnsTopologicalSort(model.Columns) if err != nil { return errors.WithMessagef(err, "failed to sorting columns by dependencies for model %q", modelName) } @@ -249,7 +269,7 @@ func (t *Task) generateAndSaveValues(ctx context.Context) error { pool.Submit( ctx, outputSyncer.WorkerSyncer(), - modelName, columnsTopologicalOrder, originColumnsIndexes, + modelName, columnsTopologicalOrder, originColumnsIndexes, hasDependencies, generators, rowsCount, ) } @@ -285,7 +305,7 @@ func (t *Task) skipRows() { // generateAndSaveBatch function generate batch of values for selected column and send it to output. 
func (t *Task) generateAndSaveBatch( ctx context.Context, outputSync *common.WorkerSyncer, - modelName string, columnsTopologicalOrder []string, originColumnsIndexes map[string]int, + modelName string, columnsTopologicalOrder []string, originColumnsIndexes map[string]int, hasDependencies bool, generators []*generator.BatchGenerator, count uint64, ) error { defer outputSync.Done(ctx) @@ -297,9 +317,12 @@ func (t *Task) generateAndSaveBatch( } } - for i := range count { - generatedValues := make(map[string]any, len(originColumnsIndexes)) + var rowValues map[string]any + if hasDependencies { + rowValues = make(map[string]any, len(originColumnsIndexes)) + } + for i := range count { for _, columnName := range columnsTopologicalOrder { if common.CtxClosed(ctx) { return &common.ContextCancelError{} @@ -307,13 +330,16 @@ func (t *Task) generateAndSaveBatch( idx := originColumnsIndexes[columnName] - value, err := generators[idx].Value(generatedValues) + value, err := generators[idx].Value(rowValues) if err != nil { return errors.WithMessage(err, "failed to get or generate value") } - generatedValues[columnName] = value batch[i].Values[idx] = value + + if rowValues != nil { + rowValues[columnName] = value + } } } From ff40656e515c49c8e9cac4b44c62389b3ae7975a Mon Sep 17 00:00:00 2001 From: reversetm Date: Mon, 28 Jul 2025 18:29:17 +0300 Subject: [PATCH 05/15] Rebased and updated usage --- doc/en/usage.md | 23 ++++++++++++++++++++--- doc/ru/usage.md | 2 +- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/doc/en/usage.md b/doc/en/usage.md index 3605b33..6c4b103 100644 --- a/doc/en/usage.md +++ b/doc/en/usage.md @@ -158,8 +158,9 @@ Structure `models[*].columns[*].type_params` for data type `string`: - `min_length`: Minimum string length. Default is `1`. - `max_length`: Maximum string length. Default is `32`. - `logical_type`: Logical type of string. Supported values: `first_name`, `last_name`, `phone`, `text`. -- `template`: Template for string generation. 
Symbol `A` - any uppercase letter, symbol `a` - any lowercase letter, - symbol `0` - any digit, symbol `#` - any character. Other characters remain as-is. +- `template`: Jinja-like template for string generation. Allows you to use any fields of the generated model and + specify the pattern of the string using the `pattern` function. Information about the filters and functions + available in template strings is described at the end of this section. - `locale`: Locale for generated strings. Supported values: `ru`, `en`. Default is `en`. - `without_large_letters`: Flag indicating if uppercase letters should be excluded from the string. - `without_small_letters`: Flag indicating if lowercase letters should be excluded from the string. @@ -240,6 +241,18 @@ Structure of `output.params` for `tcs` format: Similar to the structure for the `http` format, except that the `format_template` field is immutable and always set to its default value. +Filters and functions used in template strings: + +Template strings are implemented using the `pongo2` library, you can read +all available filters and functions in the [pongo2](https://github.com/flosch/pongo2) repository. + +In addition, `1` function has been added: + +- pattern: allows you to create a string pattern using special characters. + The `A` symbol is any capital letter, the `a` symbol is any small letter, + symbol `0` is any digit, the `#` symbol is any character, and the other characters remain as they are. + The function is available only in the `template` field of the `string` data type. 
+ #### Examples of data generation configuration Example data model configuration: @@ -305,9 +318,13 @@ models: - name: passport type: string type_params: - template: AA 00 000 000 + template: "{{ pattern('AA 00 000 000') }}" distinct_percentage: 1 ordered: true + - name: email + type: string + type_params: + template: "{{ first_name_en | lower }}.{{ id }}@example.com" - name: rating type: float type_params: diff --git a/doc/ru/usage.md b/doc/ru/usage.md index ea6ed53..8bd62d1 100644 --- a/doc/ru/usage.md +++ b/doc/ru/usage.md @@ -247,7 +247,7 @@ open_ai: Подобна структуре для формата `http`, за исключением того, что поле `format_template` неизменяемое и всегда равняется значению по умолчанию. -Фильтры и функции, используемые в шаблонных строках +Фильтры и функции, используемые в шаблонных строках: Шаблонные строки реализованы с использованием библиотеки `pongo2`, ознакомиться со всеми доступными фильтрами и функциями можно в репозитории [pongo2](https://github.com/flosch/pongo2). From ef8cfdbbc5c07316d5e5c51d021f3008498c9146 Mon Sep 17 00:00:00 2001 From: reversetm Date: Wed, 30 Jul 2025 00:20:07 +0300 Subject: [PATCH 06/15] Replaced pongo2 with templates from standard library. 
--- CHANGELOG.md | 5 +- doc/en/usage.md | 46 ++++++++++------ doc/ru/usage.md | 49 +++++++++++------ go.mod | 1 - go.sum | 4 -- internal/generator/common/utils.go | 8 +-- internal/generator/common/utils_test.go | 11 ++-- .../usecase/general/generator/value/string.go | 54 ++++++++++++++----- internal/generator/usecase/general/task.go | 6 ++- .../usecase/general/test/unit_test.go | 16 +++--- 10 files changed, 125 insertions(+), 75 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eb57aaf..a19ccf5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,12 +9,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- String templates replaced with jinja like +- Field `template` in the `string` data type now not only specifies the pattern, + but also allows you to use the values of any columns of the generated model. ### Breaking changes - The old version of string template in `type_params` of `string` type is no longer supported, - `{{ pattern('pattern_expression') }}` should be used instead. + `{{ "pattern_expression" | pattern }}` should be used instead. ## [0.0.1](https://github.com/tarantool/sdvg/compare/36d0930..0.0.1) - 2025-07-21 diff --git a/doc/en/usage.md b/doc/en/usage.md index 6c4b103..6a36462 100644 --- a/doc/en/usage.md +++ b/doc/en/usage.md @@ -158,8 +158,8 @@ Structure `models[*].columns[*].type_params` for data type `string`: - `min_length`: Minimum string length. Default is `1`. - `max_length`: Maximum string length. Default is `32`. - `logical_type`: Logical type of string. Supported values: `first_name`, `last_name`, `phone`, `text`. -- `template`: Jinja-like template for string generation. Allows you to use any fields of the generated model and - specify the pattern of the string using the `pattern` function. Information about the filters and functions +- `template`: Template for string generation. 
Allows you to use the values of any columns of the generated model and + specify the pattern of the string using the `pattern` function. Information about the functions available in template strings is described at the end of this section. - `locale`: Locale for generated strings. Supported values: `ru`, `en`. Default is `en`. - `without_large_letters`: Flag indicating if uppercase letters should be excluded from the string. @@ -195,15 +195,12 @@ Structure `output.params` for format `http`: - `workers_count`: Number of threads for writing data. Default is `1`. *Experimental field.* - `headers`: HTTP request headers specified as a dictionary. Default is none. - `format_template`: Template-based format for sending data, configured using Golang templates. - Available for use in `format_template`: - - - fields: + There are 2 fields available for use in `format_template`: * `ModelName` - name of the model. * `Rows` - array of records, where each element is a dictionary representing a data row. Dictionary keys correspond to column names, and values correspond to data in those columns. - - functions: - * `len` - returns the length of the given element. - * `json` - converts the given element to a JSON string. + + You can read about the available functions and the use of template strings at the end of this section. Example value for the `format_template` field: @@ -241,17 +238,36 @@ Structure of `output.params` for `tcs` format: Similar to the structure for the `http` format, except that the `format_template` field is immutable and always set to its default value. -Filters and functions used in template strings: +Using Template Strings: + +Template strings are implemented using the standard golang library, you can read about +all its features and available functions in this [documentation](https://pkg.go.dev/text/template).
+ +Accessing Data: -Template strings are implemented using the `pongo2` library, you can read -all available filters and functions in the [pongo2](https://github.com/flosch/pongo2) repository. +In a template, data is accessed using `.`(the object or value passed to the template) +and the field name, for example: `{{ .var }}`. -In addition, `1` function has been added: +Function calls: -- pattern: allows you to create a string pattern using special characters. +- direct call: `{{ upper .name }}`. +- using pipe: `{{ .name | upper }}`. + +In addition to standard functions, the project provides `5` custom functions: + +- `pattern`: allows you to create a string pattern using special characters. The `A` symbol is any capital letter, the `a` symbol is any small letter, symbol `0` is any digit, the `#` symbol is any character, and the other characters remain as they are. The function is available only in the `template` field of the `string` data type. +- `upper`: converts the string to upper case. +- `lower`: converts the string to lower case. +- `len`: returns the length of the element. +- `json`: converts the element to a JSON string. + +Usage restrictions: + +The `pattern`, `lower`, and `upper` functions are available only in the `template` field of the `string` data type. +The `len` and `json` functions are available only in the `format_template` field of the output parameters. 
#### Examples of data generation configuration @@ -318,13 +334,13 @@ models: - name: passport type: string type_params: - template: "{{ pattern('AA 00 000 000') }}" + template: '{{ "AA 00 000 000" | pattern }}' distinct_percentage: 1 ordered: true - name: email type: string type_params: - template: "{{ first_name_en | lower }}.{{ id }}@example.com" + template: "{{ .first_name_en | lower }}.{{ .id }}@example.com" - name: rating type: float type_params: diff --git a/doc/ru/usage.md b/doc/ru/usage.md index 8bd62d1..79b56d8 100644 --- a/doc/ru/usage.md +++ b/doc/ru/usage.md @@ -164,9 +164,9 @@ open_ai: - `min_length`: Минимальная длина строки. По умолчанию `1`. - `max_length`: Максимальная длина строки. По умолчанию `32`. - `logical_type`: Логический тип строки. Поддерживаемые значения: `first_name`, `last_name`, `phone`, `text`. -- `template`: Jinja-подобный шаблон для генерации строки. Позволяет использовать любые поля генерируемой модели и - задавать паттерн строки с помощью функции `pattern`. Информация о фильтрах и функциях, доступных в шаблонных - строках описана в конце данного раздела. +- `template`: Шаблон для генерации строки. Позволяет использовать значения любых столбцов генерируемой модели и + задавать паттерн строки с помощью функции `pattern`. Информация о том, как использовать шаблонные строки, + описана в конце данного раздела. - `locale`: Локаль для генерации строк. Поддерживаемые значения: `ru`, `en`. По умолчанию `en`. - `without_large_letters`: Флаг, указывающий, исключать ли большие буквы из строки. - `without_small_letters`: Флаг, указывающий, исключать ли маленькие буквы из строки. @@ -201,15 +201,12 @@ open_ai: - `workers_count`: Количество потоков для записи данных. По умолчанию `1`. *Является экспериментальным полем.* - `headers`: Заголовки http запроса, указываются в формате словаря. По умолчанию отсутствуют. - `format_template`: Формат отправляемых данных, конфигурируемый с помощью шаблонов Golang.
- Для использования в поле `format_template` доступны: - - - поля: + Для использования в `format_template` доступно 2 поля: * `ModelName` - имя модели. * `Rows` - массив записей, где каждый элемент является словарем, который представляет собой строку данных. Ключи словаря соответствуют названиям столбцов, а значения — данным в этих столбцах. - - функции: - * `len` - возвращает длину переданного элемента. - * `json` - преобразует переданный элемент в JSON строку. + + О доступных функциях и использовании шаблонных строк можно прочитать в конце данного раздела. Пример значения поля `format_template`: @@ -247,17 +244,35 @@ open_ai: Подобна структуре для формата `http`, за исключением того, что поле `format_template` неизменяемое и всегда равняется значению по умолчанию. -Фильтры и функции, используемые в шаблонных строках: +Использование шаблонных строк: + +Шаблонные строки реализованы с использованием стандартной библиотеки golang, ознакомиться +со всеми ее возможностями и доступными функциями можно в данной [документации](https://pkg.go.dev/text/template). + +Доступ к данным: -Шаблонные строки реализованы с использованием библиотеки `pongo2`, ознакомиться -со всеми доступными фильтрами и функциями можно в репозитории [pongo2](https://github.com/flosch/pongo2). +Обращение к данным в шаблоне выполняется с помощью `.`(объект или значение, переданное шаблону) +и имени переменной, например, `{{ .var }}`. -Вдобавок к ним была добавлена 1 функция: +Вызовы функций: -- pattern: позволяет создать паттерн строки при помощи специальных символов. +- прямой вызов: `{{ upper .name }}`. +- с помощью pipe: `{{ .name | upper }}`. + +В проекте помимо стандартных функций доступны `5` пользовательских: + +- `pattern`: позволяет создать паттерн строки при помощи специальных символов. Символ `A` - любая большая буква, символ `a` - любая маленькая буква, символ `0` - любая цифра, символ `#` - любой символ, а остальные символы остаются как есть.
- Функция доступна только в поле `template` типа данных `string`. +- `upper`: преобразует строку в верхний регистр. +- `lower`: преобразует строку в нижний регистр. +- `len`: возвращает длину элемента. +- `json`: преобразует элемент в JSON строку. + +Ограничения по использованию: + +Функции `pattern`, `lower`, и `upper` доступны только в поле `template` типа данных `string`. +Функции `len` и `json` доступны только в поле `format_template` параметров вывода. #### Примеры конфигурации генерации данных @@ -324,13 +339,13 @@ models: - name: passport type: string type_params: - template: "{{ pattern('AA 00 000 000') }}" + template: '{{ "AA 00 000 000" | pattern }}' distinct_percentage: 1 ordered: true - name: email type: string type_params: - template: "{{ first_name_en | lower }}.{{ id }}@example.com" + template: "{{ .first_name_en | lower }}.{{ .id }}@example.com" - name: rating type: float type_params: diff --git a/go.mod b/go.mod index ad9d088..1a3689c 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,6 @@ go 1.23.8 require ( github.com/apache/arrow-go/v18 v18.2.0 github.com/charmbracelet/huh/spinner v0.0.0-20250203114958-f07ae1af69ae - github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3 github.com/google/uuid v1.6.0 github.com/hashicorp/go-retryablehttp v0.7.7 github.com/ilyakaznacheev/cleanenv v1.5.0 diff --git a/go.sum b/go.sum index 8488c13..11b141e 100644 --- a/go.sum +++ b/go.sum @@ -45,8 +45,6 @@ github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6 github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM= github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE= -github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3 h1:fmFk0Wt3bBxxwZnu48jqMdaOR/IZ4vdtJFuaFV8MpIE= -github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3/go.mod 
h1:bJWSKrZyQvfTnb2OudyUjurSG4/edverV7n82+K3JiM= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4= github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= @@ -133,7 +131,6 @@ github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELU github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo= github.com/muesli/termenv v0.15.3-0.20240618155329-98d742f6907a h1:2MaM6YC3mGu54x+RKAA6JiFFHlHDY1UbkxqppT7wYOg= github.com/muesli/termenv v0.15.3-0.20240618155329-98d742f6907a/go.mod h1:hxSnBBYLK21Vtq/PHd0S2FYCxBXzBua8ov5s1RobyRQ= -github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= @@ -271,7 +268,6 @@ google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2 google.golang.org/protobuf v1.36.5 h1:tPhr+woSbjfYvY6/GPufUoYizxw1cF/yFoxJ2fmpwlM= google.golang.org/protobuf v1.36.5/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20200902074654-038fdea0a05b/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= diff --git a/internal/generator/common/utils.go b/internal/generator/common/utils.go index 49f2087..6331038 100644 
--- a/internal/generator/common/utils.go +++ b/internal/generator/common/utils.go @@ -361,7 +361,7 @@ func CtxClosed(ctx context.Context) bool { } func ExtractValuesFromTemplate(template string) []string { - re := regexp.MustCompile(`{{\s*([^\s|(){}]+)[^}]*}}`) + re := regexp.MustCompile(`{{.*?\.([^\s|}]+).*?}}`) matches := re.FindAllStringSubmatch(template, -1) values := make([]string, 0, len(matches)) @@ -387,18 +387,18 @@ func ExtractValuesFromTemplate(template string) []string { func TopologicalSort[T any](items []T, nodeFunc func(T) (string, []string)) ([]string, bool, error) { var ( graph = make(map[string][]string, len(items)) - sortedVertexes = make([]string, 0, len(items)) + sortedVertexes = make([]string, len(items)) hasDependencies bool err error ) - for _, item := range items { + for i, item := range items { name, dependencies := nodeFunc(item) if len(dependencies) > 0 { hasDependencies = true } - sortedVertexes = append(sortedVertexes, name) + sortedVertexes[i] = name graph[name] = dependencies } diff --git a/internal/generator/common/utils_test.go b/internal/generator/common/utils_test.go index 5c4a406..eb8e608 100644 --- a/internal/generator/common/utils_test.go +++ b/internal/generator/common/utils_test.go @@ -713,18 +713,13 @@ func TestExtractValuesFromTemplate(t *testing.T) { }, { name: "Valid template", - template: "{{ foo }}.{{boo}}", + template: "{{ .foo }}.{{.boo}}", expected: []string{"foo", "boo"}, }, - { - name: "Template with filters", - template: "{{ foo | upper | lower }}", - expected: []string{"foo"}, - }, { name: "Template with functions", - template: "{{ upper('foo') | lower }}@{{ boo }}", - expected: []string{"boo"}, + template: "{{ upper .foo | lower }}@{{ .boo }}", + expected: []string{"foo", "boo"}, }, { name: "Invalid template", diff --git a/internal/generator/usecase/general/generator/value/string.go b/internal/generator/usecase/general/generator/value/string.go index 0012b09..713627f 100644 --- 
a/internal/generator/usecase/general/generator/value/string.go +++ b/internal/generator/usecase/general/generator/value/string.go @@ -1,13 +1,16 @@ package value import ( + "bytes" + "fmt" "math" "math/big" "regexp" "slices" "strings" + "sync" + "text/template" - "github.com/flosch/pongo2" "github.com/pkg/errors" "github.com/tarantool/sdvg/internal/generator/common" "github.com/tarantool/sdvg/internal/generator/models" @@ -17,7 +20,8 @@ import ( ) var ( - rePatternVal = regexp.MustCompile(`pattern\((?:'([^']*)'|"([^"]*)")\)`) + rePatternFunc = regexp.MustCompile(`{{\s*pattern\(\s*(?:'([^']*)'|"([^"]*)")\s*\)\s*}}`) + rePatternFilter = regexp.MustCompile(`{{\s*(?:pattern\s+"([^"]+)"|"([^"]+)"\s*\|\s*pattern)\s*}}`) ) // Verify interface compliance in compile time. @@ -27,8 +31,9 @@ var _ Generator = (*StringGenerator)(nil) type StringGenerator struct { *models.ColumnStringParams totalValuesCount uint64 + template *template.Template + bufPool *sync.Pool localeModule locale.LocalModule - template *pongo2.Template charset []rune countByPrefix []float64 sumByPrefix []float64 @@ -38,12 +43,25 @@ type StringGenerator struct { //nolint:cyclop func (g *StringGenerator) Prepare() error { if g.Template != "" { - template, err := pongo2.FromString(g.Template) + tmpl, err := template.New("template"). + Funcs(template.FuncMap{ + "upper": strings.ToUpper, + "lower": strings.ToLower, + "pattern": func(s string) string { + return fmt.Sprintf("{{pattern('%s')}}", s) + }, + }). + Parse(g.Template) if err != nil { return errors.Errorf("failed to parse template: %s", err.Error()) } - g.template = template + g.template = tmpl + g.bufPool = &sync.Pool{ + New: func() any { + return new(bytes.Buffer) + }, + } } switch g.Locale { @@ -188,20 +206,28 @@ func (g *StringGenerator) calculateCompletions(length int) []int64 { } // templateString returns n-th string by template. 
+// +//nolint:forcetypeassert func (g *StringGenerator) templateString(number float64, rowValues map[string]any) (string, error) { - if rowValues == nil { - rowValues = make(map[string]any) - } - - rowValues["pattern"] = func(pattern string) *pongo2.Value { - return pongo2.AsSafeValue(g.patternString(number, pattern)) - } + buf := g.bufPool.Get().(*bytes.Buffer) + buf.Reset() - val, err := g.template.Execute(rowValues) + err := g.template.Execute(buf, rowValues) if err != nil { + g.bufPool.Put(buf) + return "", errors.New(err.Error()) } + val := buf.String() + g.bufPool.Put(buf) + + val = rePatternFunc.ReplaceAllStringFunc(val, func(m string) string { + sub := rePatternFunc.FindStringSubmatch(m) + + return g.patternString(number, sub[1]) + }) + return val, nil } @@ -514,7 +540,7 @@ func (g *StringGenerator) ValuesCount(distinctValuesCountByColumn map[string]uin func (g *StringGenerator) templateCardinality(distinctValuesCountByColumn map[string]uint64) float64 { total := 1.0 - patternValMatches := rePatternVal.FindAllStringSubmatch(g.Template, -1) + patternValMatches := rePatternFilter.FindAllStringSubmatch(g.Template, -1) for _, match := range patternValMatches { pattern := match[1] if pattern == "" { diff --git a/internal/generator/usecase/general/task.go b/internal/generator/usecase/general/task.go index b3bf3a4..026dfc6 100644 --- a/internal/generator/usecase/general/task.go +++ b/internal/generator/usecase/general/task.go @@ -269,7 +269,8 @@ func (t *Task) generateAndSaveValues(ctx context.Context) error { pool.Submit( ctx, outputSyncer.WorkerSyncer(), - modelName, columnsTopologicalOrder, originColumnsIndexes, hasDependencies, + modelName, hasDependencies, + columnsTopologicalOrder, originColumnsIndexes, generators, rowsCount, ) } @@ -305,7 +306,8 @@ func (t *Task) skipRows() { // generateAndSaveBatch function generate batch of values for selected column and send it to output. 
func (t *Task) generateAndSaveBatch( ctx context.Context, outputSync *common.WorkerSyncer, - modelName string, columnsTopologicalOrder []string, originColumnsIndexes map[string]int, hasDependencies bool, + modelName string, hasDependencies bool, + columnsTopologicalOrder []string, originColumnsIndexes map[string]int, generators []*generator.BatchGenerator, count uint64, ) error { defer outputSync.Done(ctx) diff --git a/internal/generator/usecase/general/test/unit_test.go b/internal/generator/usecase/general/test/unit_test.go index cae5433..836f614 100644 --- a/internal/generator/usecase/general/test/unit_test.go +++ b/internal/generator/usecase/general/test/unit_test.go @@ -531,9 +531,9 @@ func TestString(t *testing.T) { {&models.ColumnStringParams{LogicalType: models.LastNameType, MinLength: 4, MaxLength: 7}, 4, 7}, {&models.ColumnStringParams{LogicalType: models.PhoneType, MinLength: 10, MaxLength: 10}, 10, 10}, {&models.ColumnStringParams{MinLength: 100, MaxLength: 100}, 100, 100}, - {&models.ColumnStringParams{Template: "{{ pattern('AAaa00##') }}", Locale: "en"}, 8, 8}, - {&models.ColumnStringParams{Template: "{{ pattern('AAaa00##') }}", Locale: "ru"}, 8, 8}, - {&models.ColumnStringParams{Template: "{{ pattern('0123456789012345678901234567890123456789') }}"}, 40, 40}, + {&models.ColumnStringParams{Template: `{{ pattern "AAaa00##" }}`, Locale: "en"}, 8, 8}, + {&models.ColumnStringParams{Template: `{{ pattern "AAaa00##" }}`, Locale: "ru"}, 8, 8}, + {&models.ColumnStringParams{Template: `{{ pattern "0123456789012345678901234567890123456789" }}`}, 40, 40}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 3, MaxLength: 5}, 3, 5}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 254, MaxLength: 256}, 254, 256}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 510, MaxLength: 512}, 510, 512}, @@ -656,7 +656,7 @@ func TestString(t *testing.T) { { &models.ColumnStringParams{ Locale: "en", - Template: "{{ field 
}}", + Template: "{{ .field }}", }, map[string]uint64{ "field": 11, @@ -666,7 +666,7 @@ func TestString(t *testing.T) { { &models.ColumnStringParams{ Locale: "en", - Template: "{{ pattern('A00') }}", + Template: `{{ pattern "A00" }}`, }, nil, 2600, @@ -674,7 +674,7 @@ func TestString(t *testing.T) { { &models.ColumnStringParams{ Locale: "ru", - Template: "{{ field }}{{ pattern('a0#') }}", + Template: `{{ .field }}{{ pattern "a0#" }}`, }, map[string]uint64{ "field": 10, @@ -707,7 +707,7 @@ func TestString(t *testing.T) { Ranges: []*models.Params{ { TypeParams: &models.ColumnStringParams{ - Template: "{{ id }}.{{ pattern('00') }}@example.com", + Template: `{{ .id }}.{{ pattern "00" }}@example.com`, }, DistinctPercentage: 1, }, @@ -928,7 +928,7 @@ func TestIdempotence(t *testing.T) { Name: "passport", Type: "string", Ranges: []*models.Params{{TypeParams: &models.ColumnStringParams{ - Template: "{{ pattern('AA 00 000 000') }}", + Template: `{{ pattern "AA 00 000 000" }}`, }, NullPercentage: 0.5}}, }, From 925635b0a63eea1ebc4f922964541433fff617cf Mon Sep 17 00:00:00 2001 From: reversetm Date: Wed, 30 Jul 2025 17:06:58 +0300 Subject: [PATCH 07/15] Separated the pattern and template logic, improved performance --- CHANGELOG.md | 8 +- doc/en/usage.md | 18 +-- doc/ru/usage.md | 17 +- internal/generator/common/utils.go | 9 +- internal/generator/models/generator_model.go | 15 ++ .../usecase/general/generator/generator.go | 12 +- .../general/generator/value/datetime.go | 2 +- .../usecase/general/generator/value/enum.go | 2 +- .../usecase/general/generator/value/float.go | 2 +- .../general/generator/value/integer.go | 2 +- .../general/generator/value/interfaces.go | 2 +- .../usecase/general/generator/value/string.go | 103 ++++-------- .../usecase/general/generator/value/uuid.go | 2 +- internal/generator/usecase/general/task.go | 73 ++++----- .../usecase/general/test/unit_test.go | 149 ++---------------- 15 files changed, 128 insertions(+), 288 deletions(-) diff --git 
a/CHANGELOG.md b/CHANGELOG.md index a19ccf5..0aaa1c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,13 +9,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- Field `template` in the `string` data type now not only specifies the pattern, - but also allows you to use the values of any columns of the generated model. +- The `template` field in the `string` data type is now used to generate template strings + with the ability to use the values of any columns of the generated model. ### Breaking changes -- The old version of string template in `type_params` of `string` type is no longer supported, - `{{ "pattern_expression" | pattern }}` should be used instead. +- Using `template` field to specify a string pattern like `Aa0#` is no longer supported, + `pattern` should be used instead. ## [0.0.1](https://github.com/tarantool/sdvg/compare/36d0930..0.0.1) - 2025-07-21 diff --git a/doc/en/usage.md b/doc/en/usage.md index 6a36462..54af81a 100644 --- a/doc/en/usage.md +++ b/doc/en/usage.md @@ -158,9 +158,11 @@ Structure `models[*].columns[*].type_params` for data type `string`: - `min_length`: Minimum string length. Default is `1`. - `max_length`: Maximum string length. Default is `32`. - `logical_type`: Logical type of string. Supported values: `first_name`, `last_name`, `phone`, `text`. -- `template`: Template for string generation. Allows you to use the values of any columns of the generated model and - specify the pattern of the string using the `pattern` function. Information about the functions - available in template strings is described at the end of this section. +- `template`: Template for string generation. Allows you to use the values of any columns of the generated model. + Information about the functions available in template strings is described at the end of this section. + Cannot coexist with `ordered`, `distinct_percentage` or `distinct_count`. +- `pattern`: Pattern for string generation. 
The `A` symbol is any capital letter, the `a` symbol is any small letter, + the `0` symbol is any digit, the `#` symbol is any character, and the other characters remain as they are. - `locale`: Locale for generated strings. Supported values: `ru`, `en`. Default is `en`. - `without_large_letters`: Flag indicating if uppercase letters should be excluded from the string. - `without_small_letters`: Flag indicating if lowercase letters should be excluded from the string. @@ -253,12 +255,8 @@ Function calls: - direct call: `{{ upper .name }}`. - using pipe: `{{ .name | upper }}`. -In addition to standard functions, the project provides `5` custom functions: +In addition to standard functions, the project provides `4` custom functions: -- `pattern`: allows you to create a string pattern using special characters. - The `A` symbol is any capital letter, the `a` symbol is any small letter, - symbol `0` is any digit, the `#` symbol is any character, and the other characters remain as they are. - The function is available only in the `template` field of the `string` data type. - `upper`: converts the string to upper case. - `lower`: converts the string to lower case. - `len`: returns the length of the element. @@ -266,7 +264,7 @@ In addition to standard functions, the project provides `5` custom functions: Usage restrictions: -The `pattern`, `lower`, and `upper` functions are available only in the `template` field of the `string` data type. +The `lower` and `upper` functions are available only in the `template` field of the `string` data type. The `len` and `json` functions are available only in the `format_template` field of the output parameters. 
#### Examples of data generation configuration @@ -334,7 +332,7 @@ models: - name: passport type: string type_params: - template: '{{ "AA 00 000 000" | pattern }}' + pattern: AA 00 000 000 distinct_percentage: 1 ordered: true - name: email diff --git a/doc/ru/usage.md b/doc/ru/usage.md index 79b56d8..b803f90 100644 --- a/doc/ru/usage.md +++ b/doc/ru/usage.md @@ -164,9 +164,11 @@ open_ai: - `min_length`: Минимальная длина строки. По умолчанию `1`. - `max_length`: Максимальная длина строки. По умолчанию `32`. - `logical_type`: Логический тип строки. Поддерживаемые значения: `first_name`, `last_name`, `phone`, `text`. -- `template`: Шаблон для генерации строки. Позволяет использовать значения любых столбов генерируемой модели и - задавать паттерн строки с помощью функции `pattern`. Информация о том, как использовать шаблонные строки, - описана в конце данного раздела. +- `template`: Шаблон для генерации строки. Позволяет использовать значения любых столбцов генерируемой модели. + Информация о том, как использовать шаблонные строки, описана в конце данного раздела. + Не работает совместно с `ordered`, `distinct_percentage` или `distinct_count`. +- `pattern`: Паттерн для генерации строки. Символ `A` - любая большая буква, символ `a` - любая маленькая буква, + символ `0` - любая цифра, символ `#` - любой символ, а остальные символы остаются как есть. - `locale`: Локаль для генерации строк. Поддерживаемые значения: `ru`, `en`. По умолчанию `en`. - `without_large_letters`: Флаг, указывающий, исключать ли большие буквы из строки. - `without_small_letters`: Флаг, указывающий, исключать ли маленькие буквы из строки. @@ -259,11 +261,8 @@ open_ai: - прямой вызов: `{{ upper .name }}`. - с помощью pipe: `{{ .name | upper }}`. -В проекте помимо стандартных функций доступны `5` пользовательских: +В проекте помимо стандартных функций доступны `4` пользовательских: -- `pattern`: позволяет создать паттерн строки при помощи специальных символов. 
- Символ `A` - любая большая буква, символ `a` - любая маленькая буква, - символ `0` - любая цифра, символ `#` - любой символ, а остальные символы остаются как есть. - `upper`: преобразует строку в верхний регистр. - `lower`: преобразует строку в нижний регистр. - `len`: возвращает длину элемента. @@ -271,7 +270,7 @@ open_ai: Ограничения по использованию: -Функции `pattern`, `lower`, и `upper` доступны только в поле `template` типа данных `string`. +Функции `lower` и `upper` доступны только в поле `template` типа данных `string`. Функции `len` и `json` доступны только в поле `format_template` параметров вывода. #### Примеры конфигурации генерации данных @@ -339,7 +338,7 @@ models: - name: passport type: string type_params: - template: '{{ "AA 00 000 000" | pattern }}' + pattern: AA 00 000 000 distinct_percentage: 1 ordered: true - name: email diff --git a/internal/generator/common/utils.go b/internal/generator/common/utils.go index 6331038..523b6d4 100644 --- a/internal/generator/common/utils.go +++ b/internal/generator/common/utils.go @@ -367,14 +367,7 @@ func ExtractValuesFromTemplate(template string) []string { values := make([]string, 0, len(matches)) for _, match := range matches { - expr := match[0] - val := match[1] - - if strings.Contains(expr, "(") && strings.Contains(expr, ")") { - continue - } - - values = append(values, val) + values = append(values, match[1]) } return values diff --git a/internal/generator/models/generator_model.go b/internal/generator/models/generator_model.go index e3c451f..5f88de9 100644 --- a/internal/generator/models/generator_model.go +++ b/internal/generator/models/generator_model.go @@ -426,6 +426,16 @@ func (p *Params) Validate() []error { errs = append(errs, datetimeParamsErrs...) 
} + if p.StringParams != nil && p.StringParams.Template != "" { + if common.Any( + p.Ordered, + p.DistinctPercentage != 0, + p.DistinctCount != 0, + ) { + errs = append(errs, errors.New("forbidden to use string template with distinct params or ordered")) + } + } + // must be called only after parsing, filling defaults and validation of TypeParams. if p.Values != nil { if err := p.PostProcess(); err != nil { @@ -674,6 +684,7 @@ type ColumnStringParams struct { Locale string `backup:"true" json:"locale" yaml:"locale"` LogicalType string `backup:"true" json:"logical_type" yaml:"logical_type"` Template string `backup:"true" json:"template" yaml:"template"` + Pattern string `backup:"true" json:"pattern" yaml:"pattern"` WithoutLargeLetters bool `backup:"true" json:"without_large_letters" yaml:"without_large_letters"` WithoutSmallLetters bool `backup:"true" json:"without_small_letters" yaml:"without_small_letters"` WithoutNumbers bool `backup:"true" json:"without_numbers" yaml:"without_numbers"` @@ -703,6 +714,10 @@ func (p *ColumnStringParams) FillDefaults() { func (p *ColumnStringParams) Validate() []error { var errs []error + if p.Template != "" && p.Pattern != "" { + errs = append(errs, errors.Errorf("forbidden to use template and pattern at the same time")) + } + if p.MinLength > p.MaxLength { errs = append(errs, errors.Errorf( "min length (%v) should be less than or equal to max length (%v)", diff --git a/internal/generator/usecase/general/generator/generator.go b/internal/generator/usecase/general/generator/generator.go index ea52da4..286a259 100644 --- a/internal/generator/usecase/general/generator/generator.go +++ b/internal/generator/usecase/general/generator/generator.go @@ -28,7 +28,7 @@ type ColumnGenerator struct { } func NewColumnGenerator( - baseSeed uint64, distinctValuesCountByColumn map[string]uint64, + baseSeed uint64, modelName string, model *models.Model, column *models.Column, dataModelName string, dataModel *models.Model, dataColumn *models.Column, 
) (*ColumnGenerator, error) { @@ -54,7 +54,7 @@ func NewColumnGenerator( rangeRowsCount := uint64(math.Ceil(float64(rowsCount) * dataRange.RangePercentage)) gen, err := newRangeGenerator( - column, columnSeed, distinctValuesCountByColumn, + column, columnSeed, dataModel, dataColumn, dataColumnSeed, dataRange, rangeRowsOffset, rangeRowsCount, ) @@ -93,7 +93,7 @@ func (cg *ColumnGenerator) SkipRows(count uint64) { //nolint:cyclop func newRangeGenerator( - column *models.Column, columnSeed uint64, distinctValuesCountByColumn map[string]uint64, + column *models.Column, columnSeed uint64, dataModel *models.Model, dataColumn *models.Column, dataColumnSeed uint64, dataRange *models.Params, rangeRowsOffset, rangeRowsCount uint64, ) (*rangeGenerator, error) { @@ -139,7 +139,7 @@ func newRangeGenerator( distinctValuesCount = dataRange.DistinctCount } - generatorValuesCount := valueGenerator.ValuesCount(distinctValuesCountByColumn) + generatorValuesCount := valueGenerator.ValuesCount() if float64(distinctValuesCount) > generatorValuesCount { if dataRange.DistinctPercentage != 0 || dataRange.DistinctCount != 0 { @@ -149,10 +149,6 @@ func newRangeGenerator( distinctValuesCount = uint64(generatorValuesCount) } - if distinctValuesCountByColumn != nil { - distinctValuesCountByColumn[column.Name] += distinctValuesCount - } - rangeOrdered := dataRange.Ordered orderSeed := dataColumnSeed diff --git a/internal/generator/usecase/general/generator/value/datetime.go b/internal/generator/usecase/general/generator/value/datetime.go index 8970354..600e89b 100644 --- a/internal/generator/usecase/general/generator/value/datetime.go +++ b/internal/generator/usecase/general/generator/value/datetime.go @@ -49,7 +49,7 @@ func (g *DateTimeGenerator) Value(number float64, _ map[string]any) (any, error) return value, nil } -func (g *DateTimeGenerator) ValuesCount(_ map[string]uint64) float64 { +func (g *DateTimeGenerator) ValuesCount() float64 { fromSec := g.From.Unix() toSec := g.To.Unix() diff 
--git a/internal/generator/usecase/general/generator/value/enum.go b/internal/generator/usecase/general/generator/value/enum.go index e989b68..18d4413 100644 --- a/internal/generator/usecase/general/generator/value/enum.go +++ b/internal/generator/usecase/general/generator/value/enum.go @@ -37,6 +37,6 @@ func (g *EnumGenerator) Value(number float64, _ map[string]any) (any, error) { return g.Values[idx], nil } -func (g *EnumGenerator) ValuesCount(_ map[string]uint64) float64 { +func (g *EnumGenerator) ValuesCount() float64 { return float64(len(g.Values)) } diff --git a/internal/generator/usecase/general/generator/value/float.go b/internal/generator/usecase/general/generator/value/float.go index 79ba0c5..c1903a4 100644 --- a/internal/generator/usecase/general/generator/value/float.go +++ b/internal/generator/usecase/general/generator/value/float.go @@ -36,6 +36,6 @@ func (g *FloatGenerator) Value(number float64, _ map[string]any) (any, error) { return value, nil } -func (g *FloatGenerator) ValuesCount(_ map[string]uint64) float64 { +func (g *FloatGenerator) ValuesCount() float64 { return math.Inf(1) } diff --git a/internal/generator/usecase/general/generator/value/integer.go b/internal/generator/usecase/general/generator/value/integer.go index 5aefbc1..c83b1ba 100644 --- a/internal/generator/usecase/general/generator/value/integer.go +++ b/internal/generator/usecase/general/generator/value/integer.go @@ -37,6 +37,6 @@ func (g *IntegerGenerator) Value(number float64, _ map[string]any) (any, error) } } -func (g *IntegerGenerator) ValuesCount(_ map[string]uint64) float64 { +func (g *IntegerGenerator) ValuesCount() float64 { return float64(uint64(g.To-g.From)) + 1 } diff --git a/internal/generator/usecase/general/generator/value/interfaces.go b/internal/generator/usecase/general/generator/value/interfaces.go index 7200317..b669412 100644 --- a/internal/generator/usecase/general/generator/value/interfaces.go +++ 
b/internal/generator/usecase/general/generator/value/interfaces.go @@ -9,5 +9,5 @@ type Generator interface { // Value method should return ordered unique value by number Value(number float64, rowValues map[string]any) (any, error) // ValuesCount method should return the number of possible values to generate - ValuesCount(distinctValuesCountByColumn map[string]uint64) float64 + ValuesCount() float64 } diff --git a/internal/generator/usecase/general/generator/value/string.go b/internal/generator/usecase/general/generator/value/string.go index 713627f..280b6b0 100644 --- a/internal/generator/usecase/general/generator/value/string.go +++ b/internal/generator/usecase/general/generator/value/string.go @@ -2,28 +2,20 @@ package value import ( "bytes" - "fmt" "math" "math/big" - "regexp" "slices" "strings" "sync" "text/template" "github.com/pkg/errors" - "github.com/tarantool/sdvg/internal/generator/common" "github.com/tarantool/sdvg/internal/generator/models" "github.com/tarantool/sdvg/internal/generator/usecase/general/locale" "github.com/tarantool/sdvg/internal/generator/usecase/general/locale/en" "github.com/tarantool/sdvg/internal/generator/usecase/general/locale/ru" ) -var ( - rePatternFunc = regexp.MustCompile(`{{\s*pattern\(\s*(?:'([^']*)'|"([^"]*)")\s*\)\s*}}`) - rePatternFilter = regexp.MustCompile(`{{\s*(?:pattern\s+"([^"]+)"|"([^"]+)"\s*\|\s*pattern)\s*}}`) -) - // Verify interface compliance in compile time. var _ Generator = (*StringGenerator)(nil) @@ -44,12 +36,10 @@ type StringGenerator struct { func (g *StringGenerator) Prepare() error { if g.Template != "" { tmpl, err := template.New("template"). + Option("missingkey=error"). Funcs(template.FuncMap{ "upper": strings.ToUpper, "lower": strings.ToLower, - "pattern": func(s string) string { - return fmt.Sprintf("{{pattern('%s')}}", s) - }, }). 
Parse(g.Template) if err != nil { @@ -208,7 +198,7 @@ func (g *StringGenerator) calculateCompletions(length int) []int64 { // templateString returns n-th string by template. // //nolint:forcetypeassert -func (g *StringGenerator) templateString(number float64, rowValues map[string]any) (string, error) { +func (g *StringGenerator) templateString(rowValues map[string]any) (string, error) { buf := g.bufPool.Get().(*bytes.Buffer) buf.Reset() @@ -222,18 +212,12 @@ func (g *StringGenerator) templateString(number float64, rowValues map[string]an val := buf.String() g.bufPool.Put(buf) - val = rePatternFunc.ReplaceAllStringFunc(val, func(m string) string { - sub := rePatternFunc.FindStringSubmatch(m) - - return g.patternString(number, sub[1]) - }) - return val, nil } // patternString returns n-th string by pattern. -func (g *StringGenerator) patternString(number float64, pattern string) string { - val := []rune(pattern) +func (g *StringGenerator) patternString(number float64) string { + val := []rune(g.Pattern) index := number / float64(g.totalValuesCount) for i := range val { @@ -473,14 +457,18 @@ func (g *StringGenerator) simpleString(number float64) string { // Value returns n-th string from range. 
func (g *StringGenerator) Value(number float64, rowValues map[string]any) (any, error) { if g.Template != "" { - val, err := g.templateString(number, rowValues) + val, err := g.templateString(rowValues) if err != nil { - return nil, errors.WithMessage(err, "failed to template string") + return nil, errors.WithMessage(err, "failed to render template string") } return val, nil } + if g.Pattern != "" { + return g.patternString(number), nil + } + switch g.LogicalType { case models.FirstNameType: return g.firstName(number), nil @@ -496,9 +484,31 @@ func (g *StringGenerator) Value(number float64, rowValues map[string]any) (any, } //nolint:cyclop -func (g *StringGenerator) ValuesCount(distinctValuesCountByColumn map[string]uint64) float64 { +func (g *StringGenerator) ValuesCount() float64 { if g.Template != "" { - return g.templateCardinality(distinctValuesCountByColumn) + return 1.0 + } + + if g.Pattern != "" { + total := 1.0 + + if count := strings.Count(g.Pattern, "A"); count > 0 { + total *= math.Pow(float64(len(g.localeModule.LargeLetters())), float64(count)) + } + + if count := strings.Count(g.Pattern, "a"); count > 0 { + total *= math.Pow(float64(len(g.localeModule.SmallLetters())), float64(count)) + } + + if count := strings.Count(g.Pattern, "0"); count > 0 { + total *= math.Pow(float64(len(locale.Numbers)), float64(count)) + } + + if count := strings.Count(g.Pattern, "#"); count > 0 { + total *= math.Pow(float64(len(locale.SpecialChars)), float64(count)) + } + + return total } switch g.LogicalType { @@ -536,48 +546,3 @@ func (g *StringGenerator) ValuesCount(distinctValuesCountByColumn map[string]uin return totalCount } - -func (g *StringGenerator) templateCardinality(distinctValuesCountByColumn map[string]uint64) float64 { - total := 1.0 - - patternValMatches := rePatternFilter.FindAllStringSubmatch(g.Template, -1) - for _, match := range patternValMatches { - pattern := match[1] - if pattern == "" { - pattern = match[2] - } - - total *= 
g.patternCardinality(pattern) - } - - columns := common.ExtractValuesFromTemplate(g.Template) - for _, column := range columns { - if count, ok := distinctValuesCountByColumn[column]; ok && count > 0 { - total *= float64(count) - } - } - - return total -} - -func (g *StringGenerator) patternCardinality(pattern string) float64 { - total := 1.0 - - if count := strings.Count(pattern, "A"); count > 0 { - total *= math.Pow(float64(len(g.localeModule.LargeLetters())), float64(count)) - } - - if count := strings.Count(pattern, "a"); count > 0 { - total *= math.Pow(float64(len(g.localeModule.SmallLetters())), float64(count)) - } - - if count := strings.Count(pattern, "0"); count > 0 { - total *= math.Pow(float64(len(locale.Numbers)), float64(count)) - } - - if count := strings.Count(pattern, "#"); count > 0 { - total *= math.Pow(float64(len(locale.SpecialChars)), float64(count)) - } - - return total -} diff --git a/internal/generator/usecase/general/generator/value/uuid.go b/internal/generator/usecase/general/generator/value/uuid.go index 914e503..ca32580 100644 --- a/internal/generator/usecase/general/generator/value/uuid.go +++ b/internal/generator/usecase/general/generator/value/uuid.go @@ -43,6 +43,6 @@ func (g *UUIDGenerator) Value(number float64, _ map[string]any) (any, error) { return res, nil } -func (g *UUIDGenerator) ValuesCount(_ map[string]uint64) float64 { +func (g *UUIDGenerator) ValuesCount() float64 { return float64(1<<(128-10) - 1) //nolint:mnd } diff --git a/internal/generator/usecase/general/task.go b/internal/generator/usecase/general/task.go index 026dfc6..39f88cb 100644 --- a/internal/generator/usecase/general/task.go +++ b/internal/generator/usecase/general/task.go @@ -84,24 +84,7 @@ func newGenerators(cfg *models.GenerationConfig) (map[string]*generator.ColumnGe generators := make(map[string]*generator.ColumnGenerator) for modelName, model := range cfg.Models { - columnsTopologicalOrder, hasDependencies, err := columnsTopologicalSort(model.Columns) 
- if err != nil { - return nil, errors.WithMessagef(err, "failed to sorting columns by dependencies for model %q", modelName) - } - - originIndexes := make(map[string]int, len(model.Columns)) - for index, column := range model.Columns { - originIndexes[column.Name] = index - } - - var distinctValuesCountByColumn map[string]uint64 - if hasDependencies { - distinctValuesCountByColumn = make(map[string]uint64, len(model.Columns)) - } - - for _, columnName := range columnsTopologicalOrder { - column := model.Columns[originIndexes[columnName]] - + for _, column := range model.Columns { dataModelName := modelName dataModel := model dataColumn := column @@ -115,7 +98,7 @@ func newGenerators(cfg *models.GenerationConfig) (map[string]*generator.ColumnGe columnKey := common.GetKey(modelName, column.Name) gen, err := generator.NewColumnGenerator( - cfg.RandomSeed, distinctValuesCountByColumn, + cfg.RandomSeed, modelName, model, column, dataModelName, dataModel, dataColumn, ) @@ -130,8 +113,8 @@ func newGenerators(cfg *models.GenerationConfig) (map[string]*generator.ColumnGe return generators, nil } -func columnsTopologicalSort(columns []*models.Column) ([]string, bool, error) { - return common.TopologicalSort( +func columnsIdxTopologicalSort(columns []*models.Column) ([]int, bool, error) { + sortedNames, hasDeps, err := common.TopologicalSort( columns, func(c *models.Column) (string, []string) { var deps []string @@ -145,6 +128,21 @@ func columnsTopologicalSort(columns []*models.Column) ([]string, bool, error) { return c.Name, deps }, ) + if err != nil { + return nil, false, err + } + + originColumnsIndexes := make(map[string]int, len(columns)) + for index, column := range columns { + originColumnsIndexes[column.Name] = index + } + + sortedIndexes := make([]int, len(sortedNames)) + for i, columnName := range sortedNames { + sortedIndexes[i] = originColumnsIndexes[columnName] + } + + return sortedIndexes, hasDeps, nil } // RunTask function generates unique values and then all 
values for selected model. @@ -206,7 +204,7 @@ func (t *Task) WaitError() error { // generateAndSaveValues function generates values for all model. // -//nolint:cyclop + func (t *Task) generateAndSaveValues(ctx context.Context) error { var err error @@ -237,16 +235,11 @@ func (t *Task) generateAndSaveValues(ctx context.Context) error { continue } - columnsTopologicalOrder, hasDependencies, err := columnsTopologicalSort(model.Columns) + sortedColumnsIndexes, hasDependencies, err := columnsIdxTopologicalSort(model.Columns) if err != nil { return errors.WithMessagef(err, "failed to sorting columns by dependencies for model %q", modelName) } - originColumnsIndexes := make(map[string]int, len(model.Columns)) - for index, column := range model.Columns { - originColumnsIndexes[column.Name] = index - } - pool.Add(1) go func() { @@ -269,8 +262,8 @@ func (t *Task) generateAndSaveValues(ctx context.Context) error { pool.Submit( ctx, outputSyncer.WorkerSyncer(), - modelName, hasDependencies, - columnsTopologicalOrder, originColumnsIndexes, + model, hasDependencies, + sortedColumnsIndexes, generators, rowsCount, ) } @@ -306,8 +299,8 @@ func (t *Task) skipRows() { // generateAndSaveBatch function generate batch of values for selected column and send it to output. 
func (t *Task) generateAndSaveBatch( ctx context.Context, outputSync *common.WorkerSyncer, - modelName string, hasDependencies bool, - columnsTopologicalOrder []string, originColumnsIndexes map[string]int, + model *models.Model, hasDependencies bool, + columnsIndexesTopologicalOrder []int, generators []*generator.BatchGenerator, count uint64, ) error { defer outputSync.Done(ctx) @@ -321,38 +314,36 @@ func (t *Task) generateAndSaveBatch( var rowValues map[string]any if hasDependencies { - rowValues = make(map[string]any, len(originColumnsIndexes)) + rowValues = make(map[string]any, len(generators)) } for i := range count { - for _, columnName := range columnsTopologicalOrder { + for _, columnIdx := range columnsIndexesTopologicalOrder { if common.CtxClosed(ctx) { return &common.ContextCancelError{} } - idx := originColumnsIndexes[columnName] - - value, err := generators[idx].Value(rowValues) + value, err := generators[columnIdx].Value(rowValues) if err != nil { return errors.WithMessage(err, "failed to get or generate value") } - batch[i].Values[idx] = value + batch[i].Values[columnIdx] = value if rowValues != nil { - rowValues[columnName] = value + rowValues[model.Columns[columnIdx].Name] = value } } } outputSync.WaitPrevious(ctx) - err := t.output.HandleRowsBatch(ctx, modelName, batch) + err := t.output.HandleRowsBatch(ctx, model.Name, batch) if err != nil { return errors.WithMessage(err, "failed to save batch to output") } - t.progress.Add(modelName, count) + t.progress.Add(model.Name, count) return nil } diff --git a/internal/generator/usecase/general/test/unit_test.go b/internal/generator/usecase/general/test/unit_test.go index 836f614..0243f2e 100644 --- a/internal/generator/usecase/general/test/unit_test.go +++ b/internal/generator/usecase/general/test/unit_test.go @@ -200,52 +200,15 @@ func checkDistinct(t *testing.T, column *models.Column) { } } -func checkValuesCount( - t *testing.T, - gen value.Generator, - valuesCountByColumn map[string]uint64, 
expectedValueCount float64, -) { +func checkValuesCount(t *testing.T, gen value.Generator, expectedValueCount float64) { t.Helper() require.NoError(t, gen.Prepare()) - valuesCount := gen.ValuesCount(valuesCountByColumn) + valuesCount := gen.ValuesCount() require.Equal(t, uint64(expectedValueCount), uint64(valuesCount)) } -func checkPossibleToGenerate(t *testing.T, columns []*models.Column, rowsCount uint64, wantErr bool) { - t.Helper() - - copyColumns := make([]*models.Column, 0, len(columns)) - for _, column := range columns { - copyColumns = append(copyColumns, deepColumnCopy(column)) - } - - cfg := getCfg(t, map[string]*models.Model{ - "test": { - RowsCount: rowsCount, - Columns: copyColumns, - }, - }) - - outputHandler := func(_ context.Context, _ string, _ []*models.DataRow) error { return nil } - - out := outputMock.NewOutput(outputHandler) - uc := usecaseGeneral.NewUseCase(usecaseGeneral.UseCaseConfig{}) - - taskID, err := uc.CreateTask( - context.Background(), - usecase.TaskConfig{ - GenerationConfig: &cfg, - Output: out, - }, - ) - - require.Equal(t, wantErr, err != nil) - err = uc.WaitResult(taskID) - require.Equal(t, wantErr, err != nil) -} - func checkForeignKey(t *testing.T, column *models.Column, nullPercentage float64, foreignOrdered bool) { t.Helper() @@ -430,7 +393,7 @@ func TestInteger(t *testing.T) { for _, testCase := range checkValuesCountCases { generator := &value.IntegerGenerator{ColumnIntegerParams: testCase.typeParams} - checkValuesCount(t, generator, nil, testCase.expected) + checkValuesCount(t, generator, testCase.expected) } } @@ -509,7 +472,7 @@ func TestFloat(t *testing.T) { for _, testCase := range checkValuesCountCases { generator := &value.FloatGenerator{ColumnFloatParams: testCase.typeParams} - checkValuesCount(t, generator, nil, testCase.expected) + checkValuesCount(t, generator, testCase.expected) } } @@ -531,9 +494,9 @@ func TestString(t *testing.T) { {&models.ColumnStringParams{LogicalType: models.LastNameType, MinLength: 4, 
MaxLength: 7}, 4, 7}, {&models.ColumnStringParams{LogicalType: models.PhoneType, MinLength: 10, MaxLength: 10}, 10, 10}, {&models.ColumnStringParams{MinLength: 100, MaxLength: 100}, 100, 100}, - {&models.ColumnStringParams{Template: `{{ pattern "AAaa00##" }}`, Locale: "en"}, 8, 8}, - {&models.ColumnStringParams{Template: `{{ pattern "AAaa00##" }}`, Locale: "ru"}, 8, 8}, - {&models.ColumnStringParams{Template: `{{ pattern "0123456789012345678901234567890123456789" }}`}, 40, 40}, + {&models.ColumnStringParams{Pattern: "AAaa00##", Locale: "en"}, 8, 8}, + {&models.ColumnStringParams{Pattern: "AAaa00##", Locale: "ru"}, 8, 8}, + {&models.ColumnStringParams{Pattern: "0123456789012345678901234567890123456789"}, 40, 40}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 3, MaxLength: 5}, 3, 5}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 254, MaxLength: 256}, 254, 256}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 510, MaxLength: 512}, 510, 512}, @@ -563,9 +526,8 @@ func TestString(t *testing.T) { } checkValuesCountCases := []struct { - typeParams *models.ColumnStringParams - distinctValuesCountByColumn map[string]uint64 - expected float64 + typeParams *models.ColumnStringParams + expected float64 }{ { &models.ColumnStringParams{ @@ -575,7 +537,6 @@ func TestString(t *testing.T) { WithoutNumbers: true, WithoutSpecialChars: true, }, - nil, 52, }, { @@ -586,7 +547,6 @@ func TestString(t *testing.T) { WithoutNumbers: true, WithoutSpecialChars: true, }, - nil, 66.0, }, { @@ -597,7 +557,6 @@ func TestString(t *testing.T) { WithoutNumbers: true, WithoutSpecialChars: true, }, - nil, 1048229968448, }, { @@ -608,7 +567,6 @@ func TestString(t *testing.T) { WithoutNumbers: true, WithoutSpecialChars: true, }, - nil, 24128259706319868, }, { @@ -620,7 +578,6 @@ func TestString(t *testing.T) { WithoutSmallLetters: true, WithoutSpecialChars: true, }, - nil, 1111111111111110000000000, }, { @@ -632,7 +589,6 @@ func 
TestString(t *testing.T) { WithoutSmallLetters: true, WithoutNumbers: true, }, - nil, 81870575520, }, { @@ -641,7 +597,6 @@ func TestString(t *testing.T) { MaxLength: 15, Locale: "en", }, - nil, 88394150280794134360488281250, }, { @@ -650,7 +605,6 @@ func TestString(t *testing.T) { MaxLength: 15, Locale: "ru", }, - nil, 868834460299970670989801640300, }, { @@ -658,91 +612,20 @@ func TestString(t *testing.T) { Locale: "en", Template: "{{ .field }}", }, - map[string]uint64{ - "field": 11, - }, - 11, + 1, }, { &models.ColumnStringParams{ - Locale: "en", - Template: `{{ pattern "A00" }}`, + Locale: "en", + Pattern: "A00", }, - nil, 2600, }, - { - &models.ColumnStringParams{ - Locale: "ru", - Template: `{{ .field }}{{ pattern "a0#" }}`, - }, - map[string]uint64{ - "field": 10, - }, - 75900, - }, } for _, testCase := range checkValuesCountCases { generator := &value.StringGenerator{ColumnStringParams: testCase.typeParams} - checkValuesCount(t, generator, testCase.distinctValuesCountByColumn, testCase.expected) - } - - idColumn := &models.Column{ - Name: "id", - Type: "integer", - Ranges: []*models.Params{ - { - TypeParams: &models.ColumnIntegerParams{ - FromPtr: int64Ptr(1), - ToPtr: int64Ptr(5), - }, - }, - }, - } - - emailColumn := &models.Column{ - Name: "email", - Type: "string", - Ranges: []*models.Params{ - { - TypeParams: &models.ColumnStringParams{ - Template: `{{ .id }}.{{ pattern "00" }}@example.com`, - }, - DistinctPercentage: 1, - }, - }, - } - - checkPossibleToGenerateCases := []struct { - columns []*models.Column - rowsCount uint64 - wantErr bool - }{ - { - columns: []*models.Column{idColumn, emailColumn}, - rowsCount: 500, - wantErr: false, - }, - { - columns: []*models.Column{emailColumn, idColumn}, - rowsCount: 500, - wantErr: false, - }, - { - columns: []*models.Column{idColumn, emailColumn}, - rowsCount: 501, - wantErr: true, - }, - { - columns: []*models.Column{emailColumn, idColumn}, - rowsCount: 501, - wantErr: true, - }, - } - - for _, testCase := 
range checkPossibleToGenerateCases { - checkPossibleToGenerate(t, testCase.columns, testCase.rowsCount, testCase.wantErr) + checkValuesCount(t, generator, testCase.expected) } } @@ -751,7 +634,7 @@ func TestUUID(t *testing.T) { checkType(t, column, uuid.UUID{}) checkDistinct(t, column) checkForeignKeyCases(t, column) - checkValuesCount(t, &value.UUIDGenerator{}, nil, float64(1<<(128-10)-1)) + checkValuesCount(t, &value.UUIDGenerator{}, float64(1<<(128-10)-1)) } func TestDateTime(t *testing.T) { @@ -834,7 +717,7 @@ func TestDateTime(t *testing.T) { for _, testCase := range checkValuesCountCases { generator := &value.DateTimeGenerator{ColumnDateTimeParams: testCase.typeParams} - checkValuesCount(t, generator, nil, testCase.expected) + checkValuesCount(t, generator, testCase.expected) } } @@ -928,7 +811,7 @@ func TestIdempotence(t *testing.T) { Name: "passport", Type: "string", Ranges: []*models.Params{{TypeParams: &models.ColumnStringParams{ - Template: `{{ pattern "AA 00 000 000" }}`, + Pattern: "AA 00 000 000", }, NullPercentage: 0.5}}, }, From 4dfd8acc621c33605c321cb103678a7094909f9a Mon Sep 17 00:00:00 2001 From: reversetm Date: Wed, 30 Jul 2025 17:23:12 +0300 Subject: [PATCH 08/15] Updated default generation config --- config/models.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/config/models.yml b/config/models.yml index 7bc502c..e382e34 100644 --- a/config/models.yml +++ b/config/models.yml @@ -62,9 +62,13 @@ models: - name: passport type: string type_params: - template: "{{ pattern('AA 00 000 000') }}" + pattern: AA 00 000 000 distinct_percentage: 1 ordered: true + - name: email + type: string + type_params: + template: "{{ .first_name_en | lower }}.{{ .id }}@email.com" - name: created type: datetime type_params: From 63e79a317cff4e587c7b0b0fb13b0f5d5cf10005 Mon Sep 17 00:00:00 2001 From: reversetm Date: Wed, 30 Jul 2025 17:30:46 +0300 Subject: [PATCH 09/15] Fixed usage --- doc/en/usage.md | 4 ++-- doc/ru/usage.md | 2 +- 
internal/generator/models/generator_model.go | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/doc/en/usage.md b/doc/en/usage.md index 54af81a..fdbc666 100644 --- a/doc/en/usage.md +++ b/doc/en/usage.md @@ -160,7 +160,7 @@ Structure `models[*].columns[*].type_params` for data type `string`: - `logical_type`: Logical type of string. Supported values: `first_name`, `last_name`, `phone`, `text`. - `template`: Template for string generation. Allows you to use the values of any columns of the generated model. Information about the functions available in template strings is described at the end of this section. - Cannot coexist with `ordered`, `distinct_percentage` or `distinct_count`. + Cannot coexist with `ordered`, `distinct_percentage` and `distinct_count`. - `pattern`: Pattern for string generation. The `A` symbol is any capital letter, the `a` symbol is any small letter, symbol `0` is any digit, the `#` symbol is any character, and the other characters remain as they are. - `locale`: Locale for generated strings. Supported values: `ru`, `en`. Default is `en`. @@ -240,7 +240,7 @@ Structure of `output.params` for `tcs` format: Similar to the structure for the `http` format, except that the `format_template` field is immutable and always set to its default value. -Using Template Strings:: +Using Template Strings: Template strings are implemented using the standard golang library, you can read about all its features and available functions in this [documentation](https://pkg.go.dev/text/template). diff --git a/doc/ru/usage.md b/doc/ru/usage.md index b803f90..6d5d642 100644 --- a/doc/ru/usage.md +++ b/doc/ru/usage.md @@ -166,7 +166,7 @@ open_ai: - `logical_type`: Логический тип строки. Поддерживаемые значения: `first_name`, `last_name`, `phone`, `text`. - `template`: Шаблон для генерации строки. Позволяет использовать значения любых столбов генерируемой модели. 
Информация о том, как использовать шаблонные строки, описана в конце данного раздела. - Не работает совместно с `ordered`, `distinct_percentage` или `distinct_count`. + Не работает совместно с `ordered`, `distinct_percentage` и `distinct_count`. - `pattern`: Паттерн для генерации строки. Символ `A` - любая большая буква, символ `a` - любая маленькая буква, символ `0` - любая цифра, символ `#` - любой символ, а остальные символы остаются как есть. - `locale`: Локаль для генерации строк. Поддерживаемые значения: `ru`, `en`. По умолчанию `en`. diff --git a/internal/generator/models/generator_model.go b/internal/generator/models/generator_model.go index 5f88de9..e32f6ff 100644 --- a/internal/generator/models/generator_model.go +++ b/internal/generator/models/generator_model.go @@ -427,12 +427,12 @@ func (p *Params) Validate() []error { } if p.StringParams != nil && p.StringParams.Template != "" { - if common.Any( - p.Ordered, - p.DistinctPercentage != 0, - p.DistinctCount != 0, - ) { - errs = append(errs, errors.New("forbidden to use string template with distinct params or ordered")) + if p.Ordered { + errs = append(errs, errors.New("forbidden to use string template with ordered")) + } + + if common.Any(p.DistinctPercentage != 0, p.DistinctCount != 0) { + errs = append(errs, errors.New("forbidden to use string template with distinct params")) } } From c10ef0f4b1ea922c4413bf6ff38d8c1129c8d07e Mon Sep 17 00:00:00 2001 From: reversetm Date: Tue, 5 Aug 2025 17:09:44 +0300 Subject: [PATCH 10/15] Replaced regexp in ExtractValuesFromTemplate function and updated usage --- doc/en/usage.md | 14 +++++++++++++- doc/ru/usage.md | 14 +++++++++++++- internal/generator/common/utils.go | 17 +++++++++++++---- internal/generator/common/utils_test.go | 19 +++++++++++++++++-- .../usecase/general/generator/value/string.go | 3 +++ 5 files changed, 59 insertions(+), 8 deletions(-) diff --git a/doc/en/usage.md b/doc/en/usage.md index fdbc666..ef0becb 100644 --- a/doc/en/usage.md +++ 
b/doc/en/usage.md @@ -247,9 +247,21 @@ all its features and available functions in this [documentation](https://pkg.go. Accessing Data: -In a template, data is accessed using `.`(the object or value passed to the template) +In a template, data is accessed using `.` (the object or value passed to the template) and the field name, for example: `{{ .var }}`. +> **Important**: only the following characters are allowed in variable names: +> +> - letters from any alphabet (Unicode category L*). +> - numbers (Unicode category Nd), but not as the first character. +> - the underscore character `_`. +> +> Any other characters — spaces, periods, hyphens, quotation marks, punctuation marks, + etc. — cannot be used in the name itself. +> +> If you need to access a variable whose name is considered invalid, + refer to it using the `index` function, specifying the variable name in quotation marks, for example, `{{ index . "field-with-dash" }}`. + Function calls: - direct call: `{{ upper .name }}`. diff --git a/doc/ru/usage.md b/doc/ru/usage.md index 6d5d642..c293c66 100644 --- a/doc/ru/usage.md +++ b/doc/ru/usage.md @@ -253,9 +253,21 @@ open_ai: Доступ к данным: -Обращение к данным в шаблоне выполняется с помощью `.`(объект или значение, переданное шаблону) +Обращение к данным в шаблоне выполняется с помощью `.` (объект или значение, переданное шаблону) и имени переменной, например, `{{ .var }}`. +> **Важно**: в именах переменных допустимы только следующие символы: +> +> - буквы любого алфавита (Unicode-категории L*). +> - цифры (Unicode-категории Nd), но не первым символом. +> - знак подчёркивания `_`. +> +> Любые другие символы — пробелы, точки, дефисы, кавычки, символы пунктуации + и т.д. — в самом имени использовать нельзя. +> +> Если вам необходимо получить доступ к переменной, имя которой считается недопустимым, + обращайтесь к ней через функцию `index`, указав имя переменной в кавычках, например `{{ index . "field-with-dash" }}`. 
+ Вызовы функций: - прямой вызов: `{{ upper .name }}`. diff --git a/internal/generator/common/utils.go b/internal/generator/common/utils.go index 523b6d4..86e5d36 100644 --- a/internal/generator/common/utils.go +++ b/internal/generator/common/utils.go @@ -361,12 +361,21 @@ func CtxClosed(ctx context.Context) bool { } func ExtractValuesFromTemplate(template string) []string { - re := regexp.MustCompile(`{{.*?\.([^\s|}]+).*?}}`) - matches := re.FindAllStringSubmatch(template, -1) + // regexp for templates like {{ .name }} + reField := regexp.MustCompile(`{{\s*(?:\w+\s+)?\.([^\s"|}]+).*?}}`) + matchesField := reField.FindAllStringSubmatch(template, -1) - values := make([]string, 0, len(matches)) + // regexp for templates like {{ index . "name-with-specific-symbols" }} + reMapKey := regexp.MustCompile(`{{\s*index\s+\.\s+"([^"]+)".*?}}`) + matchesMapKeys := reMapKey.FindAllStringSubmatch(template, -1) - for _, match := range matches { + values := make([]string, 0, len(matchesField)+len(matchesMapKeys)) + + for _, match := range matchesField { + values = append(values, match[1]) + } + + for _, match := range matchesMapKeys { values = append(values, match[1]) } diff --git a/internal/generator/common/utils_test.go b/internal/generator/common/utils_test.go index eb8e608..de86111 100644 --- a/internal/generator/common/utils_test.go +++ b/internal/generator/common/utils_test.go @@ -718,8 +718,23 @@ func TestExtractValuesFromTemplate(t *testing.T) { }, { name: "Template with functions", - template: "{{ upper .foo | lower }}@{{ .boo }}", - expected: []string{"foo", "boo"}, + template: "{{ upper .foo | lower }}", + expected: []string{"foo"}, + }, + { + name: "Template with index function", + template: `{{ index . foo }}.{{ index . "boo" }}`, + expected: []string{"boo"}, + }, + { + name: "Mixed template", + template: `{{ index . 
"foo" }}.{{ lower .boo }}|{{.coo}}`, + expected: []string{"boo", "coo", "foo"}, + }, + { + name: "Field name in quotation marks", + template: `{{ ."foo" }}`, + expected: []string{}, }, { name: "Invalid template", diff --git a/internal/generator/usecase/general/generator/value/string.go b/internal/generator/usecase/general/generator/value/string.go index 280b6b0..93eb99f 100644 --- a/internal/generator/usecase/general/generator/value/string.go +++ b/internal/generator/usecase/general/generator/value/string.go @@ -486,6 +486,9 @@ func (g *StringGenerator) Value(number float64, rowValues map[string]any) (any, //nolint:cyclop func (g *StringGenerator) ValuesCount() float64 { if g.Template != "" { + // Using `distinct` or `ordered` parameters with templates + // is not possible, we cannot guarantee that these parameters + // will be met, so we just need to return something other than 0. return 1.0 } From 0a3c62cb6df10ba9b31bd2cabb55b5b13959d99d Mon Sep 17 00:00:00 2001 From: reversetm Date: Thu, 14 Aug 2025 18:21:02 +0300 Subject: [PATCH 11/15] Add isbn, base64, base64URL, base64RawURL, hex, credit_card logical types --- internal/generator/models/generator_model.go | 45 +- .../usecase/general/generator/value/string.go | 688 ++++++++++++++++-- .../usecase/general/generator/value/utils.go | 13 + .../usecase/general/locale/consts.go | 7 +- 4 files changed, 682 insertions(+), 71 deletions(-) diff --git a/internal/generator/models/generator_model.go b/internal/generator/models/generator_model.go index e32f6ff..78d773a 100644 --- a/internal/generator/models/generator_model.go +++ b/internal/generator/models/generator_model.go @@ -14,10 +14,18 @@ import ( ) const ( - FirstNameType = "first_name" - LastNameType = "last_name" - PhoneType = "phone" - TextType = "text" + SimpleStringType = "simple_string" + FirstNameType = "first_name" + LastNameType = "last_name" + PhoneType = "phone" + TextType = "text" + HexType = "hex" + Ipv4Type = "ipv4" + Base64Type = "base64" + 
Base64URLType = "base64_url" + Base64RawURLType = "base64_raw_url" + CreditCardType = "credit_card" + IsbnType = "isbn" ) // Model type is used to describe model of generated data. @@ -691,7 +699,17 @@ type ColumnStringParams struct { WithoutSpecialChars bool `backup:"true" json:"without_special_chars" yaml:"without_special_chars"` } -func (p *ColumnStringParams) Parse() error { return nil } +func (p *ColumnStringParams) Parse() error { + if p.LogicalType == "" && p.Template == "" && p.Pattern == "" { + p.LogicalType = SimpleStringType + } + + if p.LogicalType == CreditCardType { + p.Pattern = "0000 0000 0000 0000" + } + + return nil +} func (p *ColumnStringParams) FillDefaults() { if p.MinLength == 0 { @@ -729,7 +747,22 @@ func (p *ColumnStringParams) Validate() []error { errs = append(errs, errors.Errorf("unknown locale: %s", p.Locale)) } - if !slices.Contains([]string{"", FirstNameType, LastNameType, PhoneType, TextType}, p.LogicalType) { + logicalTypes := []string{ + SimpleStringType, + FirstNameType, + LastNameType, + PhoneType, + TextType, + HexType, + Ipv4Type, + Base64Type, + Base64URLType, + Base64RawURLType, + CreditCardType, + IsbnType, + } + + if !slices.Contains(logicalTypes, p.LogicalType) { errs = append(errs, errors.Errorf("unknown logical type: %s", p.LogicalType)) } diff --git a/internal/generator/usecase/general/generator/value/string.go b/internal/generator/usecase/general/generator/value/string.go index 93eb99f..e2e0835 100644 --- a/internal/generator/usecase/general/generator/value/string.go +++ b/internal/generator/usecase/general/generator/value/string.go @@ -2,9 +2,13 @@ package value import ( "bytes" + "encoding/binary" + "fmt" "math" "math/big" "slices" + "sort" + "strconv" "strings" "sync" "text/template" @@ -16,6 +20,8 @@ import ( "github.com/tarantool/sdvg/internal/generator/usecase/general/locale/ru" ) +type prepareFunc func() error + // Verify interface compliance in compile time. 
var _ Generator = (*StringGenerator)(nil) @@ -29,31 +35,58 @@ type StringGenerator struct { charset []rune countByPrefix []float64 sumByPrefix []float64 - completions []int64 // completions[i] stores the number of ways to form a text of length i + completions []int64 // completions[i] stores the number of ways to form a text of length i + lexOrderedOctets []string // precomputed lexicographically ordered IPv4 octets + powersOfTen []uint64 // precomputed powers of ten for ISBN generation + base64Endings []string // precomputed Base64 endings + base64URLEndings []string } //nolint:cyclop func (g *StringGenerator) Prepare() error { - if g.Template != "" { - tmpl, err := template.New("template"). - Option("missingkey=error"). - Funcs(template.FuncMap{ - "upper": strings.ToUpper, - "lower": strings.ToLower, - }). - Parse(g.Template) - if err != nil { - return errors.Errorf("failed to parse template: %s", err.Error()) - } + prepareFuncs := []prepareFunc{ + g.prepareTemplate, + g.prepareLocaleModule, + g.prepareCharset, + g.prepareLogicalType, + } - g.template = tmpl - g.bufPool = &sync.Pool{ - New: func() any { - return new(bytes.Buffer) - }, + for _, fn := range prepareFuncs { + if err := fn(); err != nil { + return err } } + return nil +} + +func (g *StringGenerator) prepareTemplate() error { + if g.Template == "" { + return nil + } + + tmpl, err := template.New("template"). + Option("missingkey=error"). + Funcs(template.FuncMap{ + "upper": strings.ToUpper, + "lower": strings.ToLower, + }). 
+ Parse(g.Template) + if err != nil { + return errors.Errorf("failed to parse template: %s", err.Error()) + } + + g.template = tmpl + g.bufPool = &sync.Pool{ + New: func() any { + return new(bytes.Buffer) + }, + } + + return nil +} + +func (g *StringGenerator) prepareLocaleModule() error { switch g.Locale { case "ru": g.localeModule = ru.NewLocaleModule(g.LogicalType, g.MinLength, g.MaxLength) @@ -63,6 +96,46 @@ func (g *StringGenerator) Prepare() error { return errors.Errorf("unknown locale: %q", g.Locale) } + return nil +} + +func (g *StringGenerator) prepareCharset() error { + switch g.LogicalType { + case models.Base64Type: + g.charset = locale.Base64Charset + + case models.Base64URLType, models.Base64RawURLType: + g.charset = locale.Base64URLCharset + + case models.HexType: + g.charset = locale.HexCharset + + default: + g.charset = make([]rune, 0) + + if !g.WithoutLargeLetters { + g.charset = append(g.charset, g.localeModule.LargeLetters()...) + } + + if !g.WithoutSmallLetters { + g.charset = append(g.charset, g.localeModule.SmallLetters()...) + } + + if !g.WithoutNumbers { + g.charset = append(g.charset, locale.Numbers...) + } + + if !g.WithoutSpecialChars { + g.charset = append(g.charset, locale.SpecialChars...) 
+ } + } + + slices.Sort(g.charset) + + return nil +} + +func (g *StringGenerator) prepareLogicalType() error { switch g.LogicalType { case models.FirstNameType: if len(g.localeModule.GetFirstNames(locale.MaleGender)) == 0 { @@ -72,6 +145,7 @@ func (g *StringGenerator) Prepare() error { if len(g.localeModule.GetFirstNames(locale.FemaleGender)) == 0 { return errors.Errorf("no female first names with length between %v and %v", g.MinLength, g.MaxLength) } + case models.LastNameType: if len(g.localeModule.GetLastNames(locale.MaleGender)) == 0 { return errors.Errorf("no male last names with length between %v and %v", g.MinLength, g.MaxLength) @@ -80,34 +154,25 @@ func (g *StringGenerator) Prepare() error { if len(g.localeModule.GetLastNames(locale.FemaleGender)) == 0 { return errors.Errorf("no female last names with length between %v and %v", g.MinLength, g.MaxLength) } + case models.PhoneType: if len(g.localeModule.GetPhonePatterns()) == 0 { return errors.Errorf("no phone patterns with length between %v and %v", g.MinLength, g.MaxLength) } - } - - g.charset = make([]rune, 0) - - if !g.WithoutLargeLetters { - g.charset = append(g.charset, g.localeModule.LargeLetters()...) - } - - if !g.WithoutSmallLetters { - g.charset = append(g.charset, g.localeModule.SmallLetters()...) - } - if !g.WithoutNumbers { - g.charset = append(g.charset, locale.Numbers...) - } + case models.TextType: + g.calculateCompletions(g.MaxLength + 1) - if !g.WithoutSpecialChars { - g.charset = append(g.charset, locale.SpecialChars...) 
- } + case models.Ipv4Type: + g.generateSortedOctets() - slices.Sort(g.charset) + case models.IsbnType: + g.calculatePowersOfTen() - if g.LogicalType == models.TextType { - g.completions = g.calculateCompletions(g.MaxLength + 1) + case models.Base64Type: + g.generateBase64SortedEndings() + case models.Base64URLType: + g.generateBase64URLSortedEndings() } return nil @@ -116,42 +181,140 @@ func (g *StringGenerator) Prepare() error { func (g *StringGenerator) SetTotalCount(totalValuesCount uint64) error { g.totalValuesCount = totalValuesCount - if g.LogicalType == "" && g.Template == "" { + if g.LogicalType == models.SimpleStringType || g.LogicalType == models.Base64Type || + g.LogicalType == models.Base64URLType || g.LogicalType == models.Base64RawURLType || + g.LogicalType == models.HexType { + tailCount, allowedLength, prefixLength := g.lexicographicRules() + charsetLength := float64(len(g.charset)) + + var allowedCount int + + for length := g.MinLength; length <= g.MaxLength; length++ { + if allowedLength(length) { + allowedCount++ + } + } + countByLength := make([]float64, g.MaxLength+1) - avgRangeCount := math.Ceil(float64(totalValuesCount) / float64(g.MaxLength-g.MinLength+1)) + avgRangeCount := math.Ceil(float64(totalValuesCount) / float64(allowedCount)) for length := g.MinLength; length <= g.MaxLength; length++ { - rangeCount := math.Pow(float64(len(g.charset)), float64(length)) + if !allowedLength(length) { + continue + } + + rangeCount := float64(tailCount) * math.Pow(charsetLength, float64(prefixLength(length))) - var currentLenCount float64 + var currentLengthCount float64 if avgRangeCount > rangeCount { - currentLenCount = rangeCount - avgRangeCount += (avgRangeCount - rangeCount) / float64(g.MaxLength-length) + currentLengthCount = rangeCount + + remainAllowed := 0 + for x := length + 1; x <= g.MaxLength; x++ { + if allowedLength(x) { + remainAllowed++ + } + } + + if remainAllowed > 0 { + avgRangeCount += (avgRangeCount - rangeCount) / 
float64(remainAllowed) + } } else { - currentLenCount = math.Ceil(avgRangeCount) + currentLengthCount = math.Ceil(avgRangeCount) } - - countByLength[length] = currentLenCount + countByLength[length] = currentLengthCount } g.countByPrefix = make([]float64, g.MaxLength+1) - g.sumByPrefix = make([]float64, g.MaxLength+1) + g.sumByPrefix = make([]float64, g.MaxLength+2) for prefix := 0; prefix <= g.MaxLength; prefix++ { - prefixDivider := math.Pow(float64(len(g.charset)), float64(prefix)) - g.countByPrefix[prefix] = countByLength[prefix] / prefixDivider + prefixDivider := math.Pow(charsetLength, float64(prefix)) + nextPrefixDivider := prefixDivider * charsetLength + + var endNow float64 + for length := g.MinLength; length <= g.MaxLength; length++ { + if allowedLength(length) && prefixLength(length) == prefix { + endNow += countByLength[length] / prefixDivider + } + } + g.countByPrefix[prefix] = endNow - for length := 0; length <= g.MaxLength-prefix; length++ { - g.sumByPrefix[prefix] += countByLength[length+prefix] / prefixDivider + var sumNext float64 + for length := g.MinLength; length <= g.MaxLength; length++ { + if allowedLength(length) && prefixLength(length) >= prefix+1 { + sumNext += countByLength[length] / nextPrefixDivider + } } + g.sumByPrefix[prefix+1] = sumNext } } return nil } +func (g *StringGenerator) lexicographicRules() (int, func(int) bool, func(int) int) { + var ( + allowedLength func(length int) bool + prefixLength func(length int) int + tailCount int + ) + + switch g.LogicalType { + case models.Base64Type, models.Base64URLType: + allowedLength = func(length int) bool { + return length >= 4 && length%4 == 0 + } + + prefixLength = func(length int) int { + if length < 2 { + return 0 + } + + return length - 2 + } + + tailCount = 4161 + + case models.Base64RawURLType: + allowedLength = func(length int) bool { + return true + } + + prefixLength = func(length int) int { + return length + } + + tailCount = 4096 + + case models.HexType: + allowedLength = 
func(length int) bool { + return length >= 2 && length%2 == 0 + } + + prefixLength = func(length int) int { + return length + } + + tailCount = 1 + + default: + allowedLength = func(length int) bool { + return true + } + + prefixLength = func(length int) int { + return length + } + + tailCount = 1 + } + + return tailCount, allowedLength, prefixLength +} + // calculateCompletions precomputes completions. -func (g *StringGenerator) calculateCompletions(length int) []int64 { +func (g *StringGenerator) calculateCompletions(length int) { words := g.localeModule.GetWords() bytesPerChar := g.localeModule.GetBytesPerChar() delimiterLen := len(locale.WordsDelimiter) @@ -182,17 +345,90 @@ func (g *StringGenerator) calculateCompletions(length int) []int64 { } // convert from big.Int to int64 - completions := make([]int64, 0, length+1) + g.completions = make([]int64, 0, length+1) for _, blockCount := range completionsBig { if !blockCount.IsInt64() { break } - completions = append(completions, blockCount.Int64()) + g.completions = append(g.completions, blockCount.Int64()) + } +} + +func (g *StringGenerator) generateSortedOctets() { + g.lexOrderedOctets = make([]string, 256) + + for val := 0; val < 256; val++ { + g.lexOrderedOctets[val] = strconv.Itoa(val) + } + + sort.Strings(g.lexOrderedOctets) +} + +func (g *StringGenerator) calculatePowersOfTen() { + g.powersOfTen = make([]uint64, 11) + g.powersOfTen[0] = 1 + + for i := 1; i <= 10; i++ { + g.powersOfTen[i] = g.powersOfTen[i-1] * 10 + } +} + +func (g *StringGenerator) generateBase64SortedEndings() { + baseAll := []rune("+/0123456789=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") + + eqIndex := -1 + for i, r := range baseAll { + if r == '=' { + eqIndex = i + break + } + } + if eqIndex == -1 { + panic("'=' not found in base64 alphabet") + } + + charsetLength := len(baseAll) + g.base64Endings = make([]string, 0, 4161) + + for i := 0; i < charsetLength; i++ { + for j := 0; j < charsetLength; j++ { + if i < eqIndex || 
(baseAll[i] == '=' && baseAll[j] == '=') || i > eqIndex { + g.base64Endings = append(g.base64Endings, string([]rune{baseAll[i], baseAll[j]})) + } + } + } + + sort.Strings(g.base64Endings) +} + +func (g *StringGenerator) generateBase64URLSortedEndings() { + baseAll := []rune("-_0123456789=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") + + eqIndex := -1 + for i, r := range baseAll { + if r == '=' { + eqIndex = i + break + } + } + if eqIndex == -1 { + panic("'=' not found in base64 alphabet") + } + + charsetLength := len(baseAll) + g.base64URLEndings = make([]string, 0, 4161) + + for i := 0; i < charsetLength; i++ { + for j := 0; j < charsetLength; j++ { + if i < eqIndex || (baseAll[i] == '=' && baseAll[j] == '=') || i > eqIndex { + g.base64URLEndings = append(g.base64URLEndings, string([]rune{baseAll[i], baseAll[j]})) + } + } } - return completions + sort.Strings(g.base64URLEndings) } // templateString returns n-th string by template. @@ -368,6 +604,269 @@ func (g *StringGenerator) text(num float64) (string, error) { return text, nil } +func (g *StringGenerator) ipv4(number float64) string { + index := uint32(orderedInt64(0, math.MaxUint32, number, g.totalValuesCount)) + + indexBytes := make([]byte, 4) + binary.BigEndian.PutUint32(indexBytes, index) + + return fmt.Sprintf( + "%s.%s.%s.%s", + g.lexOrderedOctets[int(indexBytes[0])], + g.lexOrderedOctets[int(indexBytes[1])], + g.lexOrderedOctets[int(indexBytes[2])], + g.lexOrderedOctets[int(indexBytes[3])], + ) +} + +func (g *StringGenerator) hex(number float64) string { + prefix := make([]rune, 0, g.MaxLength) + + var prefixLen int + for remain := number; ; { + prefixLen = len(prefix) + + remain -= g.countByPrefix[prefixLen] + if remain < 0 || g.sumByPrefix[prefixLen+1] == 0 { + break + } + + step := g.sumByPrefix[prefixLen+1] + i := int(remain / step) + if i >= len(g.charset) { + i = len(g.charset) - 1 + } + remain -= step * float64(i) + prefix = append(prefix, g.charset[i]) + } + + if prefixLen < g.MinLength 
{ + destLen := g.MinLength + int(number)%(g.MaxLength-g.MinLength+1) + for i := 0; i < destLen-prefixLen; i++ { + prefix = append(prefix, g.charset[(int(number)+i*i)%len(g.charset)]) + } + } + + return string(prefix) +} + +// isbn generates lexicographically ordered ISBN-13 strings (with prefix "978" or "979"). +// The ordering is determined by the input `number` which maps proportionally to the total ISBN space. +// +// ISBN structure used: +// - Prefix: fixed 978 or 979 (determined first, all 978 go before all 979 in order); +// - Country group: 1–5 digits; +// - Publisher: 1–7 digits (limited so that Country+Publisher ≤ 8 digits total); +// - Item number: fills remaining digits so that Country+Publisher+Item = 9 digits; +// - Check digit: 1 digit (0–9, does not follow real ISBN rules). +// +// Precomputed values (from Prepare method): +// - powersOfTen — [10^0, 10^1, ..., 10^10], used to quickly calculate weights without math.Pow. +// +// Steps in generation: +// +// 1. Scale number to total space +// step = totalValues / totalValuesCount, index = floor(step * number). +// This gives the lexicographic index in the full ISBN list. +// +// 2. Determine prefix ("978" or "979") +// If index >= totalValuesPerPrefix, choose "979" and subtract totalValuesPerPrefix from index. +// Otherwise, — "978". This ensures all "978..." go before all "979..." lexicographically. +// +// 3. Generate Country Group (1–5 digits) +// - Each position can either be a hyphen (end of group) or a digit 0–9. +// - First position must be a digit (cannot be a hyphen). +// - The number of possible publisherBlockLengths after a given countryBlockLength is precomputed using the formula: +// ((5-countryBlockLength)(10-countryBlockLength))/2 +// This avoids looping for each length. +// - Multiply by 10^(remaining digits) to get digitWeight — number of ISBNs for each digit choice. +// - digit = index / digitWeight, then index %= digitWeight. 
+// - Append the digit and update hyphenWeight — number of ISBNs if we put a hyphen next. +// +// 4. Generate Publisher (1–maxPublisherBlockLength digits) +// - Similar logic as Country group: first digit is mandatory, subsequent positions can be a hyphen or digit. +// - maxPublisherLength = min(7, 8 - countryLen). +// - digitWeight here = (remaining publisher digits) × 10^(remaining total digits after current position). +// - Append digits until either max length reached or hyphen chosen. +// +// 5. Generate Item Number & Check Digit +// +// - itemBlockLength = 9 - countryBlockLength - publisherBlockLength. +// +// - itemBlock = index / 10 — because last digit is reserved for check digit. +// +// - checkDigit = index % 10. +// +// 6. Format output +// Combine all blocks: "prefix-countryBlock-publisherBlock-itemBlock-checkDigit". +// Item number is zero-padded to always match itemBlockLength. +// +// This approach ensures: +// - Full lexicographic ordering across all possible ISBNs. +// - Even distribution when scaling from `number`. +// - No pre-generation of all ISBNs — computed on demand in O(1) time. 
+func (g *StringGenerator) isbn(number float64) string { + totalValuesPerPrefix := 25 * g.powersOfTen[10] + totalValues := 2 * totalValuesPerPrefix + + step := float64(totalValues) / float64(g.totalValuesCount) + index := uint64(step * number) + + prefix := "978" + if index >= totalValuesPerPrefix { + prefix = "979" + index -= totalValuesPerPrefix + } + + var ( + countryBlock = make([]byte, 0, 5) + countryBlockLength int + hyphenWeight uint64 + ) + + for countryBlockLength < 5 && (countryBlockLength == 0 || index >= hyphenWeight) { + if countryBlockLength > 0 { + index -= hyphenWeight + } + + totalPossiblePublisherLengths := uint64((5 - countryBlockLength) * (10 - countryBlockLength) / 2) + + digitWeight := totalPossiblePublisherLengths * g.powersOfTen[9-countryBlockLength] + digit := index / digitWeight + index %= digitWeight + + countryBlock = append(countryBlock, '0'+byte(digit)) + countryBlockLength++ + + hyphenWeight = uint64(8-countryBlockLength) * g.powersOfTen[10-countryBlockLength] + } + + var ( + maxPublisherBlockLength = 8 - countryBlockLength + publisherBlock = make([]byte, 0, maxPublisherBlockLength) + publisherBlockLength int + ) + + for publisherBlockLength < maxPublisherBlockLength && (publisherBlockLength == 0 || index >= hyphenWeight) { + if publisherBlockLength > 0 { + index -= hyphenWeight + } + + remaining := maxPublisherBlockLength - publisherBlockLength + digitWeight := uint64(remaining) * g.powersOfTen[9-countryBlockLength-publisherBlockLength] + digit := index / digitWeight + index %= digitWeight + + publisherBlock = append(publisherBlock, '0'+byte(digit)) + publisherBlockLength++ + + hyphenWeight = g.powersOfTen[10-countryBlockLength-publisherBlockLength] + } + + var ( + itemBlockLength = 9 - countryBlockLength - publisherBlockLength + itemBlock = index / 10 + checkDigit = index % 10 + ) + + return fmt.Sprintf( + "%s-%s-%s-%0*d-%d", + prefix, + string(countryBlock), + string(publisherBlock), + itemBlockLength, itemBlock, + checkDigit, + ) 
+} + +func (g *StringGenerator) base64(number float64) string { + prefix := make([]rune, 0, g.MaxLength) + + for remain := number; ; { + p := len(prefix) + + // bucket: "finish now" at prefix length p (для Base64 это L = p+2) + remain -= g.countByPrefix[p] + if remain < 0 || g.sumByPrefix[p+1] == 0 { + // распределяем remain пропорционально в хвосты + posInBucket := remain + g.countByPrefix[p] // 0 .. countByPrefix[p] + idx := int(posInBucket / g.countByPrefix[p] * float64(len(g.base64Endings))) + + if idx < 0 { + idx = 0 + } + if idx >= len(g.base64Endings) { + idx = len(g.base64Endings) - 1 + } + + return string(prefix) + g.base64Endings[idx] + } + + // выбираем следующую руну для префикса + step := g.sumByPrefix[p+1] + i := int(remain / step) + if i >= len(g.charset) { + i = len(g.charset) - 1 + } + remain -= step * float64(i) + prefix = append(prefix, g.charset[i]) + } +} + +func (g *StringGenerator) base64URL(number float64) string { + prefix := make([]rune, 0, g.MaxLength) + + for remain := number; ; { + p := len(prefix) + + remain -= g.countByPrefix[p] + if remain < 0 || g.sumByPrefix[p+1] == 0 { + posInBucket := remain + g.countByPrefix[p] + idx := int(posInBucket / g.countByPrefix[p] * float64(len(g.base64URLEndings))) + + if idx < 0 { + idx = 0 + } + if idx >= len(g.base64URLEndings) { + idx = len(g.base64URLEndings) - 1 + } + + return string(prefix) + g.base64URLEndings[idx] + } + + step := g.sumByPrefix[p+1] + i := int(remain / step) + if i >= len(g.charset) { + i = len(g.charset) - 1 + } + remain -= step * float64(i) + prefix = append(prefix, g.charset[i]) + } +} + +func (g *StringGenerator) base64RawURL(number float64) string { + prefix := make([]rune, 0, g.MaxLength) + + for remain := number; ; { + p := len(prefix) + + remain -= g.countByPrefix[p] + if remain < 0 || g.sumByPrefix[p+1] == 0 { + break + } + + step := g.sumByPrefix[p+1] + i := int(remain / step) + if i >= len(g.charset) { + i = len(g.charset) - 1 + } + remain -= step * float64(i) + 
prefix = append(prefix, g.charset[i]) + } + + return string(prefix) +} + // simpleString generates a lexicographically ordered string based on the given number. // The function ensures that strings of different lengths are evenly distributed. // @@ -478,9 +977,23 @@ func (g *StringGenerator) Value(number float64, rowValues map[string]any) (any, return g.phone(number), nil case models.TextType: return g.text(number) + case models.HexType: + return g.hex(number), nil + case models.Ipv4Type: + return g.ipv4(number), nil + case models.IsbnType: + return g.isbn(number), nil + case models.Base64Type: + return g.base64(number), nil + case models.Base64URLType: + return g.base64URL(number), nil + case models.Base64RawURLType: + return g.base64RawURL(number), nil + case models.SimpleStringType: + return g.simpleString(number), nil + default: + return nil, errors.Errorf("unknown logical type: %s", g.LogicalType) } - - return g.simpleString(number), nil } //nolint:cyclop @@ -540,12 +1053,61 @@ func (g *StringGenerator) ValuesCount() float64 { } return totalCount - } - totalCount := float64(0) - for length := g.MinLength; length <= g.MaxLength; length++ { - totalCount += math.Pow(float64(len(g.charset)), float64(length)) - } + case models.HexType: + total := float64(0) + for length := g.MinLength; length <= g.MaxLength; length++ { + if length%2 != 0 { + continue + } + + total += math.Pow(16, float64(length)) + } + + return total + + case models.Ipv4Type: + // IPv4: 32-bit address space, total unique addresses = 2^32. + // +1 because MaxUint32 is 2^32 - 1. + return float64(math.MaxUint32 + 1) + + case models.IsbnType: + // ISBN-13: we support prefixes 978 and 979 -> 2 variants. + // For each prefix: 25 possible group partitioning schemes × 10^10 digit combinations. + // Total unique ISBNs = 2 * 25 * 10^10. + return 2 * 25 * math.Pow(10, 10) + + case models.Base64Type, models.Base64URLType: + // Lengths are always multiples of 4. 
For each length L: + // - The first L-2 characters can be any of the 64 Base64 symbols (no '=' allowed there). + // - The last 2 characters can form 3 types of endings: + // 1) Both are Base64 symbols -> 64 * 64 = 4096 combinations + // 2) Base64 symbol + '=' -> 64 * 1 = 64 combinations + // 3) '=' + '=' -> 1 * 1 = 1 combination + // Total endings per prefix = 4096 + 64 + 1 = 4161. + total := float64(0) + for length := g.MinLength; length <= g.MaxLength; length += 4 { + total += math.Pow(64, float64(length-2)) * 4161 + } + + return total + + case models.Base64RawURLType: + total := float64(0) + for length := g.MinLength; length <= g.MaxLength; length++ { + total += math.Pow(64, float64(length)) + } + return total + + case models.SimpleStringType: + total := float64(0) + for length := g.MinLength; length <= g.MaxLength; length++ { + total += math.Pow(float64(len(g.charset)), float64(length)) + } - return totalCount + return total + + default: + return 0 + } } diff --git a/internal/generator/usecase/general/generator/value/utils.go b/internal/generator/usecase/general/generator/value/utils.go index fccb740..fe341bb 100644 --- a/internal/generator/usecase/general/generator/value/utils.go +++ b/internal/generator/usecase/general/generator/value/utils.go @@ -67,3 +67,16 @@ func replaceWithNumber(str string, char rune, number int64) string { return string(runes) } + +//func orderedBase64(charset []rune, minLength, maxLength int, number float64) string { +// const tailCombinations uint64 = 4161 +// +// var ( +// charsetLength = uint64(len(charset)) +// eqIndex = strings.IndexByte(charset, byte('=')) +// ) +// +// val := make([]rune, 0) +// +// return string(val) +//} diff --git a/internal/generator/usecase/general/locale/consts.go b/internal/generator/usecase/general/locale/consts.go index a7e8487..b7ecab5 100644 --- a/internal/generator/usecase/general/locale/consts.go +++ b/internal/generator/usecase/general/locale/consts.go @@ -3,8 +3,11 @@ package locale // Common 
string charsets. var ( - Numbers = []rune("0123456789") - SpecialChars = []rune("!#$%&()*+,-.:;<=>?@_{|}") + Numbers = []rune("0123456789") + SpecialChars = []rune("!#$%&()*+,-.:;<=>?@_{|}") + Base64Charset = []rune("+/0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") + Base64URLCharset = []rune("-_0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") + HexCharset = []rune("0123456789abcdefABCDEF") ) // Genders declaration. From f4a785ee7a5026667c9c70b69c33e919d4c568fc Mon Sep 17 00:00:00 2001 From: reversetm Date: Sat, 16 Aug 2025 10:32:21 +0300 Subject: [PATCH 12/15] Simplify code --- internal/generator/models/generator_model.go | 31 ++- .../usecase/general/generator/value/string.go | 236 ++++-------------- .../usecase/general/generator/value/utils.go | 13 - .../usecase/general/locale/consts.go | 2 +- 4 files changed, 72 insertions(+), 210 deletions(-) diff --git a/internal/generator/models/generator_model.go b/internal/generator/models/generator_model.go index 78d773a..a8e8a3f 100644 --- a/internal/generator/models/generator_model.go +++ b/internal/generator/models/generator_model.go @@ -19,13 +19,13 @@ const ( LastNameType = "last_name" PhoneType = "phone" TextType = "text" - HexType = "hex" + CreditCardType = "credit_card" Ipv4Type = "ipv4" + IsbnType = "isbn" + HexType = "hex" Base64Type = "base64" Base64URLType = "base64_url" Base64RawURLType = "base64_raw_url" - CreditCardType = "credit_card" - IsbnType = "isbn" ) // Model type is used to describe model of generated data. 
@@ -705,6 +705,7 @@ func (p *ColumnStringParams) Parse() error { } if p.LogicalType == CreditCardType { + p.LogicalType = "" p.Pattern = "0000 0000 0000 0000" } @@ -753,19 +754,37 @@ func (p *ColumnStringParams) Validate() []error { LastNameType, PhoneType, TextType, - HexType, + CreditCardType, Ipv4Type, + IsbnType, + HexType, Base64Type, Base64URLType, Base64RawURLType, - CreditCardType, - IsbnType, } if !slices.Contains(logicalTypes, p.LogicalType) { errs = append(errs, errors.Errorf("unknown logical type: %s", p.LogicalType)) } + if p.LogicalType == Base64Type || p.LogicalType == Base64URLType { + if p.MinLength%4 != 0 || p.MaxLength%4 != 0 { + errs = append(errs, errors.Errorf( + "min length (%v) and max length (%v) fields should be multiple of 4 for %q logical type", + p.MinLength, p.MaxLength, p.LogicalType, + )) + } + } + + if p.LogicalType == HexType { + if p.MinLength%2 != 0 || p.MaxLength%2 != 0 { + errs = append(errs, errors.Errorf( + "min length (%v) and max length (%v) fields should be multiple of 2 for %q logical type", + p.MinLength, p.MaxLength, p.LogicalType, + )) + } + } + return errs } diff --git a/internal/generator/usecase/general/generator/value/string.go b/internal/generator/usecase/general/generator/value/string.go index e2e0835..e0e511a 100644 --- a/internal/generator/usecase/general/generator/value/string.go +++ b/internal/generator/usecase/general/generator/value/string.go @@ -39,7 +39,6 @@ type StringGenerator struct { lexOrderedOctets []string // precomputed lexicographically ordered IPv4 octets powersOfTen []uint64 // precomputed powers of ten for ISBN generation base64Endings []string // precomputed Base64 endings - base64URLEndings []string } //nolint:cyclop @@ -169,10 +168,8 @@ func (g *StringGenerator) prepareLogicalType() error { case models.IsbnType: g.calculatePowersOfTen() - case models.Base64Type: + case models.Base64Type, models.Base64URLType: g.generateBase64SortedEndings() - case models.Base64URLType: - 
g.generateBase64URLSortedEndings() } return nil @@ -210,8 +207,8 @@ func (g *StringGenerator) SetTotalCount(totalValuesCount uint64) error { currentLengthCount = rangeCount remainAllowed := 0 - for x := length + 1; x <= g.MaxLength; x++ { - if allowedLength(x) { + for candidateLength := length + 1; candidateLength <= g.MaxLength; candidateLength++ { + if allowedLength(candidateLength) { remainAllowed++ } } @@ -255,13 +252,18 @@ func (g *StringGenerator) SetTotalCount(totalValuesCount uint64) error { func (g *StringGenerator) lexicographicRules() (int, func(int) bool, func(int) int) { var ( - allowedLength func(length int) bool - prefixLength func(length int) int - tailCount int + tailCount = 1 + allowedLength = func(length int) bool { + return true + } + prefixLength = func(length int) int { + return length + } ) switch g.LogicalType { case models.Base64Type, models.Base64URLType: + tailCount = 4161 allowedLength = func(length int) bool { return length >= 4 && length%4 == 0 } @@ -274,40 +276,10 @@ func (g *StringGenerator) lexicographicRules() (int, func(int) bool, func(int) i return length - 2 } - tailCount = 4161 - - case models.Base64RawURLType: - allowedLength = func(length int) bool { - return true - } - - prefixLength = func(length int) int { - return length - } - - tailCount = 4096 - case models.HexType: allowedLength = func(length int) bool { return length >= 2 && length%2 == 0 } - - prefixLength = func(length int) int { - return length - } - - tailCount = 1 - - default: - allowedLength = func(length int) bool { - return true - } - - prefixLength = func(length int) int { - return length - } - - tailCount = 1 } return tailCount, allowedLength, prefixLength @@ -376,10 +348,13 @@ func (g *StringGenerator) calculatePowersOfTen() { } func (g *StringGenerator) generateBase64SortedEndings() { - baseAll := []rune("+/0123456789=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") + alphabet := make([]rune, len(g.charset)+1) + copy(alphabet, g.charset) + 
alphabet[len(g.charset)] = '=' + slices.Sort(alphabet) eqIndex := -1 - for i, r := range baseAll { + for i, r := range alphabet { if r == '=' { eqIndex = i break @@ -389,13 +364,13 @@ func (g *StringGenerator) generateBase64SortedEndings() { panic("'=' not found in base64 alphabet") } - charsetLength := len(baseAll) + charsetLength := len(alphabet) g.base64Endings = make([]string, 0, 4161) for i := 0; i < charsetLength; i++ { for j := 0; j < charsetLength; j++ { - if i < eqIndex || (baseAll[i] == '=' && baseAll[j] == '=') || i > eqIndex { - g.base64Endings = append(g.base64Endings, string([]rune{baseAll[i], baseAll[j]})) + if i < eqIndex || (alphabet[i] == '=' && alphabet[j] == '=') || i > eqIndex { + g.base64Endings = append(g.base64Endings, string([]rune{alphabet[i], alphabet[j]})) } } } @@ -403,34 +378,6 @@ func (g *StringGenerator) generateBase64SortedEndings() { sort.Strings(g.base64Endings) } -func (g *StringGenerator) generateBase64URLSortedEndings() { - baseAll := []rune("-_0123456789=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") - - eqIndex := -1 - for i, r := range baseAll { - if r == '=' { - eqIndex = i - break - } - } - if eqIndex == -1 { - panic("'=' not found in base64 alphabet") - } - - charsetLength := len(baseAll) - g.base64URLEndings = make([]string, 0, 4161) - - for i := 0; i < charsetLength; i++ { - for j := 0; j < charsetLength; j++ { - if i < eqIndex || (baseAll[i] == '=' && baseAll[j] == '=') || i > eqIndex { - g.base64URLEndings = append(g.base64URLEndings, string([]rune{baseAll[i], baseAll[j]})) - } - } - } - - sort.Strings(g.base64URLEndings) -} - // templateString returns n-th string by template. 
// //nolint:forcetypeassert @@ -619,37 +566,6 @@ func (g *StringGenerator) ipv4(number float64) string { ) } -func (g *StringGenerator) hex(number float64) string { - prefix := make([]rune, 0, g.MaxLength) - - var prefixLen int - for remain := number; ; { - prefixLen = len(prefix) - - remain -= g.countByPrefix[prefixLen] - if remain < 0 || g.sumByPrefix[prefixLen+1] == 0 { - break - } - - step := g.sumByPrefix[prefixLen+1] - i := int(remain / step) - if i >= len(g.charset) { - i = len(g.charset) - 1 - } - remain -= step * float64(i) - prefix = append(prefix, g.charset[i]) - } - - if prefixLen < g.MinLength { - destLen := g.MinLength + int(number)%(g.MaxLength-g.MinLength+1) - for i := 0; i < destLen-prefixLen; i++ { - prefix = append(prefix, g.charset[(int(number)+i*i)%len(g.charset)]) - } - } - - return string(prefix) -} - // isbn generates lexicographically ordered ISBN-13 strings (with prefix "978" or "979"). // The ordering is determined by the input `number` which maps proportionally to the total ISBN space. // @@ -782,89 +698,40 @@ func (g *StringGenerator) isbn(number float64) string { func (g *StringGenerator) base64(number float64) string { prefix := make([]rune, 0, g.MaxLength) - for remain := number; ; { - p := len(prefix) - - // bucket: "finish now" at prefix length p (для Base64 это L = p+2) - remain -= g.countByPrefix[p] - if remain < 0 || g.sumByPrefix[p+1] == 0 { - // распределяем remain пропорционально в хвосты - posInBucket := remain + g.countByPrefix[p] // 0 .. 
countByPrefix[p] - idx := int(posInBucket / g.countByPrefix[p] * float64(len(g.base64Endings))) + var ( + remain float64 + prefixLen int + ) - if idx < 0 { - idx = 0 - } - if idx >= len(g.base64Endings) { - idx = len(g.base64Endings) - 1 - } + for remain = number; ; { + prefixLen = len(prefix) - return string(prefix) + g.base64Endings[idx] + remain -= g.countByPrefix[prefixLen] + if remain < 0 || g.sumByPrefix[prefixLen+1] == 0 { + break } - // выбираем следующую руну для префикса - step := g.sumByPrefix[p+1] - i := int(remain / step) - if i >= len(g.charset) { - i = len(g.charset) - 1 - } - remain -= step * float64(i) + i := int(remain / g.sumByPrefix[prefixLen+1]) + remain -= g.sumByPrefix[prefixLen+1] * float64(i) prefix = append(prefix, g.charset[i]) } -} -func (g *StringGenerator) base64URL(number float64) string { - prefix := make([]rune, 0, g.MaxLength) - - for remain := number; ; { - p := len(prefix) - - remain -= g.countByPrefix[p] - if remain < 0 || g.sumByPrefix[p+1] == 0 { - posInBucket := remain + g.countByPrefix[p] - idx := int(posInBucket / g.countByPrefix[p] * float64(len(g.base64URLEndings))) + pos := remain + g.countByPrefix[prefixLen] + idx := int(pos / g.countByPrefix[prefixLen] * float64(len(g.base64Endings))) - if idx < 0 { - idx = 0 - } - if idx >= len(g.base64URLEndings) { - idx = len(g.base64URLEndings) - 1 - } - - return string(prefix) + g.base64URLEndings[idx] - } + return string(prefix) + g.base64Endings[idx] +} - step := g.sumByPrefix[p+1] - i := int(remain / step) - if i >= len(g.charset) { - i = len(g.charset) - 1 - } - remain -= step * float64(i) - prefix = append(prefix, g.charset[i]) - } +func (g *StringGenerator) base64URL(number float64) string { + return g.base64(number) } func (g *StringGenerator) base64RawURL(number float64) string { - prefix := make([]rune, 0, g.MaxLength) - - for remain := number; ; { - p := len(prefix) - - remain -= g.countByPrefix[p] - if remain < 0 || g.sumByPrefix[p+1] == 0 { - break - } - - step := 
g.sumByPrefix[p+1] - i := int(remain / step) - if i >= len(g.charset) { - i = len(g.charset) - 1 - } - remain -= step * float64(i) - prefix = append(prefix, g.charset[i]) - } + return g.simpleString(number) +} - return string(prefix) +func (g *StringGenerator) hex(number float64) string { + return g.simpleString(number) } // simpleString generates a lexicographically ordered string based on the given number. @@ -1054,18 +921,6 @@ func (g *StringGenerator) ValuesCount() float64 { return totalCount - case models.HexType: - total := float64(0) - for length := g.MinLength; length <= g.MaxLength; length++ { - if length%2 != 0 { - continue - } - - total += math.Pow(16, float64(length)) - } - - return total - case models.Ipv4Type: // IPv4: 32-bit address space, total unique addresses = 2^32. // +1 because MaxUint32 is 2^32 - 1. @@ -1087,19 +942,20 @@ func (g *StringGenerator) ValuesCount() float64 { // Total endings per prefix = 4096 + 64 + 1 = 4161. total := float64(0) for length := g.MinLength; length <= g.MaxLength; length += 4 { - total += math.Pow(64, float64(length-2)) * 4161 + total += math.Pow(float64(len(g.charset)), float64(length-2)) * 4161 } return total - case models.Base64RawURLType: + case models.HexType: total := float64(0) - for length := g.MinLength; length <= g.MaxLength; length++ { - total += math.Pow(64, float64(length)) + for length := g.MinLength; length <= g.MaxLength; length += 2 { + total += math.Pow(float64(len(g.charset)), float64(length)) } + return total - case models.SimpleStringType: + case models.SimpleStringType, models.Base64RawURLType: total := float64(0) for length := g.MinLength; length <= g.MaxLength; length++ { total += math.Pow(float64(len(g.charset)), float64(length)) diff --git a/internal/generator/usecase/general/generator/value/utils.go b/internal/generator/usecase/general/generator/value/utils.go index fe341bb..fccb740 100644 --- a/internal/generator/usecase/general/generator/value/utils.go +++ 
b/internal/generator/usecase/general/generator/value/utils.go @@ -67,16 +67,3 @@ func replaceWithNumber(str string, char rune, number int64) string { return string(runes) } - -//func orderedBase64(charset []rune, minLength, maxLength int, number float64) string { -// const tailCombinations uint64 = 4161 -// -// var ( -// charsetLength = uint64(len(charset)) -// eqIndex = strings.IndexByte(charset, byte('=')) -// ) -// -// val := make([]rune, 0) -// -// return string(val) -//} diff --git a/internal/generator/usecase/general/locale/consts.go b/internal/generator/usecase/general/locale/consts.go index b7ecab5..948ee7d 100644 --- a/internal/generator/usecase/general/locale/consts.go +++ b/internal/generator/usecase/general/locale/consts.go @@ -7,7 +7,7 @@ var ( SpecialChars = []rune("!#$%&()*+,-.:;<=>?@_{|}") Base64Charset = []rune("+/0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") Base64URLCharset = []rune("-_0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") - HexCharset = []rune("0123456789abcdefABCDEF") + HexCharset = []rune("0123456789ABCDEF") ) // Genders declaration. 
From b71e82b73157091a0c824689bb07881eea884e08 Mon Sep 17 00:00:00 2001 From: reversetm Date: Wed, 20 Aug 2025 11:39:47 +0300 Subject: [PATCH 13/15] Update unit tests for string type --- internal/generator/models/generator_model.go | 2 + internal/generator/models/models_test.go | 7 +- .../usecase/general/generator/value/string.go | 196 ++++++++++-------- .../usecase/general/test/unit_test.go | 98 ++++++++- 4 files changed, 208 insertions(+), 95 deletions(-) diff --git a/internal/generator/models/generator_model.go b/internal/generator/models/generator_model.go index a8e8a3f..45fe1b6 100644 --- a/internal/generator/models/generator_model.go +++ b/internal/generator/models/generator_model.go @@ -730,6 +730,7 @@ func (p *ColumnStringParams) FillDefaults() { p.LogicalType = strings.ToLower(p.LogicalType) } +//nolint:cyclop func (p *ColumnStringParams) Validate() []error { var errs []error @@ -749,6 +750,7 @@ func (p *ColumnStringParams) Validate() []error { } logicalTypes := []string{ + "", SimpleStringType, FirstNameType, LastNameType, diff --git a/internal/generator/models/models_test.go b/internal/generator/models/models_test.go index ffb908a..f94fc86 100644 --- a/internal/generator/models/models_test.go +++ b/internal/generator/models/models_test.go @@ -202,9 +202,10 @@ func TestGeneratorConfigYAMLParse(t *testing.T) { } defaultStringParams := &ColumnStringParams{ - MinLength: 1, - MaxLength: 32, - Locale: "en", + MinLength: 1, + MaxLength: 32, + Locale: "en", + LogicalType: SimpleStringType, } defaultDateTimeParams := &ColumnDateTimeParams{ diff --git a/internal/generator/usecase/general/generator/value/string.go b/internal/generator/usecase/general/generator/value/string.go index e0e511a..3c6678d 100644 --- a/internal/generator/usecase/general/generator/value/string.go +++ b/internal/generator/usecase/general/generator/value/string.go @@ -14,13 +14,16 @@ import ( "text/template" "github.com/pkg/errors" + "github.com/tarantool/sdvg/internal/generator/common" 
"github.com/tarantool/sdvg/internal/generator/models" "github.com/tarantool/sdvg/internal/generator/usecase/general/locale" "github.com/tarantool/sdvg/internal/generator/usecase/general/locale/en" "github.com/tarantool/sdvg/internal/generator/usecase/general/locale/ru" ) -type prepareFunc func() error +const ( + base64EndingCombinations = 4161 +) // Verify interface compliance in compile time. var _ Generator = (*StringGenerator)(nil) @@ -36,14 +39,13 @@ type StringGenerator struct { countByPrefix []float64 sumByPrefix []float64 completions []int64 // completions[i] stores the number of ways to form a text of length i - lexOrderedOctets []string // precomputed lexicographically ordered IPv4 octets + ipv4SortedOctets []string // precomputed lexicographically ordered IPv4 octets powersOfTen []uint64 // precomputed powers of ten for ISBN generation base64Endings []string // precomputed Base64 endings } -//nolint:cyclop func (g *StringGenerator) Prepare() error { - prepareFuncs := []prepareFunc{ + prepareFuncs := []func() error{ g.prepareTemplate, g.prepareLocaleModule, g.prepareCharset, @@ -134,6 +136,7 @@ func (g *StringGenerator) prepareCharset() error { return nil } +//nolint:cyclop func (g *StringGenerator) prepareLogicalType() error { switch g.LogicalType { case models.FirstNameType: @@ -163,7 +166,7 @@ func (g *StringGenerator) prepareLogicalType() error { g.calculateCompletions(g.MaxLength + 1) case models.Ipv4Type: - g.generateSortedOctets() + g.generateIpv4SortedOctets() case models.IsbnType: g.calculatePowersOfTen() @@ -178,82 +181,105 @@ func (g *StringGenerator) prepareLogicalType() error { func (g *StringGenerator) SetTotalCount(totalValuesCount uint64) error { g.totalValuesCount = totalValuesCount - if g.LogicalType == models.SimpleStringType || g.LogicalType == models.Base64Type || - g.LogicalType == models.Base64URLType || g.LogicalType == models.Base64RawURLType || - g.LogicalType == models.HexType { - tailCount, allowedLength, prefixLength := 
g.lexicographicRules() - charsetLength := float64(len(g.charset)) + if common.Any( + g.LogicalType == models.SimpleStringType, + g.LogicalType == models.Base64Type, + g.LogicalType == models.Base64URLType, + g.LogicalType == models.Base64RawURLType, + g.LogicalType == models.HexType, + ) { + g.calculatePrefixDistribution(totalValuesCount) + } - var allowedCount int + return nil +} - for length := g.MinLength; length <= g.MaxLength; length++ { - if allowedLength(length) { - allowedCount++ - } +//nolint:mnd,cyclop,gocognit +func (g *StringGenerator) calculatePrefixDistribution(totalValuesCount uint64) { + endingCombinations, allowedLength, prefixLength := g.lengthRules() + charsetLength := float64(len(g.charset)) + + var allowedCount int + + for length := g.MinLength; length <= g.MaxLength; length++ { + if allowedLength(length) { + allowedCount++ } + } - countByLength := make([]float64, g.MaxLength+1) - avgRangeCount := math.Ceil(float64(totalValuesCount) / float64(allowedCount)) + countByLength := make([]float64, g.MaxLength+1) + avgRangeCount := math.Ceil(float64(totalValuesCount) / float64(allowedCount)) - for length := g.MinLength; length <= g.MaxLength; length++ { - if !allowedLength(length) { - continue - } + for length := g.MinLength; length <= g.MaxLength; length++ { + if !allowedLength(length) { + continue + } - rangeCount := float64(tailCount) * math.Pow(charsetLength, float64(prefixLength(length))) + rangeCount := float64(endingCombinations) * math.Pow(charsetLength, float64(prefixLength(length))) - var currentLengthCount float64 - if avgRangeCount > rangeCount { - currentLengthCount = rangeCount + var currentLengthCount float64 + if avgRangeCount > rangeCount { + currentLengthCount = rangeCount - remainAllowed := 0 - for candidateLength := length + 1; candidateLength <= g.MaxLength; candidateLength++ { - if allowedLength(candidateLength) { - remainAllowed++ - } - } + remainAllowed := 0 - if remainAllowed > 0 { - avgRangeCount += (avgRangeCount - 
rangeCount) / float64(remainAllowed) + for candidateLength := length + 1; candidateLength <= g.MaxLength; candidateLength++ { + if allowedLength(candidateLength) { + remainAllowed++ } - } else { - currentLengthCount = math.Ceil(avgRangeCount) } - countByLength[length] = currentLengthCount + + if remainAllowed > 0 { + avgRangeCount += (avgRangeCount - rangeCount) / float64(remainAllowed) + } + } else { + currentLengthCount = math.Ceil(avgRangeCount) } - g.countByPrefix = make([]float64, g.MaxLength+1) - g.sumByPrefix = make([]float64, g.MaxLength+2) + countByLength[length] = currentLengthCount + } - for prefix := 0; prefix <= g.MaxLength; prefix++ { - prefixDivider := math.Pow(charsetLength, float64(prefix)) - nextPrefixDivider := prefixDivider * charsetLength + g.countByPrefix = make([]float64, g.MaxLength+1) + g.sumByPrefix = make([]float64, g.MaxLength+2) - var endNow float64 - for length := g.MinLength; length <= g.MaxLength; length++ { - if allowedLength(length) && prefixLength(length) == prefix { - endNow += countByLength[length] / prefixDivider - } + for prefix := 0; prefix <= g.MaxLength; prefix++ { + prefixDivider := math.Pow(charsetLength, float64(prefix)) + nextPrefixDivider := prefixDivider * charsetLength + + var endNow float64 + + for length := g.MinLength; length <= g.MaxLength; length++ { + if allowedLength(length) && prefixLength(length) == prefix { + endNow += countByLength[length] / prefixDivider } - g.countByPrefix[prefix] = endNow + } - var sumNext float64 - for length := g.MinLength; length <= g.MaxLength; length++ { - if allowedLength(length) && prefixLength(length) >= prefix+1 { - sumNext += countByLength[length] / nextPrefixDivider - } + g.countByPrefix[prefix] = endNow + + var sumNext float64 + + for length := g.MinLength; length <= g.MaxLength; length++ { + if allowedLength(length) && prefixLength(length) >= prefix+1 { + sumNext += countByLength[length] / nextPrefixDivider } - g.sumByPrefix[prefix+1] = sumNext } - } - return nil + 
g.sumByPrefix[prefix+1] = sumNext + } } -func (g *StringGenerator) lexicographicRules() (int, func(int) bool, func(int) int) { +// lengthRules returns a set of rules for string length based on the +// logical type. Specifically, it provides: +// +// 1. endingCombinations – the number of possible combinations for the string ending, +// 2. allowedLength – a predicate that checks whether a given length is valid, +// 3. prefixLength – a function that computes the effective prefix length. +// +//nolint:mnd +func (g *StringGenerator) lengthRules() (int, func(int) bool, func(int) int) { var ( - tailCount = 1 - allowedLength = func(length int) bool { + endingCombinations = 1 + allowedLength = func(length int) bool { return true } prefixLength = func(length int) int { @@ -263,7 +289,7 @@ func (g *StringGenerator) lexicographicRules() (int, func(int) bool, func(int) i switch g.LogicalType { case models.Base64Type, models.Base64URLType: - tailCount = 4161 + endingCombinations = base64EndingCombinations allowedLength = func(length int) bool { return length >= 4 && length%4 == 0 } @@ -282,7 +308,7 @@ func (g *StringGenerator) lexicographicRules() (int, func(int) bool, func(int) i } } - return tailCount, allowedLength, prefixLength + return endingCombinations, allowedLength, prefixLength } // calculateCompletions precomputes completions. 
@@ -328,16 +354,18 @@ func (g *StringGenerator) calculateCompletions(length int) { } } -func (g *StringGenerator) generateSortedOctets() { - g.lexOrderedOctets = make([]string, 256) +//nolint:mnd +func (g *StringGenerator) generateIpv4SortedOctets() { + g.ipv4SortedOctets = make([]string, 256) - for val := 0; val < 256; val++ { - g.lexOrderedOctets[val] = strconv.Itoa(val) + for val := range 256 { + g.ipv4SortedOctets[val] = strconv.Itoa(val) } - sort.Strings(g.lexOrderedOctets) + sort.Strings(g.ipv4SortedOctets) } +//nolint:mnd func (g *StringGenerator) calculatePowersOfTen() { g.powersOfTen = make([]uint64, 11) g.powersOfTen[0] = 1 @@ -354,21 +382,20 @@ func (g *StringGenerator) generateBase64SortedEndings() { slices.Sort(alphabet) eqIndex := -1 + for i, r := range alphabet { if r == '=' { eqIndex = i + break } } - if eqIndex == -1 { - panic("'=' not found in base64 alphabet") - } charsetLength := len(alphabet) - g.base64Endings = make([]string, 0, 4161) + g.base64Endings = make([]string, 0, base64EndingCombinations) - for i := 0; i < charsetLength; i++ { - for j := 0; j < charsetLength; j++ { + for i := range charsetLength { + for j := range charsetLength { if i < eqIndex || (alphabet[i] == '=' && alphabet[j] == '=') || i > eqIndex { g.base64Endings = append(g.base64Endings, string([]rune{alphabet[i], alphabet[j]})) } @@ -551,6 +578,7 @@ func (g *StringGenerator) text(num float64) (string, error) { return text, nil } +//nolint:mnd func (g *StringGenerator) ipv4(number float64) string { index := uint32(orderedInt64(0, math.MaxUint32, number, g.totalValuesCount)) @@ -559,10 +587,10 @@ func (g *StringGenerator) ipv4(number float64) string { return fmt.Sprintf( "%s.%s.%s.%s", - g.lexOrderedOctets[int(indexBytes[0])], - g.lexOrderedOctets[int(indexBytes[1])], - g.lexOrderedOctets[int(indexBytes[2])], - g.lexOrderedOctets[int(indexBytes[3])], + g.ipv4SortedOctets[int(indexBytes[0])], + g.ipv4SortedOctets[int(indexBytes[1])], + g.ipv4SortedOctets[int(indexBytes[2])], + 
g.ipv4SortedOctets[int(indexBytes[3])], ) } @@ -589,7 +617,7 @@ func (g *StringGenerator) ipv4(number float64) string { // If index >= totalValuesPerPrefix, choose "979" and subtract totalValuesPerPrefix from index. // Otherwise, — "978". This ensures all "978..." go before all "979..." lexicographically. // -// 3. Generate Country Group (1–5 digits) +// 3. Generate Country Group (1 — 5 digits) // - Each position can either be a hyphen (end of group) or a digit 0–9. // - First position must be a digit (cannot be a hyphen). // - The number of possible publisherBlockLengths after a given countryBlockLength is precomputed using the formula: @@ -599,7 +627,7 @@ func (g *StringGenerator) ipv4(number float64) string { // - digit = index / digitWeight, then index %= digitWeight. // - Append the digit and update hyphenWeight — number of ISBNs if we put a hyphen next. // -// 4. Generate Publisher (1–maxPublisherBlockLength digits) +// 4. Generate Publisher (1 — maxPublisherBlockLength digits) // - Similar logic as Country group: first digit is mandatory, subsequent positions can be a hyphen or digit. // - maxPublisherLength = min(7, 8 - countryLen). // - digitWeight here = (remaining publisher digits) × 10^(remaining total digits after current position). @@ -621,7 +649,10 @@ func (g *StringGenerator) ipv4(number float64) string { // - Full lexicographic ordering across all possible ISBNs. // - Even distribution when scaling from `number`. // - No pre-generation of all ISBNs — computed on demand in O(1) time. 
+// +//nolint:mnd func (g *StringGenerator) isbn(number float64) string { + // 25 possible group partitioning schemes totalValuesPerPrefix := 25 * g.powersOfTen[10] totalValues := 2 * totalValuesPerPrefix @@ -651,7 +682,7 @@ func (g *StringGenerator) isbn(number float64) string { digit := index / digitWeight index %= digitWeight - countryBlock = append(countryBlock, '0'+byte(digit)) + countryBlock = append(countryBlock, byte(digit)+'0') countryBlockLength++ hyphenWeight = uint64(8-countryBlockLength) * g.powersOfTen[10-countryBlockLength] @@ -673,7 +704,7 @@ func (g *StringGenerator) isbn(number float64) string { digit := index / digitWeight index %= digitWeight - publisherBlock = append(publisherBlock, '0'+byte(digit)) + publisherBlock = append(publisherBlock, byte(digit)+'0') publisherBlockLength++ hyphenWeight = g.powersOfTen[10-countryBlockLength-publisherBlockLength] @@ -821,6 +852,8 @@ func (g *StringGenerator) simpleString(number float64) string { } // Value returns n-th string from range. +// +//nolint:cyclop func (g *StringGenerator) Value(number float64, rowValues map[string]any) (any, error) { if g.Template != "" { val, err := g.templateString(rowValues) @@ -863,7 +896,7 @@ func (g *StringGenerator) Value(number float64, rowValues map[string]any) (any, } } -//nolint:cyclop +//nolint:cyclop,mnd func (g *StringGenerator) ValuesCount() float64 { if g.Template != "" { // Using `distinct` or `ordered` parameters with templates @@ -942,12 +975,13 @@ func (g *StringGenerator) ValuesCount() float64 { // Total endings per prefix = 4096 + 64 + 1 = 4161. total := float64(0) for length := g.MinLength; length <= g.MaxLength; length += 4 { - total += math.Pow(float64(len(g.charset)), float64(length-2)) * 4161 + total += math.Pow(float64(len(g.charset)), float64(length-2)) * base64EndingCombinations } return total case models.HexType: + // Lengths are always multiples of 2. 
total := float64(0) for length := g.MinLength; length <= g.MaxLength; length += 2 { total += math.Pow(float64(len(g.charset)), float64(length)) diff --git a/internal/generator/usecase/general/test/unit_test.go b/internal/generator/usecase/general/test/unit_test.go index 0243f2e..16eb4e4 100644 --- a/internal/generator/usecase/general/test/unit_test.go +++ b/internal/generator/usecase/general/test/unit_test.go @@ -489,20 +489,31 @@ func TestString(t *testing.T) { {&models.ColumnStringParams{LogicalType: models.FirstNameType, Locale: "ru"}, 1, 32}, {&models.ColumnStringParams{LogicalType: models.LastNameType, Locale: "ru"}, 1, 32}, {&models.ColumnStringParams{LogicalType: models.PhoneType, Locale: "ru"}, 1, 32}, - {&models.ColumnStringParams{MinLength: 5, MaxLength: 5}, 5, 5}, {&models.ColumnStringParams{LogicalType: models.FirstNameType, MinLength: 5, MaxLength: 5}, 5, 5}, {&models.ColumnStringParams{LogicalType: models.LastNameType, MinLength: 4, MaxLength: 7}, 4, 7}, {&models.ColumnStringParams{LogicalType: models.PhoneType, MinLength: 10, MaxLength: 10}, 10, 10}, - {&models.ColumnStringParams{MinLength: 100, MaxLength: 100}, 100, 100}, - {&models.ColumnStringParams{Pattern: "AAaa00##", Locale: "en"}, 8, 8}, - {&models.ColumnStringParams{Pattern: "AAaa00##", Locale: "ru"}, 8, 8}, - {&models.ColumnStringParams{Pattern: "0123456789012345678901234567890123456789"}, 40, 40}, + {&models.ColumnStringParams{LogicalType: models.SimpleStringType, MinLength: 5, MaxLength: 5}, 5, 5}, + {&models.ColumnStringParams{LogicalType: models.SimpleStringType, MinLength: 100, MaxLength: 100}, 100, 100}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 3, MaxLength: 5}, 3, 5}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 254, MaxLength: 256}, 254, 256}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 510, MaxLength: 512}, 510, 512}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 3, MaxLength: 5, 
Locale: "ru"}, 3, 5}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 254, MaxLength: 256, Locale: "ru"}, 254, 256}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 510, MaxLength: 512, Locale: "ru"}, 510, 512}, + {&models.ColumnStringParams{LogicalType: models.CreditCardType}, 19, 19}, + {&models.ColumnStringParams{LogicalType: models.IsbnType}, 17, 17}, + {&models.ColumnStringParams{LogicalType: models.Ipv4Type}, 7, 15}, + {&models.ColumnStringParams{LogicalType: models.HexType, MinLength: 2, MaxLength: 10}, 2, 10}, + {&models.ColumnStringParams{LogicalType: models.HexType, MinLength: 4, MaxLength: 4}, 4, 4}, + {&models.ColumnStringParams{LogicalType: models.Base64Type, MinLength: 4, MaxLength: 16}, 4, 8}, + {&models.ColumnStringParams{LogicalType: models.Base64Type, MinLength: 4, MaxLength: 4}, 4, 4}, + {&models.ColumnStringParams{LogicalType: models.Base64URLType, MinLength: 4, MaxLength: 20}, 4, 20}, + {&models.ColumnStringParams{LogicalType: models.Base64URLType, MinLength: 4, MaxLength: 4}, 4, 4}, + {&models.ColumnStringParams{LogicalType: models.Base64RawURLType, MinLength: 2, MaxLength: 10}, 2, 10}, + {&models.ColumnStringParams{LogicalType: models.Base64RawURLType, MinLength: 5, MaxLength: 5}, 5, 5}, + {&models.ColumnStringParams{Pattern: "AAaa00##", Locale: "en"}, 8, 8}, + {&models.ColumnStringParams{Pattern: "AAaa00##", Locale: "ru"}, 8, 8}, + {&models.ColumnStringParams{Pattern: "0123456789012345678901234567890123456789"}, 40, 40}, } for _, testCase := range testCases { @@ -531,6 +542,7 @@ func TestString(t *testing.T) { }{ { &models.ColumnStringParams{ + LogicalType: models.SimpleStringType, MinLength: 1, MaxLength: 1, Locale: "en", @@ -541,6 +553,7 @@ func TestString(t *testing.T) { }, { &models.ColumnStringParams{ + LogicalType: models.SimpleStringType, MinLength: 1, MaxLength: 1, Locale: "ru", @@ -551,6 +564,7 @@ func TestString(t *testing.T) { }, { &models.ColumnStringParams{ + LogicalType: 
models.SimpleStringType, MinLength: 3, MaxLength: 7, Locale: "en", @@ -561,6 +575,7 @@ func TestString(t *testing.T) { }, { &models.ColumnStringParams{ + LogicalType: models.SimpleStringType, MinLength: 2, MaxLength: 9, Locale: "ru", @@ -571,6 +586,7 @@ func TestString(t *testing.T) { }, { &models.ColumnStringParams{ + LogicalType: models.SimpleStringType, MinLength: 10, MaxLength: 24, Locale: "en", @@ -582,6 +598,7 @@ func TestString(t *testing.T) { }, { &models.ColumnStringParams{ + LogicalType: models.SimpleStringType, MinLength: 1, MaxLength: 8, Locale: "en", @@ -593,17 +610,19 @@ func TestString(t *testing.T) { }, { &models.ColumnStringParams{ - MinLength: 10, - MaxLength: 15, - Locale: "en", + LogicalType: models.SimpleStringType, + MinLength: 10, + MaxLength: 15, + Locale: "en", }, 88394150280794134360488281250, }, { &models.ColumnStringParams{ - MinLength: 10, - MaxLength: 15, - Locale: "ru", + LogicalType: models.SimpleStringType, + MinLength: 10, + MaxLength: 15, + Locale: "ru", }, 868834460299970670989801640300, }, @@ -621,6 +640,63 @@ func TestString(t *testing.T) { }, 2600, }, + { + &models.ColumnStringParams{ + Locale: "en", + Pattern: "0000 0000 0000 0000", + }, + 10000000000000000, + }, + { + &models.ColumnStringParams{ + Locale: "en", + LogicalType: models.Ipv4Type, + }, + 4294967296, + }, + { + &models.ColumnStringParams{ + Locale: "en", + LogicalType: models.IsbnType, + }, + 500000000000, + }, + { + &models.ColumnStringParams{ + Locale: "en", + LogicalType: models.HexType, + MinLength: 2, + MaxLength: 10, + }, + 1103823438080, + }, + { + &models.ColumnStringParams{ + Locale: "en", + LogicalType: models.Base64Type, + MinLength: 4, + MaxLength: 8, + }, + 285941759741952, + }, + { + &models.ColumnStringParams{ + Locale: "en", + LogicalType: models.Base64URLType, + MinLength: 4, + MaxLength: 8, + }, + 285941759741952, + }, + { + &models.ColumnStringParams{ + Locale: "en", + LogicalType: models.Base64RawURLType, + MinLength: 1, + MaxLength: 8, + }, + 
285942833483840, + }, } for _, testCase := range checkValuesCountCases { From 0972670e5316f2282f5e7636ba99ab93573e6a05 Mon Sep 17 00:00:00 2001 From: reversetm Date: Wed, 20 Aug 2025 11:55:06 +0300 Subject: [PATCH 14/15] Update usage and changelog --- CHANGELOG.md | 8 +++++++- doc/en/usage.md | 3 ++- doc/ru/usage.md | 3 ++- internal/generator/models/generator_model.go | 4 ++-- .../generator/usecase/general/generator/value/string.go | 6 +++--- 5 files changed, 16 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0aaa1c4..65ef37f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,13 +10,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - The `template` field in the `string` data type is now used to generate template strings - with the ability to use the values of any columns of the generated model. + with the ability to use the values of any columns of the generated model. +- In the `logical_type` field of type `string`, if you need a regular string without any + special logical meaning, you can now explicitly specify `simple_string`. ### Breaking changes - Using `template` field to specify a string pattern like `Aa0#` is no longer supported, `pattern` should be used instead. +### Added + +- Logical types for strings: credit_card, ipv4, isbn13, hex, base64, base64_url and base64_raw_url + ## [0.0.1](https://github.com/tarantool/sdvg/compare/36d0930..0.0.1) - 2025-07-21 ### Added diff --git a/doc/en/usage.md b/doc/en/usage.md index ef0becb..b795e3d 100644 --- a/doc/en/usage.md +++ b/doc/en/usage.md @@ -157,7 +157,8 @@ Structure `models[*].columns[*].type_params` for data type `string`: - `min_length`: Minimum string length. Default is `1`. - `max_length`: Maximum string length. Default is `32`. -- `logical_type`: Logical type of string. Supported values: `first_name`, `last_name`, `phone`, `text`. +- `logical_type`: Logical type of string. 
Supported values: `simple_string`, `first_name`, `last_name`, `phone`, + `text`, `ipv4`, `isbn13`, `hex`, `base64`, `base64_url`, `base64_raw_url`, `credit_card`. Default is `simple_string`. - `template`: Template for string generation. Allows you to use the values of any columns of the generated model. Information about the functions available in template strings is described at the end of this section. Cannot coexist with `ordered`, `distinct_percentage` and `distinct_count`. diff --git a/doc/ru/usage.md b/doc/ru/usage.md index c293c66..f9cf675 100644 --- a/doc/ru/usage.md +++ b/doc/ru/usage.md @@ -163,7 +163,8 @@ open_ai: - `min_length`: Минимальная длина строки. По умолчанию `1`. - `max_length`: Максимальная длина строки. По умолчанию `32`. -- `logical_type`: Логический тип строки. Поддерживаемые значения: `first_name`, `last_name`, `phone`, `text`. +- `logical_type`: Логический тип строки. Поддерживаемые значения: `simple_string`, `first_name`, `last_name`, `phone`, + `text`, `ipv4`, `isbn13`, `hex`, `base64`, `base64_url`, `base64_raw_url`, `credit_card`. По умолчанию `simple_string`. - `template`: Шаблон для генерации строки. Позволяет использовать значения любых столбов генерируемой модели. Информация о том, как использовать шаблонные строки, описана в конце данного раздела. Не работает совместно с `ordered`, `distinct_percentage` и `distinct_count`. 
diff --git a/internal/generator/models/generator_model.go b/internal/generator/models/generator_model.go index 45fe1b6..708de3a 100644 --- a/internal/generator/models/generator_model.go +++ b/internal/generator/models/generator_model.go @@ -21,7 +21,7 @@ const ( TextType = "text" CreditCardType = "credit_card" Ipv4Type = "ipv4" - IsbnType = "isbn" + Isbn13Type = "isbn13" HexType = "hex" Base64Type = "base64" Base64URLType = "base64_url" @@ -758,7 +758,7 @@ func (p *ColumnStringParams) Validate() []error { TextType, CreditCardType, Ipv4Type, - IsbnType, + Isbn13Type, HexType, Base64Type, Base64URLType, diff --git a/internal/generator/usecase/general/generator/value/string.go b/internal/generator/usecase/general/generator/value/string.go index 3c6678d..ea7e948 100644 --- a/internal/generator/usecase/general/generator/value/string.go +++ b/internal/generator/usecase/general/generator/value/string.go @@ -651,7 +651,7 @@ func (g *StringGenerator) ipv4(number float64) string { // - No pre-generation of all ISBNs — computed on demand in O(1) time. // //nolint:mnd -func (g *StringGenerator) isbn(number float64) string { +func (g *StringGenerator) isbn13(number float64) string { // 25 possible group partitioning schemes totalValuesPerPrefix := 25 * g.powersOfTen[10] totalValues := 2 * totalValuesPerPrefix @@ -882,7 +882,7 @@ func (g *StringGenerator) Value(number float64, rowValues map[string]any) (any, case models.Ipv4Type: return g.ipv4(number), nil case models.IsbnType: - return g.isbn(number), nil + return g.isbn13(number), nil case models.Base64Type: return g.base64(number), nil case models.Base64URLType: @@ -959,7 +959,7 @@ func (g *StringGenerator) ValuesCount() float64 { // +1 because MaxUint32 is 2^32 - 1. return float64(math.MaxUint32 + 1) - case models.IsbnType: + case models.Isbn13Type: // ISBN-13: we support prefixes 978 and 979 -> 2 variants. // For each prefix: 25 possible group partitioning schemes × 10^10 digit combinations. 
// Total unique ISBNs = 2 * 25 * 10^10. From ddf70b55df0216b58660b9650b95f44502661087 Mon Sep 17 00:00:00 2001 From: reversetm Date: Wed, 20 Aug 2025 11:57:58 +0300 Subject: [PATCH 15/15] Update name --- internal/generator/usecase/general/generator/value/string.go | 4 ++-- internal/generator/usecase/general/test/unit_test.go | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/internal/generator/usecase/general/generator/value/string.go b/internal/generator/usecase/general/generator/value/string.go index ea7e948..554cac1 100644 --- a/internal/generator/usecase/general/generator/value/string.go +++ b/internal/generator/usecase/general/generator/value/string.go @@ -168,7 +168,7 @@ func (g *StringGenerator) prepareLogicalType() error { case models.Ipv4Type: g.generateIpv4SortedOctets() - case models.IsbnType: + case models.Isbn13Type: g.calculatePowersOfTen() case models.Base64Type, models.Base64URLType: @@ -881,7 +881,7 @@ func (g *StringGenerator) Value(number float64, rowValues map[string]any) (any, return g.hex(number), nil case models.Ipv4Type: return g.ipv4(number), nil - case models.IsbnType: + case models.Isbn13Type: return g.isbn13(number), nil case models.Base64Type: return g.base64(number), nil diff --git a/internal/generator/usecase/general/test/unit_test.go b/internal/generator/usecase/general/test/unit_test.go index 16eb4e4..865696c 100644 --- a/internal/generator/usecase/general/test/unit_test.go +++ b/internal/generator/usecase/general/test/unit_test.go @@ -501,7 +501,7 @@ func TestString(t *testing.T) { {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 254, MaxLength: 256, Locale: "ru"}, 254, 256}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 510, MaxLength: 512, Locale: "ru"}, 510, 512}, {&models.ColumnStringParams{LogicalType: models.CreditCardType}, 19, 19}, - {&models.ColumnStringParams{LogicalType: models.IsbnType}, 17, 17}, + {&models.ColumnStringParams{LogicalType: models.Isbn13Type}, 
17, 17}, {&models.ColumnStringParams{LogicalType: models.Ipv4Type}, 7, 15}, {&models.ColumnStringParams{LogicalType: models.HexType, MinLength: 2, MaxLength: 10}, 2, 10}, {&models.ColumnStringParams{LogicalType: models.HexType, MinLength: 4, MaxLength: 4}, 4, 4}, @@ -657,7 +657,7 @@ func TestString(t *testing.T) { { &models.ColumnStringParams{ Locale: "en", - LogicalType: models.IsbnType, + LogicalType: models.Isbn13Type, }, 500000000000, },