11
11
import yaml
12
12
13
13
from guidellm .dataset .synthetic import (
14
+ PrefixBucketConfig ,
14
15
SyntheticDatasetConfig ,
15
16
SyntheticDatasetCreator ,
16
17
SyntheticTextItemsGenerator ,
@@ -29,8 +30,12 @@ def test_config_creation_with_all_params(self):
29
30
30
31
### WRITTEN BY AI ###
31
32
"""
33
+ prefix_bucket = PrefixBucketConfig (
34
+ bucket_weight = 100 , prefix_count = 1 , prefix_tokens = 5
35
+ )
36
+
32
37
config = SyntheticDatasetConfig (
33
- prefix_tokens = 5 ,
38
+ prefix_buckets = [ prefix_bucket ] ,
34
39
prompt_tokens = 100 ,
35
40
prompt_tokens_stdev = 10 ,
36
41
prompt_tokens_min = 50 ,
@@ -43,7 +48,7 @@ def test_config_creation_with_all_params(self):
43
48
source = "custom_text.txt" ,
44
49
)
45
50
46
- assert config .prefix_tokens == 5
51
+ assert config .prefix_buckets [ 0 ]. prefix_tokens == 5
47
52
assert config .prompt_tokens == 100
48
53
assert config .prompt_tokens_stdev == 10
49
54
assert config .prompt_tokens_min == 50
@@ -67,7 +72,9 @@ def test_parse_json_string(self):
67
72
"output_tokens" : 25 ,
68
73
"samples" : 200 ,
69
74
"source" : "test.txt" ,
70
- "prefix_tokens" : 10 ,
75
+ "prefix_buckets" : [
76
+ {"bucket_weight" : 100 , "prefix_count" : 1 , "prefix_tokens" : 10 }
77
+ ],
71
78
}
72
79
)
73
80
@@ -77,23 +84,23 @@ def test_parse_json_string(self):
77
84
assert config .output_tokens == 25
78
85
assert config .samples == 200
79
86
assert config .source == "test.txt"
80
- assert config .prefix_tokens == 10
87
+ assert config .prefix_buckets [ 0 ]. prefix_tokens == 10
81
88
82
89
@pytest .mark .regression
83
90
def test_parse_key_value_pairs (self ):
84
91
"""Test parsing key-value pairs configuration.
85
92
86
93
### WRITTEN BY AI ###
87
94
"""
88
- kv_str = "prompt_tokens=80,output_tokens=30,samples=300,source=data.txt,prefix_tokens=5" # noqa: E501
95
+ kv_str = "prompt_tokens=80,output_tokens=30,samples=300,source=data.txt"
89
96
90
97
config = SyntheticDatasetConfig .parse_str (kv_str )
91
98
92
99
assert config .prompt_tokens == 80
93
100
assert config .output_tokens == 30
94
101
assert config .samples == 300
95
102
assert config .source == "data.txt"
96
- assert config .prefix_tokens == 5
103
+ assert config .prefix_buckets is None
97
104
98
105
@pytest .mark .sanity
99
106
def test_parse_yaml_file (self ):
@@ -106,7 +113,9 @@ def test_parse_yaml_file(self):
106
113
"output_tokens" : 15 ,
107
114
"samples" : 100 ,
108
115
"source" : "yaml_test.txt" ,
109
- "prefix_tokens" : 3 ,
116
+ "prefix_buckets" : [
117
+ {"bucket_weight" : 100 , "prefix_count" : 1 , "prefix_tokens" : 3 }
118
+ ],
110
119
}
111
120
112
121
with tempfile .NamedTemporaryFile (mode = "w" , suffix = ".yaml" , delete = False ) as f :
@@ -120,7 +129,7 @@ def test_parse_yaml_file(self):
120
129
assert config .output_tokens == 15
121
130
assert config .samples == 100
122
131
assert config .source == "yaml_test.txt"
123
- assert config .prefix_tokens == 3
132
+ assert config .prefix_buckets [ 0 ]. prefix_tokens == 3
124
133
finally :
125
134
Path (yaml_path ).unlink ()
126
135
@@ -134,7 +143,9 @@ def test_parse_config_file(self):
134
143
"prompt_tokens" : 90 ,
135
144
"output_tokens" : 35 ,
136
145
"samples" : 150 ,
137
- "prefix_tokens" : 2 ,
146
+ "prefix_buckets" : [
147
+ {"bucket_weight" : 100 , "prefix_count" : 1 , "prefix_tokens" : 2 }
148
+ ],
138
149
}
139
150
140
151
with tempfile .NamedTemporaryFile (mode = "w" , suffix = ".config" , delete = False ) as f :
@@ -147,7 +158,7 @@ def test_parse_config_file(self):
147
158
assert config .prompt_tokens == 90
148
159
assert config .output_tokens == 35
149
160
assert config .samples == 150
150
- assert config .prefix_tokens == 2
161
+ assert config .prefix_buckets [ 0 ]. prefix_tokens == 2
151
162
finally :
152
163
Path (config_path ).unlink ()
153
164
@@ -194,8 +205,9 @@ def test_validation_positive_values(self):
194
205
with pytest .raises (ValueError ):
195
206
SyntheticDatasetConfig (prompt_tokens = 20 , output_tokens = 10 , samples = 0 )
196
207
208
+ # Test negative prefix tokens via PrefixBucketConfig validation
197
209
with pytest .raises (ValueError ):
198
- SyntheticDatasetConfig ( prompt_tokens = 20 , output_tokens = 10 , prefix_tokens = - 1 )
210
+ PrefixBucketConfig ( prefix_tokens = - 1 )
199
211
200
212
@pytest .mark .regression
201
213
def test_validation_optional_positive_values (self ):
@@ -279,7 +291,7 @@ def mock_tokenizer(self):
279
291
"""
280
292
tokenizer = Mock ()
281
293
tokenizer .get_vocab .return_value = {f"token_{ i } " : i for i in range (1000 )}
282
- tokenizer .encode .side_effect = lambda text : [ 1 , 2 , 3 ] * ( len (text ) // 10 + 1 )
294
+ tokenizer .encode .side_effect = lambda text : list ( range ( len (text . split ())) )
283
295
tokenizer .decode .side_effect = (
284
296
lambda tokens , skip_special_tokens = False : " " .join (
285
297
f"token_{ t } " for t in tokens [:5 ]
@@ -306,8 +318,12 @@ def config_with_prefix(self):
306
318
307
319
### WRITTEN BY AI ###
308
320
"""
321
+ prefix_bucket = PrefixBucketConfig (
322
+ bucket_weight = 100 , prefix_count = 1 , prefix_tokens = 3
323
+ )
324
+
309
325
return SyntheticDatasetConfig (
310
- prefix_tokens = 3 ,
326
+ prefix_buckets = [ prefix_bucket ] ,
311
327
prompt_tokens = 15 ,
312
328
output_tokens = 10 ,
313
329
samples = 5 ,
@@ -352,20 +368,14 @@ def test_generator_initialization(
352
368
mock_text_creator .assert_called_once_with (data = simple_config .source )
353
369
354
370
@pytest .mark .smoke
355
- @patch ("guidellm.dataset.synthetic.EndlessTextCreator" )
356
371
@patch ("guidellm.dataset.synthetic.IntegerRangeSampler" )
357
372
def test_basic_iteration (
358
- self , mock_sampler , mock_text_creator , simple_config , mock_tokenizer
373
+ self ,
374
+ mock_sampler ,
375
+ simple_config ,
376
+ mock_tokenizer ,
359
377
):
360
- """Test basic iteration functionality.
361
-
362
- ### WRITTEN BY AI ###
363
- """
364
- # Setup mocks
365
- mock_text_creator_instance = Mock ()
366
- mock_text_creator_instance .words = ["word1" , "word2" , "word3" ] * 100
367
- mock_text_creator_instance .create_text .return_value = "sample text"
368
- mock_text_creator .return_value = mock_text_creator_instance
378
+ """Test basic iteration functionality."""
369
379
370
380
# Mock IntegerRangeSampler to return iterators
371
381
def mock_sampler_side_effect (* args , ** kwargs ):
@@ -394,59 +404,34 @@ def mock_sampler_side_effect(*args, **kwargs):
394
404
assert isinstance (item ["output_tokens_count" ], int )
395
405
396
406
@pytest .mark .sanity
397
- @patch ("guidellm.dataset.synthetic.EndlessTextCreator" )
398
- def test_create_prompt_method (
399
- self , mock_text_creator , simple_config , mock_tokenizer
400
- ):
407
+ def test_create_prompt_method (self , simple_config , mock_tokenizer ):
401
408
"""Test _create_prompt method.
402
409
403
410
### WRITTEN BY AI ###
404
411
"""
405
- mock_text_creator_instance = Mock ()
406
- mock_text_creator_instance .words = ["word" ] * 100
407
- mock_text_creator_instance .create_text .return_value = "test text"
408
- mock_text_creator .return_value = mock_text_creator_instance
409
-
410
- mock_tokenizer .encode .return_value = [1 , 2 , 3 ]
411
-
412
412
generator = SyntheticTextItemsGenerator (
413
413
simple_config , mock_tokenizer , random_seed = 42
414
414
)
415
415
416
416
# Test normal case
417
417
result = generator ._create_prompt (5 , 0 , 42 )
418
- assert result == [42 , 1 , 2 , 3 ]
418
+ assert result [0 ] == 42 # Unique prefix token
419
+ assert len (result ) == 5
419
420
420
421
# Test zero tokens
421
422
result = generator ._create_prompt (0 , 0 , 42 )
422
423
assert result == []
423
424
424
425
# Test without unique prefix
425
426
result = generator ._create_prompt (3 , 0 )
426
- assert result == [ 1 , 2 , 3 ]
427
+ assert len ( result ) == 3
427
428
428
429
@pytest .mark .regression
429
- @patch ("guidellm.dataset.synthetic.EndlessTextCreator" )
430
- def test_create_prompt_binary_search (
431
- self , mock_text_creator , simple_config , mock_tokenizer
432
- ):
430
+ def test_create_prompt_binary_search (self , simple_config , mock_tokenizer ):
433
431
"""Test binary search logic in _create_prompt.
434
432
435
433
### WRITTEN BY AI ###
436
434
"""
437
- mock_text_creator_instance = Mock ()
438
- mock_text_creator_instance .words = ["word" ] * 1000
439
- mock_text_creator_instance .create_text .side_effect = lambda start , length : (
440
- "text " * max (1 , length // 4 )
441
- ).strip ()
442
- mock_text_creator .return_value = mock_text_creator_instance
443
-
444
- # Mock tokenizer to return different lengths based on input
445
- def mock_encode (text ):
446
- return [1 ] * len (text .split ())
447
-
448
- mock_tokenizer .encode .side_effect = mock_encode
449
-
450
435
generator = SyntheticTextItemsGenerator (
451
436
simple_config , mock_tokenizer , random_seed = 42
452
437
)
@@ -456,21 +441,14 @@ def mock_encode(text):
456
441
assert len (result ) >= 4 # Should include prefix + some tokens
457
442
458
443
@pytest .mark .sanity
459
- @patch ("guidellm.dataset.synthetic.EndlessTextCreator" )
460
444
@patch ("guidellm.dataset.synthetic.IntegerRangeSampler" )
461
445
def test_prefix_tokens_integration (
462
- self , mock_sampler , mock_text_creator , config_with_prefix , mock_tokenizer
446
+ self , mock_sampler , config_with_prefix , mock_tokenizer
463
447
):
464
448
"""Test integration with prefix tokens.
465
449
466
450
### WRITTEN BY AI ###
467
451
"""
468
- # Setup mocks
469
- mock_text_creator_instance = Mock ()
470
- mock_text_creator_instance .words = ["word" ] * 100
471
- mock_text_creator_instance .create_text .return_value = "sample text"
472
- mock_text_creator .return_value = mock_text_creator_instance
473
-
474
452
mock_sampler_instance = Mock ()
475
453
mock_sampler_instance .__iter__ = Mock (return_value = iter ([15 , 15 , 15 , 15 , 15 ]))
476
454
mock_sampler .return_value = mock_sampler_instance
@@ -483,24 +461,20 @@ def test_prefix_tokens_integration(
483
461
484
462
# Verify prompt_tokens_count includes prefix
485
463
for item in items :
486
- assert item ["prompt_tokens_count" ] == config_with_prefix .prefix_tokens + 15
464
+ assert (
465
+ item ["prompt_tokens_count" ]
466
+ == config_with_prefix .prefix_buckets [0 ].prefix_tokens + 15
467
+ )
487
468
488
469
@pytest .mark .regression
489
- @patch ("guidellm.dataset.synthetic.EndlessTextCreator" )
490
470
@patch ("guidellm.dataset.synthetic.IntegerRangeSampler" )
491
471
def test_random_seeding_consistency (
492
- self , mock_sampler , mock_text_creator , simple_config , mock_tokenizer
472
+ self , mock_sampler , simple_config , mock_tokenizer
493
473
):
494
474
"""Test that same seed produces consistent results.
495
475
496
476
### WRITTEN BY AI ###
497
477
"""
498
- # Setup mocks
499
- mock_text_creator_instance = Mock ()
500
- mock_text_creator_instance .words = ["word" ] * 100
501
- mock_text_creator_instance .create_text .return_value = "sample text"
502
- mock_text_creator .return_value = mock_text_creator_instance
503
-
504
478
# Create consistent mock sampler behavior
505
479
call_count = 0
506
480
@@ -536,25 +510,12 @@ def mock_sampler_side_effect(*args, **kwargs):
536
510
assert item1 ["output_tokens_count" ] == item2 ["output_tokens_count" ]
537
511
538
512
@pytest .mark .regression
539
- @patch ("guidellm.dataset.synthetic.EndlessTextCreator" )
540
513
@patch ("guidellm.dataset.synthetic.IntegerRangeSampler" )
541
- def test_variance_configuration (
542
- self , mock_sampler , mock_text_creator , complex_config , mock_tokenizer
543
- ):
514
+ def test_variance_configuration (self , mock_sampler , complex_config , mock_tokenizer ):
544
515
"""Test that variance configuration is properly used.
545
516
546
517
### WRITTEN BY AI ###
547
518
"""
548
- # Setup mocks
549
- mock_text_creator_instance = Mock ()
550
- mock_text_creator_instance .words = ["word" ] * 100
551
- mock_text_creator_instance .create_text .return_value = "sample text"
552
- mock_text_creator .return_value = mock_text_creator_instance
553
-
554
- # Fix tokenizer mock to handle the create_text return properly
555
- mock_tokenizer .encode .side_effect = (
556
- lambda text : [1 , 2 , 3 ] if isinstance (text , str ) else [1 , 2 , 3 ]
557
- )
558
519
559
520
# Setup mock sampler to track calls
560
521
def mock_sampler_side_effect (* args , ** kwargs ):
@@ -592,19 +553,11 @@ def mock_sampler_side_effect(*args, **kwargs):
592
553
assert output_call [1 ]["random_seed" ] == 43 # 42 + 1
593
554
594
555
@pytest .mark .regression
595
- @patch ("guidellm.dataset.synthetic.EndlessTextCreator" )
596
- def test_unique_prefix_generation (
597
- self , mock_text_creator , simple_config , mock_tokenizer
598
- ):
556
+ def test_unique_prefix_generation (self , simple_config , mock_tokenizer ):
599
557
"""Test that unique prefixes are generated for each request.
600
558
601
559
### WRITTEN BY AI ###
602
560
"""
603
- mock_text_creator_instance = Mock ()
604
- mock_text_creator_instance .words = ["word" ] * 100
605
- mock_text_creator_instance .create_text .return_value = "sample text"
606
- mock_text_creator .return_value = mock_text_creator_instance
607
-
608
561
# Mock the cycle to return predictable values
609
562
with patch ("guidellm.dataset.synthetic.cycle" ) as mock_cycle :
610
563
mock_cycle .return_value = iter ([100 , 101 , 102 , 103 , 104 ])
0 commit comments