Skip to content

Commit 20b660b

Browse files
committed
Update tests for new prefix patch and reduce the number of mocks
1 parent 69f6575 commit 20b660b

File tree

1 file changed

+47
-94
lines changed

1 file changed

+47
-94
lines changed

tests/unit/dataset/test_synthetic.py

Lines changed: 47 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import yaml
1212

1313
from guidellm.dataset.synthetic import (
14+
PrefixBucketConfig,
1415
SyntheticDatasetConfig,
1516
SyntheticDatasetCreator,
1617
SyntheticTextItemsGenerator,
@@ -29,8 +30,12 @@ def test_config_creation_with_all_params(self):
2930
3031
### WRITTEN BY AI ###
3132
"""
33+
prefix_bucket = PrefixBucketConfig(
34+
bucket_weight=100, prefix_count=1, prefix_tokens=5
35+
)
36+
3237
config = SyntheticDatasetConfig(
33-
prefix_tokens=5,
38+
prefix_buckets=[prefix_bucket],
3439
prompt_tokens=100,
3540
prompt_tokens_stdev=10,
3641
prompt_tokens_min=50,
@@ -43,7 +48,7 @@ def test_config_creation_with_all_params(self):
4348
source="custom_text.txt",
4449
)
4550

46-
assert config.prefix_tokens == 5
51+
assert config.prefix_buckets[0].prefix_tokens == 5
4752
assert config.prompt_tokens == 100
4853
assert config.prompt_tokens_stdev == 10
4954
assert config.prompt_tokens_min == 50
@@ -67,7 +72,9 @@ def test_parse_json_string(self):
6772
"output_tokens": 25,
6873
"samples": 200,
6974
"source": "test.txt",
70-
"prefix_tokens": 10,
75+
"prefix_buckets": [
76+
{"bucket_weight": 100, "prefix_count": 1, "prefix_tokens": 10}
77+
],
7178
}
7279
)
7380

@@ -77,23 +84,23 @@ def test_parse_json_string(self):
7784
assert config.output_tokens == 25
7885
assert config.samples == 200
7986
assert config.source == "test.txt"
80-
assert config.prefix_tokens == 10
87+
assert config.prefix_buckets[0].prefix_tokens == 10
8188

8289
@pytest.mark.regression
8390
def test_parse_key_value_pairs(self):
8491
"""Test parsing key-value pairs configuration.
8592
8693
### WRITTEN BY AI ###
8794
"""
88-
kv_str = "prompt_tokens=80,output_tokens=30,samples=300,source=data.txt,prefix_tokens=5" # noqa: E501
95+
kv_str = "prompt_tokens=80,output_tokens=30,samples=300,source=data.txt"
8996

9097
config = SyntheticDatasetConfig.parse_str(kv_str)
9198

9299
assert config.prompt_tokens == 80
93100
assert config.output_tokens == 30
94101
assert config.samples == 300
95102
assert config.source == "data.txt"
96-
assert config.prefix_tokens == 5
103+
assert config.prefix_buckets is None
97104

98105
@pytest.mark.sanity
99106
def test_parse_yaml_file(self):
@@ -106,7 +113,9 @@ def test_parse_yaml_file(self):
106113
"output_tokens": 15,
107114
"samples": 100,
108115
"source": "yaml_test.txt",
109-
"prefix_tokens": 3,
116+
"prefix_buckets": [
117+
{"bucket_weight": 100, "prefix_count": 1, "prefix_tokens": 3}
118+
],
110119
}
111120

112121
with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
@@ -120,7 +129,7 @@ def test_parse_yaml_file(self):
120129
assert config.output_tokens == 15
121130
assert config.samples == 100
122131
assert config.source == "yaml_test.txt"
123-
assert config.prefix_tokens == 3
132+
assert config.prefix_buckets[0].prefix_tokens == 3
124133
finally:
125134
Path(yaml_path).unlink()
126135

@@ -134,7 +143,9 @@ def test_parse_config_file(self):
134143
"prompt_tokens": 90,
135144
"output_tokens": 35,
136145
"samples": 150,
137-
"prefix_tokens": 2,
146+
"prefix_buckets": [
147+
{"bucket_weight": 100, "prefix_count": 1, "prefix_tokens": 2}
148+
],
138149
}
139150

140151
with tempfile.NamedTemporaryFile(mode="w", suffix=".config", delete=False) as f:
@@ -147,7 +158,7 @@ def test_parse_config_file(self):
147158
assert config.prompt_tokens == 90
148159
assert config.output_tokens == 35
149160
assert config.samples == 150
150-
assert config.prefix_tokens == 2
161+
assert config.prefix_buckets[0].prefix_tokens == 2
151162
finally:
152163
Path(config_path).unlink()
153164

@@ -194,8 +205,9 @@ def test_validation_positive_values(self):
194205
with pytest.raises(ValueError):
195206
SyntheticDatasetConfig(prompt_tokens=20, output_tokens=10, samples=0)
196207

208+
# Test negative prefix tokens via PrefixBucketConfig validation
197209
with pytest.raises(ValueError):
198-
SyntheticDatasetConfig(prompt_tokens=20, output_tokens=10, prefix_tokens=-1)
210+
PrefixBucketConfig(prefix_tokens=-1)
199211

200212
@pytest.mark.regression
201213
def test_validation_optional_positive_values(self):
@@ -279,7 +291,7 @@ def mock_tokenizer(self):
279291
"""
280292
tokenizer = Mock()
281293
tokenizer.get_vocab.return_value = {f"token_{i}": i for i in range(1000)}
282-
tokenizer.encode.side_effect = lambda text: [1, 2, 3] * (len(text) // 10 + 1)
294+
tokenizer.encode.side_effect = lambda text: list(range(len(text.split())))
283295
tokenizer.decode.side_effect = (
284296
lambda tokens, skip_special_tokens=False: " ".join(
285297
f"token_{t}" for t in tokens[:5]
@@ -306,8 +318,12 @@ def config_with_prefix(self):
306318
307319
### WRITTEN BY AI ###
308320
"""
321+
prefix_bucket = PrefixBucketConfig(
322+
bucket_weight=100, prefix_count=1, prefix_tokens=3
323+
)
324+
309325
return SyntheticDatasetConfig(
310-
prefix_tokens=3,
326+
prefix_buckets=[prefix_bucket],
311327
prompt_tokens=15,
312328
output_tokens=10,
313329
samples=5,
@@ -352,20 +368,14 @@ def test_generator_initialization(
352368
mock_text_creator.assert_called_once_with(data=simple_config.source)
353369

354370
@pytest.mark.smoke
355-
@patch("guidellm.dataset.synthetic.EndlessTextCreator")
356371
@patch("guidellm.dataset.synthetic.IntegerRangeSampler")
357372
def test_basic_iteration(
358-
self, mock_sampler, mock_text_creator, simple_config, mock_tokenizer
373+
self,
374+
mock_sampler,
375+
simple_config,
376+
mock_tokenizer,
359377
):
360-
"""Test basic iteration functionality.
361-
362-
### WRITTEN BY AI ###
363-
"""
364-
# Setup mocks
365-
mock_text_creator_instance = Mock()
366-
mock_text_creator_instance.words = ["word1", "word2", "word3"] * 100
367-
mock_text_creator_instance.create_text.return_value = "sample text"
368-
mock_text_creator.return_value = mock_text_creator_instance
378+
"""Test basic iteration functionality."""
369379

370380
# Mock IntegerRangeSampler to return iterators
371381
def mock_sampler_side_effect(*args, **kwargs):
@@ -394,59 +404,34 @@ def mock_sampler_side_effect(*args, **kwargs):
394404
assert isinstance(item["output_tokens_count"], int)
395405

396406
@pytest.mark.sanity
397-
@patch("guidellm.dataset.synthetic.EndlessTextCreator")
398-
def test_create_prompt_method(
399-
self, mock_text_creator, simple_config, mock_tokenizer
400-
):
407+
def test_create_prompt_method(self, simple_config, mock_tokenizer):
401408
"""Test _create_prompt method.
402409
403410
### WRITTEN BY AI ###
404411
"""
405-
mock_text_creator_instance = Mock()
406-
mock_text_creator_instance.words = ["word"] * 100
407-
mock_text_creator_instance.create_text.return_value = "test text"
408-
mock_text_creator.return_value = mock_text_creator_instance
409-
410-
mock_tokenizer.encode.return_value = [1, 2, 3]
411-
412412
generator = SyntheticTextItemsGenerator(
413413
simple_config, mock_tokenizer, random_seed=42
414414
)
415415

416416
# Test normal case
417417
result = generator._create_prompt(5, 0, 42)
418-
assert result == [42, 1, 2, 3]
418+
assert result[0] == 42 # Unique prefix token
419+
assert len(result) == 5
419420

420421
# Test zero tokens
421422
result = generator._create_prompt(0, 0, 42)
422423
assert result == []
423424

424425
# Test without unique prefix
425426
result = generator._create_prompt(3, 0)
426-
assert result == [1, 2, 3]
427+
assert len(result) == 3
427428

428429
@pytest.mark.regression
429-
@patch("guidellm.dataset.synthetic.EndlessTextCreator")
430-
def test_create_prompt_binary_search(
431-
self, mock_text_creator, simple_config, mock_tokenizer
432-
):
430+
def test_create_prompt_binary_search(self, simple_config, mock_tokenizer):
433431
"""Test binary search logic in _create_prompt.
434432
435433
### WRITTEN BY AI ###
436434
"""
437-
mock_text_creator_instance = Mock()
438-
mock_text_creator_instance.words = ["word"] * 1000
439-
mock_text_creator_instance.create_text.side_effect = lambda start, length: (
440-
"text " * max(1, length // 4)
441-
).strip()
442-
mock_text_creator.return_value = mock_text_creator_instance
443-
444-
# Mock tokenizer to return different lengths based on input
445-
def mock_encode(text):
446-
return [1] * len(text.split())
447-
448-
mock_tokenizer.encode.side_effect = mock_encode
449-
450435
generator = SyntheticTextItemsGenerator(
451436
simple_config, mock_tokenizer, random_seed=42
452437
)
@@ -456,21 +441,14 @@ def mock_encode(text):
456441
assert len(result) >= 4 # Should include prefix + some tokens
457442

458443
@pytest.mark.sanity
459-
@patch("guidellm.dataset.synthetic.EndlessTextCreator")
460444
@patch("guidellm.dataset.synthetic.IntegerRangeSampler")
461445
def test_prefix_tokens_integration(
462-
self, mock_sampler, mock_text_creator, config_with_prefix, mock_tokenizer
446+
self, mock_sampler, config_with_prefix, mock_tokenizer
463447
):
464448
"""Test integration with prefix tokens.
465449
466450
### WRITTEN BY AI ###
467451
"""
468-
# Setup mocks
469-
mock_text_creator_instance = Mock()
470-
mock_text_creator_instance.words = ["word"] * 100
471-
mock_text_creator_instance.create_text.return_value = "sample text"
472-
mock_text_creator.return_value = mock_text_creator_instance
473-
474452
mock_sampler_instance = Mock()
475453
mock_sampler_instance.__iter__ = Mock(return_value=iter([15, 15, 15, 15, 15]))
476454
mock_sampler.return_value = mock_sampler_instance
@@ -483,24 +461,20 @@ def test_prefix_tokens_integration(
483461

484462
# Verify prompt_tokens_count includes prefix
485463
for item in items:
486-
assert item["prompt_tokens_count"] == config_with_prefix.prefix_tokens + 15
464+
assert (
465+
item["prompt_tokens_count"]
466+
== config_with_prefix.prefix_buckets[0].prefix_tokens + 15
467+
)
487468

488469
@pytest.mark.regression
489-
@patch("guidellm.dataset.synthetic.EndlessTextCreator")
490470
@patch("guidellm.dataset.synthetic.IntegerRangeSampler")
491471
def test_random_seeding_consistency(
492-
self, mock_sampler, mock_text_creator, simple_config, mock_tokenizer
472+
self, mock_sampler, simple_config, mock_tokenizer
493473
):
494474
"""Test that same seed produces consistent results.
495475
496476
### WRITTEN BY AI ###
497477
"""
498-
# Setup mocks
499-
mock_text_creator_instance = Mock()
500-
mock_text_creator_instance.words = ["word"] * 100
501-
mock_text_creator_instance.create_text.return_value = "sample text"
502-
mock_text_creator.return_value = mock_text_creator_instance
503-
504478
# Create consistent mock sampler behavior
505479
call_count = 0
506480

@@ -536,25 +510,12 @@ def mock_sampler_side_effect(*args, **kwargs):
536510
assert item1["output_tokens_count"] == item2["output_tokens_count"]
537511

538512
@pytest.mark.regression
539-
@patch("guidellm.dataset.synthetic.EndlessTextCreator")
540513
@patch("guidellm.dataset.synthetic.IntegerRangeSampler")
541-
def test_variance_configuration(
542-
self, mock_sampler, mock_text_creator, complex_config, mock_tokenizer
543-
):
514+
def test_variance_configuration(self, mock_sampler, complex_config, mock_tokenizer):
544515
"""Test that variance configuration is properly used.
545516
546517
### WRITTEN BY AI ###
547518
"""
548-
# Setup mocks
549-
mock_text_creator_instance = Mock()
550-
mock_text_creator_instance.words = ["word"] * 100
551-
mock_text_creator_instance.create_text.return_value = "sample text"
552-
mock_text_creator.return_value = mock_text_creator_instance
553-
554-
# Fix tokenizer mock to handle the create_text return properly
555-
mock_tokenizer.encode.side_effect = (
556-
lambda text: [1, 2, 3] if isinstance(text, str) else [1, 2, 3]
557-
)
558519

559520
# Setup mock sampler to track calls
560521
def mock_sampler_side_effect(*args, **kwargs):
@@ -592,19 +553,11 @@ def mock_sampler_side_effect(*args, **kwargs):
592553
assert output_call[1]["random_seed"] == 43 # 42 + 1
593554

594555
@pytest.mark.regression
595-
@patch("guidellm.dataset.synthetic.EndlessTextCreator")
596-
def test_unique_prefix_generation(
597-
self, mock_text_creator, simple_config, mock_tokenizer
598-
):
556+
def test_unique_prefix_generation(self, simple_config, mock_tokenizer):
599557
"""Test that unique prefixes are generated for each request.
600558
601559
### WRITTEN BY AI ###
602560
"""
603-
mock_text_creator_instance = Mock()
604-
mock_text_creator_instance.words = ["word"] * 100
605-
mock_text_creator_instance.create_text.return_value = "sample text"
606-
mock_text_creator.return_value = mock_text_creator_instance
607-
608561
# Mock the cycle to return predictable values
609562
with patch("guidellm.dataset.synthetic.cycle") as mock_cycle:
610563
mock_cycle.return_value = iter([100, 101, 102, 103, 104])

0 commit comments

Comments (0)