Skip to content

Commit 3e274d3

Browse files
authored
Fix errors with metric accumulation (#266)
## Summary <!-- Include a short paragraph of the changes introduced in this PR. If this PR requires additional context or rationale, explain why the changes are necessary. --> Fixes an issue in metric calculation that caused incorrect statistics at extreme changes in concurrency and an issue where the first decode token was not counted in total tokens per second. ## Details <!-- Provide a detailed list of all changes introduced in this pull request. --> - [x] Fixed issue where merged concurrency change events would double-count concurrency - [x] Ensure first decode token is counted when calculating total tokens per second ## Test Plan <!-- List the steps needed to test this PR. --> - Run unit tests: `tox -e test-unit -- -m "regression and sanity"` --- - [x] "I certify that all code in this PR is my own, except as noted below." ## Use of AI - [x] Includes AI-assisted code completion - [ ] Includes code generated by an AI application - [x] Includes AI-generated tests (NOTE: AI written tests should have a docstring that includes `## WRITTEN BY AI ##`) --------- Signed-off-by: Samuel Monson <smonson@redhat.com>
1 parent 1261fe8 commit 3e274d3

File tree

3 files changed

+95
-13
lines changed

3 files changed

+95
-13
lines changed

src/guidellm/benchmark/benchmark.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -817,7 +817,9 @@ def from_stats(
817817
],
818818
iter_counts=[req.output_tokens for req in total_with_output_first],
819819
first_iter_counts=[
820-
req.prompt_tokens for req in total_with_output_first
820+
# prompt tokens + first token
821+
req.prompt_tokens + 1
822+
for req in total_with_output_first
821823
],
822824
),
823825
),

src/guidellm/objects/statistics.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -243,18 +243,9 @@ def from_request_times(
243243
"""
244244
if distribution_type == "concurrency":
245245
# convert to delta changes based on when requests were running
246-
time_deltas: dict[float, int] = defaultdict(int)
247-
for start, end in requests:
248-
time_deltas[start] += 1
249-
time_deltas[end] -= 1
250-
251-
# convert to the events over time measuring concurrency changes
252-
events = []
253-
active = 0
254-
255-
for time, delta in sorted(time_deltas.items()):
256-
active += delta
257-
events.append((time, active))
246+
events = [(start, 1) for start, _ in requests] + [
247+
(end, -1) for _, end in requests
248+
]
258249
elif distribution_type == "rate":
259250
# convert to events for when requests finished
260251
global_start = min(start for start, _ in requests) if requests else 0
@@ -281,6 +272,16 @@ def from_request_times(
281272
else:
282273
flattened_events.append((time, val))
283274

275+
if distribution_type == "concurrency":
276+
# convert to the events over time measuring concurrency changes
277+
events_over_time: list[tuple[float, float]] = []
278+
active = 0
279+
for time, delta in flattened_events:
280+
active += delta # type: ignore [assignment]
281+
events_over_time.append((time, active))
282+
283+
flattened_events = events_over_time
284+
284285
# convert to value distribution function
285286
distribution: dict[float, float] = defaultdict(float)
286287

tests/unit/objects/test_statistics.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -704,3 +704,82 @@ def test_time_running_stats_update():
704704
assert time_running_stats.rate_ms == pytest.approx(
705705
3000 / (time.time() - time_running_stats.start_time), abs=0.1
706706
)
707+
708+
709+
@pytest.mark.regression
710+
def test_distribution_summary_concurrency_double_counting_regression():
711+
"""Specific regression test for the double-counting bug in concurrency calculation.
712+
713+
Before the fix, when events were merged due to epsilon, the deltas were summed
714+
but then the active count wasn't properly accumulated, causing incorrect results.
715+
716+
### WRITTEN BY AI ###
717+
"""
718+
epsilon = 1e-6
719+
720+
# Create a scenario where multiple requests start at exactly the same time
721+
# This should result in events being merged, testing the accumulation logic
722+
same_start_time = 1.0
723+
requests = [
724+
(same_start_time, 3.0),
725+
(same_start_time, 4.0),
726+
(same_start_time, 5.0),
727+
(same_start_time + epsilon / 3, 6.0), # Very close start (within epsilon)
728+
]
729+
730+
distribution_summary = DistributionSummary.from_request_times(
731+
requests, distribution_type="concurrency", epsilon=epsilon
732+
)
733+
734+
# All requests start at the same time (or within epsilon), so they should
735+
# all be considered concurrent from the start
736+
# Expected timeline:
737+
# - t=1.0-3.0: 4 concurrent requests
738+
# - t=3.0-4.0: 3 concurrent requests
739+
# - t=4.0-5.0: 2 concurrent requests
740+
# - t=5.0-6.0: 1 concurrent request
741+
742+
assert distribution_summary.max == 4.0 # All 4 requests concurrent at start
743+
assert distribution_summary.min == 1.0 # 1 request still running at the end
744+
745+
746+
@pytest.mark.sanity
747+
def test_distribution_summary_concurrency_epsilon_edge_case():
748+
"""Test the exact epsilon boundary condition.
749+
750+
### WRITTEN BY AI ###
751+
"""
752+
epsilon = 1e-6
753+
754+
# Test requests that are exactly epsilon apart - should be merged
755+
requests_exactly_epsilon = [
756+
(1.0, 2.0),
757+
(1.0 + epsilon, 2.5), # Exactly epsilon apart
758+
(2.0, 2.5), # Another close request
759+
]
760+
761+
dist_epsilon = DistributionSummary.from_request_times(
762+
requests_exactly_epsilon, distribution_type="concurrency", epsilon=epsilon
763+
)
764+
765+
# Should be treated as concurrent (merged events)
766+
assert dist_epsilon.max == 2.0
767+
assert dist_epsilon.min == 2.0
768+
769+
# Test requests that are just over epsilon apart - should NOT be merged
770+
requests_over_epsilon = [
771+
(1.0, 2.0),
772+
(1.0 + epsilon * 1.1, 2.5), # Just over epsilon apart
773+
(2.0, 2.5), # Another close request
774+
]
775+
776+
dist_over_epsilon = DistributionSummary.from_request_times(
777+
requests_over_epsilon, distribution_type="concurrency", epsilon=epsilon
778+
)
779+
780+
# These should be treated separately, so max concurrency depends on overlap
781+
# At t=1.0 to 1.0+epsilon*1.1: 1 concurrent
782+
# At t=1.0+epsilon*1.1 to 2.0: 2 concurrent
783+
# At t=2.0 to 2.5: 1 concurrent
784+
assert dist_over_epsilon.max == 2.0
785+
assert dist_over_epsilon.min == 1.0

0 commit comments

Comments (0)