Skip to content

Commit 3e274d3

Browse files
authored
Fix errors with metric accumulation (#266)
## Summary <!-- Include a short paragraph of the changes introduced in this PR. If this PR requires additional context or rationale, explain why the changes are necessary. --> Fixes an issue in metric calculation that caused incorrect statistics at extreme changes in concurrency and an issue where the first decode token was not counted in total tokens per second. ## Details <!-- Provide a detailed list of all changes introduced in this pull request. --> - [x] Fixed issue where merged concurrency change events would double-count concurrency - [x] Ensure first decode token is counted when calculating total tokens per second ## Test Plan <!-- List the steps needed to test this PR. --> - Run unit tests: `tox -e test-unit -- -m "regression and sanity"` --- - [x] "I certify that all code in this PR is my own, except as noted below." ## Use of AI - [x] Includes AI-assisted code completion - [ ] Includes code generated by an AI application - [x] Includes AI-generated tests (NOTE: AI written tests should have a docstring that includes `## WRITTEN BY AI ##`) --------- Signed-off-by: Samuel Monson <smonson@redhat.com>
1 parent 1261fe8 commit 3e274d3

File tree

3 files changed

+95
-13
lines changed

3 files changed

+95
-13
lines changed

src/guidellm/benchmark/benchmark.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -817,7 +817,9 @@ def from_stats(
817817
],
818818
iter_counts=[req.output_tokens for req in total_with_output_first],
819819
first_iter_counts=[
820-
req.prompt_tokens for req in total_with_output_first
820+
# prompt tokens + first token
821+
req.prompt_tokens + 1
822+
for req in total_with_output_first
821823
],
822824
),
823825
),

src/guidellm/objects/statistics.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -243,18 +243,9 @@ def from_request_times(
243243
"""
244244
if distribution_type == "concurrency":
245245
# convert to delta changes based on when requests were running
246-
time_deltas: dict[float, int] = defaultdict(int)
247-
for start, end in requests:
248-
time_deltas[start] += 1
249-
time_deltas[end] -= 1
250-
251-
# convert to the events over time measuring concurrency changes
252-
events = []
253-
active = 0
254-
255-
for time, delta in sorted(time_deltas.items()):
256-
active += delta
257-
events.append((time, active))
246+
events = [(start, 1) for start, _ in requests] + [
247+
(end, -1) for _, end in requests
248+
]
258249
elif distribution_type == "rate":
259250
# convert to events for when requests finished
260251
global_start = min(start for start, _ in requests) if requests else 0
@@ -281,6 +272,16 @@ def from_request_times(
281272
else:
282273
flattened_events.append((time, val))
283274

275+
if distribution_type == "concurrency":
276+
# convert to the events over time measuring concurrency changes
277+
events_over_time: list[tuple[float, float]] = []
278+
active = 0
279+
for time, delta in flattened_events:
280+
active += delta # type: ignore [assignment]
281+
events_over_time.append((time, active))
282+
283+
flattened_events = events_over_time
284+
284285
# convert to value distribution function
285286
distribution: dict[float, float] = defaultdict(float)
286287

tests/unit/objects/test_statistics.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -704,3 +704,82 @@ def test_time_running_stats_update():
704704
assert time_running_stats.rate_ms == pytest.approx(
705705
3000 / (time.time() - time_running_stats.start_time), abs=0.1
706706
)
707+
708+
709+
@pytest.mark.regression
710+
def test_distribution_summary_concurrency_double_counting_regression():
711+
"""Specific regression test for the double-counting bug in concurrency calculation.
712+
713+
Before the fix, when events were merged due to epsilon, the deltas were summed
714+
but then the active count wasn't properly accumulated, causing incorrect results.
715+
716+
### WRITTEN BY AI ###
717+
"""
718+
epsilon = 1e-6
719+
720+
# Create a scenario where multiple requests start at exactly the same time
721+
# This should result in events being merged, testing the accumulation logic
722+
same_start_time = 1.0
723+
requests = [
724+
(same_start_time, 3.0),
725+
(same_start_time, 4.0),
726+
(same_start_time, 5.0),
727+
(same_start_time + epsilon / 3, 6.0), # Very close start (within epsilon)
728+
]
729+
730+
distribution_summary = DistributionSummary.from_request_times(
731+
requests, distribution_type="concurrency", epsilon=epsilon
732+
)
733+
734+
# All requests start at the same time (or within epsilon), so they should
735+
# all be considered concurrent from the start
736+
# Expected timeline:
737+
# - t=1.0-3.0: 4 concurrent requests
738+
# - t=3.0-4.0: 3 concurrent requests
739+
# - t=4.0-5.0: 2 concurrent requests
740+
# - t=5.0-6.0: 1 concurrent request
741+
742+
assert distribution_summary.max == 4.0 # All 4 requests concurrent at start
743+
assert distribution_summary.min == 1.0 # 1 request still running at the end
744+
745+
746+
@pytest.mark.sanity
747+
def test_distribution_summary_concurrency_epsilon_edge_case():
748+
"""Test the exact epsilon boundary condition.
749+
750+
### WRITTEN BY AI ###
751+
"""
752+
epsilon = 1e-6
753+
754+
# Test requests that are exactly epsilon apart - should be merged
755+
requests_exactly_epsilon = [
756+
(1.0, 2.0),
757+
(1.0 + epsilon, 2.5), # Exactly epsilon apart
758+
(2.0, 2.5), # Another close request
759+
]
760+
761+
dist_epsilon = DistributionSummary.from_request_times(
762+
requests_exactly_epsilon, distribution_type="concurrency", epsilon=epsilon
763+
)
764+
765+
# Should be treated as concurrent (merged events)
766+
assert dist_epsilon.max == 2.0
767+
assert dist_epsilon.min == 2.0
768+
769+
# Test requests that are just over epsilon apart - should NOT be merged
770+
requests_over_epsilon = [
771+
(1.0, 2.0),
772+
(1.0 + epsilon * 1.1, 2.5), # Just over epsilon apart
773+
(2.0, 2.5), # Another close request
774+
]
775+
776+
dist_over_epsilon = DistributionSummary.from_request_times(
777+
requests_over_epsilon, distribution_type="concurrency", epsilon=epsilon
778+
)
779+
780+
# These should be treated separately, so max concurrency depends on overlap
781+
# At t=1.0 to 1.0+epsilon*1.1: 1 concurrent
782+
# At t=1.0+epsilon*1.1 to 2.0: 2 concurrent
783+
# At t=2.0 to 2.5: 1 concurrent
784+
assert dist_over_epsilon.max == 2.0
785+
assert dist_over_epsilon.min == 1.0

0 commit comments

Comments (0)