Fix some edge cases and resolve a floating-point precision problem.

shunping · shunping · commit fd153a501fc5 · 2025-06-27T01:07:53.000-04:00
Add tests.
diff --git a/sdks/python/apache_beam/transforms/periodicsequence.py b/sdks/python/apache_beam/transforms/periodicsequence.py
@@ -21,7 +21,6 @@
 from typing import Any
 from typing import Optional
 from typing import Sequence
-from typing import Union
 
 import apache_beam as beam
 from apache_beam.io.restriction_trackers import OffsetRange
@@ -41,13 +40,21 @@ class ImpulseSeqGenRestrictionProvider(core.RestrictionProvider):
   def initial_restriction(self, element):
     start, end, interval = element
     if isinstance(start, Timestamp):
-      start = start.micros / 1000000
+      start_micros = start.micros
+    else:
+      start_micros = round(start * 1000000)
+
     if isinstance(end, Timestamp):
-      end = end.micros / 1000000
+      end_micros = end.micros
+    else:
+      end_micros = round(end * 1000000)
+
+    interval_micros = round(interval * 1000000)
 
-    assert start <= end
+    assert start_micros <= end_micros
     assert interval > 0
-    total_outputs = math.ceil((end - start) / interval)
+    delta_micros: int = end_micros - start_micros
+    total_outputs = math.ceil(delta_micros / interval_micros)
     return OffsetRange(0, total_outputs)
 
   def create_tracker(self, restriction):
@@ -232,19 +239,19 @@ def _validate_and_adjust_duration(self):
 
     if isinstance(self.stop_ts, Timestamp):
       if self.stop_ts == MAX_TIMESTAMP:
-        # adjust stop timestamp to match the data duration
-        end = start + data_duration
-        if self.interval > 1e-6:
-          end += 1e-6
-        self.stop_ts = Timestamp.of(end)
+        # When the stop timestamp is unbounded (MAX_TIMESTAMP), set it to the
+        # data's actual end time plus an extra fire interval, because the
+        # impulse duration's upper bound is exclusive.
+        end = start + data_duration + self.interval
+        self.stop_ts = Timestamp(micros=end * 1000000)
       else:
         end = self.stop_ts.micros / 1000000
     else:
       end = self.stop_ts
 
     # The total time for the impulse signal which occurs in [start, end).
     impulse_duration = end - start
-    if data_duration + self.interval < impulse_duration:
+    if round(data_duration + self.interval, 6) < round(impulse_duration, 6):
       # We don't have enough data for the impulse.
       # If we can fit at least one more data point in the impulse duration,
       # then we will be in the repeat mode.
@@ -264,8 +271,8 @@ def _validate_and_adjust_duration(self):
 
   def __init__(
       self,
-      start_timestamp: Union[Timestamp, float] = Timestamp.now(),
-      stop_timestamp: Union[Timestamp, float] = MAX_TIMESTAMP,
+      start_timestamp: Timestamp = Timestamp.now(),
+      stop_timestamp: Timestamp = MAX_TIMESTAMP,
       fire_interval: float = 360.0,
       apply_windowing: bool = False,
       data: Optional[Sequence[Any]] = None):
diff --git a/sdks/python/apache_beam/transforms/periodicsequence_test.py b/sdks/python/apache_beam/transforms/periodicsequence_test.py
@@ -19,10 +19,14 @@
 
 # pytype: skip-file
 
+import logging
 import inspect
+import random
 import time
 import unittest
 
+from parameterized import parameterized
+
 import apache_beam as beam
 from apache_beam.io.restriction_trackers import OffsetRange
 from apache_beam.testing.test_pipeline import TestPipeline
@@ -157,6 +161,53 @@ def test_processing_time(self):
       expected = [0, 2, 4]
       assert_that(ret, equal_to(expected, lambda x, y: abs(x - y) < threshold))
 
+  @parameterized.expand([0.5, 1, 2, 10])
+  def test_stop_over_by_epsilon(self, interval):
+    with TestPipeline() as p:
+      ret = (
+          p | PeriodicImpulse(
+              start_timestamp=Timestamp(seconds=1),
+              stop_timestamp=Timestamp(seconds=1, micros=1),
+              data=[1, 2],
+              fire_interval=interval)
+          | beam.WindowInto(FixedWindows(interval))
+          | beam.WithKeys(0)
+          | beam.GroupByKey())
+      expected = [
+          (0, [1]),
+      ]
+      assert_that(ret, equal_to(expected))
+
+  @parameterized.expand([1, 2])
+  def test_stop_over_by_interval(self, interval):
+    with TestPipeline() as p:
+      ret = (
+          p | PeriodicImpulse(
+              start_timestamp=Timestamp(seconds=1),
+              stop_timestamp=Timestamp(seconds=1 + interval),
+              data=[1, 2],
+              fire_interval=interval)
+          | beam.WindowInto(FixedWindows(interval))
+          | beam.WithKeys(0)
+          | beam.GroupByKey())
+      expected = [(0, [1])]
+      assert_that(ret, equal_to(expected))
+
+  @parameterized.expand([1, 2])
+  def test_stop_over_by_interval_and_epsilon(self, interval):
+    with TestPipeline() as p:
+      ret = (
+          p | PeriodicImpulse(
+              start_timestamp=Timestamp(seconds=1),
+              stop_timestamp=Timestamp(seconds=1 + interval, micros=1),
+              data=[1, 2],
+              fire_interval=interval)
+          | beam.WindowInto(FixedWindows(interval))
+          | beam.WithKeys(0)
+          | beam.GroupByKey())
+      expected = [(0, [1]), (0, [2])]
+      assert_that(ret, equal_to(expected))
+
   def test_interval(self):
     with TestPipeline() as p:
       ret = (
@@ -208,15 +259,22 @@ def test_not_enough_timestamped_value(self):
                 data=data,
                 fire_interval=0.5))
 
-  def test_small_interval(self):
-    data = [(Timestamp(1), 1), (Timestamp(2), 2), (Timestamp(3), 3),
-            (Timestamp(6), 6), (Timestamp(4), 4), (Timestamp(5), 5),
-            (Timestamp(7), 7), (Timestamp(8), 8), (Timestamp(9), 9),
-            (Timestamp(10), 10)]
-    expected = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-    with TestPipeline() as p:
-      ret = (p | PeriodicImpulse(data=data, fire_interval=0.0001))
-      assert_that(ret, equal_to(expected))
+  def test_fuzzy_interval(self):
+    seed = int(time.time() * 1000)
+    times = 30
+    logging.warning("random seed=%d", seed)
+    random.seed(seed)
+    for _ in range(times):
+      n = int(random.randint(1, 100))
+      data = list(range(n))
+      m = random.randint(1, 1000)
+      interval = m / 1e6
+      now = Timestamp.now()
+      with TestPipeline() as p:
+        ret = (
+            p | PeriodicImpulse(
+                start_timestamp=now, data=data, fire_interval=interval))
+        assert_that(ret, equal_to(data))
 
 
 if __name__ == '__main__':