Skip to content

Commit f392613

Browse files
committed
Some progress with virtual arrays
1 parent a0d52f0 commit f392613

File tree

3 files changed

+106
-29
lines changed

3 files changed

+106
-29
lines changed

src/uproot/behaviors/RNTuple.py

Lines changed: 97 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import sys
1616
import warnings
1717
from collections.abc import Mapping
18+
from functools import partial
1819

1920
import numpy
2021

@@ -622,6 +623,7 @@ def arrays(
622623
interpreter="cpu",
623624
ak_add_doc=False,
624625
how=None,
626+
virtual=True,
625627
# For compatibility reasons we also accept kwargs meant for TTrees
626628
interpretation_executor=None,
627629
filter_branch=unset,
@@ -677,6 +679,7 @@ def arrays(
677679
``list``, and ``dict``. Note that the container *type itself*
678680
must be passed as ``how``, not an instance of that type (i.e.
679681
``how=tuple``, not ``how=()``).
682+
virtual (bool): If True, return virtual Awkward arrays, meaning that the data will not be loaded into memory until it is accessed.
680683
interpretation_executor (None): This argument is not used and is only included for now
681684
for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used
682685
and will be removed in a future version.
@@ -759,22 +762,40 @@ def arrays(
759762
n_padding = self.ntuple.column_records[key_nr].first_element_index
760763
n_padding -= cluster_starts[start_cluster_idx]
761764
n_padding = max(n_padding, 0)
765+
dtype = None
762766
if interpreter == "cpu":
763-
content = self.ntuple.read_cluster_range(
767+
content_generator = partial(
768+
self.ntuple.read_cluster_range,
764769
key_nr,
765770
start_cluster_idx,
766771
stop_cluster_idx,
767772
missing_element_padding=n_padding,
768773
array_cache=array_cache,
769774
)
775+
if virtual:
776+
total_length, _, dtype = (
777+
self.ntuple._expected_array_length_starts_dtype(
778+
key_nr,
779+
start_cluster_idx,
780+
stop_cluster_idx,
781+
missing_element_padding=n_padding,
782+
)
783+
)
784+
if "cardinality" in key:
785+
total_length -= 1
786+
content = (total_length, content_generator)
787+
else:
788+
content = content_generator()
770789
elif interpreter == "gpu" and backend == "cuda":
771790
content = content_dict[key_nr]
772791
elif interpreter == "gpu":
773792
raise NotImplementedError(
774793
f"Backend {backend} GDS support not implemented."
775794
)
795+
else:
796+
raise NotImplementedError(f"Backend {backend} not implemented.")
776797
dtype_byte = self.ntuple.column_records[key_nr].type
777-
_fill_container_dict(container_dict, content, key, dtype_byte)
798+
_fill_container_dict(container_dict, content, key, dtype_byte, dtype)
778799

779800
cluster_offset = cluster_starts[start_cluster_idx]
780801
entry_start -= cluster_offset
@@ -1779,27 +1800,75 @@ def _cupy_insert(arr, obj, value):
17791800
return out
17801801

17811802

1782-
def _fill_container_dict(container_dict, content, key, dtype_byte):
1783-
array_library_string = uproot._util.get_array_library(content)
1803+
def _fill_container_dict(container_dict, content, key, dtype_byte, dtype):
1804+
from awkward._nplikes.numpy import Numpy
1805+
from awkward._nplikes.virtual import VirtualNDArray
1806+
1807+
if type(content) == tuple:
1808+
# Virtual arrays not yet implemented for GPU
1809+
array_library_string = "numpy"
1810+
virtual = True
1811+
length = int(content[0])
1812+
raw_generator = content[1]
1813+
else:
1814+
virtual = False
1815+
array_library_string = uproot._util.get_array_library(content)
17841816

17851817
library = numpy if array_library_string == "numpy" else uproot.extras.cupy()
17861818

17871819
if "cardinality" in key:
1788-
content = library.diff(content)
1789-
1790-
if "optional" in key:
1791-
# We need to convert from a ListOffsetArray to an IndexedOptionArray
1792-
diff = library.diff(content)
1793-
missing = library.nonzero(diff == 0)[0]
1794-
missing -= library.arange(len(missing), dtype=missing.dtype)
1795-
dtype = "int64" if content.dtype == library.uint64 else "int32"
1796-
indices = library.arange(len(content) - len(missing), dtype=dtype)
1797-
if array_library_string == "numpy":
1798-
indices = numpy.insert(indices, missing, -1)
1820+
if virtual:
1821+
1822+
def generator():
1823+
materialized = raw_generator()
1824+
materialized = library.diff(materialized)
1825+
return materialized
1826+
1827+
virtual_array = VirtualNDArray(
1828+
Numpy.instance(), shape=(length,), dtype=dtype, generator=generator
1829+
)
1830+
container_dict[f"{key}-data"] = generator
17991831
else:
1800-
indices = _cupy_insert(indices, missing, -1)
1801-
container_dict[f"{key}-index"] = indices
1832+
content = library.diff(content)
1833+
container_dict[f"{key}-data"] = content
1834+
elif "optional" in key:
1835+
if virtual:
1836+
1837+
def generator():
1838+
# We need to convert from a ListOffsetArray to an IndexedOptionArray
1839+
materialized = raw_generator()
1840+
diff = library.diff(materialized)
1841+
missing = library.nonzero(diff == 0)[0]
1842+
missing -= library.arange(len(missing), dtype=missing.dtype)
1843+
dtype = "int64" if materialized.dtype == library.uint64 else "int32"
1844+
indices = library.arange(len(materialized) - len(missing), dtype=dtype)
1845+
if array_library_string == "numpy":
1846+
indices = numpy.insert(indices, missing, -1)
1847+
else:
1848+
indices = _cupy_insert(indices, missing, -1)
1849+
return indices
1850+
1851+
virtual_array = VirtualNDArray(
1852+
Numpy.instance(), shape=(length,), dtype=dtype, generator=generator
1853+
)
1854+
container_dict[f"{key}-index"] = generator
1855+
else:
1856+
# We need to convert from a ListOffsetArray to an IndexedOptionArray
1857+
diff = library.diff(content)
1858+
missing = library.nonzero(diff == 0)[0]
1859+
missing -= library.arange(len(missing), dtype=missing.dtype)
1860+
dtype = "int64" if content.dtype == library.uint64 else "int32"
1861+
indices = library.arange(len(content) - len(missing), dtype=dtype)
1862+
if array_library_string == "numpy":
1863+
indices = numpy.insert(indices, missing, -1)
1864+
else:
1865+
indices = _cupy_insert(indices, missing, -1)
1866+
container_dict[f"{key}-index"] = indices
18021867
elif dtype_byte == uproot.const.rntuple_col_type_to_num_dict["switch"]:
1868+
if virtual:
1869+
# TODO: Figure out how to handle this one
1870+
content = raw_generator()
1871+
print(f"{length} {len(content)}")
18031872
kindex, tags = uproot.models.RNTuple._split_switch_bits(content)
18041873
# Find invalid variants and adjust buffers accordingly
18051874
invalid = numpy.flatnonzero(tags == -1)
@@ -1816,6 +1885,14 @@ def _fill_container_dict(container_dict, content, key, dtype_byte):
18161885
container_dict[f"{key}-union-index"] = library.array(kindex)
18171886
container_dict[f"{key}-union-tags"] = library.array(tags)
18181887
else:
1819-
# don't distinguish data and offsets
1820-
container_dict[f"{key}-data"] = content
1821-
container_dict[f"{key}-offsets"] = content
1888+
if virtual:
1889+
virtual_array = VirtualNDArray(
1890+
Numpy.instance(), shape=(length,), dtype=dtype, generator=raw_generator
1891+
)
1892+
# don't distinguish data and offsets
1893+
container_dict[f"{key}-data"] = raw_generator
1894+
container_dict[f"{key}-offsets"] = raw_generator
1895+
else:
1896+
# don't distinguish data and offsets
1897+
container_dict[f"{key}-data"] = content
1898+
container_dict[f"{key}-offsets"] = content

src/uproot/const.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,8 @@
138138
0x0B: "float16",
139139
0x0C: "float32",
140140
0x0D: "float64",
141-
0x0E: "uint32", # Index32
142-
0x0F: "uint64", # Index64
141+
0x0E: "int32", # Index32
142+
0x0F: "int64", # Index64
143143
0x10: "switch", # Switch: (uint64, uint32)
144144
0x11: "int16", # SplitInt16: split + zigzag encoding
145145
0x12: "uint16", # SplitUInt16: split encoding
@@ -150,8 +150,8 @@
150150
0x17: "float16", # SplitReal16: split encoding
151151
0x18: "float32", # SplitReal32: split encoding
152152
0x19: "float64", # SplitReal64: split encoding
153-
0x1A: "uint32", # SplitIndex32: split + delta encoding
154-
0x1B: "uint64", # SplitIndex64: split + delta encoding
153+
0x1A: "int32", # SplitIndex32: split + delta encoding
154+
0x1B: "int64", # SplitIndex64: split + delta encoding
155155
0x1C: "real32trunc", # Real32Trunc: float32 with truncated mantissa
156156
0x1D: "real32quant", # Real32Quant: float32 with quantized integer representation
157157
}

src/uproot/models/RNTuple.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -659,7 +659,7 @@ def read_page(
659659
if array_cache is not None:
660660
array_cache[key] = destination.copy()
661661

662-
def _expected_array_length_and_starts(
662+
def _expected_array_length_starts_dtype(
663663
self, col_idx, cluster_start, cluster_stop, missing_element_padding=0
664664
):
665665
"""
@@ -669,7 +669,7 @@ def _expected_array_length_and_starts(
669669
cluster_stop (int): The first cluster to exclude (i.e. one greater than the last cluster to include).
670670
missing_element_padding (int): Number of padding elements to add at the start of the array.
671671
672-
Returns the expected length of the array over the given cluster range, including padding, and also the start indices of each cluster.
672+
Returns the expected length of the array over the given cluster range (including padding), the start indices of each cluster, and the dtype of the array.
673673
"""
674674
field_metadata = self.get_field_metadata(col_idx)
675675
if field_metadata.dtype_byte in uproot.const.rntuple_index_types:
@@ -699,7 +699,7 @@ def _expected_array_length_and_starts(
699699
starts.append(total_length)
700700
total_length += cluster_length
701701

702-
return total_length, starts
702+
return total_length, starts, field_metadata.dtype_result
703703

704704
def read_cluster_range(
705705
self,
@@ -720,7 +720,7 @@ def read_cluster_range(
720720
Returns a numpy array with the data from the column.
721721
"""
722722
field_metadata = self.get_field_metadata(col_idx)
723-
total_length, starts = self._expected_array_length_and_starts(
723+
total_length, starts, _ = self._expected_array_length_starts_dtype(
724724
col_idx, cluster_start, cluster_stop, missing_element_padding
725725
)
726726
res = numpy.empty(total_length, field_metadata.dtype_result)
@@ -958,7 +958,7 @@ def gpu_deserialize_decompressed_content(
958958
n_padding = self.column_records[key_nr].first_element_index
959959
n_padding -= cluster_starts[start_cluster_idx]
960960
n_padding = max(n_padding, 0)
961-
total_length, starts = self._expected_array_length_and_starts(
961+
total_length, starts, _ = self._expected_array_length_starts_dtype(
962962
ncol, start_cluster_idx, stop_cluster_idx, n_padding
963963
)
964964
field_metadata = self.get_field_metadata(ncol)

0 commit comments

Comments
 (0)