From b043010796b6fcb2c20e30edb0879c13e97cfe25 Mon Sep 17 00:00:00 2001 From: Andres Rios Tascon Date: Wed, 5 Nov 2025 10:02:22 -0500 Subject: [PATCH 1/2] Also return expected dtype --- src/uproot/const.py | 8 ++++---- src/uproot/models/RNTuple.py | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/uproot/const.py b/src/uproot/const.py index 80af0e62e..9fb20db63 100644 --- a/src/uproot/const.py +++ b/src/uproot/const.py @@ -138,8 +138,8 @@ 0x0B: "float16", 0x0C: "float32", 0x0D: "float64", - 0x0E: "uint32", # Index32 - 0x0F: "uint64", # Index64 + 0x0E: "int32", # Index32 + 0x0F: "int64", # Index64 0x10: "switch", # Switch: (uint64, uint32) 0x11: "int16", # SplitInt16: split + zigzag encoding 0x12: "uint16", # SplitUInt16: split encoding @@ -150,8 +150,8 @@ 0x17: "float16", # SplitReal16: split encoding 0x18: "float32", # SplitReal32: split encoding 0x19: "float64", # SplitReal64: split encoding - 0x1A: "uint32", # SplitIndex32: split + delta encoding - 0x1B: "uint64", # SplitIndex64: split + delta encoding + 0x1A: "int32", # SplitIndex32: split + delta encoding + 0x1B: "int64", # SplitIndex64: split + delta encoding 0x1C: "real32trunc", # Real32Trunc: float32 with truncated mantissa 0x1D: "real32quant", # Real32Quant: float32 with quantized integer representation } diff --git a/src/uproot/models/RNTuple.py b/src/uproot/models/RNTuple.py index 557832261..51aad4619 100644 --- a/src/uproot/models/RNTuple.py +++ b/src/uproot/models/RNTuple.py @@ -650,7 +650,7 @@ def read_page( ] self.deserialize_page_decompressed_buffer(destination, field_metadata) - def _expected_array_length_and_starts( + def _expected_array_length_starts_dtype( self, col_idx, cluster_start, cluster_stop, missing_element_padding=0 ): """ @@ -660,7 +660,7 @@ def _expected_array_length_and_starts( cluster_stop (int): The first cluster to exclude (i.e. one greater than the last cluster to include). missing_element_padding (int): Number of padding elements to add at the start of the array. - Returns the expected length of the array over the given cluster range, including padding, and also the start indices of each cluster. + Returns the expected length of the array over the given cluster range (including padding), the start indices of each cluster, and the dtype of the array. """ field_metadata = self.get_field_metadata(col_idx) if field_metadata.dtype_byte in uproot.const.rntuple_index_types: @@ -690,7 +690,7 @@ def _expected_array_length_and_starts( starts.append(total_length) total_length += cluster_length - return total_length, starts + return total_length, starts, field_metadata.dtype_result def read_cluster_range( self, @@ -711,7 +711,7 @@ def read_cluster_range( Returns a numpy array with the data from the column. 
""" field_metadata = self.get_field_metadata(col_idx) - total_length, starts = self._expected_array_length_and_starts( + total_length, starts, _ = self._expected_array_length_starts_dtype( col_idx, cluster_start, cluster_stop, missing_element_padding ) res = numpy.empty(total_length, field_metadata.dtype_result) @@ -964,7 +964,7 @@ def gpu_deserialize_decompressed_content( n_padding = self.column_records[key_nr].first_element_index n_padding -= cluster_starts[start_cluster_idx] n_padding = max(n_padding, 0) - total_length, starts = self._expected_array_length_and_starts( + total_length, starts, _ = self._expected_array_length_starts_dtype( ncol, start_cluster_idx, stop_cluster_idx, n_padding ) field_metadata = self.get_field_metadata(ncol) From 5dc521d2e38f03eb4645438457e4b64951fb17a9 Mon Sep 17 00:00:00 2001 From: Andres Rios Tascon Date: Wed, 5 Nov 2025 15:05:40 -0500 Subject: [PATCH 2/2] First working draft of virtual arrays --- src/uproot/behaviors/RNTuple.py | 157 ++++++++++++++++++++++++++------ src/uproot/models/RNTuple.py | 8 +- 2 files changed, 134 insertions(+), 31 deletions(-) diff --git a/src/uproot/behaviors/RNTuple.py b/src/uproot/behaviors/RNTuple.py index 15399ba5c..fb8ee268e 100644 --- a/src/uproot/behaviors/RNTuple.py +++ b/src/uproot/behaviors/RNTuple.py @@ -15,6 +15,7 @@ import sys import warnings from collections.abc import Mapping +from functools import partial import numpy @@ -622,6 +623,7 @@ def arrays( interpreter="cpu", ak_add_doc=False, how=None, + virtual=False, # For compatibility reasons we also accepts kwargs meant for TTrees interpretation_executor=None, filter_branch=unset, @@ -677,6 +679,7 @@ def arrays( ``list``, and ``dict``. Note that the container *type itself* must be passed as ``how``, not an instance of that type (i.e. ``how=tuple``, not ``how=()``). + virtual (bool): If True, return virtual Awkward arrays, meaning that the data will not be loaded into memory until it is accessed. interpretation_executor (None): This argument is not used and is only included for now for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used and will be removed in a future version. @@ -759,22 +762,40 @@ def arrays( n_padding = self.ntuple.column_records[key_nr].first_element_index n_padding -= cluster_starts[start_cluster_idx] n_padding = max(n_padding, 0) + dtype = None if interpreter == "cpu": - content = self.ntuple.read_cluster_range( + content_generator = partial( + self.ntuple.read_cluster_range, key_nr, start_cluster_idx, stop_cluster_idx, missing_element_padding=n_padding, array_cache=array_cache, ) + if virtual: + total_length, _, dtype = ( + self.ntuple._expected_array_length_starts_dtype( + key_nr, + start_cluster_idx, + stop_cluster_idx, + missing_element_padding=n_padding, + ) + ) + if "cardinality" in key: + total_length -= 1 + content = (total_length, content_generator) + else: + content = content_generator() elif interpreter == "gpu" and backend == "cuda": content = content_dict[key_nr] elif interpreter == "gpu": raise NotImplementedError( f"Backend {backend} GDS support not implemented." 
) + else: + raise NotImplementedError(f"Backend {backend} not implemented.") dtype_byte = self.ntuple.column_records[key_nr].type - _fill_container_dict(container_dict, content, key, dtype_byte) + _fill_container_dict(container_dict, content, key, dtype_byte, dtype) cluster_offset = cluster_starts[start_cluster_idx] entry_start -= cluster_offset @@ -1778,36 +1799,116 @@ def _cupy_insert(arr, obj, value): return out -def _fill_container_dict(container_dict, content, key, dtype_byte): - array_library_string = uproot._util.get_array_library(content) +def _fill_container_dict(container_dict, content, key, dtype_byte, dtype): + from awkward._nplikes.numpy import Numpy + from awkward._nplikes.virtual import VirtualNDArray + + if isinstance(content, tuple): + # Virtual arrays not yet implemented for GPU + array_library_string = "numpy" + virtual = True + length = int(content[0]) + raw_generator = content[1] + else: + virtual = False + array_library_string = uproot._util.get_array_library(content) + length = len(content) + + def raw_generator(): + return content library = numpy if array_library_string == "numpy" else uproot.extras.cupy() if "cardinality" in key: - content = library.diff(content) - - if "optional" in key: - # We need to convert from a ListOffsetArray to an IndexedOptionArray - diff = library.diff(content) - missing = library.nonzero(diff == 0)[0] - missing -= library.arange(len(missing), dtype=missing.dtype) - dtype = "int64" if content.dtype == library.uint64 else "int32" - indices = library.arange(len(content) - len(missing), dtype=dtype) - if array_library_string == "numpy": - indices = numpy.insert(indices, missing, -1) + + def generator(): + materialized = raw_generator() + materialized = library.diff(materialized) + return materialized + + if virtual: + virtual_array = VirtualNDArray( + Numpy.instance(), shape=(length,), dtype=dtype, generator=generator + ) + container_dict[f"{key}-data"] = virtual_array else: - indices = _cupy_insert(indices, missing, -1) - container_dict[f"{key}-index"] = indices + container_dict[f"{key}-data"] = generator() + + elif "optional" in key: + + def generator(): + # We need to convert from a ListOffsetArray to an IndexedOptionArray + materialized = raw_generator() + diff = library.diff(materialized) + missing = library.nonzero(diff == 0)[0] + missing -= library.arange(len(missing), dtype=missing.dtype) + dtype = "int64" if materialized.dtype == library.int64 else "int32" + indices = library.arange(len(materialized) - len(missing), dtype=dtype) + if array_library_string == "numpy": + indices = numpy.insert(indices, missing, -1) + else: + indices = _cupy_insert(indices, missing, -1) + return indices[:-1] # We need to delete the last index + + if virtual: + virtual_array = VirtualNDArray( + Numpy.instance(), shape=(length - 1,), dtype=dtype, generator=generator + ) + container_dict[f"{key}-index"] = virtual_array + else: + container_dict[f"{key}-index"] = generator() + elif dtype_byte == uproot.const.rntuple_col_type_to_num_dict["switch"]: - tags = content["tag"].astype(numpy.int8) - kindex = content["index"] - # Find invalid variants and adjust buffers accordingly - invalid = numpy.flatnonzero(tags == 0) - kindex[invalid] = 0 # Might not be necessary, but safer - container_dict[f"{key}-index"] = library.array(kindex) - container_dict[f"{key}-tags"] = library.array(tags) - container_dict["nones-index"] = library.array([-1], dtype=numpy.int64) + + def tag_generator(): + content = raw_generator() + return content["tag"].astype(numpy.int8) + + def 
index_generator(): + content = raw_generator() + tags = content["tag"].astype(numpy.int8) + kindex = content["index"] + # Find invalid variants and adjust buffers accordingly + invalid = numpy.flatnonzero(tags == 0) + kindex[invalid] = 0 # Might not be necessary, but safer + return kindex + + def nones_index_generator(): + return library.array([-1], dtype=numpy.int64) + + if virtual: + tag_virtual_array = VirtualNDArray( + Numpy.instance(), + shape=(length,), + dtype=numpy.int8, + generator=tag_generator, + ) + container_dict[f"{key}-tags"] = tag_virtual_array + index_virtual_array = VirtualNDArray( + Numpy.instance(), + shape=(length,), + dtype=numpy.int64, + generator=index_generator, + ) + container_dict[f"{key}-index"] = index_virtual_array + nones_index_virtual_array = VirtualNDArray( + Numpy.instance(), + shape=(1,), + dtype=numpy.int64, + generator=nones_index_generator, + ) + container_dict["nones-index"] = nones_index_virtual_array + else: + container_dict[f"{key}-tags"] = tag_generator() + container_dict[f"{key}-index"] = index_generator() + container_dict["nones-index"] = nones_index_generator() else: - # don't distinguish data and offsets - container_dict[f"{key}-data"] = content - container_dict[f"{key}-offsets"] = content + if virtual: + virtual_array = VirtualNDArray( + Numpy.instance(), shape=(length,), dtype=dtype, generator=raw_generator + ) + container_dict[f"{key}-data"] = virtual_array + container_dict[f"{key}-offsets"] = virtual_array + else: + container_dict[f"{key}-data"] = content + container_dict[f"{key}-offsets"] = content diff --git a/src/uproot/models/RNTuple.py b/src/uproot/models/RNTuple.py index 51aad4619..3da958f12 100644 --- a/src/uproot/models/RNTuple.py +++ b/src/uproot/models/RNTuple.py @@ -711,10 +711,10 @@ def read_cluster_range( Returns a numpy array with the data from the column. """ field_metadata = self.get_field_metadata(col_idx) - total_length, starts, _ = self._expected_array_length_starts_dtype( + total_length, starts, dtype = self._expected_array_length_starts_dtype( col_idx, cluster_start, cluster_stop, missing_element_padding ) - res = numpy.empty(total_length, field_metadata.dtype_result) + res = numpy.empty(total_length, dtype) # Initialize the padding elements. Note that it might be different from missing_element_padding # because for offsets there is an extra zero added at the start. assert len(starts) > 0, "The cluster range is invalid" @@ -726,7 +726,7 @@ def read_cluster_range( cluster_idx, col_idx, field_metadata, - destination=res[starts[i] : stop], + destination=res[starts[i] : stop].view(field_metadata.dtype), array_cache=array_cache, ) @@ -1169,6 +1169,8 @@ def get_field_metadata(self, ncol): "std::string" ): dtype_result = dtype + elif dtype_byte in uproot.const.rntuple_custom_float_types: + dtype_result = numpy.float32 else: dtype_result = numpy.result_type(*alt_dtype_list) field_metadata = FieldClusterMetadata(
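For context on the Index32/Index64 change in const.py and the diff()-based branches in _fill_container_dict: awkward's Index32/Index64 and IndexedOptionArray buffers are signed, so decoding these columns as int32/int64 keeps -1 available to mark missing entries, and numpy.diff over the offsets yields the per-entry counts needed for cardinality columns. A small numpy illustration with made-up values:

    import numpy

    # ListOffsetArray-style offsets for entries with 3, 0, 4 and 3 elements
    offsets = numpy.array([0, 3, 3, 7, 10], dtype=numpy.int64)
    counts = numpy.diff(offsets)  # -> array([3, 0, 4, 3]), the cardinality values

    # Signed index buffer as used by IndexedOptionArray: -1 marks a missing entry,
    # which is what the "optional" branch inserts via numpy.insert(indices, missing, -1).
    index = numpy.array([0, -1, 1, 2], dtype=numpy.int64)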
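The lazy path in _fill_container_dict wraps each buffer in a VirtualNDArray whose generator performs the actual read. A minimal sketch of that pattern, assuming the awkward-internal VirtualNDArray/Numpy nplike API exactly as the patch imports it (the constructor call mirrors the patch and may differ between awkward versions); make_column is a hypothetical stand-in for the partial(read_cluster_range, ...) generator:

    import numpy
    from awkward._nplikes.numpy import Numpy
    from awkward._nplikes.virtual import VirtualNDArray

    def make_column():
        # Stand-in for read_cluster_range: the real generator reads, decompresses
        # and deserializes the column's pages only when this is first called.
        return numpy.arange(10, dtype=numpy.int64)

    # shape and dtype come from _expected_array_length_starts_dtype, which is why
    # the first commit also returns the expected dtype.
    buffer = VirtualNDArray(
        Numpy.instance(), shape=(10,), dtype=numpy.int64, generator=make_column
    )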
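From the user side, the new keyword is just virtual=True on arrays(). A hedged usage sketch; the file name and field name are placeholders:

    import uproot

    with uproot.open("example.root") as f:
        events = f["Events"]                  # an RNTuple
        arrays = events.arrays(virtual=True)  # buffers are virtual, nothing is read yet
        pt = arrays["pt"]                     # column data is materialized only when actually needed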