Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 129 additions & 28 deletions src/uproot/behaviors/RNTuple.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import sys
import warnings
from collections.abc import Mapping
from functools import partial

import numpy

Expand Down Expand Up @@ -622,6 +623,7 @@ def arrays(
interpreter="cpu",
ak_add_doc=False,
how=None,
virtual=False,
# For compatibility reasons we also accepts kwargs meant for TTrees
interpretation_executor=None,
filter_branch=unset,
Expand Down Expand Up @@ -677,6 +679,7 @@ def arrays(
``list``, and ``dict``. Note that the container *type itself*
must be passed as ``how``, not an instance of that type (i.e.
``how=tuple``, not ``how=()``).
virtual (bool): If True, return virtual Awkward arrays, meaning that the data will not be loaded into memory until it is accessed.
interpretation_executor (None): This argument is not used and is only included for now
for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used
and will be removed in a future version.
Expand Down Expand Up @@ -759,22 +762,40 @@ def arrays(
n_padding = self.ntuple.column_records[key_nr].first_element_index
n_padding -= cluster_starts[start_cluster_idx]
n_padding = max(n_padding, 0)
dtype = None
if interpreter == "cpu":
content = self.ntuple.read_cluster_range(
content_generator = partial(
self.ntuple.read_cluster_range,
key_nr,
start_cluster_idx,
stop_cluster_idx,
missing_element_padding=n_padding,
array_cache=array_cache,
)
if virtual:
total_length, _, dtype = (
self.ntuple._expected_array_length_starts_dtype(
key_nr,
start_cluster_idx,
stop_cluster_idx,
missing_element_padding=n_padding,
)
)
if "cardinality" in key:
total_length -= 1
content = (total_length, content_generator)
else:
content = content_generator()
elif interpreter == "gpu" and backend == "cuda":
content = content_dict[key_nr]
elif interpreter == "gpu":
raise NotImplementedError(
f"Backend {backend} GDS support not implemented."
)
else:
raise NotImplementedError(f"Backend {backend} not implemented.")
dtype_byte = self.ntuple.column_records[key_nr].type
_fill_container_dict(container_dict, content, key, dtype_byte)
_fill_container_dict(container_dict, content, key, dtype_byte, dtype)

cluster_offset = cluster_starts[start_cluster_idx]
entry_start -= cluster_offset
Expand Down Expand Up @@ -1778,36 +1799,116 @@ def _cupy_insert(arr, obj, value):
return out


def _fill_container_dict(container_dict, content, key, dtype_byte):
array_library_string = uproot._util.get_array_library(content)
def _fill_container_dict(container_dict, content, key, dtype_byte, dtype):
from awkward._nplikes.numpy import Numpy
from awkward._nplikes.virtual import VirtualNDArray

if isinstance(content, tuple):
# Virtual arrays not yet implemented for GPU
array_library_string = "numpy"
virtual = True
length = int(content[0])
raw_generator = content[1]
else:
virtual = False
array_library_string = uproot._util.get_array_library(content)
length = len(content)

def raw_generator():
return content

library = numpy if array_library_string == "numpy" else uproot.extras.cupy()

if "cardinality" in key:
content = library.diff(content)

if "optional" in key:
# We need to convert from a ListOffsetArray to an IndexedOptionArray
diff = library.diff(content)
missing = library.nonzero(diff == 0)[0]
missing -= library.arange(len(missing), dtype=missing.dtype)
dtype = "int64" if content.dtype == library.uint64 else "int32"
indices = library.arange(len(content) - len(missing), dtype=dtype)
if array_library_string == "numpy":
indices = numpy.insert(indices, missing, -1)

def generator():
materialized = raw_generator()
materialized = library.diff(materialized)
return materialized

if virtual:
virtual_array = VirtualNDArray(
Numpy.instance(), shape=(length,), dtype=dtype, generator=generator
)
container_dict[f"{key}-data"] = virtual_array
else:
indices = _cupy_insert(indices, missing, -1)
container_dict[f"{key}-index"] = indices
container_dict[f"{key}-data"] = generator()

elif "optional" in key:

def generator():
# We need to convert from a ListOffsetArray to an IndexedOptionArray
materialized = raw_generator()
diff = library.diff(materialized)
missing = library.nonzero(diff == 0)[0]
missing -= library.arange(len(missing), dtype=missing.dtype)
dtype = "int64" if materialized.dtype == library.int64 else "int32"
indices = library.arange(len(materialized) - len(missing), dtype=dtype)
if array_library_string == "numpy":
indices = numpy.insert(indices, missing, -1)
else:
indices = _cupy_insert(indices, missing, -1)
return indices[:-1] # We need to delete the last index

if virtual:
virtual_array = VirtualNDArray(
Numpy.instance(), shape=(length - 1,), dtype=dtype, generator=generator
)
container_dict[f"{key}-index"] = virtual_array
else:
container_dict[f"{key}-index"] = generator()

elif dtype_byte == uproot.const.rntuple_col_type_to_num_dict["switch"]:
tags = content["tag"].astype(numpy.int8)
kindex = content["index"]
# Find invalid variants and adjust buffers accordingly
invalid = numpy.flatnonzero(tags == 0)
kindex[invalid] = 0 # Might not be necessary, but safer
container_dict[f"{key}-index"] = library.array(kindex)
container_dict[f"{key}-tags"] = library.array(tags)
container_dict["nones-index"] = library.array([-1], dtype=numpy.int64)

def tag_generator():
content = raw_generator()
return content["tag"].astype(numpy.int8)

def index_generator():
content = raw_generator()
tags = content["tag"].astype(numpy.int8)
kindex = content["index"]
# Find invalid variants and adjust buffers accordingly
invalid = numpy.flatnonzero(tags == 0)
kindex[invalid] = 0 # Might not be necessary, but safer
return kindex

def nones_index_generator():
return library.array([-1], dtype=numpy.int64)

if virtual:
tag_virtual_array = VirtualNDArray(
Numpy.instance(),
shape=(length,),
dtype=numpy.int8,
generator=tag_generator,
)
container_dict[f"{key}-tags"] = tag_virtual_array
index_virtual_array = VirtualNDArray(
Numpy.instance(),
shape=(length,),
dtype=numpy.int64,
generator=index_generator,
)
container_dict[f"{key}-index"] = index_virtual_array
nones_index_virtual_array = VirtualNDArray(
Numpy.instance(),
shape=(1,),
dtype=numpy.int64,
generator=nones_index_generator,
)
container_dict["nones-index"] = nones_index_virtual_array
else:
container_dict[f"{key}-tags"] = tag_generator()
container_dict[f"{key}-index"] = index_generator()
container_dict["nones-index"] = nones_index_generator()
else:
# don't distinguish data and offsets
container_dict[f"{key}-data"] = content
container_dict[f"{key}-offsets"] = content
if virtual:
virtual_array = VirtualNDArray(
Numpy.instance(), shape=(length,), dtype=dtype, generator=raw_generator
)
container_dict[f"{key}-data"] = virtual_array
container_dict[f"{key}-offsets"] = virtual_array
else:
container_dict[f"{key}-data"] = content
container_dict[f"{key}-offsets"] = content
8 changes: 4 additions & 4 deletions src/uproot/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,8 @@
0x0B: "float16",
0x0C: "float32",
0x0D: "float64",
0x0E: "uint32", # Index32
0x0F: "uint64", # Index64
0x0E: "int32", # Index32
0x0F: "int64", # Index64
0x10: "switch", # Switch: (uint64, uint32)
0x11: "int16", # SplitInt16: split + zigzag encoding
0x12: "uint16", # SplitUInt16: split encoding
Expand All @@ -150,8 +150,8 @@
0x17: "float16", # SplitReal16: split encoding
0x18: "float32", # SplitReal32: split encoding
0x19: "float64", # SplitReal64: split encoding
0x1A: "uint32", # SplitIndex32: split + delta encoding
0x1B: "uint64", # SplitIndex64: split + delta encoding
0x1A: "int32", # SplitIndex32: split + delta encoding
0x1B: "int64", # SplitIndex64: split + delta encoding
0x1C: "real32trunc", # Real32Trunc: float32 with truncated mantissa
0x1D: "real32quant", # Real32Quant: float32 with quantized integer representation
}
Expand Down
16 changes: 9 additions & 7 deletions src/uproot/models/RNTuple.py
Original file line number Diff line number Diff line change
Expand Up @@ -650,7 +650,7 @@ def read_page(
]
self.deserialize_page_decompressed_buffer(destination, field_metadata)

def _expected_array_length_and_starts(
def _expected_array_length_starts_dtype(
self, col_idx, cluster_start, cluster_stop, missing_element_padding=0
):
"""
Expand All @@ -660,7 +660,7 @@ def _expected_array_length_and_starts(
cluster_stop (int): The first cluster to exclude (i.e. one greater than the last cluster to include).
missing_element_padding (int): Number of padding elements to add at the start of the array.

Returns the expected length of the array over the given cluster range, including padding, and also the start indices of each cluster.
Returns the expected length of the array over the given cluster range (including padding), the start indices of each cluster, and the dtype of the array.
"""
field_metadata = self.get_field_metadata(col_idx)
if field_metadata.dtype_byte in uproot.const.rntuple_index_types:
Expand Down Expand Up @@ -690,7 +690,7 @@ def _expected_array_length_and_starts(
starts.append(total_length)
total_length += cluster_length

return total_length, starts
return total_length, starts, field_metadata.dtype_result

def read_cluster_range(
self,
Expand All @@ -711,10 +711,10 @@ def read_cluster_range(
Returns a numpy array with the data from the column.
"""
field_metadata = self.get_field_metadata(col_idx)
total_length, starts = self._expected_array_length_and_starts(
total_length, starts, dtype = self._expected_array_length_starts_dtype(
col_idx, cluster_start, cluster_stop, missing_element_padding
)
res = numpy.empty(total_length, field_metadata.dtype_result)
res = numpy.empty(total_length, dtype)
# Initialize the padding elements. Note that it might be different from missing_element_padding
# because for offsets there is an extra zero added at the start.
assert len(starts) > 0, "The cluster range is invalid"
Expand All @@ -726,7 +726,7 @@ def read_cluster_range(
cluster_idx,
col_idx,
field_metadata,
destination=res[starts[i] : stop],
destination=res[starts[i] : stop].view(field_metadata.dtype),
array_cache=array_cache,
)

Expand Down Expand Up @@ -964,7 +964,7 @@ def gpu_deserialize_decompressed_content(
n_padding = self.column_records[key_nr].first_element_index
n_padding -= cluster_starts[start_cluster_idx]
n_padding = max(n_padding, 0)
total_length, starts = self._expected_array_length_and_starts(
total_length, starts, _ = self._expected_array_length_starts_dtype(
ncol, start_cluster_idx, stop_cluster_idx, n_padding
)
field_metadata = self.get_field_metadata(ncol)
Expand Down Expand Up @@ -1169,6 +1169,8 @@ def get_field_metadata(self, ncol):
"std::string"
):
dtype_result = dtype
elif dtype_byte in uproot.const.rntuple_custom_float_types:
dtype_result = numpy.float32
else:
dtype_result = numpy.result_type(*alt_dtype_list)
field_metadata = FieldClusterMetadata(
Expand Down
Loading