Skip to content

Commit f392613

Browse files
committed
Some progress with virtual arrays
1 parent a0d52f0 commit f392613

File tree

3 files changed

+106
-29
lines changed

3 files changed

+106
-29
lines changed

src/uproot/behaviors/RNTuple.py

Lines changed: 97 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import sys
1616
import warnings
1717
from collections.abc import Mapping
18+
from functools import partial
1819

1920
import numpy
2021

@@ -622,6 +623,7 @@ def arrays(
622623
interpreter="cpu",
623624
ak_add_doc=False,
624625
how=None,
626+
virtual=True,
625627
# For compatibility reasons we also accept kwargs meant for TTrees
626628
interpretation_executor=None,
627629
filter_branch=unset,
@@ -677,6 +679,7 @@ def arrays(
677679
``list``, and ``dict``. Note that the container *type itself*
678680
must be passed as ``how``, not an instance of that type (i.e.
679681
``how=tuple``, not ``how=()``).
682+
virtual (bool): If True, return virtual Awkward arrays, meaning that the data will not be loaded into memory until it is accessed.
680683
interpretation_executor (None): This argument is not used and is only included for now
681684
for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used
682685
and will be removed in a future version.
@@ -759,22 +762,40 @@ def arrays(
759762
n_padding = self.ntuple.column_records[key_nr].first_element_index
760763
n_padding -= cluster_starts[start_cluster_idx]
761764
n_padding = max(n_padding, 0)
765+
dtype = None
762766
if interpreter == "cpu":
763-
content = self.ntuple.read_cluster_range(
767+
content_generator = partial(
768+
self.ntuple.read_cluster_range,
764769
key_nr,
765770
start_cluster_idx,
766771
stop_cluster_idx,
767772
missing_element_padding=n_padding,
768773
array_cache=array_cache,
769774
)
775+
if virtual:
776+
total_length, _, dtype = (
777+
self.ntuple._expected_array_length_starts_dtype(
778+
key_nr,
779+
start_cluster_idx,
780+
stop_cluster_idx,
781+
missing_element_padding=n_padding,
782+
)
783+
)
784+
if "cardinality" in key:
785+
total_length -= 1
786+
content = (total_length, content_generator)
787+
else:
788+
content = content_generator()
770789
elif interpreter == "gpu" and backend == "cuda":
771790
content = content_dict[key_nr]
772791
elif interpreter == "gpu":
773792
raise NotImplementedError(
774793
f"Backend {backend} GDS support not implemented."
775794
)
795+
else:
796+
raise NotImplementedError(f"Backend {backend} not implemented.")
776797
dtype_byte = self.ntuple.column_records[key_nr].type
777-
_fill_container_dict(container_dict, content, key, dtype_byte)
798+
_fill_container_dict(container_dict, content, key, dtype_byte, dtype)
778799

779800
cluster_offset = cluster_starts[start_cluster_idx]
780801
entry_start -= cluster_offset
@@ -1779,27 +1800,75 @@ def _cupy_insert(arr, obj, value):
17791800
return out
17801801

17811802

1782-
def _fill_container_dict(container_dict, content, key, dtype_byte):
1783-
array_library_string = uproot._util.get_array_library(content)
1803+
def _fill_container_dict(container_dict, content, key, dtype_byte, dtype):
1804+
from awkward._nplikes.numpy import Numpy
1805+
from awkward._nplikes.virtual import VirtualNDArray
1806+
1807+
if type(content) == tuple:
1808+
# Virtual arrays not yet implemented for GPU
1809+
array_library_string = "numpy"
1810+
virtual = True
1811+
length = int(content[0])
1812+
raw_generator = content[1]
1813+
else:
1814+
virtual = False
1815+
array_library_string = uproot._util.get_array_library(content)
17841816

17851817
library = numpy if array_library_string == "numpy" else uproot.extras.cupy()
17861818

17871819
if "cardinality" in key:
1788-
content = library.diff(content)
1789-
1790-
if "optional" in key:
1791-
# We need to convert from a ListOffsetArray to an IndexedOptionArray
1792-
diff = library.diff(content)
1793-
missing = library.nonzero(diff == 0)[0]
1794-
missing -= library.arange(len(missing), dtype=missing.dtype)
1795-
dtype = "int64" if content.dtype == library.uint64 else "int32"
1796-
indices = library.arange(len(content) - len(missing), dtype=dtype)
1797-
if array_library_string == "numpy":
1798-
indices = numpy.insert(indices, missing, -1)
1820+
if virtual:
1821+
1822+
def generator():
1823+
materialized = raw_generator()
1824+
materialized = library.diff(materialized)
1825+
return materialized
1826+
1827+
virtual_array = VirtualNDArray(
1828+
Numpy.instance(), shape=(length,), dtype=dtype, generator=generator
1829+
)
1830+
container_dict[f"{key}-data"] = generator
17991831
else:
1800-
indices = _cupy_insert(indices, missing, -1)
1801-
container_dict[f"{key}-index"] = indices
1832+
content = library.diff(content)
1833+
container_dict[f"{key}-data"] = content
1834+
elif "optional" in key:
1835+
if virtual:
1836+
1837+
def generator():
1838+
# We need to convert from a ListOffsetArray to an IndexedOptionArray
1839+
materialized = raw_generator()
1840+
diff = library.diff(materialized)
1841+
missing = library.nonzero(diff == 0)[0]
1842+
missing -= library.arange(len(missing), dtype=missing.dtype)
1843+
dtype = "int64" if materialized.dtype == library.uint64 else "int32"
1844+
indices = library.arange(len(materialized) - len(missing), dtype=dtype)
1845+
if array_library_string == "numpy":
1846+
indices = numpy.insert(indices, missing, -1)
1847+
else:
1848+
indices = _cupy_insert(indices, missing, -1)
1849+
return indices
1850+
1851+
virtual_array = VirtualNDArray(
1852+
Numpy.instance(), shape=(length,), dtype=dtype, generator=generator
1853+
)
1854+
container_dict[f"{key}-index"] = generator
1855+
else:
1856+
# We need to convert from a ListOffsetArray to an IndexedOptionArray
1857+
diff = library.diff(content)
1858+
missing = library.nonzero(diff == 0)[0]
1859+
missing -= library.arange(len(missing), dtype=missing.dtype)
1860+
dtype = "int64" if content.dtype == library.uint64 else "int32"
1861+
indices = library.arange(len(content) - len(missing), dtype=dtype)
1862+
if array_library_string == "numpy":
1863+
indices = numpy.insert(indices, missing, -1)
1864+
else:
1865+
indices = _cupy_insert(indices, missing, -1)
1866+
container_dict[f"{key}-index"] = indices
18021867
elif dtype_byte == uproot.const.rntuple_col_type_to_num_dict["switch"]:
1868+
if virtual:
1869+
# TODO: Figure out how to handle this one
1870+
content = raw_generator()
1871+
print(f"{length} {len(content)}")
18031872
kindex, tags = uproot.models.RNTuple._split_switch_bits(content)
18041873
# Find invalid variants and adjust buffers accordingly
18051874
invalid = numpy.flatnonzero(tags == -1)
@@ -1816,6 +1885,14 @@ def _fill_container_dict(container_dict, content, key, dtype_byte):
18161885
container_dict[f"{key}-union-index"] = library.array(kindex)
18171886
container_dict[f"{key}-union-tags"] = library.array(tags)
18181887
else:
1819-
# don't distinguish data and offsets
1820-
container_dict[f"{key}-data"] = content
1821-
container_dict[f"{key}-offsets"] = content
1888+
if virtual:
1889+
virtual_array = VirtualNDArray(
1890+
Numpy.instance(), shape=(length,), dtype=dtype, generator=raw_generator
1891+
)
1892+
# don't distinguish data and offsets
1893+
container_dict[f"{key}-data"] = raw_generator
1894+
container_dict[f"{key}-offsets"] = raw_generator
1895+
else:
1896+
# don't distinguish data and offsets
1897+
container_dict[f"{key}-data"] = content
1898+
container_dict[f"{key}-offsets"] = content

src/uproot/const.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,8 @@
138138
0x0B: "float16",
139139
0x0C: "float32",
140140
0x0D: "float64",
141-
0x0E: "uint32", # Index32
142-
0x0F: "uint64", # Index64
141+
0x0E: "int32", # Index32
142+
0x0F: "int64", # Index64
143143
0x10: "switch", # Switch: (uint64, uint32)
144144
0x11: "int16", # SplitInt16: split + zigzag encoding
145145
0x12: "uint16", # SplitUInt16: split encoding
@@ -150,8 +150,8 @@
150150
0x17: "float16", # SplitReal16: split encoding
151151
0x18: "float32", # SplitReal32: split encoding
152152
0x19: "float64", # SplitReal64: split encoding
153-
0x1A: "uint32", # SplitIndex32: split + delta encoding
154-
0x1B: "uint64", # SplitIndex64: split + delta encoding
153+
0x1A: "int32", # SplitIndex32: split + delta encoding
154+
0x1B: "int64", # SplitIndex64: split + delta encoding
155155
0x1C: "real32trunc", # Real32Trunc: float32 with truncated mantissa
156156
0x1D: "real32quant", # Real32Quant: float32 with quantized integer representation
157157
}

src/uproot/models/RNTuple.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -659,7 +659,7 @@ def read_page(
659659
if array_cache is not None:
660660
array_cache[key] = destination.copy()
661661

662-
def _expected_array_length_and_starts(
662+
def _expected_array_length_starts_dtype(
663663
self, col_idx, cluster_start, cluster_stop, missing_element_padding=0
664664
):
665665
"""
@@ -669,7 +669,7 @@ def _expected_array_length_and_starts(
669669
cluster_stop (int): The first cluster to exclude (i.e. one greater than the last cluster to include).
670670
missing_element_padding (int): Number of padding elements to add at the start of the array.
671671
672-
Returns the expected length of the array over the given cluster range, including padding, and also the start indices of each cluster.
672+
Returns the expected length of the array over the given cluster range (including padding), the start indices of each cluster, and the dtype of the array.
673673
"""
674674
field_metadata = self.get_field_metadata(col_idx)
675675
if field_metadata.dtype_byte in uproot.const.rntuple_index_types:
@@ -699,7 +699,7 @@ def _expected_array_length_and_starts(
699699
starts.append(total_length)
700700
total_length += cluster_length
701701

702-
return total_length, starts
702+
return total_length, starts, field_metadata.dtype_result
703703

704704
def read_cluster_range(
705705
self,
@@ -720,7 +720,7 @@ def read_cluster_range(
720720
Returns a numpy array with the data from the column.
721721
"""
722722
field_metadata = self.get_field_metadata(col_idx)
723-
total_length, starts = self._expected_array_length_and_starts(
723+
total_length, starts, _ = self._expected_array_length_starts_dtype(
724724
col_idx, cluster_start, cluster_stop, missing_element_padding
725725
)
726726
res = numpy.empty(total_length, field_metadata.dtype_result)
@@ -958,7 +958,7 @@ def gpu_deserialize_decompressed_content(
958958
n_padding = self.column_records[key_nr].first_element_index
959959
n_padding -= cluster_starts[start_cluster_idx]
960960
n_padding = max(n_padding, 0)
961-
total_length, starts = self._expected_array_length_and_starts(
961+
total_length, starts, _ = self._expected_array_length_starts_dtype(
962962
ncol, start_cluster_idx, stop_cluster_idx, n_padding
963963
)
964964
field_metadata = self.get_field_metadata(ncol)

0 commit comments

Comments
 (0)