1515import sys
1616import warnings
1717from collections .abc import Mapping
18+ from functools import partial
1819
1920import numpy
2021
@@ -622,6 +623,7 @@ def arrays(
622623 interpreter = "cpu" ,
623624 ak_add_doc = False ,
624625 how = None ,
626+ virtual = True ,
625627 # For compatibility reasons we also accepts kwargs meant for TTrees
626628 interpretation_executor = None ,
627629 filter_branch = unset ,
@@ -677,6 +679,7 @@ def arrays(
677679 ``list``, and ``dict``. Note that the container *type itself*
678680 must be passed as ``how``, not an instance of that type (i.e.
679681 ``how=tuple``, not ``how=()``).
682+ virtual (bool): If True, return virtual Awkward arrays, meaning that the data will not be loaded into memory until it is accessed.
680683 interpretation_executor (None): This argument is not used and is only included for now
681684 for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used
682685 and will be removed in a future version.
@@ -759,22 +762,40 @@ def arrays(
759762 n_padding = self .ntuple .column_records [key_nr ].first_element_index
760763 n_padding -= cluster_starts [start_cluster_idx ]
761764 n_padding = max (n_padding , 0 )
765+ dtype = None
762766 if interpreter == "cpu" :
763- content = self .ntuple .read_cluster_range (
767+ content_generator = partial (
768+ self .ntuple .read_cluster_range ,
764769 key_nr ,
765770 start_cluster_idx ,
766771 stop_cluster_idx ,
767772 missing_element_padding = n_padding ,
768773 array_cache = array_cache ,
769774 )
775+ if virtual :
776+ total_length , _ , dtype = (
777+ self .ntuple ._expected_array_length_starts_dtype (
778+ key_nr ,
779+ start_cluster_idx ,
780+ stop_cluster_idx ,
781+ missing_element_padding = n_padding ,
782+ )
783+ )
784+ if "cardinality" in key :
785+ total_length -= 1
786+ content = (total_length , content_generator )
787+ else :
788+ content = content_generator ()
770789 elif interpreter == "gpu" and backend == "cuda" :
771790 content = content_dict [key_nr ]
772791 elif interpreter == "gpu" :
773792 raise NotImplementedError (
774793 f"Backend { backend } GDS support not implemented."
775794 )
795+ else :
796+ raise NotImplementedError (f"Backend { backend } not implemented." )
776797 dtype_byte = self .ntuple .column_records [key_nr ].type
777- _fill_container_dict (container_dict , content , key , dtype_byte )
798+ _fill_container_dict (container_dict , content , key , dtype_byte , dtype )
778799
779800 cluster_offset = cluster_starts [start_cluster_idx ]
780801 entry_start -= cluster_offset
@@ -1779,27 +1800,75 @@ def _cupy_insert(arr, obj, value):
17791800 return out
17801801
17811802
1782- def _fill_container_dict (container_dict , content , key , dtype_byte ):
1783- array_library_string = uproot ._util .get_array_library (content )
1803+ def _fill_container_dict (container_dict , content , key , dtype_byte , dtype ):
1804+ from awkward ._nplikes .numpy import Numpy
1805+ from awkward ._nplikes .virtual import VirtualNDArray
1806+
1807+ if type (content ) == tuple :
1808+ # Virtual arrays not yet implemented for GPU
1809+ array_library_string = "numpy"
1810+ virtual = True
1811+ length = int (content [0 ])
1812+ raw_generator = content [1 ]
1813+ else :
1814+ virtual = False
1815+ array_library_string = uproot ._util .get_array_library (content )
17841816
17851817 library = numpy if array_library_string == "numpy" else uproot .extras .cupy ()
17861818
17871819 if "cardinality" in key :
1788- content = library . diff ( content )
1789-
1790- if "optional" in key :
1791- # We need to convert from a ListOffsetArray to an IndexedOptionArray
1792- diff = library .diff (content )
1793- missing = library . nonzero ( diff == 0 )[ 0 ]
1794- missing -= library . arange ( len ( missing ), dtype = missing . dtype )
1795- dtype = "int64" if content . dtype == library . uint64 else "int32"
1796- indices = library . arange ( len ( content ) - len ( missing ), dtype = dtype )
1797- if array_library_string == "numpy" :
1798- indices = numpy . insert ( indices , missing , - 1 )
1820+ if virtual :
1821+
1822+ def generator () :
1823+ materialized = raw_generator ()
1824+ materialized = library .diff (materialized )
1825+ return materialized
1826+
1827+ virtual_array = VirtualNDArray (
1828+ Numpy . instance (), shape = ( length , ), dtype = dtype , generator = generator
1829+ )
1830+ container_dict [ f" { key } -data" ] = generator
17991831 else :
1800- indices = _cupy_insert (indices , missing , - 1 )
1801- container_dict [f"{ key } -index" ] = indices
1832+ content = library .diff (content )
1833+ container_dict [f"{ key } -data" ] = content
1834+ elif "optional" in key :
1835+ if virtual :
1836+
1837+ def generator ():
1838+ # We need to convert from a ListOffsetArray to an IndexedOptionArray
1839+ materialized = raw_generator ()
1840+ diff = library .diff (materialized )
1841+ missing = library .nonzero (diff == 0 )[0 ]
1842+ missing -= library .arange (len (missing ), dtype = missing .dtype )
1843+ dtype = "int64" if materialized .dtype == library .uint64 else "int32"
1844+ indices = library .arange (len (materialized ) - len (missing ), dtype = dtype )
1845+ if array_library_string == "numpy" :
1846+ indices = numpy .insert (indices , missing , - 1 )
1847+ else :
1848+ indices = _cupy_insert (indices , missing , - 1 )
1849+ return indices
1850+
1851+ virtual_array = VirtualNDArray (
1852+ Numpy .instance (), shape = (length ,), dtype = dtype , generator = generator
1853+ )
1854+ container_dict [f"{ key } -index" ] = generator
1855+ else :
1856+ # We need to convert from a ListOffsetArray to an IndexedOptionArray
1857+ diff = library .diff (content )
1858+ missing = library .nonzero (diff == 0 )[0 ]
1859+ missing -= library .arange (len (missing ), dtype = missing .dtype )
1860+ dtype = "int64" if content .dtype == library .uint64 else "int32"
1861+ indices = library .arange (len (content ) - len (missing ), dtype = dtype )
1862+ if array_library_string == "numpy" :
1863+ indices = numpy .insert (indices , missing , - 1 )
1864+ else :
1865+ indices = _cupy_insert (indices , missing , - 1 )
1866+ container_dict [f"{ key } -index" ] = indices
18021867 elif dtype_byte == uproot .const .rntuple_col_type_to_num_dict ["switch" ]:
1868+ if virtual :
1869+ # TODO: Figure out how to handle this one
1870+ content = raw_generator ()
1871+ print (f"{ length } { len (content )} " )
18031872 kindex , tags = uproot .models .RNTuple ._split_switch_bits (content )
18041873 # Find invalid variants and adjust buffers accordingly
18051874 invalid = numpy .flatnonzero (tags == - 1 )
@@ -1816,6 +1885,14 @@ def _fill_container_dict(container_dict, content, key, dtype_byte):
18161885 container_dict [f"{ key } -union-index" ] = library .array (kindex )
18171886 container_dict [f"{ key } -union-tags" ] = library .array (tags )
18181887 else :
1819- # don't distinguish data and offsets
1820- container_dict [f"{ key } -data" ] = content
1821- container_dict [f"{ key } -offsets" ] = content
1888+ if virtual :
1889+ virtual_array = VirtualNDArray (
1890+ Numpy .instance (), shape = (length ,), dtype = dtype , generator = raw_generator
1891+ )
1892+ # don't distinguish data and offsets
1893+ container_dict [f"{ key } -data" ] = raw_generator
1894+ container_dict [f"{ key } -offsets" ] = raw_generator
1895+ else :
1896+ # don't distinguish data and offsets
1897+ container_dict [f"{ key } -data" ] = content
1898+ container_dict [f"{ key } -offsets" ] = content
0 commit comments