Skip to content

Commit ee230cc

Browse files
authored
Merge pull request #17 from maarten-ic/feature/lazy-load-from-netcdf
Implement implicit conversion, lazy loading and some performance improvements for netCDF backend
2 parents a8a86d4 + 62506e0 commit ee230cc

File tree

11 files changed

+302
-74
lines changed

11 files changed

+302
-74
lines changed

docs/source/netcdf.rst

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ IMAS netCDF files
1111

1212
IMAS-Python supports reading IDSs from and writing IDSs to IMAS netCDF files. This
1313
feature is currently in alpha status, and its functionality may change in
14-
upcoming minor releases of IMAS-Python.
14+
upcoming (minor) releases of IMAS-Python.
1515

1616
A detailed description of the IMAS netCDF format and conventions can be found on
1717
the :ref:`IMAS conventions for the netCDF data format` page.
@@ -42,6 +42,34 @@ will be used for :py:meth:`~imas.db_entry.DBEntry.get` and
4242
imas.util.print_tree(cp2)
4343
4444
45+
Implemented features of a netCDF ``DBEntry``
46+
--------------------------------------------
47+
48+
A netCDF ``DBEntry`` doesn't implement all features that are supported by
49+
``imas_core``. The following table provides an overview of the implemented
50+
features that are supported by DBEntries using ``imas_core`` and
51+
``netCDF``:
52+
53+
.. list-table::
54+
:header-rows: 1
55+
56+
* - Feature
57+
- ``imas_core``
58+
- ``netCDF``
59+
* - :ref:`Lazy loading`
60+
- Yes
61+
- Yes
62+
* - :ref:`Automatic conversion between DD versions <Conversion of IDSs between DD versions>`
63+
- When reading and writing
64+
- When reading
65+
* - ``get_slice`` / ``put_slice``
66+
- Yes
67+
- Not implemented
68+
* - ``get_sample``
69+
- Yes (requires ``imas_core >= 5.4.0``)
70+
- Not implemented
71+
72+
4573
Using IMAS netCDF files with 3rd-party tools
4674
--------------------------------------------
4775

imas/backends/imas_core/al_context.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
import numpy
1212

13+
import imas
1314
from imas.backends.imas_core.imas_interface import ll_interface
1415
from imas.exception import LowlevelError
1516
from imas.ids_defs import (
@@ -280,6 +281,9 @@ def __init__(
280281
self.context = None
281282
"""Potential weak reference to opened context."""
282283

284+
def get_child(self, child):
285+
imas.backends.imas_core.db_entry_helpers._get_child(child, self)
286+
283287
def get_context(self) -> ALContext:
284288
"""Create and yield the actual ALContext."""
285289
if self.dbentry._db_ctx is not self.dbentry_ctx:

imas/backends/imas_core/db_entry_helpers.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def get_children(
2222
structure: IDSStructure,
2323
ctx: ALContext,
2424
time_mode: int,
25-
nbc_map: Optional[NBCPathMap],
25+
nbc_map: Optional["NBCPathMap"],
2626
) -> None:
2727
"""Recursively get all children of an IDSStructure."""
2828
# NOTE: changes in this method must be propagated to _get_child and vice versa
@@ -77,15 +77,11 @@ def get_children(
7777
getattr(structure, name)._IDSPrimitive__value = data
7878

7979

80-
def _get_child(child: IDSBase, ctx: Optional[LazyALContext]):
80+
def _get_child(child: IDSBase, ctx: LazyALContext):
8181
"""Get a single child when required (lazy loading)."""
8282
# NOTE: changes in this method must be propagated to _get_children and vice versa
8383
# Performance: this method is specialized for the lazy get
8484

85-
# ctx can be None when the parent structure does not exist in the on-disk DD version
86-
if ctx is None:
87-
return # There is no data to be loaded
88-
8985
time_mode = ctx.time_mode
9086
if time_mode == IDS_TIME_MODE_INDEPENDENT and child.metadata.type.is_dynamic:
9187
return # skip dynamic (time-dependent) nodes
@@ -148,7 +144,7 @@ def put_children(
148144
ctx: ALContext,
149145
time_mode: int,
150146
is_slice: bool,
151-
nbc_map: Optional[NBCPathMap],
147+
nbc_map: Optional["NBCPathMap"],
152148
verify_maxoccur: bool,
153149
) -> None:
154150
"""Recursively put all children of an IDSStructure"""

imas/backends/imas_core/imas_interface.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
imasdef = None
3333
lowlevel = None
3434
logger.critical(
35-
"Could not import 'al_core': %s. Some functionality is not available.",
35+
"Could not import 'imas_core': %s. Some functionality is not available.",
3636
exc,
3737
)
3838

imas/backends/netcdf/db_entry_nc.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from imas.backends.netcdf.ids2nc import IDS2NC
1212
from imas.backends.netcdf.nc2ids import NC2IDS
1313
from imas.exception import DataEntryException, InvalidNetCDFEntry
14-
from imas.ids_convert import NBCPathMap, convert_ids
14+
from imas.ids_convert import NBCPathMap, dd_version_map_from_factories
1515
from imas.ids_factory import IDSFactory
1616
from imas.ids_toplevel import IDSToplevel
1717

@@ -108,10 +108,6 @@ def get(
108108
else:
109109
func = "get_sample"
110110
raise NotImplementedError(f"`{func}` is not available for netCDF files.")
111-
if lazy:
112-
raise NotImplementedError(
113-
"Lazy loading is not implemented for netCDF files."
114-
)
115111

116112
# Check if the IDS/occurrence exists, and obtain the group it is stored in
117113
try:
@@ -123,14 +119,19 @@ def get(
123119

124120
# Load data into the destination IDS
125121
if self._ds_factory.dd_version == destination._dd_version:
126-
NC2IDS(group, destination).run()
122+
NC2IDS(group, destination, destination.metadata, None).run(lazy)
127123
else:
128-
# FIXME: implement automatic conversion using nbc_map
129-
# As a work-around: do an explicit conversion, but automatic conversion
130-
# will also be needed to implement lazy loading.
131-
ids = self._ds_factory.new(ids_name)
132-
NC2IDS(group, ids).run()
133-
convert_ids(ids, None, target=destination)
124+
# Construct relevant NBCPathMap, the one we get from DBEntry has the reverse
125+
# mapping from what we need. The imas_core logic does the mapping from
126+
# in-memory to on-disk, while we take what is on-disk and map it to
127+
# in-memory.
128+
ddmap, source_is_older = dd_version_map_from_factories(
129+
ids_name, self._ds_factory, self._factory
130+
)
131+
nbc_map = ddmap.old_to_new if source_is_older else ddmap.new_to_old
132+
NC2IDS(
133+
group, destination, self._ds_factory.new(ids_name).metadata, nbc_map
134+
).run(lazy)
134135

135136
return destination
136137

imas/backends/netcdf/nc2ids.py

Lines changed: 134 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@
33
from typing import Iterator, List, Optional, Tuple
44

55
import netCDF4
6+
import numpy as np
67

78
from imas.backends.netcdf import ids2nc
89
from imas.backends.netcdf.nc_metadata import NCMetadata
910
from imas.exception import InvalidNetCDFEntry
1011
from imas.ids_base import IDSBase
12+
from imas.ids_convert import NBCPathMap
1113
from imas.ids_data_type import IDSDataType
1214
from imas.ids_defs import IDS_TIME_MODE_HOMOGENEOUS
1315
from imas.ids_metadata import IDSMetadata
@@ -70,22 +72,37 @@ def _tree_iter(
7072
class NC2IDS:
7173
"""Class responsible for reading an IDS from a NetCDF group."""
7274

73-
def __init__(self, group: netCDF4.Group, ids: IDSToplevel) -> None:
75+
def __init__(
76+
self,
77+
group: netCDF4.Group,
78+
ids: IDSToplevel,
79+
ids_metadata: IDSMetadata,
80+
nbc_map: Optional[NBCPathMap],
81+
) -> None:
7482
"""Initialize NC2IDS converter.
7583
7684
Args:
7785
group: NetCDF group that stores the IDS data.
7886
ids: Corresponding IDS toplevel to store the data in.
87+
ids_metadata: Metadata corresponding to the DD version that the data is
88+
stored in.
89+
nbc_map: Path map for implicit DD conversions.
7990
"""
8091
self.group = group
8192
"""NetCDF Group that the IDS is stored in."""
8293
self.ids = ids
8394
"""IDS to store the data in."""
95+
self.ids_metadata = ids_metadata
96+
"""Metadata of the IDS in the DD version that the data is stored in"""
97+
self.nbc_map = nbc_map
98+
"""Path map for implicit DD conversions."""
8499

85-
self.ncmeta = NCMetadata(ids.metadata)
100+
self.ncmeta = NCMetadata(ids_metadata)
86101
"""NetCDF related metadata."""
87102
self.variables = list(group.variables)
88103
"""List of variable names stored in the netCDF group."""
104+
105+
self._lazy_map = {}
89106
# Don't use masked arrays: they're slow and we'll handle most of the unset
90107
# values through the `:shape` arrays
91108
self.group.set_auto_mask(False)
@@ -99,31 +116,60 @@ def __init__(self, group: netCDF4.Group, ids: IDSToplevel) -> None:
99116
"Mandatory variable `ids_properties.homogeneous_time` does not exist."
100117
)
101118
var = group["ids_properties.homogeneous_time"]
102-
self._validate_variable(var, ids.ids_properties.homogeneous_time.metadata)
119+
self._validate_variable(var, ids.metadata["ids_properties/homogeneous_time"])
103120
if var[()] not in [0, 1, 2]:
104121
raise InvalidNetCDFEntry(
105122
f"Invalid value for ids_properties.homogeneous_time: {var[()]}. "
106123
"Was expecting: 0, 1 or 2."
107124
)
108125
self.homogeneous_time = var[()] == IDS_TIME_MODE_HOMOGENEOUS
109126

110-
def run(self) -> None:
127+
def run(self, lazy: bool) -> None:
111128
"""Load the data from the netCDF group into the IDS."""
112129
self.variables.sort()
113130
self.validate_variables()
131+
if lazy:
132+
self.ids._set_lazy_context(LazyContext(self))
114133
for var_name in self.variables:
115134
if var_name.endswith(":shape"):
116135
continue
117-
metadata = self.ids.metadata[var_name]
136+
metadata = self.ids_metadata[var_name]
118137

119138
if metadata.data_type is IDSDataType.STRUCTURE:
120139
continue # This only contains DD metadata we already know
121140

141+
# Handle implicit DD version conversion
142+
if self.nbc_map is None:
143+
target_metadata = metadata # no conversion
144+
elif metadata.path_string in self.nbc_map:
145+
new_path = self.nbc_map.path[metadata.path_string]
146+
if new_path is None:
147+
logging.info(
148+
"Not loading data for %s: no equivalent data structure exists "
149+
"in the target Data Dictionary version.",
150+
metadata.path_string,
151+
)
152+
continue
153+
target_metadata = self.ids.metadata[new_path]
154+
elif metadata.path_string in self.nbc_map.type_change:
155+
logging.info(
156+
"Not loading data for %s: cannot handle type changes when "
157+
"implicitly converting data to the target Data Dictionary version.",
158+
metadata.path_string,
159+
)
160+
continue
161+
else:
162+
target_metadata = metadata # no conversion required
163+
122164
var = self.group[var_name]
165+
if lazy:
166+
self._lazy_map[target_metadata.path_string] = var
167+
continue
168+
123169
if metadata.data_type is IDSDataType.STRUCT_ARRAY:
124170
if "sparse" in var.ncattrs():
125171
shapes = self.group[var_name + ":shape"][()]
126-
for index, node in tree_iter(self.ids, metadata):
172+
for index, node in tree_iter(self.ids, target_metadata):
127173
node.resize(shapes[index][0])
128174

129175
else:
@@ -132,7 +178,7 @@ def run(self) -> None:
132178
metadata.path_string, self.homogeneous_time
133179
)[-1]
134180
size = self.group.dimensions[dim].size
135-
for _, node in tree_iter(self.ids, metadata):
181+
for _, node in tree_iter(self.ids, target_metadata):
136182
node.resize(size)
137183

138184
continue
@@ -144,23 +190,30 @@ def run(self) -> None:
144190
if "sparse" in var.ncattrs():
145191
if metadata.ndim:
146192
shapes = self.group[var_name + ":shape"][()]
147-
for index, node in tree_iter(self.ids, metadata):
193+
for index, node in tree_iter(self.ids, target_metadata):
148194
shape = shapes[index]
149195
if shape.all():
150-
node.value = data[index + tuple(map(slice, shapes[index]))]
196+
# NOTE: bypassing IDSPrimitive.value.setter logic
197+
node._IDSPrimitive__value = data[
198+
index + tuple(map(slice, shape))
199+
]
151200
else:
152-
for index, node in tree_iter(self.ids, metadata):
201+
for index, node in tree_iter(self.ids, target_metadata):
153202
value = data[index]
154203
if value != getattr(var, "_FillValue", None):
155-
node.value = data[index]
204+
# NOTE: bypassing IDSPrimitive.value.setter logic
205+
node._IDSPrimitive__value = value
156206

157207
elif metadata.path_string not in self.ncmeta.aos:
158208
# Shortcut for assigning untensorized data
159-
self.ids[metadata.path] = data
209+
# Note: var[()] can return 0D numpy arrays. Instead of handling this
210+
# here, we'll let IDSPrimitive.value.setter take care of it:
211+
self.ids[target_metadata.path].value = data
160212

161213
else:
162-
for index, node in tree_iter(self.ids, metadata):
163-
node.value = data[index]
214+
for index, node in tree_iter(self.ids, target_metadata):
215+
# NOTE: bypassing IDSPrimitive.value.setter logic
216+
node._IDSPrimitive__value = data[index]
164217

165218
def validate_variables(self) -> None:
166219
"""Validate that all variables in the netCDF Group exist and match the DD."""
@@ -194,7 +247,7 @@ def validate_variables(self) -> None:
194247
# Check that the DD defines this variable, and validate its metadata
195248
var = self.group[var_name]
196249
try:
197-
metadata = self.ids.metadata[var_name]
250+
metadata = self.ids_metadata[var_name]
198251
except KeyError:
199252
raise InvalidNetCDFEntry(
200253
f"Invalid variable {var_name}: no such variable exists in the "
@@ -300,3 +353,69 @@ def _validate_sparsity(
300353
raise variable_error(
301354
shape_var, "dtype", shape_var.dtype, "any integer type"
302355
)
356+
357+
358+
class LazyContext:
359+
def __init__(self, nc2ids, index=()):
360+
self.nc2ids = nc2ids
361+
self.index = index
362+
363+
def get_child(self, child):
364+
metadata = child.metadata
365+
path = metadata.path_string
366+
data_type = metadata.data_type
367+
nc2ids = self.nc2ids
368+
var = nc2ids._lazy_map.get(path)
369+
370+
if data_type is IDSDataType.STRUCT_ARRAY:
371+
# Determine size of the aos
372+
if var is None:
373+
size = 0
374+
elif "sparse" in var.ncattrs():
375+
size = nc2ids.group[var.name + ":shape"][self.index][0]
376+
else:
377+
# FIXME: extract dimension name from nc file?
378+
dim = nc2ids.ncmeta.get_dimensions(
379+
metadata.path_string, nc2ids.homogeneous_time
380+
)[-1]
381+
size = nc2ids.group.dimensions[dim].size
382+
383+
child._set_lazy_context(LazyArrayStructContext(nc2ids, self.index, size))
384+
385+
elif data_type is IDSDataType.STRUCTURE:
386+
child._set_lazy_context(self)
387+
388+
elif var is not None: # Data elements
389+
value = None
390+
if "sparse" in var.ncattrs():
391+
if metadata.ndim:
392+
shape_var = nc2ids.group[var.name + ":shape"]
393+
shape = shape_var[self.index]
394+
if shape.all():
395+
value = var[self.index + tuple(map(slice, shape))]
396+
else:
397+
value = var[self.index]
398+
if value == getattr(var, "_FillValue", None):
399+
value = None # Skip setting
400+
else:
401+
value = var[self.index]
402+
403+
if value is not None:
404+
if isinstance(value, np.ndarray):
405+
# Convert the numpy array to a read-only view
406+
value = value.view()
407+
value.flags.writeable = False
408+
# NOTE: bypassing IDSPrimitive.value.setter logic
409+
child._IDSPrimitive__value = value
410+
411+
412+
class LazyArrayStructContext(LazyContext):
413+
def __init__(self, nc2ids, index, size):
414+
super().__init__(nc2ids, index)
415+
self.size = size
416+
417+
def get_context(self):
418+
return self # IDSStructArray expects to get something with a size attribute
419+
420+
def iterate_to_index(self, index: int) -> LazyContext:
421+
return LazyContext(self.nc2ids, self.index + (index,))

0 commit comments

Comments
 (0)