WIP: Add functionality to virtualize GeoTIFFs using async_tiff #524

Draft: wants to merge 17 commits into base: develop
Changes from 15 commits
21 changes: 15 additions & 6 deletions pyproject.toml
@@ -66,11 +66,16 @@ fits = [
"kerchunk>=0.2.8",
"astropy",
]
tif = [
"obstore>=0.5.1",
"async-tiff @ git+https://github.com/developmentseed/async-tiff#subdirectory=python",
]
all_readers = [
"virtualizarr[hdf]",
"virtualizarr[hdf5]",
"virtualizarr[netcdf3]",
"virtualizarr[fits]",
"virtualizarr[tif]",
]

# writers
@@ -95,7 +100,6 @@ upstream = [
# optional dependencies
'astropy @ git+https://github.com/astropy/astropy',
'fsspec @ git+https://github.com/fsspec/filesystem_spec',
's3fs @ git+https://github.com/fsspec/s3fs',
'kerchunk @ git+https://github.com/fsspec/kerchunk',
'icechunk @ git+https://github.com/earth-mover/icechunk#subdirectory=icechunk-python',
]
@@ -168,6 +172,9 @@ h5netcdf = ">=1.5.0,<2"
[tool.pixi.feature.icechunk-dev.dependencies]
rust = "*"

[tool.pixi.feature.rio.dependencies]
rioxarray = "*"

# Define commands to run within the test environments
[tool.pixi.feature.test.tasks]
run-mypy = { cmd = "mypy virtualizarr" }
@@ -181,12 +188,12 @@ run-tests-html-cov = { cmd = "pytest -n auto --run-network-tests --verbose --cov
[tool.pixi.environments]
min-deps = ["dev", "test", "hdf", "hdf5", "hdf5-lib"] # VirtualiZarr/conftest.py using h5py, so the minimum set of dependencies for testing still includes hdf libs
# Inherit from min-deps to get all the test commands, along with optional dependencies
test = ["dev", "test", "remote", "hdf", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore"]
test-py311 = ["dev", "test", "remote", "hdf", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore", "py311"] # test against python 3.11
test-py312 = ["dev", "test", "remote", "hdf", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore", "py312"] # test against python 3.12
test = ["dev", "test", "remote", "hdf", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore", "tif", "rio"]
test-py311 = ["dev", "test", "remote", "hdf", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore", "tif", "rio", "py311"] # test against python 3.11
test-py312 = ["dev", "test", "remote", "hdf", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore", "tif", "rio", "py312"] # test against python 3.12
minio = ["dev", "remote", "hdf", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore", "py312", "minio"]
upstream = ["dev", "test", "hdf", "hdf5", "hdf5-lib", "netcdf3", "upstream", "icechunk-dev"]
all = ["dev", "test", "remote", "hdf", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore", "all_readers", "all_writers"]
all = ["dev", "test", "remote", "hdf", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore", "tif", "rio", "all_readers", "all_writers"]
docs = ["docs"]

# Define commands to run within the docs environment
@@ -222,7 +229,9 @@ module = [
"minio",
"numcodecs.*",
"ujson",
"zarr",
"zarr.*",
"async_tiff.*",
"obstore.*",
]
ignore_missing_imports = true

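Note: the new `tif` extra above pulls in `obstore` and the Python bindings of `async-tiff` (installed from the git subdirectory). A minimal smoke check, assuming the extra is installed with `pip install "virtualizarr[tif]"` and that the packages expose the `async_tiff` and `obstore` module names added to the mypy overrides:

```python
# Minimal sketch: verify the optional "tif" dependencies are importable.
# Assumes `pip install "virtualizarr[tif]"` (or the pixi "tif"/"rio" features).
import async_tiff  # async GeoTIFF reader used by the new virtualization path
import obstore     # object-store layer used for byte-range reads

print("async_tiff:", getattr(async_tiff, "__version__", "unknown"))
print("obstore:", getattr(obstore, "__version__", "unknown"))
```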
36 changes: 16 additions & 20 deletions virtualizarr/manifests/store.py
@@ -108,36 +108,29 @@ def get_zarr_metadata(manifest_group: ManifestGroup, key: str) -> Buffer:
return dict_to_buffer(metadata, prototype=default_buffer_prototype())


def parse_manifest_index(
key: str, chunk_key_encoding: str = "."
) -> tuple[str, tuple[int, ...]]:
def parse_manifest_index(key: str, chunk_key_encoding: str = ".") -> tuple[int, ...]:
"""
Splits `key` provided to a zarr store into the variable indicated
by the first part and the chunk index from the 3rd through last parts,
which can be used to index into the ndarrays containing paths, offsets,
and lengths in ManifestArrays.

Currently only works for 1d+ arrays with a tree depth of one from the
root Zarr group.

Parameters
----------
key : str
chunk_key_encoding : str

Returns
-------
ManifestIndex
tuple containing chunk indexes
"""
parts = key.split("/")
var = parts[0]
# Assume "c" is the second part
# TODO: Handle scalar array case with "c" holds the data
if chunk_key_encoding == "/":
indexes = tuple(int(ind) for ind in parts[2:])
else:
indexes = tuple(int(ind) for ind in parts[2].split(chunk_key_encoding))
return var, indexes
if key.endswith("c"):
# Scalar arrays hold the data in the "c" key
return (0,)
parts = key.split(
"c/"
) # TODO: Open an issue upstream about the Zarr spec indicating this should be f"c{chunk_key_encoding}" rather than always "c/"
return tuple(int(ind) for ind in parts[1].split(chunk_key_encoding))
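For illustration, a standalone sketch mirroring the parsing above (assumed behavior; it does not import virtualizarr and only covers the "/" separator plus the scalar case):

```python
# Standalone sketch of the chunk-key parsing above (illustrative only).
def parse(key: str, sep: str = "/") -> tuple[int, ...]:
    if key.endswith("c"):
        # Scalar arrays hold the data in the bare "c" key
        return (0,)
    _, chunk_part = key.split("c/")  # see the TODO above about f"c{sep}" vs "c/"
    return tuple(int(i) for i in chunk_part.split(sep))

assert parse("temperature/c/0/1") == (0, 1)  # 2-D chunk at row 0, column 1
assert parse("scalar/c") == (0,)             # scalar array
```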


def find_matching_store(stores: StoreDict, request_key: str) -> StoreRequest:
@@ -263,13 +256,16 @@ async def get(

if key.endswith("zarr.json"):
return get_zarr_metadata(self._group, key)
var, chunk_key = parse_manifest_index(key)
var = key.split("/")[0]
marr = self._group.arrays[var]
manifest = marr.manifest

path = manifest._paths[*chunk_key]
offset = manifest._offsets[*chunk_key]
length = manifest._lengths[*chunk_key]
chunk_indexes = parse_manifest_index(
key, marr.metadata.chunk_key_encoding.separator
)
path = manifest._paths[*chunk_indexes]
offset = manifest._offsets[*chunk_indexes]
length = manifest._lengths[*chunk_indexes]
# Get the configured object store instance that matches the path
store_request = find_matching_store(stores=self._stores, request_key=path)
# Transform the input byte range to account for the chunk location in the file
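Conceptually, `get` now resolves the key against the array's own chunk-key-encoding separator before indexing the manifest arrays. A hedged sketch of that lookup with illustrative data (not the actual ManifestStore internals):

```python
import numpy as np

# Illustrative manifest arrays for one variable with a 2x2 chunk grid.
paths = np.array([["s3://bucket/t.tif"] * 2] * 2, dtype=object)
offsets = np.array([[0, 4096], [8192, 12288]], dtype=np.uint64)
lengths = np.full((2, 2), 4096, dtype=np.uint64)

def resolve(key: str, separator: str = "/") -> tuple[str, int, int]:
    # e.g. key = "temperature/c/1/0"; the first part names the ManifestArray
    chunk_part = key.split("c/")[1]
    idx = tuple(int(i) for i in chunk_part.split(separator))
    return paths[idx], int(offsets[idx]), int(lengths[idx])

print(resolve("temperature/c/1/0"))  # ('s3://bucket/t.tif', 8192, 4096)
```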
11 changes: 9 additions & 2 deletions virtualizarr/manifests/utils.py
@@ -2,6 +2,7 @@

import numpy as np
from zarr import Array
from zarr.core.chunk_key_encodings import ChunkKeyEncodingLike
from zarr.core.metadata.v3 import ArrayV3Metadata

from virtualizarr.codecs import convert_to_codec_pipeline, get_codecs
@@ -14,9 +15,11 @@ def create_v3_array_metadata(
shape: tuple[int, ...],
data_type: np.dtype,
chunk_shape: tuple[int, ...],
chunk_key_encoding: ChunkKeyEncodingLike = {"name": "default"},
fill_value: Any = None,
codecs: Optional[list[Dict[str, Any]]] = None,
attributes: Optional[Dict[str, Any]] = None,
dimension_names: Optional[tuple[str, ...]] = None,
) -> ArrayV3Metadata:
"""
Create an ArrayV3Metadata instance with standard configuration.
@@ -30,12 +33,16 @@
The numpy dtype of the array
chunk_shape : tuple[int, ...]
The shape of each chunk
chunk_key_encoding : ChunkKeyEncodingLike
The mapping from chunk grid cell coordinates to keys.
fill_value : Any, optional
The fill value for the array
codecs : list[Dict[str, Any]], optional
List of codec configurations
attributes : Dict[str, Any], optional
Additional attributes for the array
dimension_names : tuple[str], optional
Names of the dimensions

Returns
-------
@@ -49,14 +56,14 @@
"name": "regular",
"configuration": {"chunk_shape": chunk_shape},
},
chunk_key_encoding={"name": "default"},
chunk_key_encoding=chunk_key_encoding,
fill_value=fill_value,
codecs=convert_to_codec_pipeline(
codecs=codecs or [],
dtype=data_type,
),
attributes=attributes or {},
dimension_names=None,
dimension_names=dimension_names,
storage_transformers=None,
)

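A hedged usage sketch of the extended signature (argument values are illustrative; the dict form of `chunk_key_encoding` is one of the shapes accepted by zarr's `ChunkKeyEncodingLike`):

```python
import numpy as np
from virtualizarr.manifests.utils import create_v3_array_metadata

metadata = create_v3_array_metadata(
    shape=(1024, 2048),
    data_type=np.dtype("uint16"),
    chunk_shape=(256, 256),
    chunk_key_encoding={"name": "default", "configuration": {"separator": "/"}},
    fill_value=0,
    dimension_names=("y", "x"),
)
print(metadata.chunk_key_encoding, metadata.dimension_names)
```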
30 changes: 24 additions & 6 deletions virtualizarr/readers/common.py
@@ -1,17 +1,35 @@
import dataclasses
from collections.abc import Iterable, Mapping
from typing import (
Any,
Hashable,
MutableMapping,
Optional,
)
from typing import Any, Hashable, MutableMapping, Optional, TypedDict

import numpy as np
import xarray as xr
import xarray.indexes
from numcodecs.abc import Codec

from virtualizarr.utils import _FsspecFSFromFilepath


@dataclasses.dataclass
class ZstdProperties:
level: int


@dataclasses.dataclass
class ShuffleProperties:
elementsize: int


@dataclasses.dataclass
class ZlibProperties:
level: int


class CFCodec(TypedDict):
target_dtype: np.dtype
codec: Codec
Review comment (Member): This stuff probably deserves to be in a dedicated codecs.py file (which I think we already have?)



def construct_fully_virtual_dataset(
virtual_vars: Mapping[str, xr.Variable],
coord_names: Iterable[str] | None = None,
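For reference, a hedged sketch of the relocated containers in use (values are illustrative; `Zlib` is the numcodecs codec, not part of this PR):

```python
import numpy as np
from numcodecs import Zlib
from virtualizarr.readers.common import CFCodec, ZlibProperties

zlib_props = ZlibProperties(level=4)
cf_codec: CFCodec = {
    "target_dtype": np.dtype("float32"),
    "codec": Zlib(level=zlib_props.level),
}
print(cf_codec["codec"])
```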
28 changes: 7 additions & 21 deletions virtualizarr/readers/hdf/filters.py
@@ -1,14 +1,20 @@
from __future__ import annotations

import dataclasses
from typing import TYPE_CHECKING, List, Tuple, TypedDict, Union
from typing import TYPE_CHECKING, List, Tuple, Union

import numcodecs.registry as registry
import numpy as np
from numcodecs.abc import Codec
from numcodecs.fixedscaleoffset import FixedScaleOffset
from xarray.coding.variables import _choose_float_dtype

from virtualizarr.readers.common import (
CFCodec,
ShuffleProperties,
ZlibProperties,
ZstdProperties,
)
from virtualizarr.utils import soft_import

h5py = soft_import("h5py", "For reading hdf files", strict=False)
@@ -48,26 +54,6 @@ def __post_init__(self):
self.cname = blosc_compressor_codes[self.cname]


@dataclasses.dataclass
class ZstdProperties:
level: int


@dataclasses.dataclass
class ShuffleProperties:
elementsize: int


@dataclasses.dataclass
class ZlibProperties:
level: int


class CFCodec(TypedDict):
target_dtype: np.dtype
codec: Codec


def _filter_to_codec(
filter_id: str, filter_properties: Union[int, None, Tuple] = None
) -> Codec: