-
Notifications
You must be signed in to change notification settings - Fork 17
Kvikio backend entrypoint with Zarr v3 #70
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 35 commits
9deadb7
aa2dc91
7fb4b94
743fe7d
5d501e4
facf5f7
f3f5189
9c98d19
dd8bc57
d2da1e4
b87c3c2
87cb74e
d7394ef
1b23fef
ca0cf45
97260d6
5d27b26
85491d7
c470b97
95efa18
d684dad
ae2a7f1
15fbafd
f3df115
4e1857a
7345b61
e2b410e
7dd78e9
cb77678
0262151
e26ed24
f185b44
1a52ce5
789a9b6
3894d29
7fa7c06
1e205ec
9227810
b45decb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
from . import _version | ||
from .accessors import CupyDataArrayAccessor, CupyDatasetAccessor # noqa | ||
from .accessors import CupyDataArrayAccessor, CupyDatasetAccessor # noqa: F401 | ||
from .kvikio import KvikioBackendEntrypoint # noqa: F401 | ||
|
||
__version__ = _version.get_versions()["version"] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
""" | ||
:doc:`kvikIO <kvikio:index>` backend for xarray to read Zarr stores directly into CuPy | ||
arrays in GPU memory. | ||
""" | ||
|
||
from xarray.backends.common import _normalize_path # TODO: can this be public | ||
from xarray.backends.store import StoreBackendEntrypoint | ||
from xarray.backends.zarr import ZarrBackendEntrypoint, ZarrStore | ||
from xarray.core.dataset import Dataset | ||
from xarray.core.utils import close_on_error # TODO: can this be public. | ||
|
||
try: | ||
import kvikio.zarr | ||
|
||
has_kvikio = True | ||
except ImportError: | ||
has_kvikio = False | ||
|
||
|
||
class KvikioBackendEntrypoint(ZarrBackendEntrypoint): | ||
""" | ||
Xarray backend to read Zarr stores using 'kvikio' engine. | ||
|
||
For more information about the underlying library, visit | ||
:doc:`kvikIO's Zarr page<kvikio:zarr>`. | ||
""" | ||
|
||
available = has_kvikio | ||
description = "Open zarr files (.zarr) using Kvikio" | ||
url = "https://docs.rapids.ai/api/kvikio/stable/api/#zarr" | ||
|
||
# disabled by default | ||
# We need to provide this because of the subclassing from | ||
# ZarrBackendEntrypoint | ||
def guess_can_open(self, filename_or_obj): | ||
return False | ||
|
||
def open_dataset( | ||
self, | ||
filename_or_obj, | ||
mask_and_scale=True, | ||
decode_times=True, | ||
concat_characters=True, | ||
decode_coords=True, | ||
drop_variables=None, | ||
use_cftime=None, | ||
decode_timedelta=None, | ||
group=None, | ||
mode="r", | ||
synchronizer=None, | ||
consolidated=None, | ||
chunk_store=None, | ||
storage_options=None, | ||
zarr_version=None, | ||
zarr_format=None, | ||
store=None, | ||
engine=None, | ||
use_zarr_fill_value_as_mask=None, | ||
cache_members: bool = True, | ||
) -> Dataset: | ||
filename_or_obj = _normalize_path(filename_or_obj) | ||
if not store: | ||
store = ZarrStore.open_group( | ||
store=kvikio.zarr.GDSStore(root=filename_or_obj), | ||
group=group, | ||
mode=mode, | ||
synchronizer=synchronizer, | ||
consolidated=consolidated, | ||
consolidate_on_close=False, | ||
chunk_store=chunk_store, | ||
storage_options=storage_options, | ||
zarr_version=zarr_version, | ||
use_zarr_fill_value_as_mask=None, | ||
zarr_format=zarr_format, | ||
cache_members=cache_members, | ||
) | ||
|
||
store_entrypoint = StoreBackendEntrypoint() | ||
with close_on_error(store): | ||
ds = store_entrypoint.open_dataset( | ||
store, | ||
mask_and_scale=mask_and_scale, | ||
decode_times=decode_times, | ||
concat_characters=concat_characters, | ||
decode_coords=decode_coords, | ||
drop_variables=drop_variables, | ||
use_cftime=use_cftime, | ||
decode_timedelta=decode_timedelta, | ||
) | ||
return ds |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
import cupy as cp | ||
import numpy as np | ||
import pytest | ||
import xarray as xr | ||
from xarray.core.indexing import ExplicitlyIndexedNDArrayMixin | ||
|
||
kvikio = pytest.importorskip("kvikio") | ||
zarr = pytest.importorskip("zarr") | ||
|
||
import kvikio.zarr # noqa | ||
import xarray.core.indexing # noqa | ||
|
||
|
||
@pytest.fixture | ||
def store(tmp_path): | ||
ds = xr.Dataset( | ||
{ | ||
"a": ("x", np.arange(10), {"foo": "bar"}), | ||
"scalar": np.array(1), | ||
}, | ||
coords={"x": ("x", np.arange(-5, 5))}, | ||
) | ||
|
||
for var in ds.variables: | ||
ds[var].encoding["compressors"] = None | ||
|
||
store_path = tmp_path / "kvikio.zarr" | ||
ds.to_zarr(store_path, consolidated=True) | ||
return store_path | ||
|
||
|
||
def test_entrypoint(): | ||
assert "kvikio" in xr.backends.list_engines() | ||
|
||
|
||
@pytest.mark.parametrize("consolidated", [True, False]) | ||
def test_lazy_load(consolidated, store): | ||
with xr.open_dataset(store, engine="kvikio", consolidated=consolidated) as ds: | ||
for _, da in ds.data_vars.items(): | ||
assert isinstance(da.variable._data, ExplicitlyIndexedNDArrayMixin) | ||
|
||
|
||
@pytest.mark.parametrize("indexer", [slice(None), slice(2, 4), 2, [2, 3, 5]]) | ||
def test_lazy_indexing(indexer, store): | ||
with zarr.config.enable_gpu(), xr.open_dataset(store, engine="kvikio") as ds: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ideally, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure... The (v3) kvikio.zarr.GDSStore is written to accept either. I'm tempted to leave this up to the user (so make them configure Zarr correctly) rather than assuming they want to use GPU memory, but maybe There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, Looking at There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am inclined to agree that But for ease of use, it might be better to default to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is similar to the discussion at zarr-developers/zarr-python#2473. That Currently the |
||
ds = ds.isel(x=indexer) | ||
for _, da in ds.data_vars.items(): | ||
assert isinstance(da.variable._data, ExplicitlyIndexedNDArrayMixin) | ||
|
||
loaded = ds.compute() | ||
for _, da in loaded.data_vars.items(): | ||
if da.ndim == 0: | ||
continue | ||
assert isinstance(da.data, cp.ndarray) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, those two are functionally the same. The
xr.open_dataset(kvikio.zarr.GDSStore("data.zarr"), engine="zarr")
style works today (with xarraypatch=2025.03.0 and kvikio=25.04.00a), whilexr.open_dataset("data.zarr", engine="kvikio")
is more just a convenience syntax. I was hoping to also sneak in thezarr.config.enable_gpu()
in the backend somewhere to getcupy
arrays by default (but we can discuss that in the https://github.com/xarray-contrib/cupy-xarray/pull/70/files#r1988251560 thread).