Add rechunking notebook for reference (#22)

kafitzgerald · web-flow · commit 3d9ebe3fb995 · 2025-04-01T15:36:50.000-06:00
diff --git a/rechunking/environment.yml b/rechunking/environment.yml
@@ -0,0 +1,11 @@
+name: rechunking
+channels:
+  - conda-forge
+  - nodefaults
+dependencies:
+  - python=3.12
+  - jupyterlab
+  - xarray
+  - zarr
+  - dask
+  - dask-jobqueue
diff --git a/rechunking/era5_rechunking.ipynb b/rechunking/era5_rechunking.ipynb
@@ -0,0 +1,211 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "153c98aa-2f72-4cb4-a96a-c01374d84930",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Dask imports\n",
+    "\n",
+    "from dask_jobqueue import PBSCluster\n",
+    "from dask.distributed import Client"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "338d23f5-92d2-422b-bbe4-01955aceff50",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Dask cluster config\n",
+    "\n",
+    "cluster = PBSCluster(\n",
+    "    # Basic job directives\n",
+    "    job_name        = 'hackathon-rechunk',\n",
+    "    queue           = 'casper',\n",
+    "    walltime        = '120:00',\n",
+    "    # Make sure you change the project code if running this notebook!!\n",
+    "    account         = 'UCSG0002',\n",
+    "    log_directory   = 'dask-logs',\n",
+    "    # These settings impact the resources assigned to the job\n",
+    "    cores           = 1,\n",
+    "    memory          = '10GiB',\n",
+    "    resource_spec   = 'select=1:ncpus=1:mem=10GB',\n",
+    "    # These settings define the resources assigned to a worker\n",
+    "    processes       = 1,\n",
+    "    # This controls where Dask will write data to disk if memory is exhausted\n",
+    "    local_directory = '/local_scratch/pbs.$PBS_JOBID/dask/spill',\n",
+    "    # This specifies which network interface the cluster will use\n",
+    "    interface       = 'ext'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0d4322e9-cc4f-4b45-815c-9b8228eb03a2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create the client to load the Dashboard\n",
+    "client = Client(cluster)\n",
+    "\n",
+    "# Display the client repr\n",
+    "client"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0c737b08-9cf2-4e90-9646-2013641815b7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Scale and wait for workers\n",
+    "\n",
+    "cluster.scale(40)\n",
+    "client.wait_for_workers(40)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d7d7583-c695-43c9-86a8-12f20b5d432d",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "import xarray as xr\n",
+    "import pandas as pd\n",
+    "import dask\n",
+    "\n",
+    "# Read in files\n",
+    "ds = xr.open_mfdataset('/glade/derecho/scratch/ksha/CREDIT_data/ERA5_mlevel_arXiv/SixHourly_y_TOTAL_202*.zarr',\n",
+    "                       engine = 'zarr',\n",
+    "                       consolidated=True,\n",
+    "                       data_vars='minimal',\n",
+    "                       coords='minimal',\n",
+    "                       compat='override',\n",
+    "                       parallel=True)\n",
+    "\n",
+    "# Rechunk the data\n",
+    "ds = ds.chunk({\"time\": 1, \"level\": 1, \"latitude\": 640, \"longitude\": 1280})\n",
+    "\n",
+    "# Remove the old encoding info and set compression to none\n",
+    "for k, v in ds.variables.items():\n",
+    "    v.encoding['compressors'] = None\n",
+    "    del v.encoding['chunks']\n",
+    "    del v.encoding['preferred_chunks']\n",
+    "\n",
+    "# Remove the old encoding info (default compression will then apply when written to Zarr)\n",
+    "# for k, v in ds.variables.items():\n",
+    "#     del v.encoding['compressors']\n",
+    "#     del v.encoding['chunks']\n",
+    "#     del v.encoding['preferred_chunks']\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "53fd0270-d21e-4f2f-a769-1701900f66f4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Some not particularly polished data wrangling to combine the arrays\n",
+    "# Skip this to write separate arrays\n",
+    "\n",
+    "full_variables = ['Q', 'T', 'U', 'V']\n",
+    "single_level_variables = ['Q500', 'T500', 'U500', 'V500', 'Z500', 't2m', 'SP']\n",
+    "\n",
+    "ds1 = xr.concat([ds[x] for x in single_level_variables],\n",
+    "                pd.Index(single_level_variables,\n",
+    "                         name='channel')).transpose('time',\n",
+    "                                                    'channel',\n",
+    "                                                    'latitude',\n",
+    "                                                    'longitude')\n",
+    "\n",
+    "c = xr.concat([ds[x] for x in full_variables], dim=full_variables)\n",
+    "\n",
+    "s = c.stack(channel = ('concat_dim','level')).transpose('time',\n",
+    "                                                        'channel',\n",
+    "                                                        'latitude',\n",
+    "                                                        'longitude').reset_index('channel')\n",
+    "\n",
+    "s['channel'] = s['concat_dim'] + s['level'].astype('str')\n",
+    "\n",
+    "ds2 = s.drop_vars(['level', 'concat_dim'])\n",
+    "\n",
+    "combined = xr.concat([ds1, ds2], dim='channel').rename('combined')\n",
+    "\n",
+    "combined.encoding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a3b394cc-9186-4a83-8a5d-2fedc3f10825",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Write to Zarr v3 with consolidated metdata\n",
+    "\n",
+    "combined.to_zarr('/glade/derecho/scratch/katelynw/era5/rechunked_stacked_uncompressed_test.zarr',\n",
+    "                 zarr_version=3,\n",
+    "                 consolidated=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cd075006-9d9c-43b4-82ce-9cb1a7d1c576",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Shutdown the cluster\n",
+    "\n",
+    "client.shutdown()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9eedf120-3afa-4e26-a345-f58cbdc032a7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Open up the new dataset and check the encoding\n",
+    "\n",
+    "ds_new = xr.open_dataset('/glade/derecho/scratch/katelynw/era5/rechunked_stacked_uncompressed_test.zarr')\n",
+    "\n",
+    "ds_new.combined.encoding"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [conda env:my-env]",
+   "language": "python",
+   "name": "conda-env-my-env-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}