2. xarray, netcdf and zarr¶

Motivation: how you store your data can an enormous effect on performance.

Four issues:

Compression vs. cpu time to uncompress/compress
Multithreaded read/writes
Performance for cloud storage (Amazon, Google Compute, Azure)
Throttling data reads to fit in available memory and avoid swapping (“chunking”)

2.1. The current defacto standard in atmos/ocean science¶

netcdf/hdf5

2.2. Some challenges with netcdf¶

2.3. create an xarray¶

import glob
import numpy as np
import pdb
from pathlib import Path
import xarray as xr
from matplotlib import pyplot as plt

#!conda install -y xarray

import context
from westgrid.data_read import download

2.4. Download toy model data¶

#get 10 files, each is the same timestep for a member of a
#10 member ensemble

import numpy as np
root='http://clouds.eos.ubc.ca/~phil/docs/atsc500/data/small_les'
for i in range(10):
    the_name=f'mar12014_{(i+1):d}_15600.nc'
    print(the_name)
    url='{}/{}'.format(root,the_name)
    download(the_name,root=root)

#get 10 files, each is the same timestep for a member of a
#10 member ensemble
import numpy as np
root=Path().resolve()
the_files=root.glob('mar12*nc')
the_files=list(the_files)
print(the_files)

2.5. Sort in numeric order¶

def sort_name(pathname):
    """
      sort the filenames so '10' sorts
      last by converting to integers
    """
    pathname = Path(pathname)
    str_name=str(pathname.name)
    front, number, back = str_name.split('_')
    return int(number)

2.6. Make an xarray¶

Now use xarray to stitch together the 10 ensemble members along a new “virtual dimenstion”. The variable “ds” is an xray dataset, which controls the reads/writes from the 10 netcdf files

the_files.sort(key=sort_name)

#
#  put the 10 ensembles together along a new "ens" dimension
#  using an xray multi-file dataset
#
ds = [xr.open_dataset(item) for item in the_files]
dim = xr.IndexVariable("realization", np.arange(len(ds)), attrs={"axis": "E"})
ens = xr.concat(ds, dim)
for vname, var in ds[0].variables.items():
    ens[vname].attrs.update(**var.attrs)
ens.attrs.update(**ds[0].attrs)
# dump the structure
print(ens)

# 3-d ensemble average for temp

x = ens['x']
y = ens['y']
z = ens['z']
temp = ens['TABS']
mean_temp = temp[:,  :, :, :].mean(dim='realization')
#
# same for velocity
#
wvel = ens['W']
mean_w = wvel[:,  :, :, :].mean(dim='realization')
#
# now look at the perturbation fields for one ensemble member
#
wvelprime = wvel[0,  :, :, :] - mean_w
Tprime = temp[0,  :, :, :] - mean_temp
flux_prime = wvelprime * Tprime
flux_profile = flux_prime.mean(dim='x').mean(dim='y')
keep_dict = dict(flux_prof=flux_profile, flux_prime=flux_prime.values,
                 wvelprime=wvelprime.values, Tprime=Tprime.values, x=x, y=y, z=z)

2.7. Dump to a zarr file¶

ens.to_zarr('zarr_dir','w');

fig1, ax1 = plt.subplots(1, 1)
ax1.plot('flux_prof', 'z', data = keep_dict)
ax1.set(title='Ens 0: vertically averaged kinematic heat flux',
        ylabel='z (m)', xlabel='flux (K m/s)')

fig2, ax2 = plt.subplots(1, 1)
z200 = np.searchsorted(keep_dict['z'], 200)
flux_200 = keep_dict['flux_prime'][z200,:,:].flat
ax2.hist(flux_200)
ax2.set(title='histogram of kinematic heat flux (K m/s) at z=200 m')

fig3, ax3 = plt.subplots(1, 1)
wvel200=keep_dict['wvelprime'][z200,:,:].flat
ax3.hist(wvel200)
ax3.set(title="histogram of wvel' at 200 m")

fig4, ax4 = plt.subplots(1, 1)
Tprime200=keep_dict['Tprime'][z200, :, :].flat
ax4.hist(Tprime200)
ax4.set(title="histogram ot T' at z=200 m");

1. Introduction to joblib 3. Using dask and zarr for multithreaded input/output