Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle NetCDF variable length strings (and other VLen types) #6340

Merged
merged 33 commits into from
Mar 7, 2025
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
3f7779e
Initial workaround - if variable is a "str" type then force it to load.
ukmo-ccbunney Oct 8, 2024
faf0b2e
Added some TODO comments
ukmo-ccbunney Oct 8, 2024
b7dc23b
Merge remote-tracking branch 'upstream/main' into bugfix/varlen_str_type
ukmo-ccbunney Feb 25, 2025
d8a666d
Working solution that checks for VLEN arrays and handles the special
ukmo-ccbunney Feb 26, 2025
be2654c
Formatting updates
ukmo-ccbunney Feb 26, 2025
3723cd5
Added size hinting of variable length arrays using CHUNK_CONTROL cont…
ukmo-ccbunney Feb 26, 2025
ab9eccc
Access netCDF variable `datatype` via `cf_var.cf_data` to avoid Mock …
ukmo-ccbunney Feb 27, 2025
1168b56
Make check on cf_var.cf_data.datatype optional as it assumes underlying
ukmo-ccbunney Feb 27, 2025
adce495
Added warning category
ukmo-ccbunney Feb 27, 2025
c2423a9
Added unit tests for VLen arrays.
ukmo-ccbunney Feb 27, 2025
f33ce39
Merge branch 'main' into varlen_str_type
ukmo-ccbunney Feb 27, 2025
90b4a0a
Use 'safe-access' version of netCDF4.VLType to satisfy coding-standar…
pp-mo Feb 27, 2025
fc871ba
Updates to comments and added _MEAN_VL_ARRAY_LEN as module variable
ukmo-ccbunney Feb 28, 2025
4c16226
Merge branch 'varlen_str_type' into vlen_safe_nc
ukmo-ccbunney Feb 28, 2025
1b0876d
Merge pull request #1 from pp-mo/vlen_safe_nc
ukmo-ccbunney Feb 28, 2025
13ca3dd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 28, 2025
0db9371
Restored accidentally removed comment
ukmo-ccbunney Feb 28, 2025
f2edbb7
Updated docstring and added docs
ukmo-ccbunney Feb 28, 2025
1c34e42
Merge remote-tracking branch 'upstream/main' into varlen_str_type
ukmo-ccbunney Feb 28, 2025
b200099
Updated Whats New
ukmo-ccbunney Feb 28, 2025
3f50599
Fixed some typos in docs
ukmo-ccbunney Feb 28, 2025
51a93a0
Fixed doctest indent
ukmo-ccbunney Feb 28, 2025
ffc00dd
Missing imports for doctest
ukmo-ccbunney Feb 28, 2025
252fa63
Rewording of docs
ukmo-ccbunney Feb 28, 2025
f094cba
Merge branch 'main' into varlen_str_type
pp-mo Mar 3, 2025
3d5b468
Missing black lines in doctest
ukmo-ccbunney Mar 3, 2025
c340d7e
Update lockfiles with iris-sample-data
ukmo-ccbunney Mar 3, 2025
da1855c
Better URL for variable length types [Review comment]
ukmo-ccbunney Mar 4, 2025
f6cb8fe
Slight rewording of docs and fix some typos [Review comments]
ukmo-ccbunney Mar 4, 2025
c8e2fc9
reworded hinting section
ukmo-ccbunney Mar 4, 2025
a7db764
Typo
ukmo-ccbunney Mar 4, 2025
c04bad2
Merge branch 'main' into varlen_str_type
pp-mo Mar 6, 2025
c37d527
Merge branch 'main' into varlen_str_type
pp-mo Mar 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions lib/iris/fileformats/netcdf/_thread_safe_nc.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
# Doesn't need thread protection, but this allows all netCDF4 refs to be
# replaced with thread_safe refs.
default_fillvals = netCDF4.default_fillvals
# The class of netCDF "variable length" (VLEN) datatypes, aliased here so that other
# modules can test for it (e.g. with `isinstance`) via this module.
VLType = netCDF4.VLType


class _ThreadSafeWrapper(ABC):
Expand Down
45 changes: 40 additions & 5 deletions lib/iris/fileformats/netcdf/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,11 @@ def _get_actual_dtype(cf_var):
# mostly done for speed improvement. See https://github.com/SciTools/iris/pull/5069
_LAZYVAR_MIN_BYTES = 5000

# A rough guess at the mean length of the "ragged dimension" for netCDF "variable
# length arrays" (`netCDF4.VLType` type). The total array size is unknown until the
# variable is read in, so this value stands in for it when estimating the data size.
# Making this number bigger makes it more likely an array will be loaded lazily.
# This is only the default: a per-variable "_vl_hint" supplied through CHUNK_CONTROL
# can replace it.
_MEAN_VL_ARRAY_LEN = 10


def _get_cf_var_data(cf_var, filename):
"""Get an array representing the data of a CF variable.
Expand All @@ -215,23 +220,53 @@ def _get_cf_var_data(cf_var, filename):
# See https://github.com/SciTools/iris/issues/4994 "Xarray bridge".
result = cf_var._data_array
else:
total_bytes = cf_var.size * cf_var.dtype.itemsize
# Determine size of data; however can't do this for variable length (VLEN)
# netCDF arrays as the size of the array can only be known by reading the
# data; see https://github.com/Unidata/netcdf-c/issues/1893.
# Note: "Variable length" netCDF types have a datatype of `nc.VLType`.
if isinstance(getattr(cf_var, "datatype", None), _thread_safe_nc.VLType):
msg = (
f"NetCDF variable `{cf_var.cf_name}` is a variable length type of kind {cf_var.dtype} "
"thus the total data size cannot be known in advance. This may affect the lazy loading "
"of the data."
)
warnings.warn(msg, category=iris.warnings.IrisLoadWarning)

# Give user the chance to pass a hint of the average variable length array size via
# the chunk control context manager. This allows for better decisions to be made on
# whether the data should be lazy-loaded or not.
mean_vl_array_len = _MEAN_VL_ARRAY_LEN
if CHUNK_CONTROL.mode is not CHUNK_CONTROL.Modes.AS_DASK:
if chunks := CHUNK_CONTROL.var_dim_chunksizes.get(cf_var.cf_name):
if vl_chunk_hint := chunks.get("_vl_hint"):
mean_vl_array_len = vl_chunk_hint

# Special handling for strings (`str` type) as these don't have an itemsize attribute;
# assume 4 bytes which is sufficient for unicode character storage
itemsize = 4 if cf_var.dtype is str else cf_var.dtype.itemsize

# For `VLType` cf_var.size will just return the known dimension size.
total_bytes = cf_var.size * mean_vl_array_len * itemsize
else:
# Normal NCVariable type:
total_bytes = cf_var.size * cf_var.dtype.itemsize

if total_bytes < _LAZYVAR_MIN_BYTES:
# Don't make a lazy array, as it will cost more memory AND more time to access.
# Instead fetch the data immediately, as a real array, and return that.
result = cf_var[:]

else:
# Get lazy chunked data out of a cf variable.
# Creates Dask wrappers around data arrays for any cube components which
# can have lazy values, e.g. Cube, Coord, CellMeasure, AuxiliaryVariable.
dtype = _get_actual_dtype(cf_var)

# Make a data-proxy that mimics array access and can fetch from the file.
# Note: Special handling needed for "variable length string" types which
# return a dtype of `str`, rather than a numpy type; use `S1` in this case.
fill_dtype = "S1" if cf_var.dtype is str else cf_var.dtype.str[1:]
fill_value = getattr(
cf_var.cf_data,
"_FillValue",
_thread_safe_nc.default_fillvals[cf_var.dtype.str[1:]],
_thread_safe_nc.default_fillvals[fill_dtype],
)
proxy = NetCDFDataProxy(
cf_var.shape, dtype, filename, cf_var.cf_name, fill_value
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

from iris._lazy_data import _optimum_chunksize
import iris.fileformats.cf
from iris.fileformats.netcdf._thread_safe_nc import VLType
from iris.fileformats.netcdf.loader import CHUNK_CONTROL, _get_cf_var_data


Expand All @@ -33,7 +34,8 @@ def _make(self, chunksizes=None, shape=None, dtype="i4", **extra_properties):
cf_data.chunking = mock.MagicMock(return_value=chunksizes)
if shape is None:
shape = self.shape
dtype = np.dtype(dtype)
if dtype is not str: # for testing VLen str arrays (dtype=`class <str>`)
dtype = np.dtype(dtype)
cf_var = mock.MagicMock(
spec=iris.fileformats.cf.CFVariable,
dtype=dtype,
Expand Down Expand Up @@ -103,6 +105,51 @@ def test_arraytype__100f8_is_real(self):
var_data = _get_cf_var_data(cf_var, self.filename)
self.assertIs(var_data, mock.sentinel.real_data_accessed)

def test_vltype__1000str_is_lazy(self):
    # 1000 variable-length strings and no size hint: the estimated data size is
    # large enough that the loader should hand back a lazy (dask) array.
    vlen_str_type = mock.Mock(spec=VLType, dtype=str, name="varlen string type")
    result = _get_cf_var_data(
        self._make(shape=(1000,), dtype=str, datatype=vlen_str_type),
        self.filename,
    )
    self.assertIsInstance(result, da.Array)

def test_vltype__1000str_is_real_with_hint(self):
    # Variable length string type with a hint on the array variable length size.
    # Without a hint, 1000 strings are loaded lazily (see
    # `test_vltype__1000str_is_lazy`), so getting real data here demonstrates that
    # the `_vl_hint=1` setting was actually used in the size estimate.
    # NOTE: the shape must be (1000,) as in the test name; with only 100 strings the
    # data would be real even if the hint were ignored, making the test vacuous.
    mock_vltype = mock.Mock(spec=VLType, dtype=str, name="varlen string type")
    cf_var = self._make(shape=(1000,), dtype=str, datatype=mock_vltype)
    with CHUNK_CONTROL.set("DUMMY_VAR", _vl_hint=1):
        var_data = _get_cf_var_data(cf_var, self.filename)
    self.assertIs(var_data, mock.sentinel.real_data_accessed)

def test_vltype__100str_is_real(self):
    # 100 variable-length strings and no size hint: the estimated data size is
    # small, so the loader should fetch the data immediately as a real array.
    vlen_str_type = mock.Mock(spec=VLType, dtype=str, name="varlen string type")
    result = _get_cf_var_data(
        self._make(shape=(100,), dtype=str, datatype=vlen_str_type),
        self.filename,
    )
    self.assertIs(result, mock.sentinel.real_data_accessed)

def test_vltype__100str_is_lazy_with_hint(self):
    # 100 variable-length strings would normally be fetched as real data, but a
    # large mean-length hint raises the size estimate enough to make them lazy.
    vlen_str_type = mock.Mock(spec=VLType, dtype=str, name="varlen string type")
    variable = self._make(shape=(100,), dtype=str, datatype=vlen_str_type)
    with CHUNK_CONTROL.set("DUMMY_VAR", _vl_hint=50):
        result = _get_cf_var_data(variable, self.filename)
    self.assertIsInstance(result, da.Array)

def test_vltype__100f8_is_lazy(self):
    # Variable length float64 type, with no size hint.
    # 100 elements (as in the test name) is already enough for the default size
    # estimate to choose lazy loading; this is the un-hinted counterpart of
    # `test_vltype__100f8_is_real_with_hint`, which uses the same shape.
    mock_vltype = mock.Mock(spec=VLType, dtype="f8", name="varlen float64 type")
    cf_var = self._make(shape=(100,), dtype="f8", datatype=mock_vltype)
    var_data = _get_cf_var_data(cf_var, self.filename)
    self.assertIsInstance(var_data, da.Array)

def test_vltype__100f8_is_real_with_hint(self):
    # 100 variable-length float64 arrays, with a small mean-length hint that
    # lowers the size estimate enough for the data to be fetched as a real array.
    vlen_f8_type = mock.Mock(spec=VLType, dtype="f8", name="varlen float64 type")
    variable = self._make(shape=(100,), dtype="f8", datatype=vlen_f8_type)
    with CHUNK_CONTROL.set("DUMMY_VAR", _vl_hint=2):
        result = _get_cf_var_data(variable, self.filename)
    self.assertIs(result, mock.sentinel.real_data_accessed)

def test_cf_data_emulation(self):
# Check that a variable emulation object passes its real data directly.
emulated_data = mock.Mock()
Expand Down
Loading