Handle NetCDF variable length strings (and other VLen types) #6340

Merged
merged 33 commits, Mar 7, 2025
Changes from 31 commits
Commits (33)
3f7779e
Initial workaround - if variable is a "str" type then force it to load.
ukmo-ccbunney Oct 8, 2024
faf0b2e
Added some TODO comments
ukmo-ccbunney Oct 8, 2024
b7dc23b
Merge remote-tracking branch 'upstream/main' into bugfix/varlen_str_type
ukmo-ccbunney Feb 25, 2025
d8a666d
Working solution that checks for VLEN arrays and handles the special
ukmo-ccbunney Feb 26, 2025
be2654c
Formatting updates
ukmo-ccbunney Feb 26, 2025
3723cd5
Added size hinting of variable length arrays using CHUNK_CONTROL cont…
ukmo-ccbunney Feb 26, 2025
ab9eccc
Access netCDF variable `datatype` via `cf_var.cf_data` to avoid Mock …
ukmo-ccbunney Feb 27, 2025
1168b56
Make check on cf_var.cf_data.datatype optional as it assumes underlying
ukmo-ccbunney Feb 27, 2025
adce495
Added warning category
ukmo-ccbunney Feb 27, 2025
c2423a9
Added unit tests for VLen arrays.
ukmo-ccbunney Feb 27, 2025
f33ce39
Merge branch 'main' into varlen_str_type
ukmo-ccbunney Feb 27, 2025
90b4a0a
Use 'safe-access' version of netCDF4.VLType to satisfy coding-standar…
pp-mo Feb 27, 2025
fc871ba
Updates to comments and added _MEAN_VL_ARRAY_LEN as module variable
ukmo-ccbunney Feb 28, 2025
4c16226
Merge branch 'varlen_str_type' into vlen_safe_nc
ukmo-ccbunney Feb 28, 2025
1b0876d
Merge pull request #1 from pp-mo/vlen_safe_nc
ukmo-ccbunney Feb 28, 2025
13ca3dd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 28, 2025
0db9371
Restored accidentally removed comment
ukmo-ccbunney Feb 28, 2025
f2edbb7
Updated docstring and added docs
ukmo-ccbunney Feb 28, 2025
1c34e42
Merge remote-tracking branch 'upstream/main' into varlen_str_type
ukmo-ccbunney Feb 28, 2025
b200099
Updated Whats New
ukmo-ccbunney Feb 28, 2025
3f50599
Fixed some typos in docs
ukmo-ccbunney Feb 28, 2025
51a93a0
Fixed doctest indent
ukmo-ccbunney Feb 28, 2025
ffc00dd
Missing imports for doctest
ukmo-ccbunney Feb 28, 2025
252fa63
Rewording of docs
ukmo-ccbunney Feb 28, 2025
f094cba
Merge branch 'main' into varlen_str_type
pp-mo Mar 3, 2025
3d5b468
Missing black lines in doctest
ukmo-ccbunney Mar 3, 2025
c340d7e
Update lockfiles with iris-sample-data
ukmo-ccbunney Mar 3, 2025
da1855c
Better URL for variable length types [Review comment]
ukmo-ccbunney Mar 4, 2025
f6cb8fe
Slight rewording of docs and fix some typos [Review comments]
ukmo-ccbunney Mar 4, 2025
c8e2fc9
reworded hinting section
ukmo-ccbunney Mar 4, 2025
a7db764
Typo
ukmo-ccbunney Mar 4, 2025
c04bad2
Merge branch 'main' into varlen_str_type
pp-mo Mar 6, 2025
c37d527
Merge branch 'main' into varlen_str_type
pp-mo Mar 7, 2025
55 changes: 55 additions & 0 deletions docs/src/further_topics/netcdf_io.rst
@@ -122,6 +122,61 @@ Iris' optimisation all together, and will take its chunksizes from Dask's behavi
(70, 37, 49)


Variable-length datatypes
-------------------------

The netCDF4 module provides support for variable-length (or "ragged") data
types (``VLType``); see
`Variable-length data types <https://unidata.github.io/netcdf4-python/#variable-length-vlen-data-types>`_.

The ``VLType`` allows data to be stored where the length of the data in each array
element can vary. When ``VLType`` arrays are loaded into Iris cubes (or NumPy), they
are stored as an array of ``object`` type - essentially an array-of-arrays, rather
than a single multi-dimensional array.

The most common case where variable-length data types are encountered is when an
array of strings (not characters) is stored in a NetCDF file. As the string length
for any particular array element can vary, the values are stored as an array of
``VLType``.

As each element of a variable-length array is stored as a ``VLType`` containing
an unknown number of values, the total size of a variable-length NetCDF array
cannot be known without first loading the data. This makes it difficult for
Iris to make an informed decision on whether to load the data lazily.
The user can aid this decision using *VLType size hinting*, described below.
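The shape problem can be sketched with plain Python lists standing in for the
array-of-arrays that netCDF4 actually returns (the values here are illustrative
only, not taken from any real file):

```python
# Illustrative only: plain Python lists standing in for the array of `object`
# type that a VLEN netCDF variable yields - an "array of arrays".
ragged = [
    [1.0],
    [1.0, 2.0],
    [1.0, 2.0, 3.0],
]

# The outer length is known from the file's dimensions without reading data...
outer_size = len(ragged)

# ...but the total number of stored values is only knowable by visiting
# every element, which is why the array cannot be sized up front.
total_values = sum(len(elem) for elem in ragged)
print(outer_size, total_values)  # 3 6
```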

VLType size hinting
^^^^^^^^^^^^^^^^^^^

If the user has some *a priori* knowledge of the average length of the data in
a variable-length ``VLType`` array, this can be provided as a hint to Iris via
the ``CHUNK_CONTROL`` context manager and the special ``_vl_hint`` keyword
targeting the variable, e.g. ``CHUNK_CONTROL.set("varname", _vl_hint=5)``.
This allows Iris to make a more informed decision on whether to load the
data lazily.

For example, consider a netCDF file with an auxiliary coordinate
``experiment_version`` that is stored as a variable-length string type. By
default, Iris will attempt to guess the total array size based on the known
dimension sizes (``time=150`` in this example) and load the data lazily.
However, if it is known prior to loading the file that the strings are all no
longer than 5 characters, this information can be passed to the Iris NetCDF
loader so it can make a more informed decision on lazy loading:

.. doctest::

>>> import iris
>>> from iris.fileformats.netcdf.loader import CHUNK_CONTROL
>>>
>>> sample_file = iris.sample_data_path("vlstr_type.nc")
>>> cube = iris.load_cube(sample_file)
>>> print(cube.coord('experiment_version').has_lazy_points())
True
>>> with CHUNK_CONTROL.set("expver", _vl_hint=5):
... cube = iris.load_cube(sample_file)
>>> print(cube.coord('experiment_version').has_lazy_points())
False


Split Attributes
-----------------

3 changes: 3 additions & 0 deletions docs/src/whatsnew/latest.rst
@@ -65,6 +65,9 @@ This document explains the changes made to Iris for this release
older NetCDF formats e.g. ``NETCDF4_CLASSIC`` support a maximum precision of
32-bit. (:issue:`6178`, :pull:`6343`)

#. `@ukmo-ccbunney`_ added support for loading NetCDF variable-length string
   types and size hinting for better lazy loading (:issue:`6149`, :pull:`6340`)


💣 Incompatible Changes
=======================
1 change: 1 addition & 0 deletions lib/iris/fileformats/netcdf/_thread_safe_nc.py
@@ -20,6 +20,7 @@
# Doesn't need thread protection, but this allows all netCDF4 refs to be
# replaced with thread_safe refs.
default_fillvals = netCDF4.default_fillvals
VLType = netCDF4.VLType


class _ThreadSafeWrapper(ABC):
61 changes: 56 additions & 5 deletions lib/iris/fileformats/netcdf/loader.py
@@ -196,6 +196,11 @@ def _get_actual_dtype(cf_var):
# mostly done for speed improvement. See https://github.com/SciTools/iris/pull/5069
_LAZYVAR_MIN_BYTES = 5000

# A stab in the dark at the mean length of the "ragged dimension" for netCDF "variable
# length arrays" (`NetCDF.VLType` type). Total array size is unknown until the variable is
# read in. Making this number bigger makes it more likely an array will be loaded lazily.
_MEAN_VL_ARRAY_LEN = 10


def _get_cf_var_data(cf_var, filename):
"""Get an array representing the data of a CF variable.
@@ -215,23 +220,54 @@ def _get_cf_var_data(cf_var, filename):
# See https://github.com/SciTools/iris/issues/4994 "Xarray bridge".
result = cf_var._data_array
else:
total_bytes = cf_var.size * cf_var.dtype.itemsize
# Determine the size of the data; however, this can't be done for variable
# length (VLEN) netCDF arrays, as the size of the array can only be known by
# reading the data; see https://github.com/Unidata/netcdf-c/issues/1893.
# Note: "Variable length" netCDF types have a datatype of `nc.VLType`.
if isinstance(getattr(cf_var, "datatype", None), _thread_safe_nc.VLType):
msg = (
f"NetCDF variable `{cf_var.cf_name}` is a variable length type of kind {cf_var.dtype} "
"thus the total data size cannot be known in advance. This may affect the lazy loading "
"of the data."
)
warnings.warn(msg, category=iris.warnings.IrisLoadWarning)

# Give user the chance to pass a hint of the average variable length array size via
# the chunk control context manager. This allows for better decisions to be made on
# whether the data should be lazy-loaded or not.
mean_vl_array_len = _MEAN_VL_ARRAY_LEN
if CHUNK_CONTROL.mode is not CHUNK_CONTROL.Modes.AS_DASK:
if chunks := CHUNK_CONTROL.var_dim_chunksizes.get(cf_var.cf_name):
if vl_chunk_hint := chunks.get("_vl_hint"):
mean_vl_array_len = vl_chunk_hint

# Special handling for strings (`str` type) as these don't have an itemsize attribute;
# assume 4 bytes, which is sufficient for unicode character storage.
itemsize = 4 if cf_var.dtype is str else cf_var.dtype.itemsize

# For `VLType` cf_var.size will just return the known dimension size.
total_bytes = cf_var.size * mean_vl_array_len * itemsize
else:
# Normal NCVariable type:
total_bytes = cf_var.size * cf_var.dtype.itemsize

if total_bytes < _LAZYVAR_MIN_BYTES:
# Don't make a lazy array, as it will cost more memory AND more time to access.
# Instead fetch the data immediately, as a real array, and return that.
result = cf_var[:]

else:
# Get lazy chunked data out of a cf variable.
# Creates Dask wrappers around data arrays for any cube components which
# can have lazy values, e.g. Cube, Coord, CellMeasure, AuxiliaryVariable.
dtype = _get_actual_dtype(cf_var)

# Make a data-proxy that mimics array access and can fetch from the file.
# Note: Special handling needed for "variable length string" types which
# return a dtype of `str`, rather than a numpy type; use `S1` in this case.
fill_dtype = "S1" if cf_var.dtype is str else cf_var.dtype.str[1:]
fill_value = getattr(
cf_var.cf_data,
"_FillValue",
_thread_safe_nc.default_fillvals[cf_var.dtype.str[1:]],
_thread_safe_nc.default_fillvals[fill_dtype],
)
proxy = NetCDFDataProxy(
cf_var.shape, dtype, filename, cf_var.cf_name, fill_value
@@ -699,6 +735,10 @@ def set(
) -> Iterator[None]:
r"""Control the Dask chunk sizes applied to NetCDF variables during loading.

This function can also be used to provide a size hint for the unknown
array lengths when loading "variable-length" NetCDF data types.
See https://unidata.github.io/netcdf4-python/#netCDF4.Dataset.vltypes

Parameters
----------
var_names : str or list of str, default=None
@@ -710,7 +750,8 @@
Each key-value pair defines a chunk size for a named file
dimension, e.g. ``{'time': 10, 'model_levels':1}``.
Values of ``-1`` will lock the chunk size to the full size of that
dimension.
dimension. To specify a size hint for "variable-length" data types,
use the special name ``_vl_hint``.

Notes
-----
@@ -734,6 +775,16 @@
i.e. the setting configured by
``dask.config.set({'array.chunk-size': '250MiB'})``.

For variable-length data types, the size of the variable (or "ragged")
dimension of the individual array elements cannot be known without
reading the data. This can make it difficult for Iris to determine
whether to load the data lazily or not. If the user has some *a priori*
knowledge of the mean variable array length, this can be passed as
a size hint via the special ``_vl_hint`` name. For example, to hint
that a variable-length string array contains 4-character experiment
identifiers:
``CHUNK_CONTROL.set("expver", _vl_hint=4)``

"""
old_mode = self.mode
old_var_dim_chunksizes = deepcopy(self.var_dim_chunksizes)
@@ -15,6 +15,7 @@

from iris._lazy_data import _optimum_chunksize
import iris.fileformats.cf
from iris.fileformats.netcdf._thread_safe_nc import VLType
from iris.fileformats.netcdf.loader import CHUNK_CONTROL, _get_cf_var_data


@@ -33,7 +34,8 @@ def _make(self, chunksizes=None, shape=None, dtype="i4", **extra_properties):
cf_data.chunking = mock.MagicMock(return_value=chunksizes)
if shape is None:
shape = self.shape
dtype = np.dtype(dtype)
if dtype is not str:  # for testing VLen str arrays (dtype=<class 'str'>)
dtype = np.dtype(dtype)
cf_var = mock.MagicMock(
spec=iris.fileformats.cf.CFVariable,
dtype=dtype,
@@ -103,6 +105,51 @@ def test_arraytype__100f8_is_real(self):
var_data = _get_cf_var_data(cf_var, self.filename)
self.assertIs(var_data, mock.sentinel.real_data_accessed)

def test_vltype__1000str_is_lazy(self):
# Variable length string type
mock_vltype = mock.Mock(spec=VLType, dtype=str, name="varlen string type")
cf_var = self._make(shape=(1000,), dtype=str, datatype=mock_vltype)
var_data = _get_cf_var_data(cf_var, self.filename)
self.assertIsInstance(var_data, da.Array)

def test_vltype__100str_is_real_with_hint(self):
# Variable length string type with a hint on the array variable length size
mock_vltype = mock.Mock(spec=VLType, dtype=str, name="varlen string type")
cf_var = self._make(shape=(100,), dtype=str, datatype=mock_vltype)
with CHUNK_CONTROL.set("DUMMY_VAR", _vl_hint=1):
var_data = _get_cf_var_data(cf_var, self.filename)
self.assertIs(var_data, mock.sentinel.real_data_accessed)

def test_vltype__100str_is_real(self):
# Variable length string type
mock_vltype = mock.Mock(spec=VLType, dtype=str, name="varlen string type")
cf_var = self._make(shape=(100,), dtype=str, datatype=mock_vltype)
var_data = _get_cf_var_data(cf_var, self.filename)
self.assertIs(var_data, mock.sentinel.real_data_accessed)

def test_vltype__100str_is_lazy_with_hint(self):
# Variable length string type with a hint on the array variable length size
mock_vltype = mock.Mock(spec=VLType, dtype=str, name="varlen string type")
cf_var = self._make(shape=(100,), dtype=str, datatype=mock_vltype)
with CHUNK_CONTROL.set("DUMMY_VAR", _vl_hint=50):
var_data = _get_cf_var_data(cf_var, self.filename)
self.assertIsInstance(var_data, da.Array)

def test_vltype__1000f8_is_lazy(self):
# Variable length float64 type
mock_vltype = mock.Mock(spec=VLType, dtype="f8", name="varlen float64 type")
cf_var = self._make(shape=(1000,), dtype="f8", datatype=mock_vltype)
var_data = _get_cf_var_data(cf_var, self.filename)
self.assertIsInstance(var_data, da.Array)

def test_vltype__100f8_is_real_with_hint(self):
# Variable length float64 type with a hint on the array variable length size
mock_vltype = mock.Mock(spec=VLType, dtype="f8", name="varlen float64 type")
cf_var = self._make(shape=(100,), dtype="f8", datatype=mock_vltype)
with CHUNK_CONTROL.set("DUMMY_VAR", _vl_hint=2):
var_data = _get_cf_var_data(cf_var, self.filename)
self.assertIs(var_data, mock.sentinel.real_data_accessed)

def test_cf_data_emulation(self):
# Check that a variable emulation object passes its real data directly.
emulated_data = mock.Mock()
2 changes: 1 addition & 1 deletion requirements/locks/py311-linux-64.lock
@@ -162,7 +162,7 @@ https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.cond
https://conda.anaconda.org/conda-forge/noarch/idna-3.10-pyhd8ed1ab_1.conda#39a4f67be3286c86d696df570b1201b7
https://conda.anaconda.org/conda-forge/noarch/imagesize-1.4.1-pyhd8ed1ab_0.tar.bz2#7de5386c8fea29e76b303f37dde4c352
https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_1.conda#6837f3eff7dcea42ecd714ce1ac2b108
https://conda.anaconda.org/conda-forge/noarch/iris-sample-data-2.5.1-pyhd8ed1ab_1.conda#ae376af0a29183e98a95508ed6944664
https://conda.anaconda.org/conda-forge/noarch/iris-sample-data-2.5.2-pyhd8ed1ab_0.conda#895f6625dd8a246fece9279fcc12c1de
https://conda.anaconda.org/conda-forge/linux-64/jack-1.9.22-h7c63dc7_2.conda#f56277b7f079f1b13cbf7fb9b4f194c4
https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.4.7-py311hd18a35c_0.conda#be34c90cce87090d24da64a7c239ca96
https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-31_h59b9bed_openblas.conda#728dbebd0f7a20337218beacffd37916
2 changes: 1 addition & 1 deletion requirements/locks/py312-linux-64.lock
@@ -156,7 +156,7 @@ https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.cond
https://conda.anaconda.org/conda-forge/noarch/idna-3.10-pyhd8ed1ab_1.conda#39a4f67be3286c86d696df570b1201b7
https://conda.anaconda.org/conda-forge/noarch/imagesize-1.4.1-pyhd8ed1ab_0.tar.bz2#7de5386c8fea29e76b303f37dde4c352
https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_1.conda#6837f3eff7dcea42ecd714ce1ac2b108
https://conda.anaconda.org/conda-forge/noarch/iris-sample-data-2.5.1-pyhd8ed1ab_1.conda#ae376af0a29183e98a95508ed6944664
https://conda.anaconda.org/conda-forge/noarch/iris-sample-data-2.5.2-pyhd8ed1ab_0.conda#895f6625dd8a246fece9279fcc12c1de
https://conda.anaconda.org/conda-forge/linux-64/jack-1.9.22-h7c63dc7_2.conda#f56277b7f079f1b13cbf7fb9b4f194c4
https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.4.8-py312h84d6215_0.conda#6713467dc95509683bfa3aca08524e8a
https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-31_h59b9bed_openblas.conda#728dbebd0f7a20337218beacffd37916
2 changes: 1 addition & 1 deletion requirements/locks/py313-linux-64.lock
@@ -162,7 +162,7 @@ https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.cond
https://conda.anaconda.org/conda-forge/noarch/idna-3.10-pyhd8ed1ab_1.conda#39a4f67be3286c86d696df570b1201b7
https://conda.anaconda.org/conda-forge/noarch/imagesize-1.4.1-pyhd8ed1ab_0.tar.bz2#7de5386c8fea29e76b303f37dde4c352
https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_1.conda#6837f3eff7dcea42ecd714ce1ac2b108
https://conda.anaconda.org/conda-forge/noarch/iris-sample-data-2.5.1-pyhd8ed1ab_1.conda#ae376af0a29183e98a95508ed6944664
https://conda.anaconda.org/conda-forge/noarch/iris-sample-data-2.5.2-pyhd8ed1ab_0.conda#895f6625dd8a246fece9279fcc12c1de
https://conda.anaconda.org/conda-forge/linux-64/jack-1.9.22-h7c63dc7_2.conda#f56277b7f079f1b13cbf7fb9b4f194c4
https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.4.7-py313h33d0bda_0.conda#9862d13a5e466273d5a4738cffcb8d6c
https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-31_h59b9bed_openblas.conda#728dbebd0f7a20337218beacffd37916