Stop PPDataProxy accessing the file when no data is needed. #3659

Merged (12 commits) on Feb 19, 2020
@@ -0,0 +1,5 @@
* Fixed a problem which caused file loads to fetch *all* field data
  whenever UM files (PP or Fieldsfiles) were loaded.
  With large source files, initial file loads were slow, with large memory
  usage before any cube data was even fetched, and large enough files could
  cause a crash. The problem occurs only with Dask versions >= 2.0.
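For illustration, the Dask behaviour behind this bug can be reproduced with a minimal array-like wrapper (a sketch; the LoggingArray class below is illustrative and not part of Iris). Since Dask 2.0, dask.array.from_array immediately slices its argument with an all-empty key to capture array metadata, before any compute is requested:

import dask.array as da
import numpy as np


class LoggingArray:
    """An array-like object that records every __getitem__ key it receives."""

    def __init__(self, shape, dtype):
        self.shape = shape
        self.ndim = len(shape)
        self.dtype = np.dtype(dtype)
        self.accesses = []

    def __getitem__(self, keys):
        # In PPDataProxy, this is where the (expensive) file read happens.
        self.accesses.append(keys)
        return np.zeros(self.shape, dtype=self.dtype)[keys]


arr = LoggingArray((3, 4), np.float32)
lazy = da.from_array(arr, chunks=(3, 4))
# Expected (with Dask >= 2.0): one access like (slice(0, 0), slice(0, 0)),
# performed at wrap time, with no compute() called yet.
print(arr.accesses)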
30 changes: 18 additions & 12 deletions lib/iris/fileformats/pp.py
@@ -1,4 +1,4 @@
# (C) British Crown Copyright 2010 - 2019, Met Office
# (C) British Crown Copyright 2010 - 2020, Met Office
#
# This file is part of Iris.
#
@@ -52,7 +52,7 @@
LBPROC_MAP as lbproc_map)
import iris.fileformats.rules
import iris.coord_systems

from iris.util import _array_slice_ifempty

try:
import mo_pack
@@ -644,16 +644,22 @@ def ndim(self):
return len(self.shape)

def __getitem__(self, keys):
with open(self.path, 'rb') as pp_file:
pp_file.seek(self.offset, os.SEEK_SET)
data_bytes = pp_file.read(self.data_len)
data = _data_bytes_to_shaped_array(data_bytes,
self.lbpack,
self.boundary_packing,
self.shape, self.src_dtype,
self.mdi)
data = data.__getitem__(keys)
return np.asanyarray(data, dtype=self.dtype)
# Check for 'empty' slicings, in which case don't fetch the data.
# This is needed because, since Dask v2, 'dask.array.from_array' performs
# an empty slicing to capture array metadata, and we must not fetch the
# file data at that time.
result = _array_slice_ifempty(keys, self.shape, self.dtype)
if result is None:
with open(self.path, 'rb') as pp_file:
pp_file.seek(self.offset, os.SEEK_SET)
data_bytes = pp_file.read(self.data_len)
data = _data_bytes_to_shaped_array(data_bytes,
self.lbpack,
self.boundary_packing,
self.shape, self.src_dtype,
self.mdi)
result = data.__getitem__(keys)

return np.asanyarray(result, dtype=self.dtype)

def __repr__(self):
fmt = '<{self.__class__.__name__} shape={self.shape}' \
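With this guard in place, wrapping a proxy as a Dask array no longer touches the file. A minimal sketch of the effect (the helper name below is illustrative; the real wrapping in Iris lives in iris._lazy_data and also manages chunking):

import dask.array as da


def wrap_proxy_lazily(proxy):
    # 'proxy' is any array-like with .shape, .dtype and __getitem__, such
    # as PPDataProxy. With Dask >= 2.0, from_array slices
    # proxy[0:0, 0:0, ...] right here to capture metadata; the new guard
    # answers that slice from an empty ndarray, so no file is opened.
    return da.from_array(proxy, chunks=proxy.shape)


# File data is only read when a task genuinely needs it, for example:
#     wrap_proxy_lazily(proxy)[0:2].compute()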
117 changes: 115 additions & 2 deletions lib/iris/tests/unit/fileformats/pp/test_PPDataProxy.py
@@ -1,4 +1,4 @@
# (C) British Crown Copyright 2014 - 2019, Met Office
# (C) British Crown Copyright 2014 - 2020, Met Office
#
# This file is part of Iris.
#
@@ -19,10 +19,14 @@
from __future__ import (absolute_import, division, print_function)
from six.moves import (filter, input, map, range, zip) # noqa

import six

# Import iris.tests first so that some things can be initialised before
# importing anything else.
import iris.tests as tests

import numpy as np

from iris.fileformats.pp import PPDataProxy, SplittableInt
from iris.tests import mock

@@ -35,7 +39,7 @@ def test_lbpack_SplittableInt(self):
self.assertEqual(proxy.lbpack, lbpack)
self.assertIs(proxy.lbpack, lbpack)

def test_lnpack_raw(self):
def test_lbpack_raw(self):
lbpack = 4321
proxy = PPDataProxy(None, None, None, None,
None, lbpack, None, None)
@@ -48,5 +52,114 @@ def test_lnpack_raw(self):
self.assertEqual(proxy.lbpack.n4, lbpack // 1000 % 10)


class SliceTranslator():
"""
Class to translate an array-indexing expression into a tuple of keys.
An instance just returns the argument of its __getitem__ call.
"""
def __getitem__(self, keys):
return keys


# A multidimensional-indexable object that returns its index keys, so we can
# use multidimensional-indexing notation to specify a slicing expression.
Slices = SliceTranslator()
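# Illustrative usage (not part of the original test file): both of these hold.
#     Slices[1:3, 2] == (slice(1, 3), 2)
#     Slices[0:0] == slice(0, 0)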


class Test__getitem__slicing(tests.IrisTest):
def _check_slicing(self, test_shape, indices, result_shape,
data_was_fetched=True):
# Check behaviour of the getitem call with specific slicings.
# Especially: check cases where a fetch does *not* read from the file.
# This is necessary because, since Dask 2.0, the "from_array" function
# takes a zero-length slice of its array argument, to capture array
# metadata, and in those cases we want to avoid file access.
test_dtype = np.dtype(np.float32)
proxy = PPDataProxy(shape=test_shape, src_dtype=test_dtype,
path=None, offset=None, data_len=None,
lbpack=0, # Note: a 'real' value is needed.
boundary_packing=None, mdi=None)

# Mock out the file-open call, to see if the file would be read.
if six.PY2:
builtin_open_func_name = '__builtin__.open'
else:
builtin_open_func_name = 'builtins.open'
mock_fileopen = self.patch(builtin_open_func_name)

# Also mock out the '_data_bytes_to_shaped_array' call, to fake minimal
# operation in the cases where file-open *does* get called.
fake_data = np.zeros(test_shape, dtype=test_dtype)
self.patch('iris.fileformats.pp._data_bytes_to_shaped_array',
mock.MagicMock(return_value=fake_data))

# Test the requested indexing operation.
result = proxy.__getitem__(indices)

# Check the behaviour and results were as expected.
self.assertEqual(mock_fileopen.called, data_was_fetched)
self.assertIsInstance(result, np.ndarray)
self.assertEqual(result.dtype, test_dtype)
self.assertEqual(result.shape, result_shape)

def test_slicing_1d_normal(self):
# A 'normal' 1d testcase with no empty slices.
self._check_slicing(
test_shape=(3,),
indices=Slices[1:10],
result_shape=(2,),
data_was_fetched=True)

def test_slicing_1d_empty(self):
# A 1d testcase with an empty slicing.
self._check_slicing(
test_shape=(3,),
indices=Slices[0:0],
result_shape=(0,),
data_was_fetched=False)

def test_slicing_2d_normal(self):
# A 2d testcase with no empty slices.
self._check_slicing(
test_shape=(3, 4),
indices=Slices[2, :3],
result_shape=(3,),
data_was_fetched=True)

def test_slicing_2d_allempty(self):
# A 2d testcase with all empty slices.
self._check_slicing(
test_shape=(3, 4),
indices=Slices[0:0, 0:0],
result_shape=(0, 0),
data_was_fetched=False)

def test_slicing_2d_empty_dim0(self):
# A 2d testcase with an empty slice.
self._check_slicing(
test_shape=(3, 4),
indices=Slices[0:0],
result_shape=(0, 4),
data_was_fetched=False)

def test_slicing_2d_empty_dim1(self):
# A 2d testcase with an empty slice, and an integer index.
self._check_slicing(
test_shape=(3, 4),
indices=Slices[1, 0:0],
result_shape=(0,),
data_was_fetched=False)

def test_slicing_complex(self):
# Multiple dimensions with multiple empty slices.
self._check_slicing(
test_shape=(3, 4, 2, 5, 6, 3, 7),
indices=Slices[1:3, 2, 0:0, :, 1:1, :100],
result_shape=(2, 0, 5, 0, 3, 7),
data_was_fetched=False)


if __name__ == '__main__':
tests.main()
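The central trick in these tests is patching the builtin open, so any attempted file access is detectable. The same technique in isolation, as a minimal Python 3 sketch (the code under test is elided):

from unittest import mock

with mock.patch('builtins.open') as mock_open:
    pass  # ... run the code under test here ...

# If the code never tried to open a file, the mock was never called.
assert mock_open.called is False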
63 changes: 62 additions & 1 deletion lib/iris/util.py
@@ -1,4 +1,4 @@
# (C) British Crown Copyright 2010 - 2019, Met Office
# (C) British Crown Copyright 2010 - 2020, Met Office
#
# This file is part of Iris.
#
@@ -915,6 +915,67 @@ def __lt__(self, other):
return NotImplemented


def _array_slice_ifempty(keys, shape, dtype):
"""
Detect cases where an array slice will contain no data, as it contains a
zero-length dimension, and produce an equivalent result for those cases.
The function indicates 'empty' slicing cases, by returning an array equal
to the slice result in those cases.
Args:
* keys (indexing key, or tuple of keys):
The argument from an array __getitem__ call.
Only tuples of integers and slices are supported, in particular no
newaxis, ellipsis or array keys.
These are the types of array access usage we expect from Dask.
* shape (tuple of int):
The shape of the array being indexed.
* dtype (numpy.dtype):
The dtype of the array being indexed.
Returns:
result (np.ndarray or None):
If 'keys' contains a slice(0, 0), this is an ndarray of the correct
resulting shape and provided dtype.
Otherwise it is None.
.. note::
This is used to prevent DataProxy arraylike objects from fetching their
file data when wrapped as Dask arrays.
This is because, for Dask >= 2.0, the "dask.array.from_array" call
performs a fetch like [0:0, 0:0, ...], to 'snapshot' array metadata.
This function enables us to avoid triggering a file data fetch in those
cases : This is consistent because the result will not contain any
actual data content.
"""
# Convert a single key into a 1-tuple, so we always have a tuple of keys.
if isinstance(keys, tuple):
keys_tuple = keys
else:
keys_tuple = (keys,)

if any(key == slice(0, 0) for key in keys_tuple):
# An 'empty' slice is present: return a 'fake' array instead.
target_shape = list(shape)
for i_dim, key in enumerate(keys_tuple):
if key == slice(0, 0):
# Reduce dims with empty slicing to length 0.
target_shape[i_dim] = 0
# Create a prototype result : no memory usage, as some dims are 0.
result = np.zeros(target_shape, dtype=dtype)
# Index with original keys to produce the desired result shape.
# Note: also OK in 0-length dims, as the slice is always '0:0'.
result = result[keys]
else:
result = None

return result
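# Example behaviour (an illustrative sketch, not part of the original file):
#     _array_slice_ifempty((slice(0, 0), slice(None)), (3, 4), np.float32)
#         --> a float32 ndarray of shape (0, 4), containing no data.
#     _array_slice_ifempty((slice(1, 3),), (3, 4), np.float32)
#         --> None; the caller must perform the real data access.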


def create_temp_filename(suffix=''):
"""Return a temporary file name.