From 9b06a20cf5dab6a6d4d5e5f98954f3cc91cd727d Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Thu, 21 Sep 2023 23:01:54 +0200 Subject: [PATCH] Python bindings: GetArrowStreamAsNumPy(): fix reading fixed size list arrays that were ignoring the parent offset (affects Parquet) --- autotest/ogr/ogr_parquet.py | 36 +++++++++++++++++++++++++++++++++--- swig/include/gdal_array.i | 19 ++++++++++++++----- 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/autotest/ogr/ogr_parquet.py b/autotest/ogr/ogr_parquet.py index 08b0ac8dd49f..aa96b888fa23 100755 --- a/autotest/ogr/ogr_parquet.py +++ b/autotest/ogr/ogr_parquet.py @@ -1597,15 +1597,23 @@ def test_ogr_parquet_arrow_stream_numpy(): assert batches[1]["string"][1] == b"d" assert numpy.array_equal(batch["list_boolean"][0], numpy.array([])) assert numpy.array_equal(batch["list_boolean"][1], numpy.array([False])) + + assert numpy.array_equal( + batches[0]["fixed_size_list_boolean"][0], numpy.array([True, False]) + ) + assert numpy.array_equal( + batches[0]["fixed_size_list_boolean"][1], numpy.array([False, True]) + ) assert numpy.array_equal( - batch["fixed_size_list_boolean"][0], numpy.array([True, False]) + batches[0]["fixed_size_list_boolean"][2], numpy.array([True, False]) ) assert numpy.array_equal( - batches[1]["fixed_size_list_boolean"][0], numpy.array([True, False]) + batches[1]["fixed_size_list_boolean"][0], numpy.array([False, True]) ) assert numpy.array_equal( - batches[1]["fixed_size_list_boolean"][1], numpy.array([False, True]) + batches[1]["fixed_size_list_boolean"][1], numpy.array([True, False]) ) + assert numpy.array_equal(batch["fixed_size_list_uint8"][0], numpy.array([0, 1])) assert numpy.array_equal(batch["list_uint64"][1], numpy.array([0])), batch[ "list_uint64" @@ -1625,6 +1633,28 @@ def test_ogr_parquet_arrow_stream_numpy(): batches[1]["list_string"][1], numpy.array([b"A", b"BC", b"CDE", b"DEFG"]) ) + assert numpy.array_equal(batches[0]["list_uint8"][0], numpy.array([])) + assert numpy.array_equal(batches[0]["list_uint8"][1], numpy.array([0])) + assert numpy.array_equal(batches[0]["list_uint8"][2], numpy.array([])) + assert numpy.array_equal(batches[1]["list_uint8"][0], numpy.array([0, 4, 5])) + assert numpy.array_equal(batches[1]["list_uint8"][1], numpy.array([0, 7, 8, 9])) + + assert numpy.array_equal( + batches[0]["fixed_size_list_uint8"][0], numpy.array([0, 1]) + ) + assert numpy.array_equal( + batches[0]["fixed_size_list_uint8"][1], numpy.array([2, 3]) + ) + assert numpy.array_equal( + batches[0]["fixed_size_list_uint8"][2], numpy.array([4, 5]) + ) + assert numpy.array_equal( + batches[1]["fixed_size_list_uint8"][0], numpy.array([6, 7]) + ) + assert numpy.array_equal( + batches[1]["fixed_size_list_uint8"][1], numpy.array([8, 9]) + ) + ignored_fields = ["geometry"] lyr_defn = lyr.GetLayerDefn() for i in range(lyr_defn.GetFieldCount()): diff --git a/swig/include/gdal_array.i b/swig/include/gdal_array.i index b512c0062cec..4d119acab8fd 100644 --- a/swig/include/gdal_array.i +++ b/swig/include/gdal_array.i @@ -1144,15 +1144,24 @@ PyObject* _RecordBatchAsNumpy(VoidPtrAsLong recordBatchPtr, Py_DECREF(dict); Py_RETURN_NONE; } - if( arrayField->children[0]->n_buffers != 2 ) + const struct ArrowArray* psChildArray = arrayField->children[0]; + if( psChildArray->n_buffers != 2 ) { CPLError(CE_Failure, CPLE_AppDefined, - "Field %s: arrayField->children[0]->n_buffers != 2", + "Field %s: psChildArray->n_buffers != 2", schemaField->name); Py_DECREF(dict); Py_RETURN_NONE; } const int nLength = atoi(arrowType + strlen("+w:")); + if( psChildArray->length < nLength * arrayField->length ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "Field %s: psChildArray->length < nLength * arrayField->length", + schemaField->name); + Py_DECREF(dict); + Py_RETURN_NONE; + } numpyArray = PyArray_SimpleNew(1, &dims, NPY_OBJECT); for( npy_intp j = 0; j < dims; j++ ) { @@ -1163,8 +1172,8 @@ PyObject* _RecordBatchAsNumpy(VoidPtrAsLong recordBatchPtr, subObj = PyArray_SimpleNew(1, &nvalues, NPY_BOOL); for( npy_intp k = 0; k < nvalues; k++ ) { - size_t srcOffset = static_cast(arrayField->children[0]->offset + j * nLength + k); - uint8_t val = (((uint8_t*)arrayField->children[0]->buffers[1])[srcOffset / 8] >> (srcOffset % 8)) & 1; + size_t srcOffset = static_cast(psChildArray->offset + (j + arrayField->offset) * nLength + k); + uint8_t val = (((uint8_t*)psChildArray->buffers[1])[srcOffset / 8] >> (srcOffset % 8)) & 1; *(uint8_t*)PyArray_GETPTR1((PyArrayObject *) subObj, k) = val; } } @@ -1172,7 +1181,7 @@ PyObject* _RecordBatchAsNumpy(VoidPtrAsLong recordBatchPtr, { subObj = PyArray_SimpleNewFromData( 1, &nvalues, typenum, - (char*)arrayField->children[0]->buffers[1] + static_cast(arrayField->children[0]->offset) + j * nLength * sizeOfType); + (char*)psChildArray->buffers[1] + static_cast((psChildArray->offset + (j + arrayField->offset) * nLength) * sizeOfType)); /* Keep a reference to the owner object */ #if NPY_API_VERSION >= 0x00000007 PyArray_SetBaseObject((PyArrayObject *) subObj, pointerArrayKeeper);