Skip to content

Commit

Permalink
Python bindings: GetArrowStreamAsNumPy(): fix reading fixed size list…
Browse files Browse the repository at this point in the history
… arrays that were ignoring the parent offset (affects Parquet)
  • Loading branch information
rouault committed Sep 22, 2023
1 parent 1369bbb commit 9b06a20
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 8 deletions.
36 changes: 33 additions & 3 deletions autotest/ogr/ogr_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -1597,15 +1597,23 @@ def test_ogr_parquet_arrow_stream_numpy():
assert batches[1]["string"][1] == b"d"
assert numpy.array_equal(batch["list_boolean"][0], numpy.array([]))
assert numpy.array_equal(batch["list_boolean"][1], numpy.array([False]))

assert numpy.array_equal(
batches[0]["fixed_size_list_boolean"][0], numpy.array([True, False])
)
assert numpy.array_equal(
batches[0]["fixed_size_list_boolean"][1], numpy.array([False, True])
)
assert numpy.array_equal(
batch["fixed_size_list_boolean"][0], numpy.array([True, False])
batches[0]["fixed_size_list_boolean"][2], numpy.array([True, False])
)
assert numpy.array_equal(
batches[1]["fixed_size_list_boolean"][0], numpy.array([True, False])
batches[1]["fixed_size_list_boolean"][0], numpy.array([False, True])
)
assert numpy.array_equal(
batches[1]["fixed_size_list_boolean"][1], numpy.array([False, True])
batches[1]["fixed_size_list_boolean"][1], numpy.array([True, False])
)

assert numpy.array_equal(batch["fixed_size_list_uint8"][0], numpy.array([0, 1]))
assert numpy.array_equal(batch["list_uint64"][1], numpy.array([0])), batch[
"list_uint64"
Expand All @@ -1625,6 +1633,28 @@ def test_ogr_parquet_arrow_stream_numpy():
batches[1]["list_string"][1], numpy.array([b"A", b"BC", b"CDE", b"DEFG"])
)

assert numpy.array_equal(batches[0]["list_uint8"][0], numpy.array([]))
assert numpy.array_equal(batches[0]["list_uint8"][1], numpy.array([0]))
assert numpy.array_equal(batches[0]["list_uint8"][2], numpy.array([]))
assert numpy.array_equal(batches[1]["list_uint8"][0], numpy.array([0, 4, 5]))
assert numpy.array_equal(batches[1]["list_uint8"][1], numpy.array([0, 7, 8, 9]))

assert numpy.array_equal(
batches[0]["fixed_size_list_uint8"][0], numpy.array([0, 1])
)
assert numpy.array_equal(
batches[0]["fixed_size_list_uint8"][1], numpy.array([2, 3])
)
assert numpy.array_equal(
batches[0]["fixed_size_list_uint8"][2], numpy.array([4, 5])
)
assert numpy.array_equal(
batches[1]["fixed_size_list_uint8"][0], numpy.array([6, 7])
)
assert numpy.array_equal(
batches[1]["fixed_size_list_uint8"][1], numpy.array([8, 9])
)

ignored_fields = ["geometry"]
lyr_defn = lyr.GetLayerDefn()
for i in range(lyr_defn.GetFieldCount()):
Expand Down
19 changes: 14 additions & 5 deletions swig/include/gdal_array.i
Original file line number Diff line number Diff line change
Expand Up @@ -1144,15 +1144,24 @@ PyObject* _RecordBatchAsNumpy(VoidPtrAsLong recordBatchPtr,
Py_DECREF(dict);
Py_RETURN_NONE;
}
if( arrayField->children[0]->n_buffers != 2 )
const struct ArrowArray* psChildArray = arrayField->children[0];
if( psChildArray->n_buffers != 2 )
{
CPLError(CE_Failure, CPLE_AppDefined,
"Field %s: arrayField->children[0]->n_buffers != 2",
"Field %s: psChildArray->n_buffers != 2",
schemaField->name);
Py_DECREF(dict);
Py_RETURN_NONE;
}
const int nLength = atoi(arrowType + strlen("+w:"));
if( psChildArray->length < nLength * arrayField->length )
{
CPLError(CE_Failure, CPLE_AppDefined,
"Field %s: psChildArray->length < nLength * arrayField->length",
schemaField->name);
Py_DECREF(dict);
Py_RETURN_NONE;
}
numpyArray = PyArray_SimpleNew(1, &dims, NPY_OBJECT);
for( npy_intp j = 0; j < dims; j++ )
{
Expand All @@ -1163,16 +1172,16 @@ PyObject* _RecordBatchAsNumpy(VoidPtrAsLong recordBatchPtr,
subObj = PyArray_SimpleNew(1, &nvalues, NPY_BOOL);
for( npy_intp k = 0; k < nvalues; k++ )
{
size_t srcOffset = static_cast<size_t>(arrayField->children[0]->offset + j * nLength + k);
uint8_t val = (((uint8_t*)arrayField->children[0]->buffers[1])[srcOffset / 8] >> (srcOffset % 8)) & 1;
size_t srcOffset = static_cast<size_t>(psChildArray->offset + (j + arrayField->offset) * nLength + k);
uint8_t val = (((uint8_t*)psChildArray->buffers[1])[srcOffset / 8] >> (srcOffset % 8)) & 1;
*(uint8_t*)PyArray_GETPTR1((PyArrayObject *) subObj, k) = val;
}
}
else
{
subObj = PyArray_SimpleNewFromData(
1, &nvalues, typenum,
(char*)arrayField->children[0]->buffers[1] + static_cast<size_t>(arrayField->children[0]->offset) + j * nLength * sizeOfType);
(char*)psChildArray->buffers[1] + static_cast<size_t>((psChildArray->offset + (j + arrayField->offset) * nLength) * sizeOfType));
/* Keep a reference to the owner object */
#if NPY_API_VERSION >= 0x00000007
PyArray_SetBaseObject((PyArrayObject *) subObj, pointerArrayKeeper);
Expand Down

0 comments on commit 9b06a20

Please sign in to comment.