Skip to content

Commit 8677e33

Browse files
jorisvandenbossche authored and amoeba committed
GH-45175: [Python] Honor the strings_to_categorical keyword in to_pandas for string view type (#45176)
### Rationale for this change

Currently this keyword works for string or large string:

```python
>>> table = pa.table({"col": pa.array(["a", "b", "a"], pa.string())})
>>> table.to_pandas(strings_to_categorical=True).dtypes
col    category
dtype: object
>>> table = pa.table({"col": pa.array(["a", "b", "a"], pa.large_string())})
>>> table.to_pandas(strings_to_categorical=True).dtypes
col    category
dtype: object
```

but not for string view:

```python
>>> table = pa.table({"col": pa.array(["a", "b", "a"], pa.string_view())})
>>> table.to_pandas(strings_to_categorical=True).dtypes
col    object
dtype: object
```

For consistency, we should make that keyword check for string view columns as well, I think.

From https://github.com/apache/arrow/pull/44195/files#r1901831460

### Are these changes tested?

Yes

### Are there any user-facing changes?

Yes, when using the `strings_to_categorical=True` keyword and having a string_view type, this column will now be converted to a pandas Categorical.

* GitHub Issue: #45175

Authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Signed-off-by: Raúl Cumplido <raulcumplido@gmail.com>
1 parent f41f590 commit 8677e33

File tree

2 files changed

+32
-6
lines changed

2 files changed

+32
-6
lines changed

python/pyarrow/src/arrow/python/arrow_to_pandas.cc

+4-2
Original file line number | Diff line number | Diff line change
@@ -2523,7 +2523,8 @@ Status ConvertCategoricals(const PandasOptions& options, ChunkedArrayVector* arr
25232523
}
25242524
if (options.strings_to_categorical) {
25252525
for (int i = 0; i < static_cast<int>(arrays->size()); i++) {
2526-
if (is_base_binary_like((*arrays)[i]->type()->id())) {
2526+
if (is_base_binary_like((*arrays)[i]->type()->id()) ||
2527+
is_binary_view_like((*arrays)[i]->type()->id())) {
25272528
columns_to_encode.push_back(i);
25282529
}
25292530
}
@@ -2557,7 +2558,8 @@ Status ConvertChunkedArrayToPandas(const PandasOptions& options,
25572558
py_ref = nullptr;
25582559
}
25592560

2560-
if (options.strings_to_categorical && is_base_binary_like(arr->type()->id())) {
2561+
if (options.strings_to_categorical && (is_base_binary_like(arr->type()->id()) ||
2562+
is_binary_view_like(arr->type()->id()))) {
25612563
if (options.zero_copy_only) {
25622564
return Status::Invalid("Need to dictionary encode a column, but ",
25632565
"only zero-copy conversions allowed");

python/pyarrow/tests/test_pandas.py

+28-4
Original file line number | Diff line number | Diff line change
@@ -1836,10 +1836,13 @@ def test_to_pandas_categories_already_dictionary(self):
18361836
result = table.to_pandas(categories=['col'])
18371837
assert table.to_pandas().equals(result)
18381838

1839-
def test_table_str_to_categorical_without_na(self):
1839+
@pytest.mark.parametrize(
1840+
"string_type", [pa.string(), pa.large_string(), pa.string_view()]
1841+
)
1842+
def test_table_str_to_categorical_without_na(self, string_type):
18401843
values = ['a', 'a', 'b', 'b', 'c']
18411844
df = pd.DataFrame({'strings': values})
1842-
field = pa.field('strings', pa.string())
1845+
field = pa.field('strings', string_type)
18431846
schema = pa.schema([field])
18441847
table = pa.Table.from_pandas(df, schema=schema)
18451848

@@ -1851,10 +1854,22 @@ def test_table_str_to_categorical_without_na(self):
18511854
table.to_pandas(strings_to_categorical=True,
18521855
zero_copy_only=True)
18531856

1854-
def test_table_str_to_categorical_with_na(self):
1857+
# chunked array
1858+
result = table["strings"].to_pandas(strings_to_categorical=True)
1859+
expected = pd.Series(pd.Categorical(values), name="strings")
1860+
tm.assert_series_equal(result, expected)
1861+
1862+
with pytest.raises(pa.ArrowInvalid):
1863+
table["strings"].to_pandas(strings_to_categorical=True,
1864+
zero_copy_only=True)
1865+
1866+
@pytest.mark.parametrize(
1867+
"string_type", [pa.string(), pa.large_string(), pa.string_view()]
1868+
)
1869+
def test_table_str_to_categorical_with_na(self, string_type):
18551870
values = [None, 'a', 'b', np.nan]
18561871
df = pd.DataFrame({'strings': values})
1857-
field = pa.field('strings', pa.string())
1872+
field = pa.field('strings', string_type)
18581873
schema = pa.schema([field])
18591874
table = pa.Table.from_pandas(df, schema=schema)
18601875

@@ -1866,6 +1881,15 @@ def test_table_str_to_categorical_with_na(self):
18661881
table.to_pandas(strings_to_categorical=True,
18671882
zero_copy_only=True)
18681883

1884+
# chunked array
1885+
result = table["strings"].to_pandas(strings_to_categorical=True)
1886+
expected = pd.Series(pd.Categorical(values), name="strings")
1887+
tm.assert_series_equal(result, expected)
1888+
1889+
with pytest.raises(pa.ArrowInvalid):
1890+
table["strings"].to_pandas(strings_to_categorical=True,
1891+
zero_copy_only=True)
1892+
18691893
# Regression test for ARROW-2101
18701894
def test_array_of_bytes_to_strings(self):
18711895
converted = pa.array(np.array([b'x'], dtype=object), pa.string())

0 commit comments

Comments
 (0)