Skip to content

Commit

Permalink
DEPR: keep_date_col, nested parse_dates in read_csv (#56569)
Browse files Browse the repository at this point in the history
* DEPR: keep_date_col, nested parse_dates in read_csv

* update doc, mypy fixup
  • Loading branch information
jbrockmendel authored Dec 19, 2023
1 parent 8b8f0d0 commit 38c2877
Show file tree
Hide file tree
Showing 5 changed files with 231 additions and 58 deletions.
8 changes: 8 additions & 0 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -836,6 +836,7 @@ order) and the new column names will be the concatenation of the component
column names:

.. ipython:: python
:okwarning:
data = (
"KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
Expand All @@ -856,6 +857,7 @@ By default the parser removes the component date columns, but you can choose
to retain them via the ``keep_date_col`` keyword:

.. ipython:: python
:okwarning:
df = pd.read_csv(
"tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]], keep_date_col=True
Expand All @@ -871,6 +873,7 @@ single column.
You can also use a dict to specify custom name columns:

.. ipython:: python
:okwarning:
date_spec = {"nominal": [1, 2], "actual": [1, 3]}
df = pd.read_csv("tmp.csv", header=None, parse_dates=date_spec)
Expand All @@ -883,6 +886,7 @@ data columns:


.. ipython:: python
:okwarning:
date_spec = {"nominal": [1, 2], "actual": [1, 3]}
df = pd.read_csv(
Expand All @@ -902,6 +906,10 @@ data columns:
for your data to store datetimes in this format, load times will be
significantly faster, ~20x has been observed.

.. deprecated:: 2.2.0
Combining date columns inside :func:`read_csv` and :func:`read_table` is
deprecated. Parse the component columns as-is and combine them with
:func:`to_datetime` on the relevant result columns instead.


Date parsing functions
++++++++++++++++++++++
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,7 @@ Other Deprecations
- Deprecated strings ``H``, ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`)
- Deprecated strings ``H``, ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`)
- Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`)
- Deprecated support for combining parsed datetime columns (nested ``parse_dates``) in :func:`read_csv` and :func:`read_table`, and deprecated the ``keep_date_col`` keyword (:issue:`55569`)
- Deprecated the :attr:`.DataFrameGroupBy.grouper` and :attr:`.SeriesGroupBy.grouper`; these attributes will be removed in a future version of pandas (:issue:`56521`)
- Deprecated the :class:`.Grouping` attributes ``group_index``, ``result_index``, and ``group_arraylike``; these will be removed in a future version of pandas (:issue:`56148`)
- Deprecated the ``errors="ignore"`` option in :func:`to_datetime`, :func:`to_timedelta`, and :func:`to_numeric`; explicitly catch exceptions instead (:issue:`54467`)
Expand Down
76 changes: 66 additions & 10 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
from pandas.core.dtypes.common import (
is_file_like,
is_float,
is_hashable,
is_integer,
is_list_like,
pandas_dtype,
Expand Down Expand Up @@ -649,7 +650,7 @@ def read_csv(
skip_blank_lines: bool = ...,
parse_dates: bool | Sequence[Hashable] | None = ...,
infer_datetime_format: bool | lib.NoDefault = ...,
keep_date_col: bool = ...,
keep_date_col: bool | lib.NoDefault = ...,
date_parser: Callable | lib.NoDefault = ...,
date_format: str | dict[Hashable, str] | None = ...,
dayfirst: bool = ...,
Expand Down Expand Up @@ -709,7 +710,7 @@ def read_csv(
skip_blank_lines: bool = ...,
parse_dates: bool | Sequence[Hashable] | None = ...,
infer_datetime_format: bool | lib.NoDefault = ...,
keep_date_col: bool = ...,
keep_date_col: bool | lib.NoDefault = ...,
date_parser: Callable | lib.NoDefault = ...,
date_format: str | dict[Hashable, str] | None = ...,
dayfirst: bool = ...,
Expand Down Expand Up @@ -769,7 +770,7 @@ def read_csv(
skip_blank_lines: bool = ...,
parse_dates: bool | Sequence[Hashable] | None = ...,
infer_datetime_format: bool | lib.NoDefault = ...,
keep_date_col: bool = ...,
keep_date_col: bool | lib.NoDefault = ...,
date_parser: Callable | lib.NoDefault = ...,
date_format: str | dict[Hashable, str] | None = ...,
dayfirst: bool = ...,
Expand Down Expand Up @@ -829,7 +830,7 @@ def read_csv(
skip_blank_lines: bool = ...,
parse_dates: bool | Sequence[Hashable] | None = ...,
infer_datetime_format: bool | lib.NoDefault = ...,
keep_date_col: bool = ...,
keep_date_col: bool | lib.NoDefault = ...,
date_parser: Callable | lib.NoDefault = ...,
date_format: str | dict[Hashable, str] | None = ...,
dayfirst: bool = ...,
Expand Down Expand Up @@ -903,7 +904,7 @@ def read_csv(
# Datetime Handling
parse_dates: bool | Sequence[Hashable] | None = None,
infer_datetime_format: bool | lib.NoDefault = lib.no_default,
keep_date_col: bool = False,
keep_date_col: bool | lib.NoDefault = lib.no_default,
date_parser: Callable | lib.NoDefault = lib.no_default,
date_format: str | dict[Hashable, str] | None = None,
dayfirst: bool = False,
Expand Down Expand Up @@ -934,6 +935,38 @@ def read_csv(
storage_options: StorageOptions | None = None,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
if keep_date_col is not lib.no_default:
# GH#55569
warnings.warn(
"The 'keep_date_col' keyword in pd.read_csv is deprecated and "
"will be removed in a future version. Explicitly remove unwanted "
"columns after parsing instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
else:
keep_date_col = False

if lib.is_list_like(parse_dates):
# GH#55569
depr = False
# error: Item "bool" of "bool | Sequence[Hashable] | None" has no
# attribute "__iter__" (not iterable)
if not all(is_hashable(x) for x in parse_dates): # type: ignore[union-attr]
depr = True
elif isinstance(parse_dates, dict) and any(
lib.is_list_like(x) for x in parse_dates.values()
):
depr = True
if depr:
warnings.warn(
"Support for nested sequences for 'parse_dates' in pd.read_csv "
"is deprecated. Combine the desired columns with pd.to_datetime "
"after parsing instead.",
FutureWarning,
stacklevel=find_stack_level(),
)

if infer_datetime_format is not lib.no_default:
warnings.warn(
"The argument 'infer_datetime_format' is deprecated and will "
Expand Down Expand Up @@ -1004,7 +1037,7 @@ def read_table(
skip_blank_lines: bool = ...,
parse_dates: bool | Sequence[Hashable] = ...,
infer_datetime_format: bool | lib.NoDefault = ...,
keep_date_col: bool = ...,
keep_date_col: bool | lib.NoDefault = ...,
date_parser: Callable | lib.NoDefault = ...,
date_format: str | dict[Hashable, str] | None = ...,
dayfirst: bool = ...,
Expand Down Expand Up @@ -1061,7 +1094,7 @@ def read_table(
skip_blank_lines: bool = ...,
parse_dates: bool | Sequence[Hashable] = ...,
infer_datetime_format: bool | lib.NoDefault = ...,
keep_date_col: bool = ...,
keep_date_col: bool | lib.NoDefault = ...,
date_parser: Callable | lib.NoDefault = ...,
date_format: str | dict[Hashable, str] | None = ...,
dayfirst: bool = ...,
Expand Down Expand Up @@ -1118,7 +1151,7 @@ def read_table(
skip_blank_lines: bool = ...,
parse_dates: bool | Sequence[Hashable] = ...,
infer_datetime_format: bool | lib.NoDefault = ...,
keep_date_col: bool = ...,
keep_date_col: bool | lib.NoDefault = ...,
date_parser: Callable | lib.NoDefault = ...,
date_format: str | dict[Hashable, str] | None = ...,
dayfirst: bool = ...,
Expand Down Expand Up @@ -1175,7 +1208,7 @@ def read_table(
skip_blank_lines: bool = ...,
parse_dates: bool | Sequence[Hashable] = ...,
infer_datetime_format: bool | lib.NoDefault = ...,
keep_date_col: bool = ...,
keep_date_col: bool | lib.NoDefault = ...,
date_parser: Callable | lib.NoDefault = ...,
date_format: str | dict[Hashable, str] | None = ...,
dayfirst: bool = ...,
Expand Down Expand Up @@ -1248,7 +1281,7 @@ def read_table(
# Datetime Handling
parse_dates: bool | Sequence[Hashable] = False,
infer_datetime_format: bool | lib.NoDefault = lib.no_default,
keep_date_col: bool = False,
keep_date_col: bool | lib.NoDefault = lib.no_default,
date_parser: Callable | lib.NoDefault = lib.no_default,
date_format: str | dict[Hashable, str] | None = None,
dayfirst: bool = False,
Expand Down Expand Up @@ -1279,6 +1312,29 @@ def read_table(
storage_options: StorageOptions | None = None,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
if keep_date_col is not lib.no_default:
# GH#55569
warnings.warn(
"The 'keep_date_col' keyword in pd.read_table is deprecated and "
"will be removed in a future version. Explicitly remove unwanted "
"columns after parsing instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
else:
keep_date_col = False

# error: Item "bool" of "bool | Sequence[Hashable]" has no attribute "__iter__"
if lib.is_list_like(parse_dates) and not all(is_hashable(x) for x in parse_dates): # type: ignore[union-attr]
# GH#55569
warnings.warn(
"Support for nested sequences for 'parse_dates' in pd.read_table "
"is deprecated. Combine the desired columns with pd.to_datetime "
"after parsing instead.",
FutureWarning,
stacklevel=find_stack_level(),
)

if infer_datetime_format is not lib.no_default:
warnings.warn(
"The argument 'infer_datetime_format' is deprecated and will "
Expand Down
Loading

0 comments on commit 38c2877

Please sign in to comment.