Skip to content

Commit

Permalink
Add clv_summary Utility Function (#164)
Browse files Browse the repository at this point in the history
* Fixed copy warning, consolidated fixtures, added CDNOW dataset

* Added type hinting and black formatting

* Added cdnow_data to pytest

* Revised clv_summary parameters

* reset index of clv_summary output

* conftest linting

* renamed cdnow fixture and added data source url

* converted cdnow_trans to csv and moved conftest
  • Loading branch information
ColtAllen authored Feb 21, 2023
1 parent 99a1c2c commit 7ebc194
Show file tree
Hide file tree
Showing 5 changed files with 7,502 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,6 @@ dmypy.json

.DS_Store
.vscode

# PyCharm .idea files
.idea/
219 changes: 219 additions & 0 deletions pymc_marketing/clv/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
from datetime import datetime
from typing import Optional, Union

import numpy as np
import pandas as pd
import xarray

__all__ = ["to_xarray", "customer_lifetime_value", "clv_summary"]


def to_xarray(customer_id, *arrays, dim: str = "customer_id"):
"""Convert vector arrays to xarray with a common dim (default "customer_id")."""
Expand Down Expand Up @@ -116,3 +119,219 @@ def customer_lifetime_value(
)

return clv


def _find_first_transactions(
transactions: pd.DataFrame,
customer_id_col: str,
datetime_col: str,
monetary_value_col: str = None,
datetime_format: str = None,
observation_period_end: Union[str, pd.Period, datetime] = None,
time_unit: str = "D",
) -> pd.DataFrame:
"""
Return dataframe with first transactions.
This takes a DataFrame of transaction data of the form:
customer_id, datetime [, monetary_value]
and appends a column named 'repeated' to the transaction log which indicates which rows
are repeated transactions for that customer_id.
Adapted from lifetimes package
https://github.com/CamDavidsonPilon/lifetimes/blob/41e394923ad72b17b5da93e88cfabab43f51abe2/lifetimes/utils.py#L148
Parameters
----------
transactions: :obj: DataFrame
A Pandas DataFrame that contains the customer_id col and the datetime col.
customer_id_col: string
Column in the transactions DataFrame that denotes the customer_id.
datetime_col: string
Column in the transactions DataFrame that denotes the datetime the purchase was made.
monetary_value_col: string, optional
Column in the transactions DataFrame that denotes the monetary value of the transaction.
Optional; only needed for spend estimation models like the Gamma-Gamma model.
observation_period_end: :obj: datetime
A string or datetime to denote the final date of the study.
Events after this date are truncated. If not given, defaults to the max 'datetime_col'.
datetime_format: string, optional
A string that represents the timestamp format. Useful if Pandas can't understand
the provided format.
time_unit: string, optional
Time granularity for study.
Default: 'D' for days. Possible values listed here:
https://numpy.org/devdocs/reference/arrays.datetime.html#datetime-units
"""

select_columns = [customer_id_col, datetime_col]

if monetary_value_col:
select_columns.append(monetary_value_col)

transactions = transactions[select_columns].sort_values(select_columns).copy()

# convert date column into a DateTimeIndex for time-wise grouping and truncating
transactions[datetime_col] = pd.to_datetime(
transactions[datetime_col], format=datetime_format
)
transactions = (
transactions.set_index(datetime_col).to_period(time_unit).to_timestamp()
)

transactions = transactions.loc[
(transactions.index <= observation_period_end)
].reset_index()

period_groupby = transactions.groupby(
[datetime_col, customer_id_col], sort=False, as_index=False
)

if monetary_value_col:
# when processing a monetary column, make sure to sum together transactions made in the same period
period_transactions = period_groupby.sum()
else:
# by calling head() on the groupby object, the datetime and customer_id columns
# will be reduced to the first transaction of that time period
period_transactions = period_groupby.head(1)

# create a new column for flagging first transactions
period_transactions = period_transactions.copy()
period_transactions.loc[:, "first"] = False
# find all first transactions and store as an index
first_transactions = (
period_transactions.groupby(customer_id_col, sort=True, as_index=False)
.head(1)
.index
)
# flag first transactions as True
period_transactions.loc[first_transactions, "first"] = True
select_columns.append("first")
# reset datetime_col to period
period_transactions.loc[:, datetime_col] = pd.Index(
period_transactions[datetime_col]
).to_period(time_unit)

return period_transactions[select_columns]


def clv_summary(
    transactions: pd.DataFrame,
    customer_id_col: str,
    datetime_col: str,
    monetary_value_col: Optional[str] = None,
    datetime_format: Optional[str] = None,
    observation_period_end: Union[str, pd.Period, datetime, None] = None,
    time_unit: str = "D",
    time_scaler: float = 1,
) -> pd.DataFrame:
    """
    Summarize transaction data for modeling.
    This transforms a DataFrame of transaction data of the form:
        customer_id, datetime [, monetary_value]
    to a DataFrame of the form:
        customer_id, frequency, recency, T [, monetary_value]
    Adapted from lifetimes package
    https://github.com/CamDavidsonPilon/lifetimes/blob/41e394923ad72b17b5da93e88cfabab43f51abe2/lifetimes/utils.py#L230
    Parameters
    ----------
    transactions: :obj: DataFrame
        A Pandas DataFrame that contains the customer_id col and the datetime col.
    customer_id_col: string
        Column in the transactions DataFrame that denotes the customer_id.
    datetime_col: string
        Column in the transactions DataFrame that denotes the datetime the purchase was made.
    monetary_value_col: string, optional
        Column in the transactions DataFrame that denotes the monetary value of the transaction.
        Optional; only needed for spend estimation models like the Gamma-Gamma model.
    observation_period_end: datetime, optional
        A string or datetime to denote the final date of the study.
        Events after this date are truncated. If not given, defaults to the max 'datetime_col'.
    datetime_format: string, optional
        A string that represents the timestamp format. Useful if Pandas can't understand
        the provided format.
    time_unit: string, optional
        Time granularity for study.
        Default: 'D' for days. Possible values listed here:
        https://numpy.org/devdocs/reference/arrays.datetime.html#datetime-units
    time_scaler: float, optional
        Default: 1. Useful for scaling recency & T to a different time granularity. Example:
        With time_unit='D' and time_scaler=1, we get recency=591 and T=632
        With time_unit='h' and time_scaler=24, we get recency=590.125 and T=631.375
        This is useful if predictions in a different time granularity are desired,
        and can also help with model convergence for study periods of many years.
    Returns
    -------
    :obj: DataFrame:
        customer_id, frequency, recency, T [, monetary_value]
    """

    # resolve the end of the study period; all timestamps are floored to time_unit
    if observation_period_end is None:
        observation_period_end = (
            pd.to_datetime(transactions[datetime_col].max(), format=datetime_format)
            .to_period(time_unit)
            .to_timestamp()
        )
    else:
        observation_period_end = (
            pd.to_datetime(observation_period_end, format=datetime_format)
            .to_period(time_unit)
            .to_timestamp()
        )

    # label repeated transactions
    repeated_transactions = _find_first_transactions(
        transactions,
        customer_id_col,
        datetime_col,
        monetary_value_col,
        datetime_format,
        observation_period_end,
        time_unit,
    )
    # reset datetime_col to timestamp
    repeated_transactions[datetime_col] = pd.Index(
        repeated_transactions[datetime_col]
    ).to_timestamp()

    # count all orders by customer
    customers = repeated_transactions.groupby(customer_id_col, sort=False)[
        datetime_col
    ].agg(["min", "max", "count"])

    # subtract 1 from count for non-repeat customers
    customers["frequency"] = customers["count"] - 1

    # T: age of the customer at the end of the study, in (scaled) time units
    customers["T"] = (
        (observation_period_end - customers["min"])
        / np.timedelta64(1, time_unit)
        / time_scaler
    )
    # recency: time between first and most recent purchase, in (scaled) time units
    customers["recency"] = (
        (customers["max"] - customers["min"])
        / np.timedelta64(1, time_unit)
        / time_scaler
    )

    summary_columns = ["frequency", "recency", "T"]

    if monetary_value_col:
        # create an index of first purchases
        first_purchases = repeated_transactions[repeated_transactions["first"]].index
        # Exclude first purchases from the mean value calculation,
        # by setting as null, then imputing with zero
        repeated_transactions.loc[first_purchases, monetary_value_col] = np.nan
        customers["monetary_value"] = (
            repeated_transactions.groupby(customer_id_col)[monetary_value_col]
            .mean()
            .fillna(0)
        )
        summary_columns.append("monetary_value")

    summary_df = customers[summary_columns].astype(float)

    return summary_df.reset_index()
Loading

0 comments on commit 7ebc194

Please sign in to comment.