Tutorial 5: Uplift modeling
The official LightAutoML GitHub repository is here
[ ]:
# Auto-reload imported modules before executing each cell, so local edits to
# the LightAutoML sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
Install LightAutoML
Uncomment the cell below if the repository was not cloned via git (e.g., on Colab or Kaggle).
[ ]:
#! pip install -U lightautoml
Import necessary libraries
[ ]:
# Standard python libraries
from copy import deepcopy
import os
import requests
# Installed libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch
# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task
from lightautoml.addons.uplift.base import AutoUplift, BaseLearnerWrapper, MetaLearnerWrapper
from lightautoml.addons.uplift import metalearners
from lightautoml.addons.uplift.metrics import (_available_uplift_modes,
TUpliftMetric,
calculate_graphic_uplift_curve,
calculate_min_max_uplift_auc,
calculate_uplift_at_top,
calculate_uplift_auc,
perfect_uplift_curve)
from lightautoml.addons.uplift.utils import create_linear_automl
from lightautoml.report.report_deco import ReportDecoUplift
%matplotlib inline
Parameters
Setting
[ ]:
N_THREADS = 8  # number of CPU threads available to LightGBM and linear models
N_FOLDS = 5  # number of cross-validation folds inside AutoML
RANDOM_STATE = 42  # fixed seed for reproducibility
TEST_SIZE = 0.2  # holdout share used for metric checks
TIMEOUT = 300  # time budget (seconds) for an AutoML run
TARGET_NAME = 'TARGET'  # name of the target column
TREATMENT_NAME = 'CODE_GENDER'  # column used as the (synthetic) treatment flag
Fix torch number of threads and numpy seed
[ ]:
np.random.seed(RANDOM_STATE)  # seed numpy's global RNG for reproducibility
torch.set_num_threads(N_THREADS)  # cap the number of CPU threads torch uses
Example data load
Download the dataset from the repository if it was not cloned via git.
[ ]:
DATASET_DIR = '../data/'
DATASET_NAME = 'sampled_app_train.csv'
DATASET_FULLNAME = os.path.join(DATASET_DIR, DATASET_NAME)
# Raw CSV hosted in the LightAutoML repository; used as a fallback download
# when the repository was not cloned via git.
DATASET_URL = 'https://raw.githubusercontent.com/AILab-MLTools/LightAutoML/master/example_data/test_data_files/sampled_app_train.csv'
[ ]:
%%time
if not os.path.exists(DATASET_FULLNAME):
os.makedirs(DATASET_DIR, exist_ok=True)
dataset = requests.get(DATASET_URL).text
with open(DATASET_FULLNAME, 'w') as output:
output.write(dataset)
[ ]:
%%time

# Load the sampled application_train data; show the first rows as a sanity check.
data = pd.read_csv(DATASET_FULLNAME)
data.head()
(Optional) Some user feature preparation
[ ]:
%%time
data['BIRTH_DATE'] = (np.datetime64('2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
data['EMP_DATE'] = (np.datetime64('2018-01-01') + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
).astype(str)
data['report_dt'] = np.datetime64('2018-01-01')
data['constant'] = 1
data['allnan'] = np.nan
data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)
data['CODE_GENDER'] = (data['CODE_GENDER'] == 'M').astype(int)
Data splitting for train-test
[ ]:
%%time
stratify_value = data[TARGET_NAME] + 10 * data[TREATMENT_NAME]
train, test = train_test_split(data, test_size=3000, stratify=stratify_value, random_state=42)
test_target, test_treatment = test[TARGET_NAME].values.ravel(), test[TREATMENT_NAME].values.ravel()
Setup columns roles
[ ]:
%%time

# Column roles for LightAutoML: the target column, the treatment flag, and
# report_dt as the base date for datetime feature generation.
roles = {
    'target': TARGET_NAME,
    'treatment': TREATMENT_NAME,
    DatetimeRole(base_date=True, seasonality=(), base_feats=False): 'report_dt'
}
AutoUplift (use predefined uplift methods)
Fit autouplift
[ ]:
%%time

task = Task('binary')

# Search over the predefined pool of uplift metalearners.
autouplift = AutoUplift(task,
                        metric='adj_qini',  # uplift metric used to rank candidates
                        has_report=True,    # generate an HTML report for the best metalearner
                        test_size=0.2,      # internal holdout share for comparing candidates
                        timeout=200,        # overall time budget in seconds
                        # timeout_metalearner=5
                        )

autouplift.fit(train, roles, verbose=1)
Show rating of uplift methods (meta-learners)
[ ]:
%%time

# Leaderboard of all tried metalearners, ranked by the chosen uplift metric.
rating_table = autouplift.get_metalearners_rating()
rating_table
Get best metalearner
[ ]:
%%time
best_metalearner = autouplift.create_best_metalearner(
update_metalearner_params={'timeout': None},
update_baselearner_params={'timeout': 30}
)
best_metalearner.fit(train, roles)
_ = best_metalearner.predict(test);
Predict to test data and check metrics
[ ]:
%%time
uplift_pred, treatment_pred, control_pred = best_metalearner.predict(test)
uplift_pred = uplift_pred.ravel()
roc_auc_treatment = roc_auc_score(test_target[test_treatment == 1], treatment_pred[test_treatment == 1])
roc_auc_control = roc_auc_score(test_target[test_treatment == 0], control_pred[test_treatment == 0])
uplift_auc_algo = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=False)
uplift_auc_algo_normed = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=True)
auc_base, auc_perfect = calculate_min_max_uplift_auc(test_target, test_treatment)
print('--- Check scores ---')
print('OOF scores "ROC_AUC":')
print('\tTreatment = {:.5f}'.format(roc_auc_treatment))
print('\tControl = {:.5f}'.format(roc_auc_control))
print('Uplift score of test group (default="adj_qini"):')
print('\tBaseline = {:.5f}'.format(auc_base))
print('\tAlgo (Normed) = {:.5f} ({:.5f})'.format(uplift_auc_algo, uplift_auc_algo_normed))
print('\tPerfect = {:.5f}'.format(auc_perfect))
AutoUplift (custom uplift methods)
Fit autouplift
[ ]:
%%time
# Set uplift candidate for choosing best of them
# !!!ATTENTION!!!
# This is a demonstration of the possibilities,
# You may use default set of candidates
task = Task('binary')
uplift_candidates = [
MetaLearnerWrapper(
name='TLearner__Default',
klass=metalearners.TLearner,
params={'base_task': task}
),
MetaLearnerWrapper(
name='TLearner__Custom',
klass=metalearners.TLearner,
params={
'treatment_learner': BaseLearnerWrapper(
name='__TabularAutoML__',
klass=TabularAutoML,
params={'task': task, 'timeout': 10}),
'control_learner': BaseLearnerWrapper(
name='__Linear__',
klass=create_linear_automl,
params={'task': Task('binary')})
}
),
MetaLearnerWrapper(
name='XLearner__Custom',
klass=metalearners.XLearner,
params={
'outcome_learners': [
TabularAutoML(task=task, timeout=10), # [sec] , Only speed up example, don't change it!
create_linear_automl(task=Task('binary'))
],
'effect_learners': [BaseLearnerWrapper(
name='__TabularAutoML__',
klass=TabularAutoML,
params={'task': Task('reg'), 'timeout': 5})],
'propensity_learner': create_linear_automl(task=Task('binary')),
}
)
]
autouplift = AutoUplift(task,
uplift_candidates=uplift_candidates,
metric='adj_qini',
test_size=0.2,
threshold_imbalance_treatment=0.0, # Doesn't affect, see warnings
timeout=600) # Doesn't affect, see warnings
autouplift.fit(train, roles, verbose=1)
Show rating of uplift methods (meta-learners)
[ ]:
%%time

# Leaderboard of the custom candidates, ranked by the chosen uplift metric.
rating_table = autouplift.get_metalearners_rating()
rating_table
Predict to test data and check metrics
[ ]:
%%time
uplift_pred, treatment_pred, control_pred = autouplift.predict(test)
uplift_pred = uplift_pred.ravel()
roc_auc_treatment = roc_auc_score(test_target[test_treatment == 1], treatment_pred[test_treatment == 1])
roc_auc_control = roc_auc_score(test_target[test_treatment == 0], control_pred[test_treatment == 0])
uplift_auc_algo = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=False)
uplift_auc_algo_normed = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=True)
auc_base, auc_perfect = calculate_min_max_uplift_auc(test_target, test_treatment)
print('--- Check scores ---')
print('OOF scores "ROC_AUC":')
print('\tTreatment = {:.5f}'.format(roc_auc_treatment))
print('\tControl = {:.5f}'.format(roc_auc_control))
print('Uplift score of test group (default="adj_qini"):')
print('\tBaseline = {:.5f}'.format(auc_base))
print('\tAlgo (Normed) = {:.5f} ({:.5f})'.format(uplift_auc_algo, uplift_auc_algo_normed))
print('\tPerfect = {:.5f}'.format(auc_perfect))
AutoUplift with custom metric
Fit autouplift
[ ]:
%%time
# Using a custom metric
# How to determine custom metric, see below
task = Task('binary')
class CustomUpliftMetric(TUpliftMetric):
def __call__(self, target: np.ndarray, uplift_pred: np.ndarray, treatment: np.ndarray) -> float:
up_10 = calculate_uplift_at_top(target, uplift_pred, treatment, 10)
up_20 = calculate_uplift_at_top(target, uplift_pred, treatment, 20)
return 0.5 * (up_10 + up_20)
autouplift = AutoUplift(task,
add_dd_candidates=True,
metric=CustomUpliftMetric(),
test_size=0.2,
threshold_imbalance_treatment=0.0,
cpu_limit=10,
timeout=100)
autouplift.fit(train, roles)
Show rating of uplift methods (meta-learners)
[ ]:
%%time
rating_table = autouplift.get_metalearners_ranting()
rating_table
MetaLearner
TLearner
Fit on train data
[ ]:
%%time

# TLearner with default settings: two independent models, one fitted on the
# treatment group and one on the control group.
tlearner = metalearners.TLearner(base_task=Task('binary'), cpu_limit=5)
tlearner.fit(train, roles)
Predict to test data and check metrics
[ ]:
%%time
uplift_pred, treatment_pred, control_pred = tlearner.predict(test)
uplift_pred = uplift_pred.ravel()
roc_auc_treatment = roc_auc_score(test_target[test_treatment == 1], treatment_pred[test_treatment == 1])
roc_auc_control = roc_auc_score(test_target[test_treatment == 0], control_pred[test_treatment == 0])
uplift_auc_algo = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=False)
uplift_auc_algo_normed = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=True)
auc_base, auc_perfect = calculate_min_max_uplift_auc(test_target, test_treatment)
print('--- Check scores ---')
print('OOF scores "ROC_AUC":')
print('\tTreatment = {:.5f}'.format(roc_auc_treatment))
print('\tControl = {:.5f}'.format(roc_auc_control))
print('Uplift score of test group (default="adj_qini"):')
print('\tBaseline = {:.5f}'.format(auc_base))
print('\tAlgo (Normed) = {:.5f} ({:.5f})'.format(uplift_auc_algo, uplift_auc_algo_normed))
print('\tPerfect = {:.5f}'.format(auc_perfect))
XLearner
Fit on train data
[ ]:
%%time
# Custom base algorithm
xlearner = metalearners.XLearner(
propensity_learner=TabularAutoML(task=Task('binary'), timeout=10),
outcome_learners=[
TabularAutoML(task=Task('binary'), timeout=10),
TabularAutoML(task=Task('binary'), timeout=10)
],
effect_learners=[
TabularAutoML(task=Task('reg'), timeout=10),
TabularAutoML(task=Task('reg'), timeout=10)
]
)
xlearner.fit(train, roles)
Predict to test data and check metrics
[ ]:
%%time
uplift_pred, treatment_pred, control_pred = xlearner.predict(test)
uplift_pred = uplift_pred.ravel()
roc_auc_treatment = roc_auc_score(test_target[test_treatment == 1], treatment_pred[test_treatment == 1])
roc_auc_control = roc_auc_score(test_target[test_treatment == 0], control_pred[test_treatment == 0])
uplift_auc_algo = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=False)
uplift_auc_algo_normed = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=True)
auc_base, auc_perfect = calculate_min_max_uplift_auc(test_target, test_treatment)
print('--- Check scores ---')
print('OOF scores "ROC_AUC":')
print('\tTreatment = {:.5f}'.format(roc_auc_treatment))
print('\tControl = {:.5f}'.format(roc_auc_control))
print('Uplift score of test group (default="adj_qini"):')
print('\tBaseline = {:.5f}'.format(auc_base))
print('\tAlgo (Normed) = {:.5f} ({:.5f})'.format(uplift_auc_algo, uplift_auc_algo_normed))
print('\tPerfect = {:.5f}'.format(auc_perfect))
Uplift metrics and graphics (using xlearner predictions)
[ ]:
%%time

UPLIFT_METRIC = 'adj_qini'  # metric mode used for the curves below

# _available_uplift_modes lists every metric mode supported by the addon.
print("All available uplift metrics: {}".format(_available_uplift_modes))
Algorithm uplift curve
[ ]:
%%time

# Cumulative uplift curve of the fitted XLearner on the test set.
xs_xlearner, ys_xlearner = calculate_graphic_uplift_curve(
    test_target, uplift_pred, test_treatment, mode=UPLIFT_METRIC
)
Baseline, perfect curve
[ ]:
# Baseline curve: straight line from the origin to the algorithm's final point.
xs_base, ys_base = xs_xlearner, xs_xlearner * ys_xlearner[-1]

# Perfect curve: ordering produced by an oracle that knows the true effect.
perfect_uplift = perfect_uplift_curve(test_target, test_treatment)
xs_perfect, ys_perfect = calculate_graphic_uplift_curve(
    test_target, perfect_uplift, test_treatment, mode=UPLIFT_METRIC)
[ ]:
# Plot baseline, algorithm, and perfect uplift curves on one figure.
plt.figure(figsize=(10, 7))

plt.plot(xs_base, ys_base, 'black')
plt.plot(xs_xlearner, ys_xlearner, 'red')
plt.plot(xs_perfect, ys_perfect, 'green')
# Shade the gain of the algorithm over the baseline.
plt.fill_between(xs_xlearner, ys_base, ys_xlearner, alpha=0.5, color='orange')

plt.xlabel('Cumulative percentage of people in T/C groups')
# Fix: the original mixed a %-style placeholder ('%s') with str.format, so the
# ylabel rendered the literal text '(%s)'; use a '{}' placeholder instead.
plt.ylabel('Uplift metric ({})'.format(UPLIFT_METRIC))
plt.grid()
plt.legend(['Baseline', 'XLearner', 'Perfect']);
Uplift TOP-K
[ ]:
# Uplift@k across the full range of top-percent thresholds (5%, 10%, ..., 100%).
tops = np.arange(5, 101, 5)
uplift_at_tops = [
    calculate_uplift_at_top(test_target, uplift_pred, test_treatment, top=top)
    for top in tops
]

plt.figure(figsize=(10, 7))
plt.plot(tops, uplift_at_tops, marker='.')
plt.legend(['Uplift_At_K'])
plt.xticks(np.arange(0, 101, 10))
plt.grid()
Custom metric
[ ]:
# A custom metric can be used in AutoUplift.
# The metric must have the following signature:
#     def custom_metric(target, uplift_pred, treatment) -> float:
# NOTE(review): this duplicates the CustomUpliftMetric defined in the
# AutoUplift section above; this later definition shadows the earlier one.
class CustomUpliftMetric(TUpliftMetric):
    """Average of uplift-at-top-10% and uplift-at-top-20%."""

    def __call__(self, target: np.ndarray, uplift_pred: np.ndarray, treatment: np.ndarray) -> float:
        up_10 = calculate_uplift_at_top(target, uplift_pred, treatment, 10)
        up_20 = calculate_uplift_at_top(target, uplift_pred, treatment, 20)
        return 0.5 * (up_10 + up_20)


metric = CustomUpliftMetric()
metric_value = metric(test_target, uplift_pred, test_treatment)

print("Metric = {}".format(metric_value))
Report
[ ]:
%%time

# Decorate a metalearner so fit/predict additionally produce an interactive
# HTML report.
RDU = ReportDecoUplift()
tlearner_deco = RDU(metalearners.TLearner(base_task=Task('binary')))
tlearner_deco.fit(train, roles)
_ = tlearner_deco.predict(test)
# Path to report: PATH_TO_CURRENT_NOTEBOOK/lama_report/lama_interactive_report.html