流失预测模型实证-Pareto/NBD模型
互联网购物基本是一种非契约型协议,顾客的购买行为均具有随机性和不可预测性,那如何在此激烈的网络市场立于不败之地,
那就应该尽可能的降低网络顾客的流失率。
目前用于预测顾客流失率的模型有:
SVM模型,Logistics模型,Pareto/NBD模型,BG/NBD模型以及引申的各类模型,通过结合分类模型评估方法,就可以检验模型的准确率,
从而进一步应用与实际用例中。
其中:Pareto/NBD模型介绍:
Pareto/NBD模型:
一:变量:
ID:用户ID
x:在指定时间段内,重复购买次数(不包括第一次购买)
Tx:最后一次购买的时长,即在指定时间段内,第一次购买到最后一次购买的周期数,保留小数
T:第一购买到最后指定时间的最后一天的周期数,保留小数
T_star: 预测时间周期长度
二:4个参数的估计:
R code:
Python Code:
import pandas as pd
from pnbd import pnbd # 自己开发的模块
consume_sources=pd.read_excel(\’test.xls\’)
cal_cbs=consume_sources.loc[:,[\’x\’,\’t_x\’,\’T_cal\’]]
fun,x=pnbd.EstimateParameters(cal_cbs,par_start = [1, 1, 1, 1], max_param_value = 1000)
print fun,x
# 0.4846256 1.4887628 0.6738844 4.4011076
# 150346.098305 [ 0.48458758 1.48866284 0.67405456 4.40356461]
两个软件运行的结果一样
三:预测用户的活跃度
params=x
consume_sources[\’P\’]=consume_sources.apply(lambda x:
pnbd.PAlive(params,x=x[\’x\’], t_x=x[\’t_x\’], T_cal=x[\’T_cal\’]),axis=1)
print consume_sources.head(20)
*** 阈值选取可以选择算法选择合理的阈值
活跃度计算指标:
样本活跃百分比=3817/22731=16.79%
Accuracy准确率=(2421+15342)/22731=78.14%
Sensitivity覆盖率=2421/3817=63.43%
Precision命中率=2421/5993=40.40%
流失率计算指标:
样本流失百分比=3817/22731=83.21%
Accuracy准确率=(2421+15342)/22731=78.14%
Sensitivity覆盖率=15342/18914=81.12%
Precision命中率=15342/16738=91.66%
四:预测未来一个月用户的购买期望次数:
consume_sources[\’E\’]=consume_sources.apply(lambda x:
pnbd.ConditionalExpectedTransactions(params,T_star=5,
x=x[\’x\’], t_x=x[\’t_x\’], T_cal=x[\’T_cal\’]),axis=1)
consume_sources.to_excel(\’test.xls\’,index=False)
Python代码:
pnbd.py
# -*-coding:utf-8-*-
\'\'\'
Pareto/NBD模型代码:
仿照R语言:BTYD
params <- c("r", "alpha", "s", "beta")
# 分组求和
pnbd.compress.cbs(cbs, 0)
# 个体客户在时刻T的活跃度模型
pnbd.PAlive(params, 10, 35, 39)
# 客户在时间T后t*时间内发生交易的交易次数的期望模型
pnbd.ConditionalExpectedTransactions(params, T.star=2, x=10, t.x=35, T.cal=39)
# 计算对数极大似然值
pnbd.LL(params, cbs[, "x"], cbs[, "t.x"], cbs[, "T.cal"])
pnbd.cbs.LL(params, cbs)
# 评估4个参数,采用最小方差法
result<-pnbd.EstimateParameters(cal.cbs, par.start = c(1, 1, 1, 1),max.param.value = 100)
\'\'\'
from math import lgamma
from scipy.special import gammaln
import math
from scipy.optimize import minimize
import numpy as np
def LL(params, x, t_x, T_cal):
x = np.matrix(x).T
t_x = np.matrix(t_x).T
T_cal = np.matrix(T_cal).T
# z:为数值
def h2f1(a, b, c, z):
lenz = len(z)
j = 0
uj =[1 for i in range(1,lenz+1)]
uj=np.matrix(uj).T
y = uj
lteps = 0
while (lteps < lenz):
lasty = y
j = j + 1
uj = np.multiply(uj , (a + j - 1))
uj = np.multiply(uj,(b + j - 1))
uj = np.multiply(uj,z)
temp=np.multiply((c + j - 1) ,j)
uj = np.multiply(uj,1/temp)
y = y + uj
lteps = sum(y == lasty)
return y
r = params[0]
alpha = params[1]
s = params[2]
beta = params[3]
maxab = max(alpha, beta)
absab = abs(alpha - beta)
param2 = s + 1
if (alpha < beta):
param2 = r + x
part1 = r * np.log(alpha) + s * np.log(beta) - lgamma(r) + gammaln(r +x)
part2 = np.multiply(-(r + x),np.log(alpha + T_cal)) - s * np.log(beta + T_cal)
if (absab == 0):
partF = np.multiply(-(r + s + x),np.log(maxab + t_x))\
+ np.log(1 - np.power(((maxab + t_x) / (maxab + T_cal)),r + s + x))
else:
F1 = h2f1(r + s + x, param2, r + s + x + 1, absab / (maxab +t_x))
F2 = h2f1(r + s + x, param2, r + s + x + 1,absab / (maxab + T_cal))
F2 = np.multiply(F2,np.power((maxab + t_x)/(maxab + T_cal), r + s + x))
partF = np.multiply(-(r + s + x), np.log(maxab + t_x)) + np.log(F1 - F2)
part3 = np.log(s) - np.log(r + s + x) + partF
return (part1 + np.log(np.exp(part2) + np.exp(part3)))
# cbs:DataFrame , params:list
def cbs_LL(params, cal_cbs):
x = cal_cbs[\'x\'].values
t_x = cal_cbs[\'t_x\'].values
T_cal = cal_cbs[\'T_cal\'].values
if \'custs\' in cal_cbs.columns:
custs = cal_cbs[\'custs\'].values
else:
custs = [1 for i in range(len(cal_cbs))]
custs = np.matrix(custs).T
ll=sum(np.multiply(custs, LL(params,x,t_x,T_cal)))
return ll[0,0]
# 个体客户在时刻T的活跃度模型
# params=[0.55, 10.56, 0.61, 11.64] PAlive(params, 10, 35, 39)
def PAlive(params, x, t_x, T_cal):
# h2f1: F(a,b,c,z)为高斯超几何函数
def h2f1(a, b, c, z):
j = 0
uj = 1
y = uj
lteps = 0
while (lteps < 1):
lasty = y
j = j + 1
uj = uj * (a + j - 1) * (b + j - 1) / (c + j - 1) * z / j
y = y + uj
if y == lasty:
lteps = lteps + 1
return y
r = params[0]
alpha = params[1]
s = params[2]
beta = params[3]
if (alpha > beta):
F1 = h2f1(r + s + x, s + 1, r + s + x + 1, (alpha - beta) / (alpha + t_x))
F2 = h2f1(r + s + x, s + 1, r + s + x + 1, (alpha - beta) / (alpha + T_cal))
# 与R语言中BTYD.pnbd.PAlive有差异
A0 = F1 / math.pow(alpha+t_x,r+x+s) - F2 / math.pow(alpha + T_cal,r + s + x)
# A0 = F1 / (math.pow(alpha+T_cal,s)*math.pow(alpha + t_x,r+x)) - \
# F2 / math.pow(alpha + T_cal,r + s + x)
elif (alpha < beta):
F1 = h2f1(r + s + x, r + x, r + s + x + 1, (beta - alpha) / (beta + t_x))
F2 = h2f1(r + s + x, r + x, r + s + x + 1, (beta - alpha) / (beta + T_cal))
A0 = F1 / math.pow(beta + t_x,r + s + x) - \
F2 / math.pow(beta + T_cal , r + s + x)
else:
return math.pow(1 + s / (r + s + x) * (math.pow((alpha+T_cal)/(alpha+t_x),r+x+s)-1),-1)
return math.pow(1 + s / (r + s + x) * math.pow(alpha + T_cal,r + x) * math.pow(beta +T_cal,s)* A0,-1)
# 估计4个参数: c("r", "alpha", "s", "beta")
# 所以参数不超过:max_param_value值
# for example:
# EstimateParameters(data_Soures, par_start = [1, 1, 1, 1], max_param_value =100)
def EstimateParameters(cal_cbs, par_start = [1, 1, 1, 1], max_param_value = 10000):
# 对数似然值
def pnbd_ell(params,cal_cbs,max_param_value):
params = np.exp(params)
params[0] = min(params[0], max_param_value)
params[1] = min(params[1], max_param_value)
params[2] = min(params[2], max_param_value)
params[3] = min(params[3], max_param_value)
return -1 * cbs_LL(params=params,cal_cbs=cal_cbs)
logparams = np.log(par_start)
results = minimize(fun=pnbd_ell,x0=logparams,
args=(cal_cbs,max_param_value),
method="L-BFGS-B")
return results[\'fun\'],np.exp(results[\'x\'])
# 单一值
# 客户在时间T后t*时间内发生交易的交易次数的期望模型
def ConditionalExpectedTransactions(params, T_star, x, t_x, T_cal):
r = params[0]
alpha = params[1]
s = params[2]
beta = params[3]
P1 = (r + x) * (beta + T_cal) / ((alpha + T_cal) * (s - 1))
# 与R语言中BTYD.pnbd.ConditionalExpectedTransactions有差异
# P2 < - (1 - ((beta + T.cal) / (beta + T.cal + T.star)) ^ (s - 1))
P2 = (1 -math.pow(((beta + T_cal) / (beta + T_cal + T_star)) , (s - 1)) )
P3 = PAlive(params, x, t_x, T_cal)
return P1 * P2 * P3