机器学习-中国移动梧桐杯- 智慧金融赛道TOP方案
“梧桐杯”中国移动大数据应用创新大赛 - 智慧金融赛道赛事链接背景介绍任务描述数据集描述算法部分数据预处理建模预测赛事链接赛事链接背景介绍在金融领域,活跃着一批“职业羊毛党”,他们通过套现、套利行为大肆牟利,影响普通用户本应享有的权益。他们制作各种自动、半自动的黑产工具,如自动注册机、刷单自动机、短信代接平台、分身软件、猫池等,并捆绑手机卡、银行卡或通过第三方平台交易完成套现,从而实现“薅羊毛”活动
·
赛事链接
背景介绍
在金融领域,活跃着一批“职业羊毛党”,他们通过套现、套利行为大肆牟利,影响普通用户本应享有的权益。他们制作各种自动、半自动的黑产工具,如自动注册机、刷单自动机、短信代接平台、分身软件、猫池等,并捆绑手机卡、银行卡或通过第三方平台交易完成套现,从而实现“薅羊毛”活动,自身获利的同时对商家、银行、平台、运营商的利益造成损失。如何从普通用户中有效鉴别出羊毛党,从而提前进行防范,在实际商业应用中有着重要的意义。本届竞赛将从真实场景和实际应用出发,在智慧金融领域新增了更具挑战性、更加务实的任务,期待参赛选手们能在这些任务上相互切磋、共同进步。
任务描述
防羊毛党评分模型旨在从普通用户中区分出羊毛党用户号码,本次挑战赛设置了更具挑战性的任务,相比其他竞赛,本次竞赛所提供的训练字段相对较少,总体为低资源的竞赛任务,具体包含:
初赛——利用用户通信、流量、app使用等行为数据识别真实羊毛党群体。正样本比例不超过5%。
数据集描述
初赛均提供下载数据,选手在本地进行算法调试,在比赛页面提交结果。其中每个号码样本有2个月份数据,由号码和月份组成共同主键。所有号码均经过加密处理。
算法部分
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,roc_auc_score
import warnings
warnings.filterwarnings('ignore')
# Load A/B-round feature data, training labels, and the phone-ID lists to
# be scored. The raw CSVs encode missing values as the literal string '\N'.
data_a = pd.read_csv('data/data_a.csv',na_values=r'\N')
data_b = pd.read_csv('B榜给选手的数据/data_b.csv',na_values=r'\N')
label = pd.read_csv('data/train_label.csv')
predict_a = pd.read_csv('data/to_pred_a.csv')
predict_b = pd.read_csv('B榜给选手的数据/to_pred_b.csv')
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent and produces the same frame.
data = pd.concat([data_a, data_b]).reset_index(drop=True)
数据预处理
# ---- Preprocessing ----
# Impute the three categorical flags: missing if_family / if_group -> 0,
# missing sms_inpkg_ind -> 1.
data.loc[data['if_family'].isna() == 1,'if_family'] = 0
data.loc[data['if_group'].isna() == 1,'if_group'] = 0
data.loc[data['sms_inpkg_ind'].isna() == 1,'sms_inpkg_ind'] = 1
# Columns with >50% missing values are reduced to a presence indicator
# (1 = observed, 0 = missing); reported to add ~0.0015 to the score.
# NOTE: statement order matters — non-missing rows are overwritten with 1
# first, then the rows that are still NaN are set to 0.
for f in ['call_cnt','up_flux','down_flux']:
    data.loc[data[f].isna() == 0,f] = 1
    data.loc[data[f].isna() == 1,f] = 0
# Binary ">0" indicator for every numeric feature (NaN > 0 is False -> 0).
for f in data.columns:
    if f not in ['if_family','if_group','sms_inpkg_ind','month','phone']:
        data[f'{f}>0'] = data[f].apply( lambda x : 1 if x > 0 else 0)
# Log-smooth heavy-tailed fee features.
data.loc[data['chrg_cnt'] > 2,'chrg_cnt'] = 3 # cap chrg_cnt outliers at 3
data['monfix_fee'] = np.log(data['monfix_fee'].values+1)
data['gift_acct_amt'] = np.log(data['gift_acct_amt'].values+1)
data['gprs_fee'] = np.log(data['gprs_fee'].values+1)
# Per-phone month offset: 1 = earlier month, 0 = latest month.
data['month_max'] = data.groupby(by=['phone'])['month'].transform('max')
data['month_diff'] = data['month_max'] - data['month']
cat_feats = ['if_family','if_group','sms_inpkg_ind']
data[cat_feats] = data[cat_feats].astype('int')
# Split into the two monthly snapshots each phone has.
data1 = data.loc[data['month_diff'] == 1]
data2 = data.loc[data['month_diff'] == 0]
# Reshape to one row per phone by joining the two months side by side
# (merge suffixes: _x = earlier month, _y = latest month).
data_reshape = data1.merge(data2,on='phone',how='left')
# Drop the bookkeeping month columns — constant/no value after the join.
data_reshape.drop(labels=['month_x','month_max_x','month_diff_x','month_y','month_max_y','month_diff_y'],axis=1,inplace=True)
# Attach labels / prediction ID lists to the reshaped features.
train = label.merge(data_reshape,on='phone',how='left')
test_a= predict_a.merge(data_reshape,on='phone',how='left')
test_b= predict_b.merge(data_reshape,on='phone',how='left')
## 自定义评价函数F1—score
def F1_score(preds, dtrain):
    """LightGBM custom eval: binary F1 at a fixed 0.5 threshold.

    Returns the (name, value, is_higher_better) triple that
    ``lgb.train``'s ``feval`` argument expects.
    """
    y_true = dtrain.get_label()
    y_pred = (preds > 0.5).astype(int)
    return 'f1_score', f1_score(y_true, y_pred, average='binary'), True
建模预测
def lgb_model(train,test,k,seed):
    """Train a k-fold LightGBM classifier and return per-fold test predictions.

    Parameters
    ----------
    train : DataFrame with 'phone', 'label' and feature columns.
    test : DataFrame containing the same feature columns.
    k : number of StratifiedKFold splits.
    seed : random_state for fold shuffling (varied across calls for
        seed ensembling).

    Returns
    -------
    list of k probability arrays, one per fold, each of length len(test).
    """
    # Drop constant columns; everything else except id/target is a feature.
    drop_feats = [f for f in train.columns if train[f].nunique() <= 1]
    feats = [f for f in train.columns if f not in drop_feats + ['label','phone']]
    print(f'特征个数:{len(feats)}')
    print('-'*100)
    xtrain = train[feats]
    ytrain = train['label']
    # Categorical flags from both monthly snapshots (_x/_y merge suffixes),
    # passed to LightGBM as positional indices into `feats`.
    cat_feats = ['if_family_x','if_group_x','sms_inpkg_ind_x','if_family_y','if_group_y','sms_inpkg_ind_y']
    categorical_features = [feats.index(i) for i in cat_feats]
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
    offline_score = []   # per-fold validation F1
    output_preds = []    # per-fold test-set probability predictions
    feature_importance_df = pd.DataFrame()
    for i, (train_index, test_index) in enumerate(folds.split(xtrain, ytrain)):
        train_y, valid_y = ytrain[train_index], ytrain[test_index]
        train_X, valid_X = xtrain[feats].iloc[train_index, :], xtrain[feats].iloc[test_index, :]
        dtrain = lgb.Dataset(train_X, label= train_y)
        dvalid = lgb.Dataset(valid_X, label= valid_y)
        # 'metric': None so the custom F1 feval alone drives early stopping;
        # 'is_unbalance' reweights the rare (<5%) positive class.
        parameters={'metric': None,
                    'learning_rate': 0.1,
                    'boosting_type': 'gbdt',
                    'max_depth':-1,
                    'num_leaves': 2**6-1,
                    'feature_fraction': 0.8,
                    'bagging_fraction': 0.8,
                    'is_unbalance':True,
                    'categorical_feature':categorical_features,
                    'verbose': -1
                    }
        # NOTE(review): early_stopping_rounds / verbose_eval keyword args were
        # removed in LightGBM 4.x (use callbacks instead) — this code assumes
        # a pre-4.0 LightGBM.
        lgb_model = lgb.train(params=parameters,
                              train_set=dtrain,
                              num_boost_round=10000,
                              valid_sets=[dtrain,dvalid],
                              early_stopping_rounds=200,
                              verbose_eval=500,
                              feval=F1_score
                              )
        offline_score.append(lgb_model.best_score['valid_1']['f1_score'])
        # Predict the test set with the best iteration of this fold's model.
        output_preds.append(lgb_model.predict(test[feats], num_iteration=lgb_model.best_iteration))
        # Accumulate gain-based feature importances for this fold.
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = lgb_model.feature_importance(importance_type='gain')
        fold_importance_df["fold"] = i + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    # Mean gain importance across folds (computed; printing is commented out).
    feature_importance_df = feature_importance_df.groupby(['feature'])['importance'].mean().sort_values(ascending=False)
    print('OOF-MEAN-F1 score:%.6f, OOF-STD:%.6f' % (np.mean(offline_score), np.std(offline_score)))
    #print(feature_importance_df.head(10))
    return output_preds
# Seed ensembling: three 5-fold runs with different fold seeds give 15
# probability vectors, averaged into one score per phone.
output_preds1 = lgb_model(train,test_b,5,0)
output_preds2 = lgb_model(train,test_b,5,1111111)
output_preds3 = lgb_model(train,test_b,5,2021)
output_preds = (output_preds1 + output_preds2 + output_preds3)  # list concat: 15 arrays
# Binarize at 0.2 — a low threshold favouring recall on the rare positive
# class. A single vectorized comparison also fixes the original gap where a
# mean of exactly 0.2 was left as a raw probability in the label column.
predict_b['label'] = (np.mean(output_preds, axis=0) > 0.2).astype(int)
# Post-rule: phones with any real spend (recharge amount or out-of-plan call
# fees in either month) are assumed not to be fraudsters — force them to 0.
spend = test_b[['chrg_amt_x','out_activcall_fee_x','chrg_amt_y','out_activcall_fee_y']].sum(axis=1)
predict_b.loc[(spend > 0) & (predict_b['label'] == 1), 'label'] = 0
predict_b.to_csv('submit/sub1_b.csv',index=False)
更多推荐
已为社区贡献1条内容
所有评论(0)