赛事链接

赛事链接

背景介绍

在金融领域,活跃着一批“职业羊毛党”,他们通过套现、套利行为大肆牟利,影响普通用户本应享有的权益。他们制作各种自动、半自动的黑产工具,如自动注册机、刷单自动机、短信代接平台、分身软件、猫池等,并捆绑手机卡、银行卡或通过第三方平台交易完成套现,从而实现“薅羊毛”活动,自身获利的同时对商家、银行、平台、运营商的利益造成损失。如何从普通用户中有效鉴别出羊毛党,从而提前进行防范,在实际商业应用中有着重要的意义。本届竞赛将从真实场景和实际应用出发,在智慧金融领域新增了更具挑战性、更加务实的任务,期待参赛选手们能在这些任务上相互切磋、共同进步。

任务描述

防羊毛党评分模型旨在从普通用户中区分出羊毛党用户号码,本次挑战赛设置了更具挑战性的任务,相比其他竞赛,本次竞赛所提供的训练字段相对较少,总体为低资源的竞赛任务,具体包含:
初赛——利用用户通信、流量、app使用等行为数据识别真实羊毛党群体。正样本比例不超过5%。

数据集描述

初赛均提供下载数据,选手在本地进行算法调试,在比赛页面提交结果。其中每个号码样本有2个月份数据,由号码和月份组成共同主键。所有号码均经过加密处理。
(此处为数据集字段说明截图,图片未能随文本导出,请参见原赛事页面的数据字典。)

算法部分

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,roc_auc_score

import warnings
warnings.filterwarnings('ignore')

# Load the A/B-round feature tables. The raw export encodes SQL NULL as the
# literal string '\N', so map it to NaN at read time.
data_a = pd.read_csv('data/data_a.csv', na_values=r'\N')
data_b = pd.read_csv('B榜给选手的数据/data_b.csv', na_values=r'\N')
label = pd.read_csv('data/train_label.csv')               # phone -> 0/1 label
predict_a = pd.read_csv('data/to_pred_a.csv')             # A-round phones to score
predict_b = pd.read_csv('B榜给选手的数据/to_pred_b.csv')  # B-round phones to score
# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True is the supported equivalent of
# append(...).reset_index(drop=True).
data = pd.concat([data_a, data_b], ignore_index=True)

数据预处理

# Fill missing values of the three categorical flags with their business
# defaults (if_family / if_group -> 0, sms_inpkg_ind -> 1).
data.loc[data['if_family'].isna() == 1,'if_family'] = 0
data.loc[data['if_group'].isna() == 1,'if_group'] = 0
data.loc[data['sms_inpkg_ind'].isna() == 1,'sms_inpkg_ind'] = 1
# Columns with >50% missing values: replace the raw value with a "was present"
# indicator (1 = value present, 0 = missing). Per the author this lifted the
# score by ~0.0015. Order matters: non-missing rows become 1 first, then the
# remaining NaNs become 0.
for f in ['call_cnt','up_flux','down_flux']:
    data.loc[data[f].isna() == 0,f] = 1
    data.loc[data[f].isna() == 1,f] = 0
# Add a binary ">0" companion column for every non-key, non-categorical
# column (NaN > 0 evaluates to False, so missing values map to 0).
# NOTE(review): this iterates data.columns while adding columns; it appears to
# only visit the original columns here, but the behaviour relies on pandas
# keeping the old Index object alive — confirm before refactoring.
for f in  data.columns:
    if f not in ['if_family','if_group','sms_inpkg_ind','month','phone']:
        data[f'{f}>0'] = data[f].apply( lambda x : 1 if x > 0 else 0)
# Log-smooth heavy-tailed monetary features. chrg_cnt is capped instead:
# any count above 2 is clamped to 3 (outlier capping). Note this happens
# AFTER the '>0' features were built, so 'chrg_cnt>0' reflects raw values.
data.loc[data['chrg_cnt'] > 2,'chrg_cnt'] = 3  # cap chrg_cnt outliers at 3
data['monfix_fee'] = np.log(data['monfix_fee'].values+1)
data['gift_acct_amt'] = np.log(data['gift_acct_amt'].values+1)
data['gprs_fee'] = np.log(data['gprs_fee'].values+1)
# Per-phone month offset: month_diff == 1 marks the earlier month,
# month_diff == 0 marks the latest month for that phone.
data['month_max'] = data.groupby(by=['phone'])['month'].transform('max')
data['month_diff'] = data['month_max'] - data['month']

cat_feats = ['if_family','if_group','sms_inpkg_ind']
data[cat_feats] = data[cat_feats].astype('int')

# Split the table into the two monthly snapshots, then join them side by side
# so each phone becomes a single wide row (earlier month -> *_x suffixes,
# latest month -> *_y suffixes, via merge's default suffixing).
data1 = data[data['month_diff'] == 1]
data2 = data[data['month_diff'] == 0]
data_reshape = data1.merge(data2, on='phone', how='left')
# The month bookkeeping columns carry no signal after the join — drop them.
month_cols = [f'{c}_{suf}' for suf in ('x', 'y')
              for c in ('month', 'month_max', 'month_diff')]
data_reshape.drop(labels=month_cols, axis=1, inplace=True)
# Attach the reshaped features to the labelled training set and both test sets.
train = label.merge(data_reshape, on='phone', how='left')
test_a = predict_a.merge(data_reshape, on='phone', how='left')
test_b = predict_b.merge(data_reshape, on='phone', how='left')
def F1_score(preds, dtrain):
    """Custom LightGBM eval function: binary F1 at a 0.5 threshold.

    Args:
        preds: fold predictions (probabilities) from the booster.
        dtrain: lgb.Dataset carrying the true 0/1 labels.

    Returns:
        (metric_name, value, is_higher_better) as the feval contract requires.
    """
    y_true = dtrain.get_label()
    y_hat = (preds > 0.5).astype(int)
    return 'f1_score', f1_score(y_true, y_hat, average='binary'), True

建模预测

def lgb_model(train, test, k, seed):
    """Train a k-fold LightGBM binary classifier; return per-fold test predictions.

    Args:
        train: DataFrame with 'phone', 'label' and the feature columns.
        test:  DataFrame to score (must share the same feature columns).
        k:     number of StratifiedKFold splits.
        seed:  random_state of the fold split (varied externally for a
               seed ensemble).

    Returns:
        A list of k numpy arrays, each holding the predicted probability for
        every row of ``test`` from that fold's booster.
    """
    # Constant columns carry no signal — drop them up front.
    drop_feats = [f for f in train.columns if train[f].nunique() <= 1]
    feats = [f for f in train.columns if f not in drop_feats + ['label', 'phone']]
    print(f'特征个数:{len(feats)}')
    print('-' * 100)
    xtrain = train[feats]
    ytrain = train['label']
    cat_feats = ['if_family_x', 'if_group_x', 'sms_inpkg_ind_x',
                 'if_family_y', 'if_group_y', 'sms_inpkg_ind_y']
    # LightGBM accepts categorical features as positional indices into feats.
    categorical_features = [feats.index(i) for i in cat_feats]
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
    offline_score = []
    output_preds = []
    fold_importances = []  # one importance frame per fold; aggregated after the loop
    for i, (train_index, test_index) in enumerate(folds.split(xtrain, ytrain)):
        train_y, valid_y = ytrain[train_index], ytrain[test_index]
        train_X, valid_X = xtrain.iloc[train_index, :], xtrain.iloc[test_index, :]
        dtrain = lgb.Dataset(train_X, label=train_y)
        dvalid = lgb.Dataset(valid_X, label=valid_y)
        parameters = {
            # FIX: 'objective' was missing, so lgb.train silently fell back to
            # its default regression objective. This is a binary task (0/1
            # labels, 'is_unbalance', probability thresholding downstream).
            'objective': 'binary',
            'metric': None,            # evaluate only with the custom F1 feval
            'learning_rate': 0.1,
            'boosting_type': 'gbdt',
            'max_depth': -1,
            'num_leaves': 2 ** 6 - 1,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'is_unbalance': True,      # positives are <5% of the data
            'categorical_feature': categorical_features,
            'verbose': -1,
        }
        # NOTE(review): early_stopping_rounds / verbose_eval were removed from
        # lgb.train in LightGBM 4.x — migrate to callbacks when upgrading.
        # (Renamed the local from 'lgb_model', which shadowed this function.)
        booster = lgb.train(params=parameters,
                            train_set=dtrain,
                            num_boost_round=10000,
                            valid_sets=[dtrain, dvalid],
                            early_stopping_rounds=200,
                            verbose_eval=500,
                            feval=F1_score)
        offline_score.append(booster.best_score['valid_1']['f1_score'])
        output_preds.append(booster.predict(test[feats],
                                            num_iteration=booster.best_iteration))

        fold_importances.append(pd.DataFrame({
            'feature': feats,
            'importance': booster.feature_importance(importance_type='gain'),
            'fold': i + 1,
        }))
    # FIX: the original aggregated (groupby -> Series) INSIDE the fold loop and
    # then concatenated the next fold's DataFrame onto that Series, corrupting
    # the accumulator. Aggregate once, after all folds have run.
    feature_importance_df = (pd.concat(fold_importances, axis=0)
                               .groupby('feature')['importance']
                               .mean()
                               .sort_values(ascending=False))
    print('OOF-MEAN-F1 score:%.6f, OOF-STD:%.6f' % (np.mean(offline_score), np.std(offline_score)))
    # print(feature_importance_df.head(10))

    return output_preds
# Seed ensemble: three 5-fold runs with different split seeds. List '+'
# concatenates the three lists into 15 per-fold prediction arrays, which are
# averaged row-wise below.
output_preds1 = lgb_model(train, test_b, 5, 0)
output_preds2 = lgb_model(train, test_b, 5, 1111111)
output_preds3 = lgb_model(train, test_b, 5, 2021)
output_preds = output_preds1 + output_preds2 + output_preds3
# FIX: the original pair of loc-assignments ('>0.2' -> 1, '<0.2' -> 0) left a
# probability exactly equal to 0.2 un-binarised in the submission; np.where
# covers the whole range with a single rule.
predict_b['label'] = np.where(np.mean(output_preds, axis=0) > 0.2, 1, 0)
# Post-rule: a phone with any recharge amount or active-call fee in either
# month is treated as a normal user regardless of the model score.
predict_b['sum'] = test_b[['chrg_amt_x', 'out_activcall_fee_x',
                           'chrg_amt_y', 'out_activcall_fee_y']].sum(axis=1)
predict_b.loc[(predict_b['sum'] > 0) & (predict_b['label'] == 1), 'label'] = 0
predict_b.drop(labels='sum', axis=1, inplace=True)
predict_b.to_csv('submit/sub1_b.csv', index=False)

更多推荐