Python: Adding Samples

For Data Mining

Posted by Jiayue Cai on April 12, 2018

Last updated on 2018-9-25…

This post covers a trick outside of feature engineering and modeling: adding new samples, i.e. using semi-supervised learning in place of oversampling to deal with class imbalance.

Iterative semi-supervised learning

  • Use the current best model to predict the unlabeled data, then tune thresholds a and b to decide which samples get added to the training set, keeping the 0:1 ratio of the added samples between 5:1 and 9:1. If the online score improves, keep these samples in the training set, continue training and ensembling to predict the unlabeled data again, and repeat the iteration (a minimal sketch of one round follows).
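A minimal sketch of one pseudo-labeling round is given here before the full script. The function name add_pseudo_labels and its arguments are hypothetical, not part of the original code; only the thresholding on a/b and the 5:1~9:1 ratio check follow the recipe above (the script below uses a = 0.16 and b = 0.80).

import pandas as pd

def add_pseudo_labels(train, unlabeled_pred, unlabeled_features, a=0.16, b=0.80):
    # unlabeled_pred: columns ['uid', 'score'], scores from the current best model
    # unlabeled_features: 'uid' plus the same feature columns as train
    neg = unlabeled_pred[unlabeled_pred.score < a][['uid']].copy()
    pos = unlabeled_pred[unlabeled_pred.score > b][['uid']].copy()
    neg['y'] = 0
    pos['y'] = 1
    # keep the 0:1 ratio of the added samples roughly between 5:1 and 9:1
    ratio = float(len(neg)) / max(len(pos), 1)
    assert 5 <= ratio <= 9, "adjust a/b: added-sample ratio is %.2f" % ratio
    pseudo = pd.concat([neg, pos]).merge(unlabeled_features, on='uid', how='left')
    return pd.concat([train, pseudo], ignore_index=True)

The full script implements the same idea end to end: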
import pandas as pd
import xgboost as xgb
import sys,random
import cPickle
import os

os.mkdir('featurescore')
os.mkdir('model')
os.mkdir('preds')

#Set the values of a and b (both between 0 and 1)

a = 0.16
b = 0.80

#Count features of the discretized features

test_nd = pd.read_csv('../data/test_x_nd.csv')[['uid','n1','n2','n3','n4','n5','n6','n7','n8','n9','n10']]
train_nd = pd.read_csv('../data/train_x_nd.csv')[['uid','n1','n2','n3','n4','n5','n6','n7','n8','n9','n10']]
trainunlabeled_nd = pd.read_csv('../data/train_unlabeled_nd.csv')[['uid','n1','n2','n3','n4','n5','n6','n7','n8','n9','n10']]

#Discretized feature of the missing-value count

test_dnull = pd.read_csv('../data/test_x_null.csv')[['uid','discret_null']]
train_dnull = pd.read_csv('../data/train_x_null.csv')[['uid','discret_null']]
trainunlabeled_dnull = pd.read_csv('../data/train_unlabeled_null.csv')[['uid','discret_null']]

#The 11 features n1~n10 and discret_null skip feature selection; merge them together first

eleven_feature = ['n1','n2','n3','n4','n5','n6','n7','n8','n9','n10','discret_null']
test_eleven = pd.merge(test_nd,test_dnull,on='uid')
train_eleven = pd.merge(train_nd,train_dnull,on='uid')
trainunlabeled_eleven = pd.merge(trainunlabeled_nd,trainunlabeled_dnull,on='uid')

del test_dnull,train_dnull,trainunlabeled_dnull
del test_nd,train_nd,trainunlabeled_nd


#Discretized features

discret_feature_score = pd.read_csv('./discret_feature_score.csv')
fs = list(discret_feature_score.feature[0:500])
discret_train = pd.read_csv("../data/train_x_discretization.csv")[['uid']+fs]
discret_test = pd.read_csv("../data/test_x_discretization.csv")[['uid']+fs]
discret_train_unlabeled = pd.read_csv("../data/train_unlabeled_discretization.csv")[['uid']+fs]

#Rank features

rank_feature_score = pd.read_csv('./rank_feature_score.csv')
fs = list(rank_feature_score.feature[0:500])
rank_train_x = pd.read_csv("../data/train_x_rank.csv")
rank_train = rank_train_x[fs] / float(len(rank_train_x))
rank_train['uid'] = rank_train_x.uid

rank_test_x = pd.read_csv("../data/test_x_rank.csv")
rank_test = rank_test_x[fs] / float(len(rank_test_x))
rank_test['uid'] = rank_test_x.uid

rank_train_unlabeled_x = pd.read_csv("../data/train_unlabeled_rank.csv")
rank_train_unlabeled = rank_train_unlabeled_x[fs] / float(len(rank_train_unlabeled_x))
rank_train_unlabeled['uid'] = rank_train_unlabeled_x.uid

del rank_train_x,rank_test_x,rank_train_unlabeled_x

#Raw features

raw_feature_score = pd.read_csv('./raw_feature_score.csv')
fs = list(raw_feature_score.feature[0:500])
raw_train_x = pd.read_csv("../data/train_x.csv")[['uid']+fs]
raw_train_y = pd.read_csv("../data/train_y.csv")
raw_train = pd.merge(raw_train_x,raw_train_y,on='uid')
del raw_train_x,raw_train_y

raw_test = pd.read_csv("../data/test_x.csv")[['uid']+fs]
raw_train_unlabel = pd.read_csv('../data/train_unlabeled.csv')[['uid']+fs]

#Merge the raw features, rank features, discretized features, and the other 11 features (n1~n10, discret_null)

train = pd.merge(raw_train,rank_train,on='uid')
train = pd.merge(train,discret_train,on='uid')
train = pd.merge(train,train_eleven,on='uid')

test = pd.merge(raw_test,rank_test,on='uid')
test = pd.merge(test,discret_test,on='uid')
test = pd.merge(test,test_eleven,on='uid')
test_uid = test.uid

#From the unlabeled data, take samples with predicted score < a as negative samples and score > b as positive samples, and add them to the training set

m3_predict_trainunlabeled_data = pd.read_csv('./m3_predict_trainunlabeled_data.csv')
unlabeldata_0 = m3_predict_trainunlabeled_data[m3_predict_trainunlabeled_data.score<a]
tmp = pd.merge(unlabeldata_0,raw_train_unlabel,on="uid",how="left")
tmp1 = pd.merge(tmp,rank_train_unlabeled,on="uid",how="left")
tmp2 = pd.merge(tmp1,trainunlabeled_eleven,on="uid",how="left")
neg_sample = pd.merge(tmp2,discret_train_unlabeled,on="uid",how="left")
neg_sample = neg_sample.drop(["score","uid"],axis=1)
neg_sample['y'] = [0 for _ in range(len(neg_sample))]

unlabeldata_1 = m3_predict_trainunlabeled_data[m3_predict_trainunlabeled_data.score>b]
tmp = pd.merge(unlabeldata_1,raw_train_unlabel,on="uid",how="left")
tmp1 = pd.merge(tmp,rank_train_unlabeled,on="uid",how="left")
tmp2 = pd.merge(tmp1,trainunlabeled_eleven,on="uid",how="left")
pos_sample = pd.merge(tmp2,discret_train_unlabeled,on="uid",how="left")
pos_sample = pos_sample.drop(["score","uid"],axis=1)
pos_sample['y'] = [1 for _ in range(len(pos_sample))]

samples_from_unlabel = pd.concat([neg_sample,pos_sample])

print "select {0} samples from train_unlabel.csv".format(len(samples_from_unlabel))
del unlabeldata_0,unlabeldata_1,tmp,tmp1,tmp2


#Drop samples whose missing-value count falls in bin 5 (i.e. more than 194 missing values). This gives a big boost, from 0.723 to nearly 0.725

train = train[train.discret_null!=5]
samples_from_unlabel = samples_from_unlabel[samples_from_unlabel.discret_null!=5]

def pipeline(iteration,random_seed,feature_num,rank_feature_num,discret_feature_num,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight):
    raw_feature_selected = list(raw_feature_score.feature[0:feature_num])
    rank_feature_selected = list(rank_feature_score.feature[0:rank_feature_num])
    discret_feature_selected = list(discret_feature_score.feature[0:discret_feature_num])

    train_xy = train[eleven_feature+raw_feature_selected+rank_feature_selected+discret_feature_selected+['y']]
    train_xy[train_xy<0] = -1

    test_x = test[eleven_feature+raw_feature_selected+rank_feature_selected+discret_feature_selected]
    test_x[test_x<0] = -1

    neg_pos = samples_from_unlabel[eleven_feature+raw_feature_selected+rank_feature_selected+discret_feature_selected+['y']]
    neg_pos[neg_pos<0] = -1   
    
    #Merge the positive and negative samples selected from the unlabeled data with the original training data

    train_xy = pd.concat([train_xy,neg_pos])
    y = train_xy.y
    X = train_xy.drop(['y'],axis=1)
    
    #xgboost start
	
    dtest = xgb.DMatrix(test_x)
    dtrain = xgb.DMatrix(X, label=y)
    params={
        'booster':'gbtree',
        'objective': 'binary:logistic',
        'scale_pos_weight': float(len(y)-sum(y))/float(sum(y)),
        'eval_metric': 'auc',
        'gamma':gamma,
        'max_depth':max_depth,
        'lambda':lambd,
        'subsample':subsample,
        'colsample_bytree':colsample_bytree,
        'min_child_weight':min_child_weight, 
        'eta': 0.08,
        'seed':random_seed,
        'nthread':8
        }
    
    watchlist  = [(dtrain,'train')]
    model = xgb.train(params,dtrain,num_boost_round=1500,evals=watchlist)
    model.save_model('./model/xgb{0}.model'.format(iteration))
    
    #predict test set
	
    test_y = model.predict(dtest)
    test_result = pd.DataFrame(test_uid,columns=["uid"])
    test_result["score"] = test_y
    test_result.to_csv("./preds/xgb{0}.csv".format(iteration),index=None,encoding='utf-8')
    
    #save feature score
	
    feature_score = model.get_fscore()
    feature_score = sorted(feature_score.items(), key=lambda x:x[1],reverse=True)
    fs = []
    for (key,value) in feature_score:
        fs.append("{0},{1}\n".format(key,value))
    
    with open('./featurescore/feature_score_{0}.csv'.format(iteration),'w') as f:
        f.writelines("feature,score\n")
        f.writelines(fs)

if __name__ == "__main__":
    """
    random_seed = range(1000,2000,10)
    feature_num = range(300,500,2)
    rank_feature_num = range(300,500,2)
    discret_feature_num = range(64,100,1)
    gamma = [i/1000.0 for i in range(0,300,3)]
    max_depth = [6,7,8]
    lambd = range(500,700,2)
    subsample = [i/1000.0 for i in range(500,700,2)]
    colsample_bytree = [i/1000.0 for i in range(250,350,1)]
    min_child_weight = [i/1000.0 for i in range(250,550,3)]
    random.shuffle(rank_feature_num)
    random.shuffle(random_seed)
    random.shuffle(feature_num)
    random.shuffle(discret_feature_num)
    random.shuffle(gamma)
    random.shuffle(max_depth)
    random.shuffle(lambd)
    random.shuffle(subsample)
    random.shuffle(colsample_bytree)
    random.shuffle(min_child_weight)
    
    with open('params.pkl','w') as f:
        cPickle.dump((random_seed,feature_num,rank_feature_num,discret_feature_num,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight),f)
    """

    with open('params_for_reproducing.pkl','r') as f:
        random_seed,feature_num,rank_feature_num,discret_feature_num,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight = cPickle.load(f)
    
    
    for i in range(36):
        print "iter:",i
        pipeline(i,random_seed[i],feature_num[i],rank_feature_num[i],discret_feature_num[i],gamma[i],max_depth[i%3],lambd[i],subsample[i],colsample_bytree[i],min_child_weight[i])
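The script writes one prediction file per iteration to ./preds/ and one model per iteration to ./model/. How these 36 runs are blended is not shown in the post; a simple assumption is to average the scores per uid, and an analogous average of the saved models' predictions on the unlabeled data would play the role of m3_predict_trainunlabeled_data.csv in the next semi-supervised round.

import pandas as pd

# Hypothetical blending step (the post does not show the exact scheme):
# average the 36 per-iteration test predictions into one score per uid.
preds = pd.concat([pd.read_csv('./preds/xgb{0}.csv'.format(i)) for i in range(36)])
avg = preds.groupby('uid')['score'].mean().reset_index()
avg.to_csv('./preds/xgb_avg.csv', index=False)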

Brute-force semi-supervised learning

Although this approach is inefficient, it effectively raises the online score by making full use of the distribution information in the unlabeled samples to improve the classifier.

  • Each time, pick 10 samples from the unlabeled data; there are 1024 ways to label them. Train a single model (ensembling multiple models would be too expensive) 1024 times, evaluate on the test set, and keep the label combination that gives the largest offline improvement (with an offline train:test split of 1:9). A small illustration of the 1024 labelings follows this list.
  • Why 10 samples per batch: adding too few samples does not change model performance noticeably, while going to 11 samples doubles the computation time. So 10 samples per batch were chosen (as with 10-fold cross-validation, 10 is not mandatory, but experience says it is a reliable choice).
  • Why the 1:9 train:test split: a small training set makes each iteration faster, while a large test set gives a better read on the performance gain from the newly added samples.
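The combinatorics behind these choices is simple: each of the 10 samples can be labeled 0 or 1, so one group yields 2^10 = 1024 labelings, and every extra sample doubles the number of trainings. A small illustration (using itertools here; label.py below builds the same labelings from 10-bit binary strings):

import itertools

# All 0/1 labelings of a group of k unlabeled samples: there are 2**k of them.
k = 10
labelings = list(itertools.product([0, 1], repeat=k))
print len(labelings)   # 1024; k = 11 would double this to 2048, hence the choice of 10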

Selecting samples to add to the training set

select.py

"""
Sort the rows of label.csv by best_auc, then take the top-5000 unlabeled samples
(i.e. the unlabeled samples that give the largest offline AUC improvement).
Each time, randomly pick 50 samples from these top 5000.
"""
import pandas as pd
import random,os

labels = pd.read_csv('label.csv',header=None)
labels.columns = ['ind','sample1','sample2','sample3','sample4','sample5','sample6','sample7','sample8','sample9','sample10','auc']
#Sort by auc

labels['rk'] = labels.auc.rank(ascending=False)

#Take the top-5000 samples; since each group has 10 samples, top 5000 means keeping the groups with rank <= 500

labels = labels[labels.rk<=500]

#Each time take 50 random samples, i.e. 5 random groups

#Equivalent to shuffling randomly and generating 100 files

inds = list(labels.ind)
random.shuffle(inds)

os.mkdir('samples_selected')
for i in range(100):
    y = ['sample1','sample2','sample3','sample4','sample5','sample6','sample7','sample8','sample9','sample10']
    five_inds = inds[(5*i):(5*(i+1))]
    five_inds_label = [labels[labels.ind==this_ind][y].values.tolist()  for this_ind in five_inds]
    
    five_inds_label_ = []
    [five_inds_label_.extend(j[0]) for j in five_inds_label]

    temp = [range(ind*10,(ind+1)*10) for ind in five_inds]
    uid_index = []
    [uid_index.extend(t) for t in temp]

    
    #uid of the unlabeled samples

    train_unlabel_uid = pd.read_csv('train_unlabeled.csv')
    sample50 = train_unlabel_uid.loc[uid_index]
    sample50['y'] = five_inds_label_
    sample50[['uid','y']].to_csv('samples_selected/{0}.csv'.format(i),index=None)
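select.py only writes (uid, y) pairs to samples_selected/. The final step of attaching features to a selected batch and appending it to the labeled training data is not shown in the post; a sketch of what it might look like, assuming the train_xy.csv and train_unlabeled.csv files produced by gen_samples.py further below:

import pandas as pd

# Hypothetical final step: attach features to one batch of selected pseudo-labels
# and append it to the labeled training data from gen_samples.py.
train_xy = pd.read_csv('train_xy.csv')                # uid + features + y
train_unlabel = pd.read_csv('train_unlabeled.csv')    # uid + features
batch = pd.read_csv('samples_selected/0.csv')         # uid + y, written by select.py
batch = pd.merge(batch, train_unlabel, on='uid', how='left')
augmented_train = pd.concat([train_xy, batch], ignore_index=True)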

Labeling the unlabeled data

label.py

"""
Label the unlabeled data: each group of 10 samples has 1024 possible label combinations.
Run xgboost 1024 times and keep the combination with the largest AUC improvement.
Details: 10 unlabeled samples, each labeled 0 or 1, giving 2^10 = 1024 combinations. Each integer
from 0 to 1023 is written as a 10-bit binary string, e.g. 0 -> 0000000000 and 1023 -> 1111111111.
"""

from sklearn.cross_validation import train_test_split
from sklearn import metrics
import pandas as pd
import xgboost as xgb
import sys,random,cPickle

train_xy = pd.read_csv('train_xy.csv')
train,val = train_test_split(train_xy,test_size=0.85,random_state=1024)

val_y = val.y
val_X = val.drop(['y'],axis=1)
dval = xgb.DMatrix(val_X)

train_unlabel = pd.read_csv('train_unlabeled.csv')

def pipeline(unlabel_data):
    """
    unlabel_data:
       columns=['uid','y',features]
    """
    this_train = pd.concat([train,unlabel_data])
    y = this_train.y
    X = this_train.drop(['y'],axis=1)
    dtrain = xgb.DMatrix(X, label=y)
    params={
        'booster':'gbtree',
        'objective': 'rank:pairwise',
        'scale_pos_weight': float(len(y)-sum(y))/float(sum(y)),
        'eval_metric': 'auc',
        'gamma':0.1,
        'max_depth':8,
        'lambda':600,
        'subsample':0.6,
        'colsample_bytree':0.3,
        'min_child_weight':0.3, 
        'eta': 0.04,
        'seed':1024,
        'nthread':20
        }
    model = xgb.train(params,dtrain,num_boost_round=256,verbose_eval=False)
    val_y_pred = model.predict(dval)
    fpr,tpr,thresholds = metrics.roc_curve(val_y,val_y_pred,pos_label=1)
    return metrics.auc(fpr,tpr)


labels = []
get_bin = lambda x: format(x, 'b').zfill(10)
for i in range(1024):
    label_str = get_bin(i)
    label = []
    for c in label_str:
        label.append(int(c))
    labels.append(label)


for i in range(5000):
    uid_index = range(i*10,(i+1)*10)
    samples_selected = train_unlabel.loc[uid_index]
    best_auc = 0
    best_label = []
    for label in labels:
        samples_selected['y'] = label
        this_auc = pipeline(samples_selected)
        print this_auc
        if this_auc>best_auc:
            best_auc = this_auc
            best_label = label
    
    with open('label.csv','a') as f:
        f.writelines(str(i)+','+','.join([str(i) for i in best_label])+','+str(best_auc)+'\n')

Generating the training data and unlabeled data used for semi-supervision

gen_samples.py

from sklearn.cross_validation import train_test_split
import pandas as pd
import xgboost as xgb
import sys,random
import cPickle

#Discretized features; their names collide with the rank features, so each is renamed with a 'd' prefix

discret_feature_score = pd.read_csv('./discret_feature_score.csv')
fs = list(discret_feature_score.feature[0:500])
discret_train = pd.read_csv("../data/train_x_discretization.csv")
discret_test = pd.read_csv("../data/test_x_discretization.csv")
discret_train_unlabeled = pd.read_csv("../data/train_unlabeled_discretization.csv")

#discret_null feature

test_dnull = pd.read_csv('../data/test_x_null.csv')[['uid','discret_null']]
train_dnull = pd.read_csv('../data/train_x_null.csv')[['uid','discret_null']]
trainunlabeled_dnull = pd.read_csv('../data/train_unlabeled_null.csv')[['uid','discret_null']]

#n_discret feature

test_nd = pd.read_csv('../data/test_x_nd.csv')[['uid','n1','n2','n3','n4','n5','n6','n7','n8','n9','n10']]
train_nd = pd.read_csv('../data/train_x_nd.csv')[['uid','n1','n2','n3','n4','n5','n6','n7','n8','n9','n10']]
trainunlabeled_nd = pd.read_csv('../data/train_unlabeled_nd.csv')[['uid','n1','n2','n3','n4','n5','n6','n7','n8','n9','n10']]

discret_feature = ['n1','n2','n3','n4','n5','n6','n7','n8','n9','n10','discret_null']
test_d = pd.merge(test_nd,test_dnull,on='uid')
train_d = pd.merge(train_nd,train_dnull,on='uid')
trainunlabeled_d = pd.merge(trainunlabeled_nd,trainunlabeled_dnull,on='uid')

del test_dnull,train_dnull,trainunlabeled_dnull
del test_nd,train_nd,trainunlabeled_nd

#rank_feature

rank_feature_score = pd.read_csv('./rank_feature_score.csv')
fs = list(rank_feature_score.feature[0:500])

#load data

rank_train_x = pd.read_csv("../data/train_x_rank.csv")
rank_train = rank_train_x[fs] / float(len(rank_train_x))
rank_train['uid'] = rank_train_x.uid

rank_test_x = pd.read_csv("../data/test_x_rank.csv")
rank_test = rank_test_x[fs] / float(len(rank_test_x))
rank_test['uid'] = rank_test_x.uid

rank_train_unlabeled_x = pd.read_csv("../data/train_unlabeled_rank.csv")
rank_train_unlabeled = rank_train_unlabeled_x[fs] / float(len(rank_train_unlabeled_x))
rank_train_unlabeled['uid'] = rank_train_unlabeled_x.uid

del rank_train_x,rank_test_x,rank_train_unlabeled_x

#raw data

feature_score_717 = pd.read_csv('./raw_feature_score.csv')
fs = list(feature_score_717.feature[0:500])
train_x = pd.read_csv("../data/train_x.csv")[['uid']+fs]
train_y = pd.read_csv("../data/train_y.csv")
train_xy = pd.merge(train_x,train_y,on='uid')
del train_x,train_y
train = pd.merge(train_xy,rank_train,on='uid')
train = pd.merge(train,train_d,on='uid')
train = pd.merge(train,discret_train,on='uid')

test = pd.read_csv("../data/test_x.csv")[['uid']+fs]
test = pd.merge(test,rank_test,on='uid')
test = pd.merge(test,test_d,on='uid')
test = pd.merge(test,discret_test,on='uid')
test_uid = test.uid

train_unlabel = pd.read_csv('../data/train_unlabeled.csv')[['uid']+fs]
tmp1 = pd.merge(train_unlabel,rank_train_unlabeled,on="uid",how="left")
tmp2 = pd.merge(tmp1,trainunlabeled_d,on="uid",how="left")
newdata = pd.merge(tmp2,discret_train_unlabeled,on="uid",how="left")
newdata[newdata<0] = -1
print "select {0} sample from train_unlabel.csv".format(len(newdata))

feature_selected = list(feature_score_717.feature[0:500])
rank_feature_selected = list(rank_feature_score.feature[0:500])
discret_feature_selected = list(discret_feature_score.feature[0:100])

train_xy = train[['uid']+discret_feature+feature_selected+rank_feature_selected+discret_feature_selected+['y']]
train_xy[train_xy<0] = -1
train_xy.to_csv('train_xy.csv',index=None)

n = newdata[['uid']+discret_feature+feature_selected+rank_feature_selected+discret_feature_selected]
n.to_csv('train_unlabeled.csv',index=None)