Python Model Training

For Data Mining

Posted by Jiayue Cai on April 8, 2018

Last updated on 2019-7-23…

model.py
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import KFold
import os

print('Loading feature ...')

train_x = np.load("train_x.npy")
print('train_x prepared !')

train_y = np.load("train_y.npy")
print('train_y prepared !')

test_x = np.load("test_x.npy")
print('test_x prepared !')

res = np.load("res.npy")
res = pd.DataFrame({"id": res})
print('res prepared !')

print('Load feature OK !')

def LGB_predict(train_x, train_y, test_x, res):
  ...
  return clf

model=LGB_predict(train_x,train_y,test_x,res)

from pandas import DataFrame
pd.set_option('display.max_rows', None)

features = train[use_cols].columns.map(str.lower)   # use_cols: the feature columns used in training
feature_coef = DataFrame({'feature': features, 'coef': model.feature_importances_})
feature_coef = feature_coef.sort_values(by='coef', ascending=False)

feature_coef.to_csv('feature_coef.csv', index=False)

Model Usage

Discriminative Models

# Split into train and test sets by year

train = data[data.iyear <= 2017]
test  = data[data.iyear == 2018]

train_X = train.drop('label', axis=1)
train_y = train['label']

test_X = test.drop('label', axis=1)
# test_y is unknown here; it will be predicted below

# Train

from sklearn.svm import SVC, LinearSVC
svc = SVC()
svc.fit(train_X, train_y)
svc.score(train_X, train_y)

# Predict

test_y = svc.predict(test_X)

Using a Pipeline

df = pd.read_csv('data/train.csv')
y = df.author.values
X = df.text.values

df2 = pd.read_csv('data/test.csv')
X2 = df2.text.values

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import svm

text_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', svm.LinearSVC())
                    ])
text_clf = text_clf.fit(X,y)
y2 = text_clf.predict(X2)
y2

Generative Models

# Training set columns: eventid, nlp, gname

# Test set columns: eventid, nlp

data = pd.concat([train,test])

# Vectorize the text data by term frequency (TF-IDF)

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf.fit(data['nlp'])
train_x = tfidf.transform(train['nlp'])
test_x = tfidf.transform(test['nlp'])
print('tfidf prepared !')

# Choice 1: encode train_y with an explicit mapping

author_mapping_dict = {'Islamic State of Iraq and the Levant (ISIL)':0, 'Taliban':1, 'Al-Shabaab':2, 'Boko Haram':3, 'Houthi extremists (Ansar Allah)':4}
train_y = train['gname'].map(author_mapping_dict)

# Choice 2: when train_y has many distinct classes

from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
train_y = enc.fit_transform(train.gname.values)

# Fitting a multinomial naive Bayes model

from sklearn import ensemble, metrics, model_selection, naive_bayes
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(train_x, train_y)

predictions = clf.predict_proba(train_x)  # in-sample predictions; a proper CV loop would predict on held-out folds

cv_scores = []
cv_scores.append(metrics.log_loss(train_y, predictions))
print("Mean cv score : ", np.mean(cv_scores))

prediction = clf.predict_proba(test_x)

# Output for choice 1:

out_df = pd.DataFrame(prediction)
out_df.columns = ['Islamic State of Iraq and the Levant (ISIL)', 'Taliban', 'Al-Shabaab', 'Boko Haram', 'Houthi extremists (Ansar Allah)']
out_df.insert(0, 'eventid', test['eventid'])
out_df.to_csv("data/out_1.csv", index=False)

# Output for choice 2:

out_df = pd.DataFrame(prediction)
out_df.columns = list(enc.classes_)
out_df.insert(0, 'eventid', test['eventid'])
out_df.to_csv("data/out_2.csv", index=False)

# Output the column name with the maximum probability

df = out_df.drop('eventid',axis=1)
pred = df.idxmax(axis=1)
submission = pd.DataFrame({"eventid": test['eventid'],"gname": pred})
submission.to_csv('data/out.csv', index=False)

Custom Iteration

from sklearn.model_selection import train_test_split

def LGB_predict(train_x, train_y, test_x, res):
  print("LGB test")
  clf = lgb.LGBMClassifier(
    boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1, zero_as_missing=True,
    max_depth=-1, n_estimators=500, objective='binary',
    subsample=0.9, colsample_bytree=0.8, subsample_freq=1,
    learning_rate=0.1, min_child_weight=50, random_state=2018, n_jobs=100
  )
  # hold out part of the training set as an eval set for early stopping
  train_x, eval_x, train_y, eval_y = train_test_split(train_x, train_y, test_size=0.2, random_state=2018)
  clf.fit(train_x, train_y, eval_set=[(eval_x, eval_y)], eval_metric='auc', early_stopping_rounds=100)
  res['score'] = clf.predict_proba(test_x)[:, 1]
  res['score'] = res['score'].apply(lambda x: float('%.6f' % x))
  res.to_csv('../data/submission.csv', index=False)
  os.system('zip baseline.zip ../data/submission.csv')

  return clf

model=LGB_predict(train_x,train_y,test_x,res)

Model Optimization (Parameter Tuning)

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

# Random forest classifier

RFC = RandomForestClassifier()

# Candidate parameter values for the grid search

rf_param_grid = {"max_depth": [None],
                 "max_features": [1, 3, 10],
                 "min_samples_split": [2, 3, 10],
                 "min_samples_leaf": [1, 3, 10],
                 "bootstrap": [False],
                 "n_estimators": [100, 300],
                 "criterion": ["gini"]}

# Systematically search all parameter combinations and pick the best via cross-validation

kfold = StratifiedKFold(n_splits=5)   # the CV splitter used by the search
gsRFC = GridSearchCV(RFC, param_grid=rf_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1)

gsRFC.fit(X_train, Y_train)

# Get the model with the best parameter combination

RFC_best = gsRFC.best_estimator_

# Best score
gsRFC.best_score_
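
The fitted search also records the parameter combination behind that score; printing it is standard GridSearchCV usage:

# parameter combination that achieved the best cross-validation score
print(gsRFC.best_params_)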

Custom Loss Function

import numpy as np
from sklearn.metrics import roc_curve

def online_score(y_true, y_pred):
    fpr, tpr, thresholds = roc_curve(y_true=y_true, y_score=y_pred)
    fpr_list = [0.001, 0.005, 0.01]
    weight_list = [0.4, 0.3, 0.3]
    grades = 0.0

    for weight, x in zip(weight_list, fpr_list):
        # the specified threshold may not exist, so find the closest one
        idx = np.argmin(np.abs(fpr - x))
        grades += weight * tpr[idx]

    # LightGBM's sklearn API expects (eval_name, eval_result, is_higher_better)
    return 'online_score', grades, True
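
A quick smoke test of the metric on random toy data (the arrays below are illustrative, not from the original experiments); LightGBM's sklearn interface consumes exactly this kind of (name, value, is_higher_better) tuple as eval_metric:

y_true = np.random.randint(0, 2, 10000)   # toy labels, illustrative only
y_pred = np.random.rand(10000)            # toy scores, illustrative only
print(online_score(y_true, y_pred))       # ('online_score', ..., True)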

Cross Validation

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import os

def LGB_predict(train_x,train_y,test_x,res):

    clf = lgb.LGBMClassifier(
        boosting_type='gbdt',
        n_estimators=2000,
        learning_rate=0.01, 
        objective='binary',
        
        max_depth=-1,
        num_leaves=35,            # same role as max_depth: limits tree complexity to curb overfitting

        min_child_samples=50,     # curbs overfitting

        min_child_weight=0,
        min_split_gain=0.5,       # like gamma in XGBoost: minimum loss reduction required for a further split

        max_bin=300,

        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0,                # L1 regularization

        reg_lambda=100,             # L2 regularization
        random_state=62,
        n_jobs=-1
    )
    
    print("LGB testing ...")
    train_predict = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    
    folds = KFold(n_splits=5, shuffle=True, random_state=67)
    for n_fold, (train_index, valid_index) in enumerate(folds.split(train_x, train_y)):
        train_x_sample, train_y_sample = train_x[train_index], train_y[train_index]
        valid_x_sample, valid_y_sample = train_x[valid_index], train_y[valid_index]
        
        # uses the custom online_score function from the previous section
        
        clf.fit(train_x_sample, train_y_sample,
                eval_set = [(train_x_sample, train_y_sample), (valid_x_sample, valid_y_sample)], 
                eval_metric = online_score, #'auc'
                verbose = 200, 
                early_stopping_rounds = 200
        )
        train_predict[valid_index] = clf.predict_proba(valid_x_sample, num_iteration = clf.best_iteration_)[:, 1]
        test_predict += clf.predict_proba(test_x, num_iteration = clf.best_iteration_)[:, 1] / folds.n_splits
        
        train_predict[train_predict < 0] = 0
        print('Fold {} Valid_Metric: {}'.format(n_fold+1, online_score(valid_y_sample, train_predict[valid_index])))
        
    print("LGB test ok !")
    
    res['score'] = test_predict
    
    print("Generating output ...")
    res.to_csv('./data/submission.csv', index=False)
    #os.system('zip baseline.zip ./data/submission.csv')
    print("Output OK !")
    
    return clf

model=LGB_predict(train_x,train_y,test_x,res)

Common Outputs

Table Output

submission = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": Y_pred
})
submission.to_csv('data/titanic.csv', index=False)

Table Output after One-hot Encoding

from sklearn import preprocessing
encoder = preprocessing.LabelBinarizer()
encoder.fit(list(set(y2)))
one_hot_labels = encoder.transform(y2)
prediction = pd.DataFrame(one_hot_labels, columns=['EAP', 'HPL', 'MWS'])
prediction.to_csv('data/author-pre.csv')
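
LabelBinarizer can also map the one-hot matrix back to the original class labels with its standard inverse_transform method; a minimal sketch:

decoded = encoder.inverse_transform(one_hot_labels)   # back to the original label strings
print(decoded[:5])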

Model Ensembling

Score-based Weighted Blending

import pandas as pd
result = pd.DataFrame()

result['nffm_7688'] = pd.read_csv('./nffm_final_preliminary/submission_nffm_7688.csv')['score']
result['nffm_765'] = pd.read_csv('./nffm_final_preliminary/submission_nffm_765.csv')['score']
result['lgb'] = pd.read_csv('data_preprocessing/submission2_p.csv')['score']

a = 0.7
sub = pd.read_csv('./nffm_final/submission_nffm_75866_0.csv')
sub['score'] = round((result['nffm_7688']*0.6+result['nffm_765']*0.4)*a+result['lgb']*(1-a),6)

print(sub['score'].describe())
sub.to_csv('./submission.csv',index=False)

Normalized Weighted Blending

import pandas as pd
pd.set_option('display.float_format',lambda x : '%.8f' % x)

import numpy as np
np.set_printoptions(suppress=True)

from sklearn.preprocessing import MinMaxScaler

res1 = pd.read_csv('./res1.csv')
res2 = pd.read_csv('./res2.csv')

scaler = MinMaxScaler()
res1['score_mm_01'] = scaler.fit_transform(np.array(res1['score']).reshape(-1, 1))

scaler = MinMaxScaler()
res2['score_mm_02'] = scaler.fit_transform(np.array(res2['score']).reshape(-1, 1))

result = res1.merge(res2,on='id')
result['score'] = result['score_mm_01']*0.6+result['score_mm_02']*0.4

# Avoid scientific notation; keep 8 decimal places

def as_num(x):
    return '{:.8f}'.format(x)

result['score'] = result['score'].apply(as_num)
result.loc[:,['id','score']].to_csv('./submission.csv',index=False)

Correlation-based Weighted Blending

Compute the maximal information coefficient (MIC) between the models' outputs, plot it as a heat map, and pick weakly correlated models for multi-model weighted blending.

import pandas as pd
import numpy as np
from minepy import MINE

fs = ['discret_5','R_7199','rank','discret_10','raw_rank','Py_717','Py_725','svm_6938']
out_nums = 8   # number of submission files

res = []
res.append(pd.read_csv('./avg_xgbs_discret_feature_5.csv').score.values)
res.append(pd.read_csv('./R_7199.csv').score.values)
res.append(pd.read_csv('./rank_feature_xgb_ensemble.csv').score.values)
res.append(pd.read_csv('./avg_xgbs_discret_feature_10.csv').score.values)
res.append(pd.read_csv('./based_on_select_rank_feature.csv').score.values)
res.append(pd.read_csv('./xgb717.csv').score.values)
res.append(pd.read_csv('./xgb725.csv').score.values)
res.append(pd.read_csv('./svm6938.csv').score.values)

cm = []

for i in range(out_nums): 
    tmp = []
    for j in range(out_nums):
        m = MINE()
        m.compute_score(res[i], res[j])
        tmp.append(m.mic())
    cm.append(tmp)

# Plot the heat map

import numpy as np
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, title, cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(out_nums)
    plt.xticks(tick_marks, fs, rotation=45)
    plt.yticks(tick_marks, fs)
    plt.tight_layout()

plot_confusion_matrix(cm, title='mic')
plt.show()

Pick combinations that best exploit the differences between the models (the lightest-colored regions of the heat map), and blend them using rankings (a rank_avg style of fusion): sum(1/rank * score). The example below uses hand-tuned score weights; a rank_avg sketch follows it.

import pandas as pd

xgb717 = pd.read_csv("xgb717.csv")
svm6938 = pd.read_csv('svm6938.csv')
xgb725 = pd.read_csv('xgb725.csv')

uid = xgb717.uid
score = 0.15*xgb717.score+0.25*svm6938.score+0.6*xgb725.score
result = pd.DataFrame(uid,columns=['uid'])
result['score'] = score

result.to_csv('submission.csv',index=None,encoding='utf-8')
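
The weights above are tuned on scores directly. A minimal sketch of the rank_avg formula sum(1/rank * score) mentioned earlier, assuming hypothetical leaderboard ranks for the three submissions (the rank values are illustrative only):

ranks = {'xgb725': 1, 'svm6938': 2, 'xgb717': 3}   # hypothetical leaderboard ranks
subs = {name: pd.read_csv(name + '.csv') for name in ranks}

inv = {name: 1.0 / r for name, r in ranks.items()}   # 1/rank weights
total = sum(inv.values())                            # normalize so the weights sum to 1
score = sum((w / total) * subs[name].score for name, w in inv.items())

result = pd.DataFrame({'uid': subs['xgb725'].uid, 'score': score})
result.to_csv('submission_rank_avg.csv', index=False)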