Last updated on 2019-7-23…
model.py
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import KFold
import os
# Load the pre-computed arrays from the working directory, printing one
# progress line per artefact, and wrap the values of res.npy in a
# single-column ("id") DataFrame.
def _load_array(name):
    """Load ``<name>.npy`` and print the matching progress line."""
    values = np.load(name + ".npy")
    print(name + ' prepared !')
    return values


print('Loading feature ...')
train_x = _load_array("train_x")
train_y = _load_array("train_y")
test_x = _load_array("test_x")

res = pd.DataFrame({"id": np.load("res.npy")})
print('res prepared !')

print('Load feature OK !')
# Placeholder for LGB_predict: the body is elided ("...") at this point in the
# notes; complete definitions with the same name appear further down the file.
# NOTE(review): as written this stub has no indented body and never assigns
# `clf`, so it is not runnable on its own.
def LGB_predict(train_x,train_y,test_x,res):
...
return clf
# Call LGB_predict with the arrays loaded above and keep its return value in `model`.
model=LGB_predict(train_x,train_y,test_x,res)
# Export the fitted model's feature importances, most important first.
from pandas import DataFrame

pd.set_option('display.max_rows', None)

# NOTE(review): `train` and `use_cols` are not defined in this snippet —
# presumably the training frame and its feature-column list; confirm.
features = train[use_cols].columns.map(str.lower)

# The DataFrame constructor does not accept index=False (that flag belongs to
# to_csv), so the table is built with the default index.
feature_coef = DataFrame({'feature': features, 'coef': model.feature_importances_})

# sort_values returns a new frame; keep it so the CSV is actually sorted.
feature_coef = feature_coef.sort_values(by='coef', ascending=False)
feature_coef.to_csv('feature_coef.csv', index=False)
模型调用
判别式模型
# Split the data by year: rows up to 2017 are used for training, rows from
# 2018 for testing.
# NOTE(review): `data` is not defined in this snippet — presumably a DataFrame
# with `iyear` and `label` columns; confirm.
train = data[data.iyear <= 2017]
test = data[data.iyear == 2018]
train_X = train.drop('label', axis=1)
train_y = train['label']
test_X = test.drop('label', axis=1)
# The original line was a bare `test_y` (a NameError); the parallel structure
# with train_y implies the test labels were meant here.
test_y = test['label']

# Train
from sklearn.svm import SVC, LinearSVC
svc = SVC()
svc.fit(train_X, train_y)
# Accuracy on the training rows (displayed when run in a notebook cell).
svc.score(train_X, train_y)

# Predict
# As in the original notes, test_y is rebound to the predicted labels here.
test_y = svc.predict(test_X)
使用管道
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

# Training file: the `text` column is the input, the `author` column the label.
df = pd.read_csv('data/train.csv')
X = df["text"].values
y = df["author"].values

# Test file: only the `text` column is read.
df2 = pd.read_csv('data/test.csv')
X2 = df2["text"].values

# Token counts -> TF-IDF weighting -> linear SVM, chained as one estimator.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', svm.LinearSVC()),
]
text_clf = Pipeline(pipeline_steps)
text_clf = text_clf.fit(X, y)

y2 = text_clf.predict(X2)
y2
生成式模型
# Training set columns: eventid, nlp, gname
# Test set columns: eventid, nlp
# NOTE(review): `train` and `test` are not defined in this snippet — confirm
# they are the frames described above.
data = pd.concat([train, test])

# Turn the text column into TF-IDF vectors; the vocabulary is fitted on
# train + test together.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf.fit(data['nlp'])
train_x = tfidf.transform(train['nlp'])
test_x = tfidf.transform(test['nlp'])
print('tfidf prepared !')

# choice 1: encode train_y with an explicit mapping
author_mapping_dict = {'Islamic State of Iraq and the Levant (ISIL)':0, 'Taliban':1, 'Al-Shabaab':2, 'Boko Haram':3, 'Houthi extremists (Ansar Allah)':4}
train_y = train['gname'].map(author_mapping_dict)

# choice 2: let LabelEncoder build the mapping (handy when there are many labels)
# Running this after choice 1 replaces train_y, so the model below is trained
# on the LabelEncoder codes.
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
train_y = enc.fit_transform(train.gname.values)

# Multinomial naive Bayes
from sklearn import ensemble, metrics, model_selection, naive_bayes
from sklearn.naive_bayes import MultinomialNB

# Cross-validated log loss. The original appended a score computed from an
# undefined `predictions`; each fold now produces its own predictions.
train_labels = np.asarray(train_y)
cv_scores = []
fold_splitter = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)
for fit_idx, val_idx in fold_splitter.split(train_x, train_labels):
    fold_clf = MultinomialNB()
    fold_clf.fit(train_x[fit_idx], train_labels[fit_idx])
    predictions = fold_clf.predict_proba(train_x[val_idx])
    # labels= keeps the probability columns and the label set aligned even
    # when a validation fold happens to miss a class.
    cv_scores.append(metrics.log_loss(train_labels[val_idx], predictions, labels=fold_clf.classes_))
print("Mean cv score : ", np.mean(cv_scores))

# Final model on the full training set, used for the test predictions.
clf = MultinomialNB()
clf.fit(train_x, train_y)
prediction = clf.predict_proba(test_x)

# Plain array of ids: inserting the Series itself would align on test's index
# instead of row position and can yield NaN / shifted ids.
event_ids = test['eventid'].values

# Output for choice 1:
# NOTE(review): these names match the probability columns only when the model
# was trained on the choice-1 codes; with choice 2 active the column order is
# that of enc.classes_ — confirm which choice is in use.
out_df = pd.DataFrame(prediction)
out_df.columns = ['Islamic State of Iraq and the Levant (ISIL)', 'Taliban', 'Al-Shabaab', 'Boko Haram', 'Houthi extremists (Ansar Allah)']
out_df.insert(0, 'eventid', event_ids)
out_df.to_csv("data/out_1.csv", index=False)

# Output for choice 2:
out_df = pd.DataFrame(prediction)
out_df.columns = list(enc.classes_)
out_df.insert(0, 'eventid', event_ids)
out_df.to_csv("data/out_2.csv", index=False)

# Emit, per row, the column name holding the highest probability.
df = out_df.drop('eventid',axis=1)
pred = df.idxmax(axis=1)
submission = pd.DataFrame({"eventid": event_ids, "gname": pred})
submission.to_csv('data/out.csv', index=False)
自定义迭代
def LGB_predict(train_x, train_y, test_x, res, eval_x=None, eval_y=None):
    """Fit a LightGBM binary classifier and write the test scores to disk.

    Args:
        train_x: Training features.
        train_y: Training labels.
        test_x: Features of the rows to score.
        res: DataFrame that receives a ``score`` column and is written to
            ``../data/submission.csv``.
        eval_x: Optional validation features used for early stopping.
        eval_y: Optional validation labels used for early stopping.

    Returns:
        The fitted ``lgb.LGBMClassifier``.

    The original body referenced ``eval_x`` / ``eval_y`` without defining
    them. When they are not supplied, a stratified 20% hold-out is carved
    out of the training data so early stopping has something to monitor.
    """
    print("LGB test")
    if eval_x is None or eval_y is None:
        from sklearn.model_selection import train_test_split
        train_x, eval_x, train_y, eval_y = train_test_split(
            train_x, train_y, test_size=0.2, random_state=2018, stratify=train_y
        )

    clf = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1, zero_as_missing=True,
        max_depth=-1, n_estimators=500, objective='binary',
        subsample=0.9, colsample_bytree=0.8, subsample_freq=1,
        learning_rate=0.1, min_child_weight=50, random_state=2018, n_jobs=100
    )
    # NOTE(review): early_stopping_rounds is passed to fit() as in the
    # original; newer LightGBM releases may expect a callback instead —
    # confirm against the installed version.
    clf.fit(train_x, train_y, eval_set=[(eval_x, eval_y)], eval_metric='auc', early_stopping_rounds=100)

    # Probability of the positive class, rounded to six decimals.
    res['score'] = clf.predict_proba(test_x)[:, 1]
    res['score'] = res['score'].apply(lambda x: float('%.6f' % x))
    res.to_csv('../data/submission.csv', index=False)
    os.system('zip baseline.zip ../data/submission.csv')
    return clf


model = LGB_predict(train_x, train_y, test_x, res)
模型优化(调参)
网格搜索 Grid Search
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Random-forest classifier to be tuned.
RFC = RandomForestClassifier()

# Candidate values explored by the grid search.
rf_param_grid = {"max_depth": [None],
                 "max_features": [1, 3, 10],
                 "min_samples_split": [2, 3, 10],
                 "min_samples_leaf": [1, 3, 10],
                 "bootstrap": [False],
                 "n_estimators": [100, 300],
                 "criterion": ["gini"]}

# Try every parameter combination and pick the best one by cross-validated
# accuracy. The original bound the search to `sRFC` but used `gsRFC` on every
# following line; the name is now consistent.
# NOTE(review): `kfold`, `X_train` and `Y_train` are not defined in this
# snippet — presumably a CV splitter and the training data; confirm.
gsRFC = GridSearchCV(RFC, param_grid=rf_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1)
gsRFC.fit(X_train, Y_train)

# Estimator refitted with the best parameter combination.
RFC_best = gsRFC.best_estimator_

# Best score
gsRFC.best_score_
自定义损失函数
import numpy as np
from sklearn.metrics import roc_curve
def online_score(y_true, y_pred):
    """Weighted sum of the TPR read off the ROC curve at three FPR targets.

    The targets are FPR = 0.001, 0.005 and 0.01 with weights 0.4, 0.3 and
    0.3. Returns the pair ``('online_score', value)``.

    NOTE(review): when used as a LightGBM ``eval_metric`` the library may
    expect a third ``is_higher_better`` element — confirm before passing this
    function to ``fit`` directly.
    """
    fpr, tpr, _ = roc_curve(y_true=y_true, y_score=y_pred)
    weighted_targets = ((0.4, 0.001), (0.3, 0.005), (0.3, 0.01))

    grades = 0.0
    for weight, target_fpr in weighted_targets:
        # An exact FPR match may not exist on the curve, so take the nearest point.
        nearest = np.argmin(np.abs(fpr - target_fpr))
        grades = grades + weight * tpr[nearest]
    return 'online_score', grades
交叉验证 Cross Validation
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import os
def LGB_predict(train_x, train_y, test_x, res):
    """Train LightGBM with 5-fold cross-validation and write averaged test scores.

    Args:
        train_x: Training features, indexable by an integer index array.
        train_y: Training labels, indexable the same way.
        test_x: Features of the rows to score.
        res: DataFrame that receives a ``score`` column and is written to
            ``./data/submission.csv``.

    Returns:
        The ``lgb.LGBMClassifier`` instance, which is refitted in every fold
        and therefore holds the model of the last fold.
    """
    def _lgb_online_score(y_true, y_pred):
        # LightGBM's scikit-learn API unpacks custom metrics as
        # (name, value, is_higher_better); online_score only returns the
        # first two, so the flag is added here. A larger TPR is better.
        name, value = online_score(y_true, y_pred)
        return name, value, True

    clf = lgb.LGBMClassifier(
        boosting_type='gbdt',
        n_estimators=2000,
        learning_rate=0.01,
        objective='binary',
        max_depth=-1,
        num_leaves=35,  # same role as max_depth: limits tree complexity to curb overfitting
        min_child_samples=50,  # guards against overfitting
        min_child_weight=0,
        min_split_gain=0.5,  # like gamma in XGBoost: minimum loss reduction needed for a further split
        max_bin=300,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0,  # L1 regularisation
        reg_lambda=100,  # L2 regularisation
        random_state=62,
        n_jobs=-1
    )
    print("LGB testing ...")
    train_predict = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    folds = KFold(n_splits=5, shuffle=True, random_state=67)
    for n_fold, (train_index, valid_index) in enumerate(folds.split(train_x, train_y)):
        train_x_sample, train_y_sample = train_x[train_index], train_y[train_index]
        valid_x_sample, valid_y_sample = train_x[valid_index], train_y[valid_index]
        # The custom online_score metric drives evaluation ('auc' is the built-in alternative).
        # NOTE(review): verbose / early_stopping_rounds are passed to fit() as
        # in the original; newer LightGBM releases may expect callbacks —
        # confirm against the installed version.
        clf.fit(train_x_sample, train_y_sample,
                eval_set=[(train_x_sample, train_y_sample), (valid_x_sample, valid_y_sample)],
                eval_metric=_lgb_online_score,  # 'auc'
                verbose=200,
                early_stopping_rounds=200
                )
        # Out-of-fold predictions for this fold's validation rows.
        train_predict[valid_index] = clf.predict_proba(valid_x_sample, num_iteration=clf.best_iteration_)[:, 1]
        # Each fold contributes 1/n_splits of the final test score.
        test_predict += clf.predict_proba(test_x, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
        train_predict[train_predict < 0] = 0
        print('第 {} 折 Valid_Metric: {}'.format(n_fold+1, online_score(valid_y_sample, train_predict[valid_index])))
    print("LGB test ok !")
    res['score'] = test_predict
    print("Generating output ...")
    res.to_csv('./data/submission.csv', index=False)
    #os.system('zip baseline.zip ./data/submission.csv')
    print("Output OK !")
    return clf


model = LGB_predict(train_x, train_y, test_x, res)
常用输出
表格输出
# Two-column submission table: the PassengerId column of test_df next to the
# predictions held in Y_pred, written without the index.
submission_columns = {
    "PassengerId": test_df["PassengerId"],
    "Survived": Y_pred,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv(path_or_buf='data/titanic.csv', index=False)
One-hot后表格输出
from sklearn import preprocessing

# One-hot encode the predicted labels in y2 and write them as a table.
encoder = preprocessing.LabelBinarizer()
encoder.fit(list(set(y2)))
one_hot_labels = encoder.transform(y2)

# Column names come from the fitted encoder so they always match the order of
# the one-hot columns (the original hard-coded ['EAP','HPL','MWS']).
# NOTE(review): with exactly two distinct labels LabelBinarizer emits a single
# column, which this layout does not handle — confirm y2 has 3+ classes.
prediction = pd.DataFrame(one_hot_labels, columns=encoder.classes_)

# to_csv returns None when given a path, so it is called separately and
# `prediction` keeps the DataFrame instead of being bound to None.
prediction.to_csv('data/author-pre.csv')
模型融合
基于得分的加权融合
import pandas as pd

# Gather the `score` column of each model's submission file into one frame.
score_files = {
    'nffm_7688': './nffm_final_preliminary/submission_nffm_7688.csv',
    'nffm_765': './nffm_final_preliminary/submission_nffm_765.csv',
    'lgb': 'data_preprocessing/submission2_p.csv',
}
result = pd.DataFrame()
for column_name, csv_path in score_files.items():
    result[column_name] = pd.read_csv(csv_path)['score']

# `a` is the weight of the combined nffm score; the lgb score gets 1 - a.
a = 0.7
sub = pd.read_csv('./nffm_final/submission_nffm_75866_0.csv')
nffm_blend = result['nffm_7688'] * 0.6 + result['nffm_765'] * 0.4
sub['score'] = (nffm_blend * a + result['lgb'] * (1 - a)).round(6)

print(sub['score'].describe())
sub.to_csv('./submission.csv', index=False)
归一化加权融合
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.8f' % x)
import numpy as np
np.set_printoptions(suppress=True)
from sklearn.preprocessing import MinMaxScaler


def as_num(x):
    """Format *x* as a fixed-point string with eight decimals (no scientific notation)."""
    return format(x, '.8f')


res1 = pd.read_csv('./res1.csv')
res2 = pd.read_csv('./res2.csv')

# Rescale each file's scores to [0, 1] independently, one fresh scaler per file.
scaler = MinMaxScaler()
scores_1 = np.array(res1['score']).reshape(-1, 1)
res1['score_mm_01'] = scaler.fit_transform(scores_1)

scaler = MinMaxScaler()
scores_2 = np.array(res2['score']).reshape(-1, 1)
res2['score_mm_02'] = scaler.fit_transform(scores_2)

# Join the two files on `id` and blend the rescaled scores 60 / 40.
result = res1.merge(res2, on='id')
result['score'] = result['score_mm_01'] * 0.6 + result['score_mm_02'] * 0.4

# Store the blended score as an eight-decimal string.
result['score'] = result['score'].apply(as_num)
result[['id', 'score']].to_csv('./submission.csv', index=False)
基于相关性的加权融合
计算模型之间最大信息系数(MIC),画热力图,选择相关性小的模型进行多模型加权融合。
import pandas as pd
import numpy as np
from minepy import MINE
import matplotlib.pyplot as plt

# Tick labels for the heat map, in the same order as the files read below.
fs = ['discret_5','R_7199','rank','discret_10','raw_rank','Py_717','Py_725','svm_6938']
out_nums = 8  # number of prediction files

score_files = [
    './avg_xgbs_discret_feature_5.csv',
    './R_7199.csv',
    './rank_feature_xgb_ensemble.csv',
    './avg_xgbs_discret_feature_10.csv',
    './based_on_select_rank_feature.csv',
    './xgb717.csv',
    './xgb725.csv',
    './svm6938.csv',
]
res = [pd.read_csv(path).score.values for path in score_files]


def _mic(first, second):
    """Return the MIC between two score vectors using a fresh MINE instance."""
    estimator = MINE()
    estimator.compute_score(first, second)
    return estimator.mic()


# Pairwise MIC matrix: cm[i][j] is the MIC between file i and file j.
cm = [[_mic(res[i], res[j]) for j in range(out_nums)] for i in range(out_nums)]


# Draw the heat map
def plot_confusion_matrix(cm, title, cmap=plt.cm.Blues):
    """Render *cm* as an image with a colour bar, labelling both axes with ``fs``."""
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(out_nums)
    plt.xticks(positions, fs, rotation=45)
    plt.yticks(positions, fs)
    plt.tight_layout()


plot_confusion_matrix(cm, title='mic')
plt.show()
选择彼此差异较大的模型(热力图中颜色最浅的几个区域)进行融合。可以利用排名来进行融合(一种 rank_avg 的融合方式):sum(1/rank * score);下面的示例代码使用的是固定权重(0.15 / 0.25 / 0.6)的加权平均。
import pandas as pd

xgb717 = pd.read_csv("xgb717.csv")
svm6938 = pd.read_csv('svm6938.csv')
xgb725 = pd.read_csv('xgb725.csv')

# Ids are taken from the first file only.
# NOTE(review): the three score columns are combined by row label, not by
# uid — presumably all files list the same rows in the same order; confirm.
uid = xgb717['uid']
score = 0.15 * xgb717['score'] + 0.25 * svm6938['score'] + 0.6 * xgb725['score']

result = pd.DataFrame({'uid': uid})
result['score'] = score
result.to_csv('submission.csv', index=None, encoding='utf-8')