The data comes from the Kaggle competition Titanic - Machine Learning from Disaster.
## Reading the data

```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
sns.set(style='whitegrid')
```
```python
train = pd.read_csv('../data/train.csv', index_col='PassengerId')
train.head()
```
## EDA

```python
sns.barplot(x='Pclass', y='Survived', data=train)
```
The higher the passenger class, the higher the survival rate.
```python
sns.barplot(x='Sex', y='Survived', data=train)
```
Women's survival rate is far higher than men's.
```python
age_kde = sns.FacetGrid(train, hue='Survived', aspect=3)
age_kde.map(sns.kdeplot, 'Age', shade=True)
age_kde.set(xlim=(0, train['Age'].max()))
age_kde.add_legend()
```
Children under 10 were more likely to survive than to die.
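A quick numeric version of the same claim (a sketch, reusing the `train` frame loaded above):

```python
# survival rate for under-10s vs everyone else; rows with missing Age
# fall into the False group because NaN < 10 evaluates to False
train.groupby(train['Age'] < 10)['Survived'].mean()
```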
```python
sns.barplot(x='SibSp', y='Survived', data=train)
```
Survival rate is highest when the number of siblings/spouses aboard is moderate.
```python
sns.barplot(x='Parch', y='Survived', data=train)
```
Survival rate is highest when the number of parents/children aboard is moderate.
```python
fare_kde = sns.FacetGrid(train, hue='Survived', aspect=3)
fare_kde.map(sns.kdeplot, 'Fare', shade=True)
fare_kde.set(xlim=(0, train['Fare'].max()))
fare_kde.add_legend()
```
Passengers who paid higher fares were more likely to survive. The distribution of `Fare` is skewed, so a log transform is needed.
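pandas can quantify that skew directly; a minimal check (using `np.log1p` here to sidestep zero fares):

```python
# skewness of the raw fares vs the log-transformed fares
print(train['Fare'].skew(), np.log1p(train['Fare']).skew())
```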
```python
sns.barplot(x='Embarked', y='Survived', data=train)
```
## Merging train and test for cleaning

```python
test = pd.read_csv('../data/test.csv', index_col='PassengerId')
test.head()
```
```python
# DataFrame.append was removed in pandas 2.0; pd.concat does the same job
full = pd.concat([train.drop('Survived', axis=1), test])
full.shape
```
There are no obvious outliers. `Age`, `Fare`, `Cabin`, and `Embarked` have missing values that need to be filled.
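A one-liner confirms which columns are missing and how badly (a sketch):

```python
# missing-value count per column, worst first
full.isnull().sum().sort_values(ascending=False).head()
```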
```python
full['Cabin'] = full['Cabin'].fillna('Unknown')
```
```python
full['Embarked'].value_counts()
```
`S` is by far the most common port, so use it for the missing values:

```python
full['Embarked'] = full['Embarked'].fillna('S')
```
```python
full[full['Fare'].isnull()]
```
The passenger with a missing fare is in third class and embarked at `S`, so fill with the mean fare of that group:

```python
full['Fare'] = full['Fare'].fillna(
    full[(full['Pclass'] == 3) & (full['Embarked'] == 'S')]['Fare'].mean())
```
Impute `Age` with a random forest.
```python
age_use = full[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
age_train = age_use[~age_use['Age'].isnull()]
age_test = age_use[age_use['Age'].isnull()]
age_y = age_train['Age']
age_train = age_train.drop('Age', axis=1)
age_test = age_test.drop('Age', axis=1)
age_X_train = pd.get_dummies(age_train)
age_X_test = pd.get_dummies(age_test)
```
```python
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(random_state=42, n_estimators=100)
rfr.fit(age_X_train, age_y)
rfr.score(age_X_train, age_y)
```
```python
age_pred = rfr.predict(age_X_test)
full.loc[full['Age'].isnull(), 'Age'] = age_pred
```
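It's worth eyeballing whether the imputed ages look plausible; a minimal sketch comparing the two distributions (reusing `age_y` and `age_pred` from above):

```python
# the imputed ages should roughly follow the observed distribution
sns.kdeplot(age_y, label='observed')
sns.kdeplot(age_pred, label='imputed')
plt.legend()
plt.show()
```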
## Feature engineering

```python
full['logFare'] = full['Fare'].apply(lambda x: np.log(x) if x > 0 else 0)
sns.kdeplot(full['logFare'])
```
Extract `Title` from `Name`.
```python
full['Title'] = full['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
full['Title'].value_counts()
```
```python
full.loc[~full['Title'].isin(['Mr', 'Miss', 'Mrs', 'Master']), 'Title'] = 'Other'
full['Title'].value_counts()
```
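To see whether the grouped titles carry signal, survival rate per title can be computed on the training rows (a sketch; `full` has no `Survived` column, so the label is pulled from `train` by index):

```python
# survival rate per Title, training rows only
title_surv = pd.concat([full.loc[train.index, 'Title'], train['Survived']], axis=1)
title_surv.groupby('Title')['Survived'].mean()
```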
Combine `SibSp` and `Parch` into new features.
```python
full['FamilySize'] = full['SibSp'] + full['Parch'] + 1
# FamilySize counts the passenger too, so travelling alone means FamilySize == 1
full['IsAlone'] = full['FamilySize'].apply(lambda x: 1 if x == 1 else 0)
```
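A quick look at survival rate by family size, again joining back to the training labels (a sketch):

```python
# training-row family sizes against the survival labels
sns.barplot(x=full.loc[train.index, 'FamilySize'], y=train['Survived'])
```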
Create a child indicator from age.
```python
full['IsChild'] = full['Age'].apply(lambda x: 1 if x < 10 else 0)
```
Count how many passengers share each ticket number and turn that into a feature.
```python
full['Ticket'].value_counts()
```
```python
# map each ticket to the number of passengers sharing it
tic_cnt = full['Ticket'].value_counts()
full['TicketCnt'] = full['Ticket'].map(tic_cnt)
```
Drop features that are no longer useful.
```python
full_use = full.drop(['Name', 'Ticket', 'Fare'], axis=1)
```
One-hot encode, then split back into training and test sets.
```python
full_dum = pd.get_dummies(full_use)
X_dum_train = full_dum[full_dum.index.isin(train.index.tolist())]
X_dum_test = full_dum[full_dum.index.isin(test.index.tolist())]
X_dum_train.shape, X_dum_test.shape
```
```python
y_train_use = train['Survived'].reset_index(drop=True)
```
Select features by mutual information.
```python
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
```
```python
mic = mutual_info_classif(X_dum_train, y_train_use, random_state=42)
# keep every feature with positive mutual information
k = mic.shape[0] - sum(mic <= 0)
# k is keyword-only in recent scikit-learn versions
kmic = SelectKBest(mutual_info_classif, k=k).fit(X_dum_train, y_train_use)
X_train_mic = kmic.transform(X_dum_train)
X_train_mic.shape
```
```python
X_test_mic = kmic.transform(X_dum_test)
```
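Which dummy columns survived the filter can be recovered from the selector's mask (a small sketch):

```python
# names of the features kept by SelectKBest, in original column order
selected = X_dum_train.columns[kmic.get_support()]
print(len(selected), selected.tolist())
```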
```python
np.savetxt('../data/X_train_mic.csv', X_train_mic, delimiter=',')
np.savetxt('../data/X_test_mic.csv', X_test_mic, delimiter=',')
```
## Model training

Commonly used classifiers:

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(
    X_train_mic, y_train_use, test_size=0.2, random_state=42)
```
### Random forest

```python
rf_param = {'criterion': ['gini', 'entropy']}
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf_cv = GridSearchCV(rf, rf_param, cv=5)
rf_cv.fit(X_train, y_train)
rf_cv.score(X_test, y_test)
```
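GridSearchCV stores the winning combination, which is how `criterion='entropy'` is chosen below (a quick check):

```python
# best hyperparameters and the corresponding cross-validation score
rf_cv.best_params_, rf_cv.best_score_
```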
```python
rf_score = []
for i in range(5, 11):
    rf = RandomForestClassifier(criterion='entropy', random_state=42,
                                n_estimators=100, max_depth=i)
    rf.fit(X_train, y_train)
    rf_score.append(rf.score(X_test, y_test))
plt.plot(range(5, 11), rf_score)
plt.show()
```
The best score is at `max_depth=6`.
```python
rf_score = []
for i in range(40, 80, 5):
    rf = RandomForestClassifier(criterion='entropy', max_depth=6,
                                random_state=42, n_estimators=i)
    rf.fit(X_train, y_train)
    rf_score.append(rf.score(X_test, y_test))
plt.plot(range(40, 80, 5), rf_score)
plt.show()
```
The best score is at `n_estimators=60`.
```python
rf = RandomForestClassifier(criterion='entropy', n_estimators=60, max_depth=6,
                            random_state=42)
rf.fit(X_train_mic, y_train_use)
rf_pred = rf.predict(X_test_mic)
```
```python
rf_sub = pd.DataFrame({'PassengerId': test.index.tolist(), 'Survived': rf_pred})
rf_sub.to_csv('../result/rf.csv', index=False)
```
### Logistic regression

```python
lr_param = {'solver': ['liblinear', 'lbfgs', 'sag']}
lr = LogisticRegression(random_state=42)
lr_cv = GridSearchCV(lr, lr_param, cv=5)
lr_cv.fit(X_train, y_train)
lr_cv.score(X_test, y_test)
```
```python
lr_score = []
for i in range(15, 21):
    lr = LogisticRegression(random_state=42, solver='lbfgs', max_iter=i)
    lr.fit(X_train, y_train)
    lr_score.append(lr.score(X_test, y_test))
plt.plot(range(15, 21), lr_score)
plt.show()
```
The best score is at `max_iter=19`.
```python
lr_score = []
for i in np.linspace(0.001, 0.05, 20):
    lr = LogisticRegression(random_state=42, solver='lbfgs', max_iter=19, C=i)
    lr.fit(X_train, y_train)
    lr_score.append(lr.score(X_test, y_test))
plt.plot(np.linspace(0.001, 0.05, 20), lr_score)
plt.show()
```
The best score is at `C=0.01`.
```python
lr = LogisticRegression(random_state=42, solver='lbfgs', max_iter=19, C=0.01)
lr.fit(X_train_mic, y_train_use)
lr_pred = lr.predict(X_test_mic)
```
```python
lr_sub = pd.DataFrame({'PassengerId': test.index.tolist(), 'Survived': lr_pred})
lr_sub.to_csv('../result/lr.csv', index=False)
```
### Support vector machine

```python
svc_param = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
svc = SVC(random_state=42)
svc_cv = GridSearchCV(svc, svc_param, cv=5)
svc_cv.fit(X_train, y_train)
svc_cv.score(X_test, y_test)
```
```python
svc_score = []
for i in np.linspace(0.01, 0.05, 10):
    svc = SVC(random_state=42, kernel='linear', C=i)
    svc.fit(X_train, y_train)
    svc_score.append(svc.score(X_test, y_test))
plt.plot(np.linspace(0.01, 0.05, 10), svc_score)
plt.show()
```
The best score is at `C=0.045`.
```python
svc = SVC(random_state=42, kernel='linear', C=0.045)
svc.fit(X_train_mic, y_train_use)
svc_pred = svc.predict(X_test_mic)
```
```python
svc_sub = pd.DataFrame({'PassengerId': test.index.tolist(), 'Survived': svc_pred})
svc_sub.to_csv('../result/svc.csv', index=False)
```
### XGBoost

```python
import xgboost as xgb

dtrain_mic = xgb.DMatrix(X_train_mic, y_train_use)
```
```python
# in the native API the key is 'objective', not 'obj'; 'silent', 'nfold' and
# 'n_estimators' are not booster parameters, so they are dropped here
xgb_param = {'objective': 'binary:logistic', 'subsample': 0.8, 'max_depth': 5,
             'eta': 0.3, 'gamma': 0, 'lambda': 1, 'alpha': 0,
             'colsample_bytree': 1, 'colsample_bylevel': 1,
             'colsample_bynode': 1, 'seed': 42}
num_round = 200
```
```python
xgb.cv(xgb_param, dtrain_mic, num_round, nfold=5, metrics='auc',
       early_stopping_rounds=50)
```
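With `early_stopping_rounds`, the DataFrame returned by `xgb.cv` is truncated at the best iteration, which is how the `n_estimators` value below was read off. A sketch that captures the result instead of just displaying it:

```python
# number of boosting rounds kept and the final mean test AUC
cv_res = xgb.cv(xgb_param, dtrain_mic, num_round, nfold=5, metrics='auc',
                early_stopping_rounds=50)
print(len(cv_res), cv_res['test-auc-mean'].iloc[-1])
```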
The cv result suggests `n_estimators=21`.
```python
cv_param = {'max_depth': range(2, 10, 2), 'min_child_weight': range(1, 6, 2)}
```
```python
# 'nfold' and 'silent' are not XGBClassifier arguments and were dropped
xgb_cv = GridSearchCV(
    estimator=XGBClassifier(objective='binary:logistic', subsample=0.8,
                            learning_rate=0.1, gamma=0, alpha=0,
                            colsample_bytree=1, colsample_bylevel=1,
                            colsample_bynode=1, n_estimators=21, seed=42),
    param_grid=cv_param, scoring='roc_auc', cv=5)
xgb_cv.fit(X_train, y_train)
xgb_cv.score(X_test, y_test)
```
```python
cv_param = {'max_depth': [7, 8, 9], 'min_child_weight': [2, 3, 4]}
```
```python
xgb_cv = GridSearchCV(
    estimator=XGBClassifier(objective='binary:logistic', subsample=0.8,
                            learning_rate=0.1, gamma=0, alpha=0,
                            colsample_bytree=1, colsample_bylevel=1,
                            colsample_bynode=1, n_estimators=21, seed=42),
    param_grid=cv_param, scoring='roc_auc', cv=5)
xgb_cv.fit(X_train, y_train)
xgb_cv.score(X_test, y_test)
```
```python
cv_param = {'gamma': [i / 10 for i in range(0, 5)]}
```
```python
xgb_cv = GridSearchCV(
    estimator=XGBClassifier(objective='binary:logistic', subsample=0.8,
                            learning_rate=0.1, alpha=0, colsample_bytree=1,
                            colsample_bylevel=1, colsample_bynode=1,
                            n_estimators=21, seed=42, max_depth=8,
                            min_child_weight=3),
    param_grid=cv_param, scoring='roc_auc', cv=5)
xgb_cv.fit(X_train, y_train)
xgb_cv.score(X_test, y_test)
```
Re-tune `n_estimators` with the updated parameters.
```python
# 'n_estimators' is ignored by xgb.cv (num_round controls the rounds)
xgb_param = {'objective': 'binary:logistic', 'subsample': 0.8, 'max_depth': 8,
             'learning_rate': 0.1, 'gamma': 0, 'lambda': 1, 'alpha': 0,
             'colsample_bytree': 1, 'colsample_bylevel': 1,
             'colsample_bynode': 1, 'min_child_weight': 3, 'seed': 42}
num_round = 200
```
```python
xgb.cv(xgb_param, dtrain_mic, num_round, nfold=5, metrics='auc',
       early_stopping_rounds=50)
```
The cv result suggests `n_estimators=26`.
```python
cv_param = {'subsample': [i / 10 for i in range(1, 10)],
            'colsample_bytree': [i / 10 for i in range(1, 10)]}
```
```python
xgb_cv = GridSearchCV(
    estimator=XGBClassifier(objective='binary:logistic', subsample=0.8,
                            learning_rate=0.1, alpha=0, gamma=0,
                            colsample_bytree=1, colsample_bylevel=1,
                            colsample_bynode=1, n_estimators=26, seed=42,
                            max_depth=8, min_child_weight=3),
    param_grid=cv_param, scoring='roc_auc', cv=5)
xgb_cv.fit(X_train, y_train)
xgb_cv.score(X_test, y_test)
```
```python
cv_param = {'alpha': [1e-5, 1e-2, 0, 0.1, 1, 100]}
```
```python
xgb_cv = GridSearchCV(
    estimator=XGBClassifier(objective='binary:logistic', subsample=0.6,
                            learning_rate=0.1, alpha=0, gamma=0,
                            colsample_bytree=0.6, colsample_bylevel=1,
                            colsample_bynode=1, n_estimators=26, seed=42,
                            max_depth=8, min_child_weight=3),
    param_grid=cv_param, scoring='roc_auc', cv=5)
xgb_cv.fit(X_train, y_train)
xgb_cv.score(X_test, y_test)
```
```python
xgbc = XGBClassifier(objective='binary:logistic', subsample=0.6,
                     learning_rate=0.1, alpha=0, gamma=0, colsample_bytree=0.6,
                     colsample_bylevel=1, colsample_bynode=1, n_estimators=26,
                     seed=42, max_depth=8, min_child_weight=3)
xgbc.fit(X_train, y_train)
xgbc.score(X_test, y_test)
```
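As a side check, the fitted booster's feature importances can be matched back to column names through the mutual-information mask (a sketch, reusing `kmic` and `X_dum_train` from the feature-selection step):

```python
# top features by importance; names recovered from the SelectKBest mask,
# which preserves the original column order
names = X_dum_train.columns[kmic.get_support()]
pd.Series(xgbc.feature_importances_, index=names).sort_values(ascending=False).head(10)
```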
```python
xgbc.fit(X_train_mic, y_train_use)
xgb_pred = xgbc.predict(X_test_mic)
```
```python
xgb_sub = pd.DataFrame({'PassengerId': test.index.tolist(), 'Survived': xgb_pred})
xgb_sub.to_csv('../result/xgb.csv', index=False)
```
## Model ensembling

Stack the four models: each base model produces out-of-fold predictions on the training set, so the second-level model never sees a prediction made on data the base model was trained on.

```python
def get_oof(clf, x_train, y_train, x_test):
    """Out-of-fold predictions for stacking."""
    from sklearn.model_selection import KFold

    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]
    # random_state only takes effect when shuffle=True
    kf = KFold(n_splits=5, shuffle=True, random_state=42).split(x_train)
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((5, ntest))
    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        clf.fit(x_tr, y_tr)
        # held-out fold predictions fill the training-level feature
        oof_train[test_index] = clf.predict(x_te)
        # real-test-set predictions are averaged over the folds
        oof_test_skf[i, :] = clf.predict(x_test)
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
```
```python
rf_oof_train, rf_oof_test = get_oof(rf, X_train_mic, y_train_use, X_test_mic)
lr_oof_train, lr_oof_test = get_oof(lr, X_train_mic, y_train_use, X_test_mic)
svc_oof_train, svc_oof_test = get_oof(svc, X_train_mic, y_train_use, X_test_mic)
xgb_oof_train, xgb_oof_test = get_oof(xgbc, X_train_mic, y_train_use, X_test_mic)
```
```python
train_l2 = np.hstack([rf_oof_train, lr_oof_train, svc_oof_train, xgb_oof_train])
test_l2 = np.hstack([rf_oof_test, lr_oof_test, svc_oof_test, xgb_oof_test])
X_train_l2, X_test_l2, y_train_l2, y_test_l2 = train_test_split(
    train_l2, y_train_use, test_size=0.2, random_state=42)
```
```python
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42, max_depth=3)
dt.fit(X_train_l2, y_train_l2)
dt.score(X_test_l2, y_test_l2)
```
```python
stack_pred = dt.predict(test_l2)
stack_sub = pd.DataFrame({'PassengerId': test.index.tolist(), 'Survived': stack_pred})
stack_sub.to_csv('../result/stack.csv', index=False)
```