Kaggle 上有这样一个经典的题目:根据船上乘客的基本信息,预测其余乘客能否生存下来。话不多说,直接进入主题。
下载数据集
包含了源代码 + 训练集 + 测试集
整理数据
- 这一步主要处理缺失的数据:
- 将年龄、票价等数值型字段的缺失值用平均值代替
- 将登船口的缺失值用众数代替
def select_data():
    """Load the train/test sets and return model-ready feature frames.

    Keeps the columns listed in ``selected_features``, fills missing values
    (column mean for ``Age`` and ``Fare``, ``'S'`` for ``Embarked``) and
    encodes the categorical columns through ``format_data``.

    ``load_data`` is defined elsewhere; it is assumed to return a pandas
    DataFrame for the training set by default and for the test set when
    called with ``"test"`` -- TODO confirm against its definition.

    Returns:
        A ``(train_x, train_y, test_x)`` tuple: the training features, the
        ``Survived`` column of the training set, and the test features.
    """
    selected_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    train_data = load_data()
    test_data = load_data("test")

    # Work on explicit copies: filling a column of a slice with
    # ``inplace=True`` is chained assignment, which triggers
    # SettingWithCopyWarning and is a no-op under pandas Copy-on-Write.
    train_x = train_data[selected_features].copy()
    train_y = train_data["Survived"]
    test_x = test_data[selected_features].copy()

    # Assign the filled column back instead of mutating it in place.
    train_x["Age"] = train_x["Age"].fillna(train_x["Age"].mean())
    # Per the original author, 'S' is the most frequent value, so it is
    # used as the fill value for missing embarkation ports.
    train_x["Embarked"] = train_x["Embarked"].fillna('S')
    test_x["Age"] = test_x["Age"].fillna(test_x["Age"].mean())
    test_x["Fare"] = test_x["Fare"].fillna(test_x["Fare"].mean())

    train_x = format_data(train_x)
    test_x = format_data(test_x)

    # DataFrame.info() prints its report itself and returns None, so it is
    # not wrapped in print() (that would emit a stray "None" line).
    test_x.info()
    return train_x, train_y, test_x
数据数字化
- 将性别、登船口用数字的形式来表示,方便训练
def format_data(train_x):
    """Encode the ``Sex`` and ``Embarked`` columns as integers.

    ``Sex``: ``"male"`` -> 0, ``"female"`` -> 1.
    ``Embarked``: ``"S"`` -> 0, ``"C"`` -> 1, ``"Q"`` -> 2.

    Values that are not in these tables (including missing values) are left
    unchanged. The frame is modified in place and also returned.

    Args:
        train_x: DataFrame containing ``Sex`` and ``Embarked`` columns.

    Returns:
        The same DataFrame object, with both columns re-encoded.
    """
    sex_codes = {"male": 0, "female": 1}
    embarked_codes = {"S": 0, "C": 1, "Q": 2}

    # Replace each column wholesale via map() rather than writing integers
    # into a string column through .loc masks: the mask approach leaves the
    # column with object dtype and fails on columns that use a dedicated
    # string dtype. ``codes.get(v, v)`` keeps unrecognised values as-is.
    train_x["Sex"] = train_x["Sex"].map(lambda v: sex_codes.get(v, v))
    train_x["Embarked"] = train_x["Embarked"].map(lambda v: embarked_codes.get(v, v))
    return train_x
训练模型,并预测,写入到文件当中
def random_forest():
    """Grid-search a random forest, predict the test set, write result.csv.

    Runs a 5-fold ``GridSearchCV`` over ``n_estimators``, ``criterion`` and
    ``max_depth`` of a ``RandomForestClassifier`` on the data returned by
    ``select_data``, prints the best score and parameters, and writes one
    ``id,prediction`` line per test row to ``./result.csv``.
    """
    test_data = load_data('test')
    x_train, y_train, x_test = select_data()

    param_grid = {
        'n_estimators': np.arange(10, 100, 10),
        'criterion': ['gini', 'entropy'],
        'max_depth': np.arange(5, 50, 5),
    }
    search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, verbose=1, n_jobs=-1)
    search.fit(x_train, y_train)
    predictions = search.predict(x_test)

    print('best score:', search.best_score_)
    print('best parameters:', search.best_params_)
    print((test_data))

    with open('./result.csv', 'w', encoding="utf-8") as out_file:
        out_file.write("PassengerId,Survived" + "\n")
        # The id is read from the first column of the raw test frame
        # (presumably PassengerId -- verify against load_data).
        for row, label in enumerate(predictions):
            out_file.write(str(test_data.iloc[row, 0]) + "," + str(label) + "\n")
文章出处登录后可见!
已经登录?立即刷新