[Algorithm] Tuning XGBoost hyperparameters with a genetic algorithm (GA)

A quick overview of building the model

First, install the xgboost and DEAP libraries. Since plain pip can be slow, you can install from a mirror by running the following in a terminal:

pip install -i https://pypi.tuna.tsinghua.edu.cn/simple <package>
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple xgboost
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple deap
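Once the installs finish, a quick import confirms both packages are available (a small check added here; your version numbers will vary):

import xgboost
import deap
print(xgboost.__version__, deap.__version__)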

Now we can start writing code. First, import the required libraries:

import xgboost as xgb
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
import numpy as np
import math
from deap import base, creator, tools, algorithms

Next, load the Boston housing dataset:

boston = load_boston()
X, y = boston.data, boston.target

Note that scikit-learn 1.2.0 and later no longer provide load_boston, so use scikit-learn 1.1.1 instead; installing it will automatically uninstall the newer version:

pip install scikit-learn==1.1.1 -i https://pypi.tuna.tsinghua.edu.cn/simple
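After reinstalling, you can verify which version is active (a quick check added here; it should print 1.1.1):

import sklearn
print(sklearn.__version__)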

Next, define the objective function, i.e. the cross-validation error of the XGBoost model:

def eval_xgb(individual):
    # Fold colsample_bytree back into [0, 1]; mutation can push it outside
    individual[6] = math.fabs(individual[6])
    while individual[6] > 1:
        individual[6] = individual[6] - 1
    params = {
        'max_depth': math.ceil(individual[0]),      # round up: must be an integer
        'learning_rate': math.fabs(individual[1]),  # absolute value: must be non-negative
        'n_estimators': individual[2],
        'gamma': individual[3],
        'min_child_weight': individual[4],
        'subsample': individual[5],
        'colsample_bytree': individual[6],
        'objective': 'reg:squarederror'
    }

    cv_results = xgb.cv(params=params, dtrain=dtrain, num_boost_round=100, nfold=5, metrics='rmse', early_stopping_rounds=10, seed=0)

    # DEAP expects fitness as a tuple, hence the trailing comma
    return cv_results['test-rmse-mean'].iloc[-1],

The objective function takes an individual (i.e. one parameter vector) as input and returns that individual's cross-validation error.
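As a quick sanity check, you can call the objective on a hand-written individual before wiring it into DEAP (a sketch added here; the trial values are arbitrary, and dtrain must exist because eval_xgb reads it as a module-level variable):

dtrain = xgb.DMatrix(X, label=y)  # eval_xgb looks this up at call time
trial = [5, 0.1, 100, 0.2, 1.0, 0.8, 0.9]  # max_depth, learning_rate, n_estimators, gamma, min_child_weight, subsample, colsample_bytree
print(eval_xgb(trial))  # a 1-tuple holding the mean CV RMSE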

Next, define the genetic algorithm's parameters and operators:

# Minimise a single objective (the CV RMSE), hence the -1.0 weight
creator.create('FitnessMin', base.Fitness, weights=(-1.0,))
creator.create('Individual', list, fitness=creator.FitnessMin)

toolbox = base.Toolbox()
# One random initialiser per gene, each with its own search range
toolbox.register('attr_max_depth', np.random.randint, 1, 10)
toolbox.register('attr_learning_rate', np.random.uniform, 0.01, 0.3)
toolbox.register('attr_n_estimators', np.random.randint, 50, 200)
toolbox.register('attr_gamma', np.random.uniform, 0, 1)
toolbox.register('attr_min_child_weight', np.random.uniform, 0.1, 10)
toolbox.register('attr_subsample', np.random.uniform, 0.5, 1)
toolbox.register('attr_colsample_bytree', np.random.uniform, 0.5, 1)
# An individual is one pass through the seven generators, in this fixed order
toolbox.register('individual', tools.initCycle, creator.Individual, (
    toolbox.attr_max_depth,
    toolbox.attr_learning_rate,
    toolbox.attr_n_estimators,
    toolbox.attr_gamma,
    toolbox.attr_min_child_weight,
    toolbox.attr_subsample,
    toolbox.attr_colsample_bytree), n=1)
toolbox.register('population', tools.initRepeat, list, toolbox.individual)
toolbox.register('evaluate', eval_xgb)
# Uniform crossover and Gaussian mutation, applied gene-wise with probability 0.1
toolbox.register('mate', tools.cxUniform, indpb=0.1)
toolbox.register('mutate', tools.mutGaussian, mu=0, sigma=0.1, indpb=0.1)
toolbox.register('select', tools.selTournament, tournsize=3)

Here we use random-number generators to initialise each parameter, and define the crossover and mutation operators.
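You can sample one individual to see the gene order, which must match the indexing used inside eval_xgb (an illustrative check; your random values will differ):

ind = toolbox.individual()
print(ind)  # e.g. [6, 0.15, 118, 0.43, 7.9, 0.63, 0.89]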

Finally, run the genetic algorithm:

np.random.seed(0)
dtrain = xgb.DMatrix(X, label=y)  # read by eval_xgb as a module-level variable

pop = toolbox.population(n=50)
hof = tools.HallOfFame(1)  # keeps the single best individual ever evaluated
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register('avg', np.mean)
stats.register('min', np.min)

pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=10, stats=stats, halloffame=hof, verbose=True)

best_ind = hof[0]
print('Best individual:', best_ind)
print('Best RMSE:', best_ind.fitness.values[0])

Here we use the eaSimple function to run the genetic algorithm with the given crossover and mutation probabilities. Once it finishes, we obtain the best individual and its corresponding RMSE.
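With the winner in hand, one plausible follow-up (a sketch, not part of the original post) is to refit a final booster, mirroring the cleanup transformations inside eval_xgb so the raw gene values become valid XGBoost parameters:

final_params = {
    'max_depth': math.ceil(best_ind[0]),
    'learning_rate': math.fabs(best_ind[1]),
    'gamma': best_ind[3],
    'min_child_weight': best_ind[4],
    'subsample': best_ind[5],
    'colsample_bytree': best_ind[6],
    'objective': 'reg:squarederror'
}
final_model = xgb.train(final_params, dtrain, num_boost_round=int(round(best_ind[2])))
print(final_model.predict(dtrain)[:5])  # predictions on the training matrix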

Packaging the model as a reusable function

Personally tested and working; feel free to leave a comment if you run into problems.

import xgboost as xgb
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
import numpy as np
from deap import base, creator, tools, algorithms
import math
def optimize_xgb(X, y, n_pop=5, n_gen=5, cxpb=0.5, mutpb=0.2):
    # Objective function: the XGBoost model's cross-validation error. It takes
    # an individual (one parameter vector) and returns its error as a 1-tuple.
    def eval_xgb(individual):
        # Fold colsample_bytree back into [0, 1]
        individual[6] = math.fabs(individual[6])
        while individual[6] > 1:
            individual[6] = individual[6] - 1

        params = {
            'max_depth': math.ceil(individual[0]),      # round up: must be an integer
            'learning_rate': math.fabs(individual[1]),  # absolute value: must be non-negative
            'n_estimators': individual[2],
            'gamma': individual[3],
            'min_child_weight': individual[4],
            'subsample': individual[5],
            'colsample_bytree': individual[6],
            'objective': 'reg:squarederror'
        }

        cv_results = xgb.cv(params=params, dtrain=dtrain, num_boost_round=100, nfold=5, metrics='rmse', early_stopping_rounds=10, seed=0)

        return cv_results['test-rmse-mean'].iloc[-1],

    # Set up the genetic algorithm: random initialisers for each parameter,
    # plus the crossover and mutation operators
    creator.create('FitnessMin', base.Fitness, weights=(-1.0,))
    creator.create('Individual', list, fitness=creator.FitnessMin)

    toolbox = base.Toolbox()
    toolbox.register('attr_max_depth', np.random.randint, 1, 10)
    toolbox.register('attr_learning_rate', np.random.uniform, 0.01, 0.3)
    toolbox.register('attr_n_estimators', np.random.randint, 50, 200)
    toolbox.register('attr_gamma', np.random.uniform, 0, 1)
    toolbox.register('attr_min_child_weight', np.random.uniform, 0.1, 10)
    toolbox.register('attr_subsample', np.random.uniform, 0.5, 1)
    toolbox.register('attr_colsample_bytree', np.random.uniform, 0.5, 1)
    toolbox.register('individual', tools.initCycle, creator.Individual, (
        toolbox.attr_max_depth,
        toolbox.attr_learning_rate,
        toolbox.attr_n_estimators,
        toolbox.attr_gamma,
        toolbox.attr_min_child_weight,
        toolbox.attr_subsample,
        toolbox.attr_colsample_bytree), n=1)
    toolbox.register('population', tools.initRepeat, list, toolbox.individual)
    toolbox.register('evaluate', eval_xgb)
    toolbox.register('mate', tools.cxUniform, indpb=0.1)
    toolbox.register('mutate', tools.mutGaussian, mu=0, sigma=0.1, indpb=0.1)
    toolbox.register('select', tools.selTournament, tournsize=3)


    # Run eaSimple with the given crossover and mutation probabilities; when it
    # finishes, the hall of fame holds the best individual and its RMSE
    np.random.seed(0)
    dtrain = xgb.DMatrix(X, label=y)

    pop = toolbox.population(n=n_pop)
    hof = tools.HallOfFame(1)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register('avg', np.mean)
    stats.register('min', np.min)

    pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=cxpb, mutpb=mutpb, ngen=n_gen, stats=stats, halloffame=hof, verbose=True)

    best_ind = hof[0]
    # Apply the same cleanup as eval_xgb, so the returned parameters match
    # what was actually evaluated (and are valid XGBoost values)
    best_params = {
        'max_depth': math.ceil(best_ind[0]),
        'learning_rate': math.fabs(best_ind[1]),
        'n_estimators': int(round(best_ind[2])),
        'gamma': best_ind[3],
        'min_child_weight': best_ind[4],
        'subsample': best_ind[5],
        'colsample_bytree': best_ind[6],
        'objective': 'reg:squarederror'
    }
    best_rmse = best_ind.fitness.values[0]
    print('Best individual:', best_ind)
    print('Best RMSE:', best_rmse)

    return best_params, best_rmse

# Here the whole pipeline is wrapped in a single function, optimize_xgb. It takes the training data X and labels y, plus the GA settings n_pop, n_gen, cxpb and mutpb, and returns the best XGBoost parameters together with the corresponding RMSE.

import pandas as pd
import numpy as np


# Example output from one run:
# Best individual: [8.048710533322954, 0.0867211275103418, 153, 0.45615033221654855, 5.72749609379962, 0.5093949002181776, 0.8088177485379385]
# Best RMSE: 3.4154928196132395

# load_boston was removed in scikit-learn 1.2+, so fetch the Boston data directly:
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]
# With scikit-learn <= 1.1.1 you could instead use:
# boston = load_boston()
# X, y = boston.data, boston.target
best_params, best_rmse = optimize_xgb(data, target)
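Because best_params now holds cleaned-up values (the fix applied inside optimize_xgb above), it can be passed straight to XGBoost's sklearn wrapper; a short usage sketch, assuming you want a fitted estimator object:

model = xgb.XGBRegressor(**best_params)  # the dict keys match XGBRegressor keyword arguments
model.fit(data, target)
print(model.predict(data[:5]))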
