{ "cells": [ { "cell_type": "markdown", "id": "21b25acb", "metadata": {}, "source": [ "# Hyperparameter Tuning\n" ] }, { "cell_type": "markdown", "id": "95df80d4", "metadata": {}, "source": [ "Four common approaches to hyperparameter tuning:\n", "1. Manual tuning\n", "2. Grid search: builds a grid of hyperparameter combinations and tries each one in turn; inefficient\n", "3. Random search: samples random combinations of values from the hyperparameter grid\n", "4. Automated tuning: e.g. Bayesian optimization" ] }, { "cell_type": "markdown", "id": "e5254808", "metadata": {}, "source": [ "Hyperparameter tuning is often an extremely time-consuming process.\n", "\n", "We split the data into training and validation sets to evaluate each hyperparameter configuration." ] }, { "cell_type": "code", "execution_count": 2, "id": "b806d0ac", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import lightgbm as lgb\n", "from sklearn.model_selection import train_test_split\n", "\n", "N_FOLDS = 5\n", "MAX_EVALS = 5\n" ] }, { "cell_type": "code", "execution_count": 3, "id": "f6a9c236", "metadata": {}, "outputs": [], "source": [ "features = pd.read_csv('data/application_train.csv')\n", "features = features.sample(n=16000)" ] }, { "cell_type": "code", "execution_count": 4, "id": "77c63340", "metadata": {}, "outputs": [], "source": [ "features = features.select_dtypes('number')" ] }, { "cell_type": "code", "execution_count": 14, "id": "420530e0", "metadata": {}, "outputs": [], "source": [ "labels = np.array(features['TARGET'].astype(np.int32))\n", "features = features.drop(columns = ['SK_ID_CURR', 'TARGET'])" ] }, { "cell_type": "code", "execution_count": 15, "id": "7450f3cc", "metadata": {}, "outputs": [], "source": [ "train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 6000, random_state = 50)" ] }, { "cell_type": "markdown", "id": "d7c30cd3", "metadata": {}, "source": [ "## Cross-validation" ] }, { "cell_type": "code", "execution_count": 16, "id": "7bf98aaf", "metadata": {}, "outputs": [], "source": [ "train_set = lgb.Dataset(data=train_features, label=train_labels)\n", "test_set = lgb.Dataset(data=test_features, label=test_labels)" ] }, { "cell_type": "code", "execution_count": 17, "id": "c14a9f75", "metadata": {}, 
"outputs": [], "source": [ "model = lgb.LGBMClassifier()" ] }, { "cell_type": "code", "execution_count": 19, "id": "2fb7a54d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'boosting_type': 'gbdt',\n", " 'class_weight': None,\n", " 'colsample_bytree': 1.0,\n", " 'importance_type': 'split',\n", " 'learning_rate': 0.1,\n", " 'max_depth': -1,\n", " 'min_child_samples': 20,\n", " 'min_child_weight': 0.001,\n", " 'min_split_gain': 0.0,\n", " 'n_estimators': 100,\n", " 'n_jobs': None,\n", " 'num_leaves': 31,\n", " 'objective': None,\n", " 'random_state': None,\n", " 'reg_alpha': 0.0,\n", " 'reg_lambda': 0.0,\n", " 'subsample': 1.0,\n", " 'subsample_for_bin': 200000,\n", " 'subsample_freq': 0}" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "default_params = model.get_params()\n", "default_params" ] }, { "cell_type": "code", "execution_count": 20, "id": "17c114b6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Help on function cv in module lightgbm.engine:\n", "\n", "cv(params: Dict[str, Any], train_set: lightgbm.basic.Dataset, num_boost_round: int = 100, folds: Union[Iterable[Tuple[numpy.ndarray, numpy.ndarray]], sklearn.model_selection._split.BaseCrossValidator, NoneType] = None, nfold: int = 5, stratified: bool = True, shuffle: bool = True, metrics: Union[str, List[str], NoneType] = None, feval: Union[Callable[[numpy.ndarray, lightgbm.basic.Dataset], Tuple[str, float, bool]], Callable[[numpy.ndarray, lightgbm.basic.Dataset], List[Tuple[str, float, bool]]], List[Union[Callable[[numpy.ndarray, lightgbm.basic.Dataset], Tuple[str, float, bool]], Callable[[numpy.ndarray, lightgbm.basic.Dataset], List[Tuple[str, float, bool]]]]], NoneType] = None, init_model: Union[str, pathlib.Path, lightgbm.basic.Booster, NoneType] = None, fpreproc: Optional[Callable[[lightgbm.basic.Dataset, lightgbm.basic.Dataset, Dict[str, Any]], Tuple[lightgbm.basic.Dataset, lightgbm.basic.Dataset, 
Dict[str, Any]]]] = None, seed: int = 0, callbacks: Optional[List[Callable]] = None, eval_train_metric: bool = False, return_cvbooster: bool = False) -> Dict[str, Union[List[float], lightgbm.engine.CVBooster]]\n", " Perform the cross-validation with given parameters.\n", " \n", " Parameters\n", " ----------\n", " params : dict\n", " Parameters for training. Values passed through ``params`` take precedence over those\n", " supplied via arguments.\n", " train_set : Dataset\n", " Data to be trained on.\n", " num_boost_round : int, optional (default=100)\n", " Number of boosting iterations.\n", " folds : generator or iterator of (train_idx, test_idx) tuples, scikit-learn splitter object or None, optional (default=None)\n", " If generator or iterator, it should yield the train and test indices for each fold.\n", " If object, it should be one of the scikit-learn splitter classes\n", " (https://scikit-learn.org/stable/modules/classes.html#splitter-classes)\n", " and have ``split`` method.\n", " This argument has highest priority over other data split arguments.\n", " nfold : int, optional (default=5)\n", " Number of folds in CV.\n", " stratified : bool, optional (default=True)\n", " Whether to perform stratified sampling.\n", " shuffle : bool, optional (default=True)\n", " Whether to shuffle before splitting data.\n", " metrics : str, list of str, or None, optional (default=None)\n", " Evaluation metrics to be monitored while CV.\n", " If not None, the metric in ``params`` will be overridden.\n", " feval : callable, list of callable, or None, optional (default=None)\n", " Customized evaluation function.\n", " Each evaluation function should accept two parameters: preds, eval_data,\n", " and return (eval_name, eval_result, is_higher_better) or list of such tuples.\n", " \n", " preds : numpy 1-D array or numpy 2-D array (for multi-class task)\n", " The predicted values.\n", " For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes].\n", " If custom 
objective function is used, predicted values are returned before any transformation,\n", " e.g. they are raw margin instead of probability of positive class for binary task in this case.\n", " eval_data : Dataset\n", " A ``Dataset`` to evaluate.\n", " eval_name : str\n", " The name of evaluation function (without whitespace).\n", " eval_result : float\n", " The eval result.\n", " is_higher_better : bool\n", " Is eval result higher better, e.g. AUC is ``is_higher_better``.\n", " \n", " To ignore the default metric corresponding to the used objective,\n", " set ``metrics`` to the string ``\"None\"``.\n", " init_model : str, pathlib.Path, Booster or None, optional (default=None)\n", " Filename of LightGBM model or Booster instance used for continue training.\n", " fpreproc : callable or None, optional (default=None)\n", " Preprocessing function that takes (dtrain, dtest, params)\n", " and returns transformed versions of those.\n", " seed : int, optional (default=0)\n", " Seed used to generate the folds (passed to numpy.random.seed).\n", " callbacks : list of callable, or None, optional (default=None)\n", " List of callback functions that are applied at each iteration.\n", " See Callbacks in Python API for more information.\n", " eval_train_metric : bool, optional (default=False)\n", " Whether to display the train metric in progress.\n", " The score of the metric is calculated again after each training step, so there is some impact on performance.\n", " return_cvbooster : bool, optional (default=False)\n", " Whether to return Booster models trained on each fold through ``CVBooster``.\n", " \n", " Note\n", " ----\n", " A custom objective function can be provided for the ``objective`` parameter.\n", " It should accept two parameters: preds, train_data and return (grad, hess).\n", " \n", " preds : numpy 1-D array or numpy 2-D array (for multi-class task)\n", " The predicted values.\n", " Predicted values are returned before any transformation,\n", " e.g. 
they are raw margin instead of probability of positive class for binary task.\n", " train_data : Dataset\n", " The training dataset.\n", " grad : numpy 1-D array or numpy 2-D array (for multi-class task)\n", " The value of the first order derivative (gradient) of the loss\n", " with respect to the elements of preds for each sample point.\n", " hess : numpy 1-D array or numpy 2-D array (for multi-class task)\n", " The value of the second order derivative (Hessian) of the loss\n", " with respect to the elements of preds for each sample point.\n", " \n", " For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes],\n", " and grad and hess should be returned in the same format.\n", " \n", " Returns\n", " -------\n", " eval_results : dict\n", " History of evaluation results of each metric.\n", " The dictionary has the following format:\n", " {'valid metric1-mean': [values], 'valid metric1-stdv': [values],\n", " 'valid metric2-mean': [values], 'valid metric2-stdv': [values],\n", " ...}.\n", " If ``return_cvbooster=True``, also returns trained boosters wrapped in a ``CVBooster`` object via ``cvbooster`` key.\n", " If ``eval_train_metric=True``, also returns the train metric history.\n", " In this case, the dictionary has the following format:\n", " {'train metric1-mean': [values], 'valid metric1-mean': [values],\n", " 'train metric2-mean': [values], 'valid metric2-mean': [values],\n", " ...}.\n", "\n" ] } ], "source": [ "help(lgb.cv)" ] }, { "cell_type": "code", "execution_count": 22, "id": "8da68ea1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[LightGBM] [Warning] Unknown parameter: importance_type\n", "[LightGBM] [Warning] Unknown parameter: importance_type\n", "[LightGBM] [Warning] Unknown parameter: importance_type\n", "[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007028 seconds.\n", "You can set `force_col_wise=true` to remove the overhead.\n", "[LightGBM] 
[Info] Total Bins 9966\n", "[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 93\n", "[LightGBM] [Warning] Unknown parameter: importance_type\n", "[LightGBM] [Warning] Unknown parameter: importance_type\n", "[LightGBM] [Warning] Unknown parameter: importance_type\n", "[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003572 seconds.\n", "You can set `force_col_wise=true` to remove the overhead.\n", "[LightGBM] [Info] Total Bins 9966\n", "[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 93\n", "[LightGBM] [Warning] Unknown parameter: importance_type\n", "[LightGBM] [Warning] Unknown parameter: importance_type\n", "[LightGBM] [Warning] Unknown parameter: importance_type\n", "[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004317 seconds.\n", "You can set `force_col_wise=true` to remove the overhead.\n", "[LightGBM] [Info] Total Bins 9966\n", "[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 93\n", "[LightGBM] [Warning] Unknown parameter: importance_type\n", "[LightGBM] [Warning] Unknown parameter: importance_type\n", "[LightGBM] [Warning] Unknown parameter: importance_type\n", "[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004565 seconds.\n", "You can set `force_col_wise=true` to remove the overhead.\n", "[LightGBM] [Info] Total Bins 9966\n", "[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 93\n", "[LightGBM] [Warning] Unknown parameter: importance_type\n", "[LightGBM] [Warning] Unknown parameter: importance_type\n", "[LightGBM] [Warning] Unknown parameter: importance_type\n", "[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004165 seconds.\n", "You can set `force_col_wise=true` to remove the overhead.\n", "[LightGBM] [Info] Total Bins 9966\n", 
"[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 93\n", "[LightGBM] [Warning] Unknown parameter: importance_type\n", "[LightGBM] [Info] Start training from score 0.077125\n", "[LightGBM] [Info] Start training from score 0.077125\n", "[LightGBM] [Info] Start training from score 0.077125\n", "[LightGBM] [Info] Start training from score 0.077125\n", "[LightGBM] [Info] Start training from score 0.077000\n", "Training until validation scores don't improve for 200 rounds\n", "Did not meet early stopping. Best iteration is:\n", "[14]\tvalid's auc: 0.706029 + 0.0223023\n" ] } ], "source": [ "cv_results = lgb.cv(\n", "    default_params,\n", "    train_set,\n", "    metrics='auc',\n", "    num_boost_round=10000,\n", "    nfold=N_FOLDS,\n", "    callbacks=[\n", "        lgb.early_stopping(stopping_rounds=200),  # stop automatically if the validation AUC has not improved for 200 consecutive iterations\n", "    ]\n", ")" ] }, { "cell_type": "code", "execution_count": 23, "id": "a1e0038a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'valid auc-mean': [0.65323335141481,\n", " 0.6775583880504371,\n", " 0.6817425581169888,\n", " 0.6873531227080611,\n", " 0.6898062112949138,\n", " 0.6939104626960922,\n", " 0.6957445684624461,\n", " 0.6966996317241883,\n", " 0.699616249256568,\n", " 0.6992604166394905,\n", " 0.7009808396041705,\n", " 0.7013666166946605,\n", " 0.7032957304365313,\n", " 0.7060287134596595],\n", " 'valid auc-stdv': [0.024272781101888612,\n", " 0.01812150742035603,\n", " 0.024102039428268655,\n", " 0.019996301858114098,\n", " 0.021825004316099888,\n", " 0.020519177494757675,\n", " 0.025953258778170885,\n", " 0.021477480378160406,\n", " 0.02144838863473038,\n", " 0.018759257870372713,\n", " 0.01978071914621832,\n", " 0.019988042613973512,\n", " 0.0220677550289575,\n", " 0.022302328855071645]}" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cv_results" ] }, { "cell_type": "code", "execution_count": null, "id": "0643f809", "metadata": {}, "outputs": [], 
"source": [] } ], "metadata": { "kernelspec": { "display_name": "data-analysis", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.25" } }, "nbformat": 4, "nbformat_minor": 5 }