diff --git a/.gitignore b/.gitignore index ca8c907..008e237 100644 --- a/.gitignore +++ b/.gitignore @@ -134,4 +134,8 @@ catboost_info/catboost_training.json catboost_info/learn/events.out.tfevents catboost_info/learn_error.tsv catboost_info/time_left.tsv -*.ipynb +notebooks/catboost_info/catboost_training.json +notebooks/catboost_info/learn/events.out.tfevents +notebooks/catboost_info/learn_error.tsv +notebooks/catboost_info/time_left.tsv +.DS_Store diff --git a/notebooks/demo_linearboost_usage.ipynb b/notebooks/demo_linearboost_usage.ipynb new file mode 100644 index 0000000..7b3b656 --- /dev/null +++ b/notebooks/demo_linearboost_usage.ipynb @@ -0,0 +1,530 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "sys.path.append(\"../src\")\n", + "from linearboost.linear_boost import LinearBoostClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: xgboost in /Users/hamidrezakeshavarz/Documents/GitHub/linearboost-classifier/.venv/lib/python3.13/site-packages (3.0.2)\n", + "Requirement already satisfied: lightgbm in /Users/hamidrezakeshavarz/Documents/GitHub/linearboost-classifier/.venv/lib/python3.13/site-packages (4.6.0)\n", + "Requirement already satisfied: catboost in /Users/hamidrezakeshavarz/Documents/GitHub/linearboost-classifier/.venv/lib/python3.13/site-packages (1.2.8)\n", + "Requirement already satisfied: numpy in /Users/hamidrezakeshavarz/Documents/GitHub/linearboost-classifier/.venv/lib/python3.13/site-packages (from xgboost) (2.2.6)\n", + "Requirement already satisfied: scipy in /Users/hamidrezakeshavarz/Documents/GitHub/linearboost-classifier/.venv/lib/python3.13/site-packages (from xgboost) (1.15.3)\n", + "Requirement already satisfied: graphviz in 
/Users/hamidrezakeshavarz/Documents/GitHub/linearboost-classifier/.venv/lib/python3.13/site-packages (from catboost) (0.21)\n", + "Requirement already satisfied: matplotlib in /Users/hamidrezakeshavarz/Documents/GitHub/linearboost-classifier/.venv/lib/python3.13/site-packages (from catboost) (3.10.3)\n", + "Requirement already satisfied: pandas>=0.24 in /Users/hamidrezakeshavarz/Documents/GitHub/linearboost-classifier/.venv/lib/python3.13/site-packages (from catboost) (2.3.1)\n", + "Requirement already satisfied: plotly in /Users/hamidrezakeshavarz/Documents/GitHub/linearboost-classifier/.venv/lib/python3.13/site-packages (from catboost) (6.2.0)\n", + "Requirement already satisfied: six in /Users/hamidrezakeshavarz/Documents/GitHub/linearboost-classifier/.venv/lib/python3.13/site-packages (from catboost) (1.17.0)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /Users/hamidrezakeshavarz/Documents/GitHub/linearboost-classifier/.venv/lib/python3.13/site-packages (from pandas>=0.24->catboost) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in /Users/hamidrezakeshavarz/Documents/GitHub/linearboost-classifier/.venv/lib/python3.13/site-packages (from pandas>=0.24->catboost) (2025.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /Users/hamidrezakeshavarz/Documents/GitHub/linearboost-classifier/.venv/lib/python3.13/site-packages (from pandas>=0.24->catboost) (2025.2)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /Users/hamidrezakeshavarz/Documents/GitHub/linearboost-classifier/.venv/lib/python3.13/site-packages (from matplotlib->catboost) (1.3.3)\n", + "Requirement already satisfied: cycler>=0.10 in /Users/hamidrezakeshavarz/Documents/GitHub/linearboost-classifier/.venv/lib/python3.13/site-packages (from matplotlib->catboost) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /Users/hamidrezakeshavarz/Documents/GitHub/linearboost-classifier/.venv/lib/python3.13/site-packages (from matplotlib->catboost) 
(4.59.0)\n", + "Requirement already satisfied: kiwisolver>=1.3.1 in /Users/hamidrezakeshavarz/Documents/GitHub/linearboost-classifier/.venv/lib/python3.13/site-packages (from matplotlib->catboost) (1.4.8)\n", + "Requirement already satisfied: packaging>=20.0 in /Users/hamidrezakeshavarz/Documents/GitHub/linearboost-classifier/.venv/lib/python3.13/site-packages (from matplotlib->catboost) (25.0)\n", + "Requirement already satisfied: pillow>=8 in /Users/hamidrezakeshavarz/Documents/GitHub/linearboost-classifier/.venv/lib/python3.13/site-packages (from matplotlib->catboost) (11.3.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /Users/hamidrezakeshavarz/Documents/GitHub/linearboost-classifier/.venv/lib/python3.13/site-packages (from matplotlib->catboost) (3.2.3)\n", + "Requirement already satisfied: narwhals>=1.15.1 in /Users/hamidrezakeshavarz/Documents/GitHub/linearboost-classifier/.venv/lib/python3.13/site-packages (from plotly->catboost) (2.0.0)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install xgboost lightgbm catboost" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "from ucimlrepo import fetch_ucirepo\n", + "from sklearn.preprocessing import LabelEncoder\n", + "\n", + "# The Haberman's Survival dataset's id on the UCI Machine Learning Repository\n", + "dataset_id = 43\n", + "\n", + "dataset = fetch_ucirepo(id=dataset_id)\n", + "\n", + "# data (as pandas dataframes)\n", + "X = dataset.data.features.copy()\n", + "y = dataset.data.targets\n", + "\n", + "label_encoder = LabelEncoder()\n", + "\n", + "y = label_encoder.fit_transform(y.values.ravel())" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "# Identify categorical columns\n", + "categorical_cols = X.select_dtypes(include=[\"object\"]).columns.tolist()\n", + "\n", + "# Convert 
categorical columns to 'category' dtype\n", + "for col in categorical_cols:\n", + " X[col] = X[col].astype(\"category\")\n", + "\n", + "# Handle missing values\n", + "# Fill numeric columns with median\n", + "numeric_cols = X.select_dtypes(include=[\"int64\", \"float64\"]).columns.tolist()\n", + "for col in numeric_cols:\n", + " X[col] = X[col].fillna(X[col].median())\n", + "\n", + "# Fill categorical columns with mode\n", + "for col in categorical_cols:\n", + " X[col] = X[col].fillna(X[col].mode()[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\", message=\".*ignore_implicit_zeros.*\")\n", + "warnings.filterwarnings(\"ignore\", message=\".*n_quantiles.*\")\n", + "warnings.filterwarnings(\"ignore\", category=RuntimeWarning)\n", + "warnings.filterwarnings(\"ignore\", category=FutureWarning)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**LinearBoost results:**" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[I 2025-07-28 20:32:30,509] A new study created in memory with name: no-name-39689138-b2a1-447a-af63-be7733a7aeb8\n", + "[I 2025-07-28 20:32:30,764] Trial 0 finished with value: 0.7283291353857195 and parameters: {'n_estimators': 256, 'learning_rate': 0.11807746849928968, 'algorithm': 'SAMME', 'scaler': 'minmax', 'kernel': 'rbf', 'gamma': 0.10360685199357951}. Best is trial 0 with value: 0.7283291353857195.\n", + "[I 2025-07-28 20:32:32,767] Trial 1 finished with value: 0.7323671972329208 and parameters: {'n_estimators': 363, 'learning_rate': 0.013883181171194234, 'algorithm': 'SAMME', 'scaler': 'robust', 'kernel': 'rbf', 'gamma': 0.3980809182349502}. 
Best is trial 1 with value: 0.7323671972329208.\n", + "...", + "[I 2025-07-28 20:34:42,444] Trial 199 finished with value: 0.7515000210035615 and parameters: {'n_estimators': 245, 'learning_rate': 0.06109405565734974, 'algorithm': 'SAMME', 'scaler': 'minmax', 'kernel': 'rbf', 'gamma': 0.08461411124525335}. Best is trial 167 with value: 0.7583125868901313.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best trial:\n", + "F1 Score: 0.7583125868901313\n", + "Parameters: \n", + "n_estimators: 384\n", + "learning_rate: 0.06667610599938725\n", + "algorithm: SAMME\n", + "scaler: robust\n", + "kernel: rbf\n", + "gamma: 0.002777056559327566\n" + ] + } + ], + "source": [ + "import optuna\n", + "import numpy as np\n", + "from sklearn.model_selection import StratifiedKFold, cross_val_score\n", + "\n", + "\n", + "def custom_loss(y_true, y_pred, weights):\n", + " return np.mean(weights * (y_true - y_pred) ** 2)\n", + "\n", + "\n", + "df = X\n", + "\n", + "# One-hot encoding\n", + "cat_features = list(df.select_dtypes(include=[\"object\", \"category\"]).columns)\n", + "for col in cat_features:\n", + " df_onehot = pd.get_dummies(df[col], prefix=col)\n", + " df = df.drop(col, axis=1)\n", + " df = pd.concat([df_onehot, df], axis=1)\n", + "\n", + "\n", + "def objective(trial):\n", + " params = {\n", + " \"n_estimators\": trial.suggest_int(\"n_estimators\", 10, 500),\n", + " \"learning_rate\": trial.suggest_float(\"learning_rate\", 0.01, 1.0, log=True),\n", + " \"algorithm\": trial.suggest_categorical(\"algorithm\", [\"SAMME\", \"SAMME.R\"]),\n", + " \"scaler\": trial.suggest_categorical(\n", + " \"scaler\", [\"minmax\", \"robust\", \"quantile-uniform\", \"quantile-normal\"]\n", + " ),\n", + " \"kernel\": trial.suggest_categorical(\n", + " \"kernel\", [\"linear\", \"rbf\", \"poly\", \"sigmoid\"]\n", + " ),\n", + " }\n", + "\n", + " if params[\"kernel\"] != \"linear\":\n", + " params[\"gamma\"] = trial.suggest_float(\"gamma\", 1e-3, 10.0, log=True)\n", + 
" if params[\"kernel\"] == \"poly\":\n", + " params[\"degree\"] = trial.suggest_int(\"degree\", 2, 5)\n", + " if params[\"kernel\"] in [\"poly\", \"sigmoid\"]:\n", + " params[\"coef0\"] = trial.suggest_float(\"coef0\", 0.0, 1.0)\n", + "\n", + " # Using a custom loss function here\n", + " # params['loss_function'] = custom_loss\n", + "\n", + " model = LinearBoostClassifier(**params)\n", + "\n", + " scores = cross_val_score(\n", + " estimator=model,\n", + " X=df,\n", + " y=y,\n", + " scoring=\"f1_weighted\",\n", + " cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),\n", + " )\n", + "\n", + " return scores.mean()\n", + "\n", + "\n", + "# Create an Optuna study and optimize the objective function\n", + "study = optuna.create_study(direction=\"maximize\")\n", + "study.optimize(objective, n_trials=200)\n", + "\n", + "# Display the best trial's results\n", + "print(\"Best trial:\")\n", + "trial = study.best_trial\n", + "\n", + "print(f\"F1 Score: {trial.value}\")\n", + "print(\"Parameters: \")\n", + "for key, value in trial.params.items():\n", + " print(f\"{key}: {value}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**XGBoost results:**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[I 2025-07-28 20:34:58,357] A new study created in memory with name: no-name-fd2e9753-4f61-49f9-a22c-28b4c0177247\n", + "[I 2025-07-28 20:34:59,157] Trial 0 finished with value: 0.653658178784821 and parameters: {'n_estimators': 122, 'max_depth': 18, 'learning_rate': 0.6638400324564625, 'gamma': 1.3755330590290604e-05, 'min_child_weight': 6, 'subsample': 0.5447494156166959, 'colsample_bytree': 0.8879793066877644, 'reg_alpha': 1.0163427568069608e-07, 'reg_lambda': 2.39079916775675e-06}. 
Best is trial 0 with value: 0.653658178784821.\n", + "[I 2025-07-28 20:34:59,681] Trial 1 finished with value: 0.6770776702365993 and parameters: {'n_estimators': 317, 'max_depth': 20, 'learning_rate': 0.13249519981914618, 'gamma': 1.5953454630407348e-08, 'min_child_weight': 8, 'subsample': 0.9709077195029082, 'colsample_bytree': 0.9624785240081783, 'reg_alpha': 2.203094364625466e-07, 'reg_lambda': 1.4395372969777643e-08}. Best is trial 1 with value: 0.6770776702365993.\n", + "...", + "[I 2025-07-28 20:35:10,555] Trial 199 finished with value: 0.716699568607657 and parameters: {'n_estimators': 736, 'max_depth': 14, 'learning_rate': 0.5430003212800782, 'gamma': 0.00012981585330188922, 'min_child_weight': 7, 'subsample': 0.988996549758597, 'colsample_bytree': 0.5778156428539348, 'reg_alpha': 0.0012464630655151046, 'reg_lambda': 0.00018326860521084625}. Best is trial 22 with value: 0.7218094394645215.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best trial:\n", + "F1 Score: 0.721809\n", + "Parameters:\n", + "n_estimators: 395\n", + "max_depth: 14\n", + "learning_rate: 0.5516744736853054\n", + "gamma: 0.06586236308160907\n", + "min_child_weight: 7\n", + "subsample: 0.934260221749137\n", + "colsample_bytree: 0.5657192375418623\n", + "reg_alpha: 0.0004839456623687217\n", + "reg_lambda: 0.024086595827121155\n" + ] + } + ], + "source": [ + "import xgboost as xgb\n", + "\n", + "\n", + "def objective(trial):\n", + " params = {\n", + " \"objective\": \"binary:logistic\",\n", + " \"use_label_encoder\": False,\n", + " \"n_estimators\": trial.suggest_int(\"n_estimators\", 20, 1000),\n", + " \"max_depth\": trial.suggest_int(\"max_depth\", 1, 20),\n", + " \"learning_rate\": trial.suggest_float(\"learning_rate\", 0.01, 0.7),\n", + " \"gamma\": trial.suggest_float(\"gamma\", 1e-8, 1.0, log=True),\n", + " \"min_child_weight\": trial.suggest_int(\"min_child_weight\", 1, 10),\n", + " \"subsample\": trial.suggest_float(\"subsample\", 0.5, 1.0),\n", + " 
\"colsample_bytree\": trial.suggest_float(\"colsample_bytree\", 0.5, 1.0),\n", + " \"reg_alpha\": trial.suggest_float(\"reg_alpha\", 1e-8, 1.0, log=True),\n", + " \"reg_lambda\": trial.suggest_float(\"reg_lambda\", 1e-8, 1.0, log=True),\n", + " \"enable_categorical\": True,\n", + " \"eval_metric\": \"logloss\",\n", + " \"verbosity\": 0,\n", + " }\n", + "\n", + " model = xgb.XGBClassifier(**params)\n", + "\n", + " scores = cross_val_score(\n", + " estimator=model,\n", + " X=X,\n", + " y=y,\n", + " scoring=\"f1_weighted\",\n", + " cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),\n", + " n_jobs=-1,\n", + " )\n", + "\n", + " return scores.mean()\n", + "\n", + "\n", + "study = optuna.create_study(direction=\"maximize\")\n", + "study.optimize(objective, n_trials=200)\n", + "\n", + "best_trial = study.best_trial\n", + "\n", + "print(\"Best trial:\")\n", + "print(f\"F1 Score: {best_trial.value:.6f}\")\n", + "print(\"Parameters:\")\n", + "for k, v in best_trial.params.items():\n", + " print(f\"{k}: {v}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**LightGBM results:**" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[I 2025-07-28 20:39:17,046] A new study created in memory with name: no-name-d640d2f2-b93d-4408-a7ea-9161e95491e3\n", + "[I 2025-07-28 20:39:17,550] Trial 0 finished with value: 0.6231313020966095 and parameters: {'boosting_type': 'goss', 'num_leaves': 157, 'learning_rate': 0.011930732887702704, 'n_estimators': 104, 'max_depth': 8, 'min_child_samples': 71, 'subsample': 0.9402086280771477, 'colsample_bytree': 0.7682647640164344, 'reg_alpha': 1.4241168044547617e-08, 'reg_lambda': 2.0676712941542342e-07, 'min_split_gain': 0.0033415036895292826, 'cat_smooth': 97, 'cat_l2': 0.6970465070605171}. 
Best is trial 0 with value: 0.6231313020966095.\n", + "[I 2025-07-28 20:39:32,638] Trial 1 finished with value: 0.6231313020966095 and parameters: {'boosting_type': 'dart', 'num_leaves': 175, 'learning_rate': 0.0023119406442152646, 'n_estimators': 824, 'max_depth': 8, 'min_child_samples': 45, 'subsample': 0.9676678537059415, 'colsample_bytree': 0.5552007172897914, 'reg_alpha': 2.556389565074754e-07, 'reg_lambda': 1.8954788415197908e-06, 'min_split_gain': 9.567889959044472e-08, 'cat_smooth': 8, 'cat_l2': 0.012629197180178365}. Best is trial 0 with value: 0.6231313020966095.\n", + "...", + "[I 2025-07-28 21:09:32,868] Trial 199 finished with value: 0.7127633201595156 and parameters: {'boosting_type': 'dart', 'num_leaves': 144, 'learning_rate': 0.012575812659743247, 'n_estimators': 549, 'max_depth': 11, 'min_child_samples': 33, 'subsample': 0.8201396579082235, 'colsample_bytree': 0.8618094826946774, 'reg_alpha': 0.00022277139513162425, 'reg_lambda': 1.2026457556920858, 'min_split_gain': 9.340322067132456e-08, 'cat_smooth': 78, 'cat_l2': 9.351735111010793}. 
Best is trial 137 with value: 0.7338329002847149.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best trial:\n", + "F1 Score: 0.733833\n", + "Parameters:\n", + "boosting_type: dart\n", + "num_leaves: 115\n", + "learning_rate: 0.014925187890769775\n", + "n_estimators: 440\n", + "max_depth: 18\n", + "min_child_samples: 25\n", + "subsample: 0.8388698484023127\n", + "colsample_bytree: 0.871735744058394\n", + "reg_alpha: 0.0002339943750255717\n", + "reg_lambda: 0.008719224583360354\n", + "min_split_gain: 6.975191054445815e-05\n", + "cat_smooth: 52\n", + "cat_l2: 1.870771829368486e-07\n" + ] + } + ], + "source": [ + "import lightgbm as lgb\n", + "\n", + "\n", + "def objective(trial):\n", + " params = {\n", + " \"objective\": \"binary\",\n", + " \"metric\": \"binary_logloss\",\n", + " \"boosting_type\": trial.suggest_categorical(\n", + " \"boosting_type\", [\"gbdt\", \"dart\", \"goss\"]\n", + " ),\n", + " \"num_leaves\": trial.suggest_int(\"num_leaves\", 2, 256),\n", + " \"learning_rate\": trial.suggest_float(\"learning_rate\", 1e-3, 0.1, log=True),\n", + " \"n_estimators\": trial.suggest_int(\"n_estimators\", 20, 1000),\n", + " \"max_depth\": trial.suggest_int(\"max_depth\", 1, 20),\n", + " \"min_child_samples\": trial.suggest_int(\"min_child_samples\", 1, 100),\n", + " \"subsample\": trial.suggest_float(\"subsample\", 0.5, 1.0),\n", + " \"colsample_bytree\": trial.suggest_float(\"colsample_bytree\", 0.5, 1.0),\n", + " \"reg_alpha\": trial.suggest_float(\"reg_alpha\", 1e-8, 10.0, log=True),\n", + " \"reg_lambda\": trial.suggest_float(\"reg_lambda\", 1e-8, 10.0, log=True),\n", + " \"min_split_gain\": trial.suggest_float(\"min_split_gain\", 1e-8, 1.0, log=True),\n", + " \"cat_smooth\": trial.suggest_int(\"cat_smooth\", 1, 100),\n", + " \"cat_l2\": trial.suggest_float(\"cat_l2\", 1e-8, 10.0, log=True),\n", + " \"verbosity\": -1,\n", + " }\n", + "\n", + " model = lgb.LGBMClassifier(**params)\n", + "\n", + " scores = cross_val_score(\n", + " 
estimator=model,\n", + " X=X,\n", + " y=y,\n", + " scoring=\"f1_weighted\",\n", + " cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),\n", + " n_jobs=-1,\n", + " )\n", + "\n", + " return scores.mean()\n", + "\n", + "\n", + "study = optuna.create_study(direction=\"maximize\")\n", + "study.optimize(objective, n_trials=200)\n", + "\n", + "best_trial = study.best_trial\n", + "\n", + "print(\"Best trial:\")\n", + "print(f\"F1 Score: {best_trial.value:.6f}\")\n", + "print(\"Parameters:\")\n", + "for k, v in best_trial.params.items():\n", + " print(f\"{k}: {v}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**CatBoost results:**" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[I 2025-07-28 21:10:05,884] A new study created in memory with name: no-name-bf4ade11-fdd4-4e51-8e36-3ce62bb0ef30\n", + "[I 2025-07-28 21:10:06,304] Trial 0 finished with value: 0.644865039962623 and parameters: {'iterations': 326, 'depth': 10, 'learning_rate': 0.19634246718523193, 'l2_leaf_reg': 2.874223428781629e-05, 'random_strength': 0.0007132910532975053, 'bagging_temperature': 4.68236671244208, 'border_count': 140, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 71, 'rsm': 0.851641906909135}. Best is trial 0 with value: 0.644865039962623.\n", + "[I 2025-07-28 21:10:06,587] Trial 1 finished with value: 0.6769947086440917 and parameters: {'iterations': 364, 'depth': 13, 'learning_rate': 0.04465629029593174, 'l2_leaf_reg': 1.3554273893773743e-08, 'random_strength': 0.4601972154575927, 'bagging_temperature': 1.856284401555177, 'border_count': 182, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 33, 'rsm': 0.27841739352654576}. 
Best is trial 1 with value: 0.6769947086440917.\n", + "...", + "[I 2025-07-28 21:10:22,377] Trial 199 finished with value: 0.713373121188903 and parameters: {'iterations': 161, 'depth': 2, 'learning_rate': 0.007216156705027904, 'l2_leaf_reg': 4.2665020699161245e-07, 'random_strength': 0.00017324894641089118, 'bagging_temperature': 0.4153387841633685, 'border_count': 96, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 96, 'rsm': 0.8596971617051177}. Best is trial 94 with value: 0.7319447527560347.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best trial:\n", + "F1 Score: 0.731945\n", + "Parameters:\n", + "iterations: 168\n", + "depth: 5\n", + "learning_rate: 0.00889070096045054\n", + "l2_leaf_reg: 3.173038372914875e-05\n", + "random_strength: 0.0004606096176348796\n", + "bagging_temperature: 0.9387985722566684\n", + "border_count: 56\n", + "grow_policy: Depthwise\n", + "min_data_in_leaf: 92\n", + "rsm: 0.7489477360324039\n" + ] + } + ], + "source": [ + "from catboost import CatBoostClassifier\n", + "\n", + "\n", + "def objective(trial):\n", + " params = {\n", + " \"iterations\": trial.suggest_int(\"iterations\", 50, 500),\n", + " \"depth\": trial.suggest_int(\"depth\", 1, 16),\n", + " \"learning_rate\": trial.suggest_float(\"learning_rate\", 1e-3, 0.5, log=True),\n", + " \"l2_leaf_reg\": trial.suggest_float(\"l2_leaf_reg\", 1e-8, 10.0, log=True),\n", + " \"random_strength\": trial.suggest_float(\"random_strength\", 1e-8, 10.0, log=True),\n", + " \"bagging_temperature\": trial.suggest_float(\n", + " \"bagging_temperature\", 0.1, 10.0, log=True\n", + " ),\n", + " \"border_count\": trial.suggest_int(\"border_count\", 32, 255),\n", + " \"grow_policy\": trial.suggest_categorical(\n", + " \"grow_policy\", [\"SymmetricTree\", \"Depthwise\", \"Lossguide\"]\n", + " ),\n", + " \"min_data_in_leaf\": trial.suggest_int(\"min_data_in_leaf\", 1, 100),\n", + " \"rsm\": trial.suggest_float(\"rsm\", 0.1, 1.0),\n", + " \"loss_function\": 
\"Logloss\",\n", + " \"eval_metric\": \"F1\",\n", + " \"cat_features\": categorical_cols,\n", + " \"verbose\": 0,\n", + " }\n", + "\n", + " model = CatBoostClassifier(**params)\n", + "\n", + " scores = cross_val_score(\n", + " estimator=model,\n", + " X=X,\n", + " y=y,\n", + " scoring=\"f1_weighted\",\n", + " cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),\n", + " n_jobs=-1,\n", + " )\n", + "\n", + " return scores.mean()\n", + "\n", + "\n", + "study = optuna.create_study(direction=\"maximize\")\n", + "study.optimize(objective, n_trials=200)\n", + "\n", + "best_trial = study.best_trial\n", + "\n", + "print(\"Best trial:\")\n", + "print(f\"F1 Score: {best_trial.value:.6f}\")\n", + "print(\"Parameters:\")\n", + "for k, v in best_trial.params.items():\n", + " print(f\"{k}: {v}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv (3.13.5)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/linearboost/linear_boost.py b/src/linearboost/linear_boost.py index 7ffe091..82ef8b7 100644 --- a/src/linearboost/linear_boost.py +++ b/src/linearboost/linear_boost.py @@ -561,6 +561,11 @@ def fit(self, X, y, sample_weight=None) -> Self: category=FutureWarning, message=".*parameter 'algorithm' may change in the future.*", ) + warnings.filterwarnings( + "ignore", + category=FutureWarning, + message=".*parameter 'algorithm' is deprecated.*", + ) return super().fit(X_transformed, y, sample_weight) @staticmethod diff --git a/src/linearboost/sefr.py b/src/linearboost/sefr.py index 3d08dbd..37a3c30 100644 --- a/src/linearboost/sefr.py +++ b/src/linearboost/sefr.py @@ -312,6 +312,7 @@ def predict_proba(self, X): else: score = 
self.decision_function(X) / norm_coef proba = 1.0 / (1.0 + np.exp(-score)) + proba = np.clip(proba, 1e-9, 1 - 1e-9) return np.column_stack((1.0 - proba, proba)) def predict_log_proba(self, X):