From 17f991135c85d02cc66cd384ac282bc9429111b9 Mon Sep 17 00:00:00 2001 From: Winston Robson Date: Sat, 13 Jun 2020 17:56:49 -0700 Subject: [PATCH] Add lasso_and_ridge_regression.ipynb | init draft --- README.md | 1 + lasso_and_ridge_regression.ipynb | 506 +++++++++++++++++++++++++++++++ 2 files changed, 507 insertions(+) create mode 100644 lasso_and_ridge_regression.ipynb diff --git a/README.md b/README.md index 1343a7f..60d22c0 100644 --- a/README.md +++ b/README.md @@ -4,3 +4,4 @@ Algorithm | Notebook --- | --- Linear Regression (OLS) | [linear_regression.ipynb](linear_regression.ipynb) Distributed Linear Regression (OLS) | [distributed_linear_regression.ipynb](distributed/distributed_linear_regression.ipynb) +Linear Regression with Lasso or Ridge Regularization | [lasso_and_ridge_regression.ipynb](lasso_and_ridge_regression.ipynb) diff --git a/lasso_and_ridge_regression.ipynb b/lasso_and_ridge_regression.ipynb new file mode 100644 index 0000000..8375c83 --- /dev/null +++ b/lasso_and_ridge_regression.ipynb @@ -0,0 +1,506 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "XaDpeOh_coGf" + }, + "source": [ + "# Linear Regression with Lasso or Ridge Regularization\n", + "\n", + "Ridge and Lasso regression are some of the simple techniques to reduce model complexity and prevent over-fitting which may result from simple linear regression.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "gTp_2yZecoGm" + }, + "source": [ + "## Data" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "qu9PGtG3coGn" + }, + "outputs": [], + "source": [ + "from sklearn.datasets import load_boston\n", + "# load Boston dataset\n", + "boston = load_boston()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# print(boston.DESCR)" + ] + }, + { + "cell_type": "code", 
+ "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import cudf\n", + "\n", + "df = cudf.DataFrame(data=list(boston.data), columns=boston.feature_names)\n", + "\n", + "df['MEDV'] = boston.target\n", + "\n", + "# df.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# corr = df.to_pandas().corr()\n", + "# corr.style.background_gradient(cmap='coolwarm')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "X = df[['RM', 'AGE', 'TAX']]\n", + "y = df.MEDV" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from cuml.preprocessing.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Lasso Regularization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Lasso extends LinearRegression by providing L1 regularization on the coefficients when predicting response y with a linear combination of the predictors in X. It can zero some of the coefficients for feature selection and improves the conditioning of the problem. \n", + "\n", + "cuML's Lasso can take array-like objects, either in host as NumPy arrays or in device (as Numba or `__cuda_array_interface__` compliant), in addition to cuDF objects. It uses coordinate descent to fit a linear model." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from cuml.linear_model import Lasso" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# configure the cuml lasso regression model that will be fit to the training dataset. 
Lasso is solved by coordinate descent, so there is no eig/svd solver to choose \n", + "ls = Lasso(\n", + " alpha=1.0,\n", + " fit_intercept=True,\n", + " normalize=False,\n", + " max_iter=1000,\n", + " tol=0.001,\n", + " selection='cyclic',\n", + " handle=None,\n", + " output_type=None\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Lasso(alpha=1.0, fit_intercept=True, normalize=False, max_iter=1000, tol=0.001, selection='cyclic', handle=, output_type='input')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ls.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.36 ms, sys: 743 µs, total: 2.1 ms\n", + "Wall time: 1.42 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "# predict the target values of the testing dataset using the cuml lasso regression model\n", + "preds = ls.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 4.1 ms, sys: 0 ns, total: 4.1 ms\n", + "Wall time: 3.23 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "# predict the target values of the testing dataset using the cuml lasso regression model\n", + "preds = ls.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "from cuml.metrics import mean_squared_error" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "error = mean_squared_error(y_test, preds)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "29.796427316935755\n" + 
] + } + ], + "source": [ + "print(error)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# scatter actual and predicted results\n", + "plt.scatter(y_test.to_pandas(), preds.to_pandas())\n", + "\n", + "# label graph\n", + "plt.xlabel(\"Actual Prices: $Y_i$\")\n", + "plt.ylabel(\"Predicted prices: $\\hat{Y}_i$\")\n", + "plt.title(\"Prices vs Predicted prices: $Y_i$ vs $\\hat{Y}_i$\")\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Nd8dvjoKcoGr" + }, + "source": [ + "# Ridge Regression" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Ridge Regularization \n", + "Ridge extends LinearRegression by providing L2 regularization on the coefficients when predicting response y with a linear combination of the predictors in X. It can reduce the variance of the predictors, and improves the conditioning of the problem.\n", + "\n", + "The Ridge Regression function implemented in the cuml library allows the user to change the fit_intercept, normalize, solver and alpha parameters. Here is a brief overview of the parameters of RAPIDS' Ridge Regression:\n", + "1. `alpha`: float or double. Regularization strength - must be a positive float. Larger values specify stronger regularization. Array input will be supported later.\n", + "1. `solver`: ‘eig’ or ‘svd’ or ‘cd’ (default = ‘eig’). Eig uses an eigendecomposition of the covariance matrix, and is much faster. SVD is slower, but is guaranteed to be stable. CD or Coordinate Descent is very fast and is suitable for large problems.\n", + "1. `fit_intercept`: boolean (default = True). If True, Ridge tries to correct for the global mean of y. If False, the model expects that you have centered the data.\n", + "1. `normalize`: boolean (default = False). 
If True, the predictors in X will be normalized by dividing each one by its L2 norm. If False, no scaling will be done.\n", + "\n", + "The methods that can be used with the Ridge Regression are:\n", + "1. `fit`: Fit the model with X and y.\n", + "1. `get_params`: Sklearn style return parameter state\n", + "1. `predict`: Predicts the y for X.\n", + "1. `set_params`: Sklearn style set parameter state to dictionary of params.\n", + "\n", + "The model accepts only numpy arrays or cudf dataframes as the input. \n", + "- In order to convert your dataset to cudf format please read the cudf [documentation](https://rapidsai.github.io/projects/cudf/en/latest/). \n", + "- It is important to understand that the 'svd' solver will run slower than the 'eig' solver; however, the 'svd' solver is more stable and robust. \n", + " - Therefore, we would recommend that you use the 'eig' solver when a slight error is acceptable. \n", + " - For additional information please refer to the [documentation](https://rapidsai.github.io/projects/cuml/en/latest/index.html)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "from cuml.linear_model import Ridge" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + }, + "colab_type": "code", + "id": "55fhclfXcoG9", + "outputId": "09e9ac9d-d5ce-4370-c16e-d959cc6bf200", + "scrolled": true + }, + "outputs": [], + "source": [ + "# run the cuml ridge regression model to fit the training dataset. 
Eig is the faster algorithm, but svd is more accurate \n", + "ridge = Ridge(fit_intercept=False,\n", + " normalize=True,\n", + " solver='eig', \n", + " alpha=0.1)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Ridge(alpha=0.1, solver='eig', fit_intercept=False, normalize=True, handle=, output_type='cudf')" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ridge.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + }, + "colab_type": "code", + "id": "VH2p-JmicoHB", + "outputId": "8a061986-ca73-4d4b-853b-9d304a2cb06a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 3.96 ms, sys: 0 ns, total: 3.96 ms\n", + "Wall time: 2.86 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "# predict the target values of the testing dataset using the cuml ridge regression model\n", + "preds = ridge.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "from cuml.metrics import mean_squared_error" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "error = mean_squared_error(y_test, preds)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 85 + }, + "colab_type": "code", + "id": "Z5pcDGqAcoHJ", + "outputId": "88bfba52-2fb6-4594-eafe-c2816cba2094" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "31.040458246536684\n" + ] + } + ], + "source": [ + "print(error)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# scatter actual and predicted results\n", + "plt.scatter(y_test.to_pandas(), preds.to_pandas())\n", + "\n", + "# label graph\n", + "plt.xlabel(\"Actual Prices: $Y_i$\")\n", + "plt.ylabel(\"Predicted prices: $\\hat{Y}_i$\")\n", + "plt.title(\"Prices vs Predicted prices: $Y_i$ vs $\\hat{Y}_i$\")\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "name": "ridge_regression_demo.ipynb", + "provenance": [], + "version": "0.3.2" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}