{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['Rank ', 'Name', 'Networth', 'Age', 'Country', 'Source', 'Industry'], dtype='object')\n" ] } ], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "df = pd.read_csv(\"C://Users//annal//aim//static//csv//Forbes_Billionaires.csv\")\n", "print(df.columns)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Определим бизнес цели:\n", "## 1- Прогнозирование состояния миллиардера(регрессия)\n", "## 2- Прогнозирование возраста миллиардера(классификация)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Проверим данные на пустые значения" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rank 0\n", "Name 0\n", "Networth 0\n", "Age 0\n", "Country 0\n", "Source 0\n", "Industry 0\n", "dtype: int64\n", "\n", "Rank False\n", "Name False\n", "Networth False\n", "Age False\n", "Country False\n", "Source False\n", "Industry False\n", "dtype: bool\n", "\n" ] } ], "source": [ "print(df.isnull().sum())\n", "\n", "print()\n", "\n", "# Есть ли пустые значения признаков\n", "print(df.isnull().any())\n", "\n", "print()\n", "\n", "# Процент пустых значений признаков\n", "for i in df.columns:\n", " null_rate = df[i].isnull().sum() / len(df) * 100\n", " if null_rate > 0:\n", " print(f\"{i} процент пустых значений: %{null_rate:.2f}\")" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting Jinja2\n", " Downloading jinja2-3.1.4-py3-none-any.whl.metadata (2.6 kB)\n", "Collecting MarkupSafe>=2.0 (from Jinja2)\n", " Downloading MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl.metadata (4.1 kB)\n", "Downloading jinja2-3.1.4-py3-none-any.whl (133 kB)\n", "Downloading MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl (15 kB)\n", "Installing collected packages: MarkupSafe, Jinja2\n", "Successfully installed Jinja2-3.1.4 MarkupSafe-3.0.2\n", "Note: you may need to restart the kernel to use updated packages.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "[notice] A new release of pip is available: 24.2 -> 24.3.1\n", "[notice] To update, run: python.exe -m pip install --upgrade pip\n" ] } ], "source": [ "pip install Jinja2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Задача регрессии" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Создадим выборки" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "from sklearn.model_selection import train_test_split\n", "df = pd.read_csv(\"C://Users//annal//aim//static//csv//Forbes_Billionaires.csv\")\n", "X = df.drop(columns=['Networth','Rank ', 'Name']) # Признаки\n", "y = df['Networth'] # Целевая переменная для регрессии\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Формирование конвейера для классификации данных\n", "## preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n", "## preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование\n", "## features_preprocessing -- трансформер для предобработки признаков\n", "## features_engineering -- трансформер для конструирования признаков\n", "## drop_columns -- трансформер для удаления колонок\n", "## pipeline_end -- основной конвейер предобработки данных и конструирования признаков" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | prepocessing_num__Age | \n", "prepocessing_cat__Country_Argentina | \n", "prepocessing_cat__Country_Australia | \n", "prepocessing_cat__Country_Austria | \n", "prepocessing_cat__Country_Barbados | \n", "prepocessing_cat__Country_Belgium | \n", "prepocessing_cat__Country_Belize | \n", "prepocessing_cat__Country_Brazil | \n", "prepocessing_cat__Country_Bulgaria | \n", "prepocessing_cat__Country_Canada | \n", "... | \n", "prepocessing_cat__Industry_Logistics | \n", "prepocessing_cat__Industry_Manufacturing | \n", "prepocessing_cat__Industry_Media & Entertainment | \n", "prepocessing_cat__Industry_Metals & Mining | \n", "prepocessing_cat__Industry_Real Estate | \n", "prepocessing_cat__Industry_Service | \n", "prepocessing_cat__Industry_Sports | \n", "prepocessing_cat__Industry_Technology | \n", "prepocessing_cat__Industry_Telecom | \n", "prepocessing_cat__Industry_diversified | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
582 | \n", "-0.109934 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
48 | \n", "1.079079 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
1772 | \n", "1.004766 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
964 | \n", "-0.407187 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
2213 | \n", "1.302019 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1638 | \n", "1.227706 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
1095 | \n", "0.856139 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
1130 | \n", "0.781826 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
1294 | \n", "0.335946 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
860 | \n", "0.558886 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
2080 rows × 855 columns
\n", "\n", " | MAE | \n", "RMSE | \n", "R2 | \n", "
---|---|---|---|
RandomForestRegressor | \n", "3.454630 | \n", "7.755776 | \n", "-1.530803 | \n", "
GradientBoostingRegressor | \n", "3.585785 | \n", "10.312249 | \n", "-3.474193 | \n", "
LinearRegression | \n", "18059903.801767 | \n", "411829080.658451 | \n", "-7135788186375614.000000 | \n", "
\n", " | prepocessing_num__Networth | \n", "prepocessing_cat__Country_Argentina | \n", "prepocessing_cat__Country_Australia | \n", "prepocessing_cat__Country_Austria | \n", "prepocessing_cat__Country_Barbados | \n", "prepocessing_cat__Country_Belgium | \n", "prepocessing_cat__Country_Belize | \n", "prepocessing_cat__Country_Brazil | \n", "prepocessing_cat__Country_Bulgaria | \n", "prepocessing_cat__Country_Canada | \n", "... | \n", "prepocessing_cat__Industry_Logistics | \n", "prepocessing_cat__Industry_Manufacturing | \n", "prepocessing_cat__Industry_Media & Entertainment | \n", "prepocessing_cat__Industry_Metals & Mining | \n", "prepocessing_cat__Industry_Real Estate | \n", "prepocessing_cat__Industry_Service | \n", "prepocessing_cat__Industry_Sports | \n", "prepocessing_cat__Industry_Technology | \n", "prepocessing_cat__Industry_Telecom | \n", "prepocessing_cat__Industry_diversified | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
582 | \n", "-0.013606 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
48 | \n", "1.994083 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
1772 | \n", "-0.288162 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
964 | \n", "-0.159464 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
2213 | \n", "-0.322481 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1638 | \n", "-0.271002 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
1095 | \n", "-0.193783 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
1130 | \n", "-0.193783 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
1294 | \n", "-0.228103 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
860 | \n", "-0.133724 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
2080 rows × 855 columns
\n", "