AIM-PIbd-31-Yakovlev-M-G/lab_4/lab_4.ipynb

2998 lines
132 KiB
Plaintext
Raw Permalink Normal View History

2024-11-15 16:44:23 +04:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 337,
2024-11-15 16:44:23 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>date</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>...</th>\n",
" <th>grade</th>\n",
" <th>sqft_above</th>\n",
" <th>sqft_basement</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>7129300520</td>\n",
" <td>20141013T000000</td>\n",
" <td>221900.0</td>\n",
" <td>3</td>\n",
" <td>1.00</td>\n",
" <td>1180</td>\n",
" <td>5650</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>7</td>\n",
" <td>1180</td>\n",
" <td>0</td>\n",
" <td>1955</td>\n",
" <td>0</td>\n",
" <td>98178</td>\n",
" <td>47.5112</td>\n",
" <td>-122.257</td>\n",
" <td>1340</td>\n",
" <td>5650</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>6414100192</td>\n",
" <td>20141209T000000</td>\n",
" <td>538000.0</td>\n",
" <td>3</td>\n",
" <td>2.25</td>\n",
" <td>2570</td>\n",
" <td>7242</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>7</td>\n",
" <td>2170</td>\n",
" <td>400</td>\n",
" <td>1951</td>\n",
" <td>1991</td>\n",
" <td>98125</td>\n",
" <td>47.7210</td>\n",
" <td>-122.319</td>\n",
" <td>1690</td>\n",
" <td>7639</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5631500400</td>\n",
" <td>20150225T000000</td>\n",
" <td>180000.0</td>\n",
" <td>2</td>\n",
" <td>1.00</td>\n",
" <td>770</td>\n",
" <td>10000</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>770</td>\n",
" <td>0</td>\n",
" <td>1933</td>\n",
" <td>0</td>\n",
" <td>98028</td>\n",
" <td>47.7379</td>\n",
" <td>-122.233</td>\n",
" <td>2720</td>\n",
" <td>8062</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2487200875</td>\n",
" <td>20141209T000000</td>\n",
" <td>604000.0</td>\n",
" <td>4</td>\n",
" <td>3.00</td>\n",
" <td>1960</td>\n",
" <td>5000</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>7</td>\n",
" <td>1050</td>\n",
" <td>910</td>\n",
" <td>1965</td>\n",
" <td>0</td>\n",
" <td>98136</td>\n",
" <td>47.5208</td>\n",
" <td>-122.393</td>\n",
" <td>1360</td>\n",
" <td>5000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1954400510</td>\n",
" <td>20150218T000000</td>\n",
" <td>510000.0</td>\n",
" <td>3</td>\n",
" <td>2.00</td>\n",
" <td>1680</td>\n",
" <td>8080</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>8</td>\n",
" <td>1680</td>\n",
" <td>0</td>\n",
" <td>1987</td>\n",
" <td>0</td>\n",
" <td>98074</td>\n",
" <td>47.6168</td>\n",
" <td>-122.045</td>\n",
" <td>1800</td>\n",
" <td>7503</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9995</th>\n",
" <td>322059264</td>\n",
" <td>20140926T000000</td>\n",
" <td>279000.0</td>\n",
" <td>2</td>\n",
" <td>1.00</td>\n",
" <td>1020</td>\n",
" <td>47044</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>7</td>\n",
" <td>1020</td>\n",
" <td>0</td>\n",
" <td>1904</td>\n",
" <td>1958</td>\n",
" <td>98042</td>\n",
" <td>47.4206</td>\n",
" <td>-122.155</td>\n",
" <td>1930</td>\n",
" <td>12139</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9996</th>\n",
" <td>5557500270</td>\n",
" <td>20150209T000000</td>\n",
" <td>262000.0</td>\n",
" <td>3</td>\n",
" <td>1.50</td>\n",
" <td>1700</td>\n",
" <td>9579</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>7</td>\n",
" <td>1100</td>\n",
" <td>600</td>\n",
" <td>1962</td>\n",
" <td>0</td>\n",
" <td>98023</td>\n",
" <td>47.3209</td>\n",
" <td>-122.338</td>\n",
" <td>1700</td>\n",
" <td>9628</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9997</th>\n",
" <td>9164100125</td>\n",
" <td>20140807T000000</td>\n",
" <td>533000.0</td>\n",
" <td>4</td>\n",
" <td>1.00</td>\n",
" <td>1550</td>\n",
" <td>4750</td>\n",
" <td>1.5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>7</td>\n",
" <td>1550</td>\n",
" <td>0</td>\n",
" <td>1919</td>\n",
" <td>0</td>\n",
" <td>98117</td>\n",
" <td>47.6824</td>\n",
" <td>-122.389</td>\n",
" <td>1320</td>\n",
" <td>4750</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9998</th>\n",
" <td>7370600045</td>\n",
" <td>20150402T000000</td>\n",
" <td>640000.0</td>\n",
" <td>3</td>\n",
" <td>1.75</td>\n",
" <td>1680</td>\n",
" <td>8100</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>8</td>\n",
" <td>1680</td>\n",
" <td>0</td>\n",
" <td>1950</td>\n",
" <td>0</td>\n",
" <td>98177</td>\n",
" <td>47.7212</td>\n",
" <td>-122.364</td>\n",
" <td>1880</td>\n",
" <td>7750</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9999</th>\n",
" <td>8594400060</td>\n",
" <td>20140609T000000</td>\n",
" <td>285000.0</td>\n",
" <td>3</td>\n",
" <td>2.25</td>\n",
" <td>1680</td>\n",
" <td>35127</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>7</td>\n",
" <td>1680</td>\n",
" <td>0</td>\n",
" <td>1987</td>\n",
" <td>0</td>\n",
" <td>98092</td>\n",
" <td>47.3025</td>\n",
" <td>-122.067</td>\n",
" <td>1820</td>\n",
" <td>35166</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10000 rows × 21 columns</p>\n",
"</div>"
],
"text/plain": [
" id date price bedrooms bathrooms sqft_living \\\n",
"0 7129300520 20141013T000000 221900.0 3 1.00 1180 \n",
"1 6414100192 20141209T000000 538000.0 3 2.25 2570 \n",
"2 5631500400 20150225T000000 180000.0 2 1.00 770 \n",
"3 2487200875 20141209T000000 604000.0 4 3.00 1960 \n",
"4 1954400510 20150218T000000 510000.0 3 2.00 1680 \n",
"... ... ... ... ... ... ... \n",
"9995 322059264 20140926T000000 279000.0 2 1.00 1020 \n",
"9996 5557500270 20150209T000000 262000.0 3 1.50 1700 \n",
"9997 9164100125 20140807T000000 533000.0 4 1.00 1550 \n",
"9998 7370600045 20150402T000000 640000.0 3 1.75 1680 \n",
"9999 8594400060 20140609T000000 285000.0 3 2.25 1680 \n",
"\n",
" sqft_lot floors waterfront view ... grade sqft_above \\\n",
"0 5650 1.0 0 0 ... 7 1180 \n",
"1 7242 2.0 0 0 ... 7 2170 \n",
"2 10000 1.0 0 0 ... 6 770 \n",
"3 5000 1.0 0 0 ... 7 1050 \n",
"4 8080 1.0 0 0 ... 8 1680 \n",
"... ... ... ... ... ... ... ... \n",
"9995 47044 1.0 0 0 ... 7 1020 \n",
"9996 9579 1.0 0 0 ... 7 1100 \n",
"9997 4750 1.5 0 0 ... 7 1550 \n",
"9998 8100 1.0 0 2 ... 8 1680 \n",
"9999 35127 2.0 0 0 ... 7 1680 \n",
"\n",
" sqft_basement yr_built yr_renovated zipcode lat long \\\n",
"0 0 1955 0 98178 47.5112 -122.257 \n",
"1 400 1951 1991 98125 47.7210 -122.319 \n",
"2 0 1933 0 98028 47.7379 -122.233 \n",
"3 910 1965 0 98136 47.5208 -122.393 \n",
"4 0 1987 0 98074 47.6168 -122.045 \n",
"... ... ... ... ... ... ... \n",
"9995 0 1904 1958 98042 47.4206 -122.155 \n",
"9996 600 1962 0 98023 47.3209 -122.338 \n",
"9997 0 1919 0 98117 47.6824 -122.389 \n",
"9998 0 1950 0 98177 47.7212 -122.364 \n",
"9999 0 1987 0 98092 47.3025 -122.067 \n",
"\n",
" sqft_living15 sqft_lot15 \n",
"0 1340 5650 \n",
"1 1690 7639 \n",
"2 2720 8062 \n",
"3 1360 5000 \n",
"4 1800 7503 \n",
"... ... ... \n",
"9995 1930 12139 \n",
"9996 1700 9628 \n",
"9997 1320 4750 \n",
"9998 1880 7750 \n",
"9999 1820 35166 \n",
"\n",
"[10000 rows x 21 columns]"
]
},
"execution_count": 337,
2024-11-15 16:44:23 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn import set_config\n",
"\n",
"df = pd.read_csv(\"data/house_data.csv\", sep=\",\", nrows=10000)\n",
"# dropna() returns a new frame rather than modifying df in place, so assign the result;\n",
"# otherwise rows with missing values would silently stay in df for every later cell.\n",
"df = df.dropna()\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Устраняем выбросы в колонке цены и добавляем колонку с категориями цены"
]
},
{
"cell_type": "code",
"execution_count": 338,
2024-11-15 16:44:23 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>date</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>...</th>\n",
" <th>sqft_above</th>\n",
" <th>sqft_basement</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" <th>price_category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>9980</th>\n",
" <td>6840700036</td>\n",
" <td>20140728T000000</td>\n",
" <td>497000.0</td>\n",
" <td>2</td>\n",
" <td>1.00</td>\n",
" <td>770</td>\n",
" <td>3325</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>770</td>\n",
" <td>0</td>\n",
" <td>1918</td>\n",
" <td>0</td>\n",
" <td>98122</td>\n",
" <td>47.6102</td>\n",
" <td>-122.299</td>\n",
" <td>960</td>\n",
" <td>4800</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9981</th>\n",
" <td>1824069083</td>\n",
" <td>20150429T000000</td>\n",
" <td>835000.0</td>\n",
" <td>3</td>\n",
" <td>1.00</td>\n",
" <td>3060</td>\n",
" <td>30166</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>3060</td>\n",
" <td>0</td>\n",
" <td>1959</td>\n",
" <td>0</td>\n",
" <td>98027</td>\n",
" <td>47.5656</td>\n",
" <td>-122.093</td>\n",
" <td>1880</td>\n",
" <td>19602</td>\n",
" <td>high</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9982</th>\n",
" <td>1836980240</td>\n",
" <td>20141015T000000</td>\n",
" <td>730000.0</td>\n",
" <td>4</td>\n",
" <td>2.75</td>\n",
" <td>2920</td>\n",
" <td>4500</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>2920</td>\n",
" <td>0</td>\n",
" <td>1999</td>\n",
" <td>0</td>\n",
" <td>98006</td>\n",
" <td>47.5646</td>\n",
" <td>-122.124</td>\n",
" <td>2920</td>\n",
" <td>4505</td>\n",
" <td>high</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9983</th>\n",
" <td>3528900160</td>\n",
" <td>20141001T000000</td>\n",
" <td>655000.0</td>\n",
" <td>3</td>\n",
" <td>1.00</td>\n",
" <td>1370</td>\n",
" <td>5250</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1070</td>\n",
" <td>300</td>\n",
" <td>1939</td>\n",
" <td>0</td>\n",
" <td>98109</td>\n",
" <td>47.6421</td>\n",
" <td>-122.348</td>\n",
" <td>2410</td>\n",
" <td>4200</td>\n",
" <td>high</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9984</th>\n",
" <td>1442800060</td>\n",
" <td>20141120T000000</td>\n",
" <td>205000.0</td>\n",
" <td>3</td>\n",
" <td>2.50</td>\n",
" <td>1870</td>\n",
" <td>3118</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1870</td>\n",
" <td>0</td>\n",
" <td>1993</td>\n",
" <td>0</td>\n",
" <td>98038</td>\n",
" <td>47.3739</td>\n",
" <td>-122.056</td>\n",
" <td>1580</td>\n",
" <td>3601</td>\n",
" <td>low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9985</th>\n",
" <td>8722100030</td>\n",
" <td>20150407T000000</td>\n",
" <td>632750.0</td>\n",
" <td>4</td>\n",
" <td>2.00</td>\n",
" <td>1800</td>\n",
" <td>4800</td>\n",
" <td>1.5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1800</td>\n",
" <td>0</td>\n",
" <td>1918</td>\n",
" <td>0</td>\n",
" <td>98112</td>\n",
" <td>47.6388</td>\n",
" <td>-122.302</td>\n",
" <td>1950</td>\n",
" <td>4800</td>\n",
" <td>high</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9986</th>\n",
" <td>1723049624</td>\n",
" <td>20140512T000000</td>\n",
" <td>330000.0</td>\n",
" <td>5</td>\n",
" <td>3.00</td>\n",
" <td>2100</td>\n",
" <td>7715</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1250</td>\n",
" <td>850</td>\n",
" <td>2013</td>\n",
" <td>0</td>\n",
" <td>98168</td>\n",
" <td>47.4866</td>\n",
" <td>-122.319</td>\n",
" <td>2100</td>\n",
" <td>7959</td>\n",
" <td>low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9987</th>\n",
" <td>4040400200</td>\n",
" <td>20141007T000000</td>\n",
" <td>527500.0</td>\n",
" <td>5</td>\n",
" <td>2.25</td>\n",
" <td>2530</td>\n",
" <td>8250</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>2530</td>\n",
" <td>0</td>\n",
" <td>1961</td>\n",
" <td>0</td>\n",
" <td>98007</td>\n",
" <td>47.6117</td>\n",
" <td>-122.134</td>\n",
" <td>2020</td>\n",
" <td>8250</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9988</th>\n",
" <td>8691391090</td>\n",
" <td>20140508T000000</td>\n",
" <td>716500.0</td>\n",
" <td>4</td>\n",
" <td>2.50</td>\n",
" <td>3290</td>\n",
" <td>6465</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>3290</td>\n",
" <td>0</td>\n",
" <td>2002</td>\n",
" <td>0</td>\n",
" <td>98075</td>\n",
" <td>47.5981</td>\n",
" <td>-121.976</td>\n",
" <td>3100</td>\n",
" <td>5929</td>\n",
" <td>high</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9989</th>\n",
" <td>7853302190</td>\n",
" <td>20141217T000000</td>\n",
" <td>388500.0</td>\n",
" <td>4</td>\n",
" <td>2.50</td>\n",
" <td>1890</td>\n",
" <td>5395</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1890</td>\n",
" <td>0</td>\n",
" <td>2006</td>\n",
" <td>0</td>\n",
" <td>98065</td>\n",
" <td>47.5415</td>\n",
" <td>-121.883</td>\n",
" <td>2060</td>\n",
" <td>5395</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9990</th>\n",
" <td>3260000700</td>\n",
" <td>20140904T000000</td>\n",
" <td>530000.0</td>\n",
" <td>3</td>\n",
" <td>1.75</td>\n",
" <td>1680</td>\n",
" <td>7770</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1680</td>\n",
" <td>0</td>\n",
" <td>1967</td>\n",
" <td>0</td>\n",
" <td>98005</td>\n",
" <td>47.6028</td>\n",
" <td>-122.167</td>\n",
" <td>1880</td>\n",
" <td>7770</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9991</th>\n",
" <td>5126300510</td>\n",
" <td>20150108T000000</td>\n",
" <td>419000.0</td>\n",
" <td>3</td>\n",
" <td>2.50</td>\n",
" <td>2170</td>\n",
" <td>4517</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>2170</td>\n",
" <td>0</td>\n",
" <td>2002</td>\n",
" <td>0</td>\n",
" <td>98059</td>\n",
" <td>47.4819</td>\n",
" <td>-122.140</td>\n",
" <td>2610</td>\n",
" <td>4770</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9992</th>\n",
" <td>7199330370</td>\n",
" <td>20150309T000000</td>\n",
" <td>385000.0</td>\n",
" <td>3</td>\n",
" <td>1.75</td>\n",
" <td>1200</td>\n",
" <td>7360</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1200</td>\n",
" <td>0</td>\n",
" <td>1978</td>\n",
" <td>0</td>\n",
" <td>98052</td>\n",
" <td>47.6979</td>\n",
" <td>-122.130</td>\n",
" <td>1200</td>\n",
" <td>7500</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9993</th>\n",
" <td>1854900240</td>\n",
" <td>20140528T000000</td>\n",
" <td>655000.0</td>\n",
" <td>4</td>\n",
" <td>2.50</td>\n",
" <td>2990</td>\n",
" <td>5669</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>2990</td>\n",
" <td>0</td>\n",
" <td>2003</td>\n",
" <td>0</td>\n",
" <td>98074</td>\n",
" <td>47.6119</td>\n",
" <td>-122.011</td>\n",
" <td>3110</td>\n",
" <td>5058</td>\n",
" <td>high</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9994</th>\n",
" <td>6738700335</td>\n",
" <td>20140701T000000</td>\n",
" <td>1127312.5</td>\n",
" <td>4</td>\n",
" <td>2.75</td>\n",
" <td>3770</td>\n",
" <td>10900</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>3070</td>\n",
" <td>700</td>\n",
" <td>1924</td>\n",
" <td>0</td>\n",
" <td>98144</td>\n",
" <td>47.5849</td>\n",
" <td>-122.290</td>\n",
" <td>3000</td>\n",
" <td>5000</td>\n",
" <td>very_high</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9995</th>\n",
" <td>322059264</td>\n",
" <td>20140926T000000</td>\n",
" <td>279000.0</td>\n",
" <td>2</td>\n",
" <td>1.00</td>\n",
" <td>1020</td>\n",
" <td>47044</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1020</td>\n",
" <td>0</td>\n",
" <td>1904</td>\n",
" <td>1958</td>\n",
" <td>98042</td>\n",
" <td>47.4206</td>\n",
" <td>-122.155</td>\n",
" <td>1930</td>\n",
" <td>12139</td>\n",
" <td>low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9996</th>\n",
" <td>5557500270</td>\n",
" <td>20150209T000000</td>\n",
" <td>262000.0</td>\n",
" <td>3</td>\n",
" <td>1.50</td>\n",
" <td>1700</td>\n",
" <td>9579</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1100</td>\n",
" <td>600</td>\n",
" <td>1962</td>\n",
" <td>0</td>\n",
" <td>98023</td>\n",
" <td>47.3209</td>\n",
" <td>-122.338</td>\n",
" <td>1700</td>\n",
" <td>9628</td>\n",
" <td>low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9997</th>\n",
" <td>9164100125</td>\n",
" <td>20140807T000000</td>\n",
" <td>533000.0</td>\n",
" <td>4</td>\n",
" <td>1.00</td>\n",
" <td>1550</td>\n",
" <td>4750</td>\n",
" <td>1.5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1550</td>\n",
" <td>0</td>\n",
" <td>1919</td>\n",
" <td>0</td>\n",
" <td>98117</td>\n",
" <td>47.6824</td>\n",
" <td>-122.389</td>\n",
" <td>1320</td>\n",
" <td>4750</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9998</th>\n",
" <td>7370600045</td>\n",
" <td>20150402T000000</td>\n",
" <td>640000.0</td>\n",
" <td>3</td>\n",
" <td>1.75</td>\n",
" <td>1680</td>\n",
" <td>8100</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>1680</td>\n",
" <td>0</td>\n",
" <td>1950</td>\n",
" <td>0</td>\n",
" <td>98177</td>\n",
" <td>47.7212</td>\n",
" <td>-122.364</td>\n",
" <td>1880</td>\n",
" <td>7750</td>\n",
" <td>high</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9999</th>\n",
" <td>8594400060</td>\n",
" <td>20140609T000000</td>\n",
" <td>285000.0</td>\n",
" <td>3</td>\n",
" <td>2.25</td>\n",
" <td>1680</td>\n",
" <td>35127</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1680</td>\n",
" <td>0</td>\n",
" <td>1987</td>\n",
" <td>0</td>\n",
" <td>98092</td>\n",
" <td>47.3025</td>\n",
" <td>-122.067</td>\n",
" <td>1820</td>\n",
" <td>35166</td>\n",
" <td>low</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>20 rows × 22 columns</p>\n",
"</div>"
],
"text/plain": [
" id date price bedrooms bathrooms \\\n",
"9980 6840700036 20140728T000000 497000.0 2 1.00 \n",
"9981 1824069083 20150429T000000 835000.0 3 1.00 \n",
"9982 1836980240 20141015T000000 730000.0 4 2.75 \n",
"9983 3528900160 20141001T000000 655000.0 3 1.00 \n",
"9984 1442800060 20141120T000000 205000.0 3 2.50 \n",
"9985 8722100030 20150407T000000 632750.0 4 2.00 \n",
"9986 1723049624 20140512T000000 330000.0 5 3.00 \n",
"9987 4040400200 20141007T000000 527500.0 5 2.25 \n",
"9988 8691391090 20140508T000000 716500.0 4 2.50 \n",
"9989 7853302190 20141217T000000 388500.0 4 2.50 \n",
"9990 3260000700 20140904T000000 530000.0 3 1.75 \n",
"9991 5126300510 20150108T000000 419000.0 3 2.50 \n",
"9992 7199330370 20150309T000000 385000.0 3 1.75 \n",
"9993 1854900240 20140528T000000 655000.0 4 2.50 \n",
"9994 6738700335 20140701T000000 1127312.5 4 2.75 \n",
"9995 322059264 20140926T000000 279000.0 2 1.00 \n",
"9996 5557500270 20150209T000000 262000.0 3 1.50 \n",
"9997 9164100125 20140807T000000 533000.0 4 1.00 \n",
"9998 7370600045 20150402T000000 640000.0 3 1.75 \n",
"9999 8594400060 20140609T000000 285000.0 3 2.25 \n",
"\n",
" sqft_living sqft_lot floors waterfront view ... sqft_above \\\n",
"9980 770 3325 1.0 0 0 ... 770 \n",
"9981 3060 30166 1.0 0 0 ... 3060 \n",
"9982 2920 4500 2.0 0 0 ... 2920 \n",
"9983 1370 5250 1.0 0 0 ... 1070 \n",
"9984 1870 3118 2.0 0 0 ... 1870 \n",
"9985 1800 4800 1.5 0 0 ... 1800 \n",
"9986 2100 7715 1.0 0 0 ... 1250 \n",
"9987 2530 8250 2.0 0 0 ... 2530 \n",
"9988 3290 6465 2.0 0 0 ... 3290 \n",
"9989 1890 5395 2.0 0 0 ... 1890 \n",
"9990 1680 7770 1.0 0 0 ... 1680 \n",
"9991 2170 4517 2.0 0 0 ... 2170 \n",
"9992 1200 7360 1.0 0 0 ... 1200 \n",
"9993 2990 5669 2.0 0 0 ... 2990 \n",
"9994 3770 10900 2.0 0 2 ... 3070 \n",
"9995 1020 47044 1.0 0 0 ... 1020 \n",
"9996 1700 9579 1.0 0 0 ... 1100 \n",
"9997 1550 4750 1.5 0 0 ... 1550 \n",
"9998 1680 8100 1.0 0 2 ... 1680 \n",
"9999 1680 35127 2.0 0 0 ... 1680 \n",
"\n",
" sqft_basement yr_built yr_renovated zipcode lat long \\\n",
"9980 0 1918 0 98122 47.6102 -122.299 \n",
"9981 0 1959 0 98027 47.5656 -122.093 \n",
"9982 0 1999 0 98006 47.5646 -122.124 \n",
"9983 300 1939 0 98109 47.6421 -122.348 \n",
"9984 0 1993 0 98038 47.3739 -122.056 \n",
"9985 0 1918 0 98112 47.6388 -122.302 \n",
"9986 850 2013 0 98168 47.4866 -122.319 \n",
"9987 0 1961 0 98007 47.6117 -122.134 \n",
"9988 0 2002 0 98075 47.5981 -121.976 \n",
"9989 0 2006 0 98065 47.5415 -121.883 \n",
"9990 0 1967 0 98005 47.6028 -122.167 \n",
"9991 0 2002 0 98059 47.4819 -122.140 \n",
"9992 0 1978 0 98052 47.6979 -122.130 \n",
"9993 0 2003 0 98074 47.6119 -122.011 \n",
"9994 700 1924 0 98144 47.5849 -122.290 \n",
"9995 0 1904 1958 98042 47.4206 -122.155 \n",
"9996 600 1962 0 98023 47.3209 -122.338 \n",
"9997 0 1919 0 98117 47.6824 -122.389 \n",
"9998 0 1950 0 98177 47.7212 -122.364 \n",
"9999 0 1987 0 98092 47.3025 -122.067 \n",
"\n",
" sqft_living15 sqft_lot15 price_category \n",
"9980 960 4800 middle \n",
"9981 1880 19602 high \n",
"9982 2920 4505 high \n",
"9983 2410 4200 high \n",
"9984 1580 3601 low \n",
"9985 1950 4800 high \n",
"9986 2100 7959 low \n",
"9987 2020 8250 middle \n",
"9988 3100 5929 high \n",
"9989 2060 5395 middle \n",
"9990 1880 7770 middle \n",
"9991 2610 4770 middle \n",
"9992 1200 7500 middle \n",
"9993 3110 5058 high \n",
"9994 3000 5000 very_high \n",
"9995 1930 12139 low \n",
"9996 1700 9628 low \n",
"9997 1320 4750 middle \n",
"9998 1880 7750 high \n",
"9999 1820 35166 low \n",
"\n",
"[20 rows x 22 columns]"
]
},
"execution_count": 338,
2024-11-15 16:44:23 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Quartiles of the price column and the interquartile range (IQR) between them\n",
"q1 = df['price'].quantile(0.25) # first quartile (Q1)\n",
"q3 = df['price'].quantile(0.75) # third quartile (Q3)\n",
"iqr = q3 - q1 # interquartile range\n",
"\n",
"# Outlier fences: anything further than 1.5 * IQR from the quartiles counts as an outlier\n",
"lower_bound = q1 - 1.5 * iqr # lower fence\n",
"upper_bound = q3 + 1.5 * iqr # upper fence\n",
"\n",
"# Cap outliers at the fences instead of dropping the rows. Series.clip is the vectorized\n",
"# equivalent of a per-row lambda: values below lower_bound become lower_bound, values above\n",
"# upper_bound become upper_bound, everything else (including NaN) is left as is.\n",
"df['price'] = df['price'].clip(lower=lower_bound, upper=upper_bound)\n",
"\n",
"# Add a price-category column by binning price on fixed edges.\n",
"# NOTE(review): the bin edges are hard-coded; pd.cut yields NaN for prices outside\n",
"# [75000, 1130750], so re-check them if the input data or the fences change.\n",
"df['price_category'] = pd.cut(df['price'], bins=[75000,338750,602750,866750,1130750], labels=['low','middle','high','very_high'], include_lowest=True)\n",
"df.tail(20)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Бизнес-цели\n",
"1. Прогноз класса цены недвижимости (Классификация)\n",
"2. Оценка состояния недвижимости (Регрессия)\n",
"\n",
"### Определение достижимого уровня качества модели для первой задачи\n",
"#### Разделение набора данных на обучающую и тестовую выборки (80/20) для задачи классификации (Целевой признак - price_category)"
]
},
{
"cell_type": "code",
"execution_count": 339,
2024-11-15 16:44:23 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'X_train'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>date</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>...</th>\n",
" <th>sqft_above</th>\n",
" <th>sqft_basement</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" <th>price_category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>9843</th>\n",
" <td>3260000340</td>\n",
" <td>20140622T000000</td>\n",
" <td>732600.0</td>\n",
" <td>4</td>\n",
" <td>2.50</td>\n",
" <td>2130</td>\n",
" <td>7300</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1230</td>\n",
" <td>900</td>\n",
" <td>1963</td>\n",
" <td>0</td>\n",
" <td>98005</td>\n",
" <td>47.6050</td>\n",
" <td>-122.167</td>\n",
" <td>2130</td>\n",
" <td>7560</td>\n",
" <td>high</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9623</th>\n",
" <td>9828702055</td>\n",
" <td>20140508T000000</td>\n",
" <td>358000.0</td>\n",
" <td>2</td>\n",
" <td>1.50</td>\n",
" <td>960</td>\n",
" <td>1808</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>960</td>\n",
" <td>0</td>\n",
" <td>1993</td>\n",
" <td>0</td>\n",
" <td>98122</td>\n",
" <td>47.6183</td>\n",
" <td>-122.298</td>\n",
" <td>1290</td>\n",
" <td>1668</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3095</th>\n",
" <td>3438500625</td>\n",
" <td>20140519T000000</td>\n",
" <td>210000.0</td>\n",
" <td>3</td>\n",
" <td>1.00</td>\n",
" <td>1080</td>\n",
" <td>21043</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1080</td>\n",
" <td>0</td>\n",
" <td>1942</td>\n",
" <td>0</td>\n",
" <td>98106</td>\n",
" <td>47.5515</td>\n",
" <td>-122.357</td>\n",
" <td>1380</td>\n",
" <td>7620</td>\n",
" <td>low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>411</th>\n",
" <td>2422029094</td>\n",
" <td>20140716T000000</td>\n",
" <td>517534.0</td>\n",
" <td>2</td>\n",
" <td>1.00</td>\n",
" <td>833</td>\n",
" <td>143947</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>833</td>\n",
" <td>0</td>\n",
" <td>2006</td>\n",
" <td>0</td>\n",
" <td>98070</td>\n",
" <td>47.3889</td>\n",
" <td>-122.482</td>\n",
" <td>1380</td>\n",
" <td>143947</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3060</th>\n",
" <td>7462900015</td>\n",
" <td>20150108T000000</td>\n",
" <td>387000.0</td>\n",
" <td>3</td>\n",
" <td>2.25</td>\n",
" <td>1760</td>\n",
" <td>45133</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1760</td>\n",
" <td>0</td>\n",
" <td>1984</td>\n",
" <td>0</td>\n",
" <td>98065</td>\n",
" <td>47.5124</td>\n",
" <td>-121.866</td>\n",
" <td>1910</td>\n",
" <td>51773</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1750</th>\n",
" <td>2787720140</td>\n",
" <td>20150407T000000</td>\n",
" <td>416000.0</td>\n",
" <td>3</td>\n",
" <td>2.50</td>\n",
" <td>1790</td>\n",
" <td>11542</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1190</td>\n",
" <td>600</td>\n",
" <td>1969</td>\n",
" <td>0</td>\n",
" <td>98059</td>\n",
" <td>47.5124</td>\n",
" <td>-122.160</td>\n",
" <td>1790</td>\n",
" <td>9131</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2354</th>\n",
" <td>6192400400</td>\n",
" <td>20140728T000000</td>\n",
" <td>775000.0</td>\n",
" <td>4</td>\n",
" <td>2.50</td>\n",
" <td>3090</td>\n",
" <td>7112</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>3090</td>\n",
" <td>0</td>\n",
" <td>2001</td>\n",
" <td>0</td>\n",
" <td>98052</td>\n",
" <td>47.7050</td>\n",
" <td>-122.118</td>\n",
" <td>3050</td>\n",
" <td>6000</td>\n",
" <td>high</td>\n",
" </tr>\n",
" <tr>\n",
" <th>857</th>\n",
" <td>2296500036</td>\n",
" <td>20150310T000000</td>\n",
" <td>450000.0</td>\n",
" <td>4</td>\n",
" <td>2.75</td>\n",
" <td>2980</td>\n",
" <td>13260</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1800</td>\n",
" <td>1180</td>\n",
" <td>1979</td>\n",
" <td>0</td>\n",
" <td>98056</td>\n",
" <td>47.5152</td>\n",
" <td>-122.197</td>\n",
" <td>1920</td>\n",
" <td>10731</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6181</th>\n",
" <td>2787310130</td>\n",
" <td>20141212T000000</td>\n",
" <td>289950.0</td>\n",
" <td>4</td>\n",
" <td>1.75</td>\n",
" <td>2090</td>\n",
" <td>7416</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1050</td>\n",
" <td>1040</td>\n",
" <td>1970</td>\n",
" <td>0</td>\n",
" <td>98031</td>\n",
" <td>47.4107</td>\n",
" <td>-122.179</td>\n",
" <td>1710</td>\n",
" <td>7527</td>\n",
" <td>low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3141</th>\n",
" <td>8567300110</td>\n",
" <td>20140604T000000</td>\n",
" <td>485000.0</td>\n",
" <td>3</td>\n",
" <td>2.50</td>\n",
" <td>2340</td>\n",
" <td>59058</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>2340</td>\n",
" <td>0</td>\n",
" <td>1985</td>\n",
" <td>0</td>\n",
" <td>98038</td>\n",
" <td>47.4052</td>\n",
" <td>-122.028</td>\n",
" <td>2700</td>\n",
" <td>37263</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8000 rows × 22 columns</p>\n",
"</div>"
],
"text/plain": [
" id date price bedrooms bathrooms sqft_living \\\n",
"9843 3260000340 20140622T000000 732600.0 4 2.50 2130 \n",
"9623 9828702055 20140508T000000 358000.0 2 1.50 960 \n",
"3095 3438500625 20140519T000000 210000.0 3 1.00 1080 \n",
"411 2422029094 20140716T000000 517534.0 2 1.00 833 \n",
"3060 7462900015 20150108T000000 387000.0 3 2.25 1760 \n",
"... ... ... ... ... ... ... \n",
"1750 2787720140 20150407T000000 416000.0 3 2.50 1790 \n",
"2354 6192400400 20140728T000000 775000.0 4 2.50 3090 \n",
"857 2296500036 20150310T000000 450000.0 4 2.75 2980 \n",
"6181 2787310130 20141212T000000 289950.0 4 1.75 2090 \n",
"3141 8567300110 20140604T000000 485000.0 3 2.50 2340 \n",
"\n",
" sqft_lot floors waterfront view ... sqft_above sqft_basement \\\n",
"9843 7300 1.0 0 0 ... 1230 900 \n",
"9623 1808 2.0 0 0 ... 960 0 \n",
"3095 21043 1.0 0 0 ... 1080 0 \n",
"411 143947 1.0 0 0 ... 833 0 \n",
"3060 45133 2.0 0 0 ... 1760 0 \n",
"... ... ... ... ... ... ... ... \n",
"1750 11542 1.0 0 0 ... 1190 600 \n",
"2354 7112 2.0 0 0 ... 3090 0 \n",
"857 13260 1.0 0 0 ... 1800 1180 \n",
"6181 7416 1.0 0 0 ... 1050 1040 \n",
"3141 59058 1.0 0 0 ... 2340 0 \n",
"\n",
" yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
"9843 1963 0 98005 47.6050 -122.167 2130 \n",
"9623 1993 0 98122 47.6183 -122.298 1290 \n",
"3095 1942 0 98106 47.5515 -122.357 1380 \n",
"411 2006 0 98070 47.3889 -122.482 1380 \n",
"3060 1984 0 98065 47.5124 -121.866 1910 \n",
"... ... ... ... ... ... ... \n",
"1750 1969 0 98059 47.5124 -122.160 1790 \n",
"2354 2001 0 98052 47.7050 -122.118 3050 \n",
"857 1979 0 98056 47.5152 -122.197 1920 \n",
"6181 1970 0 98031 47.4107 -122.179 1710 \n",
"3141 1985 0 98038 47.4052 -122.028 2700 \n",
"\n",
" sqft_lot15 price_category \n",
"9843 7560 high \n",
"9623 1668 middle \n",
"3095 7620 low \n",
"411 143947 middle \n",
"3060 51773 middle \n",
"... ... ... \n",
"1750 9131 middle \n",
"2354 6000 high \n",
"857 10731 middle \n",
"6181 7527 low \n",
"3141 37263 middle \n",
"\n",
"[8000 rows x 22 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'y_train'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>price_category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>9843</th>\n",
" <td>high</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9623</th>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3095</th>\n",
" <td>low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>411</th>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3060</th>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1750</th>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2354</th>\n",
" <td>high</td>\n",
" </tr>\n",
" <tr>\n",
" <th>857</th>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6181</th>\n",
" <td>low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3141</th>\n",
" <td>middle</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8000 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" price_category\n",
"9843 high\n",
"9623 middle\n",
"3095 low\n",
"411 middle\n",
"3060 middle\n",
"... ...\n",
"1750 middle\n",
"2354 high\n",
"857 middle\n",
"6181 low\n",
"3141 middle\n",
"\n",
"[8000 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'X_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>date</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>...</th>\n",
" <th>sqft_above</th>\n",
" <th>sqft_basement</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" <th>price_category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>5341</th>\n",
" <td>6632900574</td>\n",
" <td>20150225T000000</td>\n",
" <td>595000.0</td>\n",
" <td>5</td>\n",
" <td>3.00</td>\n",
" <td>2980</td>\n",
" <td>10064</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1680</td>\n",
" <td>1300</td>\n",
" <td>1940</td>\n",
" <td>0</td>\n",
" <td>98155</td>\n",
" <td>47.7372</td>\n",
" <td>-122.316</td>\n",
" <td>1590</td>\n",
" <td>7800</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4384</th>\n",
" <td>2423029245</td>\n",
" <td>20140617T000000</td>\n",
" <td>550000.0</td>\n",
" <td>3</td>\n",
" <td>1.75</td>\n",
" <td>2240</td>\n",
" <td>78225</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>2240</td>\n",
" <td>0</td>\n",
" <td>1976</td>\n",
" <td>0</td>\n",
" <td>98070</td>\n",
" <td>47.4638</td>\n",
" <td>-122.484</td>\n",
" <td>2030</td>\n",
" <td>202554</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5795</th>\n",
" <td>2473370050</td>\n",
" <td>20140604T000000</td>\n",
" <td>327500.0</td>\n",
" <td>4</td>\n",
" <td>1.75</td>\n",
" <td>1650</td>\n",
" <td>7800</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1650</td>\n",
" <td>0</td>\n",
" <td>1968</td>\n",
" <td>0</td>\n",
" <td>98058</td>\n",
" <td>47.4507</td>\n",
" <td>-122.139</td>\n",
" <td>1750</td>\n",
" <td>10400</td>\n",
" <td>low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4956</th>\n",
" <td>9528104985</td>\n",
" <td>20141104T000000</td>\n",
" <td>611000.0</td>\n",
" <td>2</td>\n",
" <td>1.00</td>\n",
" <td>1270</td>\n",
" <td>5100</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1100</td>\n",
" <td>170</td>\n",
" <td>1900</td>\n",
" <td>0</td>\n",
" <td>98115</td>\n",
" <td>47.6771</td>\n",
" <td>-122.328</td>\n",
" <td>1670</td>\n",
" <td>3900</td>\n",
" <td>high</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7723</th>\n",
" <td>3972900025</td>\n",
" <td>20150313T000000</td>\n",
" <td>499000.0</td>\n",
" <td>6</td>\n",
" <td>1.75</td>\n",
" <td>2400</td>\n",
" <td>7500</td>\n",
" <td>1.5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1400</td>\n",
" <td>1000</td>\n",
" <td>1975</td>\n",
" <td>0</td>\n",
" <td>98155</td>\n",
" <td>47.7661</td>\n",
" <td>-122.313</td>\n",
" <td>1980</td>\n",
" <td>7500</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8517</th>\n",
" <td>3876600120</td>\n",
" <td>20150422T000000</td>\n",
" <td>265000.0</td>\n",
" <td>3</td>\n",
" <td>1.50</td>\n",
" <td>1780</td>\n",
" <td>10196</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1270</td>\n",
" <td>510</td>\n",
" <td>1967</td>\n",
" <td>0</td>\n",
" <td>98001</td>\n",
" <td>47.3375</td>\n",
" <td>-122.291</td>\n",
" <td>1320</td>\n",
" <td>7875</td>\n",
" <td>low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6914</th>\n",
" <td>6821600005</td>\n",
" <td>20150403T000000</td>\n",
" <td>710000.0</td>\n",
" <td>4</td>\n",
" <td>1.75</td>\n",
" <td>2120</td>\n",
" <td>5400</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1060</td>\n",
" <td>1060</td>\n",
" <td>1941</td>\n",
" <td>0</td>\n",
" <td>98199</td>\n",
" <td>47.6501</td>\n",
" <td>-122.395</td>\n",
" <td>2052</td>\n",
" <td>6000</td>\n",
" <td>high</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4499</th>\n",
" <td>2767603931</td>\n",
" <td>20140818T000000</td>\n",
" <td>469000.0</td>\n",
" <td>3</td>\n",
" <td>3.25</td>\n",
" <td>1370</td>\n",
" <td>1194</td>\n",
" <td>3.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1370</td>\n",
" <td>0</td>\n",
" <td>2004</td>\n",
" <td>0</td>\n",
" <td>98107</td>\n",
" <td>47.6718</td>\n",
" <td>-122.388</td>\n",
" <td>1800</td>\n",
" <td>2678</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8651</th>\n",
" <td>8802400411</td>\n",
" <td>20140619T000000</td>\n",
" <td>249000.0</td>\n",
" <td>3</td>\n",
" <td>1.00</td>\n",
" <td>1050</td>\n",
" <td>8498</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1050</td>\n",
" <td>0</td>\n",
" <td>1959</td>\n",
" <td>0</td>\n",
" <td>98031</td>\n",
" <td>47.4043</td>\n",
" <td>-122.202</td>\n",
" <td>1050</td>\n",
" <td>8498</td>\n",
" <td>low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4234</th>\n",
" <td>5452800735</td>\n",
" <td>20140722T000000</td>\n",
" <td>780000.0</td>\n",
" <td>4</td>\n",
" <td>2.50</td>\n",
" <td>2270</td>\n",
" <td>13449</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1310</td>\n",
" <td>960</td>\n",
" <td>1975</td>\n",
" <td>0</td>\n",
" <td>98040</td>\n",
" <td>47.5416</td>\n",
" <td>-122.232</td>\n",
" <td>2810</td>\n",
" <td>13475</td>\n",
" <td>high</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2000 rows × 22 columns</p>\n",
"</div>"
],
"text/plain": [
" id date price bedrooms bathrooms sqft_living \\\n",
"5341 6632900574 20150225T000000 595000.0 5 3.00 2980 \n",
"4384 2423029245 20140617T000000 550000.0 3 1.75 2240 \n",
"5795 2473370050 20140604T000000 327500.0 4 1.75 1650 \n",
"4956 9528104985 20141104T000000 611000.0 2 1.00 1270 \n",
"7723 3972900025 20150313T000000 499000.0 6 1.75 2400 \n",
"... ... ... ... ... ... ... \n",
"8517 3876600120 20150422T000000 265000.0 3 1.50 1780 \n",
"6914 6821600005 20150403T000000 710000.0 4 1.75 2120 \n",
"4499 2767603931 20140818T000000 469000.0 3 3.25 1370 \n",
"8651 8802400411 20140619T000000 249000.0 3 1.00 1050 \n",
"4234 5452800735 20140722T000000 780000.0 4 2.50 2270 \n",
"\n",
" sqft_lot floors waterfront view ... sqft_above sqft_basement \\\n",
"5341 10064 1.0 0 0 ... 1680 1300 \n",
"4384 78225 2.0 0 0 ... 2240 0 \n",
"5795 7800 1.0 0 0 ... 1650 0 \n",
"4956 5100 1.0 0 0 ... 1100 170 \n",
"7723 7500 1.5 0 0 ... 1400 1000 \n",
"... ... ... ... ... ... ... ... \n",
"8517 10196 1.0 0 0 ... 1270 510 \n",
"6914 5400 1.0 0 0 ... 1060 1060 \n",
"4499 1194 3.0 0 0 ... 1370 0 \n",
"8651 8498 1.0 0 0 ... 1050 0 \n",
"4234 13449 1.0 0 0 ... 1310 960 \n",
"\n",
" yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
"5341 1940 0 98155 47.7372 -122.316 1590 \n",
"4384 1976 0 98070 47.4638 -122.484 2030 \n",
"5795 1968 0 98058 47.4507 -122.139 1750 \n",
"4956 1900 0 98115 47.6771 -122.328 1670 \n",
"7723 1975 0 98155 47.7661 -122.313 1980 \n",
"... ... ... ... ... ... ... \n",
"8517 1967 0 98001 47.3375 -122.291 1320 \n",
"6914 1941 0 98199 47.6501 -122.395 2052 \n",
"4499 2004 0 98107 47.6718 -122.388 1800 \n",
"8651 1959 0 98031 47.4043 -122.202 1050 \n",
"4234 1975 0 98040 47.5416 -122.232 2810 \n",
"\n",
" sqft_lot15 price_category \n",
"5341 7800 middle \n",
"4384 202554 middle \n",
"5795 10400 low \n",
"4956 3900 high \n",
"7723 7500 middle \n",
"... ... ... \n",
"8517 7875 low \n",
"6914 6000 high \n",
"4499 2678 middle \n",
"8651 8498 low \n",
"4234 13475 high \n",
"\n",
"[2000 rows x 22 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'y_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>price_category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>5341</th>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4384</th>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5795</th>\n",
" <td>low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4956</th>\n",
" <td>high</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7723</th>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8517</th>\n",
" <td>low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6914</th>\n",
" <td>high</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4499</th>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8651</th>\n",
" <td>low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4234</th>\n",
" <td>high</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2000 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" price_category\n",
"5341 middle\n",
"4384 middle\n",
"5795 low\n",
"4956 high\n",
"7723 middle\n",
"... ...\n",
"8517 low\n",
"6914 high\n",
"4499 middle\n",
"8651 low\n",
"4234 high\n",
"\n",
"[2000 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from typing import Tuple\n",
"import pandas as pd\n",
"from pandas import DataFrame\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"def split_stratified_into_train_val_test(\n",
" df_input,\n",
" stratify_colname=\"y\",\n",
" frac_train=0.6,\n",
" frac_val=0.15,\n",
" frac_test=0.25,\n",
" random_state=None,\n",
") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
" \n",
" if frac_train + frac_val + frac_test != 1.0:\n",
" raise ValueError(\n",
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
" % (frac_train, frac_val, frac_test)\n",
" )\n",
" if stratify_colname not in df_input.columns:\n",
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
" X = df_input # Contains all columns.\n",
" y = df_input[\n",
" [stratify_colname]\n",
" ] # Dataframe of just the column on which to stratify.\n",
" # Split original dataframe into train and temp dataframes.\n",
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
" )\n",
" if frac_val <= 0:\n",
" assert len(df_input) == len(df_train) + len(df_temp)\n",
" return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
" # Split the temp dataframe into val and test dataframes.\n",
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
" df_val, df_test, y_val, y_test = train_test_split(\n",
" df_temp,\n",
" y_temp,\n",
" stratify=y_temp,\n",
" test_size=relative_frac_test,\n",
" random_state=random_state,\n",
" )\n",
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
" return df_train, df_val, df_test, y_train, y_val, y_test\n",
"\n",
"X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
" df, stratify_colname=\"price_category\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=42\n",
")\n",
"\n",
"display(\"X_train\", X_train)\n",
"display(\"y_train\", y_train)\n",
"\n",
"display(\"X_test\", X_test)\n",
"display(\"y_test\", y_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Формирование конвейера\n",
"preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n",
"\n",
"preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование\n",
"\n",
"features_preprocessing -- трансформер для предобработки признаков\n",
"\n",
"drop_columns -- трансформер для удаления колонок\n",
"\n",
"pipeline_end -- основной конвейер предобработки данных и конструирования признаков"
]
},
{
"cell_type": "code",
"execution_count": 340,
2024-11-15 16:44:23 +04:00
"metadata": {},
"outputs": [
{
2024-11-15 16:44:46 +04:00
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>condition</th>\n",
" <th>grade</th>\n",
" <th>sqft_above</th>\n",
" <th>...</th>\n",
2024-11-15 16:44:46 +04:00
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" <th>price_h</th>\n",
" <th>price_l</th>\n",
" <th>price_m</th>\n",
" <th>price_vh</th>\n",
2024-11-15 16:44:46 +04:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>-0.451103</td>\n",
" <td>0.916381</td>\n",
" <td>0.700559</td>\n",
" <td>0.573416</td>\n",
" <td>0.081706</td>\n",
" <td>-0.187493</td>\n",
" <td>-0.838739</td>\n",
" <td>0.839159</td>\n",
" <td>-0.512647</td>\n",
" <td>-0.638064</td>\n",
" <td>...</td>\n",
" <td>-0.2158</td>\n",
" <td>-1.349962</td>\n",
" <td>0.32254</td>\n",
" <td>0.340593</td>\n",
" <td>0.223199</td>\n",
" <td>-0.210584</td>\n",
2024-11-15 16:44:46 +04:00
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-11-15 16:44:46 +04:00
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.845014</td>\n",
" <td>-0.589326</td>\n",
" <td>-1.49426</td>\n",
" <td>-0.72971</td>\n",
" <td>-1.191326</td>\n",
" <td>-0.302999</td>\n",
" <td>1.120073</td>\n",
" <td>-0.666734</td>\n",
" <td>-0.512647</td>\n",
" <td>-0.969739</td>\n",
" <td>...</td>\n",
" <td>-0.2158</td>\n",
" <td>0.820656</td>\n",
" <td>0.417588</td>\n",
" <td>-0.601419</td>\n",
" <td>-1.022503</td>\n",
" <td>-0.421966</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
2024-11-15 16:44:46 +04:00
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-0.388708</td>\n",
" <td>-1.184213</td>\n",
" <td>-0.396851</td>\n",
" <td>-1.381273</td>\n",
" <td>-1.060759</td>\n",
" <td>0.101544</td>\n",
" <td>-0.838739</td>\n",
" <td>-0.666734</td>\n",
" <td>-1.369558</td>\n",
" <td>-0.822328</td>\n",
" <td>...</td>\n",
" <td>-0.2158</td>\n",
" <td>0.523819</td>\n",
" <td>-0.059795</td>\n",
" <td>-1.025683</td>\n",
" <td>-0.889035</td>\n",
" <td>-0.208431</td>\n",
" <td>0.0</td>\n",
2024-11-15 16:44:46 +04:00
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-11-15 16:44:46 +04:00
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>-0.74402</td>\n",
" <td>0.051922</td>\n",
" <td>-1.49426</td>\n",
" <td>-1.381273</td>\n",
" <td>-1.32951</td>\n",
" <td>2.686416</td>\n",
" <td>-0.838739</td>\n",
" <td>-0.666734</td>\n",
" <td>-2.22647</td>\n",
" <td>-1.125749</td>\n",
" <td>...</td>\n",
" <td>-0.2158</td>\n",
" <td>-0.144063</td>\n",
" <td>-1.221808</td>\n",
" <td>-1.924549</td>\n",
" <td>-0.889035</td>\n",
" <td>4.682444</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-11-15 16:44:46 +04:00
" <td>1.0</td>\n",
" <td>0.0</td>\n",
2024-11-15 16:44:46 +04:00
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1.018038</td>\n",
" <td>-0.47276</td>\n",
" <td>-0.396851</td>\n",
" <td>0.247635</td>\n",
" <td>-0.320877</td>\n",
" <td>0.608196</td>\n",
" <td>1.120073</td>\n",
" <td>-0.666734</td>\n",
" <td>-0.512647</td>\n",
" <td>0.013003</td>\n",
" <td>...</td>\n",
" <td>-0.2158</td>\n",
" <td>-0.236825</td>\n",
" <td>-0.339221</td>\n",
" <td>2.505062</td>\n",
" <td>-0.103056</td>\n",
" <td>1.375604</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-11-15 16:44:46 +04:00
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>-0.083826</td>\n",
" <td>-0.492858</td>\n",
" <td>-0.396851</td>\n",
" <td>1.550761</td>\n",
" <td>-0.701698</td>\n",
" <td>-0.314672</td>\n",
" <td>3.078884</td>\n",
" <td>-0.666734</td>\n",
" <td>0.344264</td>\n",
" <td>-0.416947</td>\n",
" <td>...</td>\n",
" <td>-0.2158</td>\n",
" <td>0.468162</td>\n",
" <td>0.987875</td>\n",
" <td>-0.903438</td>\n",
" <td>-0.844546</td>\n",
" <td>-0.436854</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-11-15 16:44:46 +04:00
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0.301277</td>\n",
" <td>-0.953091</td>\n",
" <td>-0.396851</td>\n",
" <td>0.573416</td>\n",
" <td>-0.712579</td>\n",
" <td>-0.180574</td>\n",
" <td>-0.838739</td>\n",
" <td>-0.666734</td>\n",
" <td>-0.512647</td>\n",
" <td>-0.773191</td>\n",
" <td>...</td>\n",
" <td>-0.2158</td>\n",
" <td>-0.886155</td>\n",
" <td>-1.293987</td>\n",
" <td>0.254302</td>\n",
" <td>-0.666588</td>\n",
" <td>-0.205992</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>-0.086798</td>\n",
" <td>-1.148038</td>\n",
" <td>-1.49426</td>\n",
" <td>-1.381273</td>\n",
" <td>-1.25661</td>\n",
" <td>-0.232501</td>\n",
" <td>-0.838739</td>\n",
" <td>-0.666734</td>\n",
" <td>-1.369558</td>\n",
" <td>-1.043445</td>\n",
" <td>...</td>\n",
" <td>-0.2158</td>\n",
" <td>0.523819</td>\n",
" <td>-0.249176</td>\n",
" <td>-1.018493</td>\n",
" <td>-1.600865</td>\n",
" <td>-0.296686</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>-0.824567</td>\n",
" <td>-1.148038</td>\n",
" <td>-1.49426</td>\n",
" <td>-1.381273</td>\n",
" <td>-1.0934</td>\n",
" <td>-0.15174</td>\n",
" <td>0.140667</td>\n",
" <td>0.839159</td>\n",
" <td>-0.512647</td>\n",
" <td>-0.859181</td>\n",
" <td>...</td>\n",
" <td>-0.2158</td>\n",
" <td>-1.387066</td>\n",
" <td>-1.937882</td>\n",
" <td>-0.60861</td>\n",
" <td>-0.636929</td>\n",
" <td>-0.137397</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1.647935</td>\n",
" <td>-0.762165</td>\n",
" <td>2.895378</td>\n",
" <td>0.899198</td>\n",
" <td>0.963036</td>\n",
" <td>-0.186442</td>\n",
" <td>-0.838739</td>\n",
" <td>-0.666734</td>\n",
" <td>0.344264</td>\n",
" <td>0.037571</td>\n",
" <td>...</td>\n",
" <td>-0.2158</td>\n",
" <td>-1.016021</td>\n",
" <td>-1.783519</td>\n",
" <td>-0.896247</td>\n",
" <td>0.208369</td>\n",
" <td>-0.186332</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>-1.159614</td>\n",
" <td>-0.581287</td>\n",
" <td>-1.49426</td>\n",
" <td>-1.381273</td>\n",
" <td>-1.321893</td>\n",
" <td>-0.185096</td>\n",
" <td>-0.838739</td>\n",
" <td>0.839159</td>\n",
" <td>-1.369558</td>\n",
" <td>-1.11715</td>\n",
" <td>...</td>\n",
" <td>-0.2158</td>\n",
" <td>-0.830498</td>\n",
" <td>0.837799</td>\n",
" <td>0.304638</td>\n",
" <td>-0.355163</td>\n",
" <td>-0.130796</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>-1.329183</td>\n",
" <td>-0.681775</td>\n",
" <td>-1.49426</td>\n",
" <td>-1.381273</td>\n",
" <td>-1.071639</td>\n",
" <td>-0.200575</td>\n",
" <td>-0.838739</td>\n",
" <td>0.839159</td>\n",
" <td>-0.512647</td>\n",
" <td>-0.834612</td>\n",
" <td>...</td>\n",
" <td>-0.2158</td>\n",
" <td>1.024731</td>\n",
" <td>1.226566</td>\n",
" <td>-1.025683</td>\n",
" <td>-0.444141</td>\n",
" <td>-0.202404</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>0.377864</td>\n",
" <td>0.286926</td>\n",
" <td>0.700559</td>\n",
" <td>0.573416</td>\n",
" <td>0.419005</td>\n",
" <td>0.256379</td>\n",
" <td>1.120073</td>\n",
" <td>-0.666734</td>\n",
" <td>0.344264</td>\n",
" <td>0.848334</td>\n",
" <td>...</td>\n",
" <td>-0.2158</td>\n",
" <td>-0.923259</td>\n",
" <td>1.277306</td>\n",
" <td>-0.169963</td>\n",
" <td>0.742242</td>\n",
" <td>-0.071779</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>0.289882</td>\n",
" <td>-0.88677</td>\n",
" <td>-0.396851</td>\n",
" <td>0.573416</td>\n",
" <td>0.103467</td>\n",
" <td>-0.143853</td>\n",
" <td>-0.838739</td>\n",
" <td>-0.666734</td>\n",
" <td>0.344264</td>\n",
" <td>-0.244967</td>\n",
" <td>...</td>\n",
" <td>-0.2158</td>\n",
" <td>2.045107</td>\n",
" <td>-0.729417</td>\n",
" <td>-0.428836</td>\n",
" <td>-0.043737</td>\n",
" <td>-0.155335</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>1.613049</td>\n",
" <td>0.282907</td>\n",
" <td>-0.396851</td>\n",
" <td>-0.078147</td>\n",
" <td>0.103467</td>\n",
" <td>-0.259422</td>\n",
" <td>-0.838739</td>\n",
" <td>-0.666734</td>\n",
" <td>0.344264</td>\n",
" <td>-0.822328</td>\n",
" <td>...</td>\n",
" <td>-0.2158</td>\n",
" <td>0.727894</td>\n",
" <td>0.868529</td>\n",
" <td>-1.277366</td>\n",
" <td>0.223199</td>\n",
" <td>-0.338303</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>-0.962885</td>\n",
" <td>0.285118</td>\n",
" <td>0.700559</td>\n",
" <td>0.573416</td>\n",
" <td>0.005542</td>\n",
" <td>-0.183813</td>\n",
" <td>-0.838739</td>\n",
" <td>-0.666734</td>\n",
" <td>0.344264</td>\n",
" <td>-0.380094</td>\n",
" <td>...</td>\n",
" <td>-0.2158</td>\n",
" <td>-0.478004</td>\n",
" <td>1.195837</td>\n",
" <td>0.78643</td>\n",
" <td>0.445646</td>\n",
" <td>-0.180592</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>1.722145</td>\n",
" <td>-0.259726</td>\n",
" <td>-0.396851</td>\n",
" <td>-0.403928</td>\n",
" <td>-0.571131</td>\n",
" <td>-0.18865</td>\n",
" <td>-0.838739</td>\n",
" <td>0.839159</td>\n",
" <td>-0.512647</td>\n",
" <td>-0.269535</td>\n",
" <td>...</td>\n",
" <td>-0.2158</td>\n",
" <td>-0.811945</td>\n",
" <td>1.222993</td>\n",
" <td>0.168011</td>\n",
" <td>-0.666588</td>\n",
" <td>-0.213095</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>0.740562</td>\n",
" <td>1.589247</td>\n",
" <td>0.700559</td>\n",
" <td>1.550761</td>\n",
" <td>2.878025</td>\n",
" <td>0.466843</td>\n",
" <td>1.120073</td>\n",
" <td>-0.666734</td>\n",
" <td>2.058087</td>\n",
" <td>2.052192</td>\n",
" <td>...</td>\n",
" <td>-0.2158</td>\n",
" <td>-1.349962</td>\n",
" <td>0.604825</td>\n",
" <td>0.340593</td>\n",
" <td>2.462498</td>\n",
" <td>0.79434</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-11-15 16:44:46 +04:00
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>-1.555659</td>\n",
" <td>-0.922945</td>\n",
" <td>-0.396851</td>\n",
" <td>-1.381273</td>\n",
" <td>-0.799624</td>\n",
" <td>-0.107784</td>\n",
" <td>-0.838739</td>\n",
" <td>-0.666734</td>\n",
" <td>-0.512647</td>\n",
" <td>-0.527505</td>\n",
" <td>...</td>\n",
" <td>-0.2158</td>\n",
" <td>1.432881</td>\n",
" <td>1.536008</td>\n",
" <td>-0.644564</td>\n",
" <td>-0.978014</td>\n",
" <td>-0.183354</td>\n",
" <td>0.0</td>\n",
2024-11-15 16:44:46 +04:00
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>-0.953738</td>\n",
" <td>0.142224</td>\n",
" <td>2.895378</td>\n",
" <td>1.224979</td>\n",
" <td>0.886872</td>\n",
" <td>4.00146</td>\n",
" <td>1.120073</td>\n",
" <td>-0.666734</td>\n",
" <td>-0.512647</td>\n",
" <td>0.713207</td>\n",
" <td>...</td>\n",
" <td>4.605736</td>\n",
" <td>-0.663527</td>\n",
" <td>-1.135335</td>\n",
" <td>0.85834</td>\n",
" <td>0.593944</td>\n",
" <td>1.659169</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
2024-11-15 16:44:46 +04:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>20 rows × 22 columns</p>\n",
2024-11-15 16:44:46 +04:00
"</div>"
],
"text/plain": [
" id price bedrooms bathrooms sqft_living sqft_lot floors \\\n",
"0 -0.451103 0.916381 0.700559 0.573416 0.081706 -0.187493 -0.838739 \n",
"1 1.845014 -0.589326 -1.49426 -0.72971 -1.191326 -0.302999 1.120073 \n",
"2 -0.388708 -1.184213 -0.396851 -1.381273 -1.060759 0.101544 -0.838739 \n",
"3 -0.74402 0.051922 -1.49426 -1.381273 -1.32951 2.686416 -0.838739 \n",
"4 1.018038 -0.47276 -0.396851 0.247635 -0.320877 0.608196 1.120073 \n",
"5 -0.083826 -0.492858 -0.396851 1.550761 -0.701698 -0.314672 3.078884 \n",
"6 0.301277 -0.953091 -0.396851 0.573416 -0.712579 -0.180574 -0.838739 \n",
"7 -0.086798 -1.148038 -1.49426 -1.381273 -1.25661 -0.232501 -0.838739 \n",
"8 -0.824567 -1.148038 -1.49426 -1.381273 -1.0934 -0.15174 0.140667 \n",
"9 1.647935 -0.762165 2.895378 0.899198 0.963036 -0.186442 -0.838739 \n",
"10 -1.159614 -0.581287 -1.49426 -1.381273 -1.321893 -0.185096 -0.838739 \n",
"11 -1.329183 -0.681775 -1.49426 -1.381273 -1.071639 -0.200575 -0.838739 \n",
"12 0.377864 0.286926 0.700559 0.573416 0.419005 0.256379 1.120073 \n",
"13 0.289882 -0.88677 -0.396851 0.573416 0.103467 -0.143853 -0.838739 \n",
"14 1.613049 0.282907 -0.396851 -0.078147 0.103467 -0.259422 -0.838739 \n",
"15 -0.962885 0.285118 0.700559 0.573416 0.005542 -0.183813 -0.838739 \n",
"16 1.722145 -0.259726 -0.396851 -0.403928 -0.571131 -0.18865 -0.838739 \n",
"17 0.740562 1.589247 0.700559 1.550761 2.878025 0.466843 1.120073 \n",
"18 -1.555659 -0.922945 -0.396851 -1.381273 -0.799624 -0.107784 -0.838739 \n",
"19 -0.953738 0.142224 2.895378 1.224979 0.886872 4.00146 1.120073 \n",
2024-11-15 16:44:46 +04:00
"\n",
" condition grade sqft_above ... yr_renovated zipcode lat \\\n",
"0 0.839159 -0.512647 -0.638064 ... -0.2158 -1.349962 0.32254 \n",
"1 -0.666734 -0.512647 -0.969739 ... -0.2158 0.820656 0.417588 \n",
"2 -0.666734 -1.369558 -0.822328 ... -0.2158 0.523819 -0.059795 \n",
"3 -0.666734 -2.22647 -1.125749 ... -0.2158 -0.144063 -1.221808 \n",
"4 -0.666734 -0.512647 0.013003 ... -0.2158 -0.236825 -0.339221 \n",
"5 -0.666734 0.344264 -0.416947 ... -0.2158 0.468162 0.987875 \n",
"6 -0.666734 -0.512647 -0.773191 ... -0.2158 -0.886155 -1.293987 \n",
"7 -0.666734 -1.369558 -1.043445 ... -0.2158 0.523819 -0.249176 \n",
"8 0.839159 -0.512647 -0.859181 ... -0.2158 -1.387066 -1.937882 \n",
"9 -0.666734 0.344264 0.037571 ... -0.2158 -1.016021 -1.783519 \n",
"10 0.839159 -1.369558 -1.11715 ... -0.2158 -0.830498 0.837799 \n",
"11 0.839159 -0.512647 -0.834612 ... -0.2158 1.024731 1.226566 \n",
"12 -0.666734 0.344264 0.848334 ... -0.2158 -0.923259 1.277306 \n",
"13 -0.666734 0.344264 -0.244967 ... -0.2158 2.045107 -0.729417 \n",
"14 -0.666734 0.344264 -0.822328 ... -0.2158 0.727894 0.868529 \n",
"15 -0.666734 0.344264 -0.380094 ... -0.2158 -0.478004 1.195837 \n",
"16 0.839159 -0.512647 -0.269535 ... -0.2158 -0.811945 1.222993 \n",
"17 -0.666734 2.058087 2.052192 ... -0.2158 -1.349962 0.604825 \n",
"18 -0.666734 -0.512647 -0.527505 ... -0.2158 1.432881 1.536008 \n",
"19 -0.666734 -0.512647 0.713207 ... 4.605736 -0.663527 -1.135335 \n",
2024-11-15 16:44:46 +04:00
"\n",
" long sqft_living15 sqft_lot15 price_h price_l price_m price_vh \n",
"0 0.340593 0.223199 -0.210584 1.0 0.0 0.0 0.0 \n",
"1 -0.601419 -1.022503 -0.421966 0.0 0.0 1.0 0.0 \n",
"2 -1.025683 -0.889035 -0.208431 0.0 1.0 0.0 0.0 \n",
"3 -1.924549 -0.889035 4.682444 0.0 0.0 1.0 0.0 \n",
"4 2.505062 -0.103056 1.375604 0.0 0.0 1.0 0.0 \n",
"5 -0.903438 -0.844546 -0.436854 0.0 0.0 1.0 0.0 \n",
"6 0.254302 -0.666588 -0.205992 0.0 1.0 0.0 0.0 \n",
"7 -1.018493 -1.600865 -0.296686 0.0 1.0 0.0 0.0 \n",
"8 -0.60861 -0.636929 -0.137397 0.0 1.0 0.0 0.0 \n",
"9 -0.896247 0.208369 -0.186332 0.0 1.0 0.0 0.0 \n",
"10 0.304638 -0.355163 -0.130796 0.0 0.0 1.0 0.0 \n",
"11 -1.025683 -0.444141 -0.202404 0.0 1.0 0.0 0.0 \n",
"12 -0.169963 0.742242 -0.071779 0.0 0.0 1.0 0.0 \n",
"13 -0.428836 -0.043737 -0.155335 0.0 1.0 0.0 0.0 \n",
"14 -1.277366 0.223199 -0.338303 0.0 0.0 1.0 0.0 \n",
"15 0.78643 0.445646 -0.180592 0.0 0.0 1.0 0.0 \n",
"16 0.168011 -0.666588 -0.213095 0.0 0.0 1.0 0.0 \n",
"17 0.340593 2.462498 0.79434 0.0 0.0 0.0 1.0 \n",
"18 -0.644564 -0.978014 -0.183354 0.0 1.0 0.0 0.0 \n",
"19 0.85834 0.593944 1.659169 0.0 0.0 1.0 0.0 \n",
2024-11-15 16:44:46 +04:00
"\n",
"[20 rows x 22 columns]"
2024-11-15 16:44:46 +04:00
]
},
"execution_count": 340,
2024-11-15 16:44:46 +04:00
"metadata": {},
"output_type": "execute_result"
2024-11-15 16:44:23 +04:00
}
],
"source": [
"import numpy as np\n",
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.ensemble import RandomForestRegressor  # example regression model\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.pipeline import Pipeline, make_pipeline\n",
"from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
"\n",
"random_state = 42\n",
"\n",
"# Columns removed from the feature matrix before modelling.\n",
"columns_to_drop = [\"date\", \"view\", \"waterfront\"]\n",
"\n",
"# Split the remaining columns by dtype: object/category columns are treated as\n",
"# categorical, everything else as numeric. A dropped column lands in neither list.\n",
"categorical_dtype_columns = set(\n",
"    df.select_dtypes(include=[\"object\", \"category\"]).columns\n",
")\n",
"kept_columns = [column for column in df.columns if column not in columns_to_drop]\n",
"num_columns = [\n",
"    column for column in kept_columns if column not in categorical_dtype_columns\n",
"]\n",
"cat_columns = [column for column in kept_columns if column in categorical_dtype_columns]\n",
"# NOTE(review): \"price\" and \"price_category\" are not excluded here and therefore\n",
"# stay in the feature matrix -- confirm neither is (or is derived from) the target.\n",
"\n",
"# Numeric branch: fill gaps with the median, then standardise.\n",
"num_imputer = SimpleImputer(strategy=\"median\")\n",
"num_scaler = StandardScaler()\n",
"preprocessing_num = Pipeline(\n",
"    [\n",
"        (\"imputer\", num_imputer),\n",
"        (\"scaler\", num_scaler),\n",
"    ]\n",
")\n",
"\n",
"# Categorical branch: fill gaps with a constant placeholder, then one-hot encode.\n",
"# sparse_output=False is required for the pandas output configured below.\n",
"cat_imputer = SimpleImputer(strategy=\"constant\")\n",
"cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False)\n",
"preprocessing_cat = Pipeline(\n",
"    [\n",
"        (\"imputer\", cat_imputer),\n",
"        (\"encoder\", cat_encoder),\n",
"    ]\n",
")\n",
"\n",
"# Each column is routed to exactly one branch; columns in neither list\n",
"# (the ones scheduled for removal) pass through untouched to the next stage.\n",
"features_preprocessing = ColumnTransformer(\n",
"    verbose_feature_names_out=False,\n",
"    transformers=[\n",
"        (\"prepocessing_num\", preprocessing_num, num_columns),\n",
"        (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n",
"    ],\n",
"    remainder=\"passthrough\",\n",
")\n",
"\n",
"drop_columns = ColumnTransformer(\n",
"    verbose_feature_names_out=False,\n",
"    transformers=[\n",
"        (\"drop_columns\", \"drop\", columns_to_drop),\n",
"    ],\n",
"    remainder=\"passthrough\",\n",
")\n",
"\n",
"# Pandas output keeps column names between stages, so the name-based selection\n",
"# in drop_columns keeps working after the first stage has run.\n",
"pipeline_end = Pipeline(\n",
"    [\n",
"        (\"features_preprocessing\", features_preprocessing),\n",
"        (\"drop_columns\", drop_columns),\n",
"    ]\n",
").set_output(transform=\"pandas\")\n",
"\n",
"preprocessing_result = pipeline_end.fit_transform(X_train)\n",
"assert not set(columns_to_drop) & set(preprocessing_result.columns)\n",
"\n",
"# Shorten the generated one-hot names (price_category_h -> price_h, ...) for display.\n",
"preprocessing_result = preprocessing_result.rename(\n",
"    columns=lambda name: name.replace(\"price_category_\", \"price_\")\n",
")\n",
"preprocessing_result.head(20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Формирование набора моделей для классификации\n",
"logistic -- логистическая регрессия\n",
"\n",
"ridge -- логистическая регрессия с L2-регуляризацией (гребневый штраф) и балансировкой классов\n",
"\n",
"decision_tree -- дерево решений\n",
"\n",
"knn -- k-ближайших соседей\n",
"\n",
"naive_bayes -- наивный Байесовский классификатор\n",
"\n",
"gradient_boosting -- метод градиентного бустинга (набор деревьев решений)\n",
"\n",
"random_forest -- метод случайного леса (набор деревьев решений)\n",
"\n",
"mlp -- многослойный персептрон (нейронная сеть)"
]
},
{
"cell_type": "code",
"execution_count": 341,
"metadata": {},
"outputs": [],
"source": [
"from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree\n",
"\n",
"# Candidate classifiers keyed by a short name. Each value is a dict so that the\n",
"# evaluation loop can store the fitted pipeline and its metrics next to the model.\n",
"class_models = {\n",
"    \"logistic\": {\"model\": linear_model.LogisticRegression()},\n",
"    # RidgeClassifierCV exposes no predict_proba, so an L2-penalised logistic\n",
"    # regression with balanced class weights is used under the \"ridge\" key.\n",
"    \"ridge\": {\n",
"        \"model\": linear_model.LogisticRegression(penalty=\"l2\", class_weight=\"balanced\")\n",
"    },\n",
"    \"decision_tree\": {\n",
"        \"model\": tree.DecisionTreeClassifier(max_depth=7, random_state=random_state)\n",
"    },\n",
"    \"knn\": {\"model\": neighbors.KNeighborsClassifier(n_neighbors=7)},\n",
"    \"naive_bayes\": {\"model\": naive_bayes.GaussianNB()},\n",
"    # Seeded like the other tree-based models so that reruns are reproducible.\n",
"    \"gradient_boosting\": {\n",
"        \"model\": ensemble.GradientBoostingClassifier(\n",
"            n_estimators=210, random_state=random_state\n",
"        )\n",
"    },\n",
"    \"random_forest\": {\n",
"        \"model\": ensemble.RandomForestClassifier(\n",
"            max_depth=11, class_weight=\"balanced\", random_state=random_state\n",
"        )\n",
"    },\n",
"    \"mlp\": {\n",
"        \"model\": neural_network.MLPClassifier(\n",
"            hidden_layer_sizes=(7,),\n",
"            max_iter=500,\n",
"            early_stopping=True,\n",
"            random_state=random_state,\n",
"        )\n",
"    },\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Обучение моделей на обучающем наборе данных и оценка на тестовом"
]
},
{
"cell_type": "code",
"execution_count": 343,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: logistic\n"
]
},
{
"ename": "ValueError",
"evalue": "Specifying the columns using strings is only supported for dataframes.",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\utils\\_indexing.py:338\u001b[0m, in \u001b[0;36m_get_column_indices\u001b[1;34m(X, key)\u001b[0m\n\u001b[0;32m 337\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 338\u001b[0m all_columns \u001b[38;5;241m=\u001b[39m \u001b[43mX\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\n\u001b[0;32m 339\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m:\n",
"\u001b[1;31mAttributeError\u001b[0m: 'numpy.ndarray' object has no attribute 'columns'",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[343], line 9\u001b[0m\n\u001b[0;32m 6\u001b[0m model \u001b[38;5;241m=\u001b[39m class_models[model_name][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m 8\u001b[0m model_pipeline \u001b[38;5;241m=\u001b[39m Pipeline([(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpipeline\u001b[39m\u001b[38;5;124m\"\u001b[39m, pipeline_end), (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m\"\u001b[39m, model)])\n\u001b[1;32m----> 9\u001b[0m model_pipeline \u001b[38;5;241m=\u001b[39m \u001b[43mmodel_pipeline\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mravel\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 11\u001b[0m y_train_predict \u001b[38;5;241m=\u001b[39m model_pipeline\u001b[38;5;241m.\u001b[39mpredict(X_train)\n\u001b[0;32m 12\u001b[0m y_test_probs \u001b[38;5;241m=\u001b[39m model_pipeline\u001b[38;5;241m.\u001b[39mpredict_proba(X_test)[:, \u001b[38;5;241m1\u001b[39m]\n",
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\pipeline.py:469\u001b[0m, in \u001b[0;36mPipeline.fit\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 426\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Fit the model.\u001b[39;00m\n\u001b[0;32m 427\u001b[0m \n\u001b[0;32m 428\u001b[0m \u001b[38;5;124;03mFit all the transformers one after the other and sequentially transform the\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 466\u001b[0m \u001b[38;5;124;03m Pipeline with fitted steps.\u001b[39;00m\n\u001b[0;32m 467\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 468\u001b[0m routed_params \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_method_params(method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit\u001b[39m\u001b[38;5;124m\"\u001b[39m, props\u001b[38;5;241m=\u001b[39mparams)\n\u001b[1;32m--> 469\u001b[0m Xt \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrouted_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 470\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipeline\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_log_message(\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m)):\n\u001b[0;32m 471\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_final_estimator \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpassthrough\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n",
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\pipeline.py:406\u001b[0m, in \u001b[0;36mPipeline._fit\u001b[1;34m(self, X, y, routed_params)\u001b[0m\n\u001b[0;32m 404\u001b[0m cloned_transformer \u001b[38;5;241m=\u001b[39m clone(transformer)\n\u001b[0;32m 405\u001b[0m \u001b[38;5;66;03m# Fit or load from cache the current transformer\u001b[39;00m\n\u001b[1;32m--> 406\u001b[0m X, fitted_transformer \u001b[38;5;241m=\u001b[39m \u001b[43mfit_transform_one_cached\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 407\u001b[0m \u001b[43m \u001b[49m\u001b[43mcloned_transformer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 408\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 409\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 410\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 411\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage_clsname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPipeline\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 412\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_log_message\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstep_idx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 413\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrouted_params\u001b[49m\u001b[43m[\u001b[49m\u001b[43mname\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 414\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 415\u001b[0m \u001b[38;5;66;03m# Replace the transformer of the step with the fitted\u001b[39;00m\n\u001b[0;32m 416\u001b[0m \u001b[38;5;66;03m# transformer. 
This is necessary when loading the transformer\u001b[39;00m\n\u001b[0;32m 417\u001b[0m \u001b[38;5;66;03m# from the cache.\u001b[39;00m\n\u001b[0;32m 418\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps[step_idx] \u001b[38;5;241m=\u001b[39m (name, fitted_transformer)\n",
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\joblib\\memory.py:312\u001b[0m, in \u001b[0;36mNotMemorizedFunc.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 311\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 312\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\pipeline.py:1310\u001b[0m, in \u001b[0;36m_fit_transform_one\u001b[1;34m(transformer, X, y, weight, message_clsname, message, params)\u001b[0m\n\u001b[0;32m 1308\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(message_clsname, message):\n\u001b[0;32m 1309\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(transformer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m-> 1310\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mtransformer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfit_transform\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1311\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1312\u001b[0m res \u001b[38;5;241m=\u001b[39m transformer\u001b[38;5;241m.\u001b[39mfit(X, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit\u001b[39m\u001b[38;5;124m\"\u001b[39m, {}))\u001b[38;5;241m.\u001b[39mtransform(\n\u001b[0;32m 1313\u001b[0m X, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtransform\u001b[39m\u001b[38;5;124m\"\u001b[39m, {})\n\u001b[0;32m 1314\u001b[0m )\n",
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\pipeline.py:533\u001b[0m, in \u001b[0;36mPipeline.fit_transform\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 490\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Fit the model and transform with the final estimator.\u001b[39;00m\n\u001b[0;32m 491\u001b[0m \n\u001b[0;32m 492\u001b[0m \u001b[38;5;124;03mFit all the transformers one after the other and sequentially transform\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 530\u001b[0m \u001b[38;5;124;03m Transformed samples.\u001b[39;00m\n\u001b[0;32m 531\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 532\u001b[0m routed_params \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_method_params(method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m, props\u001b[38;5;241m=\u001b[39mparams)\n\u001b[1;32m--> 533\u001b[0m Xt \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrouted_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 535\u001b[0m last_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_final_estimator\n\u001b[0;32m 536\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipeline\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_log_message(\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m)):\n",
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\pipeline.py:406\u001b[0m, in \u001b[0;36mPipeline._fit\u001b[1;34m(self, X, y, routed_params)\u001b[0m\n\u001b[0;32m 404\u001b[0m cloned_transformer \u001b[38;5;241m=\u001b[39m clone(transformer)\n\u001b[0;32m 405\u001b[0m \u001b[38;5;66;03m# Fit or load from cache the current transformer\u001b[39;00m\n\u001b[1;32m--> 406\u001b[0m X, fitted_transformer \u001b[38;5;241m=\u001b[39m \u001b[43mfit_transform_one_cached\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 407\u001b[0m \u001b[43m \u001b[49m\u001b[43mcloned_transformer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 408\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 409\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 410\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 411\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage_clsname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPipeline\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 412\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_log_message\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstep_idx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 413\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrouted_params\u001b[49m\u001b[43m[\u001b[49m\u001b[43mname\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 414\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 415\u001b[0m \u001b[38;5;66;03m# Replace the transformer of the step with the fitted\u001b[39;00m\n\u001b[0;32m 416\u001b[0m \u001b[38;5;66;03m# transformer. 
This is necessary when loading the transformer\u001b[39;00m\n\u001b[0;32m 417\u001b[0m \u001b[38;5;66;03m# from the cache.\u001b[39;00m\n\u001b[0;32m 418\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps[step_idx] \u001b[38;5;241m=\u001b[39m (name, fitted_transformer)\n",
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\joblib\\memory.py:312\u001b[0m, in \u001b[0;36mNotMemorizedFunc.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 311\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 312\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\pipeline.py:1310\u001b[0m, in \u001b[0;36m_fit_transform_one\u001b[1;34m(transformer, X, y, weight, message_clsname, message, params)\u001b[0m\n\u001b[0;32m 1308\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(message_clsname, message):\n\u001b[0;32m 1309\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(transformer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m-> 1310\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mtransformer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfit_transform\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1311\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1312\u001b[0m res \u001b[38;5;241m=\u001b[39m transformer\u001b[38;5;241m.\u001b[39mfit(X, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit\u001b[39m\u001b[38;5;124m\"\u001b[39m, {}))\u001b[38;5;241m.\u001b[39mtransform(\n\u001b[0;32m 1313\u001b[0m X, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtransform\u001b[39m\u001b[38;5;124m\"\u001b[39m, {})\n\u001b[0;32m 1314\u001b[0m )\n",
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\utils\\_set_output.py:316\u001b[0m, in \u001b[0;36m_wrap_method_output.<locals>.wrapped\u001b[1;34m(self, X, *args, **kwargs)\u001b[0m\n\u001b[0;32m 314\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(f)\n\u001b[0;32m 315\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrapped\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 316\u001b[0m data_to_wrap \u001b[38;5;241m=\u001b[39m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_to_wrap, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[0;32m 318\u001b[0m \u001b[38;5;66;03m# only wrap the first output for cross decomposition\u001b[39;00m\n\u001b[0;32m 319\u001b[0m return_tuple \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 320\u001b[0m _wrap_data_with_container(method, data_to_wrap[\u001b[38;5;241m0\u001b[39m], X, \u001b[38;5;28mself\u001b[39m),\n\u001b[0;32m 321\u001b[0m \u001b[38;5;241m*\u001b[39mdata_to_wrap[\u001b[38;5;241m1\u001b[39m:],\n\u001b[0;32m 322\u001b[0m )\n",
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py:968\u001b[0m, in \u001b[0;36mColumnTransformer.fit_transform\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 965\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_transformers()\n\u001b[0;32m 966\u001b[0m n_samples \u001b[38;5;241m=\u001b[39m _num_samples(X)\n\u001b[1;32m--> 968\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_column_callables\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 969\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_remainder(X)\n\u001b[0;32m 971\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _routing_enabled():\n",
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py:536\u001b[0m, in \u001b[0;36mColumnTransformer._validate_column_callables\u001b[1;34m(self, X)\u001b[0m\n\u001b[0;32m 534\u001b[0m columns \u001b[38;5;241m=\u001b[39m columns(X)\n\u001b[0;32m 535\u001b[0m all_columns\u001b[38;5;241m.\u001b[39mappend(columns)\n\u001b[1;32m--> 536\u001b[0m transformer_to_input_indices[name] \u001b[38;5;241m=\u001b[39m \u001b[43m_get_column_indices\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 538\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_columns \u001b[38;5;241m=\u001b[39m all_columns\n\u001b[0;32m 539\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_transformer_to_input_indices \u001b[38;5;241m=\u001b[39m transformer_to_input_indices\n",
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\utils\\_indexing.py:340\u001b[0m, in \u001b[0;36m_get_column_indices\u001b[1;34m(X, key)\u001b[0m\n\u001b[0;32m 338\u001b[0m all_columns \u001b[38;5;241m=\u001b[39m X\u001b[38;5;241m.\u001b[39mcolumns\n\u001b[0;32m 339\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m:\n\u001b[1;32m--> 340\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 341\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSpecifying the columns using strings is only supported for dataframes.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 342\u001b[0m )\n\u001b[0;32m 343\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m 344\u001b[0m columns \u001b[38;5;241m=\u001b[39m [key]\n",
"\u001b[1;31mValueError\u001b[0m: Specifying the columns using strings is only supported for dataframes."
]
}
],
"source": [
"import numpy as np\n",
"from sklearn import metrics\n",
"\n",
"# Fit every candidate model behind the shared preprocessing pipeline and store\n",
"# the fitted pipeline, test predictions and quality metrics in its entry.\n",
"for model_name, model_entry in class_models.items():\n",
"    print(f\"Model: {model_name}\")\n",
"    model = model_entry[\"model\"]\n",
"\n",
"    model_pipeline = Pipeline([(\"pipeline\", pipeline_end), (\"model\", model)])\n",
"    # Fit on the DataFrame itself: the preprocessing selects columns by name,\n",
"    # which is impossible once X_train is reduced to a bare ndarray via .values.\n",
"    model_pipeline = model_pipeline.fit(X_train, y_train.values.ravel())\n",
"\n",
"    y_train_predict = model_pipeline.predict(X_train)\n",
"    # Column 1 is the probability of the second class in classes_ order.\n",
"    # NOTE(review): this and the 0.5 threshold assume a binary 0/1 target -- confirm.\n",
"    y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]\n",
"    y_test_predict = np.where(y_test_probs > 0.5, 1, 0)\n",
"\n",
"    model_entry.update(\n",
"        {\n",
"            \"pipeline\": model_pipeline,\n",
"            \"probs\": y_test_probs,\n",
"            \"preds\": y_test_predict,\n",
"            \"Precision_train\": metrics.precision_score(y_train, y_train_predict),\n",
"            \"Precision_test\": metrics.precision_score(y_test, y_test_predict),\n",
"            \"Recall_train\": metrics.recall_score(y_train, y_train_predict),\n",
"            \"Recall_test\": metrics.recall_score(y_test, y_test_predict),\n",
"            \"Accuracy_train\": metrics.accuracy_score(y_train, y_train_predict),\n",
"            \"Accuracy_test\": metrics.accuracy_score(y_test, y_test_predict),\n",
"            \"ROC_AUC_test\": metrics.roc_auc_score(y_test, y_test_probs),\n",
"            \"F1_train\": metrics.f1_score(y_train, y_train_predict),\n",
"            \"F1_test\": metrics.f1_score(y_test, y_test_predict),\n",
"            \"MCC_test\": metrics.matthews_corrcoef(y_test, y_test_predict),\n",
"            \"Cohen_kappa_test\": metrics.cohen_kappa_score(y_test, y_test_predict),\n",
"            \"Confusion_matrix\": metrics.confusion_matrix(y_test, y_test_predict),\n",
"        }\n",
"    )"
2024-11-15 16:44:23 +04:00
]
}
],
"metadata": {
"kernelspec": {
"display_name": "kernel",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}