diff --git a/lab_4/lab_4.ipynb b/lab_4/lab_4.ipynb
index e69de29..e7d76c9 100644
--- a/lab_4/lab_4.ipynb
+++ b/lab_4/lab_4.ipynb
@@ -0,0 +1,2289 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 112,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date | \n",
+ " price | \n",
+ " bedrooms | \n",
+ " bathrooms | \n",
+ " sqft_living | \n",
+ " sqft_lot | \n",
+ " floors | \n",
+ " waterfront | \n",
+ " view | \n",
+ " ... | \n",
+ " grade | \n",
+ " sqft_above | \n",
+ " sqft_basement | \n",
+ " yr_built | \n",
+ " yr_renovated | \n",
+ " zipcode | \n",
+ " lat | \n",
+ " long | \n",
+ " sqft_living15 | \n",
+ " sqft_lot15 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 7129300520 | \n",
+ " 20141013T000000 | \n",
+ " 221900.0 | \n",
+ " 3 | \n",
+ " 1.00 | \n",
+ " 1180 | \n",
+ " 5650 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 7 | \n",
+ " 1180 | \n",
+ " 0 | \n",
+ " 1955 | \n",
+ " 0 | \n",
+ " 98178 | \n",
+ " 47.5112 | \n",
+ " -122.257 | \n",
+ " 1340 | \n",
+ " 5650 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 6414100192 | \n",
+ " 20141209T000000 | \n",
+ " 538000.0 | \n",
+ " 3 | \n",
+ " 2.25 | \n",
+ " 2570 | \n",
+ " 7242 | \n",
+ " 2.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 7 | \n",
+ " 2170 | \n",
+ " 400 | \n",
+ " 1951 | \n",
+ " 1991 | \n",
+ " 98125 | \n",
+ " 47.7210 | \n",
+ " -122.319 | \n",
+ " 1690 | \n",
+ " 7639 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 5631500400 | \n",
+ " 20150225T000000 | \n",
+ " 180000.0 | \n",
+ " 2 | \n",
+ " 1.00 | \n",
+ " 770 | \n",
+ " 10000 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 6 | \n",
+ " 770 | \n",
+ " 0 | \n",
+ " 1933 | \n",
+ " 0 | \n",
+ " 98028 | \n",
+ " 47.7379 | \n",
+ " -122.233 | \n",
+ " 2720 | \n",
+ " 8062 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2487200875 | \n",
+ " 20141209T000000 | \n",
+ " 604000.0 | \n",
+ " 4 | \n",
+ " 3.00 | \n",
+ " 1960 | \n",
+ " 5000 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 7 | \n",
+ " 1050 | \n",
+ " 910 | \n",
+ " 1965 | \n",
+ " 0 | \n",
+ " 98136 | \n",
+ " 47.5208 | \n",
+ " -122.393 | \n",
+ " 1360 | \n",
+ " 5000 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1954400510 | \n",
+ " 20150218T000000 | \n",
+ " 510000.0 | \n",
+ " 3 | \n",
+ " 2.00 | \n",
+ " 1680 | \n",
+ " 8080 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 8 | \n",
+ " 1680 | \n",
+ " 0 | \n",
+ " 1987 | \n",
+ " 0 | \n",
+ " 98074 | \n",
+ " 47.6168 | \n",
+ " -122.045 | \n",
+ " 1800 | \n",
+ " 7503 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 9995 | \n",
+ " 322059264 | \n",
+ " 20140926T000000 | \n",
+ " 279000.0 | \n",
+ " 2 | \n",
+ " 1.00 | \n",
+ " 1020 | \n",
+ " 47044 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 7 | \n",
+ " 1020 | \n",
+ " 0 | \n",
+ " 1904 | \n",
+ " 1958 | \n",
+ " 98042 | \n",
+ " 47.4206 | \n",
+ " -122.155 | \n",
+ " 1930 | \n",
+ " 12139 | \n",
+ "
\n",
+ " \n",
+ " 9996 | \n",
+ " 5557500270 | \n",
+ " 20150209T000000 | \n",
+ " 262000.0 | \n",
+ " 3 | \n",
+ " 1.50 | \n",
+ " 1700 | \n",
+ " 9579 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 7 | \n",
+ " 1100 | \n",
+ " 600 | \n",
+ " 1962 | \n",
+ " 0 | \n",
+ " 98023 | \n",
+ " 47.3209 | \n",
+ " -122.338 | \n",
+ " 1700 | \n",
+ " 9628 | \n",
+ "
\n",
+ " \n",
+ " 9997 | \n",
+ " 9164100125 | \n",
+ " 20140807T000000 | \n",
+ " 533000.0 | \n",
+ " 4 | \n",
+ " 1.00 | \n",
+ " 1550 | \n",
+ " 4750 | \n",
+ " 1.5 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 7 | \n",
+ " 1550 | \n",
+ " 0 | \n",
+ " 1919 | \n",
+ " 0 | \n",
+ " 98117 | \n",
+ " 47.6824 | \n",
+ " -122.389 | \n",
+ " 1320 | \n",
+ " 4750 | \n",
+ "
\n",
+ " \n",
+ " 9998 | \n",
+ " 7370600045 | \n",
+ " 20150402T000000 | \n",
+ " 640000.0 | \n",
+ " 3 | \n",
+ " 1.75 | \n",
+ " 1680 | \n",
+ " 8100 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " ... | \n",
+ " 8 | \n",
+ " 1680 | \n",
+ " 0 | \n",
+ " 1950 | \n",
+ " 0 | \n",
+ " 98177 | \n",
+ " 47.7212 | \n",
+ " -122.364 | \n",
+ " 1880 | \n",
+ " 7750 | \n",
+ "
\n",
+ " \n",
+ " 9999 | \n",
+ " 8594400060 | \n",
+ " 20140609T000000 | \n",
+ " 285000.0 | \n",
+ " 3 | \n",
+ " 2.25 | \n",
+ " 1680 | \n",
+ " 35127 | \n",
+ " 2.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 7 | \n",
+ " 1680 | \n",
+ " 0 | \n",
+ " 1987 | \n",
+ " 0 | \n",
+ " 98092 | \n",
+ " 47.3025 | \n",
+ " -122.067 | \n",
+ " 1820 | \n",
+ " 35166 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
10000 rows × 21 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date price bedrooms bathrooms sqft_living \\\n",
+ "0 7129300520 20141013T000000 221900.0 3 1.00 1180 \n",
+ "1 6414100192 20141209T000000 538000.0 3 2.25 2570 \n",
+ "2 5631500400 20150225T000000 180000.0 2 1.00 770 \n",
+ "3 2487200875 20141209T000000 604000.0 4 3.00 1960 \n",
+ "4 1954400510 20150218T000000 510000.0 3 2.00 1680 \n",
+ "... ... ... ... ... ... ... \n",
+ "9995 322059264 20140926T000000 279000.0 2 1.00 1020 \n",
+ "9996 5557500270 20150209T000000 262000.0 3 1.50 1700 \n",
+ "9997 9164100125 20140807T000000 533000.0 4 1.00 1550 \n",
+ "9998 7370600045 20150402T000000 640000.0 3 1.75 1680 \n",
+ "9999 8594400060 20140609T000000 285000.0 3 2.25 1680 \n",
+ "\n",
+ " sqft_lot floors waterfront view ... grade sqft_above \\\n",
+ "0 5650 1.0 0 0 ... 7 1180 \n",
+ "1 7242 2.0 0 0 ... 7 2170 \n",
+ "2 10000 1.0 0 0 ... 6 770 \n",
+ "3 5000 1.0 0 0 ... 7 1050 \n",
+ "4 8080 1.0 0 0 ... 8 1680 \n",
+ "... ... ... ... ... ... ... ... \n",
+ "9995 47044 1.0 0 0 ... 7 1020 \n",
+ "9996 9579 1.0 0 0 ... 7 1100 \n",
+ "9997 4750 1.5 0 0 ... 7 1550 \n",
+ "9998 8100 1.0 0 2 ... 8 1680 \n",
+ "9999 35127 2.0 0 0 ... 7 1680 \n",
+ "\n",
+ " sqft_basement yr_built yr_renovated zipcode lat long \\\n",
+ "0 0 1955 0 98178 47.5112 -122.257 \n",
+ "1 400 1951 1991 98125 47.7210 -122.319 \n",
+ "2 0 1933 0 98028 47.7379 -122.233 \n",
+ "3 910 1965 0 98136 47.5208 -122.393 \n",
+ "4 0 1987 0 98074 47.6168 -122.045 \n",
+ "... ... ... ... ... ... ... \n",
+ "9995 0 1904 1958 98042 47.4206 -122.155 \n",
+ "9996 600 1962 0 98023 47.3209 -122.338 \n",
+ "9997 0 1919 0 98117 47.6824 -122.389 \n",
+ "9998 0 1950 0 98177 47.7212 -122.364 \n",
+ "9999 0 1987 0 98092 47.3025 -122.067 \n",
+ "\n",
+ " sqft_living15 sqft_lot15 \n",
+ "0 1340 5650 \n",
+ "1 1690 7639 \n",
+ "2 2720 8062 \n",
+ "3 1360 5000 \n",
+ "4 1800 7503 \n",
+ "... ... ... \n",
+ "9995 1930 12139 \n",
+ "9996 1700 9628 \n",
+ "9997 1320 4750 \n",
+ "9998 1880 7750 \n",
+ "9999 1820 35166 \n",
+ "\n",
+ "[10000 rows x 21 columns]"
+ ]
+ },
+ "execution_count": 112,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn import set_config\n",
+ "\n",
+ "df = pd.read_csv(\"data/house_data.csv\", sep=\",\", nrows=10000).dropna()  # dropna() returns a new frame, so keep its result\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Устраняем выбросы в колонке цены и добавляем колонку с категориями цены"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 113,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date | \n",
+ " price | \n",
+ " bedrooms | \n",
+ " bathrooms | \n",
+ " sqft_living | \n",
+ " sqft_lot | \n",
+ " floors | \n",
+ " waterfront | \n",
+ " view | \n",
+ " ... | \n",
+ " sqft_above | \n",
+ " sqft_basement | \n",
+ " yr_built | \n",
+ " yr_renovated | \n",
+ " zipcode | \n",
+ " lat | \n",
+ " long | \n",
+ " sqft_living15 | \n",
+ " sqft_lot15 | \n",
+ " price_category | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 9980 | \n",
+ " 6840700036 | \n",
+ " 20140728T000000 | \n",
+ " 497000.0 | \n",
+ " 2 | \n",
+ " 1.00 | \n",
+ " 770 | \n",
+ " 3325 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 770 | \n",
+ " 0 | \n",
+ " 1918 | \n",
+ " 0 | \n",
+ " 98122 | \n",
+ " 47.6102 | \n",
+ " -122.299 | \n",
+ " 960 | \n",
+ " 4800 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " 9981 | \n",
+ " 1824069083 | \n",
+ " 20150429T000000 | \n",
+ " 835000.0 | \n",
+ " 3 | \n",
+ " 1.00 | \n",
+ " 3060 | \n",
+ " 30166 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 3060 | \n",
+ " 0 | \n",
+ " 1959 | \n",
+ " 0 | \n",
+ " 98027 | \n",
+ " 47.5656 | \n",
+ " -122.093 | \n",
+ " 1880 | \n",
+ " 19602 | \n",
+ " high | \n",
+ "
\n",
+ " \n",
+ " 9982 | \n",
+ " 1836980240 | \n",
+ " 20141015T000000 | \n",
+ " 730000.0 | \n",
+ " 4 | \n",
+ " 2.75 | \n",
+ " 2920 | \n",
+ " 4500 | \n",
+ " 2.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 2920 | \n",
+ " 0 | \n",
+ " 1999 | \n",
+ " 0 | \n",
+ " 98006 | \n",
+ " 47.5646 | \n",
+ " -122.124 | \n",
+ " 2920 | \n",
+ " 4505 | \n",
+ " high | \n",
+ "
\n",
+ " \n",
+ " 9983 | \n",
+ " 3528900160 | \n",
+ " 20141001T000000 | \n",
+ " 655000.0 | \n",
+ " 3 | \n",
+ " 1.00 | \n",
+ " 1370 | \n",
+ " 5250 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1070 | \n",
+ " 300 | \n",
+ " 1939 | \n",
+ " 0 | \n",
+ " 98109 | \n",
+ " 47.6421 | \n",
+ " -122.348 | \n",
+ " 2410 | \n",
+ " 4200 | \n",
+ " high | \n",
+ "
\n",
+ " \n",
+ " 9984 | \n",
+ " 1442800060 | \n",
+ " 20141120T000000 | \n",
+ " 205000.0 | \n",
+ " 3 | \n",
+ " 2.50 | \n",
+ " 1870 | \n",
+ " 3118 | \n",
+ " 2.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1870 | \n",
+ " 0 | \n",
+ " 1993 | \n",
+ " 0 | \n",
+ " 98038 | \n",
+ " 47.3739 | \n",
+ " -122.056 | \n",
+ " 1580 | \n",
+ " 3601 | \n",
+ " low | \n",
+ "
\n",
+ " \n",
+ " 9985 | \n",
+ " 8722100030 | \n",
+ " 20150407T000000 | \n",
+ " 632750.0 | \n",
+ " 4 | \n",
+ " 2.00 | \n",
+ " 1800 | \n",
+ " 4800 | \n",
+ " 1.5 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1800 | \n",
+ " 0 | \n",
+ " 1918 | \n",
+ " 0 | \n",
+ " 98112 | \n",
+ " 47.6388 | \n",
+ " -122.302 | \n",
+ " 1950 | \n",
+ " 4800 | \n",
+ " high | \n",
+ "
\n",
+ " \n",
+ " 9986 | \n",
+ " 1723049624 | \n",
+ " 20140512T000000 | \n",
+ " 330000.0 | \n",
+ " 5 | \n",
+ " 3.00 | \n",
+ " 2100 | \n",
+ " 7715 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1250 | \n",
+ " 850 | \n",
+ " 2013 | \n",
+ " 0 | \n",
+ " 98168 | \n",
+ " 47.4866 | \n",
+ " -122.319 | \n",
+ " 2100 | \n",
+ " 7959 | \n",
+ " low | \n",
+ "
\n",
+ " \n",
+ " 9987 | \n",
+ " 4040400200 | \n",
+ " 20141007T000000 | \n",
+ " 527500.0 | \n",
+ " 5 | \n",
+ " 2.25 | \n",
+ " 2530 | \n",
+ " 8250 | \n",
+ " 2.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 2530 | \n",
+ " 0 | \n",
+ " 1961 | \n",
+ " 0 | \n",
+ " 98007 | \n",
+ " 47.6117 | \n",
+ " -122.134 | \n",
+ " 2020 | \n",
+ " 8250 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " 9988 | \n",
+ " 8691391090 | \n",
+ " 20140508T000000 | \n",
+ " 716500.0 | \n",
+ " 4 | \n",
+ " 2.50 | \n",
+ " 3290 | \n",
+ " 6465 | \n",
+ " 2.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 3290 | \n",
+ " 0 | \n",
+ " 2002 | \n",
+ " 0 | \n",
+ " 98075 | \n",
+ " 47.5981 | \n",
+ " -121.976 | \n",
+ " 3100 | \n",
+ " 5929 | \n",
+ " high | \n",
+ "
\n",
+ " \n",
+ " 9989 | \n",
+ " 7853302190 | \n",
+ " 20141217T000000 | \n",
+ " 388500.0 | \n",
+ " 4 | \n",
+ " 2.50 | \n",
+ " 1890 | \n",
+ " 5395 | \n",
+ " 2.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1890 | \n",
+ " 0 | \n",
+ " 2006 | \n",
+ " 0 | \n",
+ " 98065 | \n",
+ " 47.5415 | \n",
+ " -121.883 | \n",
+ " 2060 | \n",
+ " 5395 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " 9990 | \n",
+ " 3260000700 | \n",
+ " 20140904T000000 | \n",
+ " 530000.0 | \n",
+ " 3 | \n",
+ " 1.75 | \n",
+ " 1680 | \n",
+ " 7770 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1680 | \n",
+ " 0 | \n",
+ " 1967 | \n",
+ " 0 | \n",
+ " 98005 | \n",
+ " 47.6028 | \n",
+ " -122.167 | \n",
+ " 1880 | \n",
+ " 7770 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " 9991 | \n",
+ " 5126300510 | \n",
+ " 20150108T000000 | \n",
+ " 419000.0 | \n",
+ " 3 | \n",
+ " 2.50 | \n",
+ " 2170 | \n",
+ " 4517 | \n",
+ " 2.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 2170 | \n",
+ " 0 | \n",
+ " 2002 | \n",
+ " 0 | \n",
+ " 98059 | \n",
+ " 47.4819 | \n",
+ " -122.140 | \n",
+ " 2610 | \n",
+ " 4770 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " 9992 | \n",
+ " 7199330370 | \n",
+ " 20150309T000000 | \n",
+ " 385000.0 | \n",
+ " 3 | \n",
+ " 1.75 | \n",
+ " 1200 | \n",
+ " 7360 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1200 | \n",
+ " 0 | \n",
+ " 1978 | \n",
+ " 0 | \n",
+ " 98052 | \n",
+ " 47.6979 | \n",
+ " -122.130 | \n",
+ " 1200 | \n",
+ " 7500 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " 9993 | \n",
+ " 1854900240 | \n",
+ " 20140528T000000 | \n",
+ " 655000.0 | \n",
+ " 4 | \n",
+ " 2.50 | \n",
+ " 2990 | \n",
+ " 5669 | \n",
+ " 2.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 2990 | \n",
+ " 0 | \n",
+ " 2003 | \n",
+ " 0 | \n",
+ " 98074 | \n",
+ " 47.6119 | \n",
+ " -122.011 | \n",
+ " 3110 | \n",
+ " 5058 | \n",
+ " high | \n",
+ "
\n",
+ " \n",
+ " 9994 | \n",
+ " 6738700335 | \n",
+ " 20140701T000000 | \n",
+ " 1127312.5 | \n",
+ " 4 | \n",
+ " 2.75 | \n",
+ " 3770 | \n",
+ " 10900 | \n",
+ " 2.0 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " ... | \n",
+ " 3070 | \n",
+ " 700 | \n",
+ " 1924 | \n",
+ " 0 | \n",
+ " 98144 | \n",
+ " 47.5849 | \n",
+ " -122.290 | \n",
+ " 3000 | \n",
+ " 5000 | \n",
+ " very_high | \n",
+ "
\n",
+ " \n",
+ " 9995 | \n",
+ " 322059264 | \n",
+ " 20140926T000000 | \n",
+ " 279000.0 | \n",
+ " 2 | \n",
+ " 1.00 | \n",
+ " 1020 | \n",
+ " 47044 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1020 | \n",
+ " 0 | \n",
+ " 1904 | \n",
+ " 1958 | \n",
+ " 98042 | \n",
+ " 47.4206 | \n",
+ " -122.155 | \n",
+ " 1930 | \n",
+ " 12139 | \n",
+ " low | \n",
+ "
\n",
+ " \n",
+ " 9996 | \n",
+ " 5557500270 | \n",
+ " 20150209T000000 | \n",
+ " 262000.0 | \n",
+ " 3 | \n",
+ " 1.50 | \n",
+ " 1700 | \n",
+ " 9579 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1100 | \n",
+ " 600 | \n",
+ " 1962 | \n",
+ " 0 | \n",
+ " 98023 | \n",
+ " 47.3209 | \n",
+ " -122.338 | \n",
+ " 1700 | \n",
+ " 9628 | \n",
+ " low | \n",
+ "
\n",
+ " \n",
+ " 9997 | \n",
+ " 9164100125 | \n",
+ " 20140807T000000 | \n",
+ " 533000.0 | \n",
+ " 4 | \n",
+ " 1.00 | \n",
+ " 1550 | \n",
+ " 4750 | \n",
+ " 1.5 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1550 | \n",
+ " 0 | \n",
+ " 1919 | \n",
+ " 0 | \n",
+ " 98117 | \n",
+ " 47.6824 | \n",
+ " -122.389 | \n",
+ " 1320 | \n",
+ " 4750 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " 9998 | \n",
+ " 7370600045 | \n",
+ " 20150402T000000 | \n",
+ " 640000.0 | \n",
+ " 3 | \n",
+ " 1.75 | \n",
+ " 1680 | \n",
+ " 8100 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " ... | \n",
+ " 1680 | \n",
+ " 0 | \n",
+ " 1950 | \n",
+ " 0 | \n",
+ " 98177 | \n",
+ " 47.7212 | \n",
+ " -122.364 | \n",
+ " 1880 | \n",
+ " 7750 | \n",
+ " high | \n",
+ "
\n",
+ " \n",
+ " 9999 | \n",
+ " 8594400060 | \n",
+ " 20140609T000000 | \n",
+ " 285000.0 | \n",
+ " 3 | \n",
+ " 2.25 | \n",
+ " 1680 | \n",
+ " 35127 | \n",
+ " 2.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1680 | \n",
+ " 0 | \n",
+ " 1987 | \n",
+ " 0 | \n",
+ " 98092 | \n",
+ " 47.3025 | \n",
+ " -122.067 | \n",
+ " 1820 | \n",
+ " 35166 | \n",
+ " low | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
20 rows × 22 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date price bedrooms bathrooms \\\n",
+ "9980 6840700036 20140728T000000 497000.0 2 1.00 \n",
+ "9981 1824069083 20150429T000000 835000.0 3 1.00 \n",
+ "9982 1836980240 20141015T000000 730000.0 4 2.75 \n",
+ "9983 3528900160 20141001T000000 655000.0 3 1.00 \n",
+ "9984 1442800060 20141120T000000 205000.0 3 2.50 \n",
+ "9985 8722100030 20150407T000000 632750.0 4 2.00 \n",
+ "9986 1723049624 20140512T000000 330000.0 5 3.00 \n",
+ "9987 4040400200 20141007T000000 527500.0 5 2.25 \n",
+ "9988 8691391090 20140508T000000 716500.0 4 2.50 \n",
+ "9989 7853302190 20141217T000000 388500.0 4 2.50 \n",
+ "9990 3260000700 20140904T000000 530000.0 3 1.75 \n",
+ "9991 5126300510 20150108T000000 419000.0 3 2.50 \n",
+ "9992 7199330370 20150309T000000 385000.0 3 1.75 \n",
+ "9993 1854900240 20140528T000000 655000.0 4 2.50 \n",
+ "9994 6738700335 20140701T000000 1127312.5 4 2.75 \n",
+ "9995 322059264 20140926T000000 279000.0 2 1.00 \n",
+ "9996 5557500270 20150209T000000 262000.0 3 1.50 \n",
+ "9997 9164100125 20140807T000000 533000.0 4 1.00 \n",
+ "9998 7370600045 20150402T000000 640000.0 3 1.75 \n",
+ "9999 8594400060 20140609T000000 285000.0 3 2.25 \n",
+ "\n",
+ " sqft_living sqft_lot floors waterfront view ... sqft_above \\\n",
+ "9980 770 3325 1.0 0 0 ... 770 \n",
+ "9981 3060 30166 1.0 0 0 ... 3060 \n",
+ "9982 2920 4500 2.0 0 0 ... 2920 \n",
+ "9983 1370 5250 1.0 0 0 ... 1070 \n",
+ "9984 1870 3118 2.0 0 0 ... 1870 \n",
+ "9985 1800 4800 1.5 0 0 ... 1800 \n",
+ "9986 2100 7715 1.0 0 0 ... 1250 \n",
+ "9987 2530 8250 2.0 0 0 ... 2530 \n",
+ "9988 3290 6465 2.0 0 0 ... 3290 \n",
+ "9989 1890 5395 2.0 0 0 ... 1890 \n",
+ "9990 1680 7770 1.0 0 0 ... 1680 \n",
+ "9991 2170 4517 2.0 0 0 ... 2170 \n",
+ "9992 1200 7360 1.0 0 0 ... 1200 \n",
+ "9993 2990 5669 2.0 0 0 ... 2990 \n",
+ "9994 3770 10900 2.0 0 2 ... 3070 \n",
+ "9995 1020 47044 1.0 0 0 ... 1020 \n",
+ "9996 1700 9579 1.0 0 0 ... 1100 \n",
+ "9997 1550 4750 1.5 0 0 ... 1550 \n",
+ "9998 1680 8100 1.0 0 2 ... 1680 \n",
+ "9999 1680 35127 2.0 0 0 ... 1680 \n",
+ "\n",
+ " sqft_basement yr_built yr_renovated zipcode lat long \\\n",
+ "9980 0 1918 0 98122 47.6102 -122.299 \n",
+ "9981 0 1959 0 98027 47.5656 -122.093 \n",
+ "9982 0 1999 0 98006 47.5646 -122.124 \n",
+ "9983 300 1939 0 98109 47.6421 -122.348 \n",
+ "9984 0 1993 0 98038 47.3739 -122.056 \n",
+ "9985 0 1918 0 98112 47.6388 -122.302 \n",
+ "9986 850 2013 0 98168 47.4866 -122.319 \n",
+ "9987 0 1961 0 98007 47.6117 -122.134 \n",
+ "9988 0 2002 0 98075 47.5981 -121.976 \n",
+ "9989 0 2006 0 98065 47.5415 -121.883 \n",
+ "9990 0 1967 0 98005 47.6028 -122.167 \n",
+ "9991 0 2002 0 98059 47.4819 -122.140 \n",
+ "9992 0 1978 0 98052 47.6979 -122.130 \n",
+ "9993 0 2003 0 98074 47.6119 -122.011 \n",
+ "9994 700 1924 0 98144 47.5849 -122.290 \n",
+ "9995 0 1904 1958 98042 47.4206 -122.155 \n",
+ "9996 600 1962 0 98023 47.3209 -122.338 \n",
+ "9997 0 1919 0 98117 47.6824 -122.389 \n",
+ "9998 0 1950 0 98177 47.7212 -122.364 \n",
+ "9999 0 1987 0 98092 47.3025 -122.067 \n",
+ "\n",
+ " sqft_living15 sqft_lot15 price_category \n",
+ "9980 960 4800 middle \n",
+ "9981 1880 19602 high \n",
+ "9982 2920 4505 high \n",
+ "9983 2410 4200 high \n",
+ "9984 1580 3601 low \n",
+ "9985 1950 4800 high \n",
+ "9986 2100 7959 low \n",
+ "9987 2020 8250 middle \n",
+ "9988 3100 5929 high \n",
+ "9989 2060 5395 middle \n",
+ "9990 1880 7770 middle \n",
+ "9991 2610 4770 middle \n",
+ "9992 1200 7500 middle \n",
+ "9993 3110 5058 high \n",
+ "9994 3000 5000 very_high \n",
+ "9995 1930 12139 low \n",
+ "9996 1700 9628 low \n",
+ "9997 1320 4750 middle \n",
+ "9998 1880 7750 high \n",
+ "9999 1820 35166 low \n",
+ "\n",
+ "[20 rows x 22 columns]"
+ ]
+ },
+ "execution_count": 113,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "q1 = df['price'].quantile(0.25)  # first quartile (Q1)\n",
+ "q3 = df['price'].quantile(0.75)  # third quartile (Q3)\n",
+ "iqr = q3 - q1  # interquartile range (IQR)\n",
+ "\n",
+ "# Outlier fences from the 1.5 * IQR rule\n",
+ "lower_bound = q1 - 1.5 * iqr  # lower fence\n",
+ "upper_bound = q3 + 1.5 * iqr  # upper fence\n",
+ "\n",
+ "# Cap outliers: prices below the lower fence become the lower fence, prices above the upper fence become the upper fence (vectorized clip instead of a per-row lambda)\n",
+ "df['price'] = df['price'].clip(lower=lower_bound, upper=upper_bound)\n",
+ "\n",
+ "# Add a price-category column; bin edges are hard-coded - NOTE(review): prices outside [75000, 1130750] get NaN, confirm the edges still cover the capped range if the data changes\n",
+ "df['price_category'] = pd.cut(df['price'], bins=[75000,338750,602750,866750,1130750], labels=['low','middle','high','very_high'], include_lowest=True)\n",
+ "df.tail(20)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Бизнес-цели\n",
+ "1. Прогноз класса цены недвижимости (Классификация)\n",
+ "2. Оценка состояния недвижимости (Регрессия)\n",
+ "\n",
+ "### Определение достижимого уровня качества модели для первой задачи\n",
+ "#### Разделение набора данных на обучающую и тестовую выборки (80/20) для задачи классификации (Целевой признак — price_category)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 114,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'X_train'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date | \n",
+ " price | \n",
+ " bedrooms | \n",
+ " bathrooms | \n",
+ " sqft_living | \n",
+ " sqft_lot | \n",
+ " floors | \n",
+ " waterfront | \n",
+ " view | \n",
+ " ... | \n",
+ " sqft_above | \n",
+ " sqft_basement | \n",
+ " yr_built | \n",
+ " yr_renovated | \n",
+ " zipcode | \n",
+ " lat | \n",
+ " long | \n",
+ " sqft_living15 | \n",
+ " sqft_lot15 | \n",
+ " price_category | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 9843 | \n",
+ " 3260000340 | \n",
+ " 20140622T000000 | \n",
+ " 732600.0 | \n",
+ " 4 | \n",
+ " 2.50 | \n",
+ " 2130 | \n",
+ " 7300 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1230 | \n",
+ " 900 | \n",
+ " 1963 | \n",
+ " 0 | \n",
+ " 98005 | \n",
+ " 47.6050 | \n",
+ " -122.167 | \n",
+ " 2130 | \n",
+ " 7560 | \n",
+ " high | \n",
+ "
\n",
+ " \n",
+ " 9623 | \n",
+ " 9828702055 | \n",
+ " 20140508T000000 | \n",
+ " 358000.0 | \n",
+ " 2 | \n",
+ " 1.50 | \n",
+ " 960 | \n",
+ " 1808 | \n",
+ " 2.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 960 | \n",
+ " 0 | \n",
+ " 1993 | \n",
+ " 0 | \n",
+ " 98122 | \n",
+ " 47.6183 | \n",
+ " -122.298 | \n",
+ " 1290 | \n",
+ " 1668 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " 3095 | \n",
+ " 3438500625 | \n",
+ " 20140519T000000 | \n",
+ " 210000.0 | \n",
+ " 3 | \n",
+ " 1.00 | \n",
+ " 1080 | \n",
+ " 21043 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1080 | \n",
+ " 0 | \n",
+ " 1942 | \n",
+ " 0 | \n",
+ " 98106 | \n",
+ " 47.5515 | \n",
+ " -122.357 | \n",
+ " 1380 | \n",
+ " 7620 | \n",
+ " low | \n",
+ "
\n",
+ " \n",
+ " 411 | \n",
+ " 2422029094 | \n",
+ " 20140716T000000 | \n",
+ " 517534.0 | \n",
+ " 2 | \n",
+ " 1.00 | \n",
+ " 833 | \n",
+ " 143947 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 833 | \n",
+ " 0 | \n",
+ " 2006 | \n",
+ " 0 | \n",
+ " 98070 | \n",
+ " 47.3889 | \n",
+ " -122.482 | \n",
+ " 1380 | \n",
+ " 143947 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " 3060 | \n",
+ " 7462900015 | \n",
+ " 20150108T000000 | \n",
+ " 387000.0 | \n",
+ " 3 | \n",
+ " 2.25 | \n",
+ " 1760 | \n",
+ " 45133 | \n",
+ " 2.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1760 | \n",
+ " 0 | \n",
+ " 1984 | \n",
+ " 0 | \n",
+ " 98065 | \n",
+ " 47.5124 | \n",
+ " -121.866 | \n",
+ " 1910 | \n",
+ " 51773 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 1750 | \n",
+ " 2787720140 | \n",
+ " 20150407T000000 | \n",
+ " 416000.0 | \n",
+ " 3 | \n",
+ " 2.50 | \n",
+ " 1790 | \n",
+ " 11542 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1190 | \n",
+ " 600 | \n",
+ " 1969 | \n",
+ " 0 | \n",
+ " 98059 | \n",
+ " 47.5124 | \n",
+ " -122.160 | \n",
+ " 1790 | \n",
+ " 9131 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " 2354 | \n",
+ " 6192400400 | \n",
+ " 20140728T000000 | \n",
+ " 775000.0 | \n",
+ " 4 | \n",
+ " 2.50 | \n",
+ " 3090 | \n",
+ " 7112 | \n",
+ " 2.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 3090 | \n",
+ " 0 | \n",
+ " 2001 | \n",
+ " 0 | \n",
+ " 98052 | \n",
+ " 47.7050 | \n",
+ " -122.118 | \n",
+ " 3050 | \n",
+ " 6000 | \n",
+ " high | \n",
+ "
\n",
+ " \n",
+ " 857 | \n",
+ " 2296500036 | \n",
+ " 20150310T000000 | \n",
+ " 450000.0 | \n",
+ " 4 | \n",
+ " 2.75 | \n",
+ " 2980 | \n",
+ " 13260 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1800 | \n",
+ " 1180 | \n",
+ " 1979 | \n",
+ " 0 | \n",
+ " 98056 | \n",
+ " 47.5152 | \n",
+ " -122.197 | \n",
+ " 1920 | \n",
+ " 10731 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " 6181 | \n",
+ " 2787310130 | \n",
+ " 20141212T000000 | \n",
+ " 289950.0 | \n",
+ " 4 | \n",
+ " 1.75 | \n",
+ " 2090 | \n",
+ " 7416 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1050 | \n",
+ " 1040 | \n",
+ " 1970 | \n",
+ " 0 | \n",
+ " 98031 | \n",
+ " 47.4107 | \n",
+ " -122.179 | \n",
+ " 1710 | \n",
+ " 7527 | \n",
+ " low | \n",
+ "
\n",
+ " \n",
+ " 3141 | \n",
+ " 8567300110 | \n",
+ " 20140604T000000 | \n",
+ " 485000.0 | \n",
+ " 3 | \n",
+ " 2.50 | \n",
+ " 2340 | \n",
+ " 59058 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 2340 | \n",
+ " 0 | \n",
+ " 1985 | \n",
+ " 0 | \n",
+ " 98038 | \n",
+ " 47.4052 | \n",
+ " -122.028 | \n",
+ " 2700 | \n",
+ " 37263 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
8000 rows × 22 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date price bedrooms bathrooms sqft_living \\\n",
+ "9843 3260000340 20140622T000000 732600.0 4 2.50 2130 \n",
+ "9623 9828702055 20140508T000000 358000.0 2 1.50 960 \n",
+ "3095 3438500625 20140519T000000 210000.0 3 1.00 1080 \n",
+ "411 2422029094 20140716T000000 517534.0 2 1.00 833 \n",
+ "3060 7462900015 20150108T000000 387000.0 3 2.25 1760 \n",
+ "... ... ... ... ... ... ... \n",
+ "1750 2787720140 20150407T000000 416000.0 3 2.50 1790 \n",
+ "2354 6192400400 20140728T000000 775000.0 4 2.50 3090 \n",
+ "857 2296500036 20150310T000000 450000.0 4 2.75 2980 \n",
+ "6181 2787310130 20141212T000000 289950.0 4 1.75 2090 \n",
+ "3141 8567300110 20140604T000000 485000.0 3 2.50 2340 \n",
+ "\n",
+ " sqft_lot floors waterfront view ... sqft_above sqft_basement \\\n",
+ "9843 7300 1.0 0 0 ... 1230 900 \n",
+ "9623 1808 2.0 0 0 ... 960 0 \n",
+ "3095 21043 1.0 0 0 ... 1080 0 \n",
+ "411 143947 1.0 0 0 ... 833 0 \n",
+ "3060 45133 2.0 0 0 ... 1760 0 \n",
+ "... ... ... ... ... ... ... ... \n",
+ "1750 11542 1.0 0 0 ... 1190 600 \n",
+ "2354 7112 2.0 0 0 ... 3090 0 \n",
+ "857 13260 1.0 0 0 ... 1800 1180 \n",
+ "6181 7416 1.0 0 0 ... 1050 1040 \n",
+ "3141 59058 1.0 0 0 ... 2340 0 \n",
+ "\n",
+ " yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
+ "9843 1963 0 98005 47.6050 -122.167 2130 \n",
+ "9623 1993 0 98122 47.6183 -122.298 1290 \n",
+ "3095 1942 0 98106 47.5515 -122.357 1380 \n",
+ "411 2006 0 98070 47.3889 -122.482 1380 \n",
+ "3060 1984 0 98065 47.5124 -121.866 1910 \n",
+ "... ... ... ... ... ... ... \n",
+ "1750 1969 0 98059 47.5124 -122.160 1790 \n",
+ "2354 2001 0 98052 47.7050 -122.118 3050 \n",
+ "857 1979 0 98056 47.5152 -122.197 1920 \n",
+ "6181 1970 0 98031 47.4107 -122.179 1710 \n",
+ "3141 1985 0 98038 47.4052 -122.028 2700 \n",
+ "\n",
+ " sqft_lot15 price_category \n",
+ "9843 7560 high \n",
+ "9623 1668 middle \n",
+ "3095 7620 low \n",
+ "411 143947 middle \n",
+ "3060 51773 middle \n",
+ "... ... ... \n",
+ "1750 9131 middle \n",
+ "2354 6000 high \n",
+ "857 10731 middle \n",
+ "6181 7527 low \n",
+ "3141 37263 middle \n",
+ "\n",
+ "[8000 rows x 22 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'y_train'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " price_category | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 9843 | \n",
+ " high | \n",
+ "
\n",
+ " \n",
+ " 9623 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " 3095 | \n",
+ " low | \n",
+ "
\n",
+ " \n",
+ " 411 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " 3060 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 1750 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " 2354 | \n",
+ " high | \n",
+ "
\n",
+ " \n",
+ " 857 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " 6181 | \n",
+ " low | \n",
+ "
\n",
+ " \n",
+ " 3141 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
8000 rows × 1 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " price_category\n",
+ "9843 high\n",
+ "9623 middle\n",
+ "3095 low\n",
+ "411 middle\n",
+ "3060 middle\n",
+ "... ...\n",
+ "1750 middle\n",
+ "2354 high\n",
+ "857 middle\n",
+ "6181 low\n",
+ "3141 middle\n",
+ "\n",
+ "[8000 rows x 1 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'X_test'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date | \n",
+ " price | \n",
+ " bedrooms | \n",
+ " bathrooms | \n",
+ " sqft_living | \n",
+ " sqft_lot | \n",
+ " floors | \n",
+ " waterfront | \n",
+ " view | \n",
+ " ... | \n",
+ " sqft_above | \n",
+ " sqft_basement | \n",
+ " yr_built | \n",
+ " yr_renovated | \n",
+ " zipcode | \n",
+ " lat | \n",
+ " long | \n",
+ " sqft_living15 | \n",
+ " sqft_lot15 | \n",
+ " price_category | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 5341 | \n",
+ " 6632900574 | \n",
+ " 20150225T000000 | \n",
+ " 595000.0 | \n",
+ " 5 | \n",
+ " 3.00 | \n",
+ " 2980 | \n",
+ " 10064 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1680 | \n",
+ " 1300 | \n",
+ " 1940 | \n",
+ " 0 | \n",
+ " 98155 | \n",
+ " 47.7372 | \n",
+ " -122.316 | \n",
+ " 1590 | \n",
+ " 7800 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " 4384 | \n",
+ " 2423029245 | \n",
+ " 20140617T000000 | \n",
+ " 550000.0 | \n",
+ " 3 | \n",
+ " 1.75 | \n",
+ " 2240 | \n",
+ " 78225 | \n",
+ " 2.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 2240 | \n",
+ " 0 | \n",
+ " 1976 | \n",
+ " 0 | \n",
+ " 98070 | \n",
+ " 47.4638 | \n",
+ " -122.484 | \n",
+ " 2030 | \n",
+ " 202554 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " 5795 | \n",
+ " 2473370050 | \n",
+ " 20140604T000000 | \n",
+ " 327500.0 | \n",
+ " 4 | \n",
+ " 1.75 | \n",
+ " 1650 | \n",
+ " 7800 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1650 | \n",
+ " 0 | \n",
+ " 1968 | \n",
+ " 0 | \n",
+ " 98058 | \n",
+ " 47.4507 | \n",
+ " -122.139 | \n",
+ " 1750 | \n",
+ " 10400 | \n",
+ " low | \n",
+ "
\n",
+ " \n",
+ " 4956 | \n",
+ " 9528104985 | \n",
+ " 20141104T000000 | \n",
+ " 611000.0 | \n",
+ " 2 | \n",
+ " 1.00 | \n",
+ " 1270 | \n",
+ " 5100 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1100 | \n",
+ " 170 | \n",
+ " 1900 | \n",
+ " 0 | \n",
+ " 98115 | \n",
+ " 47.6771 | \n",
+ " -122.328 | \n",
+ " 1670 | \n",
+ " 3900 | \n",
+ " high | \n",
+ "
\n",
+ " \n",
+ " 7723 | \n",
+ " 3972900025 | \n",
+ " 20150313T000000 | \n",
+ " 499000.0 | \n",
+ " 6 | \n",
+ " 1.75 | \n",
+ " 2400 | \n",
+ " 7500 | \n",
+ " 1.5 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1400 | \n",
+ " 1000 | \n",
+ " 1975 | \n",
+ " 0 | \n",
+ " 98155 | \n",
+ " 47.7661 | \n",
+ " -122.313 | \n",
+ " 1980 | \n",
+ " 7500 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 8517 | \n",
+ " 3876600120 | \n",
+ " 20150422T000000 | \n",
+ " 265000.0 | \n",
+ " 3 | \n",
+ " 1.50 | \n",
+ " 1780 | \n",
+ " 10196 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1270 | \n",
+ " 510 | \n",
+ " 1967 | \n",
+ " 0 | \n",
+ " 98001 | \n",
+ " 47.3375 | \n",
+ " -122.291 | \n",
+ " 1320 | \n",
+ " 7875 | \n",
+ " low | \n",
+ "
\n",
+ " \n",
+ " 6914 | \n",
+ " 6821600005 | \n",
+ " 20150403T000000 | \n",
+ " 710000.0 | \n",
+ " 4 | \n",
+ " 1.75 | \n",
+ " 2120 | \n",
+ " 5400 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1060 | \n",
+ " 1060 | \n",
+ " 1941 | \n",
+ " 0 | \n",
+ " 98199 | \n",
+ " 47.6501 | \n",
+ " -122.395 | \n",
+ " 2052 | \n",
+ " 6000 | \n",
+ " high | \n",
+ "
\n",
+ " \n",
+ " 4499 | \n",
+ " 2767603931 | \n",
+ " 20140818T000000 | \n",
+ " 469000.0 | \n",
+ " 3 | \n",
+ " 3.25 | \n",
+ " 1370 | \n",
+ " 1194 | \n",
+ " 3.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1370 | \n",
+ " 0 | \n",
+ " 2004 | \n",
+ " 0 | \n",
+ " 98107 | \n",
+ " 47.6718 | \n",
+ " -122.388 | \n",
+ " 1800 | \n",
+ " 2678 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " 8651 | \n",
+ " 8802400411 | \n",
+ " 20140619T000000 | \n",
+ " 249000.0 | \n",
+ " 3 | \n",
+ " 1.00 | \n",
+ " 1050 | \n",
+ " 8498 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1050 | \n",
+ " 0 | \n",
+ " 1959 | \n",
+ " 0 | \n",
+ " 98031 | \n",
+ " 47.4043 | \n",
+ " -122.202 | \n",
+ " 1050 | \n",
+ " 8498 | \n",
+ " low | \n",
+ "
\n",
+ " \n",
+ " 4234 | \n",
+ " 5452800735 | \n",
+ " 20140722T000000 | \n",
+ " 780000.0 | \n",
+ " 4 | \n",
+ " 2.50 | \n",
+ " 2270 | \n",
+ " 13449 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1310 | \n",
+ " 960 | \n",
+ " 1975 | \n",
+ " 0 | \n",
+ " 98040 | \n",
+ " 47.5416 | \n",
+ " -122.232 | \n",
+ " 2810 | \n",
+ " 13475 | \n",
+ " high | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2000 rows × 22 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date price bedrooms bathrooms sqft_living \\\n",
+ "5341 6632900574 20150225T000000 595000.0 5 3.00 2980 \n",
+ "4384 2423029245 20140617T000000 550000.0 3 1.75 2240 \n",
+ "5795 2473370050 20140604T000000 327500.0 4 1.75 1650 \n",
+ "4956 9528104985 20141104T000000 611000.0 2 1.00 1270 \n",
+ "7723 3972900025 20150313T000000 499000.0 6 1.75 2400 \n",
+ "... ... ... ... ... ... ... \n",
+ "8517 3876600120 20150422T000000 265000.0 3 1.50 1780 \n",
+ "6914 6821600005 20150403T000000 710000.0 4 1.75 2120 \n",
+ "4499 2767603931 20140818T000000 469000.0 3 3.25 1370 \n",
+ "8651 8802400411 20140619T000000 249000.0 3 1.00 1050 \n",
+ "4234 5452800735 20140722T000000 780000.0 4 2.50 2270 \n",
+ "\n",
+ " sqft_lot floors waterfront view ... sqft_above sqft_basement \\\n",
+ "5341 10064 1.0 0 0 ... 1680 1300 \n",
+ "4384 78225 2.0 0 0 ... 2240 0 \n",
+ "5795 7800 1.0 0 0 ... 1650 0 \n",
+ "4956 5100 1.0 0 0 ... 1100 170 \n",
+ "7723 7500 1.5 0 0 ... 1400 1000 \n",
+ "... ... ... ... ... ... ... ... \n",
+ "8517 10196 1.0 0 0 ... 1270 510 \n",
+ "6914 5400 1.0 0 0 ... 1060 1060 \n",
+ "4499 1194 3.0 0 0 ... 1370 0 \n",
+ "8651 8498 1.0 0 0 ... 1050 0 \n",
+ "4234 13449 1.0 0 0 ... 1310 960 \n",
+ "\n",
+ " yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
+ "5341 1940 0 98155 47.7372 -122.316 1590 \n",
+ "4384 1976 0 98070 47.4638 -122.484 2030 \n",
+ "5795 1968 0 98058 47.4507 -122.139 1750 \n",
+ "4956 1900 0 98115 47.6771 -122.328 1670 \n",
+ "7723 1975 0 98155 47.7661 -122.313 1980 \n",
+ "... ... ... ... ... ... ... \n",
+ "8517 1967 0 98001 47.3375 -122.291 1320 \n",
+ "6914 1941 0 98199 47.6501 -122.395 2052 \n",
+ "4499 2004 0 98107 47.6718 -122.388 1800 \n",
+ "8651 1959 0 98031 47.4043 -122.202 1050 \n",
+ "4234 1975 0 98040 47.5416 -122.232 2810 \n",
+ "\n",
+ " sqft_lot15 price_category \n",
+ "5341 7800 middle \n",
+ "4384 202554 middle \n",
+ "5795 10400 low \n",
+ "4956 3900 high \n",
+ "7723 7500 middle \n",
+ "... ... ... \n",
+ "8517 7875 low \n",
+ "6914 6000 high \n",
+ "4499 2678 middle \n",
+ "8651 8498 low \n",
+ "4234 13475 high \n",
+ "\n",
+ "[2000 rows x 22 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'y_test'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " price_category | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 5341 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " 4384 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " 5795 | \n",
+ " low | \n",
+ "
\n",
+ " \n",
+ " 4956 | \n",
+ " high | \n",
+ "
\n",
+ " \n",
+ " 7723 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 8517 | \n",
+ " low | \n",
+ "
\n",
+ " \n",
+ " 6914 | \n",
+ " high | \n",
+ "
\n",
+ " \n",
+ " 4499 | \n",
+ " middle | \n",
+ "
\n",
+ " \n",
+ " 8651 | \n",
+ " low | \n",
+ "
\n",
+ " \n",
+ " 4234 | \n",
+ " high | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2000 rows × 1 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " price_category\n",
+ "5341 middle\n",
+ "4384 middle\n",
+ "5795 low\n",
+ "4956 high\n",
+ "7723 middle\n",
+ "... ...\n",
+ "8517 low\n",
+ "6914 high\n",
+ "4499 middle\n",
+ "8651 low\n",
+ "4234 high\n",
+ "\n",
+ "[2000 rows x 1 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from typing import Tuple\n",
+ "import pandas as pd\n",
+ "from pandas import DataFrame\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "def split_stratified_into_train_val_test(\n",
+ "    df_input,\n",
+ "    stratify_colname=\"y\",\n",
+ "    frac_train=0.6,\n",
+ "    frac_val=0.15,\n",
+ "    frac_test=0.25,\n",
+ "    random_state=None,\n",
+ ") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
+ "    \"\"\"Split a dataframe into stratified train / validation / test parts.\n",
+ "\n",
+ "    Args:\n",
+ "        df_input: Dataframe to split; it must contain ``stratify_colname``.\n",
+ "        stratify_colname: Column whose class proportions are preserved in every part.\n",
+ "        frac_train: Fraction of rows that goes to the training part.\n",
+ "        frac_val: Fraction of rows that goes to the validation part; a value <= 0\n",
+ "            skips the validation split entirely.\n",
+ "        frac_test: Fraction of rows that goes to the test part.\n",
+ "        random_state: Seed forwarded to both ``train_test_split`` calls.\n",
+ "\n",
+ "    Returns:\n",
+ "        ``(df_train, df_val, df_test, y_train, y_val, y_test)``. The ``df_*`` frames\n",
+ "        keep every column of ``df_input`` (the stratification column included), the\n",
+ "        ``y_*`` frames hold only the stratification column. When ``frac_val <= 0``\n",
+ "        both validation frames are empty dataframes.\n",
+ "\n",
+ "    Raises:\n",
+ "        ValueError: If the fractions do not sum to 1.0 or the column is missing.\n",
+ "    \"\"\"\n",
+ "    import math  # local import keeps this definition self-contained\n",
+ "\n",
+ "    # Compare with a tolerance: a sum such as 0.7 + 0.2 + 0.1 is 0.9999999999999999\n",
+ "    # in binary floating point and would be rejected by an exact comparison.\n",
+ "    if not math.isclose(frac_train + frac_val + frac_test, 1.0, rel_tol=1e-9):\n",
+ "        raise ValueError(\n",
+ "            \"fractions %f, %f, %f do not add up to 1.0\"\n",
+ "            % (frac_train, frac_val, frac_test)\n",
+ "        )\n",
+ "    if stratify_colname not in df_input.columns:\n",
+ "        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
+ "\n",
+ "    X = df_input  # Contains all columns.\n",
+ "    y = df_input[[stratify_colname]]  # Dataframe of just the column on which to stratify.\n",
+ "\n",
+ "    # First cut: training part versus everything else.\n",
+ "    df_train, df_temp, y_train, y_temp = train_test_split(\n",
+ "        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
+ "    )\n",
+ "    if frac_val <= 0:\n",
+ "        # No validation part requested: the remainder is the test part as a whole.\n",
+ "        assert len(df_input) == len(df_train) + len(df_temp)\n",
+ "        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
+ "\n",
+ "    # Second cut: divide the remainder into validation and test parts. The test share\n",
+ "    # is expressed relative to the remainder, not to the full dataframe.\n",
+ "    relative_frac_test = frac_test / (frac_val + frac_test)\n",
+ "    df_val, df_test, y_val, y_test = train_test_split(\n",
+ "        df_temp,\n",
+ "        y_temp,\n",
+ "        stratify=y_temp,\n",
+ "        test_size=relative_frac_test,\n",
+ "        random_state=random_state,\n",
+ "    )\n",
+ "    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
+ "    return df_train, df_val, df_test, y_train, y_val, y_test\n",
+ "\n",
+ "# 80/20 stratified hold-out split; no validation part is requested, so X_val and\n",
+ "# y_val come back as empty dataframes.\n",
+ "X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
+ "    df,\n",
+ "    stratify_colname=\"price_category\",\n",
+ "    frac_train=0.80,\n",
+ "    frac_val=0,\n",
+ "    frac_test=0.20,\n",
+ "    random_state=42,\n",
+ ")\n",
+ "\n",
+ "# Show every part right after its label, in the same order as before.\n",
+ "for label, part in (\n",
+ "    (\"X_train\", X_train),\n",
+ "    (\"y_train\", y_train),\n",
+ "    (\"X_test\", X_test),\n",
+ "    (\"y_test\", y_test),\n",
+ "):\n",
+ "    display(label, part)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Формирование конвейера\n",
+ "preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n",
+ "\n",
+ "preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование\n",
+ "\n",
+ "features_preprocessing -- трансформер для предобработки признаков\n",
+ "\n",
+ "features_engineering -- трансформер для конструирования признаков\n",
+ "\n",
+ "drop_columns -- трансформер для удаления колонок\n",
+ "\n",
+ "pipeline_end -- основной конвейер предобработки данных и конструирования признаков"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "ValueError",
+ "evalue": "Shape of passed values is (8000, 21), indices imply (8000, 19)",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[184], line 123\u001b[0m\n\u001b[0;32m 121\u001b[0m preprocessing_result \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(preprocessing_result, columns\u001b[38;5;241m=\u001b[39mnum_columns \u001b[38;5;241m+\u001b[39m cat_columns \u001b[38;5;241m+\u001b[39m cols)\n\u001b[0;32m 122\u001b[0m preprocessing_result \u001b[38;5;241m=\u001b[39m features_engineering\u001b[38;5;241m.\u001b[39mfit_transform(preprocessing_result)\n\u001b[1;32m--> 123\u001b[0m preprocessing_result \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mDataFrame\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpreprocessing_result\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnum_columns\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mcat_columns\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 125\u001b[0m \u001b[38;5;66;03m# preprocessing_result = features_postprocessing.fit_transform(preprocessing_result)\u001b[39;00m\n\u001b[0;32m 126\u001b[0m \n\u001b[0;32m 127\u001b[0m \u001b[38;5;66;03m# preprocessing_result = pipeline_end.fit_transform(X_train)\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 131\u001b[0m \u001b[38;5;66;03m# )\u001b[39;00m\n\u001b[0;32m 132\u001b[0m \u001b[38;5;66;03m# preprocessed_df\u001b[39;00m\n",
+ "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\pandas\\core\\frame.py:827\u001b[0m, in \u001b[0;36mDataFrame.__init__\u001b[1;34m(self, data, index, columns, dtype, copy)\u001b[0m\n\u001b[0;32m 816\u001b[0m mgr \u001b[38;5;241m=\u001b[39m dict_to_mgr(\n\u001b[0;32m 817\u001b[0m \u001b[38;5;66;03m# error: Item \"ndarray\" of \"Union[ndarray, Series, Index]\" has no\u001b[39;00m\n\u001b[0;32m 818\u001b[0m \u001b[38;5;66;03m# attribute \"name\"\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 824\u001b[0m copy\u001b[38;5;241m=\u001b[39m_copy,\n\u001b[0;32m 825\u001b[0m )\n\u001b[0;32m 826\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 827\u001b[0m mgr \u001b[38;5;241m=\u001b[39m \u001b[43mndarray_to_mgr\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 828\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 829\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 830\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 831\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 832\u001b[0m \u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 833\u001b[0m \u001b[43m \u001b[49m\u001b[43mtyp\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmanager\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 834\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 836\u001b[0m \u001b[38;5;66;03m# For data is list-like, or Iterable (will consume into list)\u001b[39;00m\n\u001b[0;32m 837\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m is_list_like(data):\n",
+ "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\pandas\\core\\internals\\construction.py:336\u001b[0m, in \u001b[0;36mndarray_to_mgr\u001b[1;34m(values, index, columns, dtype, copy, typ)\u001b[0m\n\u001b[0;32m 331\u001b[0m \u001b[38;5;66;03m# _prep_ndarraylike ensures that values.ndim == 2 at this point\u001b[39;00m\n\u001b[0;32m 332\u001b[0m index, columns \u001b[38;5;241m=\u001b[39m _get_axes(\n\u001b[0;32m 333\u001b[0m values\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m], values\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m], index\u001b[38;5;241m=\u001b[39mindex, columns\u001b[38;5;241m=\u001b[39mcolumns\n\u001b[0;32m 334\u001b[0m )\n\u001b[1;32m--> 336\u001b[0m \u001b[43m_check_values_indices_shape_match\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 338\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m typ \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124marray\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m 339\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(values\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, \u001b[38;5;28mstr\u001b[39m):\n",
+ "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\pandas\\core\\internals\\construction.py:420\u001b[0m, in \u001b[0;36m_check_values_indices_shape_match\u001b[1;34m(values, index, columns)\u001b[0m\n\u001b[0;32m 418\u001b[0m passed \u001b[38;5;241m=\u001b[39m values\u001b[38;5;241m.\u001b[39mshape\n\u001b[0;32m 419\u001b[0m implied \u001b[38;5;241m=\u001b[39m (\u001b[38;5;28mlen\u001b[39m(index), \u001b[38;5;28mlen\u001b[39m(columns))\n\u001b[1;32m--> 420\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mShape of passed values is \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpassed\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, indices imply \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mimplied\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
+ "\u001b[1;31mValueError\u001b[0m: Shape of passed values is (8000, 21), indices imply (8000, 19)"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "from sklearn.base import BaseEstimator, TransformerMixin\n",
+ "from sklearn.compose import ColumnTransformer\n",
+ "from sklearn.discriminant_analysis import StandardScaler\n",
+ "from sklearn.impute import SimpleImputer\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.preprocessing import OneHotEncoder\n",
+ "from sklearn.ensemble import RandomForestRegressor # Пример регрессионной модели\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.pipeline import make_pipeline\n",
+ "\n",
+ "class HousesFeatures(BaseEstimator, TransformerMixin):\n",
+ "    \"\"\"Transformer that replaces the ``price_category`` labels with ordinal codes.\n",
+ "\n",
+ "    ``low`` / ``middle`` / ``high`` / ``very_high`` become 1 / 2 / 3 / 4, a missing\n",
+ "    value becomes the string ``\"unknown\"`` and any other label becomes ``None``.\n",
+ "    The column is rewritten in place of the original one; no column is added.\n",
+ "    \"\"\"\n",
+ "\n",
+ "    # Ordinal code of every known price category label.\n",
+ "    PRICE_TYPE_CODES = {\"low\": 1, \"middle\": 2, \"high\": 3, \"very_high\": 4}\n",
+ "\n",
+ "    def __init__(self):\n",
+ "        pass\n",
+ "\n",
+ "    def fit(self, X, y=None):\n",
+ "        \"\"\"Learn nothing from the data and return ``self``.\"\"\"\n",
+ "        return self\n",
+ "\n",
+ "    def transform(self, X, y=None):\n",
+ "        \"\"\"Return a copy of ``X`` whose ``price_category`` column holds ordinal codes.\n",
+ "\n",
+ "        Args:\n",
+ "            X: Dataframe that contains a ``price_category`` column.\n",
+ "            y: Ignored.\n",
+ "\n",
+ "        Returns:\n",
+ "            A new dataframe with the same columns; the input frame is left untouched.\n",
+ "        \"\"\"\n",
+ "\n",
+ "        def get_price_type(category):\n",
+ "            # Missing labels keep the \"unknown\" marker; unrecognised labels map to None.\n",
+ "            if pd.isna(category):\n",
+ "                return \"unknown\"\n",
+ "            return self.PRICE_TYPE_CODES.get(category)\n",
+ "\n",
+ "        # Work on a copy so that the caller's dataframe (for example X_train) is never\n",
+ "        # modified as a side effect of calling transform.\n",
+ "        X = X.copy()\n",
+ "        X[\"price_category\"] = [get_price_type(category) for category in X[\"price_category\"]]\n",
+ "        return X\n",
+ "\n",
+ "    def get_feature_names_out(self, features_in):\n",
+ "        \"\"\"Return the output column names, which are exactly the input names.\n",
+ "\n",
+ "        ``transform`` rewrites ``price_category`` without adding a column, so the\n",
+ "        number of names must stay equal to the number of input features.\n",
+ "        \"\"\"\n",
+ "        return np.asarray(features_in, dtype=object)\n",
+ "\n",
+ "# Columns that are removed from the data and therefore never preprocessed.\n",
+ "columns_to_drop = [\"date\", \"view\", \"waterfront\"]\n",
+ "\n",
+ "# Numeric columns: every kept column that is neither object nor category typed.\n",
+ "num_columns = [\n",
+ "    column\n",
+ "    for column in df.columns\n",
+ "    if column not in columns_to_drop\n",
+ "    and df[column].dtype != \"object\"\n",
+ "    and df[column].dtype != \"category\"\n",
+ "]\n",
+ "\n",
+ "# Categorical columns: every kept column that is object or category typed.\n",
+ "# The parentheses matter: without them `and` binds tighter than `or`, so a\n",
+ "# category-typed column would be selected even when it is listed in columns_to_drop.\n",
+ "cat_columns = [\n",
+ "    column\n",
+ "    for column in df.columns\n",
+ "    if column not in columns_to_drop\n",
+ "    and (df[column].dtype == \"object\" or df[column].dtype == \"category\")\n",
+ "]\n",
+ "\n",
+ "# Numeric features: fill gaps with the column median, then standardise.\n",
+ "num_imputer = SimpleImputer(strategy=\"median\")\n",
+ "num_scaler = StandardScaler()\n",
+ "preprocessing_num = Pipeline(steps=[(\"imputer\", num_imputer), (\"scaler\", num_scaler)])\n",
+ "\n",
+ "# Categorical features: fill gaps with the constant \"unknown\", then one-hot encode\n",
+ "# (the first category of every feature is dropped, output is a dense array).\n",
+ "cat_imputer = SimpleImputer(fill_value=\"unknown\", strategy=\"constant\")\n",
+ "cat_encoder = OneHotEncoder(drop=\"first\", sparse_output=False, handle_unknown=\"ignore\")\n",
+ "preprocessing_cat = Pipeline(steps=[(\"imputer\", cat_imputer), (\"encoder\", cat_encoder)])\n",
+ "\n",
+ "# Route each group of columns to its own pipeline; columns in neither group pass through.\n",
+ "features_preprocessing = ColumnTransformer(\n",
+ "    transformers=[\n",
+ "        (\"prepocessing_num\", preprocessing_num, num_columns),\n",
+ "        (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n",
+ "    ],\n",
+ "    remainder=\"passthrough\",\n",
+ "    verbose_feature_names_out=False,\n",
+ ")\n",
+ "\n",
+ "# features_engineering = ColumnTransformer(\n",
+ "# verbose_feature_names_out=False,\n",
+ "# transformers=[\n",
+ "# (\"add_features\", HousesFeatures(), [\"price_category\"]),\n",
+ "# ],\n",
+ "# remainder=\"passthrough\",\n",
+ "# )\n",
+ "\n",
+ "# Removes the columns listed in columns_to_drop; every other column passes through.\n",
+ "drop_columns = ColumnTransformer(\n",
+ "    transformers=[(\"drop_columns\", \"drop\", columns_to_drop)],\n",
+ "    remainder=\"passthrough\",\n",
+ "    verbose_feature_names_out=False,\n",
+ ")\n",
+ "\n",
+ "# Applies the categorical pipeline to the price_category column only; every other\n",
+ "# column passes through.\n",
+ "features_postprocessing = ColumnTransformer(\n",
+ "    transformers=[(\"prepocessing_cat\", preprocessing_cat, [\"price_category\"])],\n",
+ "    remainder=\"passthrough\",\n",
+ "    verbose_feature_names_out=False,\n",
+ ")\n",
+ "\n",
+ "# Main preprocessing pipeline.\n",
+ "#\n",
+ "# * Unwanted columns are dropped first, so the next step only sees the columns that\n",
+ "#   num_columns / cat_columns were built from.\n",
+ "# * Output is switched to pandas so column names survive between the steps; the\n",
+ "#   ColumnTransformer selectors pick columns by name and cannot work on bare arrays.\n",
+ "# * features_engineering and features_postprocessing are not part of the pipeline:\n",
+ "#   features_engineering has no live definition in this notebook (referencing it\n",
+ "#   raises NameError on a fresh kernel), and both select \"price_category\", which\n",
+ "#   features_preprocessing has already replaced with one-hot columns.\n",
+ "pipeline_end = Pipeline(\n",
+ "    [\n",
+ "        (\"drop_columns\", drop_columns),\n",
+ "        (\"features_preprocessing\", features_preprocessing),\n",
+ "    ]\n",
+ ").set_output(transform=\"pandas\")\n",
+ "\n",
+ "# Column names come from the fitted transformers, so no hand-written column list\n",
+ "# can get out of sync with the shape of the transformed data.\n",
+ "preprocessing_result = pipeline_end.fit_transform(X_train)\n",
+ "\n",
+ "# Preprocessing must neither add nor lose rows.\n",
+ "assert len(preprocessing_result) == len(X_train)\n",
+ "\n",
+ "preprocessing_result.head()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "kernel",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}