diff --git a/lab_4/lab_4.ipynb b/lab_4/lab_4.ipynb index d9069a0..50ade99 100644 --- a/lab_4/lab_4.ipynb +++ b/lab_4/lab_4.ipynb @@ -2,384 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 337, + "execution_count": 157, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...gradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15
0712930052020141013T000000221900.031.00118056501.000...711800195509817847.5112-122.25713405650
1641410019220141209T000000538000.032.25257072422.000...72170400195119919812547.7210-122.31916907639
2563150040020150225T000000180000.021.00770100001.000...67700193309802847.7379-122.23327208062
3248720087520141209T000000604000.043.00196050001.000...71050910196509813647.5208-122.39313605000
4195440051020150218T000000510000.032.00168080801.000...816800198709807447.6168-122.04518007503
..................................................................
999532205926420140926T000000279000.021.001020470441.000...710200190419589804247.4206-122.155193012139
9996555750027020150209T000000262000.031.50170095791.000...71100600196209802347.3209-122.33817009628
9997916410012520140807T000000533000.041.00155047501.500...715500191909811747.6824-122.38913204750
9998737060004520150402T000000640000.031.75168081001.002...816800195009817747.7212-122.36418807750
9999859440006020140609T000000285000.032.251680351272.000...716800198709809247.3025-122.067182035166
\n", - "

10000 rows × 21 columns

\n", - "
" - ], - "text/plain": [ - " id date price bedrooms bathrooms sqft_living \\\n", - "0 7129300520 20141013T000000 221900.0 3 1.00 1180 \n", - "1 6414100192 20141209T000000 538000.0 3 2.25 2570 \n", - "2 5631500400 20150225T000000 180000.0 2 1.00 770 \n", - "3 2487200875 20141209T000000 604000.0 4 3.00 1960 \n", - "4 1954400510 20150218T000000 510000.0 3 2.00 1680 \n", - "... ... ... ... ... ... ... \n", - "9995 322059264 20140926T000000 279000.0 2 1.00 1020 \n", - "9996 5557500270 20150209T000000 262000.0 3 1.50 1700 \n", - "9997 9164100125 20140807T000000 533000.0 4 1.00 1550 \n", - "9998 7370600045 20150402T000000 640000.0 3 1.75 1680 \n", - "9999 8594400060 20140609T000000 285000.0 3 2.25 1680 \n", - "\n", - " sqft_lot floors waterfront view ... grade sqft_above \\\n", - "0 5650 1.0 0 0 ... 7 1180 \n", - "1 7242 2.0 0 0 ... 7 2170 \n", - "2 10000 1.0 0 0 ... 6 770 \n", - "3 5000 1.0 0 0 ... 7 1050 \n", - "4 8080 1.0 0 0 ... 8 1680 \n", - "... ... ... ... ... ... ... ... \n", - "9995 47044 1.0 0 0 ... 7 1020 \n", - "9996 9579 1.0 0 0 ... 7 1100 \n", - "9997 4750 1.5 0 0 ... 7 1550 \n", - "9998 8100 1.0 0 2 ... 8 1680 \n", - "9999 35127 2.0 0 0 ... 7 1680 \n", - "\n", - " sqft_basement yr_built yr_renovated zipcode lat long \\\n", - "0 0 1955 0 98178 47.5112 -122.257 \n", - "1 400 1951 1991 98125 47.7210 -122.319 \n", - "2 0 1933 0 98028 47.7379 -122.233 \n", - "3 910 1965 0 98136 47.5208 -122.393 \n", - "4 0 1987 0 98074 47.6168 -122.045 \n", - "... ... ... ... ... ... ... \n", - "9995 0 1904 1958 98042 47.4206 -122.155 \n", - "9996 600 1962 0 98023 47.3209 -122.338 \n", - "9997 0 1919 0 98117 47.6824 -122.389 \n", - "9998 0 1950 0 98177 47.7212 -122.364 \n", - "9999 0 1987 0 98092 47.3025 -122.067 \n", - "\n", - " sqft_living15 sqft_lot15 \n", - "0 1340 5650 \n", - "1 1690 7639 \n", - "2 2720 8062 \n", - "3 1360 5000 \n", - "4 1800 7503 \n", - "... ... ... 
\n", - "9995 1930 12139 \n", - "9996 1700 9628 \n", - "9997 1320 4750 \n", - "9998 1880 7750 \n", - "9999 1820 35166 \n", - "\n", - "[10000 rows x 21 columns]" - ] - }, - "execution_count": 337, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", @@ -387,9 +12,12 @@ "import seaborn as sns\n", "from sklearn.model_selection import train_test_split\n", "from sklearn import set_config\n", + "set_config(transform_output=\"pandas\")\n", "\n", - "df = pd.read_csv(\"data/house_data.csv\", sep=\",\", nrows=10000)\n", - "df.dropna()" + "random_state = 42\n", + "\n", + "# Подключим датафрейм и выгрузим данные\n", + "df = pd.read_csv(\"data/house_data.csv\")" ] }, { @@ -401,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 338, + "execution_count": 158, "metadata": {}, "outputs": [ { @@ -450,483 +78,483 @@ " \n", " \n", " \n", - " 9980\n", - " 6840700036\n", - " 20140728T000000\n", - " 497000.0\n", + " 21593\n", + " 8672200110\n", + " 20150317T000000\n", + " 1088000.0\n", + " 5\n", + " 3.75\n", + " 4170\n", + " 8142\n", + " 2.0\n", + " 0\n", " 2\n", - " 1.00\n", - " 770\n", - " 3325\n", - " 1.0\n", - " 0\n", - " 0\n", " ...\n", - " 770\n", - " 0\n", - " 1918\n", - " 0\n", - " 98122\n", - " 47.6102\n", - " -122.299\n", - " 960\n", - " 4800\n", - " middle\n", - " \n", - " \n", - " 9981\n", - " 1824069083\n", - " 20150429T000000\n", - " 835000.0\n", - " 3\n", - " 1.00\n", - " 3060\n", - " 30166\n", - " 1.0\n", - " 0\n", - " 0\n", - " ...\n", - " 3060\n", - " 0\n", - " 1959\n", - " 0\n", - " 98027\n", - " 47.5656\n", - " -122.093\n", - " 1880\n", - " 19602\n", - " high\n", - " \n", - " \n", - " 9982\n", - " 1836980240\n", - " 20141015T000000\n", - " 730000.0\n", - " 4\n", - " 2.75\n", - " 2920\n", - " 4500\n", - " 2.0\n", - " 0\n", - " 0\n", - " ...\n", - " 2920\n", - " 0\n", - " 1999\n", - " 0\n", - " 98006\n", - " 47.5646\n", - " -122.124\n", - " 2920\n", - " 4505\n", - " high\n", - " 
\n", - " \n", - " 9983\n", - " 3528900160\n", - " 20141001T000000\n", - " 655000.0\n", - " 3\n", - " 1.00\n", - " 1370\n", - " 5250\n", - " 1.0\n", - " 0\n", - " 0\n", - " ...\n", - " 1070\n", - " 300\n", - " 1939\n", - " 0\n", - " 98109\n", - " 47.6421\n", - " -122.348\n", - " 2410\n", - " 4200\n", - " high\n", - " \n", - " \n", - " 9984\n", - " 1442800060\n", - " 20141120T000000\n", - " 205000.0\n", - " 3\n", - " 2.50\n", - " 1870\n", - " 3118\n", - " 2.0\n", - " 0\n", - " 0\n", - " ...\n", - " 1870\n", - " 0\n", - " 1993\n", - " 0\n", - " 98038\n", - " 47.3739\n", - " -122.056\n", - " 1580\n", - " 3601\n", - " low\n", - " \n", - " \n", - " 9985\n", - " 8722100030\n", - " 20150407T000000\n", - " 632750.0\n", - " 4\n", - " 2.00\n", - " 1800\n", - " 4800\n", - " 1.5\n", - " 0\n", - " 0\n", - " ...\n", - " 1800\n", - " 0\n", - " 1918\n", - " 0\n", - " 98112\n", - " 47.6388\n", - " -122.302\n", - " 1950\n", - " 4800\n", - " high\n", - " \n", - " \n", - " 9986\n", - " 1723049624\n", - " 20140512T000000\n", - " 330000.0\n", - " 5\n", - " 3.00\n", - " 2100\n", - " 7715\n", - " 1.0\n", - " 0\n", - " 0\n", - " ...\n", - " 1250\n", - " 850\n", - " 2013\n", - " 0\n", - " 98168\n", - " 47.4866\n", - " -122.319\n", - " 2100\n", - " 7959\n", - " low\n", - " \n", - " \n", - " 9987\n", - " 4040400200\n", - " 20141007T000000\n", - " 527500.0\n", - " 5\n", - " 2.25\n", - " 2530\n", - " 8250\n", - " 2.0\n", - " 0\n", - " 0\n", - " ...\n", - " 2530\n", - " 0\n", - " 1961\n", - " 0\n", - " 98007\n", - " 47.6117\n", - " -122.134\n", - " 2020\n", - " 8250\n", - " middle\n", - " \n", - " \n", - " 9988\n", - " 8691391090\n", - " 20140508T000000\n", - " 716500.0\n", - " 4\n", - " 2.50\n", - " 3290\n", - " 6465\n", - " 2.0\n", - " 0\n", - " 0\n", - " ...\n", - " 3290\n", - " 0\n", - " 2002\n", - " 0\n", - " 98075\n", - " 47.5981\n", - " -121.976\n", - " 3100\n", - " 5929\n", - " high\n", - " \n", - " \n", - " 9989\n", - " 7853302190\n", - " 20141217T000000\n", - " 388500.0\n", - " 4\n", - 
" 2.50\n", - " 1890\n", - " 5395\n", - " 2.0\n", - " 0\n", - " 0\n", - " ...\n", - " 1890\n", + " 4170\n", " 0\n", " 2006\n", " 0\n", - " 98065\n", - " 47.5415\n", - " -121.883\n", - " 2060\n", - " 5395\n", - " middle\n", - " \n", - " \n", - " 9990\n", - " 3260000700\n", - " 20140904T000000\n", - " 530000.0\n", - " 3\n", - " 1.75\n", - " 1680\n", - " 7770\n", - " 1.0\n", - " 0\n", - " 0\n", - " ...\n", - " 1680\n", - " 0\n", - " 1967\n", - " 0\n", - " 98005\n", - " 47.6028\n", - " -122.167\n", - " 1880\n", - " 7770\n", - " middle\n", - " \n", - " \n", - " 9991\n", - " 5126300510\n", - " 20150108T000000\n", - " 419000.0\n", - " 3\n", - " 2.50\n", - " 2170\n", - " 4517\n", - " 2.0\n", - " 0\n", - " 0\n", - " ...\n", - " 2170\n", - " 0\n", - " 2002\n", - " 0\n", - " 98059\n", - " 47.4819\n", - " -122.140\n", - " 2610\n", - " 4770\n", - " middle\n", - " \n", - " \n", - " 9992\n", - " 7199330370\n", - " 20150309T000000\n", - " 385000.0\n", - " 3\n", - " 1.75\n", - " 1200\n", - " 7360\n", - " 1.0\n", - " 0\n", - " 0\n", - " ...\n", - " 1200\n", - " 0\n", - " 1978\n", - " 0\n", - " 98052\n", - " 47.6979\n", - " -122.130\n", - " 1200\n", - " 7500\n", - " middle\n", - " \n", - " \n", - " 9993\n", - " 1854900240\n", - " 20140528T000000\n", - " 655000.0\n", - " 4\n", - " 2.50\n", - " 2990\n", - " 5669\n", - " 2.0\n", - " 0\n", - " 0\n", - " ...\n", - " 2990\n", - " 0\n", - " 2003\n", - " 0\n", - " 98074\n", - " 47.6119\n", - " -122.011\n", - " 3110\n", - " 5058\n", - " high\n", - " \n", - " \n", - " 9994\n", - " 6738700335\n", - " 20140701T000000\n", - " 1127312.5\n", - " 4\n", - " 2.75\n", - " 3770\n", - " 10900\n", - " 2.0\n", - " 0\n", - " 2\n", - " ...\n", - " 3070\n", - " 700\n", - " 1924\n", - " 0\n", - " 98144\n", - " 47.5849\n", - " -122.290\n", - " 3000\n", - " 5000\n", + " 98056\n", + " 47.5354\n", + " -122.181\n", + " 3030\n", + " 7980\n", " very_high\n", " \n", " \n", - " 9995\n", - " 322059264\n", - " 20140926T000000\n", - " 279000.0\n", - " 2\n", - " 1.00\n", - 
" 1020\n", - " 47044\n", - " 1.0\n", - " 0\n", - " 0\n", - " ...\n", - " 1020\n", - " 0\n", - " 1904\n", - " 1958\n", - " 98042\n", - " 47.4206\n", - " -122.155\n", - " 1930\n", - " 12139\n", - " low\n", - " \n", - " \n", - " 9996\n", - " 5557500270\n", - " 20150209T000000\n", - " 262000.0\n", - " 3\n", - " 1.50\n", - " 1700\n", - " 9579\n", - " 1.0\n", - " 0\n", - " 0\n", - " ...\n", - " 1100\n", - " 600\n", - " 1962\n", - " 0\n", - " 98023\n", - " 47.3209\n", - " -122.338\n", - " 1700\n", - " 9628\n", - " low\n", - " \n", - " \n", - " 9997\n", - " 9164100125\n", - " 20140807T000000\n", - " 533000.0\n", + " 21594\n", + " 5087900040\n", + " 20141017T000000\n", + " 350000.0\n", " 4\n", - " 1.00\n", - " 1550\n", - " 4750\n", - " 1.5\n", - " 0\n", - " 0\n", - " ...\n", - " 1550\n", - " 0\n", - " 1919\n", - " 0\n", - " 98117\n", - " 47.6824\n", - " -122.389\n", - " 1320\n", - " 4750\n", - " middle\n", - " \n", - " \n", - " 9998\n", - " 7370600045\n", - " 20150402T000000\n", - " 640000.0\n", - " 3\n", - " 1.75\n", - " 1680\n", - " 8100\n", - " 1.0\n", - " 0\n", - " 2\n", - " ...\n", - " 1680\n", - " 0\n", - " 1950\n", - " 0\n", - " 98177\n", - " 47.7212\n", - " -122.364\n", - " 1880\n", - " 7750\n", - " high\n", - " \n", - " \n", - " 9999\n", - " 8594400060\n", - " 20140609T000000\n", - " 285000.0\n", - " 3\n", - " 2.25\n", - " 1680\n", - " 35127\n", + " 2.75\n", + " 2500\n", + " 5995\n", " 2.0\n", " 0\n", " 0\n", " ...\n", - " 1680\n", + " 2500\n", " 0\n", - " 1987\n", + " 2008\n", " 0\n", - " 98092\n", - " 47.3025\n", - " -122.067\n", - " 1820\n", - " 35166\n", + " 98042\n", + " 47.3749\n", + " -122.107\n", + " 2530\n", + " 5988\n", + " middle\n", + " \n", + " \n", + " 21595\n", + " 1972201967\n", + " 20141031T000000\n", + " 520000.0\n", + " 2\n", + " 2.25\n", + " 1530\n", + " 981\n", + " 3.0\n", + " 0\n", + " 0\n", + " ...\n", + " 1480\n", + " 50\n", + " 2006\n", + " 0\n", + " 98103\n", + " 47.6533\n", + " -122.346\n", + " 1530\n", + " 1282\n", + " middle\n", + " 
\n", + " \n", + " 21596\n", + " 7502800100\n", + " 20140813T000000\n", + " 679950.0\n", + " 5\n", + " 2.75\n", + " 3600\n", + " 9437\n", + " 2.0\n", + " 0\n", + " 0\n", + " ...\n", + " 3600\n", + " 0\n", + " 2014\n", + " 0\n", + " 98059\n", + " 47.4822\n", + " -122.131\n", + " 3550\n", + " 9421\n", + " high\n", + " \n", + " \n", + " 21597\n", + " 191100405\n", + " 20150421T000000\n", + " 1575000.0\n", + " 4\n", + " 3.25\n", + " 3410\n", + " 10125\n", + " 2.0\n", + " 0\n", + " 0\n", + " ...\n", + " 3410\n", + " 0\n", + " 2007\n", + " 0\n", + " 98040\n", + " 47.5653\n", + " -122.223\n", + " 2290\n", + " 10125\n", + " NaN\n", + " \n", + " \n", + " 21598\n", + " 8956200760\n", + " 20141013T000000\n", + " 541800.0\n", + " 4\n", + " 2.50\n", + " 3118\n", + " 7866\n", + " 2.0\n", + " 0\n", + " 2\n", + " ...\n", + " 3118\n", + " 0\n", + " 2014\n", + " 0\n", + " 98001\n", + " 47.2931\n", + " -122.264\n", + " 2673\n", + " 6500\n", + " middle\n", + " \n", + " \n", + " 21599\n", + " 7202300110\n", + " 20140915T000000\n", + " 810000.0\n", + " 4\n", + " 3.00\n", + " 3990\n", + " 7838\n", + " 2.0\n", + " 0\n", + " 0\n", + " ...\n", + " 3990\n", + " 0\n", + " 2003\n", + " 0\n", + " 98053\n", + " 47.6857\n", + " -122.046\n", + " 3370\n", + " 6814\n", + " high\n", + " \n", + " \n", + " 21600\n", + " 249000205\n", + " 20141015T000000\n", + " 1537000.0\n", + " 5\n", + " 3.75\n", + " 4470\n", + " 8088\n", + " 2.0\n", + " 0\n", + " 0\n", + " ...\n", + " 4470\n", + " 0\n", + " 2008\n", + " 0\n", + " 98004\n", + " 47.6321\n", + " -122.200\n", + " 2780\n", + " 8964\n", + " NaN\n", + " \n", + " \n", + " 21601\n", + " 5100403806\n", + " 20150407T000000\n", + " 467000.0\n", + " 3\n", + " 2.50\n", + " 1425\n", + " 1179\n", + " 3.0\n", + " 0\n", + " 0\n", + " ...\n", + " 1425\n", + " 0\n", + " 2008\n", + " 0\n", + " 98125\n", + " 47.6963\n", + " -122.318\n", + " 1285\n", + " 1253\n", + " middle\n", + " \n", + " \n", + " 21602\n", + " 844000965\n", + " 20140626T000000\n", + " 224000.0\n", + " 
3\n", + " 1.75\n", + " 1500\n", + " 11968\n", + " 1.0\n", + " 0\n", + " 0\n", + " ...\n", + " 1500\n", + " 0\n", + " 2014\n", + " 0\n", + " 98010\n", + " 47.3095\n", + " -122.002\n", + " 1320\n", + " 11303\n", + " low\n", + " \n", + " \n", + " 21603\n", + " 7852140040\n", + " 20140825T000000\n", + " 507250.0\n", + " 3\n", + " 2.50\n", + " 2270\n", + " 5536\n", + " 2.0\n", + " 0\n", + " 0\n", + " ...\n", + " 2270\n", + " 0\n", + " 2003\n", + " 0\n", + " 98065\n", + " 47.5389\n", + " -121.881\n", + " 2270\n", + " 5731\n", + " middle\n", + " \n", + " \n", + " 21604\n", + " 9834201367\n", + " 20150126T000000\n", + " 429000.0\n", + " 3\n", + " 2.00\n", + " 1490\n", + " 1126\n", + " 3.0\n", + " 0\n", + " 0\n", + " ...\n", + " 1490\n", + " 0\n", + " 2014\n", + " 0\n", + " 98144\n", + " 47.5699\n", + " -122.288\n", + " 1400\n", + " 1230\n", + " middle\n", + " \n", + " \n", + " 21605\n", + " 3448900210\n", + " 20141014T000000\n", + " 610685.0\n", + " 4\n", + " 2.50\n", + " 2520\n", + " 6023\n", + " 2.0\n", + " 0\n", + " 0\n", + " ...\n", + " 2520\n", + " 0\n", + " 2014\n", + " 0\n", + " 98056\n", + " 47.5137\n", + " -122.167\n", + " 2520\n", + " 6023\n", + " high\n", + " \n", + " \n", + " 21606\n", + " 7936000429\n", + " 20150326T000000\n", + " 1007500.0\n", + " 4\n", + " 3.50\n", + " 3510\n", + " 7200\n", + " 2.0\n", + " 0\n", + " 0\n", + " ...\n", + " 2600\n", + " 910\n", + " 2009\n", + " 0\n", + " 98136\n", + " 47.5537\n", + " -122.398\n", + " 2050\n", + " 6200\n", + " very_high\n", + " \n", + " \n", + " 21607\n", + " 2997800021\n", + " 20150219T000000\n", + " 475000.0\n", + " 3\n", + " 2.50\n", + " 1310\n", + " 1294\n", + " 2.0\n", + " 0\n", + " 0\n", + " ...\n", + " 1180\n", + " 130\n", + " 2008\n", + " 0\n", + " 98116\n", + " 47.5773\n", + " -122.409\n", + " 1330\n", + " 1265\n", + " middle\n", + " \n", + " \n", + " 21608\n", + " 263000018\n", + " 20140521T000000\n", + " 360000.0\n", + " 3\n", + " 2.50\n", + " 1530\n", + " 1131\n", + " 3.0\n", + " 0\n", + " 0\n", + " 
...\n", + " 1530\n", + " 0\n", + " 2009\n", + " 0\n", + " 98103\n", + " 47.6993\n", + " -122.346\n", + " 1530\n", + " 1509\n", + " middle\n", + " \n", + " \n", + " 21609\n", + " 6600060120\n", + " 20150223T000000\n", + " 400000.0\n", + " 4\n", + " 2.50\n", + " 2310\n", + " 5813\n", + " 2.0\n", + " 0\n", + " 0\n", + " ...\n", + " 2310\n", + " 0\n", + " 2014\n", + " 0\n", + " 98146\n", + " 47.5107\n", + " -122.362\n", + " 1830\n", + " 7200\n", + " middle\n", + " \n", + " \n", + " 21610\n", + " 1523300141\n", + " 20140623T000000\n", + " 402101.0\n", + " 2\n", + " 0.75\n", + " 1020\n", + " 1350\n", + " 2.0\n", + " 0\n", + " 0\n", + " ...\n", + " 1020\n", + " 0\n", + " 2009\n", + " 0\n", + " 98144\n", + " 47.5944\n", + " -122.299\n", + " 1020\n", + " 2007\n", + " middle\n", + " \n", + " \n", + " 21611\n", + " 291310100\n", + " 20150116T000000\n", + " 400000.0\n", + " 3\n", + " 2.50\n", + " 1600\n", + " 2388\n", + " 2.0\n", + " 0\n", + " 0\n", + " ...\n", + " 1600\n", + " 0\n", + " 2004\n", + " 0\n", + " 98027\n", + " 47.5345\n", + " -122.069\n", + " 1410\n", + " 1287\n", + " middle\n", + " \n", + " \n", + " 21612\n", + " 1523300157\n", + " 20141015T000000\n", + " 325000.0\n", + " 2\n", + " 0.75\n", + " 1020\n", + " 1076\n", + " 2.0\n", + " 0\n", + " 0\n", + " ...\n", + " 1020\n", + " 0\n", + " 2008\n", + " 0\n", + " 98144\n", + " 47.5941\n", + " -122.299\n", + " 1020\n", + " 1357\n", " low\n", " \n", " \n", @@ -935,113 +563,104 @@ "" ], "text/plain": [ - " id date price bedrooms bathrooms \\\n", - "9980 6840700036 20140728T000000 497000.0 2 1.00 \n", - "9981 1824069083 20150429T000000 835000.0 3 1.00 \n", - "9982 1836980240 20141015T000000 730000.0 4 2.75 \n", - "9983 3528900160 20141001T000000 655000.0 3 1.00 \n", - "9984 1442800060 20141120T000000 205000.0 3 2.50 \n", - "9985 8722100030 20150407T000000 632750.0 4 2.00 \n", - "9986 1723049624 20140512T000000 330000.0 5 3.00 \n", - "9987 4040400200 20141007T000000 527500.0 5 2.25 \n", - "9988 8691391090 20140508T000000 
716500.0 4 2.50 \n", - "9989 7853302190 20141217T000000 388500.0 4 2.50 \n", - "9990 3260000700 20140904T000000 530000.0 3 1.75 \n", - "9991 5126300510 20150108T000000 419000.0 3 2.50 \n", - "9992 7199330370 20150309T000000 385000.0 3 1.75 \n", - "9993 1854900240 20140528T000000 655000.0 4 2.50 \n", - "9994 6738700335 20140701T000000 1127312.5 4 2.75 \n", - "9995 322059264 20140926T000000 279000.0 2 1.00 \n", - "9996 5557500270 20150209T000000 262000.0 3 1.50 \n", - "9997 9164100125 20140807T000000 533000.0 4 1.00 \n", - "9998 7370600045 20150402T000000 640000.0 3 1.75 \n", - "9999 8594400060 20140609T000000 285000.0 3 2.25 \n", + " id date price bedrooms bathrooms \\\n", + "21593 8672200110 20150317T000000 1088000.0 5 3.75 \n", + "21594 5087900040 20141017T000000 350000.0 4 2.75 \n", + "21595 1972201967 20141031T000000 520000.0 2 2.25 \n", + "21596 7502800100 20140813T000000 679950.0 5 2.75 \n", + "21597 191100405 20150421T000000 1575000.0 4 3.25 \n", + "21598 8956200760 20141013T000000 541800.0 4 2.50 \n", + "21599 7202300110 20140915T000000 810000.0 4 3.00 \n", + "21600 249000205 20141015T000000 1537000.0 5 3.75 \n", + "21601 5100403806 20150407T000000 467000.0 3 2.50 \n", + "21602 844000965 20140626T000000 224000.0 3 1.75 \n", + "21603 7852140040 20140825T000000 507250.0 3 2.50 \n", + "21604 9834201367 20150126T000000 429000.0 3 2.00 \n", + "21605 3448900210 20141014T000000 610685.0 4 2.50 \n", + "21606 7936000429 20150326T000000 1007500.0 4 3.50 \n", + "21607 2997800021 20150219T000000 475000.0 3 2.50 \n", + "21608 263000018 20140521T000000 360000.0 3 2.50 \n", + "21609 6600060120 20150223T000000 400000.0 4 2.50 \n", + "21610 1523300141 20140623T000000 402101.0 2 0.75 \n", + "21611 291310100 20150116T000000 400000.0 3 2.50 \n", + "21612 1523300157 20141015T000000 325000.0 2 0.75 \n", "\n", - " sqft_living sqft_lot floors waterfront view ... sqft_above \\\n", - "9980 770 3325 1.0 0 0 ... 770 \n", - "9981 3060 30166 1.0 0 0 ... 
3060 \n", - "9982 2920 4500 2.0 0 0 ... 2920 \n", - "9983 1370 5250 1.0 0 0 ... 1070 \n", - "9984 1870 3118 2.0 0 0 ... 1870 \n", - "9985 1800 4800 1.5 0 0 ... 1800 \n", - "9986 2100 7715 1.0 0 0 ... 1250 \n", - "9987 2530 8250 2.0 0 0 ... 2530 \n", - "9988 3290 6465 2.0 0 0 ... 3290 \n", - "9989 1890 5395 2.0 0 0 ... 1890 \n", - "9990 1680 7770 1.0 0 0 ... 1680 \n", - "9991 2170 4517 2.0 0 0 ... 2170 \n", - "9992 1200 7360 1.0 0 0 ... 1200 \n", - "9993 2990 5669 2.0 0 0 ... 2990 \n", - "9994 3770 10900 2.0 0 2 ... 3070 \n", - "9995 1020 47044 1.0 0 0 ... 1020 \n", - "9996 1700 9579 1.0 0 0 ... 1100 \n", - "9997 1550 4750 1.5 0 0 ... 1550 \n", - "9998 1680 8100 1.0 0 2 ... 1680 \n", - "9999 1680 35127 2.0 0 0 ... 1680 \n", + " sqft_living sqft_lot floors waterfront view ... sqft_above \\\n", + "21593 4170 8142 2.0 0 2 ... 4170 \n", + "21594 2500 5995 2.0 0 0 ... 2500 \n", + "21595 1530 981 3.0 0 0 ... 1480 \n", + "21596 3600 9437 2.0 0 0 ... 3600 \n", + "21597 3410 10125 2.0 0 0 ... 3410 \n", + "21598 3118 7866 2.0 0 2 ... 3118 \n", + "21599 3990 7838 2.0 0 0 ... 3990 \n", + "21600 4470 8088 2.0 0 0 ... 4470 \n", + "21601 1425 1179 3.0 0 0 ... 1425 \n", + "21602 1500 11968 1.0 0 0 ... 1500 \n", + "21603 2270 5536 2.0 0 0 ... 2270 \n", + "21604 1490 1126 3.0 0 0 ... 1490 \n", + "21605 2520 6023 2.0 0 0 ... 2520 \n", + "21606 3510 7200 2.0 0 0 ... 2600 \n", + "21607 1310 1294 2.0 0 0 ... 1180 \n", + "21608 1530 1131 3.0 0 0 ... 1530 \n", + "21609 2310 5813 2.0 0 0 ... 2310 \n", + "21610 1020 1350 2.0 0 0 ... 1020 \n", + "21611 1600 2388 2.0 0 0 ... 1600 \n", + "21612 1020 1076 2.0 0 0 ... 
1020 \n", "\n", - " sqft_basement yr_built yr_renovated zipcode lat long \\\n", - "9980 0 1918 0 98122 47.6102 -122.299 \n", - "9981 0 1959 0 98027 47.5656 -122.093 \n", - "9982 0 1999 0 98006 47.5646 -122.124 \n", - "9983 300 1939 0 98109 47.6421 -122.348 \n", - "9984 0 1993 0 98038 47.3739 -122.056 \n", - "9985 0 1918 0 98112 47.6388 -122.302 \n", - "9986 850 2013 0 98168 47.4866 -122.319 \n", - "9987 0 1961 0 98007 47.6117 -122.134 \n", - "9988 0 2002 0 98075 47.5981 -121.976 \n", - "9989 0 2006 0 98065 47.5415 -121.883 \n", - "9990 0 1967 0 98005 47.6028 -122.167 \n", - "9991 0 2002 0 98059 47.4819 -122.140 \n", - "9992 0 1978 0 98052 47.6979 -122.130 \n", - "9993 0 2003 0 98074 47.6119 -122.011 \n", - "9994 700 1924 0 98144 47.5849 -122.290 \n", - "9995 0 1904 1958 98042 47.4206 -122.155 \n", - "9996 600 1962 0 98023 47.3209 -122.338 \n", - "9997 0 1919 0 98117 47.6824 -122.389 \n", - "9998 0 1950 0 98177 47.7212 -122.364 \n", - "9999 0 1987 0 98092 47.3025 -122.067 \n", + " sqft_basement yr_built yr_renovated zipcode lat long \\\n", + "21593 0 2006 0 98056 47.5354 -122.181 \n", + "21594 0 2008 0 98042 47.3749 -122.107 \n", + "21595 50 2006 0 98103 47.6533 -122.346 \n", + "21596 0 2014 0 98059 47.4822 -122.131 \n", + "21597 0 2007 0 98040 47.5653 -122.223 \n", + "21598 0 2014 0 98001 47.2931 -122.264 \n", + "21599 0 2003 0 98053 47.6857 -122.046 \n", + "21600 0 2008 0 98004 47.6321 -122.200 \n", + "21601 0 2008 0 98125 47.6963 -122.318 \n", + "21602 0 2014 0 98010 47.3095 -122.002 \n", + "21603 0 2003 0 98065 47.5389 -121.881 \n", + "21604 0 2014 0 98144 47.5699 -122.288 \n", + "21605 0 2014 0 98056 47.5137 -122.167 \n", + "21606 910 2009 0 98136 47.5537 -122.398 \n", + "21607 130 2008 0 98116 47.5773 -122.409 \n", + "21608 0 2009 0 98103 47.6993 -122.346 \n", + "21609 0 2014 0 98146 47.5107 -122.362 \n", + "21610 0 2009 0 98144 47.5944 -122.299 \n", + "21611 0 2004 0 98027 47.5345 -122.069 \n", + "21612 0 2008 0 98144 47.5941 -122.299 \n", "\n", - " 
sqft_living15 sqft_lot15 price_category \n", - "9980 960 4800 middle \n", - "9981 1880 19602 high \n", - "9982 2920 4505 high \n", - "9983 2410 4200 high \n", - "9984 1580 3601 low \n", - "9985 1950 4800 high \n", - "9986 2100 7959 low \n", - "9987 2020 8250 middle \n", - "9988 3100 5929 high \n", - "9989 2060 5395 middle \n", - "9990 1880 7770 middle \n", - "9991 2610 4770 middle \n", - "9992 1200 7500 middle \n", - "9993 3110 5058 high \n", - "9994 3000 5000 very_high \n", - "9995 1930 12139 low \n", - "9996 1700 9628 low \n", - "9997 1320 4750 middle \n", - "9998 1880 7750 high \n", - "9999 1820 35166 low \n", + " sqft_living15 sqft_lot15 price_category \n", + "21593 3030 7980 very_high \n", + "21594 2530 5988 middle \n", + "21595 1530 1282 middle \n", + "21596 3550 9421 high \n", + "21597 2290 10125 NaN \n", + "21598 2673 6500 middle \n", + "21599 3370 6814 high \n", + "21600 2780 8964 NaN \n", + "21601 1285 1253 middle \n", + "21602 1320 11303 low \n", + "21603 2270 5731 middle \n", + "21604 1400 1230 middle \n", + "21605 2520 6023 high \n", + "21606 2050 6200 very_high \n", + "21607 1330 1265 middle \n", + "21608 1530 1509 middle \n", + "21609 1830 7200 middle \n", + "21610 1020 2007 middle \n", + "21611 1410 1287 middle \n", + "21612 1020 1357 low \n", "\n", "[20 rows x 22 columns]" ] }, - "execution_count": 338, + "execution_count": 158, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "q1 = df['price'].quantile(0.25) # Находим 1-й квартиль (Q1)\n", - "q3 = df['price'].quantile(0.75) # Находим 3-й квартиль (Q3)\n", - "iqr = q3 - q1 # Вычисляем межквартильный размах (IQR)\n", "\n", - "# Определяем границы для выбросов\n", - "lower_bound = q1 - 1.5 * iqr # Нижняя граница\n", - "upper_bound = q3 + 1.5 * iqr # Верхняя граница\n", - "\n", - "# Устраняем выбросы: заменяем значения ниже нижней границы на саму нижнюю границу, а выше верхней — на верхнюю\n", - "df['price'] = df['price'].apply(lambda x: lower_bound if x < lower_bound else 
upper_bound if x > upper_bound else x)\n", "\n", "# Добавляем столбец с категорями цены\n", "df['price_category'] = pd.cut(df['price'], bins=[75000,338750,602750,866750,1130750], labels=['low','middle','high','very_high'], include_lowest=True)\n", @@ -1053,16 +672,17 @@ "metadata": {}, "source": [ "### Бизнес-цели\n", - "1. Прогноз класса цены недвижимости (Классификация)\n", - "2. Оценка состояния недвижимости (Регрессия)\n", + "1. Задача регрессии – предсказание цены дома (price). Это может помочь риэлторам и аналитикам определить справедливую рыночную стоимость недвижимости.\n", "\n", - "### Определение достижимого уровня качества модели для первой задачи\n", - "#### Разделение набора данных на обучающую и тестовые выборки (80/20) для задачи классификации (Целевой признак - price)" + "2. Задача классификации – определение вероятности того, что цена дома будет выше/ниже медианы рынка. Классифицировать дома по ценовым категориям (например, низкая, средняя, высокая цена). Это может помочь определить, какие дома популярны у покупателей.\n", + "\n", + "### Определение достижимого уровня качества модели для задачи классификации\n", + "#### Разделение набора данных на обучающую и тестовые выборки (80/20) для задачи классификации (Целевой признак - median_price)" ] }, { "cell_type": "code", - "execution_count": 339, + "execution_count": 159, "metadata": {}, "outputs": [ { @@ -1106,7 +726,6 @@ " waterfront\n", " view\n", " ...\n", - " sqft_above\n", " sqft_basement\n", " yr_built\n", " yr_renovated\n", @@ -1116,128 +735,129 @@ " sqft_living15\n", " sqft_lot15\n", " price_category\n", + " median_price\n", " \n", " \n", " \n", " \n", - " 9843\n", - " 3260000340\n", - " 20140622T000000\n", - " 732600.0\n", + " 20962\n", + " 1278000210\n", + " 20150311T000000\n", + " 110000.0\n", + " 2\n", + " 1.00\n", + " 828\n", + " 4524\n", + " 1.0\n", + " 0\n", + " 0\n", + " ...\n", + " 0\n", + " 1968\n", + " 2007\n", + " 98001\n", + " 47.2655\n", + " -122.244\n", + " 828\n", + " 
5402\n", + " 0\n", + " 0\n", + " \n", + " \n", + " 12284\n", + " 2193300390\n", + " 20140923T000000\n", + " 624000.0\n", " 4\n", - " 2.50\n", - " 2130\n", - " 7300\n", + " 3.25\n", + " 2810\n", + " 11250\n", " 1.0\n", " 0\n", " 0\n", " ...\n", - " 1230\n", - " 900\n", - " 1963\n", + " 1130\n", + " 1980\n", " 0\n", - " 98005\n", - " 47.6050\n", - " -122.167\n", - " 2130\n", - " 7560\n", - " high\n", + " 98052\n", + " 47.6920\n", + " -122.099\n", + " 2110\n", + " 11250\n", + " 1\n", + " 1\n", " \n", " \n", - " 9623\n", - " 9828702055\n", - " 20140508T000000\n", - " 358000.0\n", - " 2\n", - " 1.50\n", - " 960\n", - " 1808\n", + " 7343\n", + " 4289900005\n", + " 20141230T000000\n", + " 1535000.0\n", + " 4\n", + " 3.25\n", + " 2850\n", + " 4100\n", " 2.0\n", " 0\n", - " 0\n", + " 3\n", " ...\n", - " 960\n", - " 0\n", - " 1993\n", - " 0\n", + " 1030\n", + " 1908\n", + " 2003\n", " 98122\n", - " 47.6183\n", - " -122.298\n", - " 1290\n", - " 1668\n", - " middle\n", - " \n", - " \n", - " 3095\n", - " 3438500625\n", - " 20140519T000000\n", - " 210000.0\n", - " 3\n", - " 1.00\n", - " 1080\n", - " 21043\n", - " 1.0\n", - " 0\n", - " 0\n", - " ...\n", - " 1080\n", - " 0\n", - " 1942\n", - " 0\n", - " 98106\n", - " 47.5515\n", - " -122.357\n", - " 1380\n", - " 7620\n", - " low\n", - " \n", - " \n", - " 411\n", - " 2422029094\n", - " 20140716T000000\n", - " 517534.0\n", + " 47.6147\n", + " -122.285\n", + " 2130\n", + " 4200\n", " 2\n", + " 1\n", + " \n", + " \n", + " 14247\n", + " 316000145\n", + " 20150325T000000\n", + " 235000.0\n", + " 4\n", " 1.00\n", - " 833\n", - " 143947\n", - " 1.0\n", + " 1360\n", + " 7132\n", + " 1.5\n", " 0\n", " 0\n", " ...\n", - " 833\n", " 0\n", - " 2006\n", + " 1941\n", + " 0\n", + " 98168\n", + " 47.5054\n", + " -122.301\n", + " 1280\n", + " 7175\n", + " 0\n", " 0\n", - " 98070\n", - " 47.3889\n", - " -122.482\n", - " 1380\n", - " 143947\n", - " middle\n", " \n", " \n", - " 3060\n", - " 7462900015\n", - " 20150108T000000\n", - " 387000.0\n", - " 
3\n", - " 2.25\n", - " 1760\n", - " 45133\n", + " 16670\n", + " 629400480\n", + " 20140619T000000\n", + " 775000.0\n", + " 4\n", + " 2.75\n", + " 3010\n", + " 15992\n", " 2.0\n", " 0\n", " 0\n", " ...\n", - " 1760\n", " 0\n", - " 1984\n", + " 1996\n", " 0\n", - " 98065\n", - " 47.5124\n", - " -121.866\n", - " 1910\n", - " 51773\n", - " middle\n", + " 98075\n", + " 47.5895\n", + " -121.994\n", + " 3330\n", + " 12333\n", + " 2\n", + " 1\n", " \n", " \n", " ...\n", @@ -1264,184 +884,184 @@ " ...\n", " \n", " \n", - " 1750\n", - " 2787720140\n", - " 20150407T000000\n", - " 416000.0\n", - " 3\n", - " 2.50\n", - " 1790\n", - " 11542\n", - " 1.0\n", - " 0\n", - " 0\n", - " ...\n", - " 1190\n", - " 600\n", - " 1969\n", - " 0\n", - " 98059\n", - " 47.5124\n", - " -122.160\n", - " 1790\n", - " 9131\n", - " middle\n", - " \n", - " \n", - " 2354\n", - " 6192400400\n", - " 20140728T000000\n", - " 775000.0\n", - " 4\n", - " 2.50\n", - " 3090\n", - " 7112\n", + " 88\n", + " 1332700270\n", + " 20140519T000000\n", + " 215000.0\n", + " 2\n", + " 2.25\n", + " 1610\n", + " 2040\n", " 2.0\n", " 0\n", " 0\n", " ...\n", - " 3090\n", " 0\n", - " 2001\n", - " 0\n", - " 98052\n", - " 47.7050\n", - " -122.118\n", - " 3050\n", - " 6000\n", - " high\n", - " \n", - " \n", - " 857\n", - " 2296500036\n", - " 20150310T000000\n", - " 450000.0\n", - " 4\n", - " 2.75\n", - " 2980\n", - " 13260\n", - " 1.0\n", - " 0\n", - " 0\n", - " ...\n", - " 1800\n", - " 1180\n", " 1979\n", " 0\n", " 98056\n", - " 47.5152\n", - " -122.197\n", - " 1920\n", - " 10731\n", - " middle\n", + " 47.5180\n", + " -122.194\n", + " 1950\n", + " 2025\n", + " 0\n", + " 0\n", " \n", " \n", - " 6181\n", - " 2787310130\n", - " 20141212T000000\n", - " 289950.0\n", + " 15031\n", + " 7129303070\n", + " 20140820T000000\n", + " 735000.0\n", + " 4\n", + " 2.75\n", + " 3040\n", + " 2415\n", + " 2.0\n", + " 1\n", " 4\n", - " 1.75\n", - " 2090\n", - " 7416\n", - " 1.0\n", - " 0\n", - " 0\n", " ...\n", - " 1050\n", - " 1040\n", - " 1970\n", 
" 0\n", - " 98031\n", - " 47.4107\n", - " -122.179\n", - " 1710\n", - " 7527\n", - " low\n", + " 1966\n", + " 0\n", + " 98118\n", + " 47.5188\n", + " -122.256\n", + " 2620\n", + " 2433\n", + " 2\n", + " 1\n", " \n", " \n", - " 3141\n", - " 8567300110\n", - " 20140604T000000\n", - " 485000.0\n", + " 5234\n", + " 2432000130\n", + " 20150414T000000\n", + " 675000.0\n", " 3\n", - " 2.50\n", - " 2340\n", - " 59058\n", + " 1.75\n", + " 1660\n", + " 9549\n", " 1.0\n", " 0\n", " 0\n", " ...\n", - " 2340\n", " 0\n", - " 1985\n", + " 1956\n", " 0\n", - " 98038\n", - " 47.4052\n", - " -122.028\n", - " 2700\n", - " 37263\n", - " middle\n", + " 98033\n", + " 47.6503\n", + " -122.198\n", + " 2090\n", + " 9549\n", + " 1\n", + " 1\n", + " \n", + " \n", + " 19980\n", + " 774100475\n", + " 20140627T000000\n", + " 415000.0\n", + " 3\n", + " 2.75\n", + " 2600\n", + " 64626\n", + " 1.5\n", + " 0\n", + " 0\n", + " ...\n", + " 0\n", + " 2009\n", + " 0\n", + " 98014\n", + " 47.7185\n", + " -121.405\n", + " 1740\n", + " 64626\n", + " 1\n", + " 0\n", + " \n", + " \n", + " 3671\n", + " 8847400115\n", + " 20140723T000000\n", + " 590000.0\n", + " 3\n", + " 2.00\n", + " 2420\n", + " 208652\n", + " 1.5\n", + " 0\n", + " 0\n", + " ...\n", + " 0\n", + " 2005\n", + " 0\n", + " 98010\n", + " 47.3666\n", + " -121.978\n", + " 3180\n", + " 212137\n", + " 1\n", + " 1\n", " \n", " \n", "\n", - "

8000 rows × 22 columns

\n", + "

17290 rows × 23 columns

\n", "" ], "text/plain": [ - " id date price bedrooms bathrooms sqft_living \\\n", - "9843 3260000340 20140622T000000 732600.0 4 2.50 2130 \n", - "9623 9828702055 20140508T000000 358000.0 2 1.50 960 \n", - "3095 3438500625 20140519T000000 210000.0 3 1.00 1080 \n", - "411 2422029094 20140716T000000 517534.0 2 1.00 833 \n", - "3060 7462900015 20150108T000000 387000.0 3 2.25 1760 \n", - "... ... ... ... ... ... ... \n", - "1750 2787720140 20150407T000000 416000.0 3 2.50 1790 \n", - "2354 6192400400 20140728T000000 775000.0 4 2.50 3090 \n", - "857 2296500036 20150310T000000 450000.0 4 2.75 2980 \n", - "6181 2787310130 20141212T000000 289950.0 4 1.75 2090 \n", - "3141 8567300110 20140604T000000 485000.0 3 2.50 2340 \n", + " id date price bedrooms bathrooms \\\n", + "20962 1278000210 20150311T000000 110000.0 2 1.00 \n", + "12284 2193300390 20140923T000000 624000.0 4 3.25 \n", + "7343 4289900005 20141230T000000 1535000.0 4 3.25 \n", + "14247 316000145 20150325T000000 235000.0 4 1.00 \n", + "16670 629400480 20140619T000000 775000.0 4 2.75 \n", + "... ... ... ... ... ... \n", + "88 1332700270 20140519T000000 215000.0 2 2.25 \n", + "15031 7129303070 20140820T000000 735000.0 4 2.75 \n", + "5234 2432000130 20150414T000000 675000.0 3 1.75 \n", + "19980 774100475 20140627T000000 415000.0 3 2.75 \n", + "3671 8847400115 20140723T000000 590000.0 3 2.00 \n", "\n", - " sqft_lot floors waterfront view ... sqft_above sqft_basement \\\n", - "9843 7300 1.0 0 0 ... 1230 900 \n", - "9623 1808 2.0 0 0 ... 960 0 \n", - "3095 21043 1.0 0 0 ... 1080 0 \n", - "411 143947 1.0 0 0 ... 833 0 \n", - "3060 45133 2.0 0 0 ... 1760 0 \n", - "... ... ... ... ... ... ... ... \n", - "1750 11542 1.0 0 0 ... 1190 600 \n", - "2354 7112 2.0 0 0 ... 3090 0 \n", - "857 13260 1.0 0 0 ... 1800 1180 \n", - "6181 7416 1.0 0 0 ... 1050 1040 \n", - "3141 59058 1.0 0 0 ... 2340 0 \n", + " sqft_living sqft_lot floors waterfront view ... sqft_basement \\\n", + "20962 828 4524 1.0 0 0 ... 
0 \n", + "12284 2810 11250 1.0 0 0 ... 1130 \n", + "7343 2850 4100 2.0 0 3 ... 1030 \n", + "14247 1360 7132 1.5 0 0 ... 0 \n", + "16670 3010 15992 2.0 0 0 ... 0 \n", + "... ... ... ... ... ... ... ... \n", + "88 1610 2040 2.0 0 0 ... 0 \n", + "15031 3040 2415 2.0 1 4 ... 0 \n", + "5234 1660 9549 1.0 0 0 ... 0 \n", + "19980 2600 64626 1.5 0 0 ... 0 \n", + "3671 2420 208652 1.5 0 0 ... 0 \n", "\n", - " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", - "9843 1963 0 98005 47.6050 -122.167 2130 \n", - "9623 1993 0 98122 47.6183 -122.298 1290 \n", - "3095 1942 0 98106 47.5515 -122.357 1380 \n", - "411 2006 0 98070 47.3889 -122.482 1380 \n", - "3060 1984 0 98065 47.5124 -121.866 1910 \n", - "... ... ... ... ... ... ... \n", - "1750 1969 0 98059 47.5124 -122.160 1790 \n", - "2354 2001 0 98052 47.7050 -122.118 3050 \n", - "857 1979 0 98056 47.5152 -122.197 1920 \n", - "6181 1970 0 98031 47.4107 -122.179 1710 \n", - "3141 1985 0 98038 47.4052 -122.028 2700 \n", + " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", + "20962 1968 2007 98001 47.2655 -122.244 828 \n", + "12284 1980 0 98052 47.6920 -122.099 2110 \n", + "7343 1908 2003 98122 47.6147 -122.285 2130 \n", + "14247 1941 0 98168 47.5054 -122.301 1280 \n", + "16670 1996 0 98075 47.5895 -121.994 3330 \n", + "... ... ... ... ... ... ... \n", + "88 1979 0 98056 47.5180 -122.194 1950 \n", + "15031 1966 0 98118 47.5188 -122.256 2620 \n", + "5234 1956 0 98033 47.6503 -122.198 2090 \n", + "19980 2009 0 98014 47.7185 -121.405 1740 \n", + "3671 2005 0 98010 47.3666 -121.978 3180 \n", "\n", - " sqft_lot15 price_category \n", - "9843 7560 high \n", - "9623 1668 middle \n", - "3095 7620 low \n", - "411 143947 middle \n", - "3060 51773 middle \n", - "... ... ... 
\n", - "1750 9131 middle \n", - "2354 6000 high \n", - "857 10731 middle \n", - "6181 7527 low \n", - "3141 37263 middle \n", + " sqft_lot15 price_category median_price \n", + "20962 5402 0 0 \n", + "12284 11250 1 1 \n", + "7343 4200 2 1 \n", + "14247 7175 0 0 \n", + "16670 12333 2 1 \n", + "... ... ... ... \n", + "88 2025 0 0 \n", + "15031 2433 2 1 \n", + "5234 9549 1 1 \n", + "19980 64626 1 0 \n", + "3671 212137 1 1 \n", "\n", - "[8000 rows x 22 columns]" + "[17290 rows x 23 columns]" ] }, "metadata": {}, @@ -1477,74 +1097,74 @@ " \n", " \n", " \n", - " price_category\n", + " median_price\n", " \n", " \n", " \n", " \n", - " 9843\n", - " high\n", + " 20962\n", + " 0\n", " \n", " \n", - " 9623\n", - " middle\n", + " 12284\n", + " 1\n", " \n", " \n", - " 3095\n", - " low\n", + " 7343\n", + " 1\n", " \n", " \n", - " 411\n", - " middle\n", + " 14247\n", + " 0\n", " \n", " \n", - " 3060\n", - " middle\n", + " 16670\n", + " 1\n", " \n", " \n", " ...\n", " ...\n", " \n", " \n", - " 1750\n", - " middle\n", + " 88\n", + " 0\n", " \n", " \n", - " 2354\n", - " high\n", + " 15031\n", + " 1\n", " \n", " \n", - " 857\n", - " middle\n", + " 5234\n", + " 1\n", " \n", " \n", - " 6181\n", - " low\n", + " 19980\n", + " 0\n", " \n", " \n", - " 3141\n", - " middle\n", + " 3671\n", + " 1\n", " \n", " \n", "\n", - "

8000 rows × 1 columns

\n", + "

17290 rows × 1 columns

\n", "" ], "text/plain": [ - " price_category\n", - "9843 high\n", - "9623 middle\n", - "3095 low\n", - "411 middle\n", - "3060 middle\n", + " median_price\n", + "20962 0\n", + "12284 1\n", + "7343 1\n", + "14247 0\n", + "16670 1\n", "... ...\n", - "1750 middle\n", - "2354 high\n", - "857 middle\n", - "6181 low\n", - "3141 middle\n", + "88 0\n", + "15031 1\n", + "5234 1\n", + "19980 0\n", + "3671 1\n", "\n", - "[8000 rows x 1 columns]" + "[17290 rows x 1 columns]" ] }, "metadata": {}, @@ -1591,7 +1211,6 @@ " waterfront\n", " view\n", " ...\n", - " sqft_above\n", " sqft_basement\n", " yr_built\n", " yr_renovated\n", @@ -1601,128 +1220,129 @@ " sqft_living15\n", " sqft_lot15\n", " price_category\n", + " median_price\n", " \n", " \n", " \n", " \n", - " 5341\n", - " 6632900574\n", - " 20150225T000000\n", - " 595000.0\n", - " 5\n", - " 3.00\n", - " 2980\n", - " 10064\n", + " 11592\n", + " 2028701000\n", + " 20140529T000000\n", + " 635200.0\n", + " 4\n", + " 1.75\n", + " 1640\n", + " 4240\n", " 1.0\n", " 0\n", " 0\n", " ...\n", - " 1680\n", - " 1300\n", - " 1940\n", + " 720\n", + " 1921\n", " 0\n", - " 98155\n", - " 47.7372\n", - " -122.316\n", - " 1590\n", - " 7800\n", - " middle\n", + " 98117\n", + " 47.6766\n", + " -122.368\n", + " 1300\n", + " 4240\n", + " 1\n", + " 1\n", " \n", " \n", - " 4384\n", - " 2423029245\n", - " 20140617T000000\n", - " 550000.0\n", - " 3\n", - " 1.75\n", - " 2240\n", - " 78225\n", + " 8984\n", + " 9406500530\n", + " 20140912T000000\n", + " 249000.0\n", + " 2\n", + " 2.00\n", + " 1090\n", + " 1357\n", " 2.0\n", " 0\n", " 0\n", " ...\n", - " 2240\n", " 0\n", - " 1976\n", + " 1990\n", + " 0\n", + " 98028\n", + " 47.7526\n", + " -122.244\n", + " 1078\n", + " 1318\n", + " 0\n", " 0\n", - " 98070\n", - " 47.4638\n", - " -122.484\n", - " 2030\n", - " 202554\n", - " middle\n", " \n", " \n", - " 5795\n", - " 2473370050\n", - " 20140604T000000\n", - " 327500.0\n", - " 4\n", - " 1.75\n", - " 1650\n", - " 7800\n", - " 1.0\n", + " 8280\n", + " 
8097000330\n", + " 20140721T000000\n", + " 359950.0\n", + " 3\n", + " 2.75\n", + " 2540\n", + " 8604\n", + " 2.0\n", " 0\n", " 0\n", " ...\n", - " 1650\n", " 0\n", - " 1968\n", + " 1991\n", + " 0\n", + " 98092\n", + " 47.3209\n", + " -122.185\n", + " 2260\n", + " 7438\n", + " 1\n", " 0\n", - " 98058\n", - " 47.4507\n", - " -122.139\n", - " 1750\n", - " 10400\n", - " low\n", " \n", " \n", - " 4956\n", - " 9528104985\n", - " 20141104T000000\n", - " 611000.0\n", + " 792\n", + " 8081020370\n", + " 20140709T000000\n", + " 1355000.0\n", + " 4\n", + " 3.50\n", + " 3550\n", + " 11000\n", + " 1.0\n", + " 0\n", + " 2\n", + " ...\n", + " 1290\n", + " 1999\n", + " 0\n", + " 98006\n", + " 47.5506\n", + " -122.134\n", + " 4100\n", + " 10012\n", + " 2\n", + " 1\n", + " \n", + " \n", + " 10371\n", + " 7518507580\n", + " 20150502T000000\n", + " 581000.0\n", " 2\n", " 1.00\n", - " 1270\n", - " 5100\n", + " 1170\n", + " 4080\n", " 1.0\n", " 0\n", " 0\n", " ...\n", - " 1100\n", - " 170\n", - " 1900\n", " 0\n", - " 98115\n", - " 47.6771\n", - " -122.328\n", - " 1670\n", - " 3900\n", - " high\n", - " \n", - " \n", - " 7723\n", - " 3972900025\n", - " 20150313T000000\n", - " 499000.0\n", - " 6\n", - " 1.75\n", - " 2400\n", - " 7500\n", - " 1.5\n", + " 1909\n", " 0\n", - " 0\n", - " ...\n", - " 1400\n", - " 1000\n", - " 1975\n", - " 0\n", - " 98155\n", - " 47.7661\n", - " -122.313\n", - " 1980\n", - " 7500\n", - " middle\n", + " 98117\n", + " 47.6784\n", + " -122.386\n", + " 1560\n", + " 4586\n", + " 1\n", + " 1\n", " \n", " \n", " ...\n", @@ -1749,184 +1369,184 @@ " ...\n", " \n", " \n", - " 8517\n", - " 3876600120\n", - " 20150422T000000\n", - " 265000.0\n", - " 3\n", - " 1.50\n", - " 1780\n", - " 10196\n", - " 1.0\n", - " 0\n", - " 0\n", - " ...\n", - " 1270\n", - " 510\n", - " 1967\n", - " 0\n", - " 98001\n", - " 47.3375\n", - " -122.291\n", - " 1320\n", - " 7875\n", - " low\n", - " \n", - " \n", - " 6914\n", - " 6821600005\n", - " 20150403T000000\n", - " 710000.0\n", - " 4\n", - " 
1.75\n", - " 2120\n", - " 5400\n", - " 1.0\n", - " 0\n", - " 0\n", - " ...\n", - " 1060\n", - " 1060\n", - " 1941\n", - " 0\n", - " 98199\n", - " 47.6501\n", - " -122.395\n", - " 2052\n", - " 6000\n", - " high\n", - " \n", - " \n", - " 4499\n", - " 2767603931\n", - " 20140818T000000\n", - " 469000.0\n", - " 3\n", - " 3.25\n", - " 1370\n", - " 1194\n", - " 3.0\n", - " 0\n", - " 0\n", - " ...\n", - " 1370\n", - " 0\n", - " 2004\n", - " 0\n", - " 98107\n", - " 47.6718\n", - " -122.388\n", - " 1800\n", - " 2678\n", - " middle\n", - " \n", - " \n", - " 8651\n", - " 8802400411\n", - " 20140619T000000\n", - " 249000.0\n", - " 3\n", - " 1.00\n", - " 1050\n", - " 8498\n", - " 1.0\n", - " 0\n", - " 0\n", - " ...\n", - " 1050\n", - " 0\n", - " 1959\n", - " 0\n", - " 98031\n", - " 47.4043\n", - " -122.202\n", - " 1050\n", - " 8498\n", - " low\n", - " \n", - " \n", - " 4234\n", - " 5452800735\n", - " 20140722T000000\n", - " 780000.0\n", + " 16733\n", + " 7212650950\n", + " 20140708T000000\n", + " 336000.0\n", " 4\n", " 2.50\n", - " 2270\n", - " 13449\n", + " 2530\n", + " 8169\n", + " 2.0\n", + " 0\n", + " 0\n", + " ...\n", + " 0\n", + " 1993\n", + " 0\n", + " 98003\n", + " 47.2634\n", + " -122.312\n", + " 2220\n", + " 8013\n", + " 1\n", + " 0\n", + " \n", + " \n", + " 13151\n", + " 4365200620\n", + " 20150312T000000\n", + " 394000.0\n", + " 3\n", + " 1.00\n", + " 1450\n", + " 7930\n", " 1.0\n", " 0\n", " 0\n", " ...\n", - " 1310\n", - " 960\n", - " 1975\n", + " 300\n", + " 1923\n", " 0\n", - " 98040\n", - " 47.5416\n", - " -122.232\n", - " 2810\n", - " 13475\n", - " high\n", + " 98126\n", + " 47.5212\n", + " -122.371\n", + " 1040\n", + " 7740\n", + " 1\n", + " 0\n", + " \n", + " \n", + " 11667\n", + " 4083304355\n", + " 20150318T000000\n", + " 675000.0\n", + " 4\n", + " 1.75\n", + " 1530\n", + " 3615\n", + " 1.5\n", + " 0\n", + " 0\n", + " ...\n", + " 0\n", + " 1913\n", + " 0\n", + " 98103\n", + " 47.6529\n", + " -122.334\n", + " 1650\n", + " 4200\n", + " 1\n", + " 1\n", + " 
\n", + " \n", + " 3683\n", + " 2891100820\n", + " 20140825T000000\n", + " 213500.0\n", + " 3\n", + " 1.00\n", + " 1220\n", + " 6000\n", + " 1.0\n", + " 0\n", + " 0\n", + " ...\n", + " 0\n", + " 1968\n", + " 0\n", + " 98002\n", + " 47.3245\n", + " -122.209\n", + " 1420\n", + " 6000\n", + " 0\n", + " 0\n", + " \n", + " \n", + " 12059\n", + " 952000640\n", + " 20141027T000000\n", + " 715000.0\n", + " 3\n", + " 1.50\n", + " 1670\n", + " 5060\n", + " 2.0\n", + " 0\n", + " 2\n", + " ...\n", + " 0\n", + " 1925\n", + " 0\n", + " 98126\n", + " 47.5671\n", + " -122.379\n", + " 1670\n", + " 5118\n", + " 2\n", + " 1\n", " \n", " \n", "\n", - "

2000 rows × 22 columns

\n", + "

4323 rows × 23 columns

\n", "" ], "text/plain": [ - " id date price bedrooms bathrooms sqft_living \\\n", - "5341 6632900574 20150225T000000 595000.0 5 3.00 2980 \n", - "4384 2423029245 20140617T000000 550000.0 3 1.75 2240 \n", - "5795 2473370050 20140604T000000 327500.0 4 1.75 1650 \n", - "4956 9528104985 20141104T000000 611000.0 2 1.00 1270 \n", - "7723 3972900025 20150313T000000 499000.0 6 1.75 2400 \n", - "... ... ... ... ... ... ... \n", - "8517 3876600120 20150422T000000 265000.0 3 1.50 1780 \n", - "6914 6821600005 20150403T000000 710000.0 4 1.75 2120 \n", - "4499 2767603931 20140818T000000 469000.0 3 3.25 1370 \n", - "8651 8802400411 20140619T000000 249000.0 3 1.00 1050 \n", - "4234 5452800735 20140722T000000 780000.0 4 2.50 2270 \n", + " id date price bedrooms bathrooms \\\n", + "11592 2028701000 20140529T000000 635200.0 4 1.75 \n", + "8984 9406500530 20140912T000000 249000.0 2 2.00 \n", + "8280 8097000330 20140721T000000 359950.0 3 2.75 \n", + "792 8081020370 20140709T000000 1355000.0 4 3.50 \n", + "10371 7518507580 20150502T000000 581000.0 2 1.00 \n", + "... ... ... ... ... ... \n", + "16733 7212650950 20140708T000000 336000.0 4 2.50 \n", + "13151 4365200620 20150312T000000 394000.0 3 1.00 \n", + "11667 4083304355 20150318T000000 675000.0 4 1.75 \n", + "3683 2891100820 20140825T000000 213500.0 3 1.00 \n", + "12059 952000640 20141027T000000 715000.0 3 1.50 \n", "\n", - " sqft_lot floors waterfront view ... sqft_above sqft_basement \\\n", - "5341 10064 1.0 0 0 ... 1680 1300 \n", - "4384 78225 2.0 0 0 ... 2240 0 \n", - "5795 7800 1.0 0 0 ... 1650 0 \n", - "4956 5100 1.0 0 0 ... 1100 170 \n", - "7723 7500 1.5 0 0 ... 1400 1000 \n", - "... ... ... ... ... ... ... ... \n", - "8517 10196 1.0 0 0 ... 1270 510 \n", - "6914 5400 1.0 0 0 ... 1060 1060 \n", - "4499 1194 3.0 0 0 ... 1370 0 \n", - "8651 8498 1.0 0 0 ... 1050 0 \n", - "4234 13449 1.0 0 0 ... 1310 960 \n", + " sqft_living sqft_lot floors waterfront view ... sqft_basement \\\n", + "11592 1640 4240 1.0 0 0 ... 
720 \n", + "8984 1090 1357 2.0 0 0 ... 0 \n", + "8280 2540 8604 2.0 0 0 ... 0 \n", + "792 3550 11000 1.0 0 2 ... 1290 \n", + "10371 1170 4080 1.0 0 0 ... 0 \n", + "... ... ... ... ... ... ... ... \n", + "16733 2530 8169 2.0 0 0 ... 0 \n", + "13151 1450 7930 1.0 0 0 ... 300 \n", + "11667 1530 3615 1.5 0 0 ... 0 \n", + "3683 1220 6000 1.0 0 0 ... 0 \n", + "12059 1670 5060 2.0 0 2 ... 0 \n", "\n", - " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", - "5341 1940 0 98155 47.7372 -122.316 1590 \n", - "4384 1976 0 98070 47.4638 -122.484 2030 \n", - "5795 1968 0 98058 47.4507 -122.139 1750 \n", - "4956 1900 0 98115 47.6771 -122.328 1670 \n", - "7723 1975 0 98155 47.7661 -122.313 1980 \n", - "... ... ... ... ... ... ... \n", - "8517 1967 0 98001 47.3375 -122.291 1320 \n", - "6914 1941 0 98199 47.6501 -122.395 2052 \n", - "4499 2004 0 98107 47.6718 -122.388 1800 \n", - "8651 1959 0 98031 47.4043 -122.202 1050 \n", - "4234 1975 0 98040 47.5416 -122.232 2810 \n", + " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", + "11592 1921 0 98117 47.6766 -122.368 1300 \n", + "8984 1990 0 98028 47.7526 -122.244 1078 \n", + "8280 1991 0 98092 47.3209 -122.185 2260 \n", + "792 1999 0 98006 47.5506 -122.134 4100 \n", + "10371 1909 0 98117 47.6784 -122.386 1560 \n", + "... ... ... ... ... ... ... \n", + "16733 1993 0 98003 47.2634 -122.312 2220 \n", + "13151 1923 0 98126 47.5212 -122.371 1040 \n", + "11667 1913 0 98103 47.6529 -122.334 1650 \n", + "3683 1968 0 98002 47.3245 -122.209 1420 \n", + "12059 1925 0 98126 47.5671 -122.379 1670 \n", "\n", - " sqft_lot15 price_category \n", - "5341 7800 middle \n", - "4384 202554 middle \n", - "5795 10400 low \n", - "4956 3900 high \n", - "7723 7500 middle \n", - "... ... ... 
\n", - "8517 7875 low \n", - "6914 6000 high \n", - "4499 2678 middle \n", - "8651 8498 low \n", - "4234 13475 high \n", + " sqft_lot15 price_category median_price \n", + "11592 4240 1 1 \n", + "8984 1318 0 0 \n", + "8280 7438 1 0 \n", + "792 10012 2 1 \n", + "10371 4586 1 1 \n", + "... ... ... ... \n", + "16733 8013 1 0 \n", + "13151 7740 1 0 \n", + "11667 4200 1 1 \n", + "3683 6000 0 0 \n", + "12059 5118 2 1 \n", "\n", - "[2000 rows x 22 columns]" + "[4323 rows x 23 columns]" ] }, "metadata": {}, @@ -1962,78 +1582,108 @@ " \n", " \n", " \n", - " price_category\n", + " median_price\n", " \n", " \n", " \n", " \n", - " 5341\n", - " middle\n", + " 11592\n", + " 1\n", " \n", " \n", - " 4384\n", - " middle\n", + " 8984\n", + " 0\n", " \n", " \n", - " 5795\n", - " low\n", + " 8280\n", + " 0\n", " \n", " \n", - " 4956\n", - " high\n", + " 792\n", + " 1\n", " \n", " \n", - " 7723\n", - " middle\n", + " 10371\n", + " 1\n", " \n", " \n", " ...\n", " ...\n", " \n", " \n", - " 8517\n", - " low\n", + " 16733\n", + " 0\n", " \n", " \n", - " 6914\n", - " high\n", + " 13151\n", + " 0\n", " \n", " \n", - " 4499\n", - " middle\n", + " 11667\n", + " 1\n", " \n", " \n", - " 8651\n", - " low\n", + " 3683\n", + " 0\n", " \n", " \n", - " 4234\n", - " high\n", + " 12059\n", + " 1\n", " \n", " \n", "\n", - "

2000 rows × 1 columns

\n", + "

4323 rows × 1 columns

\n", "" ], "text/plain": [ - " price_category\n", - "5341 middle\n", - "4384 middle\n", - "5795 low\n", - "4956 high\n", - "7723 middle\n", + " median_price\n", + "11592 1\n", + "8984 0\n", + "8280 0\n", + "792 1\n", + "10371 1\n", "... ...\n", - "8517 low\n", - "6914 high\n", - "4499 middle\n", - "8651 low\n", - "4234 high\n", + "16733 0\n", + "13151 0\n", + "11667 1\n", + "3683 0\n", + "12059 1\n", "\n", - "[2000 rows x 1 columns]" + "[4323 rows x 1 columns]" ] }, "metadata": {}, "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id int64\n", + "date object\n", + "price float64\n", + "bedrooms int64\n", + "bathrooms float64\n", + "sqft_living int64\n", + "sqft_lot int64\n", + "floors float64\n", + "waterfront int64\n", + "view int64\n", + "condition int64\n", + "grade int64\n", + "sqft_above int64\n", + "sqft_basement int64\n", + "yr_built int64\n", + "yr_renovated int64\n", + "zipcode int64\n", + "lat float64\n", + "long float64\n", + "sqft_living15 int64\n", + "sqft_lot15 int64\n", + "price_category category\n", + "median_price int64\n", + "dtype: object\n" + ] } ], "source": [ @@ -2042,6 +1692,21 @@ "from pandas import DataFrame\n", "from sklearn.model_selection import train_test_split\n", "\n", + "# Создание целевого признака\n", + "median_price = df['price'].median()\n", + "df['median_price'] = np.where(df['price'] > median_price, 1, 0)\n", + "\n", + "# Разделение на признаки и целевую переменную\n", + "X = df.drop(columns=['id', 'date', 'price', 'median_price'])\n", + "y = df['median_price']\n", + "\n", + "# Примерная категоризация\n", + "df['price_category'] = pd.cut(df['price'], bins=[0, 300000, 700000, np.inf], labels=[0, 1, 2])\n", + "\n", + "# Выбор признаков и целевых переменных\n", + "X = df.drop(columns=['id', 'date', 'price', 'price_category'])\n", + "\n", + "\n", "def split_stratified_into_train_val_test(\n", " df_input,\n", " stratify_colname=\"y\",\n", @@ -2056,21 +1721,25 @@ " \"fractions %f, 
%f, %f do not add up to 1.0\"\n", " % (frac_train, frac_val, frac_test)\n", " )\n", + " \n", " if stratify_colname not in df_input.columns:\n", " raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n", " X = df_input # Contains all columns.\n", " y = df_input[\n", " [stratify_colname]\n", " ] # Dataframe of just the column on which to stratify.\n", + " \n", " # Split original dataframe into train and temp dataframes.\n", " df_train, df_temp, y_train, y_temp = train_test_split(\n", " X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n", " )\n", + "\n", " if frac_val <= 0:\n", " assert len(df_input) == len(df_train) + len(df_temp)\n", " return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n", " # Split the temp dataframe into val and test dataframes.\n", " relative_frac_test = frac_test / (frac_val + frac_test)\n", + "\n", " df_val, df_test, y_val, y_test = train_test_split(\n", " df_temp,\n", " y_temp,\n", @@ -2078,18 +1747,21 @@ " test_size=relative_frac_test,\n", " random_state=random_state,\n", " )\n", + "\n", " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n", " return df_train, df_val, df_test, y_train, y_val, y_test\n", "\n", "X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n", - " df, stratify_colname=\"price_category\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=42\n", + " df, stratify_colname=\"median_price\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=42\n", ")\n", "\n", "display(\"X_train\", X_train)\n", "display(\"y_train\", y_train)\n", "\n", "display(\"X_test\", X_test)\n", - "display(\"y_test\", y_test)" + "display(\"y_test\", y_test)\n", + "\n", + "print(df.dtypes)" ] }, { @@ -2110,7 +1782,91 @@ }, { "cell_type": "code", - "execution_count": 340, + "execution_count": 160, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from sklearn.base import BaseEstimator, 
TransformerMixin\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.discriminant_analysis import StandardScaler\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.ensemble import RandomForestRegressor # Пример регрессионной модели\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.pipeline import make_pipeline\n", + "\n", + "pipeline_end = StandardScaler()\n", + "\n", + "\n", + "class HouseFeatures(BaseEstimator, TransformerMixin):\n", + " def __init__(self):\n", + " pass\n", + " def fit(self, X, y=None):\n", + " return self\n", + " def transform(self, X, y=None):\n", + " # Создание новых признаков\n", + " X = X.copy()\n", + " X[\"Living_area_to_Lot_ratio\"] = X[\"sqft_living\"] / X[\"sqft_lot\"]\n", + " return X\n", + " def get_feature_names_out(self, features_in):\n", + " # Добавление имен новых признаков\n", + " new_features = [\"Living_area_to_Lot_ratio\"]\n", + " return np.append(features_in, new_features, axis=0)\n", + "\n", + "#Предобработка числовых значений. 
Заполнение пустых значений на медиану.\n", + "preprocessing_num_class = Pipeline(steps=[\n", + " ('imputer', SimpleImputer(strategy='median')),\n", + " ('scaler', StandardScaler())\n", + "])\n", + "\n", + "#Предобработка категориальных значений\n", + "preprocessing_cat_class = Pipeline(steps=[\n", + " ('imputer', SimpleImputer(strategy='most_frequent')),\n", + " ('onehot', OneHotEncoder(handle_unknown='ignore'))\n", + "])\n", + "\n", + "columns_to_drop = [\"date\"]\n", + "numeric_columns = [\"sqft_living\", \"sqft_lot\", \"median_price\"]\n", + "cat_columns = []\n", + "\n", + "features_preprocessing = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"prepocessing_num\", preprocessing_num_class, numeric_columns),\n", + " (\"prepocessing_cat\", preprocessing_cat_class, cat_columns),\n", + " ],\n", + " remainder=\"passthrough\"\n", + ")\n", + "\n", + "drop_columns = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"drop_columns\", \"drop\", columns_to_drop),\n", + " ],\n", + " remainder=\"passthrough\",\n", + ")\n", + "\n", + "pipeline_end = Pipeline(\n", + " [\n", + " (\"features_preprocessing\", features_preprocessing),\n", + " (\"custom_features\", HouseFeatures()),\n", + " (\"drop_columns\", drop_columns),\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Пример работы конвейера." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 161, "metadata": {}, "outputs": [ { @@ -2134,681 +1890,368 @@ " \n", " \n", " \n", + " sqft_living\n", + " sqft_lot\n", + " median_price\n", " id\n", " price\n", " bedrooms\n", " bathrooms\n", - " sqft_living\n", - " sqft_lot\n", " floors\n", - " condition\n", - " grade\n", - " sqft_above\n", + " waterfront\n", + " view\n", " ...\n", + " sqft_basement\n", + " yr_built\n", " yr_renovated\n", " zipcode\n", " lat\n", " long\n", " sqft_living15\n", " sqft_lot15\n", - " price_h\n", - " price_l\n", - " price_m\n", - " price_vh\n", + " price_category\n", + " Living_area_to_Lot_ratio\n", " \n", " \n", " \n", " \n", - " 0\n", - " -0.451103\n", - " 0.916381\n", - " 0.700559\n", - " 0.573416\n", - " 0.081706\n", - " -0.187493\n", - " -0.838739\n", - " 0.839159\n", - " -0.512647\n", - " -0.638064\n", - " ...\n", - " -0.2158\n", - " -1.349962\n", - " 0.32254\n", - " 0.340593\n", - " 0.223199\n", - " -0.210584\n", + " 20962\n", + " -1.360742\n", + " -0.262132\n", + " -0.994693\n", + " 1278000210\n", + " 110000.0\n", + " 2\n", + " 1.00\n", " 1.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " 0\n", + " ...\n", + " 0\n", + " 1968\n", + " 2007\n", + " 98001\n", + " 47.2655\n", + " -122.244\n", + " 828\n", + " 5402\n", + " 0\n", + " 5.191063\n", " \n", " \n", - " 1\n", - " 1.845014\n", - " -0.589326\n", - " -1.49426\n", - " -0.72971\n", - " -1.191326\n", - " -0.302999\n", - " 1.120073\n", - " -0.666734\n", - " -0.512647\n", - " -0.969739\n", - " ...\n", - " -0.2158\n", - " 0.820656\n", - " 0.417588\n", - " -0.601419\n", - " -1.022503\n", - " -0.421966\n", - " 0.0\n", - " 0.0\n", + " 12284\n", + " 0.794390\n", + " -0.094121\n", + " 1.005335\n", + " 2193300390\n", + " 624000.0\n", + " 4\n", + " 3.25\n", " 1.0\n", - " 0.0\n", + " 0\n", + " 0\n", + " ...\n", + " 1130\n", + " 1980\n", + " 0\n", + " 98052\n", + " 47.6920\n", + " -122.099\n", + " 2110\n", + " 11250\n", + " 1\n", + " -8.440052\n", " \n", " \n", - " 2\n", - " 
-0.388708\n", - " -1.184213\n", - " -0.396851\n", - " -1.381273\n", - " -1.060759\n", - " 0.101544\n", - " -0.838739\n", - " -0.666734\n", - " -1.369558\n", - " -0.822328\n", + " 7343\n", + " 0.837884\n", + " -0.272723\n", + " 1.005335\n", + " 4289900005\n", + " 1535000.0\n", + " 4\n", + " 3.25\n", + " 2.0\n", + " 0\n", + " 3\n", " ...\n", - " -0.2158\n", - " 0.523819\n", - " -0.059795\n", - " -1.025683\n", - " -0.889035\n", - " -0.208431\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", + " 1030\n", + " 1908\n", + " 2003\n", + " 98122\n", + " 47.6147\n", + " -122.285\n", + " 2130\n", + " 4200\n", + " 2\n", + " -3.072292\n", " \n", " \n", - " 3\n", - " -0.74402\n", - " 0.051922\n", - " -1.49426\n", - " -1.381273\n", - " -1.32951\n", - " 2.686416\n", - " -0.838739\n", - " -0.666734\n", - " -2.22647\n", - " -1.125749\n", + " 14247\n", + " -0.782270\n", + " -0.196986\n", + " -0.994693\n", + " 316000145\n", + " 235000.0\n", + " 4\n", + " 1.00\n", + " 1.5\n", + " 0\n", + " 0\n", " ...\n", - " -0.2158\n", - " -0.144063\n", - " -1.221808\n", - " -1.924549\n", - " -0.889035\n", - " 4.682444\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", + " 0\n", + " 1941\n", + " 0\n", + " 98168\n", + " 47.5054\n", + " -122.301\n", + " 1280\n", + " 7175\n", + " 0\n", + " 3.971201\n", " \n", " \n", - " 4\n", - " 1.018038\n", - " -0.47276\n", - " -0.396851\n", - " 0.247635\n", - " -0.320877\n", - " 0.608196\n", - " 1.120073\n", - " -0.666734\n", - " -0.512647\n", - " 0.013003\n", + " 16670\n", + " 1.011860\n", + " 0.024330\n", + " 1.005335\n", + " 629400480\n", + " 775000.0\n", + " 4\n", + " 2.75\n", + " 2.0\n", + " 0\n", + " 0\n", " ...\n", - " -0.2158\n", - " -0.236825\n", - " -0.339221\n", - " 2.505062\n", - " -0.103056\n", - " 1.375604\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", + " 0\n", + " 1996\n", + " 0\n", + " 98075\n", + " 47.5895\n", + " -121.994\n", + " 3330\n", + " 12333\n", + " 2\n", + " 41.589045\n", " \n", " \n", - " 5\n", - " -0.083826\n", - " -0.492858\n", - 
" -0.396851\n", - " 1.550761\n", - " -0.701698\n", - " -0.314672\n", - " 3.078884\n", - " -0.666734\n", - " 0.344264\n", - " -0.416947\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", " ...\n", - " -0.2158\n", - " 0.468162\n", - " 0.987875\n", - " -0.903438\n", - " -0.844546\n", - " -0.436854\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", " \n", " \n", - " 6\n", - " 0.301277\n", - " -0.953091\n", - " -0.396851\n", - " 0.573416\n", - " -0.712579\n", - " -0.180574\n", - " -0.838739\n", - " -0.666734\n", - " -0.512647\n", - " -0.773191\n", + " 88\n", + " -0.510432\n", + " -0.324180\n", + " -0.994693\n", + " 1332700270\n", + " 215000.0\n", + " 2\n", + " 2.25\n", + " 2.0\n", + " 0\n", + " 0\n", " ...\n", - " -0.2158\n", - " -0.886155\n", - " -1.293987\n", - " 0.254302\n", - " -0.666588\n", - " -0.205992\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " 1979\n", + " 0\n", + " 98056\n", + " 47.5180\n", + " -122.194\n", + " 1950\n", + " 2025\n", + " 0\n", + " 1.574534\n", " \n", " \n", - " 7\n", - " -0.086798\n", - " -1.148038\n", - " -1.49426\n", - " -1.381273\n", - " -1.25661\n", - " -0.232501\n", - " -0.838739\n", - " -0.666734\n", - " -1.369558\n", - " -1.043445\n", + " 15031\n", + " 1.044481\n", + " -0.314813\n", + " 1.005335\n", + " 7129303070\n", + " 735000.0\n", + " 4\n", + " 2.75\n", + " 2.0\n", + " 1\n", + " 4\n", " ...\n", - " -0.2158\n", - " 0.523819\n", - " -0.249176\n", - " -1.018493\n", - " -1.600865\n", - " -0.296686\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " 1966\n", + " 0\n", + " 98118\n", + " 47.5188\n", + " -122.256\n", + " 2620\n", + " 2433\n", + " 2\n", + " -3.317784\n", " \n", " \n", - " 8\n", - " -0.824567\n", - " -1.148038\n", - " -1.49426\n", - " -1.381273\n", - " -1.0934\n", - " -0.15174\n", - " 
0.140667\n", - " 0.839159\n", - " -0.512647\n", - " -0.859181\n", - " ...\n", - " -0.2158\n", - " -1.387066\n", - " -1.937882\n", - " -0.60861\n", - " -0.636929\n", - " -0.137397\n", - " 0.0\n", + " 5234\n", + " -0.456065\n", + " -0.136611\n", + " 1.005335\n", + " 2432000130\n", + " 675000.0\n", + " 3\n", + " 1.75\n", " 1.0\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " 0\n", + " ...\n", + " 0\n", + " 1956\n", + " 0\n", + " 98033\n", + " 47.6503\n", + " -122.198\n", + " 2090\n", + " 9549\n", + " 1\n", + " 3.338418\n", " \n", " \n", - " 9\n", - " 1.647935\n", - " -0.762165\n", - " 2.895378\n", - " 0.899198\n", - " 0.963036\n", - " -0.186442\n", - " -0.838739\n", - " -0.666734\n", - " 0.344264\n", - " 0.037571\n", + " 19980\n", + " 0.566046\n", + " 1.239169\n", + " -0.994693\n", + " 774100475\n", + " 415000.0\n", + " 3\n", + " 2.75\n", + " 1.5\n", + " 0\n", + " 0\n", " ...\n", - " -0.2158\n", - " -1.016021\n", - " -1.783519\n", - " -0.896247\n", - " 0.208369\n", - " -0.186332\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " 2009\n", + " 0\n", + " 98014\n", + " 47.7185\n", + " -121.405\n", + " 1740\n", + " 64626\n", + " 1\n", + " 0.456795\n", " \n", " \n", - " 10\n", - " -1.159614\n", - " -0.581287\n", - " -1.49426\n", - " -1.381273\n", - " -1.321893\n", - " -0.185096\n", - " -0.838739\n", - " 0.839159\n", - " -1.369558\n", - " -1.11715\n", + " 3671\n", + " 0.370323\n", + " 4.836825\n", + " 1.005335\n", + " 8847400115\n", + " 590000.0\n", + " 3\n", + " 2.00\n", + " 1.5\n", + " 0\n", + " 0\n", " ...\n", - " -0.2158\n", - " -0.830498\n", - " 0.837799\n", - " 0.304638\n", - " -0.355163\n", - " -0.130796\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " \n", - " \n", - " 11\n", - " -1.329183\n", - " -0.681775\n", - " -1.49426\n", - " -1.381273\n", - " -1.071639\n", - " -0.200575\n", - " -0.838739\n", - " 0.839159\n", - " -0.512647\n", - " -0.834612\n", - " ...\n", - " -0.2158\n", - " 1.024731\n", - " 1.226566\n", - " -1.025683\n", - " -0.444141\n", - " 
-0.202404\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " 12\n", - " 0.377864\n", - " 0.286926\n", - " 0.700559\n", - " 0.573416\n", - " 0.419005\n", - " 0.256379\n", - " 1.120073\n", - " -0.666734\n", - " 0.344264\n", - " 0.848334\n", - " ...\n", - " -0.2158\n", - " -0.923259\n", - " 1.277306\n", - " -0.169963\n", - " 0.742242\n", - " -0.071779\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " \n", - " \n", - " 13\n", - " 0.289882\n", - " -0.88677\n", - " -0.396851\n", - " 0.573416\n", - " 0.103467\n", - " -0.143853\n", - " -0.838739\n", - " -0.666734\n", - " 0.344264\n", - " -0.244967\n", - " ...\n", - " -0.2158\n", - " 2.045107\n", - " -0.729417\n", - " -0.428836\n", - " -0.043737\n", - " -0.155335\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " 14\n", - " 1.613049\n", - " 0.282907\n", - " -0.396851\n", - " -0.078147\n", - " 0.103467\n", - " -0.259422\n", - " -0.838739\n", - " -0.666734\n", - " 0.344264\n", - " -0.822328\n", - " ...\n", - " -0.2158\n", - " 0.727894\n", - " 0.868529\n", - " -1.277366\n", - " 0.223199\n", - " -0.338303\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " \n", - " \n", - " 15\n", - " -0.962885\n", - " 0.285118\n", - " 0.700559\n", - " 0.573416\n", - " 0.005542\n", - " -0.183813\n", - " -0.838739\n", - " -0.666734\n", - " 0.344264\n", - " -0.380094\n", - " ...\n", - " -0.2158\n", - " -0.478004\n", - " 1.195837\n", - " 0.78643\n", - " 0.445646\n", - " -0.180592\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " \n", - " \n", - " 16\n", - " 1.722145\n", - " -0.259726\n", - " -0.396851\n", - " -0.403928\n", - " -0.571131\n", - " -0.18865\n", - " -0.838739\n", - " 0.839159\n", - " -0.512647\n", - " -0.269535\n", - " ...\n", - " -0.2158\n", - " -0.811945\n", - " 1.222993\n", - " 0.168011\n", - " -0.666588\n", - " -0.213095\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " \n", - " \n", - " 17\n", - " 0.740562\n", - " 1.589247\n", - " 0.700559\n", - " 1.550761\n", - " 
2.878025\n", - " 0.466843\n", - " 1.120073\n", - " -0.666734\n", - " 2.058087\n", - " 2.052192\n", - " ...\n", - " -0.2158\n", - " -1.349962\n", - " 0.604825\n", - " 0.340593\n", - " 2.462498\n", - " 0.79434\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " \n", - " \n", - " 18\n", - " -1.555659\n", - " -0.922945\n", - " -0.396851\n", - " -1.381273\n", - " -0.799624\n", - " -0.107784\n", - " -0.838739\n", - " -0.666734\n", - " -0.512647\n", - " -0.527505\n", - " ...\n", - " -0.2158\n", - " 1.432881\n", - " 1.536008\n", - " -0.644564\n", - " -0.978014\n", - " -0.183354\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " 19\n", - " -0.953738\n", - " 0.142224\n", - " 2.895378\n", - " 1.224979\n", - " 0.886872\n", - " 4.00146\n", - " 1.120073\n", - " -0.666734\n", - " -0.512647\n", - " 0.713207\n", - " ...\n", - " 4.605736\n", - " -0.663527\n", - " -1.135335\n", - " 0.85834\n", - " 0.593944\n", - " 1.659169\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", + " 0\n", + " 2005\n", + " 0\n", + " 98010\n", + " 47.3666\n", + " -121.978\n", + " 3180\n", + " 212137\n", + " 1\n", + " 0.076563\n", " \n", " \n", "\n", - "

20 rows × 22 columns

\n", + "

17290 rows × 23 columns

\n", "" ], "text/plain": [ - " id price bedrooms bathrooms sqft_living sqft_lot floors \\\n", - "0 -0.451103 0.916381 0.700559 0.573416 0.081706 -0.187493 -0.838739 \n", - "1 1.845014 -0.589326 -1.49426 -0.72971 -1.191326 -0.302999 1.120073 \n", - "2 -0.388708 -1.184213 -0.396851 -1.381273 -1.060759 0.101544 -0.838739 \n", - "3 -0.74402 0.051922 -1.49426 -1.381273 -1.32951 2.686416 -0.838739 \n", - "4 1.018038 -0.47276 -0.396851 0.247635 -0.320877 0.608196 1.120073 \n", - "5 -0.083826 -0.492858 -0.396851 1.550761 -0.701698 -0.314672 3.078884 \n", - "6 0.301277 -0.953091 -0.396851 0.573416 -0.712579 -0.180574 -0.838739 \n", - "7 -0.086798 -1.148038 -1.49426 -1.381273 -1.25661 -0.232501 -0.838739 \n", - "8 -0.824567 -1.148038 -1.49426 -1.381273 -1.0934 -0.15174 0.140667 \n", - "9 1.647935 -0.762165 2.895378 0.899198 0.963036 -0.186442 -0.838739 \n", - "10 -1.159614 -0.581287 -1.49426 -1.381273 -1.321893 -0.185096 -0.838739 \n", - "11 -1.329183 -0.681775 -1.49426 -1.381273 -1.071639 -0.200575 -0.838739 \n", - "12 0.377864 0.286926 0.700559 0.573416 0.419005 0.256379 1.120073 \n", - "13 0.289882 -0.88677 -0.396851 0.573416 0.103467 -0.143853 -0.838739 \n", - "14 1.613049 0.282907 -0.396851 -0.078147 0.103467 -0.259422 -0.838739 \n", - "15 -0.962885 0.285118 0.700559 0.573416 0.005542 -0.183813 -0.838739 \n", - "16 1.722145 -0.259726 -0.396851 -0.403928 -0.571131 -0.18865 -0.838739 \n", - "17 0.740562 1.589247 0.700559 1.550761 2.878025 0.466843 1.120073 \n", - "18 -1.555659 -0.922945 -0.396851 -1.381273 -0.799624 -0.107784 -0.838739 \n", - "19 -0.953738 0.142224 2.895378 1.224979 0.886872 4.00146 1.120073 \n", + " sqft_living sqft_lot median_price id price bedrooms \\\n", + "20962 -1.360742 -0.262132 -0.994693 1278000210 110000.0 2 \n", + "12284 0.794390 -0.094121 1.005335 2193300390 624000.0 4 \n", + "7343 0.837884 -0.272723 1.005335 4289900005 1535000.0 4 \n", + "14247 -0.782270 -0.196986 -0.994693 316000145 235000.0 4 \n", + "16670 1.011860 0.024330 1.005335 
629400480 775000.0 4 \n", + "... ... ... ... ... ... ... \n", + "88 -0.510432 -0.324180 -0.994693 1332700270 215000.0 2 \n", + "15031 1.044481 -0.314813 1.005335 7129303070 735000.0 4 \n", + "5234 -0.456065 -0.136611 1.005335 2432000130 675000.0 3 \n", + "19980 0.566046 1.239169 -0.994693 774100475 415000.0 3 \n", + "3671 0.370323 4.836825 1.005335 8847400115 590000.0 3 \n", "\n", - " condition grade sqft_above ... yr_renovated zipcode lat \\\n", - "0 0.839159 -0.512647 -0.638064 ... -0.2158 -1.349962 0.32254 \n", - "1 -0.666734 -0.512647 -0.969739 ... -0.2158 0.820656 0.417588 \n", - "2 -0.666734 -1.369558 -0.822328 ... -0.2158 0.523819 -0.059795 \n", - "3 -0.666734 -2.22647 -1.125749 ... -0.2158 -0.144063 -1.221808 \n", - "4 -0.666734 -0.512647 0.013003 ... -0.2158 -0.236825 -0.339221 \n", - "5 -0.666734 0.344264 -0.416947 ... -0.2158 0.468162 0.987875 \n", - "6 -0.666734 -0.512647 -0.773191 ... -0.2158 -0.886155 -1.293987 \n", - "7 -0.666734 -1.369558 -1.043445 ... -0.2158 0.523819 -0.249176 \n", - "8 0.839159 -0.512647 -0.859181 ... -0.2158 -1.387066 -1.937882 \n", - "9 -0.666734 0.344264 0.037571 ... -0.2158 -1.016021 -1.783519 \n", - "10 0.839159 -1.369558 -1.11715 ... -0.2158 -0.830498 0.837799 \n", - "11 0.839159 -0.512647 -0.834612 ... -0.2158 1.024731 1.226566 \n", - "12 -0.666734 0.344264 0.848334 ... -0.2158 -0.923259 1.277306 \n", - "13 -0.666734 0.344264 -0.244967 ... -0.2158 2.045107 -0.729417 \n", - "14 -0.666734 0.344264 -0.822328 ... -0.2158 0.727894 0.868529 \n", - "15 -0.666734 0.344264 -0.380094 ... -0.2158 -0.478004 1.195837 \n", - "16 0.839159 -0.512647 -0.269535 ... -0.2158 -0.811945 1.222993 \n", - "17 -0.666734 2.058087 2.052192 ... -0.2158 -1.349962 0.604825 \n", - "18 -0.666734 -0.512647 -0.527505 ... -0.2158 1.432881 1.536008 \n", - "19 -0.666734 -0.512647 0.713207 ... 4.605736 -0.663527 -1.135335 \n", + " bathrooms floors waterfront view ... sqft_basement yr_built \\\n", + "20962 1.00 1.0 0 0 ... 0 1968 \n", + "12284 3.25 1.0 0 0 ... 
1130 1980 \n", + "7343 3.25 2.0 0 3 ... 1030 1908 \n", + "14247 1.00 1.5 0 0 ... 0 1941 \n", + "16670 2.75 2.0 0 0 ... 0 1996 \n", + "... ... ... ... ... ... ... ... \n", + "88 2.25 2.0 0 0 ... 0 1979 \n", + "15031 2.75 2.0 1 4 ... 0 1966 \n", + "5234 1.75 1.0 0 0 ... 0 1956 \n", + "19980 2.75 1.5 0 0 ... 0 2009 \n", + "3671 2.00 1.5 0 0 ... 0 2005 \n", "\n", - " long sqft_living15 sqft_lot15 price_h price_l price_m price_vh \n", - "0 0.340593 0.223199 -0.210584 1.0 0.0 0.0 0.0 \n", - "1 -0.601419 -1.022503 -0.421966 0.0 0.0 1.0 0.0 \n", - "2 -1.025683 -0.889035 -0.208431 0.0 1.0 0.0 0.0 \n", - "3 -1.924549 -0.889035 4.682444 0.0 0.0 1.0 0.0 \n", - "4 2.505062 -0.103056 1.375604 0.0 0.0 1.0 0.0 \n", - "5 -0.903438 -0.844546 -0.436854 0.0 0.0 1.0 0.0 \n", - "6 0.254302 -0.666588 -0.205992 0.0 1.0 0.0 0.0 \n", - "7 -1.018493 -1.600865 -0.296686 0.0 1.0 0.0 0.0 \n", - "8 -0.60861 -0.636929 -0.137397 0.0 1.0 0.0 0.0 \n", - "9 -0.896247 0.208369 -0.186332 0.0 1.0 0.0 0.0 \n", - "10 0.304638 -0.355163 -0.130796 0.0 0.0 1.0 0.0 \n", - "11 -1.025683 -0.444141 -0.202404 0.0 1.0 0.0 0.0 \n", - "12 -0.169963 0.742242 -0.071779 0.0 0.0 1.0 0.0 \n", - "13 -0.428836 -0.043737 -0.155335 0.0 1.0 0.0 0.0 \n", - "14 -1.277366 0.223199 -0.338303 0.0 0.0 1.0 0.0 \n", - "15 0.78643 0.445646 -0.180592 0.0 0.0 1.0 0.0 \n", - "16 0.168011 -0.666588 -0.213095 0.0 0.0 1.0 0.0 \n", - "17 0.340593 2.462498 0.79434 0.0 0.0 0.0 1.0 \n", - "18 -0.644564 -0.978014 -0.183354 0.0 1.0 0.0 0.0 \n", - "19 0.85834 0.593944 1.659169 0.0 0.0 1.0 0.0 \n", + " yr_renovated zipcode lat long sqft_living15 sqft_lot15 \\\n", + "20962 2007 98001 47.2655 -122.244 828 5402 \n", + "12284 0 98052 47.6920 -122.099 2110 11250 \n", + "7343 2003 98122 47.6147 -122.285 2130 4200 \n", + "14247 0 98168 47.5054 -122.301 1280 7175 \n", + "16670 0 98075 47.5895 -121.994 3330 12333 \n", + "... ... ... ... ... ... ... 
\n", + "88 0 98056 47.5180 -122.194 1950 2025 \n", + "15031 0 98118 47.5188 -122.256 2620 2433 \n", + "5234 0 98033 47.6503 -122.198 2090 9549 \n", + "19980 0 98014 47.7185 -121.405 1740 64626 \n", + "3671 0 98010 47.3666 -121.978 3180 212137 \n", "\n", - "[20 rows x 22 columns]" + " price_category Living_area_to_Lot_ratio \n", + "20962 0 5.191063 \n", + "12284 1 -8.440052 \n", + "7343 2 -3.072292 \n", + "14247 0 3.971201 \n", + "16670 2 41.589045 \n", + "... ... ... \n", + "88 0 1.574534 \n", + "15031 2 -3.317784 \n", + "5234 1 3.338418 \n", + "19980 1 0.456795 \n", + "3671 1 0.076563 \n", + "\n", + "[17290 rows x 23 columns]" ] }, - "execution_count": 340, + "execution_count": 161, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import numpy as np\n", - "from sklearn.base import BaseEstimator, TransformerMixin\n", - "from sklearn.compose import ColumnTransformer\n", - "from sklearn.discriminant_analysis import StandardScaler\n", - "from sklearn.impute import SimpleImputer\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.preprocessing import OneHotEncoder\n", - "from sklearn.ensemble import RandomForestRegressor # Пример регрессионной модели\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.pipeline import make_pipeline\n", - "\n", - "random_state = 42\n", - "\n", - "# Указываем столбцы, которые нужно удалить и обрабатывать\n", - "columns_to_drop = [\"date\", \"view\", \"waterfront\"]\n", - "num_columns = [\n", - " column\n", - " for column in df.columns\n", - " if column not in columns_to_drop and df[column].dtype != \"object\" and df[column].dtype != \"category\"\n", - "]\n", - "cat_columns = [\n", - " column\n", - " for column in df.columns\n", - " if column not in columns_to_drop and df[column].dtype == \"object\" or df[column].dtype == \"category\"\n", - "]\n", - "\n", - "# Определяем предобработку для численных данных\n", - "num_imputer = SimpleImputer(strategy=\"median\")\n", - "num_scaler = 
StandardScaler()\n", - "preprocessing_num = Pipeline(\n", - " [\n", - " (\"imputer\", num_imputer),\n", - " (\"scaler\", num_scaler),\n", - " ]\n", + "preprocessing_result = pipeline_end.fit_transform(X_train)\n", + "preprocessed_df = pd.DataFrame(\n", + " preprocessing_result,\n", + " columns=pipeline_end.get_feature_names_out(),\n", ")\n", "\n", - "# Определяем предобработку для категориальных данных\n", - "cat_imputer = SimpleImputer(strategy=\"constant\")\n", - "cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False)\n", - "preprocessing_cat = Pipeline(\n", - " [\n", - " (\"imputer\", cat_imputer),\n", - " (\"encoder\", cat_encoder),\n", - " ]\n", - ")\n", - "\n", - "features_preprocessing = ColumnTransformer(\n", - " verbose_feature_names_out=False,\n", - " transformers=[\n", - " (\"prepocessing_num\", preprocessing_num, num_columns),\n", - " (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n", - " (\"prepocessing_features\", cat_imputer, [\"price_category\"]),\n", - " ],\n", - " remainder=\"passthrough\"\n", - ")\n", - "\n", - "drop_columns = ColumnTransformer(\n", - " verbose_feature_names_out=False,\n", - " transformers=[\n", - " (\"drop_columns\", \"drop\", columns_to_drop),\n", - " ],\n", - " remainder=\"passthrough\",\n", - ")\n", - "\n", - "features_postprocessing = ColumnTransformer(\n", - " verbose_feature_names_out=False,\n", - " transformers=[\n", - " (\"prepocessing_cat\", preprocessing_cat, [\"price_category\"]),\n", - " ],\n", - " remainder=\"passthrough\",\n", - ")\n", - "\n", - "pipeline_end = Pipeline(\n", - " [\n", - " (\"features_preprocessing\", features_preprocessing),\n", - " (\"drop_columns\", drop_columns),\n", - " (\"features_postprocessing\", features_postprocessing),\n", - " ]\n", - "\n", - ")\n", - "# preprocessing_result = pipeline_end.fit_transform(X_train.values)\n", - "cols = ['price_h', 'price_l', 'price_m', 'price_vh']\n", - "preprocessing_result = features_preprocessing.fit_transform(X_train)\n", - 
"preprocessing_result = pd.DataFrame(preprocessing_result, columns=num_columns + cat_columns + cols + columns_to_drop)\n", - "\n", - "preprocessing_result = drop_columns.fit_transform(preprocessing_result)\n", - "preprocessing_result = pd.DataFrame(preprocessing_result, columns=num_columns + cols + cat_columns)\n", - "\n", - "preprocessing_result = preprocessing_result.drop(columns=[\"price_category\"])\n", - "preprocessing_result.head(20)" + "preprocessed_df" ] }, { @@ -2835,33 +2278,36 @@ }, { "cell_type": "code", - "execution_count": 341, + "execution_count": 162, "metadata": {}, "outputs": [], "source": [ - "from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree\n", + "from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree, svm\n", "\n", "class_models = {\n", - " \"logistic\": {\"model\": linear_model.LogisticRegression()},\n", - " # \"ridge\": {\"model\": linear_model.RidgeClassifierCV(cv=5, class_weight=\"balanced\")},\n", - " \"ridge\": {\"model\": linear_model.LogisticRegression(penalty=\"l2\", class_weight=\"balanced\")},\n", + " \"logistic\": {\"model\": linear_model.LogisticRegression(max_iter=150)},\n", + " \"ridge\": {\"model\": linear_model.RidgeClassifierCV(cv=5, class_weight=\"balanced\")},\n", + " \"ridge\": {\"model\": linear_model.LogisticRegression(max_iter=150, solver='lbfgs', penalty=\"l2\", class_weight=\"balanced\")},\n", " \"decision_tree\": {\n", - " \"model\": tree.DecisionTreeClassifier(max_depth=7, random_state=random_state)\n", + " \"model\": tree.DecisionTreeClassifier(max_depth=5, min_samples_split=10, random_state=random_state)\n", " },\n", + "\n", " \"knn\": {\"model\": neighbors.KNeighborsClassifier(n_neighbors=7)},\n", " \"naive_bayes\": {\"model\": naive_bayes.GaussianNB()},\n", " \"gradient_boosting\": {\n", " \"model\": ensemble.GradientBoostingClassifier(n_estimators=210)\n", " },\n", + "\n", " \"random_forest\": {\n", " \"model\": 
ensemble.RandomForestClassifier(\n", - " max_depth=11, class_weight=\"balanced\", random_state=random_state\n", + " max_depth=5, class_weight=\"balanced\", random_state=random_state\n", " )\n", " },\n", + "\n", " \"mlp\": {\n", " \"model\": neural_network.MLPClassifier(\n", " hidden_layer_sizes=(7,),\n", - " max_iter=500,\n", + " max_iter=200,\n", " early_stopping=True,\n", " random_state=random_state,\n", " )\n", @@ -2878,44 +2324,21 @@ }, { "cell_type": "code", - "execution_count": 343, + "execution_count": 163, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Model: logistic\n" - ] - }, - { - "ename": "ValueError", - "evalue": "Specifying the columns using strings is only supported for dataframes.", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", - "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\utils\\_indexing.py:338\u001b[0m, in \u001b[0;36m_get_column_indices\u001b[1;34m(X, key)\u001b[0m\n\u001b[0;32m 337\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 338\u001b[0m all_columns \u001b[38;5;241m=\u001b[39m \u001b[43mX\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\n\u001b[0;32m 339\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m:\n", - "\u001b[1;31mAttributeError\u001b[0m: 'numpy.ndarray' object has no attribute 'columns'", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[343], line 9\u001b[0m\n\u001b[0;32m 6\u001b[0m model \u001b[38;5;241m=\u001b[39m class_models[model_name][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m 8\u001b[0m 
model_pipeline \u001b[38;5;241m=\u001b[39m Pipeline([(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpipeline\u001b[39m\u001b[38;5;124m\"\u001b[39m, pipeline_end), (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m\"\u001b[39m, model)])\n\u001b[1;32m----> 9\u001b[0m model_pipeline \u001b[38;5;241m=\u001b[39m \u001b[43mmodel_pipeline\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mravel\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 11\u001b[0m y_train_predict \u001b[38;5;241m=\u001b[39m model_pipeline\u001b[38;5;241m.\u001b[39mpredict(X_train)\n\u001b[0;32m 12\u001b[0m y_test_probs \u001b[38;5;241m=\u001b[39m model_pipeline\u001b[38;5;241m.\u001b[39mpredict_proba(X_test)[:, \u001b[38;5;241m1\u001b[39m]\n", - "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context..decorator..wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\pipeline.py:469\u001b[0m, in \u001b[0;36mPipeline.fit\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 426\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Fit the model.\u001b[39;00m\n\u001b[0;32m 427\u001b[0m \n\u001b[0;32m 428\u001b[0m \u001b[38;5;124;03mFit all the transformers one after the other and sequentially transform the\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 466\u001b[0m \u001b[38;5;124;03m Pipeline with fitted steps.\u001b[39;00m\n\u001b[0;32m 467\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 468\u001b[0m routed_params \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_method_params(method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit\u001b[39m\u001b[38;5;124m\"\u001b[39m, props\u001b[38;5;241m=\u001b[39mparams)\n\u001b[1;32m--> 469\u001b[0m Xt \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrouted_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 470\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipeline\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_log_message(\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m)):\n\u001b[0;32m 471\u001b[0m 
\u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_final_estimator \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpassthrough\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", - "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\pipeline.py:406\u001b[0m, in \u001b[0;36mPipeline._fit\u001b[1;34m(self, X, y, routed_params)\u001b[0m\n\u001b[0;32m 404\u001b[0m cloned_transformer \u001b[38;5;241m=\u001b[39m clone(transformer)\n\u001b[0;32m 405\u001b[0m \u001b[38;5;66;03m# Fit or load from cache the current transformer\u001b[39;00m\n\u001b[1;32m--> 406\u001b[0m X, fitted_transformer \u001b[38;5;241m=\u001b[39m \u001b[43mfit_transform_one_cached\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 407\u001b[0m \u001b[43m \u001b[49m\u001b[43mcloned_transformer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 408\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 409\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 410\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 411\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage_clsname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPipeline\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 412\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_log_message\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstep_idx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 413\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrouted_params\u001b[49m\u001b[43m[\u001b[49m\u001b[43mname\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 
414\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 415\u001b[0m \u001b[38;5;66;03m# Replace the transformer of the step with the fitted\u001b[39;00m\n\u001b[0;32m 416\u001b[0m \u001b[38;5;66;03m# transformer. This is necessary when loading the transformer\u001b[39;00m\n\u001b[0;32m 417\u001b[0m \u001b[38;5;66;03m# from the cache.\u001b[39;00m\n\u001b[0;32m 418\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps[step_idx] \u001b[38;5;241m=\u001b[39m (name, fitted_transformer)\n", - "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\joblib\\memory.py:312\u001b[0m, in \u001b[0;36mNotMemorizedFunc.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 311\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 312\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\pipeline.py:1310\u001b[0m, in \u001b[0;36m_fit_transform_one\u001b[1;34m(transformer, X, y, weight, message_clsname, message, params)\u001b[0m\n\u001b[0;32m 1308\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(message_clsname, message):\n\u001b[0;32m 1309\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(transformer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m-> 1310\u001b[0m res 
\u001b[38;5;241m=\u001b[39m \u001b[43mtransformer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfit_transform\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1311\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1312\u001b[0m res \u001b[38;5;241m=\u001b[39m transformer\u001b[38;5;241m.\u001b[39mfit(X, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit\u001b[39m\u001b[38;5;124m\"\u001b[39m, {}))\u001b[38;5;241m.\u001b[39mtransform(\n\u001b[0;32m 1313\u001b[0m X, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtransform\u001b[39m\u001b[38;5;124m\"\u001b[39m, {})\n\u001b[0;32m 1314\u001b[0m )\n", - "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context..decorator..wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m 
):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\pipeline.py:533\u001b[0m, in \u001b[0;36mPipeline.fit_transform\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 490\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Fit the model and transform with the final estimator.\u001b[39;00m\n\u001b[0;32m 491\u001b[0m \n\u001b[0;32m 492\u001b[0m \u001b[38;5;124;03mFit all the transformers one after the other and sequentially transform\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 530\u001b[0m \u001b[38;5;124;03m Transformed samples.\u001b[39;00m\n\u001b[0;32m 531\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 532\u001b[0m routed_params \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_method_params(method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m, props\u001b[38;5;241m=\u001b[39mparams)\n\u001b[1;32m--> 533\u001b[0m Xt \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrouted_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 535\u001b[0m last_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_final_estimator\n\u001b[0;32m 536\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m 
_print_elapsed_time(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipeline\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_log_message(\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m)):\n", - "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\pipeline.py:406\u001b[0m, in \u001b[0;36mPipeline._fit\u001b[1;34m(self, X, y, routed_params)\u001b[0m\n\u001b[0;32m 404\u001b[0m cloned_transformer \u001b[38;5;241m=\u001b[39m clone(transformer)\n\u001b[0;32m 405\u001b[0m \u001b[38;5;66;03m# Fit or load from cache the current transformer\u001b[39;00m\n\u001b[1;32m--> 406\u001b[0m X, fitted_transformer \u001b[38;5;241m=\u001b[39m \u001b[43mfit_transform_one_cached\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 407\u001b[0m \u001b[43m \u001b[49m\u001b[43mcloned_transformer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 408\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 409\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 410\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 411\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage_clsname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPipeline\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 412\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_log_message\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstep_idx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 413\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrouted_params\u001b[49m\u001b[43m[\u001b[49m\u001b[43mname\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 414\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 415\u001b[0m \u001b[38;5;66;03m# Replace the transformer of the step with the fitted\u001b[39;00m\n\u001b[0;32m 416\u001b[0m \u001b[38;5;66;03m# transformer. This is necessary when loading the transformer\u001b[39;00m\n\u001b[0;32m 417\u001b[0m \u001b[38;5;66;03m# from the cache.\u001b[39;00m\n\u001b[0;32m 418\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps[step_idx] \u001b[38;5;241m=\u001b[39m (name, fitted_transformer)\n", - "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\joblib\\memory.py:312\u001b[0m, in \u001b[0;36mNotMemorizedFunc.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 311\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 312\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\pipeline.py:1310\u001b[0m, in \u001b[0;36m_fit_transform_one\u001b[1;34m(transformer, X, y, weight, message_clsname, message, params)\u001b[0m\n\u001b[0;32m 1308\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(message_clsname, message):\n\u001b[0;32m 1309\u001b[0m 
\u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(transformer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m-> 1310\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mtransformer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfit_transform\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1311\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1312\u001b[0m res \u001b[38;5;241m=\u001b[39m transformer\u001b[38;5;241m.\u001b[39mfit(X, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit\u001b[39m\u001b[38;5;124m\"\u001b[39m, {}))\u001b[38;5;241m.\u001b[39mtransform(\n\u001b[0;32m 1313\u001b[0m X, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtransform\u001b[39m\u001b[38;5;124m\"\u001b[39m, {})\n\u001b[0;32m 1314\u001b[0m )\n", - "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\utils\\_set_output.py:316\u001b[0m, in \u001b[0;36m_wrap_method_output..wrapped\u001b[1;34m(self, X, *args, **kwargs)\u001b[0m\n\u001b[0;32m 314\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(f)\n\u001b[0;32m 315\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrapped\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, 
\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 316\u001b[0m data_to_wrap \u001b[38;5;241m=\u001b[39m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_to_wrap, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[0;32m 318\u001b[0m \u001b[38;5;66;03m# only wrap the first output for cross decomposition\u001b[39;00m\n\u001b[0;32m 319\u001b[0m return_tuple \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 320\u001b[0m _wrap_data_with_container(method, data_to_wrap[\u001b[38;5;241m0\u001b[39m], X, \u001b[38;5;28mself\u001b[39m),\n\u001b[0;32m 321\u001b[0m \u001b[38;5;241m*\u001b[39mdata_to_wrap[\u001b[38;5;241m1\u001b[39m:],\n\u001b[0;32m 322\u001b[0m )\n", - "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context..decorator..wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py:968\u001b[0m, in \u001b[0;36mColumnTransformer.fit_transform\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 965\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_transformers()\n\u001b[0;32m 966\u001b[0m n_samples \u001b[38;5;241m=\u001b[39m _num_samples(X)\n\u001b[1;32m--> 968\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_column_callables\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 969\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_remainder(X)\n\u001b[0;32m 971\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _routing_enabled():\n", - "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py:536\u001b[0m, in \u001b[0;36mColumnTransformer._validate_column_callables\u001b[1;34m(self, X)\u001b[0m\n\u001b[0;32m 534\u001b[0m columns \u001b[38;5;241m=\u001b[39m columns(X)\n\u001b[0;32m 535\u001b[0m all_columns\u001b[38;5;241m.\u001b[39mappend(columns)\n\u001b[1;32m--> 536\u001b[0m transformer_to_input_indices[name] \u001b[38;5;241m=\u001b[39m \u001b[43m_get_column_indices\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 538\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_columns \u001b[38;5;241m=\u001b[39m all_columns\n\u001b[0;32m 539\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_transformer_to_input_indices 
\u001b[38;5;241m=\u001b[39m transformer_to_input_indices\n", - "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\utils\\_indexing.py:340\u001b[0m, in \u001b[0;36m_get_column_indices\u001b[1;34m(X, key)\u001b[0m\n\u001b[0;32m 338\u001b[0m all_columns \u001b[38;5;241m=\u001b[39m X\u001b[38;5;241m.\u001b[39mcolumns\n\u001b[0;32m 339\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m:\n\u001b[1;32m--> 340\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 341\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSpecifying the columns using strings is only supported for dataframes.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 342\u001b[0m )\n\u001b[0;32m 343\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m 344\u001b[0m columns \u001b[38;5;241m=\u001b[39m [key]\n", - "\u001b[1;31mValueError\u001b[0m: Specifying the columns using strings is only supported for dataframes." 
+ "Model: logistic\n", + "Model: ridge\n", + "Model: decision_tree\n", + "Model: knn\n", + "Model: naive_bayes\n", + "Model: gradient_boosting\n", + "Model: random_forest\n", + "Model: mlp\n" ] } ], @@ -2928,7 +2351,7 @@ " model = class_models[model_name][\"model\"]\n", "\n", " model_pipeline = Pipeline([(\"pipeline\", pipeline_end), (\"model\", model)])\n", - " model_pipeline = model_pipeline.fit(X_train.values, y_train.values.ravel())\n", + " model_pipeline = model_pipeline.fit(X_train, y_train.values.ravel())\n", "\n", " y_train_predict = model_pipeline.predict(X_train)\n", " y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]\n", @@ -2939,10 +2362,10 @@ " class_models[model_name][\"preds\"] = y_test_predict\n", "\n", " class_models[model_name][\"Precision_train\"] = metrics.precision_score(\n", - " y_train, y_train_predict\n", + " y_train, y_train_predict, zero_division=1\n", " )\n", " class_models[model_name][\"Precision_test\"] = metrics.precision_score(\n", - " y_test, y_test_predict\n", + " y_test, y_test_predict, zero_division=1\n", " )\n", " class_models[model_name][\"Recall_train\"] = metrics.recall_score(\n", " y_train, y_train_predict\n", @@ -2971,6 +2394,2943 @@ " y_test, y_test_predict\n", " )" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Сводная таблица оценок качества для использованных моделей классификации. Матрица неточностей" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.metrics import ConfusionMatrixDisplay\n", + "import matplotlib.pyplot as plt\n", + "\n", + "_, ax = plt.subplots(int(len(class_models) / 2), 2, figsize=(12, 10), sharex=False, sharey=False)\n", + "for index, key in enumerate(class_models.keys()):\n", + " c_matrix = class_models[key][\"Confusion_matrix\"]\n", + " disp = ConfusionMatrixDisplay(\n", + " confusion_matrix=c_matrix, display_labels=[\"Less\", \"More\"]\n", + " ).plot(ax=ax.flat[index])\n", + " disp.ax_.set_title(key)\n", + "\n", + "plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "На данных графиках, левый нижний квадрат обозначает, кол-во правильно классифицированных значениях, относимых к классу \"Less\", чем больше число в этом квадрате, тем лучше модель может классифицировать этот класс. Нижний левый квадрат отвечает за кол-во правильно классифицированных значениях \"More\". 
Здесь так же как и в левом верхнем, чем выше значение, тем лучше.\n", + "\n", + "### Точность, полнота, верность (аккуратность), F-мера" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 Precision_trainPrecision_testRecall_trainRecall_testAccuracy_trainAccuracy_testF1_trainF1_test
logistic1.0000001.0000000.9997671.0000000.9998841.0000000.9998841.000000
ridge1.0000001.0000000.9996511.0000000.9998261.0000000.9998261.000000
decision_tree1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
gradient_boosting1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
random_forest1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
naive_bayes1.0000001.0000000.7867190.7939530.8939270.8975250.8806300.885144
knn0.8724860.8274730.8577740.8209300.8669170.8258150.8650680.824189
mlp0.6875000.6153850.0025580.0037210.5033550.5033540.0050980.007397
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 165, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n", + " [\n", + " \"Precision_train\",\n", + " \"Precision_test\",\n", + " \"Recall_train\",\n", + " \"Recall_test\",\n", + " \"Accuracy_train\",\n", + " \"Accuracy_test\",\n", + " \"F1_train\",\n", + " \"F1_test\",\n", + " ]\n", + "]\n", + "class_metrics.sort_values(\n", + " by=\"Accuracy_test\", ascending=False\n", + ").style.background_gradient(\n", + " cmap=\"plasma\",\n", + " low=0.3,\n", + " high=1,\n", + " subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n", + ").background_gradient(\n", + " cmap=\"viridis\",\n", + " low=1,\n", + " high=0.3,\n", + " subset=[\n", + " \"Precision_train\",\n", + " \"Precision_test\",\n", + " \"Recall_train\",\n", + " \"Recall_test\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "100% точности у модели может свидетельствовать о ее переобучении, то есть модели обучилась классифицировать значения только для обучающей выборки, но на тестовой выборке результаты будут плохими.\n", + "\n", + "### ROC-кривая, каппа Коэна, коэффициент корреляции Мэтьюса" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 Accuracy_testF1_testROC_AUC_testCohen_kappa_testMCC_test
logistic1.0000001.0000001.0000001.0000001.000000
ridge1.0000001.0000001.0000001.0000001.000000
decision_tree1.0000001.0000001.0000001.0000001.000000
gradient_boosting1.0000001.0000001.0000001.0000001.000000
random_forest1.0000001.0000001.0000001.0000001.000000
naive_bayes0.8975250.8851440.9995660.7948200.812098
knn0.8258150.8241890.9108230.6516060.651627
mlp0.5033540.0073970.4970710.0014270.012966
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 166, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n", + " [\n", + " \"Accuracy_test\",\n", + " \"F1_test\",\n", + " \"ROC_AUC_test\",\n", + " \"Cohen_kappa_test\",\n", + " \"MCC_test\",\n", + " ]\n", + "]\n", + "class_metrics.sort_values(by=\"ROC_AUC_test\", ascending=False).style.background_gradient(\n", + " cmap=\"plasma\",\n", + " low=0.3,\n", + " high=1,\n", + " subset=[\n", + " \"ROC_AUC_test\",\n", + " \"MCC_test\",\n", + " \"Cohen_kappa_test\",\n", + " ],\n", + ").background_gradient(\n", + " cmap=\"viridis\",\n", + " low=1,\n", + " high=0.3,\n", + " subset=[\n", + " \"Accuracy_test\",\n", + " \"F1_test\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Вывод лучшей модели" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'logistic'" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "best_model = str(class_metrics.sort_values(by=\"MCC_test\", ascending=False).iloc[0].name)\n", + "\n", + "display(best_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Вывод данных с ошибкой предсказания для оценки" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Error items count: 0'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idPredicteddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfront...sqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15price_categorymedian_price
\n", + "

0 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [id, Predicted, date, price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterfront, view, condition, grade, sqft_above, sqft_basement, yr_built, yr_renovated, zipcode, lat, long, sqft_living15, sqft_lot15, price_category, median_price]\n", + "Index: []\n", + "\n", + "[0 rows x 24 columns]" + ] + }, + "execution_count": 168, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preprocessing_result = pipeline_end.transform(X_test)\n", + "preprocessed_df = pd.DataFrame(\n", + " preprocessing_result,\n", + " columns=pipeline_end.get_feature_names_out(),\n", + ")\n", + "\n", + "y_pred = class_models[best_model][\"preds\"]\n", + "\n", + "error_index = y_test[y_test[\"median_price\"] != y_pred].index.tolist()\n", + "display(f\"Error items count: {len(error_index)}\")\n", + "\n", + "error_predicted = pd.Series(y_pred, index=y_test.index).loc[error_index]\n", + "error_df = X_test.loc[error_index].copy()\n", + "error_df.insert(loc=1, column=\"Predicted\", value=error_predicted)\n", + "error_df.sort_index()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Пример использования обученной модели (конвейера) для предсказания" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...sqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15price_categorymedian_price
11592202870100020140529T000000635200.041.75164042401.000...720192109811747.6766-122.3681300424011
\n", + "

1 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living \\\n", + "11592 2028701000 20140529T000000 635200.0 4 1.75 1640 \n", + "\n", + " sqft_lot floors waterfront view ... sqft_basement yr_built \\\n", + "11592 4240 1.0 0 0 ... 720 1921 \n", + "\n", + " yr_renovated zipcode lat long sqft_living15 sqft_lot15 \\\n", + "11592 0 98117 47.6766 -122.368 1300 4240 \n", + "\n", + " price_category median_price \n", + "11592 1 1 \n", + "\n", + "[1 rows x 23 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sqft_livingsqft_lotmedian_priceidpricebedroomsbathroomsfloorswaterfrontview...sqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15price_categoryLiving_area_to_Lot_ratio
11592-0.477812-0.2692261.0053352.028701e+09635200.04.01.751.00.00.0...720.01921.00.098117.047.6766-122.3681300.04240.01.01.774763
\n", + "

1 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " sqft_living sqft_lot median_price id price bedrooms \\\n", + "11592 -0.477812 -0.269226 1.005335 2.028701e+09 635200.0 4.0 \n", + "\n", + " bathrooms floors waterfront view ... sqft_basement yr_built \\\n", + "11592 1.75 1.0 0.0 0.0 ... 720.0 1921.0 \n", + "\n", + " yr_renovated zipcode lat long sqft_living15 sqft_lot15 \\\n", + "11592 0.0 98117.0 47.6766 -122.368 1300.0 4240.0 \n", + "\n", + " price_category Living_area_to_Lot_ratio \n", + "11592 1.0 1.774763 \n", + "\n", + "[1 rows x 23 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'predicted: 1 (proba: [0. 1.])'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'real: 1'" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "model = class_models[best_model][\"pipeline\"]\n", + "\n", + "example_id = 11592\n", + "test = pd.DataFrame(X_test.loc[example_id, :]).T\n", + "test_preprocessed = pd.DataFrame(preprocessed_df.loc[example_id, :]).T\n", + "display(test)\n", + "display(test_preprocessed)\n", + "result_proba = model.predict_proba(test)[0]\n", + "result = model.predict(test)[0]\n", + "real = int(y_test.loc[example_id].values[0])\n", + "display(f\"predicted: {result} (proba: {result_proba})\")\n", + "display(f\"real: {real}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Подбор гиперпараметров методом поиска по сетке" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'model__criterion': 'gini',\n", + " 'model__max_depth': 5,\n", + " 'model__max_features': 'sqrt',\n", + " 'model__n_estimators': 10}" + ] + }, + "execution_count": 170, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "optimized_model_type = \"random_forest\"\n", + "\n", 
+ "random_forest_model = class_models[optimized_model_type][\"pipeline\"]\n", + "\n", + "param_grid = {\n", + " \"model__n_estimators\": [10, 50, 100],\n", + " \"model__max_features\": [\"sqrt\", \"log2\"],\n", + " \"model__max_depth\": [5, 7, 10],\n", + " \"model__criterion\": [\"gini\", \"entropy\"],\n", + "}\n", + "\n", + "gs_optomizer = GridSearchCV(\n", + " estimator=random_forest_model, param_grid=param_grid, n_jobs=-1\n", + ")\n", + "gs_optomizer.fit(X_train, y_train.values.ravel())\n", + "gs_optomizer.best_params_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Обучение модели с новыми гиперпараметрами" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "metadata": {}, + "outputs": [], + "source": [ + "optimized_model = ensemble.RandomForestClassifier(\n", + " random_state=random_state,\n", + " criterion=\"gini\",\n", + " max_depth=5,\n", + " max_features=\"sqrt\",\n", + " n_estimators=10,\n", + ")\n", + "\n", + "result = {}\n", + "\n", + "result[\"pipeline\"] = Pipeline([(\"pipeline\", pipeline_end), (\"model\", optimized_model)]).fit(X_train, y_train.values.ravel())\n", + "result[\"train_preds\"] = result[\"pipeline\"].predict(X_train)\n", + "result[\"probs\"] = result[\"pipeline\"].predict_proba(X_test)[:, 1]\n", + "result[\"preds\"] = np.where(result[\"probs\"] > 0.5, 1, 0)\n", + "\n", + "result[\"Precision_train\"] = metrics.precision_score(y_train, result[\"train_preds\"])\n", + "result[\"Precision_test\"] = metrics.precision_score(y_test, result[\"preds\"])\n", + "result[\"Recall_train\"] = metrics.recall_score(y_train, result[\"train_preds\"])\n", + "result[\"Recall_test\"] = metrics.recall_score(y_test, result[\"preds\"])\n", + "result[\"Accuracy_train\"] = metrics.accuracy_score(y_train, result[\"train_preds\"])\n", + "result[\"Accuracy_test\"] = metrics.accuracy_score(y_test, result[\"preds\"])\n", + "result[\"ROC_AUC_test\"] = metrics.roc_auc_score(y_test, result[\"probs\"])\n", + 
"result[\"F1_train\"] = metrics.f1_score(y_train, result[\"train_preds\"])\n", + "result[\"F1_test\"] = metrics.f1_score(y_test, result[\"preds\"])\n", + "result[\"MCC_test\"] = metrics.matthews_corrcoef(y_test, result[\"preds\"])\n", + "result[\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(y_test, result[\"preds\"])\n", + "result[\"Confusion_matrix\"] = metrics.confusion_matrix(y_test, result[\"preds\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Формирование данных для оценки старой и новой версии модели" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "metadata": {}, + "outputs": [], + "source": [ + "optimized_metrics = pd.DataFrame(columns=list(result.keys()))\n", + "optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n", + " data=class_models[optimized_model_type]\n", + ")\n", + "optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n", + " data=result\n", + ")\n", + "optimized_metrics.insert(loc=0, column=\"Name\", value=[\"Old\", \"New\"])\n", + "optimized_metrics = optimized_metrics.set_index(\"Name\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Оценка параметров старой и новой модели" + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 Precision_trainPrecision_testRecall_trainRecall_testAccuracy_trainAccuracy_testF1_trainF1_test
Name        
Old1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
New1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 173, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "optimized_metrics[\n", + " [\n", + " \"Precision_train\",\n", + " \"Precision_test\",\n", + " \"Recall_train\",\n", + " \"Recall_test\",\n", + " \"Accuracy_train\",\n", + " \"Accuracy_test\",\n", + " \"F1_train\",\n", + " \"F1_test\",\n", + " ]\n", + "].style.background_gradient(\n", + " cmap=\"plasma\",\n", + " low=0.3,\n", + " high=1,\n", + " subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n", + ").background_gradient(\n", + " cmap=\"viridis\",\n", + " low=1,\n", + " high=0.3,\n", + " subset=[\n", + " \"Precision_train\",\n", + " \"Precision_test\",\n", + " \"Recall_train\",\n", + " \"Recall_test\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Значения 1 в кажой ячейке обосзначают, что модели очень точно классифицируют положительные образцы, не пропуская их." + ] + }, + { + "cell_type": "code", + "execution_count": 174, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 Accuracy_testF1_testROC_AUC_testCohen_kappa_testMCC_test
Name     
Old1.0000001.0000001.0000001.0000001.000000
New1.0000001.0000001.0000001.0000001.000000
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 174, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "optimized_metrics[\n", + " [\n", + " \"Accuracy_test\",\n", + " \"F1_test\",\n", + " \"ROC_AUC_test\",\n", + " \"Cohen_kappa_test\",\n", + " \"MCC_test\",\n", + " ]\n", + "].style.background_gradient(\n", + " cmap=\"plasma\",\n", + " low=0.3,\n", + " high=1,\n", + " subset=[\n", + " \"ROC_AUC_test\",\n", + " \"MCC_test\",\n", + " \"Cohen_kappa_test\",\n", + " ],\n", + ").background_gradient(\n", + " cmap=\"viridis\",\n", + " low=1,\n", + " high=0.3,\n", + " subset=[\n", + " \"Accuracy_test\",\n", + " \"F1_test\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Значения 1 в каждой ячейке означают, что модели точно классифицировали все тестовые примеры, не допустив никаких ошибок в предсказаниях." + ] + }, + { + "cell_type": "code", + "execution_count": 175, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "_, ax = plt.subplots(1, 2, figsize=(10, 4), sharex=False, sharey=False\n", + ")\n", + "\n", + "for index in range(0, len(optimized_metrics)):\n", + " c_matrix = optimized_metrics.iloc[index][\"Confusion_matrix\"]\n", + " disp = ConfusionMatrixDisplay(\n", + " confusion_matrix=c_matrix, display_labels=[\"Less\", \"More\"]\n", + " ).plot(ax=ax.flat[index])\n", + "\n", + "plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.3)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Задача регресии: предсказание цены дома (price)." + ] + }, + { + "cell_type": "code", + "execution_count": 176, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Среднее значение поля: 2079.8997362698374\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...yr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15price_categorymedian_priceaverage_price
0712930052020141013T000000221900.031.00118056501.000...195509817847.5112-122.25713405650000
1641410019220141209T000000538000.032.25257072422.000...195119919812547.7210-122.31916907639111
2563150040020150225T000000180000.021.00770100001.000...193309802847.7379-122.23327208062000
3248720087520141209T000000604000.043.00196050001.000...196509813647.5208-122.39313605000110
4195440051020150218T000000510000.032.00168080801.000...198709807447.6168-122.04518007503110
\n", + "

5 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living \\\n", + "0 7129300520 20141013T000000 221900.0 3 1.00 1180 \n", + "1 6414100192 20141209T000000 538000.0 3 2.25 2570 \n", + "2 5631500400 20150225T000000 180000.0 2 1.00 770 \n", + "3 2487200875 20141209T000000 604000.0 4 3.00 1960 \n", + "4 1954400510 20150218T000000 510000.0 3 2.00 1680 \n", + "\n", + " sqft_lot floors waterfront view ... yr_built yr_renovated zipcode \\\n", + "0 5650 1.0 0 0 ... 1955 0 98178 \n", + "1 7242 2.0 0 0 ... 1951 1991 98125 \n", + "2 10000 1.0 0 0 ... 1933 0 98028 \n", + "3 5000 1.0 0 0 ... 1965 0 98136 \n", + "4 8080 1.0 0 0 ... 1987 0 98074 \n", + "\n", + " lat long sqft_living15 sqft_lot15 price_category median_price \\\n", + "0 47.5112 -122.257 1340 5650 0 0 \n", + "1 47.7210 -122.319 1690 7639 1 1 \n", + "2 47.7379 -122.233 2720 8062 0 0 \n", + "3 47.5208 -122.393 1360 5000 1 1 \n", + "4 47.6168 -122.045 1800 7503 1 1 \n", + "\n", + " average_price \n", + "0 0 \n", + "1 1 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + "[5 rows x 24 columns]" + ] + }, + "execution_count": 176, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Вычисление среднего значения поля \"sqft_living\"\n", + "average_price = df['sqft_living'].mean()\n", + "print(f\"Среднее значение поля: {average_price}\")\n", + "\n", + "# Создание новой колонки, указывающей, выше или ниже среднего значение цена закрытия\n", + "df['average_price'] = (df['sqft_living'] > average_price).astype(int)\n", + "\n", + "df.dropna(inplace=True)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Делим DF на выборки" + ] + }, + { + "cell_type": "code", + "execution_count": 179, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'X_train'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...sqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15price_categorymedian_price
6325546791019020140527T000000325000.031.751780130951.000...0198309804247.3670-122.15227501309510
13473933180058020150310T000000257000.021.00100037001.000...200192909811847.5520-122.2901270500000
17614240700040520150226T000000228500.031.00108074861.500...90194209814647.4838-122.3351170780000
16970546670029020150108T000000288000.032.25209075001.000...810197709803147.3951-122.1721800735000
20868302605936120150417T000000479000.022.50174114392.000...295200709803447.7043-122.20920901045411
..................................................................
11964527220004520141113T000000378000.031.50100069141.000...0194709812547.7144-122.3191000694710
21575957850079020141111T000000399950.032.50308750022.000...0201409802347.2974-122.3492927518310
5390720235048020140930T000000575000.032.50212047802.000...0200409805347.6810-122.0321690265011
860172304903320140620T000000245000.010.75380150001.000...0196309816847.4810-122.32311701500000
15795614765028020150325T000000315000.042.50313059992.000...0200609804247.3837-122.0993020599710
\n", + "

17290 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms \\\n", + "6325 5467910190 20140527T000000 325000.0 3 1.75 \n", + "13473 9331800580 20150310T000000 257000.0 2 1.00 \n", + "17614 2407000405 20150226T000000 228500.0 3 1.00 \n", + "16970 5466700290 20150108T000000 288000.0 3 2.25 \n", + "20868 3026059361 20150417T000000 479000.0 2 2.50 \n", + "... ... ... ... ... ... \n", + "11964 5272200045 20141113T000000 378000.0 3 1.50 \n", + "21575 9578500790 20141111T000000 399950.0 3 2.50 \n", + "5390 7202350480 20140930T000000 575000.0 3 2.50 \n", + "860 1723049033 20140620T000000 245000.0 1 0.75 \n", + "15795 6147650280 20150325T000000 315000.0 4 2.50 \n", + "\n", + " sqft_living sqft_lot floors waterfront view ... sqft_basement \\\n", + "6325 1780 13095 1.0 0 0 ... 0 \n", + "13473 1000 3700 1.0 0 0 ... 200 \n", + "17614 1080 7486 1.5 0 0 ... 90 \n", + "16970 2090 7500 1.0 0 0 ... 810 \n", + "20868 1741 1439 2.0 0 0 ... 295 \n", + "... ... ... ... ... ... ... ... \n", + "11964 1000 6914 1.0 0 0 ... 0 \n", + "21575 3087 5002 2.0 0 0 ... 0 \n", + "5390 2120 4780 2.0 0 0 ... 0 \n", + "860 380 15000 1.0 0 0 ... 0 \n", + "15795 3130 5999 2.0 0 0 ... 0 \n", + "\n", + " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", + "6325 1983 0 98042 47.3670 -122.152 2750 \n", + "13473 1929 0 98118 47.5520 -122.290 1270 \n", + "17614 1942 0 98146 47.4838 -122.335 1170 \n", + "16970 1977 0 98031 47.3951 -122.172 1800 \n", + "20868 2007 0 98034 47.7043 -122.209 2090 \n", + "... ... ... ... ... ... ... \n", + "11964 1947 0 98125 47.7144 -122.319 1000 \n", + "21575 2014 0 98023 47.2974 -122.349 2927 \n", + "5390 2004 0 98053 47.6810 -122.032 1690 \n", + "860 1963 0 98168 47.4810 -122.323 1170 \n", + "15795 2006 0 98042 47.3837 -122.099 3020 \n", + "\n", + " sqft_lot15 price_category median_price \n", + "6325 13095 1 0 \n", + "13473 5000 0 0 \n", + "17614 7800 0 0 \n", + "16970 7350 0 0 \n", + "20868 10454 1 1 \n", + "... ... ... ... 
\n", + "11964 6947 1 0 \n", + "21575 5183 1 0 \n", + "5390 2650 1 1 \n", + "860 15000 0 0 \n", + "15795 5997 1 0 \n", + "\n", + "[17290 rows x 23 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'y_train'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
average_price
63250
134730
176140
169701
208680
......
119640
215751
53901
8600
157951
\n", + "

17290 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " average_price\n", + "6325 0\n", + "13473 0\n", + "17614 0\n", + "16970 1\n", + "20868 0\n", + "... ...\n", + "11964 0\n", + "21575 1\n", + "5390 1\n", + "860 0\n", + "15795 1\n", + "\n", + "[17290 rows x 1 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'X_test'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...sqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15price_categorymedian_price
735259182031020141006T000000365000.042.25207088932.000...0198609805847.4388-122.1622390770010
2830797420082020140821T000000865000.053.00290067301.000...1070197709811547.6784-122.2852370628321
4106770145011020140815T0000001038000.042.503770108932.002...0199709800647.5646-122.1293710968521
16218952230001020150331T0000001490000.033.504560146082.002...0199009803447.6995-122.22840501422621
19964951086114020140714T000000711000.032.50255053762.000...0200409805247.6647-122.0832250405021
..................................................................
13674616390033320141110T000000338000.031.75125077101.000...0194709815547.7623-122.3171340771010
20377352896002020140708T000000673000.032.75283034962.000...0201209802947.5606-122.0112160350111
8805168700022020141016T000000285000.042.50243444002.000...0200709800147.2874-122.2832434440000
10168414140003020141201T000000605000.041.752250101081.000...0196709800847.5922-122.1182050975011
2522182250016020141212T000000356500.042.502570114732.000...0200809800347.2809-122.2962430599710
\n", + "

4323 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms \\\n", + "735 2591820310 20141006T000000 365000.0 4 2.25 \n", + "2830 7974200820 20140821T000000 865000.0 5 3.00 \n", + "4106 7701450110 20140815T000000 1038000.0 4 2.50 \n", + "16218 9522300010 20150331T000000 1490000.0 3 3.50 \n", + "19964 9510861140 20140714T000000 711000.0 3 2.50 \n", + "... ... ... ... ... ... \n", + "13674 6163900333 20141110T000000 338000.0 3 1.75 \n", + "20377 3528960020 20140708T000000 673000.0 3 2.75 \n", + "8805 1687000220 20141016T000000 285000.0 4 2.50 \n", + "10168 4141400030 20141201T000000 605000.0 4 1.75 \n", + "2522 1822500160 20141212T000000 356500.0 4 2.50 \n", + "\n", + " sqft_living sqft_lot floors waterfront view ... sqft_basement \\\n", + "735 2070 8893 2.0 0 0 ... 0 \n", + "2830 2900 6730 1.0 0 0 ... 1070 \n", + "4106 3770 10893 2.0 0 2 ... 0 \n", + "16218 4560 14608 2.0 0 2 ... 0 \n", + "19964 2550 5376 2.0 0 0 ... 0 \n", + "... ... ... ... ... ... ... ... \n", + "13674 1250 7710 1.0 0 0 ... 0 \n", + "20377 2830 3496 2.0 0 0 ... 0 \n", + "8805 2434 4400 2.0 0 0 ... 0 \n", + "10168 2250 10108 1.0 0 0 ... 0 \n", + "2522 2570 11473 2.0 0 0 ... 0 \n", + "\n", + " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", + "735 1986 0 98058 47.4388 -122.162 2390 \n", + "2830 1977 0 98115 47.6784 -122.285 2370 \n", + "4106 1997 0 98006 47.5646 -122.129 3710 \n", + "16218 1990 0 98034 47.6995 -122.228 4050 \n", + "19964 2004 0 98052 47.6647 -122.083 2250 \n", + "... ... ... ... ... ... ... \n", + "13674 1947 0 98155 47.7623 -122.317 1340 \n", + "20377 2012 0 98029 47.5606 -122.011 2160 \n", + "8805 2007 0 98001 47.2874 -122.283 2434 \n", + "10168 1967 0 98008 47.5922 -122.118 2050 \n", + "2522 2008 0 98003 47.2809 -122.296 2430 \n", + "\n", + " sqft_lot15 price_category median_price \n", + "735 7700 1 0 \n", + "2830 6283 2 1 \n", + "4106 9685 2 1 \n", + "16218 14226 2 1 \n", + "19964 4050 2 1 \n", + "... ... ... ... 
\n", + "13674 7710 1 0 \n", + "20377 3501 1 1 \n", + "8805 4400 0 0 \n", + "10168 9750 1 1 \n", + "2522 5997 1 0 \n", + "\n", + "[4323 rows x 23 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'y_test'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
average_price
7350
28301
41061
162181
199641
......
136740
203771
88051
101681
25221
\n", + "

4323 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " average_price\n", + "735 0\n", + "2830 1\n", + "4106 1\n", + "16218 1\n", + "19964 1\n", + "... ...\n", + "13674 0\n", + "20377 1\n", + "8805 1\n", + "10168 1\n", + "2522 1\n", + "\n", + "[4323 rows x 1 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from typing import Tuple\n", + "from pandas import DataFrame\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "def split_into_train_test(\n", + " df_input: DataFrame,\n", + " target_colname: str = \"average_price\",\n", + " frac_train: float = 0.8,\n", + " random_state: int = None,\n", + ") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame]:\n", + " \n", + " if not (0 < frac_train < 1):\n", + " raise ValueError(\"Fraction must be between 0 and 1.\")\n", + " \n", + " # Проверка наличия целевого признака\n", + " if target_colname not in df_input.columns:\n", + " raise ValueError(f\"{target_colname} is not a column in the DataFrame.\")\n", + " \n", + " # Разделяем данные на признаки и целевую переменную\n", + " X = df_input.drop(columns=[target_colname]) # Признаки\n", + " y = df_input[[target_colname]] # Целевая переменная\n", + "\n", + " # Разделяем данные на обучающую и тестовую выборки\n", + " X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y,\n", + " test_size=(1.0 - frac_train),\n", + " random_state=random_state\n", + " )\n", + " \n", + " return X_train, X_test, y_train, y_test\n", + "\n", + "X_train, X_test, y_train, y_test = split_into_train_test(\n", + " df, \n", + " target_colname=\"average_price\", \n", + " frac_train=0.8, \n", + " random_state=42\n", + ")\n", + "\n", + "display(\"X_train\", X_train)\n", + "display(\"y_train\", y_train)\n", + "\n", + "display(\"X_test\", X_test)\n", + "display(\"y_test\", y_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Формирование конвейера для решения задачи регрессии" + ] + }, + { + "cell_type": "code", + 
"execution_count": 180, + "metadata": {}, + "outputs": [], + "source": [ + "class HouseFeatures(BaseEstimator, TransformerMixin):\n", + " def __init__(self):\n", + " pass\n", + " def fit(self, X, y=None):\n", + " return self\n", + " def transform(self, X, y=None):\n", + " # Создание новых признаков\n", + " X = X.copy()\n", + " X[\"Square\"] = X[\"sqft_living\"] / X[\"sqft_lot\"]\n", + " return X\n", + " def get_feature_names_out(self, features_in):\n", + " # Добавление имен новых признаков\n", + " new_features = [\"Square\"]\n", + " return np.append(features_in, new_features, axis=0)\n", + "\n", + "# Указываем столбцы, которые нужно удалить и обрабатывать\n", + "columns_to_drop = [\"date\"]\n", + "num_columns = [\"bathrooms\", \"floors\", \"waterfront\", \"view\"]\n", + "cat_columns = [] \n", + "\n", + "# Определяем предобработку для численных данных\n", + "num_imputer = SimpleImputer(strategy=\"median\")\n", + "num_scaler = StandardScaler()\n", + "preprocessing_num = Pipeline(\n", + " [\n", + " (\"imputer\", num_imputer),\n", + " (\"scaler\", num_scaler),\n", + " ]\n", + ")\n", + "\n", + "# Определяем предобработку для категориальных данных\n", + "cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n", + "cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n", + "preprocessing_cat = Pipeline(\n", + " [\n", + " (\"imputer\", cat_imputer),\n", + " (\"encoder\", cat_encoder),\n", + " ]\n", + ")\n", + "\n", + "# Подготовка признаков с использованием ColumnTransformer\n", + "features_preprocessing = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"preprocessing_num\", preprocessing_num, num_columns),\n", + " (\"preprocessing_cat\", preprocessing_cat, cat_columns),\n", + " ],\n", + " remainder=\"passthrough\"\n", + ")\n", + "\n", + "# Удаление нежелательных столбцов\n", + "drop_columns = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " 
transformers=[\n", + " (\"drop_columns\", \"drop\", columns_to_drop),\n", + " ],\n", + " remainder=\"passthrough\",\n", + ")\n", + "\n", + "# Постобработка признаков\n", + "features_postprocessing = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"preprocessing_cat\", preprocessing_cat, [\"price_category\"]), \n", + " ],\n", + " remainder=\"passthrough\",\n", + ")\n", + "\n", + "# Создание окончательного конвейера\n", + "pipeline = Pipeline(\n", + " [\n", + " (\"features_preprocessing\", features_preprocessing),\n", + " (\"drop_columns\", drop_columns),\n", + " (\"custom_features\", HouseFeatures()),\n", + " (\"model\", RandomForestRegressor()) # Выбор модели для обучения\n", + " ]\n", + ")\n", + "\n", + "# Использование конвейера\n", + "def train_pipeline(X, y):\n", + " pipeline.fit(X, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Определение перечня алгоритмов решения задачи аппроксимации (регрессии)" + ] + }, + { + "cell_type": "code", + "execution_count": 181, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.preprocessing import PolynomialFeatures\n", + "from sklearn import linear_model, tree, neighbors, ensemble, neural_network\n", + "\n", + "random_state = 9\n", + "\n", + "models = {\n", + " \"linear\": {\"model\": linear_model.LinearRegression(n_jobs=-1)},\n", + " \"linear_poly\": {\n", + " \"model\": make_pipeline(\n", + " PolynomialFeatures(degree=2),\n", + " linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),\n", + " )\n", + " },\n", + " \"linear_interact\": {\n", + " \"model\": make_pipeline(\n", + " PolynomialFeatures(interaction_only=True),\n", + " linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),\n", + " )\n", + " },\n", + " \"ridge\": {\"model\": linear_model.RidgeCV()},\n", + " \"decision_tree\": {\n", + " \"model\": tree.DecisionTreeRegressor(max_depth=7, 
random_state=random_state)\n", + " },\n", + " \"knn\": {\"model\": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},\n", + " \"random_forest\": {\n", + " \"model\": ensemble.RandomForestRegressor(\n", + " max_depth=7, random_state=random_state, n_jobs=-1\n", + " )\n", + " },\n", + " \"mlp\": {\n", + " \"model\": neural_network.MLPRegressor(\n", + " activation=\"tanh\",\n", + " hidden_layer_sizes=(3,),\n", + " max_iter=500,\n", + " early_stopping=True,\n", + " random_state=random_state,\n", + " )\n", + " },\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Обучение и оценка моделей с помощью различных алгоритмов" + ] + }, + { + "cell_type": "code", + "execution_count": 182, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: logistic\n", + "MSE (train): 0.24060150375939848\n", + "MSE (test): 0.23455933379597502\n", + "MAE (train): 0.24060150375939848\n", + "MAE (test): 0.23455933379597502\n", + "R2 (train): 0.015780807725750634\n", + "R2 (test): 0.045807954005714024\n", + "STD (train): 0.48387852043102103\n", + "STD (test): 0.4780359236045559\n", + "----------------------------------------\n", + "Model: ridge\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. 
of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MSE (train): 0.210989010989011\n", + "MSE (test): 0.2035623409669211\n", + "MAE (train): 0.210989010989011\n", + "MAE (test): 0.2035623409669211\n", + "R2 (train): 0.1369154775441198\n", + "R2 (test): 0.17190433878207922\n", + "STD (train): 0.45781332911823247\n", + "STD (test): 0.4499815316182845\n", + "----------------------------------------\n", + "Model: decision_tree\n", + "MSE (train): 0.0\n", + "MSE (test): 0.0\n", + "MAE (train): 0.0\n", + "MAE (test): 0.0\n", + "R2 (train): 1.0\n", + "R2 (test): 1.0\n", + "STD (train): 0.0\n", + "STD (test): 0.0\n", + "----------------------------------------\n", + "Model: knn\n", + "MSE (train): 0.1949681897050318\n", + "MSE (test): 0.27989821882951654\n", + "MAE (train): 0.1949681897050318\n", + "MAE (test): 0.27989821882951654\n", + "R2 (train): 0.20245122664507342\n", + "R2 (test): -0.13863153417464114\n", + "STD (train): 0.43948973967967464\n", + "STD (test): 0.5264647910268833\n", + "----------------------------------------\n", + "Model: naive_bayes\n", + "MSE (train): 0.26928860613071137\n", + "MSE (test): 0.2690261392551469\n", + "MAE (train): 0.26928860613071137\n", + "MAE (test): 0.2690261392551469\n", + "R2 (train): -0.10156840366079445\n", + "R2 (test): -0.09440369772322943\n", + "STD (train): 0.47316941542228536\n", + "STD (test): 0.47206502931490235\n", + "----------------------------------------\n", + "Model: gradient_boosting\n", + "MSE (train): 0.0\n", + "MSE (test): 0.0\n", + "MAE (train): 0.0\n", + "MAE (test): 0.0\n", + "R2 (train): 1.0\n", 
+ "R2 (test): 1.0\n", + "STD (train): 0.0\n", + "STD (test): 0.0\n", + "----------------------------------------\n", + "Model: random_forest\n", + "MSE (train): 0.0\n", + "MSE (test): 0.0\n", + "MAE (train): 0.0\n", + "MAE (test): 0.0\n", + "R2 (train): 1.0\n", + "R2 (test): 1.0\n", + "STD (train): 0.0\n", + "STD (test): 0.0\n", + "----------------------------------------\n", + "Model: mlp\n", + "MSE (train): 0.4253903990746096\n", + "MSE (test): 0.4353458246588018\n", + "MAE (train): 0.4253903990746096\n", + "MAE (test): 0.4353458246588018\n", + "R2 (train): -0.7401279228791116\n", + "R2 (test): -0.7709954936501442\n", + "STD (train): 0.4959884986820156\n", + "STD (test): 0.49782384226978177\n", + "----------------------------------------\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn import metrics\n", + "from sklearn.pipeline import Pipeline\n", + "\n", + "# Проверка наличия необходимых переменных\n", + "if 'class_models' not in locals():\n", + " raise ValueError(\"class_models is not defined\")\n", + "if 'X_train' not in locals() or 'X_test' not in locals() or 'y_train' not in locals() or 'y_test' not in locals():\n", + " raise ValueError(\"Train/test data is not defined\")\n", + "\n", + "\n", + "y_train = np.ravel(y_train) \n", + "y_test = np.ravel(y_test) \n", + "\n", + "# Инициализация списка для хранения результатов\n", + "results = []\n", + "\n", + "# Проход по моделям и оценка их качества\n", + "for model_name in class_models.keys():\n", + " print(f\"Model: {model_name}\")\n", + " \n", + " # Извлечение модели из словаря\n", + " model = class_models[model_name][\"model\"]\n", + " \n", + " # Создание пайплайна\n", + " model_pipeline = Pipeline([(\"pipeline\", pipeline_end), (\"model\", model)])\n", + " \n", + " # Обучение модели\n", + " model_pipeline.fit(X_train, y_train)\n", + "\n", + " # Предсказание для обучающей и тестовой выборки\n", + " y_train_predict = model_pipeline.predict(X_train)\n", + " y_test_predict = 
model_pipeline.predict(X_test)\n", + "\n", + " # Сохранение пайплайна и предсказаний\n", + " class_models[model_name][\"pipeline\"] = model_pipeline\n", + " class_models[model_name][\"preds\"] = y_test_predict\n", + "\n", + " # Вычисление метрик для регрессии\n", + " class_models[model_name][\"MSE_train\"] = metrics.mean_squared_error(y_train, y_train_predict)\n", + " class_models[model_name][\"MSE_test\"] = metrics.mean_squared_error(y_test, y_test_predict)\n", + " class_models[model_name][\"MAE_train\"] = metrics.mean_absolute_error(y_train, y_train_predict)\n", + " class_models[model_name][\"MAE_test\"] = metrics.mean_absolute_error(y_test, y_test_predict)\n", + " class_models[model_name][\"R2_train\"] = metrics.r2_score(y_train, y_train_predict)\n", + " class_models[model_name][\"R2_test\"] = metrics.r2_score(y_test, y_test_predict)\n", + "\n", + " # Дополнительные метрики\n", + " class_models[model_name][\"STD_train\"] = np.std(y_train - y_train_predict)\n", + " class_models[model_name][\"STD_test\"] = np.std(y_test - y_test_predict)\n", + "\n", + " # Вывод результатов для текущей модели\n", + " print(f\"MSE (train): {class_models[model_name]['MSE_train']}\")\n", + " print(f\"MSE (test): {class_models[model_name]['MSE_test']}\")\n", + " print(f\"MAE (train): {class_models[model_name]['MAE_train']}\")\n", + " print(f\"MAE (test): {class_models[model_name]['MAE_test']}\")\n", + " print(f\"R2 (train): {class_models[model_name]['R2_train']}\")\n", + " print(f\"R2 (test): {class_models[model_name]['R2_test']}\")\n", + " print(f\"STD (train): {class_models[model_name]['STD_train']}\")\n", + " print(f\"STD (test): {class_models[model_name]['STD_test']}\")\n", + " print(\"-\" * 40) # Разделитель для разных моделей" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Подбор гиперпараметров методом поиска по сетке" + ] + }, + { + "cell_type": "code", + "execution_count": 184, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "Fitting 5 folds for each of 36 candidates, totalling 180 fits\n", + "Best parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}\n", + "Best MSE: 0.1476816399382576\n" + ] + } + ], + "source": [ + "df['date'] = pd.to_datetime(df['date'], errors='coerce') # Coerce invalid dates to NaT\n", + "df.dropna(subset=['date'], inplace=True) # Drop rows with invalid dates\n", + "df['year'] = df['date'].dt.year\n", + "df['month'] = df['date'].dt.month\n", + "df['day'] = df['date'].dt.day\n", + "\n", + "X = df[['yr_built', 'year', 'month', 'day', 'price', 'price_category']]\n", + "y = df['average_price']\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "\n", + "model = RandomForestRegressor()\n", + "param_grid = {\n", + " 'n_estimators': [50, 100, 200],\n", + " 'max_depth': [None, 10, 20, 30],\n", + " 'min_samples_split': [2, 5, 10]\n", + "}\n", + "\n", + "grid_search = GridSearchCV(estimator=model, param_grid=param_grid,\n", + " scoring='neg_mean_squared_error', cv=5, n_jobs=-1, verbose=2)\n", + "\n", + "grid_search.fit(X_train, y_train)\n", + "\n", + "print(\"Best parameters:\", grid_search.best_params_)\n", + "print(\"Best MSE:\", -grid_search.best_score_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Обучение модели с новыми гиперпараметрами и сравнение новых и старых данных" + ] + }, + { + "cell_type": "code", + "execution_count": 186, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 5 folds for each of 36 candidates, totalling 180 fits\n", + "Старые параметры: {'max_depth': 10, 'min_samples_split': 15, 'n_estimators': 200}\n", + "Лучший результат (MSE) на старых параметрах: 0.1472657852824936\n", + "\n", + "Новые параметры: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}\n", + "Лучший результат (MSE) на новых параметрах: 0.14907357358498077\n", 
+ "Среднеквадратическая ошибка (MSE) на тестовых данных: 0.1443569152033931\n", + "Корень среднеквадратичной ошибки (RMSE) на тестовых данных: 0.37994330524881353\n" + ] + } + ], + "source": [ + "# 1. Настройка параметров для старых значений\n", + "old_param_grid = {\n", + " 'n_estimators': [50, 100, 200], # Количество деревьев\n", + " 'max_depth': [None, 10, 20, 30], # Максимальная глубина дерева\n", + " 'min_samples_split': [2, 10, 15] # Минимальное количество образцов для разбиения узла\n", + "}\n", + "\n", + "# Подбор гиперпараметров с помощью Grid Search для старых параметров\n", + "old_grid_search = GridSearchCV(estimator=RandomForestRegressor(), \n", + " param_grid=old_param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1, verbose=2)\n", + "\n", + "# Обучение модели на тренировочных данных\n", + "old_grid_search.fit(X_train, y_train)\n", + "\n", + "# 2. Результаты подбора для старых параметров\n", + "old_best_params = old_grid_search.best_params_\n", + "old_best_mse = -old_grid_search.best_score_ # Меняем знак, так как берем отрицательное значение MSE\n", + "\n", + "# 3. Настройка параметров для новых значений\n", + "new_param_grid = {\n", + " 'n_estimators': [200],\n", + " 'max_depth': [10],\n", + " 'min_samples_split': [10]\n", + "}\n", + "\n", + "# Подбор гиперпараметров с помощью Grid Search для новых параметров\n", + "new_grid_search = GridSearchCV(estimator=RandomForestRegressor(), \n", + " param_grid=new_param_grid, scoring='neg_mean_squared_error', cv=2)\n", + "\n", + "# Обучение модели на тренировочных данных\n", + "new_grid_search.fit(X_train, y_train)\n", + "\n", + "# 4. Результаты подбора для новых параметров\n", + "new_best_params = new_grid_search.best_params_\n", + "new_best_mse = -new_grid_search.best_score_ # Меняем знак, так как берем отрицательное значение MSE\n", + "\n", + "# 5. 
Обучение модели с лучшими параметрами для новых значений\n", + "model_best = RandomForestRegressor(**new_best_params)\n", + "model_best.fit(X_train, y_train)\n", + "\n", + "# Прогнозирование на тестовой выборке\n", + "y_pred = model_best.predict(X_test)\n", + "\n", + "# Оценка производительности модели\n", + "mse = metrics.mean_squared_error(y_test, y_pred)\n", + "rmse = np.sqrt(mse)\n", + "\n", + "# Вывод результатов\n", + "print(\"Старые параметры:\", old_best_params)\n", + "print(\"Лучший результат (MSE) на старых параметрах:\", old_best_mse)\n", + "print(\"\\nНовые параметры:\", new_best_params)\n", + "print(\"Лучший результат (MSE) на новых параметрах:\", new_best_mse)\n", + "print(\"Среднеквадратическая ошибка (MSE) на тестовых данных:\", mse)\n", + "print(\"Корень среднеквадратичной ошибки (RMSE) на тестовых данных:\", rmse)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Посмотрев на результат, можно сказать, что старая модель имеет меньшую среднеквадратичную ошибку, следовательно она оказалась лучше модели с новыми настройками.\n", + "Т.к. старые параметры дали наилучший результат, можно сказать, что модель способна выдать высокую точность при настройке гиперпараметров. Попытка с новыми параметрами позволила оценить, как модель реагирует на изменения параметров." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {