From da729ef74ed9ff6a407e30513dd20034ccc9c91e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=AF=D0=BA=D0=BE?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=B2?= Date: Fri, 15 Nov 2024 16:44:23 +0400 Subject: [PATCH] =?UTF-8?q?=D0=BD=D0=B8=D1=87=D0=B5=20=D0=BD=D0=B5=20?= =?UTF-8?q?=D1=80=D0=B0=D0=B1=D0=BE=D1=82=D0=B0=D0=B5=D1=82=20T.T?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lab_4/lab_4.ipynb | 2289 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2289 insertions(+) diff --git a/lab_4/lab_4.ipynb b/lab_4/lab_4.ipynb index e69de29..e7d76c9 100644 --- a/lab_4/lab_4.ipynb +++ b/lab_4/lab_4.ipynb @@ -0,0 +1,2289 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...gradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15
0712930052020141013T000000221900.031.00118056501.000...711800195509817847.5112-122.25713405650
1641410019220141209T000000538000.032.25257072422.000...72170400195119919812547.7210-122.31916907639
2563150040020150225T000000180000.021.00770100001.000...67700193309802847.7379-122.23327208062
3248720087520141209T000000604000.043.00196050001.000...71050910196509813647.5208-122.39313605000
4195440051020150218T000000510000.032.00168080801.000...816800198709807447.6168-122.04518007503
..................................................................
999532205926420140926T000000279000.021.001020470441.000...710200190419589804247.4206-122.155193012139
9996555750027020150209T000000262000.031.50170095791.000...71100600196209802347.3209-122.33817009628
9997916410012520140807T000000533000.041.00155047501.500...715500191909811747.6824-122.38913204750
9998737060004520150402T000000640000.031.75168081001.002...816800195009817747.7212-122.36418807750
9999859440006020140609T000000285000.032.251680351272.000...716800198709809247.3025-122.067182035166
\n", + "

10000 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living \\\n", + "0 7129300520 20141013T000000 221900.0 3 1.00 1180 \n", + "1 6414100192 20141209T000000 538000.0 3 2.25 2570 \n", + "2 5631500400 20150225T000000 180000.0 2 1.00 770 \n", + "3 2487200875 20141209T000000 604000.0 4 3.00 1960 \n", + "4 1954400510 20150218T000000 510000.0 3 2.00 1680 \n", + "... ... ... ... ... ... ... \n", + "9995 322059264 20140926T000000 279000.0 2 1.00 1020 \n", + "9996 5557500270 20150209T000000 262000.0 3 1.50 1700 \n", + "9997 9164100125 20140807T000000 533000.0 4 1.00 1550 \n", + "9998 7370600045 20150402T000000 640000.0 3 1.75 1680 \n", + "9999 8594400060 20140609T000000 285000.0 3 2.25 1680 \n", + "\n", + " sqft_lot floors waterfront view ... grade sqft_above \\\n", + "0 5650 1.0 0 0 ... 7 1180 \n", + "1 7242 2.0 0 0 ... 7 2170 \n", + "2 10000 1.0 0 0 ... 6 770 \n", + "3 5000 1.0 0 0 ... 7 1050 \n", + "4 8080 1.0 0 0 ... 8 1680 \n", + "... ... ... ... ... ... ... ... \n", + "9995 47044 1.0 0 0 ... 7 1020 \n", + "9996 9579 1.0 0 0 ... 7 1100 \n", + "9997 4750 1.5 0 0 ... 7 1550 \n", + "9998 8100 1.0 0 2 ... 8 1680 \n", + "9999 35127 2.0 0 0 ... 7 1680 \n", + "\n", + " sqft_basement yr_built yr_renovated zipcode lat long \\\n", + "0 0 1955 0 98178 47.5112 -122.257 \n", + "1 400 1951 1991 98125 47.7210 -122.319 \n", + "2 0 1933 0 98028 47.7379 -122.233 \n", + "3 910 1965 0 98136 47.5208 -122.393 \n", + "4 0 1987 0 98074 47.6168 -122.045 \n", + "... ... ... ... ... ... ... \n", + "9995 0 1904 1958 98042 47.4206 -122.155 \n", + "9996 600 1962 0 98023 47.3209 -122.338 \n", + "9997 0 1919 0 98117 47.6824 -122.389 \n", + "9998 0 1950 0 98177 47.7212 -122.364 \n", + "9999 0 1987 0 98092 47.3025 -122.067 \n", + "\n", + " sqft_living15 sqft_lot15 \n", + "0 1340 5650 \n", + "1 1690 7639 \n", + "2 2720 8062 \n", + "3 1360 5000 \n", + "4 1800 7503 \n", + "... ... ... 
\n", + "9995 1930 12139 \n", + "9996 1700 9628 \n", + "9997 1320 4750 \n", + "9998 1880 7750 \n", + "9999 1820 35166 \n", + "\n", + "[10000 rows x 21 columns]" + ] + }, + "execution_count": 112, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn import set_config\n", + "\n", + "df = pd.read_csv(\"data/house_data.csv\", sep=\",\", nrows=10000)\n", + "df.dropna()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Устраняем выбросы в колонке цены и добавляем колонку с категориями цены" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...sqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15price_category
9980684070003620140728T000000497000.021.0077033251.000...7700191809812247.6102-122.2999604800middle
9981182406908320150429T000000835000.031.003060301661.000...30600195909802747.5656-122.093188019602high
9982183698024020141015T000000730000.042.75292045002.000...29200199909800647.5646-122.12429204505high
9983352890016020141001T000000655000.031.00137052501.000...1070300193909810947.6421-122.34824104200high
9984144280006020141120T000000205000.032.50187031182.000...18700199309803847.3739-122.05615803601low
9985872210003020150407T000000632750.042.00180048001.500...18000191809811247.6388-122.30219504800high
9986172304962420140512T000000330000.053.00210077151.000...1250850201309816847.4866-122.31921007959low
9987404040020020141007T000000527500.052.25253082502.000...25300196109800747.6117-122.13420208250middle
9988869139109020140508T000000716500.042.50329064652.000...32900200209807547.5981-121.97631005929high
9989785330219020141217T000000388500.042.50189053952.000...18900200609806547.5415-121.88320605395middle
9990326000070020140904T000000530000.031.75168077701.000...16800196709800547.6028-122.16718807770middle
9991512630051020150108T000000419000.032.50217045172.000...21700200209805947.4819-122.14026104770middle
9992719933037020150309T000000385000.031.75120073601.000...12000197809805247.6979-122.13012007500middle
9993185490024020140528T000000655000.042.50299056692.000...29900200309807447.6119-122.01131105058high
9994673870033520140701T0000001127312.542.753770109002.002...3070700192409814447.5849-122.29030005000very_high
999532205926420140926T000000279000.021.001020470441.000...10200190419589804247.4206-122.155193012139low
9996555750027020150209T000000262000.031.50170095791.000...1100600196209802347.3209-122.33817009628low
9997916410012520140807T000000533000.041.00155047501.500...15500191909811747.6824-122.38913204750middle
9998737060004520150402T000000640000.031.75168081001.002...16800195009817747.7212-122.36418807750high
9999859440006020140609T000000285000.032.251680351272.000...16800198709809247.3025-122.067182035166low
\n", + "

20 rows × 22 columns

\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms \\\n", + "9980 6840700036 20140728T000000 497000.0 2 1.00 \n", + "9981 1824069083 20150429T000000 835000.0 3 1.00 \n", + "9982 1836980240 20141015T000000 730000.0 4 2.75 \n", + "9983 3528900160 20141001T000000 655000.0 3 1.00 \n", + "9984 1442800060 20141120T000000 205000.0 3 2.50 \n", + "9985 8722100030 20150407T000000 632750.0 4 2.00 \n", + "9986 1723049624 20140512T000000 330000.0 5 3.00 \n", + "9987 4040400200 20141007T000000 527500.0 5 2.25 \n", + "9988 8691391090 20140508T000000 716500.0 4 2.50 \n", + "9989 7853302190 20141217T000000 388500.0 4 2.50 \n", + "9990 3260000700 20140904T000000 530000.0 3 1.75 \n", + "9991 5126300510 20150108T000000 419000.0 3 2.50 \n", + "9992 7199330370 20150309T000000 385000.0 3 1.75 \n", + "9993 1854900240 20140528T000000 655000.0 4 2.50 \n", + "9994 6738700335 20140701T000000 1127312.5 4 2.75 \n", + "9995 322059264 20140926T000000 279000.0 2 1.00 \n", + "9996 5557500270 20150209T000000 262000.0 3 1.50 \n", + "9997 9164100125 20140807T000000 533000.0 4 1.00 \n", + "9998 7370600045 20150402T000000 640000.0 3 1.75 \n", + "9999 8594400060 20140609T000000 285000.0 3 2.25 \n", + "\n", + " sqft_living sqft_lot floors waterfront view ... sqft_above \\\n", + "9980 770 3325 1.0 0 0 ... 770 \n", + "9981 3060 30166 1.0 0 0 ... 3060 \n", + "9982 2920 4500 2.0 0 0 ... 2920 \n", + "9983 1370 5250 1.0 0 0 ... 1070 \n", + "9984 1870 3118 2.0 0 0 ... 1870 \n", + "9985 1800 4800 1.5 0 0 ... 1800 \n", + "9986 2100 7715 1.0 0 0 ... 1250 \n", + "9987 2530 8250 2.0 0 0 ... 2530 \n", + "9988 3290 6465 2.0 0 0 ... 3290 \n", + "9989 1890 5395 2.0 0 0 ... 1890 \n", + "9990 1680 7770 1.0 0 0 ... 1680 \n", + "9991 2170 4517 2.0 0 0 ... 2170 \n", + "9992 1200 7360 1.0 0 0 ... 1200 \n", + "9993 2990 5669 2.0 0 0 ... 2990 \n", + "9994 3770 10900 2.0 0 2 ... 3070 \n", + "9995 1020 47044 1.0 0 0 ... 1020 \n", + "9996 1700 9579 1.0 0 0 ... 1100 \n", + "9997 1550 4750 1.5 0 0 ... 
1550 \n", + "9998 1680 8100 1.0 0 2 ... 1680 \n", + "9999 1680 35127 2.0 0 0 ... 1680 \n", + "\n", + " sqft_basement yr_built yr_renovated zipcode lat long \\\n", + "9980 0 1918 0 98122 47.6102 -122.299 \n", + "9981 0 1959 0 98027 47.5656 -122.093 \n", + "9982 0 1999 0 98006 47.5646 -122.124 \n", + "9983 300 1939 0 98109 47.6421 -122.348 \n", + "9984 0 1993 0 98038 47.3739 -122.056 \n", + "9985 0 1918 0 98112 47.6388 -122.302 \n", + "9986 850 2013 0 98168 47.4866 -122.319 \n", + "9987 0 1961 0 98007 47.6117 -122.134 \n", + "9988 0 2002 0 98075 47.5981 -121.976 \n", + "9989 0 2006 0 98065 47.5415 -121.883 \n", + "9990 0 1967 0 98005 47.6028 -122.167 \n", + "9991 0 2002 0 98059 47.4819 -122.140 \n", + "9992 0 1978 0 98052 47.6979 -122.130 \n", + "9993 0 2003 0 98074 47.6119 -122.011 \n", + "9994 700 1924 0 98144 47.5849 -122.290 \n", + "9995 0 1904 1958 98042 47.4206 -122.155 \n", + "9996 600 1962 0 98023 47.3209 -122.338 \n", + "9997 0 1919 0 98117 47.6824 -122.389 \n", + "9998 0 1950 0 98177 47.7212 -122.364 \n", + "9999 0 1987 0 98092 47.3025 -122.067 \n", + "\n", + " sqft_living15 sqft_lot15 price_category \n", + "9980 960 4800 middle \n", + "9981 1880 19602 high \n", + "9982 2920 4505 high \n", + "9983 2410 4200 high \n", + "9984 1580 3601 low \n", + "9985 1950 4800 high \n", + "9986 2100 7959 low \n", + "9987 2020 8250 middle \n", + "9988 3100 5929 high \n", + "9989 2060 5395 middle \n", + "9990 1880 7770 middle \n", + "9991 2610 4770 middle \n", + "9992 1200 7500 middle \n", + "9993 3110 5058 high \n", + "9994 3000 5000 very_high \n", + "9995 1930 12139 low \n", + "9996 1700 9628 low \n", + "9997 1320 4750 middle \n", + "9998 1880 7750 high \n", + "9999 1820 35166 low \n", + "\n", + "[20 rows x 22 columns]" + ] + }, + "execution_count": 113, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "q1 = df['price'].quantile(0.25) # Находим 1-й квартиль (Q1)\n", + "q3 = df['price'].quantile(0.75) # Находим 3-й квартиль (Q3)\n", + "iqr = q3 - 
q1 # Вычисляем межквартильный размах (IQR)\n", + "\n", + "# Определяем границы для выбросов\n", + "lower_bound = q1 - 1.5 * iqr # Нижняя граница\n", + "upper_bound = q3 + 1.5 * iqr # Верхняя граница\n", + "\n", + "# Устраняем выбросы: заменяем значения ниже нижней границы на саму нижнюю границу, а выше верхней — на верхнюю\n", + "df['price'] = df['price'].apply(lambda x: lower_bound if x < lower_bound else upper_bound if x > upper_bound else x)\n", + "\n", + "# Добавляем столбец с категориями цены\n", + "df['price_category'] = pd.cut(df['price'], bins=[75000,338750,602750,866750,1130750], labels=['low','middle','high','very_high'], include_lowest=True)\n", + "df.tail(20)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Бизнес-цели\n", + "1. Прогноз класса цены недвижимости (Классификация)\n", + "2. Оценка состояния недвижимости (Регрессия)\n", + "\n", + "### Определение достижимого уровня качества модели для первой задачи\n", + "#### Разделение набора данных на обучающую и тестовую выборки (80/20) для задачи классификации (целевой признак — price_category)" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'X_train'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...sqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15price_category
9843326000034020140622T000000732600.042.50213073001.000...1230900196309800547.6050-122.16721307560high
9623982870205520140508T000000358000.021.5096018082.000...9600199309812247.6183-122.29812901668middle
3095343850062520140519T000000210000.031.001080210431.000...10800194209810647.5515-122.35713807620low
411242202909420140716T000000517534.021.008331439471.000...8330200609807047.3889-122.4821380143947middle
3060746290001520150108T000000387000.032.251760451332.000...17600198409806547.5124-121.866191051773middle
..................................................................
1750278772014020150407T000000416000.032.501790115421.000...1190600196909805947.5124-122.16017909131middle
2354619240040020140728T000000775000.042.50309071122.000...30900200109805247.7050-122.11830506000high
857229650003620150310T000000450000.042.752980132601.000...18001180197909805647.5152-122.197192010731middle
6181278731013020141212T000000289950.041.75209074161.000...10501040197009803147.4107-122.17917107527low
3141856730011020140604T000000485000.032.502340590581.000...23400198509803847.4052-122.028270037263middle
\n", + "

8000 rows × 22 columns

\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living \\\n", + "9843 3260000340 20140622T000000 732600.0 4 2.50 2130 \n", + "9623 9828702055 20140508T000000 358000.0 2 1.50 960 \n", + "3095 3438500625 20140519T000000 210000.0 3 1.00 1080 \n", + "411 2422029094 20140716T000000 517534.0 2 1.00 833 \n", + "3060 7462900015 20150108T000000 387000.0 3 2.25 1760 \n", + "... ... ... ... ... ... ... \n", + "1750 2787720140 20150407T000000 416000.0 3 2.50 1790 \n", + "2354 6192400400 20140728T000000 775000.0 4 2.50 3090 \n", + "857 2296500036 20150310T000000 450000.0 4 2.75 2980 \n", + "6181 2787310130 20141212T000000 289950.0 4 1.75 2090 \n", + "3141 8567300110 20140604T000000 485000.0 3 2.50 2340 \n", + "\n", + " sqft_lot floors waterfront view ... sqft_above sqft_basement \\\n", + "9843 7300 1.0 0 0 ... 1230 900 \n", + "9623 1808 2.0 0 0 ... 960 0 \n", + "3095 21043 1.0 0 0 ... 1080 0 \n", + "411 143947 1.0 0 0 ... 833 0 \n", + "3060 45133 2.0 0 0 ... 1760 0 \n", + "... ... ... ... ... ... ... ... \n", + "1750 11542 1.0 0 0 ... 1190 600 \n", + "2354 7112 2.0 0 0 ... 3090 0 \n", + "857 13260 1.0 0 0 ... 1800 1180 \n", + "6181 7416 1.0 0 0 ... 1050 1040 \n", + "3141 59058 1.0 0 0 ... 2340 0 \n", + "\n", + " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", + "9843 1963 0 98005 47.6050 -122.167 2130 \n", + "9623 1993 0 98122 47.6183 -122.298 1290 \n", + "3095 1942 0 98106 47.5515 -122.357 1380 \n", + "411 2006 0 98070 47.3889 -122.482 1380 \n", + "3060 1984 0 98065 47.5124 -121.866 1910 \n", + "... ... ... ... ... ... ... \n", + "1750 1969 0 98059 47.5124 -122.160 1790 \n", + "2354 2001 0 98052 47.7050 -122.118 3050 \n", + "857 1979 0 98056 47.5152 -122.197 1920 \n", + "6181 1970 0 98031 47.4107 -122.179 1710 \n", + "3141 1985 0 98038 47.4052 -122.028 2700 \n", + "\n", + " sqft_lot15 price_category \n", + "9843 7560 high \n", + "9623 1668 middle \n", + "3095 7620 low \n", + "411 143947 middle \n", + "3060 51773 middle \n", + "... ... ... 
\n", + "1750 9131 middle \n", + "2354 6000 high \n", + "857 10731 middle \n", + "6181 7527 low \n", + "3141 37263 middle \n", + "\n", + "[8000 rows x 22 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'y_train'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
price_category
9843high
9623middle
3095low
411middle
3060middle
......
1750middle
2354high
857middle
6181low
3141middle
\n", + "

8000 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " price_category\n", + "9843 high\n", + "9623 middle\n", + "3095 low\n", + "411 middle\n", + "3060 middle\n", + "... ...\n", + "1750 middle\n", + "2354 high\n", + "857 middle\n", + "6181 low\n", + "3141 middle\n", + "\n", + "[8000 rows x 1 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'X_test'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...sqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15price_category
5341663290057420150225T000000595000.053.002980100641.000...16801300194009815547.7372-122.31615907800middle
4384242302924520140617T000000550000.031.752240782252.000...22400197609807047.4638-122.4842030202554middle
5795247337005020140604T000000327500.041.75165078001.000...16500196809805847.4507-122.139175010400low
4956952810498520141104T000000611000.021.00127051001.000...1100170190009811547.6771-122.32816703900high
7723397290002520150313T000000499000.061.75240075001.500...14001000197509815547.7661-122.31319807500middle
..................................................................
8517387660012020150422T000000265000.031.501780101961.000...1270510196709800147.3375-122.29113207875low
6914682160000520150403T000000710000.041.75212054001.000...10601060194109819947.6501-122.39520526000high
4499276760393120140818T000000469000.033.25137011943.000...13700200409810747.6718-122.38818002678middle
8651880240041120140619T000000249000.031.00105084981.000...10500195909803147.4043-122.20210508498low
4234545280073520140722T000000780000.042.502270134491.000...1310960197509804047.5416-122.232281013475high
\n", + "

2000 rows × 22 columns

\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living \\\n", + "5341 6632900574 20150225T000000 595000.0 5 3.00 2980 \n", + "4384 2423029245 20140617T000000 550000.0 3 1.75 2240 \n", + "5795 2473370050 20140604T000000 327500.0 4 1.75 1650 \n", + "4956 9528104985 20141104T000000 611000.0 2 1.00 1270 \n", + "7723 3972900025 20150313T000000 499000.0 6 1.75 2400 \n", + "... ... ... ... ... ... ... \n", + "8517 3876600120 20150422T000000 265000.0 3 1.50 1780 \n", + "6914 6821600005 20150403T000000 710000.0 4 1.75 2120 \n", + "4499 2767603931 20140818T000000 469000.0 3 3.25 1370 \n", + "8651 8802400411 20140619T000000 249000.0 3 1.00 1050 \n", + "4234 5452800735 20140722T000000 780000.0 4 2.50 2270 \n", + "\n", + " sqft_lot floors waterfront view ... sqft_above sqft_basement \\\n", + "5341 10064 1.0 0 0 ... 1680 1300 \n", + "4384 78225 2.0 0 0 ... 2240 0 \n", + "5795 7800 1.0 0 0 ... 1650 0 \n", + "4956 5100 1.0 0 0 ... 1100 170 \n", + "7723 7500 1.5 0 0 ... 1400 1000 \n", + "... ... ... ... ... ... ... ... \n", + "8517 10196 1.0 0 0 ... 1270 510 \n", + "6914 5400 1.0 0 0 ... 1060 1060 \n", + "4499 1194 3.0 0 0 ... 1370 0 \n", + "8651 8498 1.0 0 0 ... 1050 0 \n", + "4234 13449 1.0 0 0 ... 1310 960 \n", + "\n", + " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", + "5341 1940 0 98155 47.7372 -122.316 1590 \n", + "4384 1976 0 98070 47.4638 -122.484 2030 \n", + "5795 1968 0 98058 47.4507 -122.139 1750 \n", + "4956 1900 0 98115 47.6771 -122.328 1670 \n", + "7723 1975 0 98155 47.7661 -122.313 1980 \n", + "... ... ... ... ... ... ... \n", + "8517 1967 0 98001 47.3375 -122.291 1320 \n", + "6914 1941 0 98199 47.6501 -122.395 2052 \n", + "4499 2004 0 98107 47.6718 -122.388 1800 \n", + "8651 1959 0 98031 47.4043 -122.202 1050 \n", + "4234 1975 0 98040 47.5416 -122.232 2810 \n", + "\n", + " sqft_lot15 price_category \n", + "5341 7800 middle \n", + "4384 202554 middle \n", + "5795 10400 low \n", + "4956 3900 high \n", + "7723 7500 middle \n", + "... 
... ... \n", + "8517 7875 low \n", + "6914 6000 high \n", + "4499 2678 middle \n", + "8651 8498 low \n", + "4234 13475 high \n", + "\n", + "[2000 rows x 22 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'y_test'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
price_category
5341middle
4384middle
5795low
4956high
7723middle
......
8517low
6914high
4499middle
8651low
4234high
\n", + "

2000 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " price_category\n", + "5341 middle\n", + "4384 middle\n", + "5795 low\n", + "4956 high\n", + "7723 middle\n", + "... ...\n", + "8517 low\n", + "6914 high\n", + "4499 middle\n", + "8651 low\n", + "4234 high\n", + "\n", + "[2000 rows x 1 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from typing import Tuple\n", + "import pandas as pd\n", + "from pandas import DataFrame\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "def split_stratified_into_train_val_test(\n", + " df_input,\n", + " stratify_colname=\"y\",\n", + " frac_train=0.6,\n", + " frac_val=0.15,\n", + " frac_test=0.25,\n", + " random_state=None,\n", + ") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n", + " \n", + " if frac_train + frac_val + frac_test != 1.0:\n", + " raise ValueError(\n", + " \"fractions %f, %f, %f do not add up to 1.0\"\n", + " % (frac_train, frac_val, frac_test)\n", + " )\n", + " if stratify_colname not in df_input.columns:\n", + " raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n", + " X = df_input # Contains all columns.\n", + " y = df_input[\n", + " [stratify_colname]\n", + " ] # Dataframe of just the column on which to stratify.\n", + " # Split original dataframe into train and temp dataframes.\n", + " df_train, df_temp, y_train, y_temp = train_test_split(\n", + " X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n", + " )\n", + " if frac_val <= 0:\n", + " assert len(df_input) == len(df_train) + len(df_temp)\n", + " return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n", + " # Split the temp dataframe into val and test dataframes.\n", + " relative_frac_test = frac_test / (frac_val + frac_test)\n", + " df_val, df_test, y_val, y_test = train_test_split(\n", + " df_temp,\n", + " y_temp,\n", + " stratify=y_temp,\n", + " test_size=relative_frac_test,\n", + " 
random_state=random_state,\n", + " )\n", + " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n", + " return df_train, df_val, df_test, y_train, y_val, y_test\n", + "\n", + "X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n", + " df, stratify_colname=\"price_category\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=42\n", + ")\n", + "\n", + "display(\"X_train\", X_train)\n", + "display(\"y_train\", y_train)\n", + "\n", + "display(\"X_test\", X_test)\n", + "display(\"y_test\", y_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Формирование конвейера\n", + "preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n", + "\n", + "preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование\n", + "\n", + "features_preprocessing -- трансформер для предобработки признаков\n", + "\n", + "features_engineering -- трансформер для конструирования признаков\n", + "\n", + "drop_columns -- трансформер для удаления колонок\n", + "\n", + "pipeline_end -- основной конвейер предобработки данных и конструирования признаков" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "Shape of passed values is (8000, 21), indices imply (8000, 19)", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[184], line 123\u001b[0m\n\u001b[0;32m 121\u001b[0m preprocessing_result \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(preprocessing_result, columns\u001b[38;5;241m=\u001b[39mnum_columns \u001b[38;5;241m+\u001b[39m cat_columns \u001b[38;5;241m+\u001b[39m cols)\n\u001b[0;32m 122\u001b[0m 
preprocessing_result \u001b[38;5;241m=\u001b[39m features_engineering\u001b[38;5;241m.\u001b[39mfit_transform(preprocessing_result)\n\u001b[1;32m--> 123\u001b[0m preprocessing_result \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mDataFrame\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpreprocessing_result\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnum_columns\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mcat_columns\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 125\u001b[0m \u001b[38;5;66;03m# preprocessing_result = features_postprocessing.fit_transform(preprocessing_result)\u001b[39;00m\n\u001b[0;32m 126\u001b[0m \n\u001b[0;32m 127\u001b[0m \u001b[38;5;66;03m# preprocessing_result = pipeline_end.fit_transform(X_train)\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 131\u001b[0m \u001b[38;5;66;03m# )\u001b[39;00m\n\u001b[0;32m 132\u001b[0m \u001b[38;5;66;03m# preprocessed_df\u001b[39;00m\n", + "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\pandas\\core\\frame.py:827\u001b[0m, in \u001b[0;36mDataFrame.__init__\u001b[1;34m(self, data, index, columns, dtype, copy)\u001b[0m\n\u001b[0;32m 816\u001b[0m mgr \u001b[38;5;241m=\u001b[39m dict_to_mgr(\n\u001b[0;32m 817\u001b[0m \u001b[38;5;66;03m# error: Item \"ndarray\" of \"Union[ndarray, Series, Index]\" has no\u001b[39;00m\n\u001b[0;32m 818\u001b[0m \u001b[38;5;66;03m# attribute \"name\"\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 824\u001b[0m copy\u001b[38;5;241m=\u001b[39m_copy,\n\u001b[0;32m 825\u001b[0m )\n\u001b[0;32m 826\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 827\u001b[0m mgr \u001b[38;5;241m=\u001b[39m \u001b[43mndarray_to_mgr\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 828\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 829\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 830\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 831\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 832\u001b[0m \u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 833\u001b[0m \u001b[43m \u001b[49m\u001b[43mtyp\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmanager\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 834\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 836\u001b[0m \u001b[38;5;66;03m# For data is list-like, or Iterable (will consume into list)\u001b[39;00m\n\u001b[0;32m 837\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m is_list_like(data):\n", + "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\pandas\\core\\internals\\construction.py:336\u001b[0m, in \u001b[0;36mndarray_to_mgr\u001b[1;34m(values, index, columns, dtype, copy, typ)\u001b[0m\n\u001b[0;32m 331\u001b[0m \u001b[38;5;66;03m# _prep_ndarraylike ensures that values.ndim == 2 at this point\u001b[39;00m\n\u001b[0;32m 332\u001b[0m index, columns \u001b[38;5;241m=\u001b[39m _get_axes(\n\u001b[0;32m 333\u001b[0m values\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m], values\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m], index\u001b[38;5;241m=\u001b[39mindex, columns\u001b[38;5;241m=\u001b[39mcolumns\n\u001b[0;32m 334\u001b[0m )\n\u001b[1;32m--> 336\u001b[0m \u001b[43m_check_values_indices_shape_match\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 338\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m typ \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124marray\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m 339\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(values\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, \u001b[38;5;28mstr\u001b[39m):\n", + "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\pandas\\core\\internals\\construction.py:420\u001b[0m, in \u001b[0;36m_check_values_indices_shape_match\u001b[1;34m(values, index, columns)\u001b[0m\n\u001b[0;32m 418\u001b[0m passed \u001b[38;5;241m=\u001b[39m values\u001b[38;5;241m.\u001b[39mshape\n\u001b[0;32m 419\u001b[0m implied \u001b[38;5;241m=\u001b[39m (\u001b[38;5;28mlen\u001b[39m(index), \u001b[38;5;28mlen\u001b[39m(columns))\n\u001b[1;32m--> 420\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mShape of passed values is \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpassed\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, indices imply \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mimplied\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[1;31mValueError\u001b[0m: Shape of passed values is (8000, 21), indices imply (8000, 19)" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.base import BaseEstimator, TransformerMixin\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.discriminant_analysis import StandardScaler\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.ensemble import RandomForestRegressor # Пример регрессионной модели\n", + "from sklearn.model_selection import 
train_test_split\n",
+ "from sklearn.pipeline import make_pipeline\n",
+ "\n",
+ "class HousesFeatures(BaseEstimator, TransformerMixin):\n",
+ "    \"\"\"Replace the textual price_category labels with ordinal integer codes.\"\"\"\n",
+ "\n",
+ "    # Ordinal code for every known price category label.\n",
+ "    PRICE_TYPES = {\"low\": 1, \"middle\": 2, \"high\": 3, \"very_high\": 4}\n",
+ "\n",
+ "    def __init__(self):\n",
+ "        pass\n",
+ "\n",
+ "    def fit(self, X, y=None):\n",
+ "        # Stateless transformer: nothing is learned from the data.\n",
+ "        return self\n",
+ "\n",
+ "    def transform(self, X, y=None):\n",
+ "        # Work on a copy so the caller's dataframe is not modified in place.\n",
+ "        X = X.copy()\n",
+ "        # Missing or unrecognised labels become 0 so the column stays purely integer.\n",
+ "        X[\"price_category\"] = [\n",
+ "            self.PRICE_TYPES.get(category, 0) for category in X[\"price_category\"]\n",
+ "        ]\n",
+ "        return X\n",
+ "\n",
+ "    def get_feature_names_out(self, features_in):\n",
+ "        # transform() rewrites price_category in place and adds no column,\n",
+ "        # so the output names are exactly the input names.\n",
+ "        return np.asarray(features_in, dtype=object)\n",
+ "\n",
+ "# Columns to drop and columns to preprocess.\n",
+ "columns_to_drop = [\"date\", \"view\", \"waterfront\"]\n",
+ "num_columns = [\n",
+ "    column\n",
+ "    for column in df.columns\n",
+ "    if column not in columns_to_drop and df[column].dtype != \"object\" and df[column].dtype != \"category\"\n",
+ "]\n",
+ "# Parentheses matter: `and` binds tighter than `or`, so without them a dropped\n",
+ "# column of category dtype would still be selected.\n",
+ "cat_columns = [\n",
+ "    column\n",
+ "    for column in df.columns\n",
+ "    if column not in columns_to_drop and (df[column].dtype == \"object\" or df[column].dtype == \"category\")\n",
+ "]\n",
+ "\n",
+ "# Preprocessing for numeric data: fill gaps with the median, then standardise.\n",
+ "num_imputer = SimpleImputer(strategy=\"median\")\n",
+ "num_scaler = StandardScaler()\n",
+ "preprocessing_num = Pipeline(\n",
+ "    [\n",
+ "        (\"imputer\", num_imputer),\n",
+ "        (\"scaler\", num_scaler),\n",
+ "    ]\n",
+ ")\n",
+ "\n",
+ "# Preprocessing for categorical data: fill gaps with \"unknown\", then one-hot encode.\n",
+ "cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n",
+ "cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n",
+ 
"preprocessing_cat = Pipeline(\n", + " [\n", + " (\"imputer\", cat_imputer),\n", + " (\"encoder\", cat_encoder),\n", + " ]\n", + ")\n", + "\n", + "features_preprocessing = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"prepocessing_num\", preprocessing_num, num_columns),\n", + " (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n", + " ],\n", + " remainder=\"passthrough\"\n", + ")\n", + "\n", + "# features_engineering = ColumnTransformer(\n", + "# verbose_feature_names_out=False,\n", + "# transformers=[\n", + "# (\"add_features\", HousesFeatures(), [\"price_category\"]),\n", + "# ],\n", + "# remainder=\"passthrough\",\n", + "# )\n", + "\n", + "drop_columns = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"drop_columns\", \"drop\", columns_to_drop),\n", + " ],\n", + " remainder=\"passthrough\",\n", + ")\n", + "\n", + "features_postprocessing = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"prepocessing_cat\", preprocessing_cat, [\"price_category\"]),\n", + " ],\n", + " remainder=\"passthrough\",\n", + ")\n", + "\n", + "pipeline_end = Pipeline(\n", + " [\n", + " (\"features_preprocessing\", features_preprocessing),\n", + " (\"features_engineering\", features_engineering),\n", + " (\"drop_columns\", drop_columns),\n", + " (\"features_postprocessing\", features_postprocessing),\n", + " ]\n", + "\n", + ")\n", + "cols = ['a', 'b']\n", + "preprocessing_result = drop_columns.fit_transform(X_train)\n", + "preprocessing_result = pd.DataFrame(preprocessing_result, columns=num_columns + cat_columns)\n", + "preprocessing_result = features_preprocessing.fit_transform(preprocessing_result)\n", + "preprocessing_result = pd.DataFrame(preprocessing_result, columns=num_columns + cat_columns + cols)\n", + "preprocessing_result = features_engineering.fit_transform(preprocessing_result)\n", + "preprocessing_result = 
pd.DataFrame(preprocessing_result, columns=num_columns + cat_columns)\n", + "\n", + "# preprocessing_result = features_postprocessing.fit_transform(preprocessing_result)\n", + "\n", + "# preprocessing_result = pipeline_end.fit_transform(X_train)\n", + "# preprocessed_df = pd.DataFrame(\n", + "# preprocessing_result,\n", + "# columns=pipeline_end.get_feature_names_out(),\n", + "# )\n", + "# preprocessed_df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "kernel", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}