{ "cells": [ { "cell_type": "code", "execution_count": 112, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...gradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15
0712930052020141013T000000221900.031.00118056501.000...711800195509817847.5112-122.25713405650
1641410019220141209T000000538000.032.25257072422.000...72170400195119919812547.7210-122.31916907639
2563150040020150225T000000180000.021.00770100001.000...67700193309802847.7379-122.23327208062
3248720087520141209T000000604000.043.00196050001.000...71050910196509813647.5208-122.39313605000
4195440051020150218T000000510000.032.00168080801.000...816800198709807447.6168-122.04518007503
..................................................................
999532205926420140926T000000279000.021.001020470441.000...710200190419589804247.4206-122.155193012139
9996555750027020150209T000000262000.031.50170095791.000...71100600196209802347.3209-122.33817009628
9997916410012520140807T000000533000.041.00155047501.500...715500191909811747.6824-122.38913204750
9998737060004520150402T000000640000.031.75168081001.002...816800195009817747.7212-122.36418807750
9999859440006020140609T000000285000.032.251680351272.000...716800198709809247.3025-122.067182035166
\n", "

10000 rows × 21 columns

\n", "
" ], "text/plain": [ " id date price bedrooms bathrooms sqft_living \\\n", "0 7129300520 20141013T000000 221900.0 3 1.00 1180 \n", "1 6414100192 20141209T000000 538000.0 3 2.25 2570 \n", "2 5631500400 20150225T000000 180000.0 2 1.00 770 \n", "3 2487200875 20141209T000000 604000.0 4 3.00 1960 \n", "4 1954400510 20150218T000000 510000.0 3 2.00 1680 \n", "... ... ... ... ... ... ... \n", "9995 322059264 20140926T000000 279000.0 2 1.00 1020 \n", "9996 5557500270 20150209T000000 262000.0 3 1.50 1700 \n", "9997 9164100125 20140807T000000 533000.0 4 1.00 1550 \n", "9998 7370600045 20150402T000000 640000.0 3 1.75 1680 \n", "9999 8594400060 20140609T000000 285000.0 3 2.25 1680 \n", "\n", " sqft_lot floors waterfront view ... grade sqft_above \\\n", "0 5650 1.0 0 0 ... 7 1180 \n", "1 7242 2.0 0 0 ... 7 2170 \n", "2 10000 1.0 0 0 ... 6 770 \n", "3 5000 1.0 0 0 ... 7 1050 \n", "4 8080 1.0 0 0 ... 8 1680 \n", "... ... ... ... ... ... ... ... \n", "9995 47044 1.0 0 0 ... 7 1020 \n", "9996 9579 1.0 0 0 ... 7 1100 \n", "9997 4750 1.5 0 0 ... 7 1550 \n", "9998 8100 1.0 0 2 ... 8 1680 \n", "9999 35127 2.0 0 0 ... 7 1680 \n", "\n", " sqft_basement yr_built yr_renovated zipcode lat long \\\n", "0 0 1955 0 98178 47.5112 -122.257 \n", "1 400 1951 1991 98125 47.7210 -122.319 \n", "2 0 1933 0 98028 47.7379 -122.233 \n", "3 910 1965 0 98136 47.5208 -122.393 \n", "4 0 1987 0 98074 47.6168 -122.045 \n", "... ... ... ... ... ... ... \n", "9995 0 1904 1958 98042 47.4206 -122.155 \n", "9996 600 1962 0 98023 47.3209 -122.338 \n", "9997 0 1919 0 98117 47.6824 -122.389 \n", "9998 0 1950 0 98177 47.7212 -122.364 \n", "9999 0 1987 0 98092 47.3025 -122.067 \n", "\n", " sqft_living15 sqft_lot15 \n", "0 1340 5650 \n", "1 1690 7639 \n", "2 2720 8062 \n", "3 1360 5000 \n", "4 1800 7503 \n", "... ... ... \n", "9995 1930 12139 \n", "9996 1700 9628 \n", "9997 1320 4750 \n", "9998 1880 7750 \n", "9999 1820 35166 \n", "\n", "[10000 rows x 21 columns]" ] }, "execution_count": 112, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from sklearn.model_selection import train_test_split\n", "from sklearn import set_config\n", "\n", "df = pd.read_csv(\"data/house_data.csv\", sep=\",\", nrows=10000)\n", "df.dropna()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Устраняем выбросы в колонке цены и добавляем колонку с категориями цены" ] }, { "cell_type": "code", "execution_count": 113, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...sqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15price_category
9980684070003620140728T000000497000.021.0077033251.000...7700191809812247.6102-122.2999604800middle
9981182406908320150429T000000835000.031.003060301661.000...30600195909802747.5656-122.093188019602high
9982183698024020141015T000000730000.042.75292045002.000...29200199909800647.5646-122.12429204505high
9983352890016020141001T000000655000.031.00137052501.000...1070300193909810947.6421-122.34824104200high
9984144280006020141120T000000205000.032.50187031182.000...18700199309803847.3739-122.05615803601low
9985872210003020150407T000000632750.042.00180048001.500...18000191809811247.6388-122.30219504800high
9986172304962420140512T000000330000.053.00210077151.000...1250850201309816847.4866-122.31921007959low
9987404040020020141007T000000527500.052.25253082502.000...25300196109800747.6117-122.13420208250middle
9988869139109020140508T000000716500.042.50329064652.000...32900200209807547.5981-121.97631005929high
9989785330219020141217T000000388500.042.50189053952.000...18900200609806547.5415-121.88320605395middle
9990326000070020140904T000000530000.031.75168077701.000...16800196709800547.6028-122.16718807770middle
9991512630051020150108T000000419000.032.50217045172.000...21700200209805947.4819-122.14026104770middle
9992719933037020150309T000000385000.031.75120073601.000...12000197809805247.6979-122.13012007500middle
9993185490024020140528T000000655000.042.50299056692.000...29900200309807447.6119-122.01131105058high
9994673870033520140701T0000001127312.542.753770109002.002...3070700192409814447.5849-122.29030005000very_high
999532205926420140926T000000279000.021.001020470441.000...10200190419589804247.4206-122.155193012139low
9996555750027020150209T000000262000.031.50170095791.000...1100600196209802347.3209-122.33817009628low
9997916410012520140807T000000533000.041.00155047501.500...15500191909811747.6824-122.38913204750middle
9998737060004520150402T000000640000.031.75168081001.002...16800195009817747.7212-122.36418807750high
9999859440006020140609T000000285000.032.251680351272.000...16800198709809247.3025-122.067182035166low
\n", "

20 rows × 22 columns

\n", "
" ], "text/plain": [ " id date price bedrooms bathrooms \\\n", "9980 6840700036 20140728T000000 497000.0 2 1.00 \n", "9981 1824069083 20150429T000000 835000.0 3 1.00 \n", "9982 1836980240 20141015T000000 730000.0 4 2.75 \n", "9983 3528900160 20141001T000000 655000.0 3 1.00 \n", "9984 1442800060 20141120T000000 205000.0 3 2.50 \n", "9985 8722100030 20150407T000000 632750.0 4 2.00 \n", "9986 1723049624 20140512T000000 330000.0 5 3.00 \n", "9987 4040400200 20141007T000000 527500.0 5 2.25 \n", "9988 8691391090 20140508T000000 716500.0 4 2.50 \n", "9989 7853302190 20141217T000000 388500.0 4 2.50 \n", "9990 3260000700 20140904T000000 530000.0 3 1.75 \n", "9991 5126300510 20150108T000000 419000.0 3 2.50 \n", "9992 7199330370 20150309T000000 385000.0 3 1.75 \n", "9993 1854900240 20140528T000000 655000.0 4 2.50 \n", "9994 6738700335 20140701T000000 1127312.5 4 2.75 \n", "9995 322059264 20140926T000000 279000.0 2 1.00 \n", "9996 5557500270 20150209T000000 262000.0 3 1.50 \n", "9997 9164100125 20140807T000000 533000.0 4 1.00 \n", "9998 7370600045 20150402T000000 640000.0 3 1.75 \n", "9999 8594400060 20140609T000000 285000.0 3 2.25 \n", "\n", " sqft_living sqft_lot floors waterfront view ... sqft_above \\\n", "9980 770 3325 1.0 0 0 ... 770 \n", "9981 3060 30166 1.0 0 0 ... 3060 \n", "9982 2920 4500 2.0 0 0 ... 2920 \n", "9983 1370 5250 1.0 0 0 ... 1070 \n", "9984 1870 3118 2.0 0 0 ... 1870 \n", "9985 1800 4800 1.5 0 0 ... 1800 \n", "9986 2100 7715 1.0 0 0 ... 1250 \n", "9987 2530 8250 2.0 0 0 ... 2530 \n", "9988 3290 6465 2.0 0 0 ... 3290 \n", "9989 1890 5395 2.0 0 0 ... 1890 \n", "9990 1680 7770 1.0 0 0 ... 1680 \n", "9991 2170 4517 2.0 0 0 ... 2170 \n", "9992 1200 7360 1.0 0 0 ... 1200 \n", "9993 2990 5669 2.0 0 0 ... 2990 \n", "9994 3770 10900 2.0 0 2 ... 3070 \n", "9995 1020 47044 1.0 0 0 ... 1020 \n", "9996 1700 9579 1.0 0 0 ... 1100 \n", "9997 1550 4750 1.5 0 0 ... 1550 \n", "9998 1680 8100 1.0 0 2 ... 1680 \n", "9999 1680 35127 2.0 0 0 ... 1680 \n", "\n", " sqft_basement yr_built yr_renovated zipcode lat long \\\n", "9980 0 1918 0 98122 47.6102 -122.299 \n", "9981 0 1959 0 98027 47.5656 -122.093 \n", "9982 0 1999 0 98006 47.5646 -122.124 \n", "9983 300 1939 0 98109 47.6421 -122.348 \n", "9984 0 1993 0 98038 47.3739 -122.056 \n", "9985 0 1918 0 98112 47.6388 -122.302 \n", "9986 850 2013 0 98168 47.4866 -122.319 \n", "9987 0 1961 0 98007 47.6117 -122.134 \n", "9988 0 2002 0 98075 47.5981 -121.976 \n", "9989 0 2006 0 98065 47.5415 -121.883 \n", "9990 0 1967 0 98005 47.6028 -122.167 \n", "9991 0 2002 0 98059 47.4819 -122.140 \n", "9992 0 1978 0 98052 47.6979 -122.130 \n", "9993 0 2003 0 98074 47.6119 -122.011 \n", "9994 700 1924 0 98144 47.5849 -122.290 \n", "9995 0 1904 1958 98042 47.4206 -122.155 \n", "9996 600 1962 0 98023 47.3209 -122.338 \n", "9997 0 1919 0 98117 47.6824 -122.389 \n", "9998 0 1950 0 98177 47.7212 -122.364 \n", "9999 0 1987 0 98092 47.3025 -122.067 \n", "\n", " sqft_living15 sqft_lot15 price_category \n", "9980 960 4800 middle \n", "9981 1880 19602 high \n", "9982 2920 4505 high \n", "9983 2410 4200 high \n", "9984 1580 3601 low \n", "9985 1950 4800 high \n", "9986 2100 7959 low \n", "9987 2020 8250 middle \n", "9988 3100 5929 high \n", "9989 2060 5395 middle \n", "9990 1880 7770 middle \n", "9991 2610 4770 middle \n", "9992 1200 7500 middle \n", "9993 3110 5058 high \n", "9994 3000 5000 very_high \n", "9995 1930 12139 low \n", "9996 1700 9628 low \n", "9997 1320 4750 middle \n", "9998 1880 7750 high \n", "9999 1820 35166 low \n", "\n", "[20 rows x 22 columns]" ] }, "execution_count": 113, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q1 = df['price'].quantile(0.25) # Находим 1-й квартиль (Q1)\n", "q3 = df['price'].quantile(0.75) # Находим 3-й квартиль (Q3)\n", "iqr = q3 - q1 # Вычисляем межквартильный размах (IQR)\n", "\n", "# Определяем границы для выбросов\n", "lower_bound = q1 - 1.5 * iqr # Нижняя граница\n", "upper_bound = q3 + 1.5 * iqr # Верхняя граница\n", "\n", "# Устраняем выбросы: заменяем значения ниже нижней границы на саму нижнюю границу, а выше верхней — на верхнюю\n", "df['price'] = df['price'].apply(lambda x: lower_bound if x < lower_bound else upper_bound if x > upper_bound else x)\n", "\n", "# Добавляем столбец с категорями цены\n", "df['price_category'] = pd.cut(df['price'], bins=[75000,338750,602750,866750,1130750], labels=['low','middle','high','very_high'], include_lowest=True)\n", "df.tail(20)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Бизнес-цели\n", "1. Прогноз класса цены недвижимости (Классификация)\n", "2. Оценка состояния недвижимости (Регрессия)\n", "\n", "### Определение достижимого уровня качества модели для первой задачи\n", "#### Разделение набора данных на обучающую и тестовые выборки (80/20) для задачи классификации (Целевой признак - price)" ] }, { "cell_type": "code", "execution_count": 114, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'X_train'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...sqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15price_category
9843326000034020140622T000000732600.042.50213073001.000...1230900196309800547.6050-122.16721307560high
9623982870205520140508T000000358000.021.5096018082.000...9600199309812247.6183-122.29812901668middle
3095343850062520140519T000000210000.031.001080210431.000...10800194209810647.5515-122.35713807620low
411242202909420140716T000000517534.021.008331439471.000...8330200609807047.3889-122.4821380143947middle
3060746290001520150108T000000387000.032.251760451332.000...17600198409806547.5124-121.866191051773middle
..................................................................
1750278772014020150407T000000416000.032.501790115421.000...1190600196909805947.5124-122.16017909131middle
2354619240040020140728T000000775000.042.50309071122.000...30900200109805247.7050-122.11830506000high
857229650003620150310T000000450000.042.752980132601.000...18001180197909805647.5152-122.197192010731middle
6181278731013020141212T000000289950.041.75209074161.000...10501040197009803147.4107-122.17917107527low
3141856730011020140604T000000485000.032.502340590581.000...23400198509803847.4052-122.028270037263middle
\n", "

8000 rows × 22 columns

\n", "
" ], "text/plain": [ " id date price bedrooms bathrooms sqft_living \\\n", "9843 3260000340 20140622T000000 732600.0 4 2.50 2130 \n", "9623 9828702055 20140508T000000 358000.0 2 1.50 960 \n", "3095 3438500625 20140519T000000 210000.0 3 1.00 1080 \n", "411 2422029094 20140716T000000 517534.0 2 1.00 833 \n", "3060 7462900015 20150108T000000 387000.0 3 2.25 1760 \n", "... ... ... ... ... ... ... \n", "1750 2787720140 20150407T000000 416000.0 3 2.50 1790 \n", "2354 6192400400 20140728T000000 775000.0 4 2.50 3090 \n", "857 2296500036 20150310T000000 450000.0 4 2.75 2980 \n", "6181 2787310130 20141212T000000 289950.0 4 1.75 2090 \n", "3141 8567300110 20140604T000000 485000.0 3 2.50 2340 \n", "\n", " sqft_lot floors waterfront view ... sqft_above sqft_basement \\\n", "9843 7300 1.0 0 0 ... 1230 900 \n", "9623 1808 2.0 0 0 ... 960 0 \n", "3095 21043 1.0 0 0 ... 1080 0 \n", "411 143947 1.0 0 0 ... 833 0 \n", "3060 45133 2.0 0 0 ... 1760 0 \n", "... ... ... ... ... ... ... ... \n", "1750 11542 1.0 0 0 ... 1190 600 \n", "2354 7112 2.0 0 0 ... 3090 0 \n", "857 13260 1.0 0 0 ... 1800 1180 \n", "6181 7416 1.0 0 0 ... 1050 1040 \n", "3141 59058 1.0 0 0 ... 2340 0 \n", "\n", " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", "9843 1963 0 98005 47.6050 -122.167 2130 \n", "9623 1993 0 98122 47.6183 -122.298 1290 \n", "3095 1942 0 98106 47.5515 -122.357 1380 \n", "411 2006 0 98070 47.3889 -122.482 1380 \n", "3060 1984 0 98065 47.5124 -121.866 1910 \n", "... ... ... ... ... ... ... \n", "1750 1969 0 98059 47.5124 -122.160 1790 \n", "2354 2001 0 98052 47.7050 -122.118 3050 \n", "857 1979 0 98056 47.5152 -122.197 1920 \n", "6181 1970 0 98031 47.4107 -122.179 1710 \n", "3141 1985 0 98038 47.4052 -122.028 2700 \n", "\n", " sqft_lot15 price_category \n", "9843 7560 high \n", "9623 1668 middle \n", "3095 7620 low \n", "411 143947 middle \n", "3060 51773 middle \n", "... ... ... \n", "1750 9131 middle \n", "2354 6000 high \n", "857 10731 middle \n", "6181 7527 low \n", "3141 37263 middle \n", "\n", "[8000 rows x 22 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'y_train'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
price_category
9843high
9623middle
3095low
411middle
3060middle
......
1750middle
2354high
857middle
6181low
3141middle
\n", "

8000 rows × 1 columns

\n", "
" ], "text/plain": [ " price_category\n", "9843 high\n", "9623 middle\n", "3095 low\n", "411 middle\n", "3060 middle\n", "... ...\n", "1750 middle\n", "2354 high\n", "857 middle\n", "6181 low\n", "3141 middle\n", "\n", "[8000 rows x 1 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'X_test'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...sqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15price_category
5341663290057420150225T000000595000.053.002980100641.000...16801300194009815547.7372-122.31615907800middle
4384242302924520140617T000000550000.031.752240782252.000...22400197609807047.4638-122.4842030202554middle
5795247337005020140604T000000327500.041.75165078001.000...16500196809805847.4507-122.139175010400low
4956952810498520141104T000000611000.021.00127051001.000...1100170190009811547.6771-122.32816703900high
7723397290002520150313T000000499000.061.75240075001.500...14001000197509815547.7661-122.31319807500middle
..................................................................
8517387660012020150422T000000265000.031.501780101961.000...1270510196709800147.3375-122.29113207875low
6914682160000520150403T000000710000.041.75212054001.000...10601060194109819947.6501-122.39520526000high
4499276760393120140818T000000469000.033.25137011943.000...13700200409810747.6718-122.38818002678middle
8651880240041120140619T000000249000.031.00105084981.000...10500195909803147.4043-122.20210508498low
4234545280073520140722T000000780000.042.502270134491.000...1310960197509804047.5416-122.232281013475high
\n", "

2000 rows × 22 columns

\n", "
" ], "text/plain": [ " id date price bedrooms bathrooms sqft_living \\\n", "5341 6632900574 20150225T000000 595000.0 5 3.00 2980 \n", "4384 2423029245 20140617T000000 550000.0 3 1.75 2240 \n", "5795 2473370050 20140604T000000 327500.0 4 1.75 1650 \n", "4956 9528104985 20141104T000000 611000.0 2 1.00 1270 \n", "7723 3972900025 20150313T000000 499000.0 6 1.75 2400 \n", "... ... ... ... ... ... ... \n", "8517 3876600120 20150422T000000 265000.0 3 1.50 1780 \n", "6914 6821600005 20150403T000000 710000.0 4 1.75 2120 \n", "4499 2767603931 20140818T000000 469000.0 3 3.25 1370 \n", "8651 8802400411 20140619T000000 249000.0 3 1.00 1050 \n", "4234 5452800735 20140722T000000 780000.0 4 2.50 2270 \n", "\n", " sqft_lot floors waterfront view ... sqft_above sqft_basement \\\n", "5341 10064 1.0 0 0 ... 1680 1300 \n", "4384 78225 2.0 0 0 ... 2240 0 \n", "5795 7800 1.0 0 0 ... 1650 0 \n", "4956 5100 1.0 0 0 ... 1100 170 \n", "7723 7500 1.5 0 0 ... 1400 1000 \n", "... ... ... ... ... ... ... ... \n", "8517 10196 1.0 0 0 ... 1270 510 \n", "6914 5400 1.0 0 0 ... 1060 1060 \n", "4499 1194 3.0 0 0 ... 1370 0 \n", "8651 8498 1.0 0 0 ... 1050 0 \n", "4234 13449 1.0 0 0 ... 1310 960 \n", "\n", " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", "5341 1940 0 98155 47.7372 -122.316 1590 \n", "4384 1976 0 98070 47.4638 -122.484 2030 \n", "5795 1968 0 98058 47.4507 -122.139 1750 \n", "4956 1900 0 98115 47.6771 -122.328 1670 \n", "7723 1975 0 98155 47.7661 -122.313 1980 \n", "... ... ... ... ... ... ... \n", "8517 1967 0 98001 47.3375 -122.291 1320 \n", "6914 1941 0 98199 47.6501 -122.395 2052 \n", "4499 2004 0 98107 47.6718 -122.388 1800 \n", "8651 1959 0 98031 47.4043 -122.202 1050 \n", "4234 1975 0 98040 47.5416 -122.232 2810 \n", "\n", " sqft_lot15 price_category \n", "5341 7800 middle \n", "4384 202554 middle \n", "5795 10400 low \n", "4956 3900 high \n", "7723 7500 middle \n", "... ... ... \n", "8517 7875 low \n", "6914 6000 high \n", "4499 2678 middle \n", "8651 8498 low \n", "4234 13475 high \n", "\n", "[2000 rows x 22 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'y_test'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
price_category
5341middle
4384middle
5795low
4956high
7723middle
......
8517low
6914high
4499middle
8651low
4234high
\n", "

2000 rows × 1 columns

\n", "
" ], "text/plain": [ " price_category\n", "5341 middle\n", "4384 middle\n", "5795 low\n", "4956 high\n", "7723 middle\n", "... ...\n", "8517 low\n", "6914 high\n", "4499 middle\n", "8651 low\n", "4234 high\n", "\n", "[2000 rows x 1 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from typing import Tuple\n", "import pandas as pd\n", "from pandas import DataFrame\n", "from sklearn.model_selection import train_test_split\n", "\n", "def split_stratified_into_train_val_test(\n", " df_input,\n", " stratify_colname=\"y\",\n", " frac_train=0.6,\n", " frac_val=0.15,\n", " frac_test=0.25,\n", " random_state=None,\n", ") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n", " \n", " if frac_train + frac_val + frac_test != 1.0:\n", " raise ValueError(\n", " \"fractions %f, %f, %f do not add up to 1.0\"\n", " % (frac_train, frac_val, frac_test)\n", " )\n", " if stratify_colname not in df_input.columns:\n", " raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n", " X = df_input # Contains all columns.\n", " y = df_input[\n", " [stratify_colname]\n", " ] # Dataframe of just the column on which to stratify.\n", " # Split original dataframe into train and temp dataframes.\n", " df_train, df_temp, y_train, y_temp = train_test_split(\n", " X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n", " )\n", " if frac_val <= 0:\n", " assert len(df_input) == len(df_train) + len(df_temp)\n", " return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n", " # Split the temp dataframe into val and test dataframes.\n", " relative_frac_test = frac_test / (frac_val + frac_test)\n", " df_val, df_test, y_val, y_test = train_test_split(\n", " df_temp,\n", " y_temp,\n", " stratify=y_temp,\n", " test_size=relative_frac_test,\n", " random_state=random_state,\n", " )\n", " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n", " return df_train, df_val, df_test, y_train, y_val, y_test\n", "\n", "X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n", " df, stratify_colname=\"price_category\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=42\n", ")\n", "\n", "display(\"X_train\", X_train)\n", "display(\"y_train\", y_train)\n", "\n", "display(\"X_test\", X_test)\n", "display(\"y_test\", y_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Формирование конвейера\n", "preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n", "\n", "preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование\n", "\n", "features_preprocessing -- трансформер для предобработки признаков\n", "\n", "features_engineering -- трансформер для конструирования признаков\n", "\n", "drop_columns -- трансформер для удаления колонок\n", "\n", "pipeline_end -- основной конвейер предобработки данных и конструирования признаков" ] }, { "cell_type": "code", "execution_count": 191, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idpricebedroomsbathroomssqft_livingsqft_lotfloorsconditiongradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15price_category
033260000340732600.042.5213073001.0471230900196309800547.605-122.16721307560
129828702055358000.021.596018082.0379600199309812247.6183-122.29812901668
213438500625210000.031.01080210431.03610800194209810647.5515-122.35713807620
322422029094517534.021.08331439471.0358330200609807047.3889-122.4821380143947
427462900015387000.032.251760451332.03717600198409806547.5124-121.866191051773
............................................................
799522787720140416000.032.51790115421.0571190600196909805947.5124-122.1617909131
799636192400400775000.042.5309071122.03930900200109805247.705-122.11830506000
799722296500036450000.042.752980132601.04818001180197909805647.5152-122.197192010731
799812787310130289950.041.75209074161.04710501040197009803147.4107-122.17917107527
799928567300110485000.032.52340590581.03823400198509803847.4052-122.028270037263
\n", "

8000 rows × 19 columns

\n", "
" ], "text/plain": [ " id price bedrooms bathrooms sqft_living sqft_lot floors \\\n", "0 3 3260000340 732600.0 4 2.5 2130 7300 \n", "1 2 9828702055 358000.0 2 1.5 960 1808 \n", "2 1 3438500625 210000.0 3 1.0 1080 21043 \n", "3 2 2422029094 517534.0 2 1.0 833 143947 \n", "4 2 7462900015 387000.0 3 2.25 1760 45133 \n", "... .. ... ... ... ... ... ... \n", "7995 2 2787720140 416000.0 3 2.5 1790 11542 \n", "7996 3 6192400400 775000.0 4 2.5 3090 7112 \n", "7997 2 2296500036 450000.0 4 2.75 2980 13260 \n", "7998 1 2787310130 289950.0 4 1.75 2090 7416 \n", "7999 2 8567300110 485000.0 3 2.5 2340 59058 \n", "\n", " condition grade sqft_above sqft_basement yr_built yr_renovated zipcode \\\n", "0 1.0 4 7 1230 900 1963 0 \n", "1 2.0 3 7 960 0 1993 0 \n", "2 1.0 3 6 1080 0 1942 0 \n", "3 1.0 3 5 833 0 2006 0 \n", "4 2.0 3 7 1760 0 1984 0 \n", "... ... ... ... ... ... ... ... \n", "7995 1.0 5 7 1190 600 1969 0 \n", "7996 2.0 3 9 3090 0 2001 0 \n", "7997 1.0 4 8 1800 1180 1979 0 \n", "7998 1.0 4 7 1050 1040 1970 0 \n", "7999 1.0 3 8 2340 0 1985 0 \n", "\n", " lat long sqft_living15 sqft_lot15 price_category \n", "0 98005 47.605 -122.167 2130 7560 \n", "1 98122 47.6183 -122.298 1290 1668 \n", "2 98106 47.5515 -122.357 1380 7620 \n", "3 98070 47.3889 -122.482 1380 143947 \n", "4 98065 47.5124 -121.866 1910 51773 \n", "... ... ... ... ... ... \n", "7995 98059 47.5124 -122.16 1790 9131 \n", "7996 98052 47.705 -122.118 3050 6000 \n", "7997 98056 47.5152 -122.197 1920 10731 \n", "7998 98031 47.4107 -122.179 1710 7527 \n", "7999 98038 47.4052 -122.028 2700 37263 \n", "\n", "[8000 rows x 19 columns]" ] }, "execution_count": 191, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "from sklearn.base import BaseEstimator, TransformerMixin\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.discriminant_analysis import StandardScaler\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import OneHotEncoder\n", "from sklearn.ensemble import RandomForestRegressor # Пример регрессионной модели\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.pipeline import make_pipeline\n", "\n", "class HousesFeatures(BaseEstimator, TransformerMixin):\n", " def __init__(self):\n", " pass\n", "\n", " def fit(self, X, y=None):\n", " return self\n", "\n", "\n", " def transform(self, X, y=None):\n", "\n", " def get_price_type(category) -> int:\n", " if pd.isna(category):\n", " return \"unknown\"\n", " if category == 'low':\n", " return 1\n", " elif category == 'middle':\n", " return 2\n", " elif category == 'high':\n", " return 3\n", " elif category == 'very_high':\n", " return 4\n", "\n", " # Преобразование категориальных столбцов в числовые 1/0\n", " X[\"price_category\"] = [get_price_type(category) for category in X[\"price_category\"]]\n", " return X\n", "\n", " def get_feature_names_out(self, features_in):\n", " return np.append(features_in, [\"price_type\"], axis=0)\n", "\n", "# Указываем столбцы, которые нужно удалить и обрабатывать\n", "columns_to_drop = [\"date\", \"view\", \"waterfront\"]\n", "num_columns = [\n", " column\n", " for column in df.columns\n", " if column not in columns_to_drop and df[column].dtype != \"object\" and df[column].dtype != \"category\"\n", "]\n", "cat_columns = [\n", " column\n", " for column in df.columns\n", " if column not in columns_to_drop and df[column].dtype == \"object\" or df[column].dtype == \"category\"\n", "]\n", "\n", "# Определяем предобработку для численных данных\n", "num_imputer = SimpleImputer(strategy=\"median\")\n", "num_scaler = StandardScaler()\n", "preprocessing_num = Pipeline(\n", " [\n", " (\"imputer\", num_imputer),\n", " (\"scaler\", num_scaler),\n", " ]\n", ")\n", "\n", "# Определяем предобработку для категориальных данных\n", "cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n", "cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n", "preprocessing_cat = Pipeline(\n", " [\n", " (\"imputer\", cat_imputer),\n", " (\"encoder\", cat_encoder),\n", " ]\n", ")\n", "\n", "features_preprocessing = ColumnTransformer(\n", " verbose_feature_names_out=False,\n", " transformers=[\n", " (\"prepocessing_num\", preprocessing_num, num_columns),\n", " (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n", " # (\"prepocessing_features\", cat_imputer, [\"price_category\"]),\n", " ],\n", " remainder=\"passthrough\"\n", ")\n", "\n", "features_engineering = ColumnTransformer(\n", " verbose_feature_names_out=False,\n", " transformers=[\n", " (\"add_features\", HousesFeatures(), [\"price_category\"]),\n", " ],\n", " remainder=\"passthrough\",\n", ")\n", "\n", "drop_columns = ColumnTransformer(\n", " verbose_feature_names_out=False,\n", " transformers=[\n", " (\"drop_columns\", \"drop\", columns_to_drop),\n", " ],\n", " remainder=\"passthrough\",\n", ")\n", "\n", "features_postprocessing = ColumnTransformer(\n", " verbose_feature_names_out=False,\n", " transformers=[\n", " (\"prepocessing_cat\", preprocessing_cat, [\"price_category\"]),\n", " ],\n", " remainder=\"passthrough\",\n", ")\n", "\n", "pipeline_end = Pipeline(\n", " [\n", " (\"features_preprocessing\", features_preprocessing),\n", " (\"features_engineering\", features_engineering),\n", " (\"drop_columns\", drop_columns),\n", " (\"features_postprocessing\", features_postprocessing),\n", " ]\n", "\n", ")\n", "cols = ['a', 'b']\n", "preprocessing_result = drop_columns.fit_transform(X_train)\n", "preprocessing_result = pd.DataFrame(preprocessing_result, columns=num_columns + cat_columns)\n", "preprocessing_result = features_engineering.fit_transform(preprocessing_result)\n", "preprocessing_result = pd.DataFrame(preprocessing_result, columns=num_columns + cat_columns)\n", "preprocessing_result\n", "# # preprocessing_result = features_preprocessing.fit_transform(preprocessing_result)\n", "# # preprocessing_result = pd.DataFrame(preprocessing_result, columns=num_columns + cat_columns)\n", "\n", "# preprocessing_result = features_postprocessing.fit_transform(preprocessing_result)\n", "\n", "# preprocessing_result = pipeline_end.fit_transform(X_train)\n", "# preprocessed_df = pd.DataFrame(\n", "# preprocessing_result,\n", "# columns=pipeline_end.get_feature_names_out(),\n", "# )\n", "# preprocessed_df" ] } ], "metadata": { "kernelspec": { "display_name": "kernel", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.5" } }, "nbformat": 4, "nbformat_minor": 2 }