{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Загрузка данных" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idpricebedroomsbathroomssqft_lotfloorswaterfrontviewconditiongrade...zipcode_98146zipcode_98148zipcode_98155zipcode_98166zipcode_98168zipcode_98177zipcode_98178zipcode_98188zipcode_98198zipcode_98199
07129300520221900.031.0056501.00037...0.00.00.00.00.00.01.00.00.00.0
16414100192538000.032.2572422.00037...0.00.00.00.00.00.00.00.00.00.0
25631500400180000.021.00100001.00036...0.00.00.00.00.00.00.00.00.00.0
32487200875604000.043.0050001.00057...0.00.00.00.00.00.00.00.00.00.0
41954400510510000.032.0080801.00038...0.00.00.00.00.00.00.00.00.00.0
..................................................................
21608263000018360000.032.5011313.00038...0.00.00.00.00.00.00.00.00.00.0
216096600060120400000.042.5058132.00038...1.00.00.00.00.00.00.00.00.00.0
216101523300141402101.020.7513502.00037...0.00.00.00.00.00.00.00.00.00.0
21611291310100400000.032.5023882.00038...0.00.00.00.00.00.00.00.00.00.0
216121523300157325000.020.7510762.00037...0.00.00.00.00.00.00.00.00.00.0
\n", "

21613 rows × 4594 columns

\n", "
" ], "text/plain": [ " id price bedrooms bathrooms sqft_lot floors \\\n", "0 7129300520 221900.0 3 1.00 5650 1.0 \n", "1 6414100192 538000.0 3 2.25 7242 2.0 \n", "2 5631500400 180000.0 2 1.00 10000 1.0 \n", "3 2487200875 604000.0 4 3.00 5000 1.0 \n", "4 1954400510 510000.0 3 2.00 8080 1.0 \n", "... ... ... ... ... ... ... \n", "21608 263000018 360000.0 3 2.50 1131 3.0 \n", "21609 6600060120 400000.0 4 2.50 5813 2.0 \n", "21610 1523300141 402101.0 2 0.75 1350 2.0 \n", "21611 291310100 400000.0 3 2.50 2388 2.0 \n", "21612 1523300157 325000.0 2 0.75 1076 2.0 \n", "\n", " waterfront view condition grade ... zipcode_98146 zipcode_98148 \\\n", "0 0 0 3 7 ... 0.0 0.0 \n", "1 0 0 3 7 ... 0.0 0.0 \n", "2 0 0 3 6 ... 0.0 0.0 \n", "3 0 0 5 7 ... 0.0 0.0 \n", "4 0 0 3 8 ... 0.0 0.0 \n", "... ... ... ... ... ... ... ... \n", "21608 0 0 3 8 ... 0.0 0.0 \n", "21609 0 0 3 8 ... 1.0 0.0 \n", "21610 0 0 3 7 ... 0.0 0.0 \n", "21611 0 0 3 8 ... 0.0 0.0 \n", "21612 0 0 3 7 ... 0.0 0.0 \n", "\n", " zipcode_98155 zipcode_98166 zipcode_98168 zipcode_98177 \\\n", "0 0.0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... \n", "21608 0.0 0.0 0.0 0.0 \n", "21609 0.0 0.0 0.0 0.0 \n", "21610 0.0 0.0 0.0 0.0 \n", "21611 0.0 0.0 0.0 0.0 \n", "21612 0.0 0.0 0.0 0.0 \n", "\n", " zipcode_98178 zipcode_98188 zipcode_98198 zipcode_98199 \n", "0 1.0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... \n", "21608 0.0 0.0 0.0 0.0 \n", "21609 0.0 0.0 0.0 0.0 \n", "21610 0.0 0.0 0.0 0.0 \n", "21611 0.0 0.0 0.0 0.0 \n", "21612 0.0 0.0 0.0 0.0 \n", "\n", "[21613 rows x 4594 columns]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "from sklearn.preprocessing import OneHotEncoder\n", "import numpy as np # type: ignore\n", "\n", "from sklearn import set_config\n", "\n", "set_config(transform_output=\"pandas\")\n", "\n", "random_state = 9\n", "\n", "df = pd.read_csv(\"data/kc_house_data.csv\", index_col=False)\n", "\n", "encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n", "\n", "encoded_values = encoder.fit_transform(df[[\"date\",\"price\",\"yr_built\", \"zipcode\"]])\n", "\n", "encoded_columns = encoder.get_feature_names_out([\"date\",\"price\", \"yr_built\", \"zipcode\"])\n", "\n", "encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n", "\n", "df = pd.concat([df, encoded_values_df], axis=1)\n", "\n", "df = df.drop(\n", " [\n", " \"yr_built\",\n", " \"date\",\n", " \"lat\",\n", " \"sqft_living15\",\n", " \"sqft_lot15\",\n", " \"zipcode\",\n", " \"sqft_basement\",\n", " \"sqft_above\",\n", " \"sqft_living\",\n", " ],\n", " axis=1,\n", ")\n", "\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Формирование выборок" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'X_train'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idpricebedroomsbathroomssqft_lotwaterfrontviewconditiongradeyr_renovated...zipcode_98146zipcode_98148zipcode_98155zipcode_98166zipcode_98168zipcode_98177zipcode_98178zipcode_98188zipcode_98198zipcode_98199
203589265880170550000.042.50595400380...0.00.00.00.00.00.00.00.00.00.0
1017198030400790000.043.506098003100...0.00.00.00.00.00.00.00.00.00.0
212143343903611615000.053.25706900390...0.00.00.00.00.00.00.00.00.00.0
81811139000215416000.021.75756000470...0.00.00.00.00.00.00.00.00.00.0
156577893805650475000.052.001020000360...0.00.00.00.00.00.00.00.01.00.0
..................................................................
145273888100133360000.031.001098800370...0.00.00.00.00.00.00.00.00.00.0
133927137800085185000.031.75908500370...0.00.00.00.00.00.00.00.00.00.0
156932402100675645000.033.75600000570...0.00.00.00.00.00.00.00.00.00.0
197163943600070400000.032.50478800380...0.00.00.00.00.00.00.00.00.00.0
76126679000720296000.032.50584500370...0.00.00.00.00.00.00.00.00.00.0
\n", "

17290 rows × 4593 columns

\n", "
" ], "text/plain": [ " id price bedrooms bathrooms sqft_lot waterfront view \\\n", "20358 9265880170 550000.0 4 2.50 5954 0 0 \n", "10171 98030400 790000.0 4 3.50 6098 0 0 \n", "21214 3343903611 615000.0 5 3.25 7069 0 0 \n", "8181 1139000215 416000.0 2 1.75 7560 0 0 \n", "15657 7893805650 475000.0 5 2.00 10200 0 0 \n", "... ... ... ... ... ... ... ... \n", "14527 3888100133 360000.0 3 1.00 10988 0 0 \n", "13392 7137800085 185000.0 3 1.75 9085 0 0 \n", "15693 2402100675 645000.0 3 3.75 6000 0 0 \n", "19716 3943600070 400000.0 3 2.50 4788 0 0 \n", "7612 6679000720 296000.0 3 2.50 5845 0 0 \n", "\n", " condition grade yr_renovated ... zipcode_98146 zipcode_98148 \\\n", "20358 3 8 0 ... 0.0 0.0 \n", "10171 3 10 0 ... 0.0 0.0 \n", "21214 3 9 0 ... 0.0 0.0 \n", "8181 4 7 0 ... 0.0 0.0 \n", "15657 3 6 0 ... 0.0 0.0 \n", "... ... ... ... ... ... ... \n", "14527 3 7 0 ... 0.0 0.0 \n", "13392 3 7 0 ... 0.0 0.0 \n", "15693 5 7 0 ... 0.0 0.0 \n", "19716 3 8 0 ... 0.0 0.0 \n", "7612 3 7 0 ... 0.0 0.0 \n", "\n", " zipcode_98155 zipcode_98166 zipcode_98168 zipcode_98177 \\\n", "20358 0.0 0.0 0.0 0.0 \n", "10171 0.0 0.0 0.0 0.0 \n", "21214 0.0 0.0 0.0 0.0 \n", "8181 0.0 0.0 0.0 0.0 \n", "15657 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... \n", "14527 0.0 0.0 0.0 0.0 \n", "13392 0.0 0.0 0.0 0.0 \n", "15693 0.0 0.0 0.0 0.0 \n", "19716 0.0 0.0 0.0 0.0 \n", "7612 0.0 0.0 0.0 0.0 \n", "\n", " zipcode_98178 zipcode_98188 zipcode_98198 zipcode_98199 \n", "20358 0.0 0.0 0.0 0.0 \n", "10171 0.0 0.0 0.0 0.0 \n", "21214 0.0 0.0 0.0 0.0 \n", "8181 0.0 0.0 0.0 0.0 \n", "15657 0.0 0.0 1.0 0.0 \n", "... ... ... ... ... \n", "14527 0.0 0.0 0.0 0.0 \n", "13392 0.0 0.0 0.0 0.0 \n", "15693 0.0 0.0 0.0 0.0 \n", "19716 0.0 0.0 0.0 0.0 \n", "7612 0.0 0.0 0.0 0.0 \n", "\n", "[17290 rows x 4593 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'y_train'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
floors
203582.0
101712.0
212142.0
81811.5
156571.0
......
145271.0
133921.0
156932.0
197162.0
76122.0
\n", "

17290 rows × 1 columns

\n", "
" ], "text/plain": [ " floors\n", "20358 2.0\n", "10171 2.0\n", "21214 2.0\n", "8181 1.5\n", "15657 1.0\n", "... ...\n", "14527 1.0\n", "13392 1.0\n", "15693 2.0\n", "19716 2.0\n", "7612 2.0\n", "\n", "[17290 rows x 1 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'X_test'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idpricebedroomsbathroomssqft_lotwaterfrontviewconditiongradeyr_renovated...zipcode_98146zipcode_98148zipcode_98155zipcode_98166zipcode_98168zipcode_98177zipcode_98178zipcode_98188zipcode_98198zipcode_98199
124351217000340340000.031.00810000470...0.00.00.01.00.00.00.00.00.00.0
19037984200690299000.052.50936000470...0.00.00.00.00.00.00.00.00.00.0
66097971300020800000.052.001096000470...0.00.00.00.00.00.00.00.00.00.0
118691498303905615000.041.50324000480...0.00.00.00.00.00.00.00.00.00.0
134144402700230352500.031.50768000370...0.00.00.00.00.00.00.00.00.00.0
..................................................................
121133861500340279900.031.75662000370...0.00.00.00.00.00.00.00.00.00.0
57761870400470637800.041.75475000470...0.00.00.00.00.00.00.00.00.00.0
86714022900571385000.052.001175000370...0.00.01.00.00.00.00.00.00.00.0
2155115617500401375000.054.5013405003110...0.00.00.00.00.00.00.00.00.00.0
66341853200190612000.042.50597400380...0.00.00.00.00.00.00.00.00.00.0
\n", "

4323 rows × 4593 columns

\n", "
" ], "text/plain": [ " id price bedrooms bathrooms sqft_lot waterfront view \\\n", "12435 1217000340 340000.0 3 1.00 8100 0 0 \n", "19037 984200690 299000.0 5 2.50 9360 0 0 \n", "6609 7971300020 800000.0 5 2.00 10960 0 0 \n", "11869 1498303905 615000.0 4 1.50 3240 0 0 \n", "13414 4402700230 352500.0 3 1.50 7680 0 0 \n", "... ... ... ... ... ... ... ... \n", "12113 3861500340 279900.0 3 1.75 6620 0 0 \n", "5776 1870400470 637800.0 4 1.75 4750 0 0 \n", "8671 4022900571 385000.0 5 2.00 11750 0 0 \n", "21551 1561750040 1375000.0 5 4.50 13405 0 0 \n", "6634 1853200190 612000.0 4 2.50 5974 0 0 \n", "\n", " condition grade yr_renovated ... zipcode_98146 zipcode_98148 \\\n", "12435 4 7 0 ... 0.0 0.0 \n", "19037 4 7 0 ... 0.0 0.0 \n", "6609 4 7 0 ... 0.0 0.0 \n", "11869 4 8 0 ... 0.0 0.0 \n", "13414 3 7 0 ... 0.0 0.0 \n", "... ... ... ... ... ... ... \n", "12113 3 7 0 ... 0.0 0.0 \n", "5776 4 7 0 ... 0.0 0.0 \n", "8671 3 7 0 ... 0.0 0.0 \n", "21551 3 11 0 ... 0.0 0.0 \n", "6634 3 8 0 ... 0.0 0.0 \n", "\n", " zipcode_98155 zipcode_98166 zipcode_98168 zipcode_98177 \\\n", "12435 0.0 1.0 0.0 0.0 \n", "19037 0.0 0.0 0.0 0.0 \n", "6609 0.0 0.0 0.0 0.0 \n", "11869 0.0 0.0 0.0 0.0 \n", "13414 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... \n", "12113 0.0 0.0 0.0 0.0 \n", "5776 0.0 0.0 0.0 0.0 \n", "8671 1.0 0.0 0.0 0.0 \n", "21551 0.0 0.0 0.0 0.0 \n", "6634 0.0 0.0 0.0 0.0 \n", "\n", " zipcode_98178 zipcode_98188 zipcode_98198 zipcode_98199 \n", "12435 0.0 0.0 0.0 0.0 \n", "19037 0.0 0.0 0.0 0.0 \n", "6609 0.0 0.0 0.0 0.0 \n", "11869 0.0 0.0 0.0 0.0 \n", "13414 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... \n", "12113 0.0 0.0 0.0 0.0 \n", "5776 0.0 0.0 0.0 0.0 \n", "8671 0.0 0.0 0.0 0.0 \n", "21551 0.0 0.0 0.0 0.0 \n", "6634 0.0 0.0 0.0 0.0 \n", "\n", "[4323 rows x 4593 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'y_test'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
floors
124351.0
190371.0
66091.0
118691.5
134141.0
......
121131.0
57761.5
86711.0
215512.0
66342.0
\n", "

4323 rows × 1 columns

\n", "
" ], "text/plain": [ " floors\n", "12435 1.0\n", "19037 1.0\n", "6609 1.0\n", "11869 1.5\n", "13414 1.0\n", "... ...\n", "12113 1.0\n", "5776 1.5\n", "8671 1.0\n", "21551 2.0\n", "6634 2.0\n", "\n", "[4323 rows x 1 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from utils import split_stratified_into_train_val_test\n", "\n", "X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n", " df,\n", " stratify_colname=\"floors\",\n", " frac_train=0.80,\n", " frac_val=0,\n", " frac_test=0.20,\n", " random_state=random_state,\n", ")\n", "\n", "X_train = X_train.drop([\"floors\"], axis=1)\n", "X_test = X_test.drop([\"floors\"], axis=1)\n", "\n", "display(\"X_train\", X_train)\n", "display(\"y_train\", y_train)\n", "\n", "display(\"X_test\", X_test)\n", "display(\"y_test\", y_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Определение перечня алгоритмов решения задачи аппроксимации (регрессии)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import make_pipeline\n", "from sklearn.preprocessing import PolynomialFeatures\n", "from sklearn import linear_model, tree, neighbors, ensemble, neural_network\n", "\n", "random_state = 9\n", "\n", "models = {\n", " \"linear\": {\"model\": linear_model.LinearRegression(n_jobs=-1)},\n", " \"linear_poly\": {\n", " \"model\": make_pipeline(\n", " PolynomialFeatures(degree=2),\n", " linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),\n", " )\n", " },\n", " \"linear_interact\": {\n", " \"model\": make_pipeline(\n", " PolynomialFeatures(interaction_only=True),\n", " linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),\n", " )\n", " },\n", " \"ridge\": {\"model\": linear_model.RidgeCV()},\n", " \"decision_tree\": {\n", " \"model\": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)\n", " },\n", " \"knn\": {\"model\": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},\n", " \"random_forest\": {\n", " \"model\": ensemble.RandomForestRegressor(\n", " max_depth=7, random_state=random_state, n_jobs=-1\n", " )\n", " },\n", " \"mlp\": {\n", " \"model\": neural_network.MLPRegressor(\n", " activation=\"tanh\",\n", " hidden_layer_sizes=(3,),\n", " max_iter=500,\n", " early_stopping=True,\n", " random_state=random_state,\n", " )\n", " },\n", "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Определение функции для стандартизации значений в столбце \"Температура\" для MLP" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "from pandas import DataFrame\n", "from sklearn import preprocessing\n", "\n", "\n", "stndart_scaler = preprocessing.StandardScaler()\n", "\n", "\n", "def std_q(df: DataFrame) -> DataFrame:\n", " df[\"price\"] = np.array(\n", " stndart_scaler.fit_transform(df[\"price\"].to_numpy().reshape(-1, 1))\n", " ).reshape(df[\"price\"].shape)\n", " df[\"bedrooms\"] = np.array(\n", " stndart_scaler.fit_transform(df[\"bedrooms\"].to_numpy().reshape(-1, 1))\n", " ).reshape(df[\"bedrooms\"].shape)\n", " return df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Обучение и оценка моделей с помощью различных алгоритмов" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model: linear\n", "Model: linear_poly\n", "Model: linear_interact\n", "Model: ridge\n", "Model: decision_tree\n", "Model: knn\n", "Model: random_forest\n", "Model: mlp\n" ] } ], "source": [ "from sklearn.decomposition import PCA\n", "import math\n", "from pandas import DataFrame\n", "from sklearn import metrics\n", "\n", "# Adding PCA to reduce dimensionality\n", "pca = PCA(n_components=100) # Adjust based on memory constraints\n", "\n", "# Transform X_train and X_test\n", "X_train_reduced = pca.fit_transform(X_train)\n", "X_test_reduced = pca.transform(X_test)\n", "\n", "for model_name in models.keys():\n", " print(f\"Model: {model_name}\")\n", "\n", " X_train_model = X_train_reduced\n", " X_test_model = X_test_reduced\n", "\n", " if model_name == \"mlp\":\n", " X_train_model = std_q(X_train)\n", " X_test_model = std_q(X_test)\n", "\n", " fitted_model = models[model_name][\"model\"].fit(\n", " X_train_model, y_train.values.ravel()\n", " )\n", " y_train_pred = fitted_model.predict(X_train_model)\n", " y_test_pred = fitted_model.predict(X_test_model)\n", " models[model_name][\"fitted\"] = fitted_model\n", " models[model_name][\"train_preds\"] = y_train_pred\n", " models[model_name][\"preds\"] = y_test_pred\n", " models[model_name][\"RMSE_train\"] = math.sqrt(\n", " metrics.mean_squared_error(y_train, y_train_pred)\n", " )\n", " models[model_name][\"RMSE_test\"] = math.sqrt(\n", " metrics.mean_squared_error(y_test, y_test_pred)\n", " )\n", " models[model_name][\"RMAE_test\"] = math.sqrt(\n", " metrics.mean_absolute_error(y_test, y_test_pred)\n", " )\n", " models[model_name][\"R2_test\"] = metrics.r2_score(y_test, y_test_pred)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Вывод результатов оценки" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
 RMSE_trainRMSE_testRMAE_testR2_test
random_forest0.3459510.3770870.5355640.513150
linear0.3954350.3945670.5567760.466967
decision_tree0.3709410.4196090.5467810.397161
knn0.4077750.4809280.5991050.208099
mlp0.5425580.5450980.700044-0.017329
linear_interact0.6891030.6888840.681739-0.624815
linear_poly4.9669615.1190871.798987-88.721546
ridge76554.11953677087.654118261.058309-20346113161.492393
\n" ], "text/plain": [ "" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "reg_metrics = pd.DataFrame.from_dict(models, \"index\")[\n", " [\"RMSE_train\", \"RMSE_test\", \"RMAE_test\", \"R2_test\"]\n", "]\n", "reg_metrics.sort_values(by=\"RMSE_test\").style.background_gradient(\n", " cmap=\"viridis\", low=1, high=0.3, subset=[\"RMSE_train\", \"RMSE_test\"]\n", ").background_gradient(cmap=\"plasma\", low=0.3, high=1, subset=[\"RMAE_test\", \"R2_test\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Вывод реального и \"спрогнозированного\" результата для обучающей и тестовой выборок" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Получение лучшей модели" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'random_forest'" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "best_model = str(reg_metrics.sort_values(by=\"RMSE_test\").iloc[0].name)\n", "\n", "display(best_model)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Вывод для обучающей выборки" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idpricebedroomsbathroomssqft_lotwaterfrontviewconditiongradeyr_renovated...zipcode_98155zipcode_98166zipcode_98168zipcode_98177zipcode_98178zipcode_98188zipcode_98198zipcode_98199floorsDensityPred
2035892658801700.0269340.6746422.50595400380...0.00.00.00.00.00.00.00.02.01.763424
10171980304000.6844740.6746423.506098003100...0.00.00.00.00.00.00.00.02.01.951525
2121433439036110.2050171.7428263.25706900390...0.00.00.00.00.00.00.00.02.01.802658
81811139000215-0.340193-1.4617251.75756000470...0.00.00.00.00.00.00.00.01.51.183226
156577893805650-0.1785481.7428262.001020000360...0.00.00.00.00.00.01.00.01.01.201648
\n", "

5 rows × 4595 columns

\n", "
" ], "text/plain": [ " id price bedrooms bathrooms sqft_lot waterfront view \\\n", "20358 9265880170 0.026934 0.674642 2.50 5954 0 0 \n", "10171 98030400 0.684474 0.674642 3.50 6098 0 0 \n", "21214 3343903611 0.205017 1.742826 3.25 7069 0 0 \n", "8181 1139000215 -0.340193 -1.461725 1.75 7560 0 0 \n", "15657 7893805650 -0.178548 1.742826 2.00 10200 0 0 \n", "\n", " condition grade yr_renovated ... zipcode_98155 zipcode_98166 \\\n", "20358 3 8 0 ... 0.0 0.0 \n", "10171 3 10 0 ... 0.0 0.0 \n", "21214 3 9 0 ... 0.0 0.0 \n", "8181 4 7 0 ... 0.0 0.0 \n", "15657 3 6 0 ... 0.0 0.0 \n", "\n", " zipcode_98168 zipcode_98177 zipcode_98178 zipcode_98188 \\\n", "20358 0.0 0.0 0.0 0.0 \n", "10171 0.0 0.0 0.0 0.0 \n", "21214 0.0 0.0 0.0 0.0 \n", "8181 0.0 0.0 0.0 0.0 \n", "15657 0.0 0.0 0.0 0.0 \n", "\n", " zipcode_98198 zipcode_98199 floors DensityPred \n", "20358 0.0 0.0 2.0 1.763424 \n", "10171 0.0 0.0 2.0 1.951525 \n", "21214 0.0 0.0 2.0 1.802658 \n", "8181 0.0 0.0 1.5 1.183226 \n", "15657 1.0 0.0 1.0 1.201648 \n", "\n", "[5 rows x 4595 columns]" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat(\n", " [\n", " X_train,\n", " y_train,\n", " pd.Series(\n", " models[best_model][\"train_preds\"],\n", " index=y_train.index,\n", " name=\"FloorPred\",\n", " ),\n", " ],\n", " axis=1,\n", ").head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Вывод для тестовой выборки" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TAl2O3TiO2DensityDensityPred
0300.000.01.056961.057040
1550.000.01.041581.041341
2250.050.01.084381.084063
3300.050.01.081121.080764
4350.050.01.077811.077444
\n", "
" ], "text/plain": [ " T Al2O3 TiO2 Density DensityPred\n", "0 30 0.00 0.0 1.05696 1.057040\n", "1 55 0.00 0.0 1.04158 1.041341\n", "2 25 0.05 0.0 1.08438 1.084063\n", "3 30 0.05 0.0 1.08112 1.080764\n", "4 35 0.05 0.0 1.07781 1.077444" ] }, "execution_count": 154, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat(\n", " [\n", " X_test,\n", " y_test,\n", " pd.Series(\n", " models[best_model][\"preds\"],\n", " index=y_test.index,\n", " name=\"FloorsPred\",\n", " ),\n", " ],\n", " axis=1,\n", ").head(5)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 2 }