1084 lines
123 KiB
Plaintext
1084 lines
123 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Бизнес цели:\n",
|
|||
|
"1. Оптимизация ценовой стратегии: анализ факторов, влияющих на стоимость недвижимости, чтобы помочь продавцам устанавливать конкурентоспособные цены и увеличивать прибыль.\n",
|
|||
|
"2. Улучшение инвестиционных решений: предоставление аналитики для инвесторов, чтобы они могли определить наиболее выгодные районы и типы недвижимости для вложений."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Цели технического проекта:\n",
|
|||
|
"1. Создание модели машинного обучения для прогнозирования стоимости недвижимости на основе таких характеристик, как площадь дома, количество спален и ванных комнат, расположение, возраст дома, наличие бассейна и других факторов.\n",
|
|||
|
"2. Разработка системы, которая анализирует волатильность цен (показатель изменчивости цены актива за определённый период времени) на недвижимость в разных районах, учитывая исторические данные о продажах, сезонные колебания и демографические изменения, чтобы выявить наиболее стабильные и перспективные зоны для инвестиций."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 68,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" id date price bedrooms bathrooms \\\n",
|
|||
|
"0 7129300520 20141013T000000 221900.0 3 1.00 \n",
|
|||
|
"1 6414100192 20141209T000000 538000.0 3 2.25 \n",
|
|||
|
"2 5631500400 20150225T000000 180000.0 2 1.00 \n",
|
|||
|
"3 2487200875 20141209T000000 604000.0 4 3.00 \n",
|
|||
|
"4 1954400510 20150218T000000 510000.0 3 2.00 \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"21608 263000018 20140521T000000 360000.0 3 2.50 \n",
|
|||
|
"21609 6600060120 20150223T000000 400000.0 4 2.50 \n",
|
|||
|
"21610 1523300141 20140623T000000 402101.0 2 0.75 \n",
|
|||
|
"21611 291310100 20150116T000000 400000.0 3 2.50 \n",
|
|||
|
"21612 1523300157 20141015T000000 325000.0 2 0.75 \n",
|
|||
|
"\n",
|
|||
|
" sqft_living sqft_lot floors waterfront view ... grade \\\n",
|
|||
|
"0 1180 5650 1.0 0 0 ... 7 \n",
|
|||
|
"1 2570 7242 2.0 0 0 ... 7 \n",
|
|||
|
"2 770 10000 1.0 0 0 ... 6 \n",
|
|||
|
"3 1960 5000 1.0 0 0 ... 7 \n",
|
|||
|
"4 1680 8080 1.0 0 0 ... 8 \n",
|
|||
|
"... ... ... ... ... ... ... ... \n",
|
|||
|
"21608 1530 1131 3.0 0 0 ... 8 \n",
|
|||
|
"21609 2310 5813 2.0 0 0 ... 8 \n",
|
|||
|
"21610 1020 1350 2.0 0 0 ... 7 \n",
|
|||
|
"21611 1600 2388 2.0 0 0 ... 8 \n",
|
|||
|
"21612 1020 1076 2.0 0 0 ... 7 \n",
|
|||
|
"\n",
|
|||
|
" sqft_above sqft_basement yr_built yr_renovated zipcode lat \\\n",
|
|||
|
"0 1180 0 1955 0 98178 47.5112 \n",
|
|||
|
"1 2170 400 1951 1991 98125 47.7210 \n",
|
|||
|
"2 770 0 1933 0 98028 47.7379 \n",
|
|||
|
"3 1050 910 1965 0 98136 47.5208 \n",
|
|||
|
"4 1680 0 1987 0 98074 47.6168 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"21608 1530 0 2009 0 98103 47.6993 \n",
|
|||
|
"21609 2310 0 2014 0 98146 47.5107 \n",
|
|||
|
"21610 1020 0 2009 0 98144 47.5944 \n",
|
|||
|
"21611 1600 0 2004 0 98027 47.5345 \n",
|
|||
|
"21612 1020 0 2008 0 98144 47.5941 \n",
|
|||
|
"\n",
|
|||
|
" long sqft_living15 sqft_lot15 \n",
|
|||
|
"0 -122.257 1340 5650 \n",
|
|||
|
"1 -122.319 1690 7639 \n",
|
|||
|
"2 -122.233 2720 8062 \n",
|
|||
|
"3 -122.393 1360 5000 \n",
|
|||
|
"4 -122.045 1800 7503 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"21608 -122.346 1530 1509 \n",
|
|||
|
"21609 -122.362 1830 7200 \n",
|
|||
|
"21610 -122.299 1020 2007 \n",
|
|||
|
"21611 -122.069 1410 1287 \n",
|
|||
|
"21612 -122.299 1020 1357 \n",
|
|||
|
"\n",
|
|||
|
"[21613 rows x 21 columns]\n",
|
|||
|
"0 16356\n",
|
|||
|
"1 16413\n",
|
|||
|
"2 16491\n",
|
|||
|
"3 16413\n",
|
|||
|
"4 16484\n",
|
|||
|
" ... \n",
|
|||
|
"21608 16211\n",
|
|||
|
"21609 16489\n",
|
|||
|
"21610 16244\n",
|
|||
|
"21611 16451\n",
|
|||
|
"21612 16358\n",
|
|||
|
"Name: date_numeric, Length: 21613, dtype: int64\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from imblearn.under_sampling import RandomUnderSampler\n",
|
|||
|
"\n",
|
|||
|
"df = pd.read_csv(\"data/kc_house_data.csv\")\n",
|
|||
|
"print(df)\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование даты продажи в числовой формат (кол-во дней с 01.01.1970)\n",
|
|||
|
"df['date'] = pd.to_datetime(df['date'])\n",
|
|||
|
"df['date_numeric'] = (df['date'] - pd.Timestamp('1970-01-01')).dt.days\n",
|
|||
|
"print(df['date_numeric'])\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 69,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>bathrooms_0.5</th>\n",
|
|||
|
" <th>bathrooms_0.75</th>\n",
|
|||
|
" <th>bathrooms_1.0</th>\n",
|
|||
|
" <th>bathrooms_1.25</th>\n",
|
|||
|
" <th>bathrooms_1.5</th>\n",
|
|||
|
" <th>bathrooms_1.75</th>\n",
|
|||
|
" <th>bathrooms_2.0</th>\n",
|
|||
|
" <th>bathrooms_2.25</th>\n",
|
|||
|
" <th>bathrooms_2.5</th>\n",
|
|||
|
" <th>bathrooms_2.75</th>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <th>bedrooms_3</th>\n",
|
|||
|
" <th>bedrooms_4</th>\n",
|
|||
|
" <th>bedrooms_5</th>\n",
|
|||
|
" <th>bedrooms_6</th>\n",
|
|||
|
" <th>bedrooms_7</th>\n",
|
|||
|
" <th>bedrooms_8</th>\n",
|
|||
|
" <th>bedrooms_9</th>\n",
|
|||
|
" <th>bedrooms_10</th>\n",
|
|||
|
" <th>bedrooms_11</th>\n",
|
|||
|
" <th>bedrooms_33</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21608</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21609</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21610</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21611</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21612</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>21613 rows × 41 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" bathrooms_0.5 bathrooms_0.75 bathrooms_1.0 bathrooms_1.25 \\\n",
|
|||
|
"0 0.0 0.0 1.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 1.0 0.0 \n",
|
|||
|
"3 0.0 0.0 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"21608 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21609 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21610 0.0 1.0 0.0 0.0 \n",
|
|||
|
"21611 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21612 0.0 1.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" bathrooms_1.5 bathrooms_1.75 bathrooms_2.0 bathrooms_2.25 \\\n",
|
|||
|
"0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 1.0 \n",
|
|||
|
"2 0.0 0.0 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 1.0 0.0 \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"21608 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21609 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21610 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21611 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21612 0.0 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" bathrooms_2.5 bathrooms_2.75 ... bedrooms_3 bedrooms_4 bedrooms_5 \\\n",
|
|||
|
"0 0.0 0.0 ... 1.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 ... 1.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 ... 0.0 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 ... 0.0 1.0 0.0 \n",
|
|||
|
"4 0.0 0.0 ... 1.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"21608 1.0 0.0 ... 1.0 0.0 0.0 \n",
|
|||
|
"21609 1.0 0.0 ... 0.0 1.0 0.0 \n",
|
|||
|
"21610 0.0 0.0 ... 0.0 0.0 0.0 \n",
|
|||
|
"21611 1.0 0.0 ... 1.0 0.0 0.0 \n",
|
|||
|
"21612 0.0 0.0 ... 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" bedrooms_6 bedrooms_7 bedrooms_8 bedrooms_9 bedrooms_10 \\\n",
|
|||
|
"0 0.0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"21608 0.0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21609 0.0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21610 0.0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21611 0.0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21612 0.0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" bedrooms_11 bedrooms_33 \n",
|
|||
|
"0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"21608 0.0 0.0 \n",
|
|||
|
"21609 0.0 0.0 \n",
|
|||
|
"21610 0.0 0.0 \n",
|
|||
|
"21611 0.0 0.0 \n",
|
|||
|
"21612 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
"[21613 rows x 41 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 69,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|||
|
"\n",
|
|||
|
"encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
|
|||
|
"\n",
|
|||
|
"encoded_values = encoder.fit_transform(df[[\"bathrooms\", \"bedrooms\"]])\n",
|
|||
|
"\n",
|
|||
|
"encoded_columns = encoder.get_feature_names_out([\"bathrooms\", \"bedrooms\"])\n",
|
|||
|
"\n",
|
|||
|
"encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n",
|
|||
|
"\n",
|
|||
|
"encoded_values_df"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 70,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',\n",
|
|||
|
" 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',\n",
|
|||
|
" 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',\n",
|
|||
|
" 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'date_numeric',\n",
|
|||
|
" 'price_binned'],\n",
|
|||
|
" dtype='object')\n",
|
|||
|
"Обучающая выборка: (12967, 23)\n",
|
|||
|
"price\n",
|
|||
|
"550000.0 105\n",
|
|||
|
"350000.0 104\n",
|
|||
|
"450000.0 99\n",
|
|||
|
"400000.0 96\n",
|
|||
|
"325000.0 94\n",
|
|||
|
" ... \n",
|
|||
|
"887250.0 1\n",
|
|||
|
"123000.0 1\n",
|
|||
|
"638700.0 1\n",
|
|||
|
"502500.0 1\n",
|
|||
|
"793000.0 1\n",
|
|||
|
"Name: count, Length: 2982, dtype: int64\n",
|
|||
|
"Валидационная выборка: (4322, 23)\n",
|
|||
|
"price\n",
|
|||
|
"350000.0 39\n",
|
|||
|
"450000.0 37\n",
|
|||
|
"425000.0 34\n",
|
|||
|
"325000.0 33\n",
|
|||
|
"575000.0 33\n",
|
|||
|
" ..\n",
|
|||
|
"500007.0 1\n",
|
|||
|
"129888.0 1\n",
|
|||
|
"546800.0 1\n",
|
|||
|
"378750.0 1\n",
|
|||
|
"436500.0 1\n",
|
|||
|
"Name: count, Length: 1526, dtype: int64\n",
|
|||
|
"Тестовая выборка: (4323, 23)\n",
|
|||
|
"price\n",
|
|||
|
"450000.0 36\n",
|
|||
|
"375000.0 36\n",
|
|||
|
"300000.0 34\n",
|
|||
|
"500000.0 31\n",
|
|||
|
"425000.0 30\n",
|
|||
|
" ..\n",
|
|||
|
"196700.0 1\n",
|
|||
|
"482975.0 1\n",
|
|||
|
"2238890.0 1\n",
|
|||
|
"694000.0 1\n",
|
|||
|
"1770000.0 1\n",
|
|||
|
"Name: count, Length: 1565, dtype: int64\n",
|
|||
|
"Обучающая выборка по категориям: price_binned\n",
|
|||
|
"High 4377\n",
|
|||
|
"Low 4335\n",
|
|||
|
"Medium 4255\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Валидационная выборка по категориям: price_binned\n",
|
|||
|
"High 1459\n",
|
|||
|
"Low 1445\n",
|
|||
|
"Medium 1418\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Тестовая выборка по категориям: price_binned\n",
|
|||
|
"High 1460\n",
|
|||
|
"Low 1445\n",
|
|||
|
"Medium 1418\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Функция для разбиение на выборки\n",
|
|||
|
"def split_stratified_into_train_val_test(\n",
|
|||
|
" df_input,\n",
|
|||
|
" stratify_colname=\"y\",\n",
|
|||
|
" frac_train=0.6,\n",
|
|||
|
" frac_val=0.2,\n",
|
|||
|
" frac_test=0.2,\n",
|
|||
|
" random_state=None,\n",
|
|||
|
"):\n",
|
|||
|
" if frac_train + frac_val + frac_test != 1.0:\n",
|
|||
|
" raise ValueError(\n",
|
|||
|
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
|
|||
|
" % (frac_train, frac_val, frac_test)\n",
|
|||
|
" )\n",
|
|||
|
"\n",
|
|||
|
" if stratify_colname not in df_input.columns:\n",
|
|||
|
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
|
|||
|
"\n",
|
|||
|
" X = df_input\n",
|
|||
|
" y = df_input[[stratify_colname]]\n",
|
|||
|
"\n",
|
|||
|
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
|
|||
|
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
|
|||
|
" )\n",
|
|||
|
"\n",
|
|||
|
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
|
|||
|
" df_val, df_test, y_val, y_test = train_test_split(\n",
|
|||
|
" df_temp,\n",
|
|||
|
" y_temp,\n",
|
|||
|
" stratify=y_temp,\n",
|
|||
|
" test_size=relative_frac_test,\n",
|
|||
|
" random_state=random_state,\n",
|
|||
|
" )\n",
|
|||
|
"\n",
|
|||
|
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
|
|||
|
"\n",
|
|||
|
" return df_train, df_val, df_test\n",
|
|||
|
"\n",
|
|||
|
"# Определение бинов для цены\n",
|
|||
|
"bins = [\n",
|
|||
|
" df[\"price\"].min(),\n",
|
|||
|
" df[\"price\"].quantile(0.33),\n",
|
|||
|
" df[\"price\"].quantile(0.66),\n",
|
|||
|
" df[\"price\"].max(),\n",
|
|||
|
"]\n",
|
|||
|
"labels = [\"Low\", \"Medium\", \"High\"]\n",
|
|||
|
"df[\"price_binned\"] = pd.cut(df[\"price\"], bins=bins, labels=labels)\n",
|
|||
|
"df = df.dropna()\n",
|
|||
|
"\n",
|
|||
|
"# Стратифицированное (для сохранения пропорций)\n",
|
|||
|
"# разбиение на обучающую, валидационную и тестовую выборки\n",
|
|||
|
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
|
|||
|
" df, stratify_colname=\"price_binned\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"print(df_train.columns)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
|||
|
"print(df_train[\"price\"].value_counts())\n",
|
|||
|
"\n",
|
|||
|
"print(\"Валидационная выборка: \", df_val.shape)\n",
|
|||
|
"print(df_val[\"price\"].value_counts())\n",
|
|||
|
"\n",
|
|||
|
"print(\"Тестовая выборка: \", df_test.shape)\n",
|
|||
|
"print(df_test[\"price\"].value_counts())\n",
|
|||
|
"\n",
|
|||
|
"print(\"Обучающая выборка по категориям: \", df_train[\"price_binned\"].value_counts())\n",
|
|||
|
"print(\"Валидационная выборка по категориям: \", df_val[\"price_binned\"].value_counts())\n",
|
|||
|
"print(\"Тестовая выборка по категориям: \", df_test[\"price_binned\"].value_counts())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 71,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Обучающая выборка после undersampling: (12765, 23)\n",
|
|||
|
"price_binned\n",
|
|||
|
"Low 4255\n",
|
|||
|
"Medium 4255\n",
|
|||
|
"High 4255\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"rus = RandomUnderSampler(random_state=42)\n",
|
|||
|
"X_resampled, y_resampled = rus.fit_resample(df_train, df_train[\"price_binned\"])\n",
|
|||
|
"\n",
|
|||
|
"# Создание датафрейма для результирующей выборки\n",
|
|||
|
"df_train_rus = pd.DataFrame(X_resampled)\n",
|
|||
|
"\n",
|
|||
|
"# Добавление целевой переменной в новый датафрейм\n",
|
|||
|
"df_train_rus[\"price_binned\"] = y_resampled\n",
|
|||
|
"\n",
|
|||
|
"print(\"Обучающая выборка после undersampling: \", df_train_rus.shape)\n",
|
|||
|
"print(df_train_rus[\"price_binned\"].value_counts())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 72,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" id date price bedrooms bathrooms sqft_living \\\n",
|
|||
|
"921 4046500320 2015-01-20 342000.0 3 1.75 1660 \n",
|
|||
|
"16637 6802200190 2015-01-21 222500.0 3 2.00 1450 \n",
|
|||
|
"20955 8138870530 2014-05-05 419190.0 2 2.50 1590 \n",
|
|||
|
"14806 4370700065 2015-05-04 907500.0 3 2.25 2850 \n",
|
|||
|
"20000 2767601311 2014-10-24 445000.0 3 2.50 1260 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"2640 4045800030 2015-05-11 739000.0 3 2.25 2220 \n",
|
|||
|
"16590 8635750950 2014-06-07 568500.0 4 2.50 2460 \n",
|
|||
|
"20011 9396700028 2014-07-22 358000.0 2 2.50 1278 \n",
|
|||
|
"9296 625049310 2015-03-11 587750.0 2 1.00 890 \n",
|
|||
|
"15823 5088500170 2014-10-27 435000.0 3 2.50 2530 \n",
|
|||
|
"\n",
|
|||
|
" sqft_lot floors waterfront view ... zipcode lat long \\\n",
|
|||
|
"921 16275 2.0 0 0 ... 98014 47.6903 -121.915 \n",
|
|||
|
"16637 9044 2.0 0 0 ... 98022 47.1955 -121.987 \n",
|
|||
|
"20955 1426 2.0 0 0 ... 98029 47.5441 -122.013 \n",
|
|||
|
"14806 6281 2.0 0 2 ... 98115 47.6911 -122.326 \n",
|
|||
|
"20000 1102 3.0 0 0 ... 98107 47.6750 -122.387 \n",
|
|||
|
"... ... ... ... ... ... ... ... ... \n",
|
|||
|
"2640 10530 1.0 0 0 ... 98052 47.6383 -122.098 \n",
|
|||
|
"16590 4200 2.0 0 0 ... 98074 47.6041 -122.020 \n",
|
|||
|
"20011 987 2.0 0 0 ... 98136 47.5532 -122.381 \n",
|
|||
|
"9296 4730 1.0 0 0 ... 98103 47.6876 -122.341 \n",
|
|||
|
"15823 16102 2.0 0 0 ... 98038 47.3710 -122.055 \n",
|
|||
|
"\n",
|
|||
|
" sqft_living15 sqft_lot15 date_numeric price_binned_Low \\\n",
|
|||
|
"921 1520 16275 16455 True \n",
|
|||
|
"16637 1450 9044 16456 True \n",
|
|||
|
"20955 1590 1426 16195 False \n",
|
|||
|
"14806 1680 7006 16559 False \n",
|
|||
|
"20000 1320 2500 16367 False \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"2640 2500 10014 16566 False \n",
|
|||
|
"16590 2460 4200 16228 False \n",
|
|||
|
"20011 1220 1287 16273 True \n",
|
|||
|
"9296 1330 5904 16505 False \n",
|
|||
|
"15823 2370 14957 16370 False \n",
|
|||
|
"\n",
|
|||
|
" price_binned_Medium price_binned_High Area_binned \n",
|
|||
|
"921 False False 1 \n",
|
|||
|
"16637 False False 1 \n",
|
|||
|
"20955 True False 1 \n",
|
|||
|
"14806 False True 3 \n",
|
|||
|
"20000 True False 0 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"2640 False True 2 \n",
|
|||
|
"16590 False True 2 \n",
|
|||
|
"20011 False False 0 \n",
|
|||
|
"9296 False True 0 \n",
|
|||
|
"15823 True False 2 \n",
|
|||
|
"\n",
|
|||
|
"[12967 rows x 26 columns]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Преобразование категориального столбца price_binned в дамми-переменные\n",
|
|||
|
"# Добавляем новые столбцы (Low, Medium, High) и выставляем 1 или 0 в зависмоитси от цены\n",
|
|||
|
"df_train = pd.get_dummies(df_train, columns=[\"price_binned\"])\n",
|
|||
|
"\n",
|
|||
|
"# Binning площади в 4 категории на основе квартилей\n",
|
|||
|
"# Новый столбец с со значением 0, если оно подходит первому квартилю (категории значений) и тд\n",
|
|||
|
"df_train[\"Area_binned\"] = pd.qcut(df_train[\"sqft_living\"], q=4, labels=False)\n",
|
|||
|
"\n",
|
|||
|
"# Вывод обновленного датафрейма\n",
|
|||
|
"print(df_train)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 73,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" id date price bedrooms bathrooms sqft_living \\\n",
|
|||
|
"921 4046500320 2015-01-20 -0.540558 3 1.75 -0.461829 \n",
|
|||
|
"16637 6802200190 2015-01-21 -0.863431 3 2.00 -0.693145 \n",
|
|||
|
"20955 8138870530 2014-05-05 -0.332001 2 2.50 -0.538934 \n",
|
|||
|
"14806 4370700065 2015-05-04 0.987347 3 2.25 0.848963 \n",
|
|||
|
"20000 2767601311 2014-10-24 -0.262266 3 2.50 -0.902431 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"2640 4045800030 2015-05-11 0.532083 3 2.25 0.155014 \n",
|
|||
|
"16590 8635750950 2014-06-07 0.071415 4 2.50 0.419376 \n",
|
|||
|
"20011 9396700028 2014-07-22 -0.497328 2 2.50 -0.882604 \n",
|
|||
|
"9296 625049310 2015-03-11 0.123426 2 1.00 -1.309988 \n",
|
|||
|
"15823 5088500170 2014-10-27 -0.289284 3 2.50 0.496481 \n",
|
|||
|
"\n",
|
|||
|
" sqft_lot floors waterfront view ... lat long \\\n",
|
|||
|
"921 0.051429 0.942938 -0.087265 0 ... 47.6903 -121.915 \n",
|
|||
|
"16637 -0.157341 0.942938 -0.087265 0 ... 47.1955 -121.987 \n",
|
|||
|
"20955 -0.377285 0.942938 -0.087265 0 ... 47.5441 -122.013 \n",
|
|||
|
"14806 -0.237113 0.942938 -0.087265 2 ... 47.6911 -122.326 \n",
|
|||
|
"20000 -0.386639 2.802570 -0.087265 0 ... 47.6750 -122.387 \n",
|
|||
|
"... ... ... ... ... ... ... ... \n",
|
|||
|
"2640 -0.114438 -0.916694 -0.087265 0 ... 47.6383 -122.098 \n",
|
|||
|
"16590 -0.297195 0.942938 -0.087265 0 ... 47.6041 -122.020 \n",
|
|||
|
"20011 -0.389959 0.942938 -0.087265 0 ... 47.5532 -122.381 \n",
|
|||
|
"9296 -0.281893 -0.916694 -0.087265 0 ... 47.6876 -122.341 \n",
|
|||
|
"15823 0.046434 0.942938 -0.087265 0 ... 47.3710 -122.055 \n",
|
|||
|
"\n",
|
|||
|
" sqft_living15 sqft_lot15 date_numeric price_binned_Low \\\n",
|
|||
|
"921 1520 16275 16455 True \n",
|
|||
|
"16637 1450 9044 16456 True \n",
|
|||
|
"20955 1590 1426 16195 False \n",
|
|||
|
"14806 1680 7006 16559 False \n",
|
|||
|
"20000 1320 2500 16367 False \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"2640 2500 10014 16566 False \n",
|
|||
|
"16590 2460 4200 16228 False \n",
|
|||
|
"20011 1220 1287 16273 True \n",
|
|||
|
"9296 1330 5904 16505 False \n",
|
|||
|
"15823 2370 14957 16370 False \n",
|
|||
|
"\n",
|
|||
|
" price_binned_Medium price_binned_High Area_binned Volatility \n",
|
|||
|
"921 False False 1 -0.513258 \n",
|
|||
|
"16637 False False 1 -0.535804 \n",
|
|||
|
"20955 True False 1 -0.161650 \n",
|
|||
|
"14806 False True 3 1.086076 \n",
|
|||
|
"20000 True False 0 -0.515792 \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"2640 False True 2 0.269452 \n",
|
|||
|
"16590 False True 2 0.716571 \n",
|
|||
|
"20011 False False 0 -0.492645 \n",
|
|||
|
"9296 False True 0 -1.028095 \n",
|
|||
|
"15823 True False 2 0.450047 \n",
|
|||
|
"\n",
|
|||
|
"[12967 rows x 27 columns]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"\n",
|
|||
|
"# Нормализация значений для указанных столбцов\n",
|
|||
|
"# чтобы значения разных столбов в среднем были 0, а стандартное отклонение - 1\n",
|
|||
|
"scaler = StandardScaler()\n",
|
|||
|
"df_train[[\"price\", \"sqft_living\", \"sqft_lot\", \"floors\", \"waterfront\"]] = (\n",
|
|||
|
" scaler.fit_transform(\n",
|
|||
|
" df_train[[\"price\", \"sqft_living\", \"sqft_lot\", \"floors\", \"waterfront\"]]\n",
|
|||
|
" )\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Расчет волатильности (разница между площадью дома и площадью участка)\n",
|
|||
|
"df_train[\"Volatility\"] = (\n",
|
|||
|
" df_train[\"sqft_living\"] - df_train[\"sqft_lot\"]\n",
|
|||
|
") \n",
|
|||
|
"\n",
|
|||
|
"# Вывод обновленного датафрейма\n",
|
|||
|
"print(df_train)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 74,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\TEMP_UNIVERSITY\\mai\\.venv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"c:\\TEMP_UNIVERSITY\\mai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\TEMP_UNIVERSITY\\mai\\.venv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"[<Feature: price>, <Feature: bedrooms>, <Feature: bathrooms>, <Feature: sqft_living>, <Feature: sqft_lot>, <Feature: floors>, <Feature: waterfront>, <Feature: view>, <Feature: condition>, <Feature: grade>, <Feature: sqft_above>, <Feature: sqft_basement>, <Feature: yr_built>, <Feature: yr_renovated>, <Feature: zipcode>, <Feature: lat>, <Feature: long>, <Feature: sqft_living15>, <Feature: sqft_lot15>, <Feature: date_numeric>, <Feature: price_binned>, <Feature: DAY(date)>, <Feature: MONTH(date)>, <Feature: WEEKDAY(date)>, <Feature: YEAR(date)>]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import featuretools as ft\n",
|
|||
|
"\n",
|
|||
|
"# Убеждаемся, что столбец 'date' в формате datetime\n",
|
|||
|
"df[\"date\"] = pd.to_datetime(df[\"date\"])\n",
|
|||
|
"\n",
|
|||
|
"# Установка индекса на столбец 'id' (предполагается, что 'id' уникален)\n",
|
|||
|
"df.set_index(\"id\", inplace=True)\n",
|
|||
|
"\n",
|
|||
|
"# Создание EntitySet для объединения разных датасетов для удобного использования\n",
|
|||
|
"es = ft.EntitySet(id=\"house_sales\")\n",
|
|||
|
"es = es.add_dataframe(\n",
|
|||
|
" dataframe_name=\"house_data\",\n",
|
|||
|
" dataframe=df,\n",
|
|||
|
" index=\"id\", # уникальный идентификатор\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Генерация признаков\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name=\"house_data\")\n",
|
|||
|
"\n",
|
|||
|
"# Показать определение признаков\n",
|
|||
|
"print(feature_defs)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 75,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"id int64\n",
|
|||
|
"bedrooms int64\n",
|
|||
|
"bathrooms float64\n",
|
|||
|
"sqft_living float64\n",
|
|||
|
"sqft_lot float64\n",
|
|||
|
"floors float64\n",
|
|||
|
"waterfront float64\n",
|
|||
|
"view int64\n",
|
|||
|
"condition int64\n",
|
|||
|
"grade int64\n",
|
|||
|
"sqft_above int64\n",
|
|||
|
"sqft_basement int64\n",
|
|||
|
"yr_built int64\n",
|
|||
|
"yr_renovated int64\n",
|
|||
|
"zipcode int64\n",
|
|||
|
"lat float64\n",
|
|||
|
"long float64\n",
|
|||
|
"sqft_living15 int64\n",
|
|||
|
"sqft_lot15 int64\n",
|
|||
|
"date_numeric int64\n",
|
|||
|
"price_binned_Low bool\n",
|
|||
|
"price_binned_Medium bool\n",
|
|||
|
"price_binned_High bool\n",
|
|||
|
"Area_binned int64\n",
|
|||
|
"Volatility float64\n",
|
|||
|
"dtype: object\n",
|
|||
|
"MAE: 539145.7717358439\n",
|
|||
|
"MSE: 430064692545.0895\n",
|
|||
|
"RMSE: 655793.1781782191\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.linear_model import LinearRegression\n",
|
|||
|
"from sklearn.metrics import mean_absolute_error, mean_squared_error\n",
|
|||
|
"\n",
|
|||
|
"# Копирование датафрейма для регрессионного анализа\n",
|
|||
|
"df_train_regression = df_train.copy()\n",
|
|||
|
"\n",
|
|||
|
"# Определение признаков и целевой переменной\n",
|
|||
|
"X_train = df_train_regression.drop(\n",
|
|||
|
" [\"price\", \"date\"], axis=1\n",
|
|||
|
")\n",
|
|||
|
"y_train = df_train_regression[\"price\"] # Целевая переменная\n",
|
|||
|
"X_test = df_test.drop(\n",
|
|||
|
" [\"price\", \"date\"], axis=1\n",
|
|||
|
")\n",
|
|||
|
"y_test = df_test[\"price\"] # Целевая переменная\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование категориальных признаков в дамми-переменные\n",
|
|||
|
"# (создание столбцов значений со значениями 0 или 1, если это булевой столбец)\n",
|
|||
|
"X_train_encoded = pd.get_dummies(X_train, drop_first=True)\n",
|
|||
|
"X_test_encoded = pd.get_dummies(X_test, drop_first=True)\n",
|
|||
|
"\n",
|
|||
|
"# Устранение различий в количестве столбцов между обучающей и тестовой выборками\n",
|
|||
|
"X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)\n",
|
|||
|
"\n",
|
|||
|
"# Проверка типов данных\n",
|
|||
|
"print(X_train_encoded.dtypes)\n",
|
|||
|
"\n",
|
|||
|
"# Обучение модели линейной регрессии (поиск зависимостей между признаками)\n",
|
|||
|
"model = LinearRegression()\n",
|
|||
|
"model.fit(X_train_encoded, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Предсказание цены на тестовой выборке\n",
|
|||
|
"y_pred = model.predict(X_test_encoded)\n",
|
|||
|
"\n",
|
|||
|
"# Оценка качества модели\n",
|
|||
|
"mae = mean_absolute_error(y_test, y_pred)\n",
|
|||
|
"mse = mean_squared_error(y_test, y_pred)\n",
|
|||
|
"rmse = mse**0.5 # Корень из MSE для RMSE\n",
|
|||
|
"\n",
|
|||
|
"print(\"MAE:\", mae)\n",
|
|||
|
"print(\"MSE:\", mse)\n",
|
|||
|
"print(\"RMSE:\", rmse)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 76,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Время, затраченное на обучение модели: 0.08102583885192871.\n",
|
|||
|
"Время, затраченное на предсказание: 0.003012418746948242\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Оценка скорости вычисления\n",
|
|||
|
"import time\n",
|
|||
|
"\n",
|
|||
|
"start_time = time.time()\n",
|
|||
|
"model.fit(X_train_encoded, y_train)\n",
|
|||
|
"training_time = time.time() - start_time\n",
|
|||
|
"\n",
|
|||
|
"start_time = time.time()\n",
|
|||
|
"predictions = model.predict(X_test_encoded)\n",
|
|||
|
"prediction_time = time.time() - start_time\n",
|
|||
|
"\n",
|
|||
|
"print(\n",
|
|||
|
" f\"Время, затраченное на обучение модели: {training_time}.\\nВремя, затраченное на предсказание: {prediction_time}\"\n",
|
|||
|
")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 77,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAoEAAAIsCAYAAACN0XOIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADnD0lEQVR4nOzdeVxU1f8/8Ndl31dlDcWVRVFI1JAUVMotcyk1JRU1rU/iElrJ9+O+oYWKW+pHE9Ss7JNLVoYZigtuiGJkqIgSpiiaguLCMtzfH/6cjyMwwJlBdHg9fdzHQ+7c9z1nLjPDe8655xxJlmUZRERERFSn6NV2BYiIiIjo2WMSSERERFQHMQkkIiIiqoOYBBIRERHVQUwCiYiIiOogJoFEREREdRCTQCIiIqI6iEkgERERUR3EJJCIiIioDmISSERERFQHMQkkIiIi0qIDBw6gd+/ecHFxgSRJ2LFjR6UxiYmJePnll2FsbIymTZsiLi6uxuvJJJCIiIhIi+7du4fWrVtj5cqVVTr+0qVL6NWrFzp37ozU1FRMnDgR7733Hnbv3l2j9ZRkWZZrtAQiIiKiOkqSJGzfvh19+/at8JhPP/0UP//8M/744w/lvnfeeQd5eXmIj4+vsbqxJZCIiIioEoWFhbhz547KVlhYqJVzHzlyBCEhISr7unXrhiNHjmjl/BUxqNGz03MtODgYvr6+iImJKfdxd3d3TJw4ERMnTqzyOYtvXhSuz1etpwvFFUnCRcKoFtrBSzSo7wUDhXCsT5G+UFyhBl8VDTW4vgUalGtWKhan0OB3I1gkAMBUMPieBtdIkxYA0VhjDS6SJq/D84LvG89isfcMoFl9RV+/AFAs+BrW0+C96lpSIhz7+vVvxQuuAk3+Jj0tasVGzJo1S2XfjBkzMHPmTI3Pfe3aNTg6Oqrsc3R0xJ07d/DgwQOYmppqXEZ5mATWYdu2bYOhoWFtV4OIiKhmlIp/cX5aZGQkIiIiVPYZGxtr7fy1gUlgHWZnZ1fbVSAiInohGBsb11jS5+TkhOvXr6vsu379OqysrGqsFRDgPYF1WnBwsLKrNzc3F71794apqSkaNWqEzZs3127liIiINCWXam+rQQEBAUhISFDZt2fPHgQEBNRouWwJJABAWFgYrl69in379sHQ0BDjx49Hbm5ubVeLiIhIXGnNJm8VKSgowIULF5Q/X7p0CampqbCzs0ODBg0QGRmJK1euYOPGjQCADz74ACtWrMAnn3yCkSNHYu/evfjuu+/w888/12g9mQQSzp8/j19++QXHjx9H27ZtAQBffvklvLy8arlmREREL54TJ06gc+fOyp8f30s4fPhwxMXFIScnB9nZ2crHGzVqhJ9//hkfffQRli5dipdeegnr1q1Dt27darSeTAIJ6enpMDAwQJs2bZT7PD09YWNjozausLCwzPB4vcLCF/5GWSIi0g1yDXfjViQ4OBjqpmEubzWQ4OBgnDp1qgZrVRbvCSRhUVFRsLa2VtkWLl1d29UiIiJ6pLRUe5sOYhJI8PT0RElJCVJSUpT7zp07h7y8PLVxkZGRyM/PV9k+nfBBDdeWiIiItIHdwQQPDw90794d77//PlatWgUDAwNMnDix0mHp5Q2XLy66WZNVJSIiqrpa6g5+UbAlkAAAsbGxcHFxQVBQEPr3748xY8bAwcGhtqtFREQkrlShvU0HsSWwDktMTFT+38nJCT/99JPK40OHDn3GNSIiItIitgSqxZZAIiIiojqILYFERESkm3R0VK+2MAkkrfqq9XTh2HdPzxaKi/MVL7NEEg5FsQaxopqX6AvHFgiGGmvwGarJNdIk9q5gH4eheJGQKp4SrFKiz1WTa2SqQX3vC5ZbqkHfkybPtang+0b0MwkAvtHgs1CT51oiWqYGv5uLRs9vKlFb8wS+KNgdTERERFQHPb/pOxEREZEm2B2sFpNAIiIi0k3sDlaL3cFEREREdRBbAomIiEg36egkz9rCJJCIiIh0E7uD1WJ3MBEREVEdxJZAIiIi0k0cHawWk0AiIiLSTewOVotJIBEREekmtgSqxXsCiYiIiOogtgQSERGRTpJlThGjDpNA0qoiDRY+j/MVW3A9LFV8kffrvd4TjrXpZC0Up+/bUrjMvREXhGOb2d0WitPXl4XLtGn0UDg2PsVNOLb/HCehOPnWLeEy87ZfEo49fMlFKK5hSbFwmaeNjYRj33bMEYrbd9VZuMwhY/WFY39bKnadRD+TAGDOg9PCsW9btRCOnTejgVCc4qz4Z4ucf184tsbxnkC12B2so7KysiBJElJTU2u7KkRERPQcYkugjnJzc0NOTg7q1atX21UhIiKqHRwYohaTQB1UVFQEIyMjODmJdYkRERHpBHYHq8Xu4BdAcHAwwsPDER4eDmtra9SrVw/Tpk2DLD+6V8vd3R1z5szBsGHDYGVlhTFjxpTbHXzmzBm88cYbsLKygqWlJTp27IjMzEzl4+vWrYOXlxdMTEzg6emJL7744lk/VSIiInpG2BL4gtiwYQNGjRqF48eP48SJExgzZgwaNGiA0aNHAwCio6Mxffp0zJgxo9z4K1euoFOnTggODsbevXthZWWFpKQklJSUAAA2b96M6dOnY8WKFfDz88OpU6cwevRomJubY/jw4c/seRIREWlNKUcHq8Mk8AXh5uaGJUuWQJIkeHh4IC0tDUuWLFEmgV26dMGkSZOUx2dlZanEr1y5EtbW1vj2229haGgIAGjevLny8RkzZmDRokXo378/AKBRo0b4888/sWbNmgqTwMLCQhQWFqrsK5YVMJTER/ERERFpDbuD1WJ38AvilVdegST9b/6VgIAAZGRkQKF49C3H399fbXxqaio6duyoTACfdO/ePWRmZmLUqFGwsLBQbnPnzlXpLn5aVFQUrK2tVbb4u2cEnyERERE9S2wJ1BHm5uZqHzc1Na3wsYKCAgDA2rVr0b59e5XH9PUrbtWLjIxERESEyr6NXu9XVlUiIqJng6OD1WIS+II4duyYys9Hjx5Fs2bN1CZpT2rVqhU2bNiA4uLiMq2Bjo6OcHFxwcWLFxEaGlrlOhkbG8PY2FhlH7uCiYjoucHuYLXYHfyCyM7ORkREBM6dO4dvvvkGy5cvx4QJE6ocHx4ejjt37uCdd97BiRMnkJGRgU2bNuHcuXMAgFmzZiEqKgrLli3D+fPnkZaWhtjYWCxevLimnhIREVHNKi3V3qaD2BL4ghg2bBgePHiAdu3aQV9fHxMmTMCYMWOqHG9vb4+9e/fi448/RlBQEPT19eHr64vAwEAAwHvvvQczMzN8/vnn+Pjjj2Fubg4fHx9MnDixhp4RERER1SYmgS8IQ0NDxMTEYNWqVWUee3okMPBo7sDH8wg+1qpVK+zevbvCMoYMGYIhQ4ZoXFciIqLngo624GkLk0AiIiLSSbLMeQLVYRJIWmUkV35MRUqkyo8pz/Ve7wmX6fjzOuHYwoWTKj+oHPLffwuXmVfFgUDluZ1vJhT3u55YHAA4JIt/C5c1uGO54NtkoTgTD/Wj7NVRFItX2FkurPygcmQamAiXmaMn/sfx1xxnobhGJUXCZRYevS4cW6DnJhwrqqtV88oPqoBLqfif5tzlJ4Xi6g8Sv0bFV/OFY3XdypUr8fnnn+PatWto3bo1li9fjnbt2lV4/OMev+zsbNSrVw9vv/02oqKiYGIi/l5Xh0ngCyAxMbG2q0BERPTiqcXu4C1btiAiIgKrV69G+/btERMTg27duuHcuXNwcHAoc/zXX3+NKVOmYP369ejQoQPOnz+PsLAwSJJUY4M0OTqYiIiIdJNcqr2tmhYvXozRo0djxIgR8Pb2xurVq2FmZob169eXe/zhw4cRGBiIIUOGwN3dHa+//joGDx6M48ePa3oVKsQkkIiIiKgShYWFuHPnjsr29NKpjxUVFSElJQUhISHKfXp6eggJCcGRI0fKjenQoQNSUlKUSd/Fixexa9cu9OzZU/tP5nGdauzMRERERLVJi/MElrdUalRUVLnF3rx5EwqFAo6Ojir7HR0dce3atXJjhgwZgtmzZ+PVV1+FoaEhmjRpguD
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 2 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Оценка корреляции\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"corr_matrix = df_train_regression.corr()\n",
|
|||
|
"sns.heatmap(corr_matrix, annot=False)\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": ".venv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.5"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|