1292 lines
190 KiB
Plaintext
1292 lines
190 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 106,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',\n",
|
|||
|
" 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',\n",
|
|||
|
" 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',\n",
|
|||
|
" 'lat', 'long', 'sqft_living15', 'sqft_lot15'],\n",
|
|||
|
" dtype='object')"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 106,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd;\n",
|
|||
|
"df = pd.read_csv(\"data/house_data.csv\", sep=\",\", nrows=10000)\n",
|
|||
|
"df.columns"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Определение бизнес целей:\n",
|
|||
|
"1. Прогнозирование цены недвижимости\n",
|
|||
|
"2. Оценка влияния факторов на цену недвижимости\n",
|
|||
|
"### Определение целей технического проекта:\n",
|
|||
|
"1. Построить модель, которая будет прогнозировать стоимость недвижимости на основе данных характеристик.\n",
|
|||
|
"2. Провести анализ данных для выявления факторов, которые больше всего влияют на цену\n",
|
|||
|
"### Проверка данных на пропуски"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 107,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"id 0\n",
|
|||
|
"date 0\n",
|
|||
|
"price 0\n",
|
|||
|
"bedrooms 0\n",
|
|||
|
"bathrooms 0\n",
|
|||
|
"sqft_living 0\n",
|
|||
|
"sqft_lot 0\n",
|
|||
|
"floors 0\n",
|
|||
|
"waterfront 0\n",
|
|||
|
"view 0\n",
|
|||
|
"condition 0\n",
|
|||
|
"grade 0\n",
|
|||
|
"sqft_above 0\n",
|
|||
|
"sqft_basement 0\n",
|
|||
|
"yr_built 0\n",
|
|||
|
"yr_renovated 0\n",
|
|||
|
"zipcode 0\n",
|
|||
|
"lat 0\n",
|
|||
|
"long 0\n",
|
|||
|
"sqft_living15 0\n",
|
|||
|
"sqft_lot15 0\n",
|
|||
|
"dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"id False\n",
|
|||
|
"date False\n",
|
|||
|
"price False\n",
|
|||
|
"bedrooms False\n",
|
|||
|
"bathrooms False\n",
|
|||
|
"sqft_living False\n",
|
|||
|
"sqft_lot False\n",
|
|||
|
"floors False\n",
|
|||
|
"waterfront False\n",
|
|||
|
"view False\n",
|
|||
|
"condition False\n",
|
|||
|
"grade False\n",
|
|||
|
"sqft_above False\n",
|
|||
|
"sqft_basement False\n",
|
|||
|
"yr_built False\n",
|
|||
|
"yr_renovated False\n",
|
|||
|
"zipcode False\n",
|
|||
|
"lat False\n",
|
|||
|
"long False\n",
|
|||
|
"sqft_living15 False\n",
|
|||
|
"sqft_lot15 False\n",
|
|||
|
"dtype: bool"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 107,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"for i in df.columns:\n",
|
|||
|
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
|
|||
|
" if null_rate > 0:\n",
|
|||
|
"        print(f'{i} Процент пустых значений: {null_rate:.2f}%')  # percent sign goes after the value\n",
|
|||
|
"\n",
|
|||
|
"print(df.isnull().sum())\n",
|
|||
|
"\n",
|
|||
|
"df.isnull().any()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Пустых значений нет, номинальных значений тоже."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Проверка выбросов и их устранение (замена граничными значениями)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 108,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Колонка price:\n",
|
|||
|
" Есть выбросы: Да\n",
|
|||
|
" Количество выбросов: 499\n",
|
|||
|
" Минимальное значение: 75000.0\n",
|
|||
|
" Максимальное значение: 1127312.5\n",
|
|||
|
" id date price bedrooms bathrooms sqft_living \\\n",
|
|||
|
"0 7129300520 20141013T000000 221900.0 3 1.00 1180 \n",
|
|||
|
"1 6414100192 20141209T000000 538000.0 3 2.25 2570 \n",
|
|||
|
"2 5631500400 20150225T000000 180000.0 2 1.00 770 \n",
|
|||
|
"3 2487200875 20141209T000000 604000.0 4 3.00 1960 \n",
|
|||
|
"4 1954400510 20150218T000000 510000.0 3 2.00 1680 \n",
|
|||
|
"\n",
|
|||
|
" sqft_lot floors waterfront view ... grade sqft_above sqft_basement \\\n",
|
|||
|
"0 5650 1.0 0 0 ... 7 1180 0 \n",
|
|||
|
"1 7242 2.0 0 0 ... 7 2170 400 \n",
|
|||
|
"2 10000 1.0 0 0 ... 6 770 0 \n",
|
|||
|
"3 5000 1.0 0 0 ... 7 1050 910 \n",
|
|||
|
"4 8080 1.0 0 0 ... 8 1680 0 \n",
|
|||
|
"\n",
|
|||
|
" yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
|
|||
|
"0 1955 0 98178 47.5112 -122.257 1340 \n",
|
|||
|
"1 1951 1991 98125 47.7210 -122.319 1690 \n",
|
|||
|
"2 1933 0 98028 47.7379 -122.233 2720 \n",
|
|||
|
"3 1965 0 98136 47.5208 -122.393 1360 \n",
|
|||
|
"4 1987 0 98074 47.6168 -122.045 1800 \n",
|
|||
|
"\n",
|
|||
|
" sqft_lot15 \n",
|
|||
|
"0 5650 \n",
|
|||
|
"1 7639 \n",
|
|||
|
"2 8062 \n",
|
|||
|
"3 5000 \n",
|
|||
|
"4 7503 \n",
|
|||
|
"\n",
|
|||
|
"[5 rows x 21 columns]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"numeric_columns = ['price']\n",
|
|||
|
"for column in numeric_columns:\n",
|
|||
|
" if pd.api.types.is_numeric_dtype(df[column]): # Проверяем, является ли колонка числовой\n",
|
|||
|
" q1 = df[column].quantile(0.25) # Находим 1-й квартиль (Q1)\n",
|
|||
|
" q3 = df[column].quantile(0.75) # Находим 3-й квартиль (Q3)\n",
|
|||
|
" iqr = q3 - q1 # Вычисляем межквартильный размах (IQR)\n",
|
|||
|
"\n",
|
|||
|
" # Определяем границы для выбросов\n",
|
|||
|
" lower_bound = q1 - 1.5 * iqr # Нижняя граница\n",
|
|||
|
" upper_bound = q3 + 1.5 * iqr # Верхняя граница\n",
|
|||
|
"\n",
|
|||
|
" # Подсчитываем количество выбросов\n",
|
|||
|
" outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]\n",
|
|||
|
" outlier_count = outliers.shape[0]\n",
|
|||
|
"\n",
|
|||
|
" # Устраняем выбросы: заменяем значения ниже нижней границы на саму нижнюю границу, а выше верхней — на верхнюю\n",
|
|||
|
"        df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)  # vectorized clamp to [lower_bound, upper_bound]\n",
|
|||
|
"\n",
|
|||
|
" print(f\"Колонка {column}:\")\n",
|
|||
|
" print(f\" Есть выбросы: {'Да' if outlier_count > 0 else 'Нет'}\")\n",
|
|||
|
" print(f\" Количество выбросов: {outlier_count}\")\n",
|
|||
|
" print(f\" Минимальное значение: {df[column].min()}\")\n",
|
|||
|
" print(f\" Максимальное значение: {df[column].max()}\")\n",
|
|||
|
" print(df.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Создание выборок\n",
|
|||
|
"Так как мы будем предсказывать цену, целевым признаком будет цена. "
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 109,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 6400\n",
|
|||
|
"Размер контрольной выборки: 1600\n",
|
|||
|
"Размер тестовой выборки: 2000\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"#Разделение данных на обучающую и тестовую выборки\n",
|
|||
|
"train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)\n",
|
|||
|
"#Разделение обучающей выборки на обучающую и контрольную\n",
|
|||
|
"train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_data))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_data))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_data))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Визуализация цен в выборках "
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 110,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjsAAAHHCAYAAABZbpmkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB35UlEQVR4nO3dd3hT9f4H8PdJmqR7ppu2QCkUyi6rbKGAgDhAUWS68CJ4FRS9uEAQESeoKE7AnygXUFEBmTIUCkihUGhZpRBautI90yY5vz9Kcwkt0Ja2J0nfr+fJ8zQn53vOJydJ88l3CqIoiiAiIiKyUTKpAyAiIiJqTEx2iIiIyKYx2SEiIiKbxmSHiIiIbBqTHSIiIrJpTHaIiIjIpjHZISIiIpvGZIeIiIhsGpMdIqJGkJeXhwsXLkCv10sdCjUgURSRk5OD8+fPSx0K1QGTHSKiBlBRUYF3330XXbp0gUqlgoeHB8LCwrB7926pQ7MKp06dwqZNm0z34+LisGXLFukCuk5hYSFee+01tGvXDkqlEl5eXmjbti3Onj0rdWhUS3ZSB0CNb/Xq1XjsscdM91UqFYKDgzF8+HC8/vrr8PX1lTA6Iuun0+kwfPhwHDp0CP/617+waNEiODo6Qi6XIzIyUurwrEJhYSGefvpp+Pn5wcvLC8899xxGjhyJ0aNHSxpXdnY2Bg0aBI1Gg2effRb9+vWDUqmEQqFAy5YtJY2Nao/JTjOycOFCtGrVCmVlZfj777/x+eefY+vWrTh16hQcHR2lDo/Iai1duhSHDx/G9u3bMXjwYKnDsUpRUVGmGwC0bdsWTz31lMRRAXPnzkVaWhpiYmIQEREhdThUT0x2mpGRI0eiR48eAIAnn3wSXl5e+PDDD/Hrr79iwoQJEkdHZJ30ej2WLVuGF154gYnOHdq0aRMSEhJQWlqKTp06QalUShpPZmYm1qxZg5UrVzLRsXLss9OMDRkyBACQnJwMAMjJycGLL76ITp06wdnZGa6urhg5ciROnDhRrWxZWRkWLFiAtm3bwt7eHv7+/hg7diySkpIAAJcuXYIgCDe9Xf+lsHfvXgiCgP/+97945ZVX4OfnBycnJ9x77724cuVKtXMfPnwYd999N9zc3ODo6IhBgwbhwIEDNT7HwYMH13j+BQsWVNv3+++/R2RkJBwcHODp6YlHHnmkxvPf6rldz2g0YtmyZYiIiIC9vT18fX3x9NNPIzc312y/li1b4p577ql2nlmzZlU7Zk2xv/fee9WuKVDZtDJ//ny0adMGKpUKQUFBeOmll6DT6Wq8VtcbPHgwOnbsWG37+++/D0EQcOnSJbPteXl5eP755xEUFASVSoU2bdpg6dKlMBqNpn2qrtv7779f7bgdO3as8T2xcePGm8Y4bdq0WjUjtGzZ0vT6yGQy+Pn54eGHH4ZGo7ltWQD47LPPEBERAZVKhYCAAMycORN5eXmmx8+ePYvc3Fy4uLhg0KBBcHR0hJubG+655x6cOnXKtN+ePXsgCAJ++eWXauf44YcfIAgCYmJiTDFPmzbNbJ+qa7J3717Ttr/++gsPPfQQgoODTa/x7NmzUVpaalZ2wYIF1d5La9euRdeuXWFvbw8vLy9MmDCh2jWZNm0anJ2dzbZt3LixWhwA4OzsXC1moHafq8GDB5te/w4dOiAyMhInTpyo8XNVkxs/52q1GqNHjza7/kDl52fWrFk3Pc7q1avN3t///PMPjEYjysvL0aNHj1teKwD4888/MWDAADg5OcHd3R333XcfEhMTzfapei3OnDmD8ePHw9XV1dRsV1ZWVi3e6z/ver0eo0aNgqenJxISEsz2re3/r+aKNTvNWFVi4uXlBQC4ePEiNm3ahIceegitWrVCRkYGvvjiCwwaNAgJCQkICAgAABgMBtxzzz3YvXs3HnnkETz33HMoLCzEzp07cerUKYSGhprOMWHCBIwaNcrsvPPmzasxnsWLF0MQBLz88svIzM
zEsmXLEB0djbi4ODg4OACo/GcycuRIREZGYv78+ZDJZFi1ahWGDBmCv/76C7169ap23BYtWmDJkiUAgKKiIsyYMaPGc7/++usYP348nnzySWRlZeGTTz7BwIEDcfz4cbi7u1crM336dAwYMAAA8PPPP1f7Env66adN/aX+/e9/Izk5GZ9++imOHz+OAwcOQKFQ1Hgd6iIvL8/03K5nNBpx77334u+//8b06dPRvn17xMfH46OPPsK5c+fMOoLeqZKSEgwaNAipqal4+umnERwcjIMHD2LevHlIS0vDsmXLGuxc9TVgwABMnz4dRqMRp06dwrJly3D16lX89ddftyy3YMECvPnmm4iOjsaMGTNw9uxZfP755/jnn39Mr2F2djaAyvd1WFgY3nzzTZSVlWHFihXo168f/vnnH7Rt2xaDBw9GUFAQ1q5diwceeMDsPGvXrkVoaKipCae2NmzYgJKSEsyYMQNeXl44cuQIPvnkE6SkpGDDhg03LffDDz9g0qRJ6NKlC5YsWYLs7Gx8/PHH+Pvvv3H8+HGo1eo6xXEz9flcVXn55ZfrdK7w8HC8+uqrEEURSUlJ+PDDDzFq1KhaJ7U1qXptZ82ahcjISLzzzjvIysqq8Vrt2rULI0eOROvWrbFgwQKUlpbik08+Qb9+/XDs2LFqifn48ePRsmVLLFmyBIcOHcLHH3+M3NxcfPfddzeN58knn8TevXuxc+dOdOjQwbT9Tq5zsyGSzVu1apUIQNy1a5eYlZUlXrlyRVy3bp3o5eUlOjg4iCkpKaIoimJZWZloMBjMyiYnJ4sqlUpcuHChadu3334rAhA//PDDaucyGo2mcgDE9957r9o+ERER4qBBg0z39+zZIwIQAwMDxYKCAtP29evXiwDE5cuXm44dFhYmjhgxwnQeURTFkpISsVWrVuKwYcOqnatv375ix44dTfezsrJEAOL8+fNN2y5duiTK5XJx8eLFZmXj4+NFOzu7atvPnz8vAhDXrFlj2jZ//nzx+o/TX3/9JQIQ165da1Z227Zt1baHhISIo0ePrhb7zJkzxRs/ojfG/tJLL4k+Pj5iZGSk2TX9v//7P1Emk4l//fWXWfmVK1eKAMQDBw5UO9/1Bg0aJEZERFTb/t5774kAxOTkZNO2RYsWiU5OTuK5c+fM9v3Pf/4jyuVyUaPRiKJYv/fEhg0bbhrj1KlTxZCQkFs+D1GsvL5Tp0412/boo4+Kjo6OtyyXmZkpKpVKcfjw4Wafi08//VQEIH777bdmsarValGr1Zr2O3funKhQKMRx48aZts2bN09UqVRiXl6e2Xns7OzMXtdWrVqJU6ZMMYun6jx79uwxbSspKakW95IlS0RBEMTLly+btl3//tTr9aKvr68YGhoqFhUVmfbZu3evCEB84YUXTNumTp0qOjk5mR1/w4YN1eIQRVF0cnIyu851+VwNGjTI7PXfunWrCEC8++67q30GanJjeVEUxVdeeUUEIGZmZpq2ARBnzpx50+NU/a+sen9X3e/QoYPZta56La6/Vl27dhV9fHzE7Oxs07YTJ06IMpnM7LWsei3uvfdes3M/88wzIgDxxIkTZvFWvS/mzZsnyuVycdOmTWbl6vr/q7liM1YzEh0dDW9vbwQFBeGRRx6Bs7MzfvnlFwQGBgKoHKUlk1W+JQwGA7Kzs+Hs7Ix27drh2LFjpuP89NNPUKvVePbZZ6udozZVzjczZcoUuLi4mO4/+OCD8Pf3x9atWwFUDkU9f/48Hn30UWRnZ0Or1UKr1aK4uBhDhw7F/v37zZpNgMrmNnt7+1ue9+eff4bRaMT48eNNx9RqtfDz80NYWBj27Nljtn95eTmAyut1Mxs2bICbmxuGDRtmdszIyEg4OztXO2ZFRYXZflqttlqV9o1SU1PxySef4PXXX6/W1LBhwwa0b98e4eHhZsesarq88fx3YsOGDRgwYAA8PDzMzhUdHQ2DwYD9+/eb7V9SUlLtuRoMhhqPXVhYCK
1Wa9ZsVB86nQ5arRaZmZnYuXMn/vzzTwwdOvSWZXbt2oXy8nI8//zzps8FADz11FNwdXWtNiz6scceM9WSAkBYWBj
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjsAAAHHCAYAAABZbpmkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB1A0lEQVR4nO3dd3hTdd8G8DujTfemky4KtIyWUWRvECgoAgqCoIAyVNBHUFT0UYbKEFQQcT0i4CuKoAwHguxZVqHMAm0ptHSnpbtNmuS8f5RGQstoaXqS9P5cV66rOTk555vTpLl7zm9IBEEQQERERGShpGIXQERERGRMDDtERERk0Rh2iIiIyKIx7BAREZFFY9ghIiIii8awQ0RERBaNYYeIiIgsGsMOERERWTS52AUQERFZArVajdzcXOh0Ovj6+opdDt2GZ3aIiMik/fjjj7h27Zr+/po1a5CamipeQbc5efIknnnmGXh4eEChUMDHxwdPPvmk2GXRHRh2LMiaNWsgkUj0NxsbGzRv3hzTp09HZmam2OUREdXKwYMH8eabb+LatWvYsWMHpk2bBqlU/K+vrVu3onv37rh48SI++ugj7Ny5Ezt37sQ333wjdml0B17GskDz589HcHAwysrKcOjQIXz11VfYtm0bzp8/Dzs7O7HLIyKqkRkzZqB3794IDg4GAMycORM+Pj6i1pSbm4tJkyZh4MCB2LhxI6ytrUWth+6NYccCRUVFoUOHDgCASZMmwd3dHZ9++im2bt2KMWPGiFwdEVHNhIWFITExEefPn4eHhwdCQkLELgmrV69GWVkZ1qxZw6BjBsQ/D0hG17dvXwBAUlISgIr/SN544w2Eh4fDwcEBTk5OiIqKwpkzZ6o8t6ysDHPnzkXz5s1hY2MDHx8fjBgxAomJiQCAa9euGVw6u/PWu3dv/bb27dsHiUSCX375Be+88w68vb1hb2+PoUOHIiUlpcq+jx07hkGDBsHZ2Rl2dnbo1asXDh8+XO1r7N27d7X7nzt3bpV1f/zxR0RGRsLW1hZubm4YPXp0tfu/12u7nU6nw7Jly9CqVSvY2NjAy8sLU6dOxc2bNw3WCwoKwmOPPVZlP9OnT6+yzepqX7JkSZVjCgAqlQpz5sxB06ZNoVAo4O/vjzfffBMqlaraY3W73r17o3Xr1lWWL126FBKJxKCdBADk5eXhtddeg7+/PxQKBZo2bYrFixdDp9Pp16k8bkuXLq2y3datW1f7nvj111/vWuOECRMQFBR039cSFBSk//1IpVJ4e3vj6aefRnJy8gM9d8KECQbLpkyZAhsbG+zbt89g+ZdffolWrVpBoVDA19cX06ZNQ15ensE6D3pcb6+5ulvl6779mH722WcIDAyEra0tevXqhfPnz1fZz549e9CjRw/Y29vDxcUFTzzxBOLi4u573G6/3f667/bevV1Nfu8AkJWVhRdeeAFeXl6wsbFBmzZtsHbt2mq3uWbNGtjb26NTp04ICQnBtGnTIJFIqvzO7lZT5c3KygpBQUGYNWsW1Gq1fr3KJgAnT56867Z69+5t8BqOHj2Ktm3bYsGCBfrPQ7NmzbBo0SKDzwMAaDQafPDBBwgJCYFCoUBQUBDeeeedKp/RyuP8zz//oG3btrCxsUHLli2xadMmg/Uq673983nhwgW4urriscceg0aj0S9/kM9sQ8AzOw1AZTBxd3cHAFy9ehVbtmzByJEjERwcjMzMTHzzzTfo1asXLl68qO9FoNVq8dhjj2H37t0YPXo0/vOf/6CwsBA7d+7E+fPnDf67GjNmDAYPHmyw39mzZ1dbz0cffQSJRIK33noLWVlZWLZsGfr374/Y2FjY2toCqPhjHRUVhcjISMyZMwdSqRSrV69G3759cfDgQXTs2LHKdhs3boyFCxcCAIqKivDSSy9Vu+/33nsPo0aNwqRJk5CdnY0VK1agZ8+eOH36NFxcXKo8Z8qUKejRowcAYNOmTdi8ebPB41
OnTsWaNWswceJEvPrqq0hKSsIXX3yB06dP4/Dhw7Cysqr2ONREXl6e/rXdTqfTYejQoTh06BCmTJmCFi1a4Ny5c/jss89w5coVbNmy5aH3XamkpAS9evVCamoqpk6dioCAABw5cgSzZ89Geno6li1bVmf7qq0ePXpgypQp0Ol0OH/+PJYtW4a0tDQcPHiwRtuZM2cOVq1ahV9++cXgC27u3LmYN28e+vfvj5deegmXL1/GV199hRMnTtTqd71s2TIUFRUBAOLi4rBgwQK88847aNGiBQDAwcHBYP0ffvgBhYWFmDZtGsrKyrB8+XL07dsX586dg5eXFwBg165diIqKQpMmTTB37lyUlpZixYoV6NatG06dOlVtcKw8brfXYUylpaXo3bs3EhISMH36dAQHB2Pjxo2YMGEC8vLy8J///Oeuz01ISMD//ve/Gu2v8jOsUqmwY8cOLF26FDY2Nvjggw9q/RpycnJw6NAhHDp0CM8//zwiIyOxe/duzJ49G9euXcPXX3+tX3fSpElYu3YtnnrqKbz++us4duwYFi5ciLi4uCp/T+Lj4/H000/jxRdfxPjx47F69WqMHDkS27dvx6OPPlptLSkpKRg0aBDCwsKwYcMGyOUVX+3m8JmtNwJZjNWrVwsAhF27dgnZ2dlCSkqKsH79esHd3V2wtbUVbty4IQiCIJSVlQlardbguUlJSYJCoRDmz5+vX/b9998LAIRPP/20yr50Op3+eQCEJUuWVFmnVatWQq9evfT39+7dKwAQ/Pz8hIKCAv3yDRs2CACE5cuX67fdrFkzYeDAgfr9CIIglJSUCMHBwcKjjz5aZV9du3YVWrdurb+fnZ0tABDmzJmjX3bt2jVBJpMJH330kcFzz507J8jl8irL4+PjBQDC2rVr9cvmzJkj3P6xOXjwoABAWLduncFzt2/fXmV5YGCgMGTIkCq1T5s2Tbjzo3hn7W+++abg6ekpREZGGhzT//u//xOkUqlw8OBBg+d//fXXAgDh8OHDVfZ3u169egmtWrWqsnzJkiUCACEpKUm/7IMPPhDs7e2FK1euGKz79ttvCzKZTEhOThYEoXbviY0bN961xvHjxwuBgYH3fB2CUHF8x48fb7DsmWeeEezs7Gr03G+++UYAIKxYscJgnaysLMHa2loYMGCAwefniy++EAAI33//vX5ZTY5rpcpjsXfv3iqPVR7T2z/HgiAIx44dEwAIM2bM0C9r27at4OnpKeTk5OiXnTlzRpBKpcJzzz1XZdt+fn7CxIkT71nH3d671dX4IL/3ZcuWCQCEH3/8Ub9MrVYLXbp0ERwcHPR/Hyq3uXr1av16o0aNElq3bi34+/tX+X3frabbny8IguDr6ysMHjxYf7/yb+eJEyfuuq1evXoZvIZevXoJAIS5c+carDdhwgQBgHDu3DlBEAQhNjZWACBMmjTJYL033nhDACDs2bNHvywwMFAAIPz222/6Zfn5+YKPj4/Qrl27KvUmJSUJubm5QsuWLYXQ0FBBqVQa7ONBP7MNAS9jWaD+/fujUaNG8Pf3x+jRo+Hg4IDNmzfDz88PAKBQKPQ9GbRaLXJycuDg4IDQ0FCcOnVKv53ffvsNHh4eeOWVV6rs487LLjXx3HPPwdHRUX//qaeego+PD7Zt2wYAiI2NRXx8PJ555hnk5ORAqVRCqVSiuLgY/fr1w4EDB6qcgi0rK4ONjc0997tp0ybodDqMGjVKv02lUglvb280a9YMe/fuNVi/8jS3QqG46zY3btwIZ2dnPProowbbjIyMhIODQ5VtlpeXG6ynVCpRVlZ2z7pTU1OxYsUKvPfee1X+09+4cSNatGiBsLAwg21WXrq8c/8PY+PGjejRowdcXV0N9tW/f39otVocOHDAYP2SkpIqr1Wr1Va77cLCQiiVyiqXg2pKpVJBqVQiKysLO3fuxJ49e9CvX78Hfv7WrVvx8ssvY9asWZg+fbrBY7t27YJarcZrr71m0BNo8uTJcH
Jywl9//WWwvlarrfL6S0pKHur1DRs2TP85BoCOHTuiU6dO+s9Oeno6YmNjMWHCBLi5uenXi4iIwKOPPqpf73Zqtfq
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjsAAAHHCAYAAABZbpmkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABr00lEQVR4nO3dd3hTZf8G8PtkdqZ70gEUKKsMy7AgU5QliqgIAgKyRFygqMiriAsRXkEFBw6QVxABFQURZCMbCmUWKNDSFuhI90yb5Pz+qM2P0FLakvYk6f25rlwXOTl5zjcnCbl7zvM8RxBFUQQRERGRnZJJXQARERFRXWLYISIiIrvGsENERER2jWGHiIiI7BrDDhEREdk1hh0iIiKyaww7REREZNcYdoiIiMiuMewQEVGDp9frkZaWhsTERKlLoTrAsENERHVm06ZNiImJMd3fsGEDzp49K11BN4mLi8OkSZMQEBAAlUoFPz8/REVFgRcWsD8MO2RmxYoVEATBdHNwcECLFi3w/PPPIzU1VeryiMjGnD59Gi+99BLi4uJw6NAhPPvss8jLy5O6LBw6dAhdunTBzp078cYbb2Dr1q3Ytm0bNmzYAEEQpC6PLEzgtbHoZitWrMD48ePx7rvvokmTJiguLsa+ffvwv//9D6GhoThz5gycnJykLpOIbER6ejq6deuGS5cuAQCGDRuGX375RdKaSkpK0L59e2g0Gvz9999wc3OTtB6qewqpCyDrNHDgQHTq1AkAMHHiRHh5eeGTTz7B77//jpEjR0pcHRHZCh8fH5w5c8b0h1KrVq2kLgkbN27EhQsXcP78eQadBoKnsaha+vbtCwCIj48HAGRmZuLVV19FREQEXFxcoNFoMHDgQJw8ebLCc4uLi/HOO++gRYsWcHBwQEBAAIYNG4bLly8DABISEsxOnd166927t6mt3bt3QxAE/Pzzz3jzzTfh7+8PZ2dnPPzww0hKSqqw7cOHD2PAgAFwc3ODk5MTevXqhf3791f6Gnv37l3p9t95550K6/7444+IjIyEo6MjPD09MWLEiEq3X9Vru5nRaMTixYvRpk0bODg4wM/PD1OmTEFWVpbZeo0bN8ZDDz1UYTvPP/98hTYrq33BggUV9ikA6HQ6zJkzB82aNYNarUZwcDBee+016HS6SvfVzXr37o22bdtWWL5w4UIIgoCEhASz5dnZ2Xj55ZcRHBwMtVqNZs2aYf78+TAajaZ1yvfbwoULK7Tbtm3bSj8T69evv22N48aNQ+PGje/4Who3bmx6f2QyGfz9/fHkk0/esdPqzc+r7Hbztqv7XgPAX3/9hV69esHV1RUajQadO3fG6tWrAdz+81rZZ0yv1+O9995DWFgY1Go1GjdujDfffLPC+1vd119QUIBXXnnF9B6Gh4dj4cKFFfq6lH8G1Wo1IiMj0apVq9t+Bitz82uRy+Vo1KgRJk+ejOzsbNM6tXn/Dx06hCZNmuCXX35BWFgYVCoVQkJC8Nprr6GoqKjC87/44gu0adMGarUagYGBmDZtmlkNwP9/D6Kjo9GtWzc4OjqiSZMm+Oqrr8zWK6939+7dpmXXr19H48aN0alTJ+Tn55uW3833kszxyA5VS3kw8fLyAgBcuXIFGzZswBNPPIEmTZogNTUVX3/9NXr16oVz584hMDAQAGAwGPDQQw9hx44dGDFiBF566SXk5eVh27ZtOHPmDMLCwkzbGDlyJAYNGmS23VmzZlVazwcffABBEPD6668jLS0NixcvRr9+/RATEwNHR0cAwM6dOzFw4EBERkZizpw5kMlkWL58Ofr27Yt//vkHXbp0qdBuUFAQ5s2bBwDIz8/H1KlTK932W2+9heHDh2PixIlIT0/H559/jp49e+LEiRNwd3ev8JzJkyejR48eAIBff/0Vv/32m9njU6ZMMZ1CfPHFFxEfH48lS5bgxIkT2L9/P5RKZaX7oSays7NNr+1mRqMRDz/8MPbt24fJky
ejVatWOH36NBYtWoSLFy9iw4YNd73tcoWFhejVqxeuXbuGKVOmICQkBAcOHMCsWbNw48YNLF682GLbqq0ePXpg8uTJMBqNOHPmDBYvXozr16/jn3/+ue1zFi9ebPqRio2NxYcffog333zTdBTDxcXFtG513+sVK1bgmWeeQZs2bTBr1iy4u7vjxIkT2LJlC5566inMnj0bEydOBABotVpMnz7d7HN2s4kTJ+KHH37A448/jldeeQWHDx/GvHnzEBsbW+GzeKfXL4oiHn74YezatQsTJkxAhw4dsHXrVsycORPXrl3DokWLbrufbvcZrMqjjz6KYcOGQa/X4+DBg1i2bBmKiorwv//9r0bt3CwjIwNXrlzBm2++iWHDhuGVV17BsWPHsGDBApw5cwZ//vmnKSy+8847mDt3Lvr164epU6fiwoUL+PLLL3H06NEK382srCwMGjQIw4cPx8iRI7F27VpMnToVKpUKzzzzTKW15OTkYODAgVAqldi8ebPps1Kf38sGQSS6yfLly0UA4vbt28X09HQxKSlJXLNmjejl5SU6OjqKycnJoiiKYnFxsWgwGMyeGx8fL6rVavHdd981Lfv+++9FAOInn3xSYVtGo9H0PADiggULKqzTpk0bsVevXqb7u3btEgGIjRo1EnNzc03L165dKwIQP/30U1PbzZs3F/v372/ajiiKYmFhodikSRPxgQceqLCtbt26iW3btjXdT09PFwGIc+bMMS1LSEgQ5XK5+MEHH5g99/Tp06JCoaiwPC4uTgQg/vDDD6Zlc+bMEW/+6v3zzz8iAHHVqlVmz92yZUuF5aGhoeLgwYMr1D5t2jTx1q/zrbW/9tproq+vrxgZGWm2T//3v/+JMplM/Oeff8ye/9VXX4kAxP3791fY3s169eoltmnTpsLyBQsWiADE+Ph407L33ntPdHZ2Fi9evGi27htvvCHK5XIxMTFRFMXafSbWrVt32xrHjh0rhoaGVvk6RLFs/44dO9Zs2VNPPSU6OTnd8bm31rNr164Kj1X3vc7OzhZdXV3Frl27ikVFRWbr3vx5Lle+v5YvX17hsZiYGBGAOHHiRLPlr776qghA3Llzp2lZdV7/hg0bRADi+++/b7be448/LgqCIF66dMm0rLqfwdu59fmiWPY9bd26tel+bd7/sWPHigDEcePGma1X/t3cuHGjKIqimJaWJqpUKvHBBx80+/9uyZIlIgDx+++/Ny3r1auXCED873//a1qm0+nEDh06iL6+vmJJSYlZvbt27RKLi4vF3r17i76+vmb7TRTv/ntJ5ngaiyrVr18/+Pj4IDg4GCNGjICLiwt+++03NGrUCACgVqshk5V9fAwGAzIyMuDi4oLw8HAcP37c1M4vv/wCb29vvPDCCxW2cTcjHp5++mm4urqa7j/++OMICAjA5s2bAQAxMTGIi4vDU089hYyMDGi1Wmi1WhQUFOD+++/H3r17zU6bAGWn2xwcHKrc7q+//gqj0Yjhw4eb2tRqtfD390fz5s2xa9cus/VLSkoAlO2v21m3bh3c3NzwwAMPmLUZGRkJFxeXCm2WlpaarafValFcXFxl3deuXcPnn3+Ot956y+woQ/n2W7VqhZYtW5q1WX7q8tbt341169ahR48e8PDwMNtWv379YDAYsHfvXrP1CwsLK7xWg8FQadt5eXnQarUVTi/UlE6ng1arRVpaGrZt24adO3fi/vvvv6s2y1X3vd62bRvy8vLwxhtvVPhM1vR7U/6dmDFjhtnyV155BQDw559/mi2/0+vfvHkz5HI5XnzxxQrtiaKIv/76q9I6qvoMVqX8M5CSkoJffvkFJ0+erPT9qM37P3PmTLP706dPh1wuN+2T7du3o6SkBC+//LLp/zsAmDRpEjQaTYV9p1AoMGXKFNN9lUqFKVOmIC0tDdHR0WbrGo1GPP300zh06BA2b95sdpQbqN/vZUPA01hUqaVLl6JFixZQKBTw8/NDeHi42ZfdaDTi00
8/xRdffIH4+HizH6DyU11A2emv8PBwKBSW/ag1b97c7L4gCGjWrJmpf0hcXBwAYOzYsbdtIycnBx4eHqb7Wq22Qru
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Средняя цена в обучающей выборке: 503651.6890625\n",
|
|||
|
"Средняя цена в контрольной выборке: 515548.73125\n",
|
|||
|
"Средняя цена в тестовой выборке: 502023.62175\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"def plot_price_dist(data, title):\n",
|
|||
|
" sns.histplot(data['price'], kde=True)\n",
|
|||
|
" plt.title(title)\n",
|
|||
|
" plt.xlabel('Цена')\n",
|
|||
|
" plt.ylabel('Частота')\n",
|
|||
|
" plt.show()\n",
|
|||
|
"\n",
|
|||
|
"plot_price_dist(train_data, 'Распределение цены в обучающей выборке')\n",
|
|||
|
"plot_price_dist(val_data, 'Распределение цены в контрольной выборке')\n",
|
|||
|
"plot_price_dist(test_data, 'Распределение цены в тестовой выборке')\n",
|
|||
|
"\n",
|
|||
|
"print(\"Средняя цена в обучающей выборке: \", train_data['price'].mean())\n",
|
|||
|
"print(\"Средняя цена в контрольной выборке: \", val_data['price'].mean())\n",
|
|||
|
"print(\"Средняя цена в тестовой выборке: \", test_data['price'].mean())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Преобразование целевой переменной в категории дискретизацией"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 111,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAHHCAYAAABeLEexAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABMgElEQVR4nO3deVhO+f8/8OfdvkeljRDZZZmYJClEkm0YhmHC2MbUzNh9mrFkmz6WsRtmI0bGMoaZYSayJEuWaWQX+mTCFBUVRev794dv59dxF5W44zwf13Vfl/M+73PO6yz37dlZ7lslhBAgIiIiUjAtTRdAREREpGkMRERERKR4DERERESkeAxEREREpHgMRERERKR4DERERESkeAxEREREpHgMRERERKR4DERERBqSnp6O69evIz8/X9OlUCUSQuDevXu4du2apkuhcmAgIiJ6RfLy8rBw4UK0bNkS+vr6qF69Oho0aIADBw5ourTXwoULF7Br1y5pODY2Fnv27NFcQcU8ePAA06dPR6NGjaCnpwdLS0s0bNgQcXFxmi6NykhH0wVQ1RAaGooRI0ZIw/r6+qhduza6deuGGTNmwMbGRoPVEb3+cnJy0K1bN5w4cQIfffQR5s6dCyMjI2hra8PFxUXT5b0WHjx4gLFjx8LW1haWlpb47LPP4OvrCz8/P43WlZaWBk9PTyQmJuKTTz6Bu7s79PT0oKuri7p162q0Nio7BiKSmTNnDhwdHfH48WMcPXoUa9aswR9//IELFy7AyMhI0+URvbYWLFiAkydPYu/evfDy8tJ0Oa8lNzc36QUADRs2xOjRozVcFTBlyhQkJSUhOjoazZo103Q5VEEMRCTj6+uLNm3aAABGjRoFS0tLLFmyBL/++isGDx6s4eqIXk/5+flYtmwZJk2axDD0gnbt2oVLly7h0aNHcHZ2hp6enkbruXv3LjZs2IC1a9cyDL3meA8RPVPnzp0BAAkJCQCAe/fuYfLkyXB2doaJiQnMzMzg6+uLs2fPqk37+PFjBAcHo2HDhjAwMICdnR369euH+Ph4AMCNGzegUqlKfRX/jyMyMhIqlQpbt27F559/DltbWxgbG6N37964efOm2rJPnjyJ7t27w9zcHEZGRvD09MSxY8dKXEcvL68Slx8cHKzWd9OmTXBxcYGhoSEsLCwwaNCgEpf/rHUrrrCwEMuWLUOzZs1gYGAAGxsbjB07Fvfv35f1q1u3Lnr27Km2nMDAQLV5llT7okWL1LYp8OQyzqxZs+Dk5AR9fX04ODhg6tSpyMnJKXFbFefl5YXmzZurtS9evBgqlQo3btyQtaenp2P8+PFwcHCAvr4+nJycsGDBAhQWFkp9irbb4sWL1ebbvHnzEo+Jn3/+udQahw8fXqZLFnXr1pX2j5aWFmxtbfHee+8hMTHxudMCwNdff41mzZpBX18f9vb2CAgIQHp6ujQ+Li4O9+/fh6mpKTw9PWFkZARzc3P07NkTFy5ckPodOnQIKpUKO3fuVFvG5s2boVKpEB0dLdU8fPhwWZ+ibRIZGSm1HTlyBAMGDEDt2rWlfTxhwgQ8evRINm1wcLDasRQWFoZWrVrBwMAAlpaWGDx4sNo2GT58OExMTGRtP//8s1odAGBiYqJWM1C295WXl5e0/5s2bQoXFxecPXu2xPdVSZ5+n1tZWcHPz0+2/YEn75/AwMBS5xMaGio7vk+fPo3CwkLk5uaiTZs2z9xWAHDw4EF4eHjA2NgY1apVQ58+fXD58mVZn6J9ceXKFQwcOBBmZmbSJcLHjx+r1Vv8/Z6fn48ePXrAwsICly5dkvUt6+eXUvEMET1TUXixtLQEAPzvf//Drl27MGDAADg6OuLOnTv45ptv4OnpiUuXLsHe3h4AUFBQgJ49e+LAgQMYNGgQPvvsMzx48AARERG4cOEC6tevLy1j8ODB6NGjh2y5QUFBJdYzf/58qFQqTJs2DXfv3sWyZcvg7e2N2NhYGB
oaAnjygePr6wsXFxfMmjULWlpaWL9+PTp37owjR47g7bffVptvrVq1EBISAgB4+PAhxo0bV+KyZ8yYgYEDB2LUqFFISUnBypUr0bFjR5w5cwbVqlVTm2bMmDHw8PAAAPzyyy9q/9GNHTtWun/r008/RUJCAlatWoUzZ87g2LFj0NXVLXE7lEd6erq0bsUVFhaid+/eOHr0KMaMGYMmTZrg/PnzWLp0Ka5evSq7efVFZWdnw9PTE7dv38bYsWNRu3ZtHD9+HEFBQUhKSsKyZcsqbVkV5eHhgTFjxqCwsBAXLlzAsmXL8O+//+LIkSPPnC44OBizZ8+Gt7c3xo0bh7i4OKxZswanT5+W9mFaWhqAJ8d1gwYNMHv2bDx+/BirV6+Gu7s7Tp8+jYYNG8LLywsODg4ICwvDO++8I1tOWFgY6tevL10uKqvt27cjOzsb48aNg6WlJU6dOoWVK1fi1q1b2L59e6nTbd68GUOHDkXLli0REhKCtLQ0rFixAkePHsWZM2dgZWVVrjpKU5H3VZFp06aVa1mNGzfGF198ASEE4uPjsWTJEvTo0aPMwbckRfs2MDAQLi4u+O9//4uUlJQSt9X+/fvh6+uLevXqITg4GI8ePcLKlSvh7u6Ov//+Wy28Dxw4EHXr1kVISAhOnDiBFStW4P79+9i4cWOp9YwaNQqRkZGIiIhA06ZNpfYX2c6KIYiEEOvXrxcAxP79+0VKSoq4efOm2LJli7C0tBSGhobi1q1bQgghHj9+LAoKCmTTJiQkCH19fTFnzhypbd26dQKAWLJkidqyCgsLpekAiEWLFqn1adasmfD09JSGDx06JACImjVriszMTKl927ZtAoBYvny5NO8GDRoIHx8faTlCCJGdnS0cHR1F165d1ZbVvn170bx5c2k4JSVFABCzZs2S2m7cuCG0tbXF/PnzZdOeP39e6OjoqLVfu3ZNABAbNmyQ2mbNmiWKv+WOHDkiAIiwsDDZtOHh4WrtderUEX5+fmq1BwQEiKffxk/XPnXqVGFtbS1cXFxk2/THH38UWlpa4siRI7Lp165dKwCIY8eOqS2vOE9PT9GsWTO19kWLFgkAIiEhQWqbO3euMDY2FlevXpX1/c9//iO0tbVFYmKiEKJix8T27dtLrXHYsGGiTp06z1wPIZ5s32HDhsna3n//fWFkZPTM6e7evSv09PREt27dZO+LVatWCQBi3bp1slqtrKxEamqq1O/q1atCV1dX9O/fX2oLCgoS+vr6Ij09XbYcHR0d2X51dHQU/v7+snqKlnPo0CGpLTs7W63ukJAQoVKpxD///CO1FT8+8/PzhY2Njahfv754+PCh1CcyMlIAEJMmTZLahg0bJoyNjWXz3759u1odQghhbGws287leV95enrK9v8ff/whAIju3burvQdK8vT0Qgjx+eefCwDi7t27UhsAERAQUOp8ij4ri47vouGmTZvKtnXRvii+rVq1aiWsra1FWlqa1Hb27FmhpaUl25dF+6J3796yZX/88ccCgDh79qys3qLjIigoSGhra4tdu3bJpivv55dS8ZIZyXh7e6NGjRpwcHDAoEGDYGJigp07d6JmzZoAnjx9pqX15LApKChAWloaTExM0KhRI/z999/SfHbs2AErKyt88sknassoy+nt0vj7+8PU1FQafvfdd2FnZ4c//vgDwJPHcK9du4b3338faWlpSE1NRWpqKrKystClSxdERUXJLtEATy7tGRgYPHO5v/zyCwoLCzFw4EBpnqmpqbC1tUWDBg1w6NAhWf/c3FwAT7ZXabZv3w5zc3N07dpVNk8XFxeYmJiozTMvL0/WLzU1Ve30+dNu376NlStXYsaMGWqXNbZv344mTZqgcePGsnkWXSZ9evkvYvv27fDw8ED16tVly/L29kZBQQGioqJk/bOzs9XWtaCgoMR5P3jwAKmpqbJLVBWRk5OD1NRU3L17FxERETh48CC6dOnyzGn279+P3NxcjB8/XnpfAM
Do0aNhZmam9kj4iBEjpLOtANCgQQP07t0b4eHh0vr5+/sjJydHdilw69atyM/Px9ChQ6U2a2tr3Lp167nrVXTmFAC
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import numpy as np\n",
|
|||
|
"labels = [\"low\", \"middle\", \"high\", \"very_high\"]\n",
|
|||
|
"num_bins = 4\n",
|
|||
|
"hist1, bins1 = np.histogram(train_data[\"price\"].fillna(train_data[\"price\"].median()), bins=num_bins)\n",
|
|||
|
"bins1,hist1\n",
|
|||
|
"pd.concat([train_data[\"price\"], pd.cut(train_data[\"price\"], list(bins1))], axis=1).head(10)\n",
|
|||
|
"pd.concat([train_data[\"price\"], pd.cut(train_data[\"price\"], list(bins1), labels=labels)], axis=1).head()\n",
|
|||
|
"train_data['price_category'] = pd.cut(train_data[\"price\"], list(bins1), labels=labels)\n",
|
|||
|
"test_data['price_category'] = pd.cut(test_data[\"price\"], list(bins1), labels=labels)  # bin the test prices themselves, using the train-derived edges\n",
|
|||
|
"val_data['price_category'] = pd.cut(val_data[\"price\"], list(bins1), labels=labels)  # bin the validation prices themselves, using the train-derived edges\n",
|
|||
|
"\n",
|
|||
|
"sns.countplot(x=train_data['price_category'])\n",
|
|||
|
"plt.title('Распределение цены в обучающей выборке')\n",
|
|||
|
"plt.xlabel('Категория цены')\n",
|
|||
|
"plt.ylabel('Частота')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Конструирование признаков\n",
|
|||
|
"1. Прогнозирование цен недвижимости. Цель технического проекта: Разработка модели машинного обучения для точного прогнозирования рыночной стоимости недвижимости.\n",
|
|||
|
"2. Оценка влияния факторов на цену недвижимости. Цель технического проекта: Разработка модели для анализа данных и выявления факторов, которые больше всего влияют на цену.\n",
|
|||
|
"\n",
|
|||
|
"### Конструирование признаков\n",
|
|||
|
"Унитарное кодирование - замена категориальных признаков бинарными значениями."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 112,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Столбцы train_data_encoded: ['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'price_category_low', 'price_category_middle', 'price_category_high', 'price_category_very_high']\n",
|
|||
|
"Столбцы val_data_encoded: ['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'price_category_low', 'price_category_middle', 'price_category_high', 'price_category_very_high']\n",
|
|||
|
"Столбцы test_data_encoded: ['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'price_category_low', 'price_category_middle', 'price_category_high', 'price_category_very_high']\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"categorical_features = ['price_category']\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"train_data_encoded = pd.get_dummies(train_data, columns=categorical_features)\n",
|
|||
|
"val_data_encoded = pd.get_dummies(val_data, columns=categorical_features)\n",
|
|||
|
"test_data_encoded = pd.get_dummies(test_data, columns=categorical_features)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Столбцы train_data_encoded:\", train_data_encoded.columns.tolist())\n",
|
|||
|
"print(\"Столбцы val_data_encoded:\", val_data_encoded.columns.tolist())\n",
|
|||
|
"print(\"Столбцы test_data_encoded:\", test_data_encoded.columns.tolist())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Ручной синтез\n",
|
|||
|
"Создание новых признаков на основе экспертных знаний и логики предметной области.\n",
|
|||
|
"Новый признак — цена недвижимости за квадратный фут."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 113,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th>date</th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>bedrooms</th>\n",
|
|||
|
" <th>bathrooms</th>\n",
|
|||
|
" <th>sqft_living</th>\n",
|
|||
|
" <th>sqft_lot</th>\n",
|
|||
|
" <th>floors</th>\n",
|
|||
|
" <th>waterfront</th>\n",
|
|||
|
" <th>view</th>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <th>zipcode</th>\n",
|
|||
|
" <th>lat</th>\n",
|
|||
|
" <th>long</th>\n",
|
|||
|
" <th>sqft_living15</th>\n",
|
|||
|
" <th>sqft_lot15</th>\n",
|
|||
|
" <th>price_category_low</th>\n",
|
|||
|
" <th>price_category_middle</th>\n",
|
|||
|
" <th>price_category_high</th>\n",
|
|||
|
" <th>price_category_very_high</th>\n",
|
|||
|
" <th>price_per_sqft</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6252</th>\n",
|
|||
|
" <td>1150900080</td>\n",
|
|||
|
" <td>20150427T000000</td>\n",
|
|||
|
" <td>846450.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>3710</td>\n",
|
|||
|
" <td>7491</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>98029</td>\n",
|
|||
|
" <td>47.5596</td>\n",
|
|||
|
" <td>-122.016</td>\n",
|
|||
|
" <td>3040</td>\n",
|
|||
|
" <td>7491</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>228.153639</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4684</th>\n",
|
|||
|
" <td>9268200600</td>\n",
|
|||
|
" <td>20140519T000000</td>\n",
|
|||
|
" <td>413500.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>770</td>\n",
|
|||
|
" <td>4000</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>98117</td>\n",
|
|||
|
" <td>47.6959</td>\n",
|
|||
|
" <td>-122.364</td>\n",
|
|||
|
" <td>1420</td>\n",
|
|||
|
" <td>5040</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>537.012987</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1731</th>\n",
|
|||
|
" <td>7883603750</td>\n",
|
|||
|
" <td>20141209T000000</td>\n",
|
|||
|
" <td>337000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>1.75</td>\n",
|
|||
|
" <td>1400</td>\n",
|
|||
|
" <td>6000</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>98108</td>\n",
|
|||
|
" <td>47.5283</td>\n",
|
|||
|
" <td>-122.321</td>\n",
|
|||
|
" <td>1030</td>\n",
|
|||
|
" <td>6000</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>240.714286</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4742</th>\n",
|
|||
|
" <td>3330501545</td>\n",
|
|||
|
" <td>20141201T000000</td>\n",
|
|||
|
" <td>330000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>950</td>\n",
|
|||
|
" <td>3090</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>98118</td>\n",
|
|||
|
" <td>47.5510</td>\n",
|
|||
|
" <td>-122.276</td>\n",
|
|||
|
" <td>1230</td>\n",
|
|||
|
" <td>4120</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>347.368421</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4521</th>\n",
|
|||
|
" <td>993001332</td>\n",
|
|||
|
" <td>20140903T000000</td>\n",
|
|||
|
" <td>407000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.25</td>\n",
|
|||
|
" <td>1430</td>\n",
|
|||
|
" <td>1448</td>\n",
|
|||
|
" <td>3.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>98103</td>\n",
|
|||
|
" <td>47.6916</td>\n",
|
|||
|
" <td>-122.341</td>\n",
|
|||
|
" <td>1430</td>\n",
|
|||
|
" <td>1383</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>284.615385</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6412</th>\n",
|
|||
|
" <td>925069071</td>\n",
|
|||
|
" <td>20150126T000000</td>\n",
|
|||
|
" <td>750000.0</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>3.75</td>\n",
|
|||
|
" <td>3500</td>\n",
|
|||
|
" <td>101494</td>\n",
|
|||
|
" <td>1.5</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>98053</td>\n",
|
|||
|
" <td>47.6745</td>\n",
|
|||
|
" <td>-122.054</td>\n",
|
|||
|
" <td>3250</td>\n",
|
|||
|
" <td>38636</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>214.285714</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8285</th>\n",
|
|||
|
" <td>9268200315</td>\n",
|
|||
|
" <td>20140828T000000</td>\n",
|
|||
|
" <td>456000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.00</td>\n",
|
|||
|
" <td>1870</td>\n",
|
|||
|
" <td>8442</td>\n",
|
|||
|
" <td>1.5</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>98117</td>\n",
|
|||
|
" <td>47.6964</td>\n",
|
|||
|
" <td>-122.365</td>\n",
|
|||
|
" <td>1640</td>\n",
|
|||
|
" <td>6174</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>243.850267</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7853</th>\n",
|
|||
|
" <td>1822300040</td>\n",
|
|||
|
" <td>20140507T000000</td>\n",
|
|||
|
" <td>420000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>1.50</td>\n",
|
|||
|
" <td>1040</td>\n",
|
|||
|
" <td>3500</td>\n",
|
|||
|
" <td>1.5</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>98144</td>\n",
|
|||
|
" <td>47.5880</td>\n",
|
|||
|
" <td>-122.304</td>\n",
|
|||
|
" <td>1340</td>\n",
|
|||
|
" <td>1213</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>403.846154</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1095</th>\n",
|
|||
|
" <td>7202290160</td>\n",
|
|||
|
" <td>20150114T000000</td>\n",
|
|||
|
" <td>435000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>1560</td>\n",
|
|||
|
" <td>3987</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>98053</td>\n",
|
|||
|
" <td>47.6870</td>\n",
|
|||
|
" <td>-122.043</td>\n",
|
|||
|
" <td>1600</td>\n",
|
|||
|
" <td>3152</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>278.846154</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6929</th>\n",
|
|||
|
" <td>2621700010</td>\n",
|
|||
|
" <td>20140508T000000</td>\n",
|
|||
|
" <td>569000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>2.25</td>\n",
|
|||
|
" <td>2250</td>\n",
|
|||
|
" <td>41688</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>98053</td>\n",
|
|||
|
" <td>47.6695</td>\n",
|
|||
|
" <td>-122.050</td>\n",
|
|||
|
" <td>2350</td>\n",
|
|||
|
" <td>37920</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>252.888889</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>2000 rows × 26 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" id date price bedrooms bathrooms sqft_living \\\n",
|
|||
|
"6252 1150900080 20150427T000000 846450.0 4 2.50 3710 \n",
|
|||
|
"4684 9268200600 20140519T000000 413500.0 2 1.00 770 \n",
|
|||
|
"1731 7883603750 20141209T000000 337000.0 3 1.75 1400 \n",
|
|||
|
"4742 3330501545 20141201T000000 330000.0 2 1.00 950 \n",
|
|||
|
"4521 993001332 20140903T000000 407000.0 3 2.25 1430 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"6412 925069071 20150126T000000 750000.0 5 3.75 3500 \n",
|
|||
|
"8285 9268200315 20140828T000000 456000.0 3 2.00 1870 \n",
|
|||
|
"7853 1822300040 20140507T000000 420000.0 2 1.50 1040 \n",
|
|||
|
"1095 7202290160 20150114T000000 435000.0 3 2.50 1560 \n",
|
|||
|
"6929 2621700010 20140508T000000 569000.0 4 2.25 2250 \n",
|
|||
|
"\n",
|
|||
|
" sqft_lot floors waterfront view ... zipcode lat long \\\n",
|
|||
|
"6252 7491 2.0 0 0 ... 98029 47.5596 -122.016 \n",
|
|||
|
"4684 4000 1.0 0 0 ... 98117 47.6959 -122.364 \n",
|
|||
|
"1731 6000 1.0 0 0 ... 98108 47.5283 -122.321 \n",
|
|||
|
"4742 3090 1.0 0 0 ... 98118 47.5510 -122.276 \n",
|
|||
|
"4521 1448 3.0 0 0 ... 98103 47.6916 -122.341 \n",
|
|||
|
"... ... ... ... ... ... ... ... ... \n",
|
|||
|
"6412 101494 1.5 0 0 ... 98053 47.6745 -122.054 \n",
|
|||
|
"8285 8442 1.5 0 0 ... 98117 47.6964 -122.365 \n",
|
|||
|
"7853 3500 1.5 0 0 ... 98144 47.5880 -122.304 \n",
|
|||
|
"1095 3987 2.0 0 0 ... 98053 47.6870 -122.043 \n",
|
|||
|
"6929 41688 2.0 0 0 ... 98053 47.6695 -122.050 \n",
|
|||
|
"\n",
|
|||
|
" sqft_living15 sqft_lot15 price_category_low price_category_middle \\\n",
|
|||
|
"6252 3040 7491 False False \n",
|
|||
|
"4684 1420 5040 False False \n",
|
|||
|
"1731 1030 6000 False False \n",
|
|||
|
"4742 1230 4120 False False \n",
|
|||
|
"4521 1430 1383 False False \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"6412 3250 38636 False False \n",
|
|||
|
"8285 1640 6174 False False \n",
|
|||
|
"7853 1340 1213 False False \n",
|
|||
|
"1095 1600 3152 False False \n",
|
|||
|
"6929 2350 37920 False False \n",
|
|||
|
"\n",
|
|||
|
" price_category_high price_category_very_high price_per_sqft \n",
|
|||
|
"6252 False False 228.153639 \n",
|
|||
|
"4684 False False 537.012987 \n",
|
|||
|
"1731 False False 240.714286 \n",
|
|||
|
"4742 False False 347.368421 \n",
|
|||
|
"4521 False False 284.615385 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"6412 False False 214.285714 \n",
|
|||
|
"8285 False False 243.850267 \n",
|
|||
|
"7853 False False 403.846154 \n",
|
|||
|
"1095 False False 278.846154 \n",
|
|||
|
"6929 False False 252.888889 \n",
|
|||
|
"\n",
|
|||
|
"[2000 rows x 26 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 113,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Price per square foot of living area, derived from each split's own columns.\n",
|
|||
|
"# Using the split itself (rather than the global `df`) avoids relying on index\n",
|
|||
|
"# alignment with a frame that may have changed since the split was made.\n",
|
|||
|
"# NOTE(review): this feature is computed from the target `price`; it presumably\n",
|
|||
|
"# has to be excluded from model inputs when predicting price - confirm downstream.\n",
|
|||
|
"for split_encoded in (train_data_encoded, val_data_encoded, test_data_encoded):\n",
|
|||
|
"    split_encoded['price_per_sqft'] = split_encoded['price'] / split_encoded['sqft_living']\n",
|
|||
|
"\n",
|
|||
|
"test_data_encoded"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Масштабирование признаков\n",
|
|||
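|
"Масштабирование — это приведение числовых признаков к сопоставимому масштабу. Здесь используется стандартизация (`StandardScaler`): параметры оцениваются только по обучающей выборке и затем применяются к валидационной и тестовой."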
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 114,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"\n",
|
|||
|
"numerical_features = ['bedrooms']\n",
|
|||
|
"\n",
|
|||
|
"# Estimate the scaling parameters on the training split only, then apply the\n",
|
|||
|
"# same fitted transformation to every split.\n",
|
|||
|
"scaler = StandardScaler()\n",
|
|||
|
"scaler.fit(train_data_encoded[numerical_features])\n",
|
|||
|
"for split_encoded in (train_data_encoded, val_data_encoded, test_data_encoded):\n",
|
|||
|
"    split_encoded[numerical_features] = scaler.transform(split_encoded[numerical_features])"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Конструирование признаков с применением фреймворка Featuretools"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 115,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>bedrooms</th>\n",
|
|||
|
" <th>bathrooms</th>\n",
|
|||
|
" <th>sqft_living</th>\n",
|
|||
|
" <th>sqft_lot</th>\n",
|
|||
|
" <th>floors</th>\n",
|
|||
|
" <th>waterfront</th>\n",
|
|||
|
" <th>view</th>\n",
|
|||
|
" <th>condition</th>\n",
|
|||
|
" <th>grade</th>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <th>sqft_lot15</th>\n",
|
|||
|
" <th>price_category_low</th>\n",
|
|||
|
" <th>price_category_middle</th>\n",
|
|||
|
" <th>price_category_high</th>\n",
|
|||
|
" <th>price_category_very_high</th>\n",
|
|||
|
" <th>price_per_sqft</th>\n",
|
|||
|
" <th>DAY(date)</th>\n",
|
|||
|
" <th>MONTH(date)</th>\n",
|
|||
|
" <th>WEEKDAY(date)</th>\n",
|
|||
|
" <th>YEAR(date)</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1121059105</th>\n",
|
|||
|
" <td>378500.0</td>\n",
|
|||
|
" <td>-0.382609</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>2860</td>\n",
|
|||
|
" <td>43821</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>9</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>65340</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>132.342657</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>2014</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1026069163</th>\n",
|
|||
|
" <td>630000.0</td>\n",
|
|||
|
" <td>-0.382609</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>2460</td>\n",
|
|||
|
" <td>38794</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>9</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>51400</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>256.097561</td>\n",
|
|||
|
" <td>22</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>2015</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3751601501</th>\n",
|
|||
|
" <td>382450.0</td>\n",
|
|||
|
" <td>-0.382609</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>2220</td>\n",
|
|||
|
" <td>20531</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>19249</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>172.274775</td>\n",
|
|||
|
" <td>16</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>2014</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4322300340</th>\n",
|
|||
|
" <td>265000.0</td>\n",
|
|||
|
" <td>0.706185</td>\n",
|
|||
|
" <td>1.50</td>\n",
|
|||
|
" <td>1740</td>\n",
|
|||
|
" <td>12728</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>11125</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>152.298851</td>\n",
|
|||
|
" <td>12</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2015</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7701960130</th>\n",
|
|||
|
" <td>820000.0</td>\n",
|
|||
|
" <td>-0.382609</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>2980</td>\n",
|
|||
|
" <td>18935</td>\n",
|
|||
|
" <td>1.5</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>11</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>18225</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>275.167785</td>\n",
|
|||
|
" <td>17</td>\n",
|
|||
|
" <td>10</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>2014</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8645500900</th>\n",
|
|||
|
" <td>279000.0</td>\n",
|
|||
|
" <td>0.706185</td>\n",
|
|||
|
" <td>2.00</td>\n",
|
|||
|
" <td>2200</td>\n",
|
|||
|
" <td>7700</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>7700</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>126.818182</td>\n",
|
|||
|
" <td>20</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>2014</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9528104660</th>\n",
|
|||
|
" <td>905000.0</td>\n",
|
|||
|
" <td>0.706185</td>\n",
|
|||
|
" <td>3.50</td>\n",
|
|||
|
" <td>2980</td>\n",
|
|||
|
" <td>3000</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>9</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>4545</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>303.691275</td>\n",
|
|||
|
" <td>27</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>2014</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5100402668</th>\n",
|
|||
|
" <td>495000.0</td>\n",
|
|||
|
" <td>-0.382609</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>1570</td>\n",
|
|||
|
" <td>5510</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>6380</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>315.286624</td>\n",
|
|||
|
" <td>18</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>2015</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1338300180</th>\n",
|
|||
|
" <td>1127312.5</td>\n",
|
|||
|
" <td>0.706185</td>\n",
|
|||
|
" <td>2.25</td>\n",
|
|||
|
" <td>3960</td>\n",
|
|||
|
" <td>8640</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>9</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>8640</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>284.674874</td>\n",
|
|||
|
" <td>29</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>2014</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7167000020</th>\n",
|
|||
|
" <td>792500.0</td>\n",
|
|||
|
" <td>0.706185</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>4290</td>\n",
|
|||
|
" <td>175421</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>10</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>63162</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>184.731935</td>\n",
|
|||
|
" <td>16</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2014</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>6362 rows × 28 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" price bedrooms bathrooms sqft_living sqft_lot floors \\\n",
|
|||
|
"id \n",
|
|||
|
"1121059105 378500.0 -0.382609 2.50 2860 43821 2.0 \n",
|
|||
|
"1026069163 630000.0 -0.382609 2.50 2460 38794 2.0 \n",
|
|||
|
"3751601501 382450.0 -0.382609 2.50 2220 20531 2.0 \n",
|
|||
|
"4322300340 265000.0 0.706185 1.50 1740 12728 1.0 \n",
|
|||
|
"7701960130 820000.0 -0.382609 2.50 2980 18935 1.5 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"8645500900 279000.0 0.706185 2.00 2200 7700 1.0 \n",
|
|||
|
"9528104660 905000.0 0.706185 3.50 2980 3000 2.0 \n",
|
|||
|
"5100402668 495000.0 -0.382609 1.00 1570 5510 1.0 \n",
|
|||
|
"1338300180 1127312.5 0.706185 2.25 3960 8640 2.0 \n",
|
|||
|
"7167000020 792500.0 0.706185 2.50 4290 175421 2.0 \n",
|
|||
|
"\n",
|
|||
|
" waterfront view condition grade ... sqft_lot15 \\\n",
|
|||
|
"id ... \n",
|
|||
|
"1121059105 0 0 4 9 ... 65340 \n",
|
|||
|
"1026069163 0 0 3 9 ... 51400 \n",
|
|||
|
"3751601501 0 0 3 8 ... 19249 \n",
|
|||
|
"4322300340 0 0 4 7 ... 11125 \n",
|
|||
|
"7701960130 0 0 3 11 ... 18225 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"8645500900 0 0 3 7 ... 7700 \n",
|
|||
|
"9528104660 0 0 3 9 ... 4545 \n",
|
|||
|
"5100402668 0 0 4 7 ... 6380 \n",
|
|||
|
"1338300180 0 2 3 9 ... 8640 \n",
|
|||
|
"7167000020 0 0 3 10 ... 63162 \n",
|
|||
|
"\n",
|
|||
|
" price_category_low price_category_middle price_category_high \\\n",
|
|||
|
"id \n",
|
|||
|
"1121059105 False True False \n",
|
|||
|
"1026069163 False False True \n",
|
|||
|
"3751601501 False True False \n",
|
|||
|
"4322300340 True False False \n",
|
|||
|
"7701960130 False False True \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"8645500900 True False False \n",
|
|||
|
"9528104660 False False False \n",
|
|||
|
"5100402668 False True False \n",
|
|||
|
"1338300180 False False False \n",
|
|||
|
"7167000020 False False True \n",
|
|||
|
"\n",
|
|||
|
" price_category_very_high price_per_sqft DAY(date) MONTH(date) \\\n",
|
|||
|
"id \n",
|
|||
|
"1121059105 False 132.342657 8 7 \n",
|
|||
|
"1026069163 False 256.097561 22 4 \n",
|
|||
|
"3751601501 False 172.274775 16 7 \n",
|
|||
|
"4322300340 False 152.298851 12 1 \n",
|
|||
|
"7701960130 False 275.167785 17 10 \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"8645500900 False 126.818182 20 6 \n",
|
|||
|
"9528104660 True 303.691275 27 8 \n",
|
|||
|
"5100402668 False 315.286624 18 2 \n",
|
|||
|
"1338300180 True 284.674874 29 7 \n",
|
|||
|
"7167000020 False 184.731935 16 6 \n",
|
|||
|
"\n",
|
|||
|
" WEEKDAY(date) YEAR(date) \n",
|
|||
|
"id \n",
|
|||
|
"1121059105 1 2014 \n",
|
|||
|
"1026069163 2 2015 \n",
|
|||
|
"3751601501 2 2014 \n",
|
|||
|
"4322300340 0 2015 \n",
|
|||
|
"7701960130 4 2014 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"8645500900 4 2014 \n",
|
|||
|
"9528104660 2 2014 \n",
|
|||
|
"5100402668 2 2015 \n",
|
|||
|
"1338300180 1 2014 \n",
|
|||
|
"7167000020 0 2014 \n",
|
|||
|
"\n",
|
|||
|
"[6362 rows x 28 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 115,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import featuretools as ft\n",
|
|||
|
"\n",
|
|||
|
"# `id` is used as the EntitySet index below, so keep only the first row per id.\n",
|
|||
|
"train_data_encoded = train_data_encoded.drop_duplicates(subset='id', keep='first')\n",
|
|||
|
"\n",
|
|||
|
"# Create the EntitySet.\n",
|
|||
|
"es = ft.EntitySet(id='house_data')\n",
|
|||
|
"\n",
|
|||
|
"# Register the training frame in the EntitySet, indexed by `id`.\n",
|
|||
|
"es = es.add_dataframe(dataframe_name='houses', dataframe=train_data_encoded, index='id')\n",
|
|||
|
"\n",
|
|||
|
"# Generate features. The EntitySet holds a single dataframe, so featuretools\n",
|
|||
|
"# reduces any larger depth to 1 and emits a UserWarning; request 1 explicitly.\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='houses', max_depth=1)\n",
|
|||
|
"\n",
|
|||
|
"feature_matrix"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "kernel",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.5"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|