AIM-PIbd-31-Yakovlev-M-G/lab_3/lab_3.ipynb

1363 lines
192 KiB
Plaintext
Raw Normal View History

2024-11-01 17:56:43 +04:00
{
"cells": [
{
"cell_type": "code",
2024-11-02 13:01:31 +04:00
"execution_count": 10,
2024-11-01 17:56:43 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',\n",
" 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',\n",
" 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',\n",
" 'lat', 'long', 'sqft_living15', 'sqft_lot15'],\n",
" dtype='object')"
]
},
2024-11-02 13:01:31 +04:00
"execution_count": 10,
2024-11-01 17:56:43 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd;\n",
"df = pd.read_csv(\"data/house_data.csv\", sep=\",\", nrows=10000)\n",
"df.columns"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Определение бизнес целей:\n",
"1. Прогнозирование цены недвижимости\n",
"2. Оценка влияния факторов на цену недвижимости\n",
"### Определение целей технического проекта:\n",
"1. Построить модель, которая будет прогнозировать стоимость недвижимости на основе данных характеристик.\n",
"2. Провести анализ данных для выявления факторов, которые больше всего влияют на цену\n",
"### Проверка данных на пропуски"
]
},
{
"cell_type": "code",
2024-11-02 13:01:31 +04:00
"execution_count": 11,
2024-11-01 17:56:43 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id 0\n",
"date 0\n",
"price 0\n",
"bedrooms 0\n",
"bathrooms 0\n",
"sqft_living 0\n",
"sqft_lot 0\n",
"floors 0\n",
"waterfront 0\n",
"view 0\n",
"condition 0\n",
"grade 0\n",
"sqft_above 0\n",
"sqft_basement 0\n",
"yr_built 0\n",
"yr_renovated 0\n",
"zipcode 0\n",
"lat 0\n",
"long 0\n",
"sqft_living15 0\n",
"sqft_lot15 0\n",
"dtype: int64\n"
]
},
{
"data": {
"text/plain": [
"id False\n",
"date False\n",
"price False\n",
"bedrooms False\n",
"bathrooms False\n",
"sqft_living False\n",
"sqft_lot False\n",
"floors False\n",
"waterfront False\n",
"view False\n",
"condition False\n",
"grade False\n",
"sqft_above False\n",
"sqft_basement False\n",
"yr_built False\n",
"yr_renovated False\n",
"zipcode False\n",
"lat False\n",
"long False\n",
"sqft_living15 False\n",
"sqft_lot15 False\n",
"dtype: bool"
]
},
2024-11-02 13:01:31 +04:00
"execution_count": 11,
2024-11-01 17:56:43 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
" if null_rate > 0:\n",
" print(f'{i} Процент пустых значений: %{null_rate:.2f}')\n",
"\n",
"print(df.isnull().sum())\n",
"\n",
"df.isnull().any()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Пустых значений нет, номинальных значений тоже."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Проверка выбросов, и их усреднение"
]
},
{
"cell_type": "code",
2024-11-02 13:01:31 +04:00
"execution_count": 12,
2024-11-01 17:56:43 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Колонка price:\n",
" Есть выбросы: Да\n",
" Количество выбросов: 499\n",
" Минимальное значение: 75000.0\n",
" Максимальное значение: 1127312.5\n",
" id date price bedrooms bathrooms sqft_living \\\n",
"0 7129300520 20141013T000000 221900.0 3 1.00 1180 \n",
"1 6414100192 20141209T000000 538000.0 3 2.25 2570 \n",
"2 5631500400 20150225T000000 180000.0 2 1.00 770 \n",
"3 2487200875 20141209T000000 604000.0 4 3.00 1960 \n",
"4 1954400510 20150218T000000 510000.0 3 2.00 1680 \n",
"\n",
" sqft_lot floors waterfront view ... grade sqft_above sqft_basement \\\n",
"0 5650 1.0 0 0 ... 7 1180 0 \n",
"1 7242 2.0 0 0 ... 7 2170 400 \n",
"2 10000 1.0 0 0 ... 6 770 0 \n",
"3 5000 1.0 0 0 ... 7 1050 910 \n",
"4 8080 1.0 0 0 ... 8 1680 0 \n",
"\n",
" yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
"0 1955 0 98178 47.5112 -122.257 1340 \n",
"1 1951 1991 98125 47.7210 -122.319 1690 \n",
"2 1933 0 98028 47.7379 -122.233 2720 \n",
"3 1965 0 98136 47.5208 -122.393 1360 \n",
"4 1987 0 98074 47.6168 -122.045 1800 \n",
"\n",
" sqft_lot15 \n",
"0 5650 \n",
"1 7639 \n",
"2 8062 \n",
"3 5000 \n",
"4 7503 \n",
"\n",
"[5 rows x 21 columns]\n"
]
}
],
"source": [
"numeric_columns = ['price']\n",
"for column in numeric_columns:\n",
" if pd.api.types.is_numeric_dtype(df[column]): # Проверяем, является ли колонка числовой\n",
" q1 = df[column].quantile(0.25) # Находим 1-й квартиль (Q1)\n",
" q3 = df[column].quantile(0.75) # Находим 3-й квартиль (Q3)\n",
" iqr = q3 - q1 # Вычисляем межквартильный размах (IQR)\n",
"\n",
" # Определяем границы для выбросов\n",
" lower_bound = q1 - 1.5 * iqr # Нижняя граница\n",
" upper_bound = q3 + 1.5 * iqr # Верхняя граница\n",
"\n",
" # Подсчитываем количество выбросов\n",
" outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]\n",
" outlier_count = outliers.shape[0]\n",
"\n",
" # Устраняем выбросы: заменяем значения ниже нижней границы на саму нижнюю границу, а выше верхней — на верхнюю\n",
" df[column] = df[column].apply(lambda x: lower_bound if x < lower_bound else upper_bound if x > upper_bound else x)\n",
"\n",
" print(f\"Колонка {column}:\")\n",
" print(f\" Есть выбросы: {'Да' if outlier_count > 0 else 'Нет'}\")\n",
" print(f\" Количество выбросов: {outlier_count}\")\n",
" print(f\" Минимальное значение: {df[column].min()}\")\n",
" print(f\" Максимальное значение: {df[column].max()}\")\n",
" print(df.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Создание выборок\n",
"Так как мы будет предсказывать цену, то и целевым признаком будет параметр цены. "
]
},
{
"cell_type": "code",
2024-11-02 13:01:31 +04:00
"execution_count": 13,
2024-11-01 17:56:43 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 6400\n",
"Размер контрольной выборки: 1600\n",
"Размер тестовой выборки: 2000\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"#Разделение данных на обучающую и тестовую выборки\n",
"train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)\n",
"#Разделение обучающей выборки на обучающую и контрольную\n",
"train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)\n",
"\n",
"print(\"Размер обучающей выборки:\", len(train_data))\n",
"print(\"Размер контрольной выборки:\", len(val_data))\n",
"print(\"Размер тестовой выборки:\", len(test_data))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Визуализвация цен в выборках "
]
},
{
"cell_type": "code",
2024-11-02 13:01:31 +04:00
"execution_count": 14,
2024-11-01 17:56:43 +04:00
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjsAAAHHCAYAAABZbpmkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB35UlEQVR4nO3dd3hT9f4H8PdJmqR7ppu2QCkUyi6rbKGAgDhAUWS68CJ4FRS9uEAQESeoKE7AnygXUFEBmTIUCkihUGhZpRBautI90yY5vz9Kcwkt0Ja2J0nfr+fJ8zQn53vOJydJ88l3CqIoiiAiIiKyUTKpAyAiIiJqTEx2iIiIyKYx2SEiIiKbxmSHiIiIbBqTHSIiIrJpTHaIiIjIpjHZISIiIpvGZIeIiIhsGpMdIqJGkJeXhwsXLkCv10sdCjUgURSRk5OD8+fPSx0K1QGTHSKiBlBRUYF3330XXbp0gUqlgoeHB8LCwrB7926pQ7MKp06dwqZNm0z34+LisGXLFukCuk5hYSFee+01tGvXDkqlEl5eXmjbti3Onj0rdWhUS3ZSB0CNb/Xq1XjsscdM91UqFYKDgzF8+HC8/vrr8PX1lTA6Iuun0+kwfPhwHDp0CP/617+waNEiODo6Qi6XIzIyUurwrEJhYSGefvpp+Pn5wcvLC8899xxGjhyJ0aNHSxpXdnY2Bg0aBI1Gg2effRb9+vWDUqmEQqFAy5YtJY2Nao/JTjOycOFCtGrVCmVlZfj777/x+eefY+vWrTh16hQcHR2lDo/Iai1duhSHDx/G9u3bMXjwYKnDsUpRUVGmGwC0bdsWTz31lMRRAXPnzkVaWhpiYmIQEREhdThUT0x2mpGRI0eiR48eAIAnn3wSXl5e+PDDD/Hrr79iwoQJEkdHZJ30ej2WLVuGF154gYnOHdq0aRMSEhJQWlqKTp06QalUShpPZmYm1qxZg5UrVzLRsXLss9OMDRkyBACQnJwMAMjJycGLL76ITp06wdnZGa6urhg5ciROnDhRrWxZWRkWLFiAtm3bwt7eHv7+/hg7diySkpIAAJcuXYIgCDe9Xf+lsHfvXgiCgP/+97945ZVX4OfnBycnJ9x77724cuVKtXMfPnwYd999N9zc3ODo6IhBgwbhwIEDNT7HwYMH13j+BQsWVNv3+++/R2RkJBwcHODp6YlHHnmkxvPf6rldz2g0YtmyZYiIiIC9vT18fX3x9NNPIzc312y/li1b4p577ql2nlmzZlU7Zk2xv/fee9WuKVDZtDJ//ny0adMGKpUKQUFBeOmll6DT6Wq8VtcbPHgwOnbsWG37+++/D0EQcOnSJbPteXl5eP755xEUFASVSoU2bdpg6dKlMBqNpn2qrtv7779f7bgdO3as8T2xcePGm8Y4bdq0WjUjtGzZ0vT6yGQy+Pn54eGHH4ZGo7ltWQD47LPPEBERAZVKhYCAAMycORN5eXmmx8+ePYvc3Fy4uLhg0KBBcHR0hJubG+655x6cOnXKtN+ePXsgCAJ++eWXauf44YcfIAgCYmJiTDFPmzbNbJ+qa7J3717Ttr/++gsPPfQQgoODTa/x7NmzUVpaalZ2wYIF1d5La9euRdeuXWFvbw8vLy9MmDCh2jWZNm0anJ2dzbZt3LixWhwA4OzsXC1moHafq8GDB5te/w4dOiAyMhInTpyo8XNVkxs/52q1GqNHjza7/kDl52fWrFk3Pc7q1avN3t///PMPjEYjysvL0aNHj1teKwD4888/MWDAADg5OcHd3R333XcfEhMTzfapei3OnDmD8ePHw9XV1dRsV1ZWVi3e6z/ver0eo0aNgqenJxISEsz2re3/r+aKNTvNWFVi4uXlBQC4ePEiNm3ahIceegitWrVCRkYGvvjiCwwaNAgJCQkICAgAABgMBtxzzz3YvXs3HnnkETz33HMoLCzEzp07cerUKYSGhprOMWHCBIwaNcrsvPPmzasxnsWLF0MQBLz88svIzM
zEsmXLEB0djbi4ODg4OACo/GcycuRIREZGYv78+ZDJZFi1ahWGDBmCv/76C7169ap23BYtWmDJkiUAgKKiIsyYMaPGc7/++usYP348nnzySWRlZeGTTz7BwIEDcfz4cbi7u1crM336dAwYMAAA8PPPP1f7Env66adN/aX+/e9/Izk5GZ9++imOHz+OAwcOQKFQ1Hgd6iIvL8/03K5nNBpx77334u+//8b06dPRvn17xMfH46OPPsK5c+fMOoLeqZKSEgwaNAipqal4+umnERwcjIMHD2LevHlIS0vDsmXLGuxc9TVgwABMnz4dRqMRp06dwrJly3D16lX89ddftyy3YMECvPnmm4iOjsaMGTNw9uxZfP755/jnn39Mr2F2djaAyvd1WFgY3nzzTZSVlWHFihXo168f/vnnH7Rt2xaDBw9GUFAQ1q5diwceeMDsPGvXrkVoaKipCae2NmzYgJKSEsyYMQNeXl44cuQIPvnkE6SkpGDDhg03LffDDz9g0qRJ6NKlC5YsWYLs7Gx8/PHH+Pvvv3H8+HGo1eo6xXEz9flcVXn55ZfrdK7w8HC8+uqrEEURSUlJ+PDDDzFq1KhaJ7U1qXptZ82ahcjISLzzzjvIysqq8Vrt2rULI0eOROvWrbFgwQKUlpbik08+Qb9+/XDs2LFqifn48ePRsmVLLFmyBIcOHcLHH3+M3NxcfPfddzeN58knn8TevXuxc+dOdOjQwbT9Tq5zsyGSzVu1apUIQNy1a5eYlZUlXrlyRVy3bp3o5eUlOjg4iCkpKaIoimJZWZloMBjMyiYnJ4sqlUpcuHChadu3334rAhA//PDDaucyGo2mcgDE9957r9o+ERER4qBBg0z39+zZIwIQAwMDxYKCAtP29evXiwDE5cuXm44dFhYmjhgxwnQeURTFkpISsVWrVuKwYcOqnatv375ix44dTfezsrJEAOL8+fNN2y5duiTK5XJx8eLFZmXj4+NFOzu7atvPnz8vAhDXrFlj2jZ//nzx+o/TX3/9JQIQ165da1Z227Zt1baHhISIo0ePrhb7zJkzxRs/ojfG/tJLL4k+Pj5iZGSk2TX9v//7P1Emk4l//fWXWfmVK1eKAMQDBw5UO9/1Bg0aJEZERFTb/t5774kAxOTkZNO2RYsWiU5OTuK5c+fM9v3Pf/4jyuVyUaPRiKJYv/fEhg0bbhrj1KlTxZCQkFs+D1GsvL5Tp0412/boo4+Kjo6OtyyXmZkpKpVKcfjw4Wafi08//VQEIH777bdmsarValGr1Zr2O3funKhQKMRx48aZts2bN09UqVRiXl6e2Xns7OzMXtdWrVqJU6ZMMYun6jx79uwxbSspKakW95IlS0RBEMTLly+btl3//tTr9aKvr68YGhoqFhUVmfbZu3evCEB84YUXTNumTp0qOjk5mR1/w4YN1eIQRVF0cnIyu851+VwNGjTI7PXfunWrCEC8++67q30GanJjeVEUxVdeeUUEIGZmZpq2ARBnzpx50+NU/a+sen9X3e/QoYPZta56La6/Vl27dhV9fHzE7Oxs07YTJ06IMpnM7LWsei3uvfdes3M/88wzIgDxxIkTZvFWvS/mzZsnyuVycdOmTWbl6vr/q7liM1YzEh0dDW9vbwQFBeGRRx6Bs7MzfvnlFwQGBgKoHKUlk1W+JQwGA7Kzs+Hs7Ix27drh2LFjpuP89NNPUKvVePbZZ6udozZVzjczZcoUuLi4mO4/+OCD8Pf3x9atWwFUDkU9f/48Hn30UWRnZ0Or1UKr1aK4uBhDhw7F/v37zZpNgMrmNnt7+1ue9+eff4bRaMT48eNNx9RqtfDz80NYWBj27Nljtn95eTmAyut1Mxs2bICbmxuGDRtmdszIyEg4OztXO2ZFRYXZflqttlqV9o1SU1PxySef4PXXX6/W1LBhwwa0b98e4eHhZsesarq88fx3YsOGDRgwYAA8PDzMzhUdHQ2DwYD9+/eb7V9SUlLtuRoMhhqPXVhYCK
1Wa9ZsVB86nQ5arRaZmZnYuXMn/vzzTwwdOvSWZXbt2oXy8nI8//zzps8FADz11FNwdXWtNiz6scceM9WSAkBYWBj
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjsAAAHHCAYAAABZbpmkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB1A0lEQVR4nO3dd3hTdd8G8DujTfemky4KtIyWUWRvECgoAgqCoIAyVNBHUFT0UYbKEFQQcT0i4CuKoAwHguxZVqHMAm0ptHSnpbtNmuS8f5RGQstoaXqS9P5cV66rOTk555vTpLl7zm9IBEEQQERERGShpGIXQERERGRMDDtERERk0Rh2iIiIyKIx7BAREZFFY9ghIiIii8awQ0RERBaNYYeIiIgsGsMOERERWTS52AUQERFZArVajdzcXOh0Ovj6+opdDt2GZ3aIiMik/fjjj7h27Zr+/po1a5CamipeQbc5efIknnnmGXh4eEChUMDHxwdPPvmk2GXRHRh2LMiaNWsgkUj0NxsbGzRv3hzTp09HZmam2OUREdXKwYMH8eabb+LatWvYsWMHpk2bBqlU/K+vrVu3onv37rh48SI++ugj7Ny5Ezt37sQ333wjdml0B17GskDz589HcHAwysrKcOjQIXz11VfYtm0bzp8/Dzs7O7HLIyKqkRkzZqB3794IDg4GAMycORM+Pj6i1pSbm4tJkyZh4MCB2LhxI6ytrUWth+6NYccCRUVFoUOHDgCASZMmwd3dHZ9++im2bt2KMWPGiFwdEVHNhIWFITExEefPn4eHhwdCQkLELgmrV69GWVkZ1qxZw6BjBsQ/D0hG17dvXwBAUlISgIr/SN544w2Eh4fDwcEBTk5OiIqKwpkzZ6o8t6ysDHPnzkXz5s1hY2MDHx8fjBgxAomJiQCAa9euGVw6u/PWu3dv/bb27dsHiUSCX375Be+88w68vb1hb2+PoUOHIiUlpcq+jx07hkGDBsHZ2Rl2dnbo1asXDh8+XO1r7N27d7X7nzt3bpV1f/zxR0RGRsLW1hZubm4YPXp0tfu/12u7nU6nw7Jly9CqVSvY2NjAy8sLU6dOxc2bNw3WCwoKwmOPPVZlP9OnT6+yzepqX7JkSZVjCgAqlQpz5sxB06ZNoVAo4O/vjzfffBMqlaraY3W73r17o3Xr1lWWL126FBKJxKCdBADk5eXhtddeg7+/PxQKBZo2bYrFixdDp9Pp16k8bkuXLq2y3datW1f7nvj111/vWuOECRMQFBR039cSFBSk//1IpVJ4e3vj6aefRnJy8gM9d8KECQbLpkyZAhsbG+zbt89g+ZdffolWrVpBoVDA19cX06ZNQ15ensE6D3pcb6+5ulvl6779mH722WcIDAyEra0tevXqhfPnz1fZz549e9CjRw/Y29vDxcUFTzzxBOLi4u573G6/3f667/bevV1Nfu8AkJWVhRdeeAFeXl6wsbFBmzZtsHbt2mq3uWbNGtjb26NTp04ICQnBtGnTIJFIqvzO7lZT5c3KygpBQUGYNWsW1Gq1fr3KJgAnT56867Z69+5t8BqOHj2Ktm3bYsGCBfrPQ7NmzbBo0SKDzwMAaDQafPDBBwgJCYFCoUBQUBDeeeedKp/RyuP8zz//oG3btrCxsUHLli2xadMmg/Uq673983nhwgW4urriscceg0aj0S9/kM9sQ8AzOw1AZTBxd3cHAFy9ehVbtmzByJEjERwcjMzMTHzzzTfo1asXLl68qO9FoNVq8dhjj2H37t0YPXo0/vOf/6CwsBA7d+7E+fPnDf67GjNmDAYPHmyw39mzZ1dbz0cffQSJRIK33noLWVlZWLZsGfr374/Y2FjY2toCqPhjHRUVhcjISMyZMwdSqRSrV69G3759cfDgQXTs2LHKdhs3boyFCxcCAIqKivDSSy9Vu+/33nsPo0aNwqRJk5CdnY0VK1agZ8+eOH36NFxcXKo8Z8qUKejRowcAYNOmTdi8ebPB41
OnTsWaNWswceJEvPrqq0hKSsIXX3yB06dP4/Dhw7Cysqr2ONREXl6e/rXdTqfTYejQoTh06BCmTJmCFi1a4Ny5c/jss89w5coVbNmy5aH3XamkpAS9evVCamoqpk6dioCAABw5cgSzZ89Geno6li1bVmf7qq0ePXpgypQp0Ol0OH/+PJYtW4a0tDQcPHiwRtuZM2cOVq1ahV9++cXgC27u3LmYN28e+vfvj5deegmXL1/GV199hRMnTtTqd71s2TIUFRUBAOLi4rBgwQK88847aNGiBQDAwcHBYP0ffvgBhYWFmDZtGsrKyrB8+XL07dsX586dg5eXFwBg165diIqKQpMmTTB37lyUlpZixYoV6NatG06dOlVtcKw8brfXYUylpaXo3bs3EhISMH36dAQHB2Pjxo2YMGEC8vLy8J///Oeuz01ISMD//ve/Gu2v8jOsUqmwY8cOLF26FDY2Nvjggw9q/RpycnJw6NAhHDp0CM8//zwiIyOxe/duzJ49G9euXcPXX3+tX3fSpElYu3YtnnrqKbz++us4duwYFi5ciLi4uCp/T+Lj4/H000/jxRdfxPjx47F69WqMHDkS27dvx6OPPlptLSkpKRg0aBDCwsKwYcMGyOUVX+3m8JmtNwJZjNWrVwsAhF27dgnZ2dlCSkqKsH79esHd3V2wtbUVbty4IQiCIJSVlQlardbguUlJSYJCoRDmz5+vX/b9998LAIRPP/20yr50Op3+eQCEJUuWVFmnVatWQq9evfT39+7dKwAQ/Pz8hIKCAv3yDRs2CACE5cuX67fdrFkzYeDAgfr9CIIglJSUCMHBwcKjjz5aZV9du3YVWrdurb+fnZ0tABDmzJmjX3bt2jVBJpMJH330kcFzz507J8jl8irL4+PjBQDC2rVr9cvmzJkj3P6xOXjwoABAWLduncFzt2/fXmV5YGCgMGTIkCq1T5s2Tbjzo3hn7W+++abg6ekpREZGGhzT//u//xOkUqlw8OBBg+d//fXXAgDh8OHDVfZ3u169egmtWrWqsnzJkiUCACEpKUm/7IMPPhDs7e2FK1euGKz79ttvCzKZTEhOThYEoXbviY0bN961xvHjxwuBgYH3fB2CUHF8x48fb7DsmWeeEezs7Gr03G+++UYAIKxYscJgnaysLMHa2loYMGCAwefniy++EAAI33//vX5ZTY5rpcpjsXfv3iqPVR7T2z/HgiAIx44dEwAIM2bM0C9r27at4OnpKeTk5OiXnTlzRpBKpcJzzz1XZdt+fn7CxIkT71nH3d671dX4IL/3ZcuWCQCEH3/8Ub9MrVYLXbp0ERwcHPR/Hyq3uXr1av16o0aNElq3bi34+/tX+X3frabbny8IguDr6ysMHjxYf7/yb+eJEyfuuq1evXoZvIZevXoJAIS5c+carDdhwgQBgHDu3DlBEAQhNjZWACBMmjTJYL033nhDACDs2bNHvywwMFAAIPz222/6Zfn5+YKPj4/Qrl27KvUmJSUJubm5QsuWLYXQ0FBBqVQa7ONBP7MNAS9jWaD+/fujUaNG8Pf3x+jRo+Hg4IDNmzfDz88PAKBQKPQ9GbRaLXJycuDg4IDQ0FCcOnVKv53ffvsNHh4eeOWVV6rs487LLjXx3HPPwdHRUX//qaeego+PD7Zt2wYAiI2NRXx8PJ555hnk5ORAqVRCqVSiuLgY/fr1w4EDB6qcgi0rK4ONjc0997tp0ybodDqMGjVKv02lUglvb280a9YMe/fuNVi/8jS3QqG46zY3btwIZ2dnPProowbbjIyMhIODQ5VtlpeXG6ynVCpRVlZ2z7pTU1OxYsUKvPfee1X+09+4cSNatGiBsLAwg21WXrq8c/8PY+PGjejRowdcXV0N9tW/f39otVocOHDAYP2SkpIqr1Wr1Va77cLCQiiVyiqXg2pKpVJBqVQiKysLO3fuxJ49e9CvX78Hfv7WrVvx8ssvY9asWZg+fbrBY7t27YJarcZrr71m0BNo8uTJcH
Jywl9//WWwvlarrfL6S0pKHur1DRs2TP85BoCOHTuiU6dO+s9Oeno6YmNjMWHCBLi5uenXi4iIwKOPPqpf73Zqtfq
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjsAAAHHCAYAAABZbpmkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABr00lEQVR4nO3dd3hTZf8G8PtkdqZ70gEUKKsMy7AgU5QliqgIAgKyRFygqMiriAsRXkEFBw6QVxABFQURZCMbCmUWKNDSFuhI90yb5Pz+qM2P0FLakvYk6f25rlwXOTl5zjcnCbl7zvM8RxBFUQQRERGRnZJJXQARERFRXWLYISIiIrvGsENERER2jWGHiIiI7BrDDhEREdk1hh0iIiKyaww7REREZNcYdoiIiMiuMewQEVGDp9frkZaWhsTERKlLoTrAsENERHVm06ZNiImJMd3fsGEDzp49K11BN4mLi8OkSZMQEBAAlUoFPz8/REVFgRcWsD8MO2RmxYoVEATBdHNwcECLFi3w/PPPIzU1VeryiMjGnD59Gi+99BLi4uJw6NAhPPvss8jLy5O6LBw6dAhdunTBzp078cYbb2Dr1q3Ytm0bNmzYAEEQpC6PLEzgtbHoZitWrMD48ePx7rvvokmTJiguLsa+ffvwv//9D6GhoThz5gycnJykLpOIbER6ejq6deuGS5cuAQCGDRuGX375RdKaSkpK0L59e2g0Gvz9999wc3OTtB6qewqpCyDrNHDgQHTq1AkAMHHiRHh5eeGTTz7B77//jpEjR0pcHRHZCh8fH5w5c8b0h1KrVq2kLgkbN27EhQsXcP78eQadBoKnsaha+vbtCwCIj48HAGRmZuLVV19FREQEXFxcoNFoMHDgQJw8ebLCc4uLi/HOO++gRYsWcHBwQEBAAIYNG4bLly8DABISEsxOnd166927t6mt3bt3QxAE/Pzzz3jzzTfh7+8PZ2dnPPzww0hKSqqw7cOHD2PAgAFwc3ODk5MTevXqhf3791f6Gnv37l3p9t95550K6/7444+IjIyEo6MjPD09MWLEiEq3X9Vru5nRaMTixYvRpk0bODg4wM/PD1OmTEFWVpbZeo0bN8ZDDz1UYTvPP/98hTYrq33BggUV9ikA6HQ6zJkzB82aNYNarUZwcDBee+016HS6SvfVzXr37o22bdtWWL5w4UIIgoCEhASz5dnZ2Xj55ZcRHBwMtVqNZs2aYf78+TAajaZ1yvfbwoULK7Tbtm3bSj8T69evv22N48aNQ+PGje/4Who3bmx6f2QyGfz9/fHkk0/esdPqzc+r7Hbztqv7XgPAX3/9hV69esHV1RUajQadO3fG6tWrAdz+81rZZ0yv1+O9995DWFgY1Go1GjdujDfffLPC+1vd119QUIBXXnnF9B6Gh4dj4cKFFfq6lH8G1Wo1IiMj0apVq9t+Bitz82uRy+Vo1KgRJk+ejOzsbNM6tXn/Dx06hCZNmuCXX35BWFgYVCoVQkJC8Nprr6GoqKjC87/44gu0adMGarUagYGBmDZtmlkNwP9/D6Kjo9GtWzc4OjqiSZMm+Oqrr8zWK6939+7dpmXXr19H48aN0alTJ+Tn55uW3833kszxyA5VS3kw8fLyAgBcuXIFGzZswBNPPIEmTZogNTUVX3/9NXr16oVz584hMDAQAGAwGPDQQw9hx44dGDFiBF566SXk5eVh27ZtOHPmDMLCwkzbGDlyJAYNGmS23VmzZlVazwcffABBEPD6668jLS0NixcvRr9+/RATEwNHR0cAwM6dOzFw4EBERkZizpw5kMlkWL58Ofr27Yt//vkHXbp0qdBuUFAQ5s2bBwDIz8/H1KlTK932W2+9heHDh2PixIlIT0/H559/jp49e+LEiRNwd3ev8JzJkyejR48eAIBff/0Vv/32m9njU6ZMMZ1CfPHFFxEfH48lS5bgxIkT2L9/P5RKZaX7oSays7NNr+1mRqMRDz/8MPbt24fJky
ejVatWOH36NBYtWoSLFy9iw4YNd73tcoWFhejVqxeuXbuGKVOmICQkBAcOHMCsWbNw48YNLF682GLbqq0ePXpg8uTJMBqNOHPmDBYvXozr16/jn3/+ue1zFi9ebPqRio2NxYcffog333zTdBTDxcXFtG513+sVK1bgmWeeQZs2bTBr1iy4u7vjxIkT2LJlC5566inMnj0bEydOBABotVpMnz7d7HN2s4kTJ+KHH37A448/jldeeQWHDx/GvHnzEBsbW+GzeKfXL4oiHn74YezatQsTJkxAhw4dsHXrVsycORPXrl3DokWLbrufbvcZrMqjjz6KYcOGQa/X4+DBg1i2bBmKiorwv//9r0bt3CwjIwNXrlzBm2++iWHDhuGVV17BsWPHsGDBApw5cwZ//vmnKSy+8847mDt3Lvr164epU6fiwoUL+PLLL3H06NEK382srCwMGjQIw4cPx8iRI7F27VpMnToVKpUKzzzzTKW15OTkYODAgVAqldi8ebPps1Kf38sGQSS6yfLly0UA4vbt28X09HQxKSlJXLNmjejl5SU6OjqKycnJoiiKYnFxsWgwGMyeGx8fL6rVavHdd981Lfv+++9FAOInn3xSYVtGo9H0PADiggULKqzTpk0bsVevXqb7u3btEgGIjRo1EnNzc03L165dKwIQP/30U1PbzZs3F/v372/ajiiKYmFhodikSRPxgQceqLCtbt26iW3btjXdT09PFwGIc+bMMS1LSEgQ5XK5+MEHH5g99/Tp06JCoaiwPC4uTgQg/vDDD6Zlc+bMEW/+6v3zzz8iAHHVqlVmz92yZUuF5aGhoeLgwYMr1D5t2jTx1q/zrbW/9tproq+vrxgZGWm2T//3v/+JMplM/Oeff8ye/9VXX4kAxP3791fY3s169eoltmnTpsLyBQsWiADE+Ph407L33ntPdHZ2Fi9evGi27htvvCHK5XIxMTFRFMXafSbWrVt32xrHjh0rhoaGVvk6RLFs/44dO9Zs2VNPPSU6OTnd8bm31rNr164Kj1X3vc7OzhZdXV3Frl27ikVFRWbr3vx5Lle+v5YvX17hsZiYGBGAOHHiRLPlr776qghA3Llzp2lZdV7/hg0bRADi+++/b7be448/LgqCIF66dMm0rLqfwdu59fmiWPY9bd26tel+bd7/sWPHigDEcePGma1X/t3cuHGjKIqimJaWJqpUKvHBBx80+/9uyZIlIgDx+++/Ny3r1auXCED873//a1qm0+nEDh06iL6+vmJJSYlZvbt27RKLi4vF3r17i76+vmb7TRTv/ntJ5ngaiyrVr18/+Pj4IDg4GCNGjICLiwt+++03NGrUCACgVqshk5V9fAwGAzIyMuDi4oLw8HAcP37c1M4vv/wCb29vvPDCCxW2cTcjHp5++mm4urqa7j/++OMICAjA5s2bAQAxMTGIi4vDU089hYyMDGi1Wmi1WhQUFOD+++/H3r17zU6bAGWn2xwcHKrc7q+//gqj0Yjhw4eb2tRqtfD390fz5s2xa9cus/VLSkoAlO2v21m3bh3c3NzwwAMPmLUZGRkJFxeXCm2WlpaarafValFcXFxl3deuXcPnn3+Ot956y+woQ/n2W7VqhZYtW5q1WX7q8tbt341169ahR48e8PDwMNtWv379YDAYsHfvXrP1CwsLK7xWg8FQadt5eXnQarUVTi/UlE6ng1arRVpaGrZt24adO3fi/vvvv6s2y1X3vd62bRvy8vLwxhtvVPhM1vR7U/6dmDFjhtnyV155BQDw559/mi2/0+vfvHkz5HI5XnzxxQrtiaKIv/76q9I6qvoMVqX8M5CSkoJffvkFJ0+erPT9qM37P3PmTLP706dPh1wuN+2T7du3o6SkBC+//LLp/zsAmDRpEjQaTYV9p1AoMGXKFNN9lUqFKVOmIC0tDdHR0WbrGo1GPP300zh06BA2b95sdpQbqN/vZUPA01hUqaVLl6JFixZQKBTw8/NDeHi42ZfdaDTi00
8/xRdffIH4+HizH6DyU11A2emv8PBwKBSW/ag1b97c7L4gCGjWrJmpf0hcXBwAYOzYsbdtIycnBx4eHqb7Wq22Qru
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Средняя цена в обучающей выборке: 503651.6890625\n",
"Средняя цена в контрольной выборке: 515548.73125\n",
"Средняя цена в тестовой выборке: 502023.62175\n"
]
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"def plot_price_dist(data, title):\n",
" sns.histplot(data['price'], kde=True)\n",
" plt.title(title)\n",
" plt.xlabel('Цена')\n",
" plt.ylabel('Частота')\n",
" plt.show()\n",
"\n",
"plot_price_dist(train_data, 'Распределение цены в обучающей выборке')\n",
"plot_price_dist(val_data, 'Распределение цены в контрольной выборке')\n",
"plot_price_dist(test_data, 'Распределение цены в тестовой выборке')\n",
"\n",
"print(\"Средняя цена в обучающей выборке: \", train_data['price'].mean())\n",
"print(\"Средняя цена в контрольной выборке: \", val_data['price'].mean())\n",
"print(\"Средняя цена в тестовой выборке: \", test_data['price'].mean())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Преобразование целевой переменной в категории дискретизацией"
]
},
{
"cell_type": "code",
2024-11-02 13:01:31 +04:00
"execution_count": 15,
2024-11-01 17:56:43 +04:00
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAHHCAYAAABeLEexAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABMgElEQVR4nO3deVhO+f8/8OfdvkeljRDZZZmYJClEkm0YhmHC2MbUzNh9mrFkmz6WsRtmI0bGMoaZYSayJEuWaWQX+mTCFBUVRev794dv59dxF5W44zwf13Vfl/M+73PO6yz37dlZ7lslhBAgIiIiUjAtTRdAREREpGkMRERERKR4DERERESkeAxEREREpHgMRERERKR4DERERESkeAxEREREpHgMRERERKR4DERERBqSnp6O69evIz8/X9OlUCUSQuDevXu4du2apkuhcmAgIiJ6RfLy8rBw4UK0bNkS+vr6qF69Oho0aIADBw5ourTXwoULF7Br1y5pODY2Fnv27NFcQcU8ePAA06dPR6NGjaCnpwdLS0s0bNgQcXFxmi6NykhH0wVQ1RAaGooRI0ZIw/r6+qhduza6deuGGTNmwMbGRoPVEb3+cnJy0K1bN5w4cQIfffQR5s6dCyMjI2hra8PFxUXT5b0WHjx4gLFjx8LW1haWlpb47LPP4OvrCz8/P43WlZaWBk9PTyQmJuKTTz6Bu7s79PT0oKuri7p162q0Nio7BiKSmTNnDhwdHfH48WMcPXoUa9aswR9//IELFy7AyMhI0+URvbYWLFiAkydPYu/evfDy8tJ0Oa8lNzc36QUADRs2xOjRozVcFTBlyhQkJSUhOjoazZo103Q5VEEMRCTj6+uLNm3aAABGjRoFS0tLLFmyBL/++isGDx6s4eqIXk/5+flYtmwZJk2axDD0gnbt2oVLly7h0aNHcHZ2hp6enkbruXv3LjZs2IC1a9cyDL3meA8RPVPnzp0BAAkJCQCAe/fuYfLkyXB2doaJiQnMzMzg6+uLs2fPqk37+PFjBAcHo2HDhjAwMICdnR369euH+Ph4AMCNGzegUqlKfRX/jyMyMhIqlQpbt27F559/DltbWxgbG6N37964efOm2rJPnjyJ7t27w9zcHEZGRvD09MSxY8dKXEcvL68Slx8cHKzWd9OmTXBxcYGhoSEsLCwwaNCgEpf/rHUrrrCwEMuWLUOzZs1gYGAAGxsbjB07Fvfv35f1q1u3Lnr27Km2nMDAQLV5llT7okWL1LYp8OQyzqxZs+Dk5AR9fX04ODhg6tSpyMnJKXFbFefl5YXmzZurtS9evBgqlQo3btyQtaenp2P8+PFwcHCAvr4+nJycsGDBAhQWFkp9irbb4sWL1ebbvHnzEo+Jn3/+udQahw8fXqZLFnXr1pX2j5aWFmxtbfHee+8hMTHxudMCwNdff41mzZpBX18f9vb2CAgIQHp6ujQ+Li4O9+/fh6mpKTw9PWFkZARzc3P07NkTFy5ckPodOnQIKpUKO3fuVFvG5s2boVKpEB0dLdU8fPhwWZ+ibRIZGSm1HTlyBAMGDEDt2rWlfTxhwgQ8evRINm1wcLDasRQWFoZWrVrBwMAAlpaWGDx4sNo2GT58OExMTGRtP//8s1odAGBiYqJWM1C295WXl5e0/5s2bQoXFxecPXu2xPdVSZ5+n1tZWcHPz0+2/YEn75/AwMBS5xMaGio7vk+fPo3CwkLk5uaiTZs2z9xWAHDw4EF4eHjA2NgY1apVQ58+fXD58mVZn6J9ceXKFQwcOBBmZmbSJcLHjx+r1Vv8/Z6fn48ePXrAwsICly5dkvUt6+eXUvEMET1TUXixtLQEAPzvf//Drl27MGDAADg6OuLOnTv45ptv4OnpiUuXLsHe3h4AUFBQgJ49e+LAgQMYNGgQPvvsMzx48AARERG4cOEC6tevLy1j8ODB6NGjh2y5QUFBJdYzf/58qFQqTJs2DXfv3sWyZcvg7e2N2NhYGB
oaAnjygePr6wsXFxfMmjULWlpaWL9+PTp37owjR47g7bffVptvrVq1EBISAgB4+PAhxo0bV+KyZ8yYgYEDB2LUqFFISUnBypUr0bFjR5w5cwbVqlVTm2bMmDHw8PAAAPzyyy9q/9GNHTtWun/r008/RUJCAlatWoUzZ87g2LFj0NXVLXE7lEd6erq0bsUVFhaid+/eOHr0KMaMGYMmTZrg/PnzWLp0Ka5evSq7efVFZWdnw9PTE7dv38bYsWNRu3ZtHD9+HEFBQUhKSsKyZcsqbVkV5eHhgTFjxqCwsBAXLlzAsmXL8O+//+LIkSPPnC44OBizZ8+Gt7c3xo0bh7i4OKxZswanT5+W9mFaWhqAJ8d1gwYNMHv2bDx+/BirV6+Gu7s7Tp8+jYYNG8LLywsODg4ICwvDO++8I1tOWFgY6tevL10uKqvt27cjOzsb48aNg6WlJU6dOoWVK1fi1q1b2L59e6nTbd68GUOHDkXLli0REhKCtLQ0rFixAkePHsWZM2dgZWVVrjpKU5H3VZFp06aVa1mNGzfGF198ASEE4uPjsWTJEvTo0aPMwbckRfs2MDAQLi4u+O9//4uUlJQSt9X+/fvh6+uLevXqITg4GI8ePcLKlSvh7u6Ov//+Wy28Dxw4EHXr1kVISAhOnDiBFStW4P79+9i4cWOp9YwaNQqRkZGIiIhA06ZNpfYX2c6KIYiEEOvXrxcAxP79+0VKSoq4efOm2LJli7C0tBSGhobi1q1bQgghHj9+LAoKCmTTJiQkCH19fTFnzhypbd26dQKAWLJkidqyCgsLpekAiEWLFqn1adasmfD09JSGDx06JACImjVriszMTKl927ZtAoBYvny5NO8GDRoIHx8faTlCCJGdnS0cHR1F165d1ZbVvn170bx5c2k4JSVFABCzZs2S2m7cuCG0tbXF/PnzZdOeP39e6OjoqLVfu3ZNABAbNmyQ2mbNmiWKv+WOHDkiAIiwsDDZtOHh4WrtderUEX5+fmq1BwQEiKffxk/XPnXqVGFtbS1cXFxk2/THH38UWlpa4siRI7Lp165dKwCIY8eOqS2vOE9PT9GsWTO19kWLFgkAIiEhQWqbO3euMDY2FlevXpX1/c9//iO0tbVFYmKiEKJix8T27dtLrXHYsGGiTp06z1wPIZ5s32HDhsna3n//fWFkZPTM6e7evSv09PREt27dZO+LVatWCQBi3bp1slqtrKxEamqq1O/q1atCV1dX9O/fX2oLCgoS+vr6Ij09XbYcHR0d2X51dHQU/v7+snqKlnPo0CGpLTs7W63ukJAQoVKpxD///CO1FT8+8/PzhY2Njahfv754+PCh1CcyMlIAEJMmTZLahg0bJoyNjWXz3759u1odQghhbGws287leV95enrK9v8ff/whAIju3burvQdK8vT0Qgjx+eefCwDi7t27UhsAERAQUOp8ij4ri47vouGmTZvKtnXRvii+rVq1aiWsra1FWlqa1Hb27FmhpaUl25dF+6J3796yZX/88ccCgDh79qys3qLjIigoSGhra4tdu3bJpivv55dS8ZIZyXh7e6NGjRpwcHDAoEGDYGJigp07d6JmzZoAnjx9pqX15LApKChAWloaTExM0KhRI/z999/SfHbs2AErKyt88sknassoy+nt0vj7+8PU1FQafvfdd2FnZ4c//vgDwJPHcK9du4b3338faWlpSE1NRWpqKrKystClSxdERUXJLtEATy7tGRgYPHO5v/zyCwoLCzFw4EBpnqmpqbC1tUWDBg1w6NAhWf/c3FwAT7ZXabZv3w5zc3N07dpVNk8XFxeYmJiozTMvL0/WLzU1Ve30+dNu376NlStXYsaMGWqXNbZv344mTZqgcePGsnkWXSZ9evkvYvv27fDw8ED16tVly/L29kZBQQGioqJk/bOzs9XWtaCgoMR5P3jwAKmpqbJLVBWRk5OD1NRU3L17FxERETh48CC6dOnyzGn279+P3NxcjB8/XnpfAM
Do0aNhZmam9kj4iBEjpLOtANCgQQP07t0b4eHh0vr5+/sjJydHdilw69atyM/Px9ChQ6U2a2tr3Lp167nrVXTmFAC
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import numpy as np\n",
"labels = [\"low\", \"middle\", \"high\", \"very_high\"]\n",
"num_bins = 4\n",
"hist1, bins1 = np.histogram(train_data[\"price\"].fillna(train_data[\"price\"].median()), bins=num_bins)\n",
"bins1,hist1\n",
"pd.concat([train_data[\"price\"], pd.cut(train_data[\"price\"], list(bins1))], axis=1).head(10)\n",
"pd.concat([train_data[\"price\"], pd.cut(train_data[\"price\"], list(bins1), labels=labels)], axis=1).head()\n",
"train_data['price_category'] = pd.cut(train_data[\"price\"], list(bins1), labels=labels)\n",
"test_data['price_category'] = pd.cut(train_data[\"price\"], list(bins1), labels=labels)\n",
"val_data['price_category'] = pd.cut(train_data[\"price\"], list(bins1), labels=labels)\n",
"\n",
"sns.countplot(x=train_data['price_category'])\n",
"plt.title('Распределение цены в обучающей выборке')\n",
"plt.xlabel('Категория цены')\n",
"plt.ylabel('Частота')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Конструирование признаков\n",
"1. Прогнозирование цен недвижимости. Цель технического проекта: Разработка модели машинного обучения для точного прогнозирования рыночной стоимости недвижимости.\n",
"2. Оценка влияния факторов на цену недвижимости Цель технического проекта: Разработки модели для анализа данных для выявления факторов, которые больше всего влияют на цену\n",
"\n",
"### Конструирование признаков\n",
"Унитарное кодирование - замена категориальных признаков бинарными значениями."
]
},
{
"cell_type": "code",
2024-11-02 13:01:31 +04:00
"execution_count": 16,
2024-11-01 17:56:43 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Столбцы train_data_encoded: ['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'price_category_low', 'price_category_middle', 'price_category_high', 'price_category_very_high']\n",
"Столбцы val_data_encoded: ['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'price_category_low', 'price_category_middle', 'price_category_high', 'price_category_very_high']\n",
"Столбцы test_data_encoded: ['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'price_category_low', 'price_category_middle', 'price_category_high', 'price_category_very_high']\n"
]
}
],
"source": [
"categorical_features = ['price_category']\n",
"\n",
"\n",
"train_data_encoded = pd.get_dummies(train_data, columns=categorical_features)\n",
"val_data_encoded = pd.get_dummies(val_data, columns=categorical_features)\n",
"test_data_encoded = pd.get_dummies(test_data, columns=categorical_features)\n",
"\n",
"print(\"Столбцы train_data_encoded:\", train_data_encoded.columns.tolist())\n",
"print(\"Столбцы val_data_encoded:\", val_data_encoded.columns.tolist())\n",
"print(\"Столбцы test_data_encoded:\", test_data_encoded.columns.tolist())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Ручной синтез\n",
"Создание новых признаков на основе экспертных знаний и логики предметной области.\n",
"Новый признак будет - цена недвижимости за фут."
]
},
{
"cell_type": "code",
2024-11-02 13:01:31 +04:00
"execution_count": 17,
2024-11-01 17:56:43 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>date</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>...</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" <th>price_category_low</th>\n",
" <th>price_category_middle</th>\n",
" <th>price_category_high</th>\n",
" <th>price_category_very_high</th>\n",
" <th>price_per_sqft</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>6252</th>\n",
" <td>1150900080</td>\n",
" <td>20150427T000000</td>\n",
" <td>846450.0</td>\n",
" <td>4</td>\n",
" <td>2.50</td>\n",
" <td>3710</td>\n",
" <td>7491</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>98029</td>\n",
" <td>47.5596</td>\n",
" <td>-122.016</td>\n",
" <td>3040</td>\n",
" <td>7491</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>228.153639</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4684</th>\n",
" <td>9268200600</td>\n",
" <td>20140519T000000</td>\n",
" <td>413500.0</td>\n",
" <td>2</td>\n",
" <td>1.00</td>\n",
" <td>770</td>\n",
" <td>4000</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>98117</td>\n",
" <td>47.6959</td>\n",
" <td>-122.364</td>\n",
" <td>1420</td>\n",
" <td>5040</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>537.012987</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1731</th>\n",
" <td>7883603750</td>\n",
" <td>20141209T000000</td>\n",
" <td>337000.0</td>\n",
" <td>3</td>\n",
" <td>1.75</td>\n",
" <td>1400</td>\n",
" <td>6000</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>98108</td>\n",
" <td>47.5283</td>\n",
" <td>-122.321</td>\n",
" <td>1030</td>\n",
" <td>6000</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>240.714286</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4742</th>\n",
" <td>3330501545</td>\n",
" <td>20141201T000000</td>\n",
" <td>330000.0</td>\n",
" <td>2</td>\n",
" <td>1.00</td>\n",
" <td>950</td>\n",
" <td>3090</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>98118</td>\n",
" <td>47.5510</td>\n",
" <td>-122.276</td>\n",
" <td>1230</td>\n",
" <td>4120</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>347.368421</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4521</th>\n",
" <td>993001332</td>\n",
" <td>20140903T000000</td>\n",
" <td>407000.0</td>\n",
" <td>3</td>\n",
" <td>2.25</td>\n",
" <td>1430</td>\n",
" <td>1448</td>\n",
" <td>3.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>98103</td>\n",
" <td>47.6916</td>\n",
" <td>-122.341</td>\n",
" <td>1430</td>\n",
" <td>1383</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>284.615385</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6412</th>\n",
" <td>925069071</td>\n",
" <td>20150126T000000</td>\n",
" <td>750000.0</td>\n",
" <td>5</td>\n",
" <td>3.75</td>\n",
" <td>3500</td>\n",
" <td>101494</td>\n",
" <td>1.5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>98053</td>\n",
" <td>47.6745</td>\n",
" <td>-122.054</td>\n",
" <td>3250</td>\n",
" <td>38636</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>214.285714</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8285</th>\n",
" <td>9268200315</td>\n",
" <td>20140828T000000</td>\n",
" <td>456000.0</td>\n",
" <td>3</td>\n",
" <td>2.00</td>\n",
" <td>1870</td>\n",
" <td>8442</td>\n",
" <td>1.5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>98117</td>\n",
" <td>47.6964</td>\n",
" <td>-122.365</td>\n",
" <td>1640</td>\n",
" <td>6174</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>243.850267</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7853</th>\n",
" <td>1822300040</td>\n",
" <td>20140507T000000</td>\n",
" <td>420000.0</td>\n",
" <td>2</td>\n",
" <td>1.50</td>\n",
" <td>1040</td>\n",
" <td>3500</td>\n",
" <td>1.5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>98144</td>\n",
" <td>47.5880</td>\n",
" <td>-122.304</td>\n",
" <td>1340</td>\n",
" <td>1213</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>403.846154</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1095</th>\n",
" <td>7202290160</td>\n",
" <td>20150114T000000</td>\n",
" <td>435000.0</td>\n",
" <td>3</td>\n",
" <td>2.50</td>\n",
" <td>1560</td>\n",
" <td>3987</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>98053</td>\n",
" <td>47.6870</td>\n",
" <td>-122.043</td>\n",
" <td>1600</td>\n",
" <td>3152</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>278.846154</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6929</th>\n",
" <td>2621700010</td>\n",
" <td>20140508T000000</td>\n",
" <td>569000.0</td>\n",
" <td>4</td>\n",
" <td>2.25</td>\n",
" <td>2250</td>\n",
" <td>41688</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>98053</td>\n",
" <td>47.6695</td>\n",
" <td>-122.050</td>\n",
" <td>2350</td>\n",
" <td>37920</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>252.888889</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2000 rows × 26 columns</p>\n",
"</div>"
],
"text/plain": [
" id date price bedrooms bathrooms sqft_living \\\n",
"6252 1150900080 20150427T000000 846450.0 4 2.50 3710 \n",
"4684 9268200600 20140519T000000 413500.0 2 1.00 770 \n",
"1731 7883603750 20141209T000000 337000.0 3 1.75 1400 \n",
"4742 3330501545 20141201T000000 330000.0 2 1.00 950 \n",
"4521 993001332 20140903T000000 407000.0 3 2.25 1430 \n",
"... ... ... ... ... ... ... \n",
"6412 925069071 20150126T000000 750000.0 5 3.75 3500 \n",
"8285 9268200315 20140828T000000 456000.0 3 2.00 1870 \n",
"7853 1822300040 20140507T000000 420000.0 2 1.50 1040 \n",
"1095 7202290160 20150114T000000 435000.0 3 2.50 1560 \n",
"6929 2621700010 20140508T000000 569000.0 4 2.25 2250 \n",
"\n",
" sqft_lot floors waterfront view ... zipcode lat long \\\n",
"6252 7491 2.0 0 0 ... 98029 47.5596 -122.016 \n",
"4684 4000 1.0 0 0 ... 98117 47.6959 -122.364 \n",
"1731 6000 1.0 0 0 ... 98108 47.5283 -122.321 \n",
"4742 3090 1.0 0 0 ... 98118 47.5510 -122.276 \n",
"4521 1448 3.0 0 0 ... 98103 47.6916 -122.341 \n",
"... ... ... ... ... ... ... ... ... \n",
"6412 101494 1.5 0 0 ... 98053 47.6745 -122.054 \n",
"8285 8442 1.5 0 0 ... 98117 47.6964 -122.365 \n",
"7853 3500 1.5 0 0 ... 98144 47.5880 -122.304 \n",
"1095 3987 2.0 0 0 ... 98053 47.6870 -122.043 \n",
"6929 41688 2.0 0 0 ... 98053 47.6695 -122.050 \n",
"\n",
" sqft_living15 sqft_lot15 price_category_low price_category_middle \\\n",
"6252 3040 7491 False False \n",
"4684 1420 5040 False False \n",
"1731 1030 6000 False False \n",
"4742 1230 4120 False False \n",
"4521 1430 1383 False False \n",
"... ... ... ... ... \n",
"6412 3250 38636 False False \n",
"8285 1640 6174 False False \n",
"7853 1340 1213 False False \n",
"1095 1600 3152 False False \n",
"6929 2350 37920 False False \n",
"\n",
" price_category_high price_category_very_high price_per_sqft \n",
"6252 False False 228.153639 \n",
"4684 False False 537.012987 \n",
"1731 False False 240.714286 \n",
"4742 False False 347.368421 \n",
"4521 False False 284.615385 \n",
"... ... ... ... \n",
"6412 False False 214.285714 \n",
"8285 False False 243.850267 \n",
"7853 False False 403.846154 \n",
"1095 False False 278.846154 \n",
"6929 False False 252.888889 \n",
"\n",
"[2000 rows x 26 columns]"
]
},
2024-11-02 13:01:31 +04:00
"execution_count": 17,
2024-11-01 17:56:43 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_data_encoded['price_per_sqft'] = df['price'] / df['sqft_living']\n",
"val_data_encoded['price_per_sqft'] = df['price'] / df['sqft_living']\n",
"test_data_encoded['price_per_sqft'] = df['price'] / df['sqft_living']\n",
"test_data_encoded"
]
},
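  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Масштабирование признаков\n",
    "Это приведение числовых признаков к сопоставимому масштабу. Здесь используется стандартизация (`StandardScaler`): из признака вычитается среднее и результат делится на стандартное отклонение, чтобы признаки с большим разбросом значений не доминировали при обучении модели. Параметры масштабирования оцениваются только по обучающей выборке и затем применяются к валидационной и тестовой."
   ]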
},
{
"cell_type": "code",
2024-11-02 13:01:31 +04:00
"execution_count": 18,
2024-11-01 17:56:43 +04:00
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import StandardScaler\n",
"numerical_features = ['bedrooms']\n",
"\n",
"scaler = StandardScaler()\n",
"train_data_encoded[numerical_features] = scaler.fit_transform(train_data_encoded[numerical_features])\n",
"val_data_encoded[numerical_features] = scaler.transform(val_data_encoded[numerical_features])\n",
"test_data_encoded[numerical_features] = scaler.transform(test_data_encoded[numerical_features])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Конструирование признаков с применением фреймворка Featuretools"
]
},
{
"cell_type": "code",
2024-11-02 13:01:31 +04:00
"execution_count": 21,
2024-11-01 17:56:43 +04:00
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
2024-11-02 13:01:31 +04:00
"d:\\Study\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
2024-11-01 17:56:43 +04:00
" warnings.warn(\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>condition</th>\n",
" <th>grade</th>\n",
" <th>...</th>\n",
" <th>sqft_lot15</th>\n",
" <th>price_category_low</th>\n",
" <th>price_category_middle</th>\n",
" <th>price_category_high</th>\n",
" <th>price_category_very_high</th>\n",
" <th>price_per_sqft</th>\n",
" <th>DAY(date)</th>\n",
" <th>MONTH(date)</th>\n",
" <th>WEEKDAY(date)</th>\n",
" <th>YEAR(date)</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1121059105</th>\n",
" <td>378500.0</td>\n",
" <td>-0.382609</td>\n",
" <td>2.50</td>\n",
" <td>2860</td>\n",
" <td>43821</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>9</td>\n",
" <td>...</td>\n",
" <td>65340</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>132.342657</td>\n",
" <td>8</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" <td>2014</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1026069163</th>\n",
" <td>630000.0</td>\n",
" <td>-0.382609</td>\n",
" <td>2.50</td>\n",
" <td>2460</td>\n",
" <td>38794</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>9</td>\n",
" <td>...</td>\n",
" <td>51400</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>256.097561</td>\n",
" <td>22</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>2015</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3751601501</th>\n",
" <td>382450.0</td>\n",
" <td>-0.382609</td>\n",
" <td>2.50</td>\n",
" <td>2220</td>\n",
" <td>20531</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>8</td>\n",
" <td>...</td>\n",
" <td>19249</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>172.274775</td>\n",
" <td>16</td>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" <td>2014</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4322300340</th>\n",
" <td>265000.0</td>\n",
" <td>0.706185</td>\n",
" <td>1.50</td>\n",
" <td>1740</td>\n",
" <td>12728</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>7</td>\n",
" <td>...</td>\n",
" <td>11125</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>152.298851</td>\n",
" <td>12</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2015</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7701960130</th>\n",
" <td>820000.0</td>\n",
" <td>-0.382609</td>\n",
" <td>2.50</td>\n",
" <td>2980</td>\n",
" <td>18935</td>\n",
" <td>1.5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>11</td>\n",
" <td>...</td>\n",
" <td>18225</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>275.167785</td>\n",
" <td>17</td>\n",
" <td>10</td>\n",
" <td>4</td>\n",
" <td>2014</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8645500900</th>\n",
" <td>279000.0</td>\n",
" <td>0.706185</td>\n",
" <td>2.00</td>\n",
" <td>2200</td>\n",
" <td>7700</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>...</td>\n",
" <td>7700</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>126.818182</td>\n",
" <td>20</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>2014</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9528104660</th>\n",
" <td>905000.0</td>\n",
" <td>0.706185</td>\n",
" <td>3.50</td>\n",
" <td>2980</td>\n",
" <td>3000</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>9</td>\n",
" <td>...</td>\n",
" <td>4545</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>303.691275</td>\n",
" <td>27</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>2014</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5100402668</th>\n",
" <td>495000.0</td>\n",
" <td>-0.382609</td>\n",
" <td>1.00</td>\n",
" <td>1570</td>\n",
" <td>5510</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>7</td>\n",
" <td>...</td>\n",
" <td>6380</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>315.286624</td>\n",
" <td>18</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2015</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1338300180</th>\n",
" <td>1127312.5</td>\n",
" <td>0.706185</td>\n",
" <td>2.25</td>\n",
" <td>3960</td>\n",
" <td>8640</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>9</td>\n",
" <td>...</td>\n",
" <td>8640</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>284.674874</td>\n",
" <td>29</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" <td>2014</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7167000020</th>\n",
" <td>792500.0</td>\n",
" <td>0.706185</td>\n",
" <td>2.50</td>\n",
" <td>4290</td>\n",
" <td>175421</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>10</td>\n",
" <td>...</td>\n",
" <td>63162</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>184.731935</td>\n",
" <td>16</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>2014</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6362 rows × 28 columns</p>\n",
"</div>"
],
"text/plain": [
" price bedrooms bathrooms sqft_living sqft_lot floors \\\n",
"id \n",
"1121059105 378500.0 -0.382609 2.50 2860 43821 2.0 \n",
"1026069163 630000.0 -0.382609 2.50 2460 38794 2.0 \n",
"3751601501 382450.0 -0.382609 2.50 2220 20531 2.0 \n",
"4322300340 265000.0 0.706185 1.50 1740 12728 1.0 \n",
"7701960130 820000.0 -0.382609 2.50 2980 18935 1.5 \n",
"... ... ... ... ... ... ... \n",
"8645500900 279000.0 0.706185 2.00 2200 7700 1.0 \n",
"9528104660 905000.0 0.706185 3.50 2980 3000 2.0 \n",
"5100402668 495000.0 -0.382609 1.00 1570 5510 1.0 \n",
"1338300180 1127312.5 0.706185 2.25 3960 8640 2.0 \n",
"7167000020 792500.0 0.706185 2.50 4290 175421 2.0 \n",
"\n",
" waterfront view condition grade ... sqft_lot15 \\\n",
"id ... \n",
"1121059105 0 0 4 9 ... 65340 \n",
"1026069163 0 0 3 9 ... 51400 \n",
"3751601501 0 0 3 8 ... 19249 \n",
"4322300340 0 0 4 7 ... 11125 \n",
"7701960130 0 0 3 11 ... 18225 \n",
"... ... ... ... ... ... ... \n",
"8645500900 0 0 3 7 ... 7700 \n",
"9528104660 0 0 3 9 ... 4545 \n",
"5100402668 0 0 4 7 ... 6380 \n",
"1338300180 0 2 3 9 ... 8640 \n",
"7167000020 0 0 3 10 ... 63162 \n",
"\n",
" price_category_low price_category_middle price_category_high \\\n",
"id \n",
"1121059105 False True False \n",
"1026069163 False False True \n",
"3751601501 False True False \n",
"4322300340 True False False \n",
"7701960130 False False True \n",
"... ... ... ... \n",
"8645500900 True False False \n",
"9528104660 False False False \n",
"5100402668 False True False \n",
"1338300180 False False False \n",
"7167000020 False False True \n",
"\n",
" price_category_very_high price_per_sqft DAY(date) MONTH(date) \\\n",
"id \n",
"1121059105 False 132.342657 8 7 \n",
"1026069163 False 256.097561 22 4 \n",
"3751601501 False 172.274775 16 7 \n",
"4322300340 False 152.298851 12 1 \n",
"7701960130 False 275.167785 17 10 \n",
"... ... ... ... ... \n",
"8645500900 False 126.818182 20 6 \n",
"9528104660 True 303.691275 27 8 \n",
"5100402668 False 315.286624 18 2 \n",
"1338300180 True 284.674874 29 7 \n",
"7167000020 False 184.731935 16 6 \n",
"\n",
" WEEKDAY(date) YEAR(date) \n",
"id \n",
"1121059105 1 2014 \n",
"1026069163 2 2015 \n",
"3751601501 2 2014 \n",
"4322300340 0 2015 \n",
"7701960130 4 2014 \n",
"... ... ... \n",
"8645500900 4 2014 \n",
"9528104660 2 2014 \n",
"5100402668 2 2015 \n",
"1338300180 1 2014 \n",
"7167000020 0 2014 \n",
"\n",
"[6362 rows x 28 columns]"
]
},
2024-11-02 13:01:31 +04:00
"execution_count": 21,
2024-11-01 17:56:43 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import featuretools as ft\n",
"\n",
"# Удаление дубликатов по идентификатору\n",
"train_data_encoded = train_data_encoded.drop_duplicates(subset='id', keep='first')\n",
"\n",
"#Создание EntitySet\n",
"es = ft.EntitySet(id='house_data')\n",
"\n",
"#Добавление датафрейма в EntitySet\n",
"es = es.add_dataframe(dataframe_name='houses', dataframe=train_data_encoded, index='id')\n",
"\n",
"#Генерация признаков\n",
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='houses', max_depth=2)\n",
"\n",
"feature_matrix"
]
2024-11-02 13:01:31 +04:00
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Оценка качества"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Время обучения: 0.27 секунд\n",
"MSE: 2205809457.9675403\n",
"RMSE: 46966.04579872081\n",
"R²: 0.9623494755346685\n",
"MAE: 35440.813340503635 \n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\Study\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n"
]
}
],
"source": [
"import time\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.metrics import mean_squared_error\n",
"from sklearn.metrics import r2_score, mean_absolute_error\n",
"from sklearn.model_selection import cross_val_score\n",
"\n",
"X = feature_matrix.drop('price', axis=1)\n",
"y = feature_matrix['price']\n",
"\n",
"#Делим на выборки\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"#Начнем обучение\n",
"model = LinearRegression()\n",
"start_time = time.time()\n",
"model.fit(X_train, y_train)\n",
"\n",
"train_time = time.time() - start_time\n",
"\n",
"#Вычесляем показательную способность\n",
"y_pred = model.predict(X_test)\n",
"mse = mean_squared_error(y_test, y_pred)\n",
"rmse = mean_squared_error(y_test, y_pred, squared=False)\n",
"r2 = r2_score(y_test, y_pred)\n",
"mae = mean_absolute_error(y_test, y_pred)\n",
"\n",
"print()\n",
"print(f\"Время обучения: {train_time:.2f} секунд\")\n",
"print(\"Метрики:\")\n",
"print(f\"MSE: {mse}\")\n",
"print(f\"RMSE: {rmse}\")\n",
"print(f\"R²: {r2}\")\n",
"print(f\"MAE: {mae} \\n\")"
]
2024-11-01 17:56:43 +04:00
}
],
"metadata": {
"kernelspec": {
"display_name": "kernel",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}