824 lines
161 KiB
Plaintext
Raw Normal View History

2024-11-16 02:38:00 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Датасет астероидов"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выведем записи и столбцы"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'name', 'est_diameter_min', 'est_diameter_max',\n",
" 'relative_velocity', 'miss_distance', 'orbiting_body', 'sentry_object',\n",
" 'absolute_magnitude', 'hazardous'],\n",
" dtype='object')\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>est_diameter_min</th>\n",
" <th>est_diameter_max</th>\n",
" <th>relative_velocity</th>\n",
" <th>miss_distance</th>\n",
" <th>orbiting_body</th>\n",
" <th>sentry_object</th>\n",
" <th>absolute_magnitude</th>\n",
" <th>hazardous</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2162635</td>\n",
" <td>162635 (2000 SS164)</td>\n",
" <td>1.198271</td>\n",
" <td>2.679415</td>\n",
" <td>13569.249224</td>\n",
" <td>5.483974e+07</td>\n",
" <td>Earth</td>\n",
" <td>False</td>\n",
" <td>16.73</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2277475</td>\n",
" <td>277475 (2005 WK4)</td>\n",
" <td>0.265800</td>\n",
" <td>0.594347</td>\n",
" <td>73588.726663</td>\n",
" <td>6.143813e+07</td>\n",
" <td>Earth</td>\n",
" <td>False</td>\n",
" <td>20.00</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2512244</td>\n",
" <td>512244 (2015 YE18)</td>\n",
" <td>0.722030</td>\n",
" <td>1.614507</td>\n",
" <td>114258.692129</td>\n",
" <td>4.979872e+07</td>\n",
" <td>Earth</td>\n",
" <td>False</td>\n",
" <td>17.83</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3596030</td>\n",
" <td>(2012 BV13)</td>\n",
" <td>0.096506</td>\n",
" <td>0.215794</td>\n",
" <td>24764.303138</td>\n",
" <td>2.543497e+07</td>\n",
" <td>Earth</td>\n",
" <td>False</td>\n",
" <td>22.20</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3667127</td>\n",
" <td>(2014 GE35)</td>\n",
" <td>0.255009</td>\n",
" <td>0.570217</td>\n",
" <td>42737.733765</td>\n",
" <td>4.627557e+07</td>\n",
" <td>Earth</td>\n",
" <td>False</td>\n",
" <td>20.09</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name est_diameter_min est_diameter_max \\\n",
"0 2162635 162635 (2000 SS164) 1.198271 2.679415 \n",
"1 2277475 277475 (2005 WK4) 0.265800 0.594347 \n",
"2 2512244 512244 (2015 YE18) 0.722030 1.614507 \n",
"3 3596030 (2012 BV13) 0.096506 0.215794 \n",
"4 3667127 (2014 GE35) 0.255009 0.570217 \n",
"\n",
" relative_velocity miss_distance orbiting_body sentry_object \\\n",
"0 13569.249224 5.483974e+07 Earth False \n",
"1 73588.726663 6.143813e+07 Earth False \n",
"2 114258.692129 4.979872e+07 Earth False \n",
"3 24764.303138 2.543497e+07 Earth False \n",
"4 42737.733765 4.627557e+07 Earth False \n",
"\n",
" absolute_magnitude hazardous \n",
"0 16.73 False \n",
"1 20.00 True \n",
"2 17.83 False \n",
"3 22.20 False \n",
"4 20.09 True "
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"df_subset = pd.read_csv(\"..//..//static//csv//neo.csv\")\n",
"df = df_subset.head(15000)\n",
"print(df.columns)\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Бизнес-цели:\n",
"\n",
"1. Повышение безопасности планеты от потенциальных угроз космических объектов.\n",
"2. Оптимизация исследования космических объектов для использования в коммерческих или исследовательских миссиях."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Цели технического проекта:\n",
"\n",
"Для 1-й бизнес-цели: \n",
" * Создать веб-приложение или API, которое принимает параметры объекта и прогнозирует, опасен ли он для Земли.\n",
" * Модель может использоваться в системах мониторинга космических объектов для предоставления оперативных оценок и предупреждений.\n",
" * Включение автоматической системы оповещения для НАСА и других космических агентств с обновлениями по объектам, представляющим угрозу.\n",
"\n",
"Для 2-й бизнес-цели:\n",
" * Разработка модели, которая позволяет астрономам и специалистам по космосу загружать данные о новых объектах и получать предсказания о расстоянии их ближайшего сближения с Землей.\n",
" * Создание системы мониторинга с графическим интерфейсом, отображающим траектории движения объектов и предполагаемые даты и расстояния их ближайших подходов.\n",
" * Реализация системы оповещений на основе пороговых значений расстояний для идентификации особо опасных сближений."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проверим датасет на пропущенные значения:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id Процент пустых значений: %0.00\n",
"name Процент пустых значений: %0.00\n",
"est_diameter_min Процент пустых значений: %0.00\n",
"est_diameter_max Процент пустых значений: %0.00\n",
"relative_velocity Процент пустых значений: %0.00\n",
"miss_distance Процент пустых значений: %0.00\n",
"orbiting_body Процент пустых значений: %0.00\n",
"sentry_object Процент пустых значений: %0.00\n",
"absolute_magnitude Процент пустых значений: %0.00\n",
"hazardous Процент пустых значений: %0.00\n",
"id 0\n",
"name 0\n",
"est_diameter_min 0\n",
"est_diameter_max 0\n",
"relative_velocity 0\n",
"miss_distance 0\n",
"orbiting_body 0\n",
"sentry_object 0\n",
"absolute_magnitude 0\n",
"hazardous 0\n",
"dtype: int64\n"
]
},
{
"data": {
"text/plain": [
"id False\n",
"name False\n",
"est_diameter_min False\n",
"est_diameter_max False\n",
"relative_velocity False\n",
"miss_distance False\n",
"orbiting_body False\n",
"sentry_object False\n",
"absolute_magnitude False\n",
"hazardous False\n",
"dtype: bool"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
" print(f'{i} Процент пустых значений: %{null_rate:.2f}')\n",
"\n",
"print(df.isnull().sum())\n",
"\n",
"df.isnull().any()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Нулевых значений нет\n",
"\n",
"Разобьём набор на 3 классических выборки: обучающую, тестовую и контрольную"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: (9000, 9)\n",
"Размер контрольной выборки: (3000, 9)\n",
"Размер тестовой выборки: (3000, 9)\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"df = df.dropna()\n",
"df = df.drop_duplicates()\n",
"\n",
"X = df.drop(columns=['absolute_magnitude'])\n",
"y = df['absolute_magnitude']\n",
"\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"\n",
"X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)\n",
"\n",
"print(\"Размер обучающей выборки:\", X_train.shape)\n",
"print(\"Размер контрольной выборки:\", X_val.shape)\n",
"print(\"Размер тестовой выборки:\", X_test.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Построим несколько столбчатых диаграмм для визуализации распределения:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkIAAAHHCAYAAABTMjf2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB7v0lEQVR4nO3dd3hTdd8G8DtJk3TvkZaWtmxKC4WyCgoICLJEwAGi4ALUggoPDpQlDhR9FUUcPCqggCgCDgRkI0LZFAqUsgqF7r1n8nv/KMlDaAttaXvS5P5cVy7IOSfnfHNOcvrNb8qEEAJEREREFkgudQBEREREUmEiRERERBaLiRARERFZLCZCREREZLGYCBEREZHFYiJEREREFouJEBEREVksJkJERERksZgIERFRvRBCIDMzExcuXJA6FKpnOp0O6enpuHz5stSh1DsmQkT14KmnnoK9vX2jHvPKlSuQyWRYsWJFox7XUsyfPx8ymazRjieTyTB//vxGO159ycvLw+zZs9G2bVuoVCq4ubmhTZs2iI2NlTq0JuHff//Fnj17DM/37NmD/fv3SxfQTZKTk/HKK6/A398fKpUKHh4eCAoKQm5urtSh1SsrqQNoTCtWrMDTTz9teK5Wq9G8eXMMGjQIc+bMgZeXl4TRETWewsJCLFq0CP369UO/fv2kDqfJeP/99xEUFISHHnpI6lBMQkZGBvr27Yv4+HhMmzYNvXv3hkqlglKpREBAgNThNQnXrl3DO++8g59//hkA8OKLL2Lu3LkSRwVcvHgR9913H8rKyvDSSy+hS5cusLKygo2NDezs7KQOr15ZVCKkt2DBAgQGBqK4uBj//vsvvvrqK2zevBmnT5+Gra2t1OERNbjCwkK8/fbbAMBEqBqzZ8/GG2+8YbTs/fffx8MPP8xE6IZXX30VSUlJiIyMRIcOHaQOp0kaPXo0Fi9ejI4dOwIAwsPDMXr0aImjAqZMmQKVSoWDBw+iWbNmUofToCwyERoyZAi6du0KAHjuuefg5uaGTz75BL///jvGjRsncXREZAqsrKxgZWWRt8gaSU1NxcqVK/H1118zCboLarUaBw4cwOnTpwEAwcHBUCgUksZ07Ngx7Nq1C9u2bTP7JAhgGyEAQP/+/QEAcXFxAIDMzEzMnDkTISEhsLe3h6OjI4YMGYKTJ09Wem1xcTHmz5+PNm3awNraGt7e3hg9ejQuXboE4H/tOKp73PxrfM+ePZDJZPj555/x5ptvQqPRwM7ODg8++CCuXbtW6diHDh3CAw88ACcnJ9ja2qJv377V1i3369evyuNX1SZh1apVCAsLg42NDVxdXTF27Ngqj3+793YznU6HxYsXo0OHDrC2toaXlxemTJmCrKwso+0CAgIwfPjwSseZOnVqpX1WFftHH31U6ZwCQElJCebNm4dWrVpBrVbDz88Pr732GkpKSqo8Vzfbt28fHnnkETRv3tzw2unTp6OoqKjK7S9fvozBgwfDzs4OPj4+WLBgAYQQRtusXbsWYWFhcHBwgKOjI0JCQvDZZ59V2s8jjzwCV1dX2NraomfPnvjrr7/uGG91VV1PPfWUoariypUr8PDwAAC8/fbbVX4Wzp07h4cffhiurq6wtrZG165d8ccff9zx+LeSyWSYOnUq1q1bh6CgINjY2CA8PBzR0dEAgG+++QatWrWCtbU1+vXrhytXrhi9vjbnX38Ma2trBAcHY+PGjUbvW//eZTIZPv74YyxbtgwtW7aEWq1Gt27dcOTIEaP93dpGSCaToaCgACtXrjScs6eeeqrS+b3dPoCKz+P06dPh4eEBBwcHPPjgg7h+/XqV5y8hIQHPPPMMvLy8oFar0aFDB3z//ffVnW4jN38fFQoFmjVrhsmTJyM7O/uOry0vL8c777xjOD8BAQF48803jb4zR44cgU6nQ2lpKbp27Qpra2u4ublh3LhxiI+PN2y3fPlyyGQynDhxotJx3n//fSgUCiQkJBhivvV7vWLFCshkMqPPxu+//45hw4bBx8cHarUaLVu2xDvvvAOtVmv02qquy+LFi9GuXTuo1WpoNBpMmTIFmZmZRtv069cPwcHBRss+/vjjSnGkp6dXGXNt7nlPPfUUFAoFOnXqhE6dOmHDhg2QyWQ1qloMCAgwXGO5XA6NRoPHHnvM6Pzf/Jmvzq2f04MHD8La2hqXLl1Chw4dbnuugIrvnv5vhru7O5544gnDNdXTt6O80z2yqraPeXl5CAsLQ2BgIJKSkmp9nu+EP3cAQ9Li5uYGoOKP0G+//YZHHnkEgYGBSElJwTfffIO+ffvi7Nmz8PHxAQBotVoMHz4cO3fuxNixY/Hyyy8jLy8P27dvx+nTp9GyZUvDMcaNG4ehQ4caHXfWrFlVxvPee+9BJpPh9ddfR2pqKhYvXoyBAwciKioKNjY2AIBdu3ZhyJAhCAsLw7x58yCXy7F8+XL0798f+/btQ/fu3Svt19fXFwsXLgQA5Ofn44UXXqjy2HPmzMGjjz6K5557DmlpaViyZAn69OmDEydOwNnZudJrJk+ejHvvvRcAsGHDBmzcuNFo/ZQpUwzts1566SXExcXhiy++wIkTJ7B//34olcoqz0NtZGdnG97bzXQ6HR588EH8+++/mDx5Mtq3b4/o6Gh8+umnOH/+PH777bfb7nfdunUoLCzECy+8ADc3Nxw+fBhLlizB9evXsW7dOqNttVotHnjgAfTs2ROLFi3C1q1bMW/ePJSXl2PBggUAgO3bt2PcuHEYMGAAPvzwQwBATEwM9u/fj5dffhkAkJKSgl69eqGwsBAvvfQS3NzcsHLlSjz44IP49ddfMWrUqLs6Vx4eHvjqq6/wwgsvYNSoUYZieH3R/JkzZ9C7d280a9YMb7zxBuzs7PDLL7/goYcewvr162t9/H379uGPP/5AREQEAGDhwoUYPnw4XnvtNXz55Zd48cUXkZWVhUWLFuGZZ57Brl27DK+t6fn/66+/8NhjjyEkJAQLFy5EVlYWnn322Wp/za5ZswZ5eXmYMmUKZDIZFi1ahNGjR+Py5cvVfh5//PFHPPfcc+jevTsmT54MAEbf8Zp67rnnsGrVKjz++OPo1asXdu3ahWHDhlXaLiUlBT179jQkkx4eHtiyZQueffZZ5Obm4pVXXrnjsfTXt7y8HJGRkVi2bBmKiorw448/3jHGlStX4uGHH8Z//vMfHDp0CAsXLkRMTIzh+52RkQGg4odKWFgYPvjgA6SlpeHzzz/Hv//+ixMnTsDd3R0PP/wwIiIisHr1anTu3NnoOKtXr0a/fv1qXeqwYsUK2NvbY8aMGbC3t8euXbswd+5c5Obm4qOPPqr2de+//z7eeust9OnTBxEREYZ70aFDh3Do0CGo1epaxVGdut7zysvL8dZbb9XqWPfeey8mT54MnU6H06dPY/HixUhMTMS+ffvqHH9GRgaKi4vxwgsvoH///nj++edx6dIlLF26tNK50r/Pbt26YeHChUhJScFnn32G/fv3V/qbUZN75K3KysowZswYxMfHY//+/fD29jasq7e/LcKCLF++XAAQO3bsEGlpaeLatWti7dq1ws3NTdjY2Ijr168LIYQoLi4WWq3W6LVxcXFCrVaLBQsWGJZ9//33AoD45JNPKh1Lp9MZXgdAfPTRR5W26dChg+jbt6/h+e7duwUA0axZM5Gbm2tY/ssvvwgA4rPPPjPsu3Xr1mLw4MGG4wghRGFhoQgMDBT3339/pWP16tVLBAcHG56npaUJAGLevHmGZVeuXBEKhUK89957Rq+Njo4WVlZWlZZfuHBBABArV640LJs3b564+WO1b98+AUCsXr3a6LVbt26ttNzf318MGzasUuwRERHi1o/qrbG/9tprwtPTU4SFhRmd0x9//FHI5XKxb98+o9d//fXXAoDYv39/pePdrLCwsNKyhQsXCplMJq5evWpYNnHiRAFATJs2zbBMp9OJYcOGCZVKJdLS0oQQQrz88svC0dFRlJeXV3vMV155RQAwijkvL08EBgaKgIAAw2dT/9lavny5Ybu+ffsavf+b4/P39zc8r+r66w0YMECEhISI4uJio/f
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkkAAAHHCAYAAACr0swBAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACBP0lEQVR4nO3deVxUVf8H8M+dAYZ932UVEFBwwyWt1NTcl9QWS9PKtEXbbHuszKynzOoprWzx95Tak2WpqS3ua2a47woICCLINiD7PnN+fyATI8MqcAf4vF+veencbb73cu+d75xz7jmSEEKAiIiIiPQo5A6AiIiIyBgxSSIiIiIygEkSERERkQFMkoiIiIgMYJJEREREZACTJCIiIiIDmCQRERERGcAkiYiIiMgAE7kDICIiorarpKQE2dnZMDExgaurq9zhNCuWJBE1wiOPPAJra+tW/czExERIkoTVq1e36ud2FG+99RYkSWq1z5MkCW+99VarfR61XZ9//jlycnJ075ctW4bCwkL5Aqpm9+7dmDBhAuzt7WFhYYFOnTrhueeekzusZtcuk6TVq1dDkiTdy9zcHF26dMG8efOQnp4ud3hEraaoqAhvvfUW9u/fL3cobcp7772HzZs3yx0GdXC//fYb3nrrLVy9ehVr167FwoULYWFhIXdY+OKLLzBy5Ejk5uZi+fLl2LVrF3bt2oW3335b7tCaXbuubnv77bfh7++PkpIS/PXXX/jyyy+xdetWnD9/HpaWlnKHR9TiioqKsHjxYgDAkCFD5A3GSL3xxhv417/+pTftvffew7333ot77rlHnqCIALz22muYMGECli9fDoVCgf/85z9QKOQt24iNjcX8+fMxZ84cfPHFF61aCiuHdp0kjR49Gn369AEAPP7443BycsLHH3+MLVu24MEHH5Q5OiIyBiYmJjAxade3QmqjBg8ejCtXriAqKgre3t7w8vKSOyR8+umncHd3x6efftruEySgnVa31Wbo0KEAgISEBABAdnY2XnrpJYSHh8Pa2hq2trYYPXo0zpw5U2PdkpISvPXWW+jSpQvMzc3h4eGByZMnIz4+HsA/7UZqe1X/Fb9//35IkoSffvoJr732Gtzd3WFlZYUJEybg6tWrNT77yJEjGDVqFOzs7GBpaYnBgwfj0KFDBvdxyJAhBj/fUBuI77//HhEREbCwsICjoyOmTp1q8PPr2rfqtFotli1bhm7dusHc3Bxubm544okncP36db3l/Pz8MG7cuBqfM2/evBrbNBT7hx9+WOOYAkBpaSkWLVqEwMBAqFQqeHt745VXXkFpaanBY1XdwYMHcd9998HHx0e37gsvvIDi4mKDy1++fBkjR46ElZUVPD098fbbb0MIobfMunXrEBERARsbG9ja2iI8PBzLly+vsZ377rsPjo6OsLS0xG233YY//vij3niHDBlisGTokUcegZ+fH4DKv5uLiwsAYPHixQbPhejoaNx7771wdHSEubk5+vTpg19//bXez7+ZJEmYN28e1q9fj65du8LCwgIDBgzAuXPnAABff/01AgMDYW5ujiFDhiAxMVFv/cYc/6rPMDc3R1hYGDZt2qS331X7LkkSPvroI6xcuRIBAQFQqVTo27cvjh07pre9m9skSZKEwsJCrFmzRnfMHnnkkRrHt65tAJXn4wsvvAAXFxfY2NhgwoQJSE5ONnj8UlJS8Nhjj8HNzQ0qlQrdunXDt99+W9vh1lP9elQqlejUqRPmzJmj15alrnWrnw8VFRUYM2YMHB0dcfHiRb3p77zzju44+vn54bXXXqtxbTX02q7rXln92m7svXL9+vW6e5qzszOmT5+OlJSUeo9b9Vf1c7PqvK5LVYwbNmyoMc/a2lp37lRpyDVftc39+/fD3t4eAwYMgJeXF8aOHdugNm1V61e9VCoVunTpgiVLlujdp6rOW7VaXeu2/Pz89Pbh8OHDiIiIwNNPP607X8PCwvB///d/NdYtLCzEiy++CG9vb6hUKgQHB+Ojjz6qca+sOs5r165FcHAwzM3NERERgT///FNvOUPX2b59+6BSqfDkk0/qTb+Va6q6DvXzqSqhcXJyAlB5sm7evBn33Xcf/P39kZ6ejq+//hqDBw/GxYsX4enpCQDQaDQYN24c9uzZg6lTp+K5555Dfn4+du3ahfPnzyMgIED3GQ8++CDGjBmj97kLFiwwGM+7774LSZLw6quvIiMjA8uWLcPw4cNx+vRpXb3z3r17MXr0aERERGDRokVQKBRYtWoVhg4dioMHD6Jfv341tuvl5YUlS5YAAAoKCvDUU08Z/OyFCxfi/vvvx+OPP47MzEx89tlnGDRoEE6dOgV7e/sa68yZMwd33nknAOCXX37Bpk2b9OY/8cQTWL16NR599FE8++yzSEhIwOeff45Tp07h0KFDMDU1NXgcGiMnJ0e3b9VptVpMmDABf/31F+bMmYPQ0FCcO3cOn3zyCS5dulRv+5L169ejqKgITz31FJycnHD06FF89tlnSE5Oxvr16/WW1Wg0GDVqFG677TZ88MEH2L59OxYtWoSKigpdnfyuXbvw4IMPYtiwYVi6dCkAICoqCocOHdI1bkxPT8fAgQNRVFSEZ599Fk5OTlizZg0mTJiADRs2YNKkSbd0rFxcXPDll1/iqaeewqRJkzB58mQAQPfu3QEAFy5cwO23345OnTrhX//6F6ysrPDzzz/jnnvuwcaNGxv9+QcPHsSvv/6KuXPnAgCWLFmCcePG4ZVXXsEXX3yBp59+GtevX8cHH3yAxx57DHv37tWt29Dj/8cff+CBBx5AeHg4lixZguvXr2PWrFno1KmTwZh++OEH5Ofn44knnoAkSfjggw8wefJkXL58udbz8X//+x8ef/xx9OvXD3PmzAEAvWu8oR5//HF8//33eOihhzBw4EDs3bsXY8eOrbFceno6brvtNt0XhYuLC7Zt24ZZs2YhLy8Pzz//fL2fVfX3raioQGRkJFauXIni4mL873//a3TM+/fvx65du9C1a1e96WvWrMG9996LF198EUeOHMGSJUsQFRVV4z7QENXjOnjwIFauXIlPPvkEzs7OAAA3Nze95Rtyr6y69/Tt2xdLlixBeno6li9fjkOHDtV6T6t+XVTF0ZJu5Zr/888/sXXr1kZ93muvvYbQ0FAUFxfrEk1XV1fMmjWryfuQlZWF48ePw8TEBHPnzkVAQAA2b96MOXPmICsrS1d1LYTAhAkTsG/fPsyaNQs9e/bEjh078PLLLyMlJQWffPKJ3nYPHDiAn376Cc8++yxUKhW++OILjBo1CkePHkVYWJjBWM6cOYN77rkHY8aMwYoVK3TTm+Oa0hHt0KpVqwQAsXv3bpGZmSmuXr0q1q1bJ5ycnISFhYVITk4WQghRUlIiNBqN3roJCQlCpVKJt99+Wzft22+/FQDExx9/XOOztFqtbj0A4sMPP6yxTLdu3cTgwYN17/ft2ycAiE6dOom8vDzd9J9//lkAEMuXL9dtOygoSIwcOVL3OUIIUVRUJPz9/cXdd99d47MGDhwowsLCdO8zMzMFALFo0SLdtMTERKFUKsW7776rt+65c+eEiYlJjemxsbECgFizZo1u2qJFi0T10+fgwYMCgFi7dq3eutu3b68x3dfXV4wdO7ZG7HPnzhU3n5I3x/7KK68IV1dXERERoXdM//e//wmFQiEOHjyot/5XX30lAIhDhw7V+LzqioqKakxbsmSJkCRJXLlyRTdt5syZAoB45plndNO0Wq0YO3asMDMzE5mZmUIIIZ577jlha2srKioqav3M559/XgDQizk/P1/4+/sLPz8/3blZdW6tWrVKt9zgwYP19r96fL6+vrr3hv7+VYYNGybCw8NFSUmJ3r4MHDhQBAUF1Rq3IQCESqUSCQkJumlff/21ACDc3d31zvMFCxYIAHrLNvT4h4eHCy8vL5Gfn6+btn//fgFAb7+rjpmTk5PIzs7WTd+yZYsAIH777TfdtJvPZSGEsLKyEjNnzqwR083Ht7ZtnD59WgAQTz/9tN5yDz30UI2/x6xZs4SHh4dQq9V6y06dOlXY2dkZPDb
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAj4AAAHHCAYAAAC/R1LgAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACCp0lEQVR4nO3dd3xT1f8/8FeStunee5e2lEILhSLIRkCGgLJEBGWIgAounKjI8PMR11dRRJGfCjhRRAQHe8koq2wo0JYO6E73Hsn5/VGaD6EttKXtTZPX8/HIA3Jz77nvc5ObvnPuOefKhBACREREREZALnUARERERK2FiQ8REREZDSY+REREZDSY+BAREZHRYOJDRERERoOJDxERERkNJj5ERERkNJj4EBERkdEwkToAIiIikkZFRQVycnKg0Wjg6ekpdTitgi0+RK1s+vTpsLa2btV9JiYmQiaTYe3ata26X2OxePFiyGSyVtufTCbD4sWLW21/1Hg//PADEhMTtc/Xrl2LlJQU6QK6yYkTJzB58mQ4OztDqVTCw8MD48ePlzqsVsPEpx5r166FTCbTPszNzdG+fXvMmzcPGRkZUodH1GpKSkqwePFi7Nu3T+pQ2pR3330Xf/zxh9RhkEQOHDiAV199FYmJidi+fTvmzp0LuVz6P7mbN29G3759cfHiRfz3v//Fzp07sXPnTnz11VdSh9ZqeKnrDpYuXYqAgACUlZXh4MGD+PLLL/HPP//g/PnzsLS0lDo8ohZXUlKCJUuWAAAGDhwobTB66q233sLrr7+us+zdd9/FhAkTMGbMGGmCIkm9+OKLGDhwIAICAgAA8+fPh4eHh6Qx5eTk4Mknn8SwYcOwYcMGmJmZSRqPVJj43MGIESPQvXt3AMCTTz4JJycnfPzxx9i8eTMeffRRiaMjIn1gYmICExN+ndL/dOjQAfHx8Th//jycnZ0RGBgodUhYs2YNysrKsHbtWqNNegBe6mq0QYMGAQASEhIAVGfQL7/8MsLDw2FtbQ1bW1uMGDECZ86cqbVtWVkZFi9ejPbt28Pc3BweHh4YN24c4uPjAfyvH0Z9j5t/be/btw8ymQy//PIL3njjDbi7u8PKygoPPvggrl27VmvfR48exfDhw2FnZwdLS0sMGDAAhw4dqrOOAwcOrHP/dfUp+OGHHxAZGQkLCws4Ojpi0qRJde7/dnW7mUajwfLly9GpUyeYm5vDzc0Nc+bMQW5urs56/v7+GDVqVK39zJs3r1aZdcX+4Ycf1jqmAFBeXo5FixYhKCgISqUSPj4+ePXVV1FeXl7nsbrZgQMH8PDDD8PX11e77YsvvojS0tI617969SqGDRsGKysreHp6YunSpRBC6Kyzfv16REZGwsbGBra2tggPD8enn35aq5yHH34Yjo6OsLS0xL333ou///77jvEOHDiwzhac6dOnw9/fH0D1++bi4gIAWLJkSZ2fhUuXLmHChAlwdHSEubk5unfvji1bttxx/7eSyWSYN28eNmzYgI4dO8LCwgK9evXCuXPnAABfffUVgoKCYG5ujoEDB+r0nwAad/xr9mFubo6wsDBs2rRJp941dZfJZPjoo4+wevVqBAYGQqlU4p577sHx48d1yru1j49MJkNxcTHWrVunPWbTp0+vdXxvVwZQ/Xl88cUX4eLiAhsbGzz44IO4fv16nccvJSUFTzzxBNzc3KBUKtGpUyd8++239R1uHTefjwqFAl5eXpg9ezby8vIavN2dvrMac2798MMP6NGjBywtLeHg4ID+/ftjx44dAKrP/dvt8+ZjW1xcjJdeegk+Pj5QKpUICQnBRx99VOs8a2j9MzMzMXPmTLi5ucHc3BxdunTBunXrdNa5uT+dlZUVevbsicDAQMydO1fnc1CfW78rTU1N4e/vj1deeQUVFRXa9Wq6Y5w4caLesm49x48cOYKIiAi8++672mMSHByM9957DxqNRmfbqqoqvPPOO9rPvb+/P954441a71fNd/GOHTsQEREBc3NzdOzYEb///rvOejXx3nzeXrhwAQ4ODhg1ahSqqqq0y/Py8vDCCy9oYwwKCsL7779fK8am4k+URqpJUpycnABU/9H5448/8PDDDyMgIAAZGRn46quvMGDAAFy8eFHbS16tVmPUqFHYvXs3Jk2ahOeffx6FhYXYuXMnzp8/r/Nr4NFHH8UDDzygs98FCxbUGc9///tfyGQyvPbaa8jMzMTy5csxZMgQnD59GhYWFgCAPXv2YMSIEYiMjMSiRYsgl8uxZs0aDBo0CAcOHECPHj1qlevt7Y1ly5YBAIqKivD000/Xue+FCxdi4sSJePLJJ5GVlYUVK1agf//+OHXqFOzt7WttM3v2bPTr1w8A8Pvvv2PTpk06r8+ZMwdr167FjBkz8NxzzyEhIQGff/45Tp06hUOHDsHU1LTO49AYeXl52rrdTKPR4MEHH8TBgwcxe/ZshIaG4ty5c/jkk09w5cqVO/bX2LBhA0pKSvD000/DyckJx44dw4oVK3D9+nVs2LBBZ121Wo3hw4fj3nvvxQcffIBt27Zh0aJFqKqqwtKlSwEAO3fuxKOPPorBgwfj/fffBwDExMTg0KFDeP755wEAGRkZ6N27N0pKSvDcc8/ByckJ69atw4MPPojffvsNY8eOvatj5eLigi+//BJPP/00xo4di3HjxgEAOnfuDKD6i6tPnz7w8vLC66+/DisrK/z6668YM2YMNm7c2Oj9HzhwAFu2bMHcuXMBAMuWLcOoUaPw6quv4osvvsAzzzyD3NxcfPDBB3jiiSewZ88e7bYNPf5///03HnnkEYSHh2PZsmXIzc3FzJkz4eXlVWdMP/30EwoLCzFnzhzIZDJ88MEHGDduHK5evVrv5/H777/Hk08+iR49emD27NkA0KRf/E8++SR++OEHTJ48Gb1798aePXswcuTIWutlZGTg3nvv1SaPLi4u2Lp1K2bOnImCggK88MILd9xXzftbVVWFqKgorF69GqWlpfj+++/r3ebm1w4cOIDVq1fjk08+gbOzMwDAzc0NQOPOrSVLlmDx4sXo3bs3li5dCjMzMxw9ehR79uzB0KFDsXz5chQVFQGoPh/effddvPHGGwgNDQUA7cABIQQefPBB7N27FzNnzkRERAS2b9+OV155BSkpKfjkk08aVf/S0lIMHDgQcXFxmDdvHgICArBhwwZMnz4deXl52nOyLnFxcfh//+//3fE9uFnNd2V5eTm2b9+Ojz76CObm5njnnXcaVc7NsrOzcfDgQRw8eBBPPPEEIiMjsXv3bixYsACJiYlYtWqVdt0nn3wS69atw4QJE/DSSy/h6NGjWLZsGWJiYmp9b8fGxuKRRx7BU089hWnTpmHNmjV4+OGHsW3bNtx///11xnLt2jUMHz4cHTp0wK+//qptMS0pKcGAAQOQkpKCOXPmwNfXF4cPH8aCBQuQlpaG5cuXN7n+WoLqtGbNGgFA7Nq1S2RlZYlr166J9evXCycnJ2FhYSGuX78uhBCirKxMqNVqnW0TEhKEUqkUS5cu1S779ttvBQDx8ccf19qXRqPRbgdAfPjhh7XW6dSpkxgwYID2+d69ewUA4eXlJQoKCrTLf/31VwFAfPrpp9qyg4ODxbBhw7T7EUKIkpISERAQIO6///5a++rdu7cICwvTPs/KyhIAxKJFi7TLEhMThUKhEP/97391tj137pwwMTGptTw2NlYAEOvWrdMuW7Rokbj5I3jgwAEBQPz44486227btq3Wcj8/PzFy5Mhasc+dO1fc+rG+NfZXX31VuLq6isjISJ1j+v333wu5XC4OHDigs/2qVasEAHHo0KFa+7tZSUlJrWXLli0TMplMJCUlaZdNmzZNABDPPvusdplGoxEjR44UZmZmIisrSwghxPPPPy9sbW1FVVVVvft84YUXBACdmAsLC0VAQIDw9/fXfjZrPltr1qzRrjdgwACd+t8cn5+fn/Z5Xe9/jcGDB4vw8HBRVlamU5fevXuL4ODgeuOuCwChVCpFQkKCdtlXX30lAAh3d3edz/mCBQsEAJ11G3r8w8PDhbe3tygsLNQu27dvnwC
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"train_data = pd.DataFrame({'absolute_magnitude': y_train})\n",
"val_data = pd.DataFrame({'absolute_magnitude': y_val})\n",
"test_data = pd.DataFrame({'absolute_magnitude': y_test})\n",
"\n",
"sns.histplot(train_data['absolute_magnitude'], kde=True)\n",
"plt.title('Распределение absolute_magnitude в обучающей выборке')\n",
"plt.show()\n",
"\n",
"sns.histplot(val_data['absolute_magnitude'], kde=True)\n",
"plt.title('Распределение absolute_magnitude в контрольной выборке')\n",
"plt.show()\n",
"\n",
"sns.histplot(test_data['absolute_magnitude'], kde=True)\n",
"plt.title('Распределение absolute_magnitude в тестовой выборке')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Конструирование признаков\n",
"\n",
"**Унитарное кодирование**\n",
"\n",
"Унитарное кодирование категориальных признаков (one-hot encoding). Преобразуем категориальные признаки в бинарные векторы.\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" id name est_diameter_min est_diameter_max \\\n",
"0 2162635 162635 (2000 SS164) 1.198271 2.679415 \n",
"1 2277475 277475 (2005 WK4) 0.265800 0.594347 \n",
"2 2512244 512244 (2015 YE18) 0.722030 1.614507 \n",
"3 3596030 (2012 BV13) 0.096506 0.215794 \n",
"4 3667127 (2014 GE35) 0.255009 0.570217 \n",
"\n",
" relative_velocity miss_distance absolute_magnitude hazardous \\\n",
"0 13569.249224 5.483974e+07 16.73 False \n",
"1 73588.726663 6.143813e+07 20.00 True \n",
"2 114258.692129 4.979872e+07 17.83 False \n",
"3 24764.303138 2.543497e+07 22.20 False \n",
"4 42737.733765 4.627557e+07 20.09 True \n",
"\n",
" orbiting_body_Earth sentry_object_False \n",
"0 True True \n",
"1 True True \n",
"2 True True \n",
"3 True True \n",
"4 True True \n"
]
}
],
"source": [
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"df = pd.read_csv(\"..//..//static//csv//neo.csv\")\n",
"\n",
"categorical_columns = ['orbiting_body', 'sentry_object']\n",
"\n",
"df_encoded = pd.get_dummies(df, columns=categorical_columns)\n",
"\n",
"print(df_encoded.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Дискретизация числовых признаков**\n",
"\n",
"Процесс преобразования непрерывных числовых значений в дискретные категории или интервалы (бины)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" miss_distance miss_distance_binned\n",
"0 5.483974e+07 (44881889.084, 59840270.268]\n",
"1 6.143813e+07 (59840270.268, 74798651.452]\n",
"2 4.979872e+07 (44881889.084, 59840270.268]\n",
"3 2.543497e+07 (14965126.716, 29923507.9]\n",
"4 4.627557e+07 (44881889.084, 59840270.268]\n",
" absolute_magnitude absolute_magnitude_binned\n",
"0 16.73 (9.229000000000001, 21.34]\n",
"1 20.00 (9.229000000000001, 21.34]\n",
"2 17.83 (9.229000000000001, 21.34]\n",
"3 22.20 (21.34, 23.7]\n",
"4 20.09 (9.229000000000001, 21.34]\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"df['miss_distance_binned'] = pd.cut(df['miss_distance'], bins=5)\n",
"\n",
"df['absolute_magnitude_binned'] = pd.qcut(df['absolute_magnitude'], q=4)\n",
"\n",
"print(df[['miss_distance', 'miss_distance_binned']].head())\n",
"print(df[['absolute_magnitude', 'absolute_magnitude_binned']].head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Ручной синтез**\n",
"\n",
"Создание новых признаков на основе экспертных знаний и логики предметной области. В нашем случае можно задействовать расстояния объекта от Земли и скорость движения объекта, синтезировав новый признак - \"скорость в сравнении с расстоянием\". Этот признак показывает, что объект может быть более опасным, если его скорость велика, а расстояние до Земли — маленькое."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Создание нового признака 'Speed VS Distance'\n",
"df['high_risk'] = ((df['miss_distance'] < threshold_distance) & (df['relative_velocity'] > threshold_velocity)).astype(int)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Масштабирование признаков**\n",
"\n",
"Процесс преобразования числовых признаков таким образом, чтобы они имели одинаковый масштаб. Это важно для многих алгоритмов машинного обучения, которые чувствительны к масштабу признаков, таких как линейная регрессия, метод опорных векторов (SVM) и нейронные сети."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'df_encoded' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[6], line 7\u001b[0m\n\u001b[0;32m 4\u001b[0m numerical_features \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmiss_distance\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mabsolute_magnitude\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[0;32m 6\u001b[0m scaler \u001b[38;5;241m=\u001b[39m StandardScaler()\n\u001b[1;32m----> 7\u001b[0m df_encoded[numerical_features] \u001b[38;5;241m=\u001b[39m scaler\u001b[38;5;241m.\u001b[39mfit_transform(\u001b[43mdf_encoded\u001b[49m[numerical_features])\n\u001b[0;32m 8\u001b[0m df_encoded[numerical_features] \u001b[38;5;241m=\u001b[39m scaler\u001b[38;5;241m.\u001b[39mtransform(df_encoded[numerical_features])\n\u001b[0;32m 9\u001b[0m df_encoded[numerical_features] \u001b[38;5;241m=\u001b[39m scaler\u001b[38;5;241m.\u001b[39mtransform(df_encoded[numerical_features])\n",
"\u001b[1;31mNameError\u001b[0m: name 'df_encoded' is not defined"
]
}
],
"source": [
"from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
"\n",
"# Пример масштабирования числовых признаков\n",
"numerical_features = ['miss_distance', 'absolute_magnitude']\n",
"\n",
"scaler = StandardScaler()\n",
"df_encoded[numerical_features] = scaler.fit_transform(df_encoded[numerical_features])\n",
"df_encoded[numerical_features] = scaler.transform(df_encoded[numerical_features])\n",
"df_encoded[numerical_features] = scaler.transform(df_encoded[numerical_features])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"e:\\Aim\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"e:\\Aim\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" est_diameter_min est_diameter_max relative_velocity miss_distance \\\n",
"id \n",
"1 1.198271 2.679415 13569.249224 5.483974e+07 \n",
"2 0.265800 0.594347 73588.726663 6.143813e+07 \n",
"3 0.722030 1.614507 114258.692129 4.979872e+07 \n",
"4 0.096506 0.215794 24764.303138 2.543497e+07 \n",
"5 0.255009 0.570217 42737.733765 4.627557e+07 \n",
"\n",
" orbiting_body sentry_object absolute_magnitude hazardous \n",
"id \n",
"1 Earth False 16.73 False \n",
"2 Earth False 20.00 True \n",
"3 Earth False 17.83 False \n",
"4 Earth False 22.20 False \n",
"5 Earth False 20.09 True \n",
" est_diameter_min est_diameter_max relative_velocity miss_distance \\\n",
"id \n",
"17465 0.265800 0.594347 6639.199305 7.248720e+07 \n",
"10057 0.023150 0.051765 66065.475247 2.182677e+07 \n",
"6905 0.148784 0.332690 35092.567329 6.261058e+07 \n",
"40989 0.007321 0.016370 24301.494107 2.765938e+06 \n",
"23499 0.044112 0.098637 33502.608133 7.025798e+07 \n",
"\n",
" orbiting_body sentry_object absolute_magnitude hazardous \n",
"id \n",
"17465 Earth False 20.00 False \n",
"10057 Earth False 25.30 False \n",
"6905 Earth False 21.26 False \n",
"40989 Earth False 27.80 False \n",
"23499 Earth False 23.90 False \n",
" est_diameter_min est_diameter_max relative_velocity miss_distance \\\n",
"id \n",
"66148 0.020163 0.045086 24899.946486 7.427192e+06 \n",
"68694 0.175612 0.392681 67322.863166 3.526971e+07 \n",
"17013 0.031809 0.071128 20216.336390 5.832689e+07 \n",
"69199 0.007321 0.016370 40616.528788 2.591562e+07 \n",
"45632 0.199781 0.446725 86281.198262 6.763452e+07 \n",
"\n",
" orbiting_body sentry_object absolute_magnitude hazardous \n",
"id \n",
"66148 Earth False 25.60 False \n",
"68694 Earth False 20.90 True \n",
"17013 Earth False 24.61 False \n",
"69199 Earth False 27.80 False \n",
"45632 Earth False 20.62 True \n"
]
}
],
"source": [
"import pandas as pd\n",
"import featuretools as ft\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"df = pd.read_csv(\"..//..//static//csv//neo.csv\")\n",
"\n",
"df['id'] = range(1, len(df) + 1)\n",
"\n",
"df = df.drop_duplicates()\n",
"\n",
"es = ft.EntitySet(id='objects_data')\n",
"\n",
"es = es.add_dataframe(\n",
" dataframe_name='objects',\n",
" dataframe=df,\n",
" index='id'\n",
")\n",
"\n",
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='objects', max_depth=1)\n",
"\n",
"print(feature_matrix.head())\n",
"\n",
"train_data, test_data = train_test_split(df, test_size=0.3, random_state=42)\n",
"\n",
"val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)\n",
"\n",
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_data['id'])\n",
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_data['id'])\n",
"\n",
"print(val_feature_matrix.head())\n",
"print(test_feature_matrix.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Оценка качества каждого набора признаков\n",
"\n",
"Представим основные оценки качества наборов признаков: \n",
"\n",
"* Предсказательная способность Метрики: RMSE, MAE, R²\n",
"\n",
" Методы: Обучение модели на обучающей выборке и оценка на контрольной и тестовой выборках.\n",
"\n",
"* Скорость вычисления \n",
"\n",
" Методы: Измерение времени выполнения генерации признаков и обучения модели.\n",
"\n",
"* Надежность \n",
"\n",
" Методы: Кросс-валидация, анализ чувствительности модели к изменениям в данных.\n",
"\n",
"* Корреляция \n",
"\n",
" Методы: Анализ корреляционной матрицы признаков, удаление мультиколлинеарных признаков.\n",
"\n",
"* Цельность \n",
"\n",
" Методы: Проверка логической связи между признаками и целевой переменной, интерпретация результатов модели."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Время обучения модели: 0.06 секунд\n",
"Среднеквадратичная ошибка: 5.08\n"
]
}
],
"source": [
"import time\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.metrics import mean_squared_error\n",
"\n",
"X = feature_matrix.drop('absolute_magnitude', axis=1)\n",
"y = feature_matrix['absolute_magnitude']\n",
"\n",
"X = pd.get_dummies(X, drop_first=True)\n",
"\n",
"X.fillna(X.median(), inplace=True)\n",
"\n",
"X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"model = LinearRegression()\n",
"\n",
"start_time = time.time()\n",
"model.fit(X_train, y_train)\n",
"\n",
"train_time = time.time() - start_time\n",
"\n",
"predictions = model.predict(X_val)\n",
"mse = mean_squared_error(y_val, predictions)\n",
"\n",
"print(f'Время обучения модели: {train_time:.2f} секунд')\n",
"print(f'Среднеквадратичная ошибка: {mse:.2f}')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"e:\\Aim\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RMSE: 0.007747870644321186\n",
"R²: 0.9999928256622078\n",
"MAE: 0.00013519980189125583 \n",
"\n",
"Кросс-валидация RMSE: 0.010153168491376482 \n",
"\n",
"Train RMSE: 0.004358914935336195\n",
"Train R²: 0.999997732046293\n",
"Train MAE: 4.508435629289199e-05\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"e:\\Aim\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n"
]
}
],
"source": [
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.metrics import r2_score, mean_absolute_error\n",
"from sklearn.model_selection import cross_val_score\n",
"\n",
"feature_matrix = feature_matrix.dropna()\n",
"val_feature_matrix = val_feature_matrix.dropna()\n",
"test_feature_matrix = test_feature_matrix.dropna()\n",
"\n",
"X_train = feature_matrix.drop('absolute_magnitude', axis=1)\n",
"y_train = feature_matrix['absolute_magnitude']\n",
"X_val = val_feature_matrix.drop('absolute_magnitude', axis=1)\n",
"y_val = val_feature_matrix['absolute_magnitude']\n",
"X_test = test_feature_matrix.drop('absolute_magnitude', axis=1)\n",
"y_test = test_feature_matrix['absolute_magnitude']\n",
"\n",
"X_test = X_test.reindex(columns=X_train.columns, fill_value=0) \n",
"\n",
"X = pd.get_dummies(X, drop_first=True)\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"model = RandomForestRegressor(random_state=42)\n",
"\n",
"model.fit(X_train, y_train)\n",
"\n",
"y_pred = model.predict(X_test)\n",
"\n",
"rmse = mean_squared_error(y_test, y_pred, squared=False)\n",
"r2 = r2_score(y_test, y_pred)\n",
"mae = mean_absolute_error(y_test, y_pred)\n",
"\n",
"print()\n",
"print(f\"RMSE: {rmse}\")\n",
"print(f\"R²: {r2}\")\n",
"print(f\"MAE: {mae} \\n\")\n",
"\n",
"scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n",
"rmse_cv = (-scores.mean())**0.5\n",
"print(f\"Кросс-валидация RMSE: {rmse_cv} \\n\")\n",
"\n",
"feature_importances = model.feature_importances_\n",
"feature_names = X_train.columns\n",
"\n",
"y_train_pred = model.predict(X_train)\n",
"\n",
"rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)\n",
"r2_train = r2_score(y_train, y_train_pred)\n",
"mae_train = mean_absolute_error(y_train, y_train_pred)\n",
"\n",
"print(f\"Train RMSE: {rmse_train}\")\n",
"print(f\"Train R²: {r2_train}\")\n",
"print(f\"Train MAE: {mae_train}\")\n",
"print()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "aimenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}