824 lines
161 KiB
Plaintext
824 lines
161 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Датасет астероидов"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Выведем записи и столбцы"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 1,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['id', 'name', 'est_diameter_min', 'est_diameter_max',\n",
|
|||
|
" 'relative_velocity', 'miss_distance', 'orbiting_body', 'sentry_object',\n",
|
|||
|
" 'absolute_magnitude', 'hazardous'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th>name</th>\n",
|
|||
|
" <th>est_diameter_min</th>\n",
|
|||
|
" <th>est_diameter_max</th>\n",
|
|||
|
" <th>relative_velocity</th>\n",
|
|||
|
" <th>miss_distance</th>\n",
|
|||
|
" <th>orbiting_body</th>\n",
|
|||
|
" <th>sentry_object</th>\n",
|
|||
|
" <th>absolute_magnitude</th>\n",
|
|||
|
" <th>hazardous</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>2162635</td>\n",
|
|||
|
" <td>162635 (2000 SS164)</td>\n",
|
|||
|
" <td>1.198271</td>\n",
|
|||
|
" <td>2.679415</td>\n",
|
|||
|
" <td>13569.249224</td>\n",
|
|||
|
" <td>5.483974e+07</td>\n",
|
|||
|
" <td>Earth</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>16.73</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>2277475</td>\n",
|
|||
|
" <td>277475 (2005 WK4)</td>\n",
|
|||
|
" <td>0.265800</td>\n",
|
|||
|
" <td>0.594347</td>\n",
|
|||
|
" <td>73588.726663</td>\n",
|
|||
|
" <td>6.143813e+07</td>\n",
|
|||
|
" <td>Earth</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>20.00</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>2512244</td>\n",
|
|||
|
" <td>512244 (2015 YE18)</td>\n",
|
|||
|
" <td>0.722030</td>\n",
|
|||
|
" <td>1.614507</td>\n",
|
|||
|
" <td>114258.692129</td>\n",
|
|||
|
" <td>4.979872e+07</td>\n",
|
|||
|
" <td>Earth</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>17.83</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>3596030</td>\n",
|
|||
|
" <td>(2012 BV13)</td>\n",
|
|||
|
" <td>0.096506</td>\n",
|
|||
|
" <td>0.215794</td>\n",
|
|||
|
" <td>24764.303138</td>\n",
|
|||
|
" <td>2.543497e+07</td>\n",
|
|||
|
" <td>Earth</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>22.20</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>3667127</td>\n",
|
|||
|
" <td>(2014 GE35)</td>\n",
|
|||
|
" <td>0.255009</td>\n",
|
|||
|
" <td>0.570217</td>\n",
|
|||
|
" <td>42737.733765</td>\n",
|
|||
|
" <td>4.627557e+07</td>\n",
|
|||
|
" <td>Earth</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>20.09</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" id name est_diameter_min est_diameter_max \\\n",
|
|||
|
"0 2162635 162635 (2000 SS164) 1.198271 2.679415 \n",
|
|||
|
"1 2277475 277475 (2005 WK4) 0.265800 0.594347 \n",
|
|||
|
"2 2512244 512244 (2015 YE18) 0.722030 1.614507 \n",
|
|||
|
"3 3596030 (2012 BV13) 0.096506 0.215794 \n",
|
|||
|
"4 3667127 (2014 GE35) 0.255009 0.570217 \n",
|
|||
|
"\n",
|
|||
|
" relative_velocity miss_distance orbiting_body sentry_object \\\n",
|
|||
|
"0 13569.249224 5.483974e+07 Earth False \n",
|
|||
|
"1 73588.726663 6.143813e+07 Earth False \n",
|
|||
|
"2 114258.692129 4.979872e+07 Earth False \n",
|
|||
|
"3 24764.303138 2.543497e+07 Earth False \n",
|
|||
|
"4 42737.733765 4.627557e+07 Earth False \n",
|
|||
|
"\n",
|
|||
|
" absolute_magnitude hazardous \n",
|
|||
|
"0 16.73 False \n",
|
|||
|
"1 20.00 True \n",
|
|||
|
"2 17.83 False \n",
|
|||
|
"3 22.20 False \n",
|
|||
|
"4 20.09 True "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 1,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"# Read the NEO table from disk, then keep its first 15000 rows as the working frame.\n",
|
|||
|
"df_subset = pd.read_csv(\"..//..//static//csv//neo.csv\")\n",
|
|||
|
"df = df_subset.iloc[:15000]\n",
|
|||
|
"\n",
|
|||
|
"# Print the column index as text; the preview of the leading rows is the cell result.\n",
|
|||
|
"print(df.columns)\n",
|
|||
|
"df.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Бизнес-цели:\n",
|
|||
|
"\n",
|
|||
|
"1. Повышение безопасности планеты от потенциальных угроз космических объектов.\n",
|
|||
|
"2. Оптимизация исследования космических объектов для использования в коммерческих или исследовательских миссиях."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Цели технического проекта:\n",
|
|||
|
"\n",
|
|||
|
"Для 1-й бизнес-цели: \n",
|
|||
|
" * Создать веб-приложение или API, которое принимает параметры объекта и прогнозирует, опасен ли он для Земли.\n",
|
|||
|
" * Модель может использоваться в системах мониторинга космических объектов для предоставления оперативных оценок и предупреждений.\n",
|
|||
|
" * Включение автоматической системы оповещения для НАСА и других космических агентств с обновлениями по объектам, представляющим угрозу.\n",
|
|||
|
"\n",
|
|||
|
"Для 2-й бизнес-цели:\n",
|
|||
|
" * Разработка модели, которая позволяет астрономам и специалистам по космосу загружать данные о новых объектах и получать предсказания о расстоянии их ближайшего сближения с Землей.\n",
|
|||
|
" * Создание системы мониторинга с графическим интерфейсом, отображающим траектории движения объектов и предполагаемые даты и расстояния их ближайших подходов.\n",
|
|||
|
" * Реализация системы оповещений на основе пороговых значений расстояний для идентификации особо опасных сближений."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Проверим датасет на пропущенные значения:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 2,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"id Процент пустых значений: %0.00\n",
|
|||
|
"name Процент пустых значений: %0.00\n",
|
|||
|
"est_diameter_min Процент пустых значений: %0.00\n",
|
|||
|
"est_diameter_max Процент пустых значений: %0.00\n",
|
|||
|
"relative_velocity Процент пустых значений: %0.00\n",
|
|||
|
"miss_distance Процент пустых значений: %0.00\n",
|
|||
|
"orbiting_body Процент пустых значений: %0.00\n",
|
|||
|
"sentry_object Процент пустых значений: %0.00\n",
|
|||
|
"absolute_magnitude Процент пустых значений: %0.00\n",
|
|||
|
"hazardous Процент пустых значений: %0.00\n",
|
|||
|
"id 0\n",
|
|||
|
"name 0\n",
|
|||
|
"est_diameter_min 0\n",
|
|||
|
"est_diameter_max 0\n",
|
|||
|
"relative_velocity 0\n",
|
|||
|
"miss_distance 0\n",
|
|||
|
"orbiting_body 0\n",
|
|||
|
"sentry_object 0\n",
|
|||
|
"absolute_magnitude 0\n",
|
|||
|
"hazardous 0\n",
|
|||
|
"dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"id False\n",
|
|||
|
"name False\n",
|
|||
|
"est_diameter_min False\n",
|
|||
|
"est_diameter_max False\n",
|
|||
|
"relative_velocity False\n",
|
|||
|
"miss_distance False\n",
|
|||
|
"orbiting_body False\n",
|
|||
|
"sentry_object False\n",
|
|||
|
"absolute_magnitude False\n",
|
|||
|
"hazardous False\n",
|
|||
|
"dtype: bool"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 2,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Share of missing values per column, in percent (mean of the boolean null mask).\n",
|
|||
|
"null_share = df.isnull().mean() * 100\n",
|
|||
|
"for column_name, null_rate in null_share.items():\n",
|
|||
|
"    # The percent sign goes after the number (e.g. 0.00%), not before it.\n",
|
|||
|
"    print(f'{column_name} Процент пустых значений: {null_rate:.2f}%')\n",
|
|||
|
"\n",
|
|||
|
"# Absolute number of missing values per column.\n",
|
|||
|
"print(df.isnull().sum())\n",
|
|||
|
"\n",
|
|||
|
"# Cell result: True for every column that contains at least one missing value.\n",
|
|||
|
"df.isnull().any()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Пропущенных значений нет.\n",
|
|||
|
"\n",
|
|||
|
"Разобьём набор на три классические выборки: обучающую (60 %), контрольную (20 %) и тестовую (20 %)."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: (9000, 9)\n",
|
|||
|
"Размер контрольной выборки: (3000, 9)\n",
|
|||
|
"Размер тестовой выборки: (3000, 9)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"# Discard rows with missing values, then exact duplicate rows, before splitting.\n",
|
|||
|
"df = df.dropna().drop_duplicates()\n",
|
|||
|
"\n",
|
|||
|
"# Target is absolute_magnitude; every other column is kept as a feature.\n",
|
|||
|
"y = df['absolute_magnitude']\n",
|
|||
|
"X = df.drop(columns=['absolute_magnitude'])\n",
|
|||
|
"\n",
|
|||
|
"# First hold out 20% of the rows as the test set ...\n",
|
|||
|
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
|||
|
"    X, y, test_size=0.2, random_state=42\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# ... then take 25% of the remaining rows (20% of all rows) as the validation set.\n",
|
|||
|
"X_train, X_val, y_train, y_val = train_test_split(\n",
|
|||
|
"    X_train, y_train, test_size=0.25, random_state=42\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Report the shape of each feature split: train, validation, test.\n",
|
|||
|
"for split_label, split_features in (\n",
|
|||
|
"    (\"Размер обучающей выборки:\", X_train),\n",
|
|||
|
"    (\"Размер контрольной выборки:\", X_val),\n",
|
|||
|
"    (\"Размер тестовой выборки:\", X_test),\n",
|
|||
|
"):\n",
|
|||
|
"    print(split_label, split_features.shape)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Построим гистограммы распределения целевого признака absolute_magnitude в обучающей, контрольной и тестовой выборках:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 4,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkIAAAHHCAYAAABTMjf2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB7v0lEQVR4nO3dd3hTdd8G8DtJk3TvkZaWtmxKC4WyCgoICLJEwAGi4ALUggoPDpQlDhR9FUUcPCqggCgCDgRkI0LZFAqUsgqF7r1n8nv/KMlDaAttaXvS5P5cVy7IOSfnfHNOcvrNb8qEEAJEREREFkgudQBEREREUmEiRERERBaLiRARERFZLCZCREREZLGYCBEREZHFYiJEREREFouJEBEREVksJkJERERksZgIERFRvRBCIDMzExcuXJA6FKpnOp0O6enpuHz5stSh1DsmQkT14KmnnoK9vX2jHvPKlSuQyWRYsWJFox7XUsyfPx8ymazRjieTyTB//vxGO159ycvLw+zZs9G2bVuoVCq4ubmhTZs2iI2NlTq0JuHff//Fnj17DM/37NmD/fv3SxfQTZKTk/HKK6/A398fKpUKHh4eCAoKQm5urtSh1SsrqQNoTCtWrMDTTz9teK5Wq9G8eXMMGjQIc+bMgZeXl4TRETWewsJCLFq0CP369UO/fv2kDqfJeP/99xEUFISHHnpI6lBMQkZGBvr27Yv4+HhMmzYNvXv3hkqlglKpREBAgNThNQnXrl3DO++8g59//hkA8OKLL2Lu3LkSRwVcvHgR9913H8rKyvDSSy+hS5cusLKygo2NDezs7KQOr15ZVCKkt2DBAgQGBqK4uBj//vsvvvrqK2zevBmnT5+Gra2t1OERNbjCwkK8/fbbAMBEqBqzZ8/GG2+8YbTs/fffx8MPP8xE6IZXX30VSUlJiIyMRIcOHaQOp0kaPXo0Fi9ejI4dOwIAwsPDMXr0aImjAqZMmQKVSoWDBw+iWbNmUofToCwyERoyZAi6du0KAHjuuefg5uaGTz75BL///jvGjRsncXREZAqsrKxgZWWRt8gaSU1NxcqVK/H1118zCboLarUaBw4cwOnTpwEAwcHBUCgUksZ07Ngx7Nq1C9u2bTP7JAhgGyEAQP/+/QEAcXFxAIDMzEzMnDkTISEhsLe3h6OjI4YMGYKTJ09Wem1xcTHmz5+PNm3awNraGt7e3hg9ejQuXboE4H/tOKp73PxrfM+ePZDJZPj555/x5ptvQqPRwM7ODg8++CCuXbtW6diHDh3CAw88ACcnJ9ja2qJv377V1i3369evyuNX1SZh1apVCAsLg42NDVxdXTF27Ngqj3+793YznU6HxYsXo0OHDrC2toaXlxemTJmCrKwso+0CAgIwfPjwSseZOnVqpX1WFftHH31U6ZwCQElJCebNm4dWrVpBrVbDz88Pr732GkpKSqo8Vzfbt28fHnnkETRv3tzw2unTp6OoqKjK7S9fvozBgwfDzs4OPj4+WLBgAYQQRtusXbsWYWFhcHBwgKOjI0JCQvDZZ59V2s8jjzwCV1dX2NraomfPnvjrr7/uGG91VV1PPfWUoariypUr8PDwAAC8/fbbVX4Wzp07h4cffhiurq6wtrZG165d8ccff9zx+LeSyWSYOnUq1q1bh6CgINjY2CA8PBzR0dEAgG+++QatWrWCtbU1+vXrhytXrhi9vjbnX38Ma2trBAcHY+PGjUbvW//eZTIZPv74YyxbtgwtW7aEWq1Gt27dcOTIEaP93dpGSCaToaCgACtXrjScs6eeeqrS+b3dPoCKz+P06dPh4eEBBwcHPPjgg7h+/XqV5y8hIQHPPPMMvLy8oFar0aFDB3z//ffVnW4jN38fFQoFmjVrhsmTJyM7O/uOry0vL8c777xjOD8BAQF48803jb4zR44cgU6nQ2lpKbp27Qpra2u4ublh3LhxiI+PN2y3fPlyyGQynDhxotJx3n//fSgUCiQkJB
hivvV7vWLFCshkMqPPxu+//45hw4bBx8cHarUaLVu2xDvvvAOtVmv02qquy+LFi9GuXTuo1WpoNBpMmTIFmZmZRtv069cPwcHBRss+/vjjSnGkp6dXGXNt7nlPPfUUFAoFOnXqhE6dOmHDhg2QyWQ1qloMCAgwXGO5XA6NRoPHHnvM6Pzf/Jmvzq2f04MHD8La2hqXLl1Chw4dbnuugIrvnv5vhru7O5544gnDNdXTt6O80z2yqraPeXl5CAsLQ2BgIJKSkmp9nu+EP3cAQ9Li5uYGoOKP0G+//YZHHnkEgYGBSElJwTfffIO+ffvi7Nmz8PHxAQBotVoMHz4cO3fuxNixY/Hyyy8jLy8P27dvx+nTp9GyZUvDMcaNG4ehQ4caHXfWrFlVxvPee+9BJpPh9ddfR2pqKhYvXoyBAwciKioKNjY2AIBdu3ZhyJAhCAsLw7x58yCXy7F8+XL0798f+/btQ/fu3Svt19fXFwsXLgQA5Ofn44UXXqjy2HPmzMGjjz6K5557DmlpaViyZAn69OmDEydOwNnZudJrJk+ejHvvvRcAsGHDBmzcuNFo/ZQpUwzts1566SXExcXhiy++wIkTJ7B//34olcoqz0NtZGdnG97bzXQ6HR588EH8+++/mDx5Mtq3b4/o6Gh8+umnOH/+PH777bfb7nfdunUoLCzECy+8ADc3Nxw+fBhLlizB9evXsW7dOqNttVotHnjgAfTs2ROLFi3C1q1bMW/ePJSXl2PBggUAgO3bt2PcuHEYMGAAPvzwQwBATEwM9u/fj5dffhkAkJKSgl69eqGwsBAvvfQS3NzcsHLlSjz44IP49ddfMWrUqLs6Vx4eHvjqq6/wwgsvYNSoUYZieH3R/JkzZ9C7d280a9YMb7zxBuzs7PDLL7/goYcewvr162t9/H379uGPP/5AREQEAGDhwoUYPnw4XnvtNXz55Zd48cUXkZWVhUWLFuGZZ57Brl27DK+t6fn/66+/8NhjjyEkJAQLFy5EVlYWnn322Wp/za5ZswZ5eXmYMmUKZDIZFi1ahNGjR+Py5cvVfh5//PFHPPfcc+jevTsmT54MAEbf8Zp67rnnsGrVKjz++OPo1asXdu3ahWHDhlXaLiUlBT179jQkkx4eHtiyZQueffZZ5Obm4pVXXrnjsfTXt7y8HJGRkVi2bBmKiorw448/3jHGlStX4uGHH8Z//vMfHDp0CAsXLkRMTIzh+52RkQGg4odKWFgYPvjgA6SlpeHzzz/Hv//+ixMnTsDd3R0PP/wwIiIisHr1anTu3NnoOKtXr0a/fv1qXeqwYsUK2NvbY8aMGbC3t8euXbswd+5c5Obm4qOPPqr2de+//z7eeust9OnTBxEREYZ70aFDh3Do0CGo1epaxVGdut7zysvL8dZbb9XqWPfeey8mT54MnU6H06dPY/HixUhMTMS+ffvqHH9GRgaKi4vxwgsvoH///nj++edx6dIlLF26tNK50r/Pbt26YeHChUhJScFnn32G/fv3V/qbUZN75K3KysowZswYxMfHY//+/fD29jasq7e/LcKCLF++XAAQO3bsEGlpaeLatWti7dq1ws3NTdjY2Ijr168LIYQoLi4WWq3W6LVxcXFCrVaLBQsWGJZ9//33AoD45JNPKh1Lp9MZXgdAfPTRR5W26dChg+jbt6/h+e7duwUA0axZM5Gbm2tY/ssvvwgA4rPPPjPsu3Xr1mLw4MGG4wghRGFhoQgMDBT3339/pWP16tVLBAcHG56npaUJAGLevHmGZVeuXBEKhUK89957Rq+Njo4WVlZWlZZfuHBBABArV640LJs3b564+WO1b98+AUCsXr3a6LVbt26ttNzf318MGzasUuwRERHi1o/qrbG/9tprwtPTU4SFhRmd0x9//FHI5XKxb98+o9d//fXXAoDYv39/pePdrLCwsNKyhQsXCplMJq5evWpYNnHiRAFATJs2zbBMp9OJYcOGCZVKJdLS0oQQQrz88svC0d
FRlJeXV3vMV155RQAwijkvL08EBgaKgIAAw2dT/9lavny5Ybu+ffsavf+b4/P39zc8r+r66w0YMECEhISI4uJio/f
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkkAAAHHCAYAAACr0swBAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACBP0lEQVR4nO3deVxUVf8H8M+dAYZ932UVEFBwwyWt1NTcl9QWS9PKtEXbbHuszKynzOoprWzx95Tak2WpqS3ua2a47woICCLINiD7PnN+fyATI8MqcAf4vF+veencbb73cu+d75xz7jmSEEKAiIiIiPQo5A6AiIiIyBgxSSIiIiIygEkSERERkQFMkoiIiIgMYJJEREREZACTJCIiIiIDmCQRERERGcAkiYiIiMgAE7kDICIiorarpKQE2dnZMDExgaurq9zhNCuWJBE1wiOPPAJra+tW/czExERIkoTVq1e36ud2FG+99RYkSWq1z5MkCW+99VarfR61XZ9//jlycnJ075ctW4bCwkL5Aqpm9+7dmDBhAuzt7WFhYYFOnTrhueeekzusZtcuk6TVq1dDkiTdy9zcHF26dMG8efOQnp4ud3hEraaoqAhvvfUW9u/fL3cobcp7772HzZs3yx0GdXC//fYb3nrrLVy9ehVr167FwoULYWFhIXdY+OKLLzBy5Ejk5uZi+fLl2LVrF3bt2oW3335b7tCaXbuubnv77bfh7++PkpIS/PXXX/jyyy+xdetWnD9/HpaWlnKHR9TiioqKsHjxYgDAkCFD5A3GSL3xxhv417/+pTftvffew7333ot77rlHnqCIALz22muYMGECli9fDoVCgf/85z9QKOQt24iNjcX8+fMxZ84cfPHFF61aCiuHdp0kjR49Gn369AEAPP7443BycsLHH3+MLVu24MEHH5Q5OiIyBiYmJjAxade3QmqjBg8ejCtXriAqKgre3t7w8vKSOyR8+umncHd3x6efftruEySgnVa31Wbo0KEAgISEBABAdnY2XnrpJYSHh8Pa2hq2trYYPXo0zpw5U2PdkpISvPXWW+jSpQvMzc3h4eGByZMnIz4+HsA/7UZqe1X/Fb9//35IkoSffvoJr732Gtzd3WFlZYUJEybg6tWrNT77yJEjGDVqFOzs7GBpaYnBgwfj0KFDBvdxyJAhBj/fUBuI77//HhEREbCwsICjoyOmTp1q8PPr2rfqtFotli1bhm7dusHc3Bxubm544okncP36db3l/Pz8MG7cuBqfM2/evBrbNBT7hx9+WOOYAkBpaSkWLVqEwMBAqFQqeHt745VXXkFpaanBY1XdwYMHcd9998HHx0e37gsvvIDi4mKDy1++fBkjR46ElZUVPD098fbbb0MIobfMunXrEBERARsbG9ja2iI8PBzLly+vsZ377rsPjo6OsLS0xG233YY//vij3niHDBlisGTokUcegZ+fH4DKv5uLiwsAYPHixQbPhejoaNx7771wdHSEubk5+vTpg19//bXez7+ZJEmYN28e1q9fj65du8LCwgIDBgzAuXPnAABff/01AgMDYW5ujiFDhiAxMVFv/cYc/6rPMDc3R1hYGDZt2qS331X7LkkSPvroI6xcuRIBAQFQqVTo27cvjh07pre9m9skSZKEwsJCrFmzRnfMHnnkkRrHt65tAJXn4wsvvAAXFxfY2NhgwoQJSE5ONnj8UlJS8Nhjj8HNzQ0qlQrdunXDt99+W9vh1lP9elQqlejUqRPmzJmj15alrnWrnw8VFRUYM2YMHB0dcfHiRb3p77zzju44+vn54bXXXqtxbTX02q7rXln92m7svXL9+vW6e5qzszOmT5+OlJSUeo9b9Vf1c7PqvK5LVYwbNmyoMc/a2lp37lRpyDVftc39+/fD3t4eAwYMgJeXF8aOHdugNm1V61e9VCoVunTpgiVLlujdp6rOW7VaXeu2/Pz89Pbh8OHDiIiIwNNPP607X8PCwvB///d/Nd
YtLCzEiy++CG9vb6hUKgQHB+Ojjz6qca+sOs5r165FcHAwzM3NERERgT///FNvOUPX2b59+6BSqfDkk0/qTb+Va6q6DvXzqSqhcXJyAlB5sm7evBn33Xcf/P39kZ6ejq+//hqDBw/GxYsX4enpCQDQaDQYN24c9uzZg6lTp+K5555Dfn4+du3ahfPnzyMgIED3GQ8++CDGjBmj97kLFiwwGM+7774LSZLw6quvIiMjA8uWLcPw4cNx+vRpXb3z3r17MXr0aERERGDRokVQKBRYtWoVhg4dioMHD6Jfv341tuvl5YUlS5YAAAoKCvDUU08Z/OyFCxfi/vvvx+OPP47MzEx89tlnGDRoEE6dOgV7e/sa68yZMwd33nknAOCXX37Bpk2b9OY/8cQTWL16NR599FE8++yzSEhIwOeff45Tp07h0KFDMDU1NXgcGiMnJ0e3b9VptVpMmDABf/31F+bMmYPQ0FCcO3cOn3zyCS5dulRv+5L169ejqKgITz31FJycnHD06FF89tlnSE5Oxvr16/WW1Wg0GDVqFG677TZ88MEH2L59OxYtWoSKigpdnfyuXbvw4IMPYtiwYVi6dCkAICoqCocOHdI1bkxPT8fAgQNRVFSEZ599Fk5OTlizZg0mTJiADRs2YNKkSbd0rFxcXPDll1/iqaeewqRJkzB58mQAQPfu3QEAFy5cwO23345OnTrhX//6F6ysrPDzzz/jnnvuwcaNGxv9+QcPHsSvv/6KuXPnAgCWLFmCcePG4ZVXXsEXX3yBp59+GtevX8cHH3yAxx57DHv37tWt29Dj/8cff+CBBx5AeHg4lixZguvXr2PWrFno1KmTwZh++OEH5Ofn44knnoAkSfjggw8wefJkXL58udbz8X//+x8ef/xx9OvXD3PmzAEAvWu8oR5//HF8//33eOihhzBw4EDs3bsXY8eOrbFceno6brvtNt0XhYuLC7Zt24ZZs2YhLy8Pzz//fL2fVfX3raioQGRkJFauXIni4mL873//a3TM+/fvx65du9C1a1e96WvWrMG9996LF198EUeOHMGSJUsQFRVV4z7QENXjOnjwIFauXIlPPvkEzs7OAAA3Nze95Rtyr6y69/Tt2xdLlixBeno6li9fjkOHDtV6T6t+XVTF0ZJu5Zr/888/sXXr1kZ93muvvYbQ0FAUFxfrEk1XV1fMmjWryfuQlZWF48ePw8TEBHPnzkVAQAA2b96MOXPmICsrS1d1LYTAhAkTsG/fPsyaNQs9e/bEjh078PLLLyMlJQWffPKJ3nYPHDiAn376Cc8++yxUKhW++OILjBo1CkePHkVYWJjBWM6cOYN77rkHY8aMwYoVK3TTm+Oa0hHt0KpVqwQAsXv3bpGZmSmuXr0q1q1bJ5ycnISFhYVITk4WQghRUlIiNBqN3roJCQlCpVKJt99+Wzft22+/FQDExx9/XOOztFqtbj0A4sMPP6yxTLdu3cTgwYN17/ft2ycAiE6dOom8vDzd9J9//lkAEMuXL9dtOygoSIwcOVL3OUIIUVRUJPz9/cXdd99d47MGDhwowsLCdO8zMzMFALFo0SLdtMTERKFUKsW7776rt+65c+eEiYlJjemxsbECgFizZo1u2qJFi0T10+fgwYMCgFi7dq3eutu3b68x3dfXV4wdO7ZG7HPnzhU3n5I3x/7KK68IV1dXERERoXdM//e//wmFQiEOHjyot/5XX30lAIhDhw7V+LzqioqKakxbsmSJkCRJXLlyRTdt5syZAoB45plndNO0Wq0YO3asMDMzE5mZmUIIIZ577jlha2srKioqav3M559/XgDQizk/P1/4+/sLPz8/3blZdW6tWrVKt9zgwYP19r96fL6+vrr3hv7+VYYNGybCw8NFSUmJ3r4MHDhQBAUF1Rq3IQCESqUSCQkJumlff/21ACDc3d31zvMFCxYIAHrLNvT4h4eHCy8vL5Gfn6+btn//fgFAb7+rjpmTk5PIzs7WTd+yZYsAIH
777TfdtJvPZSGEsLKyEjNnzqwR083Ht7ZtnD59WgAQTz/9tN5yDz30UI2/x6xZs4SHh4dQq9V6y06dOlXY2dkZPDb
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAj4AAAHHCAYAAAC/R1LgAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACCp0lEQVR4nO3dd3xT1f8/8FeStunee5e2lEILhSLIRkCGgLJEBGWIgAounKjI8PMR11dRRJGfCjhRRAQHe8koq2wo0JYO6E73Hsn5/VGaD6EttKXtTZPX8/HIA3Jz77nvc5ObvnPuOefKhBACREREREZALnUARERERK2FiQ8REREZDSY+REREZDSY+BAREZHRYOJDRERERoOJDxERERkNJj5ERERkNJj4EBERkdEwkToAIiIikkZFRQVycnKg0Wjg6ekpdTitgi0+RK1s+vTpsLa2btV9JiYmQiaTYe3ata26X2OxePFiyGSyVtufTCbD4sWLW21/1Hg//PADEhMTtc/Xrl2LlJQU6QK6yYkTJzB58mQ4OztDqVTCw8MD48ePlzqsVsPEpx5r166FTCbTPszNzdG+fXvMmzcPGRkZUodH1GpKSkqwePFi7Nu3T+pQ2pR3330Xf/zxh9RhkEQOHDiAV199FYmJidi+fTvmzp0LuVz6P7mbN29G3759cfHiRfz3v//Fzp07sXPnTnz11VdSh9ZqeKnrDpYuXYqAgACUlZXh4MGD+PLLL/HPP//g/PnzsLS0lDo8ohZXUlKCJUuWAAAGDhwobTB66q233sLrr7+us+zdd9/FhAkTMGbMGGmCIkm9+OKLGDhwIAICAgAA8+fPh4eHh6Qx5eTk4Mknn8SwYcOwYcMGmJmZSRqPVJj43MGIESPQvXt3AMCTTz4JJycnfPzxx9i8eTMeffRRiaMjIn1gYmICExN+ndL/dOjQAfHx8Th//jycnZ0RGBgodUhYs2YNysrKsHbtWqNNegBe6mq0QYMGAQASEhIAVGfQL7/8MsLDw2FtbQ1bW1uMGDECZ86cqbVtWVkZFi9ejPbt28Pc3BweHh4YN24c4uPjAfyvH0Z9j5t/be/btw8ymQy//PIL3njjDbi7u8PKygoPPvggrl27VmvfR48exfDhw2FnZwdLS0sMGDAAhw4dqrOOAwcOrHP/dfUp+OGHHxAZGQkLCws4Ojpi0qRJde7/dnW7mUajwfLly9GpUyeYm5vDzc0Nc+bMQW5urs56/v7+GDVqVK39zJs3r1aZdcX+4Ycf1jqmAFBeXo5FixYhKCgISqUSPj4+ePXVV1FeXl7nsbrZgQMH8PDDD8PX11e77YsvvojS0tI617969SqGDRsGKysreHp6YunSpRBC6Kyzfv16REZGwsbGBra2tggPD8enn35aq5yHH34Yjo6OsLS0xL333ou///77jvEOHDiwzhac6dOnw9/fH0D1++bi4gIAWLJkSZ2fhUuXLmHChAlwdHSEubk5unfvji1bttxx/7eSyWSYN28eNmzYgI4dO8LCwgK9evXCuXPnAABfffUVgoKCYG5ujoEDB+r0nwAad/xr9mFubo6wsDBs2rRJp941dZfJZPjoo4+wevVqBAYGQqlU4p577sHx48d1yru1j49MJkNxcTHWrVunPWbTp0+vdXxvVwZQ/Xl88cUX4eLiAhsbGzz44IO4fv16nccvJSUFTzzxBNzc3KBUKtGpUyd8++239R1uHTefjwqFAl5eXpg9ezby8vIavN2dvrMac2798MMP6NGjBywtLeHg4ID+/ftjx44dAKrP/dvt8+ZjW1xcjJdeegk+Pj5QKpUICQnBRx99VOs8a2j9MzMzMXPmTLi5ucHc3BxdunTBunXrdNa5uT+dlZUVevbsicDAQMydO1fnc1CfW78rTU1N4e/vj1deeQUVFRXa9Wq6Y5w4caLesm49x48cOYKIiAi8++672mMSHByM9957DxqNRmfbqqoqvPPOO9rPvb+/P9
54441a71fNd/GOHTsQEREBc3NzdOzYEb///rvOejXx3nzeXrhwAQ4ODhg1ahSqqqq0y/Py8vDCCy9oYwwKCsL7779fK8am4k+URqpJUpycnABU/9H5448/8PDDDyMgIAAZGRn46quvMGDAAFy8eFHbS16tVmPUqFHYvXs3Jk2ahOeffx6FhYXYuXMnzp8/r/Nr4NFHH8UDDzygs98FCxbUGc9///tfyGQyvPbaa8jMzMTy5csxZMgQnD59GhYWFgCAPXv2YMSIEYiMjMSiRYsgl8uxZs0aDBo0CAcOHECPHj1qlevt7Y1ly5YBAIqKivD000/Xue+FCxdi4sSJePLJJ5GVlYUVK1agf//+OHXqFOzt7WttM3v2bPTr1w8A8Pvvv2PTpk06r8+ZMwdr167FjBkz8NxzzyEhIQGff/45Tp06hUOHDsHU1LTO49AYeXl52rrdTKPR4MEHH8TBgwcxe/ZshIaG4ty5c/jkk09w5cqVO/bX2LBhA0pKSvD000/DyckJx44dw4oVK3D9+nVs2LBBZ121Wo3hw4fj3nvvxQcffIBt27Zh0aJFqKqqwtKlSwEAO3fuxKOPPorBgwfj/fffBwDExMTg0KFDeP755wEAGRkZ6N27N0pKSvDcc8/ByckJ69atw4MPPojffvsNY8eOvatj5eLigi+//BJPP/00xo4di3HjxgEAOnfuDKD6i6tPnz7w8vLC66+/DisrK/z6668YM2YMNm7c2Oj9HzhwAFu2bMHcuXMBAMuWLcOoUaPw6quv4osvvsAzzzyD3NxcfPDBB3jiiSewZ88e7bYNPf5///03HnnkEYSHh2PZsmXIzc3FzJkz4eXlVWdMP/30EwoLCzFnzhzIZDJ88MEHGDduHK5evVrv5/H777/Hk08+iR49emD27NkA0KRf/E8++SR++OEHTJ48Gb1798aePXswcuTIWutlZGTg3nvv1SaPLi4u2Lp1K2bOnImCggK88MILd9xXzftbVVWFqKgorF69GqWlpfj+++/r3ebm1w4cOIDVq1fjk08+gbOzMwDAzc0NQOPOrSVLlmDx4sXo3bs3li5dCjMzMxw9ehR79uzB0KFDsXz5chQVFQGoPh/effddvPHGGwgNDQUA7cABIQQefPBB7N27FzNnzkRERAS2b9+OV155BSkpKfjkk08aVf/S0lIMHDgQcXFxmDdvHgICArBhwwZMnz4deXl52nOyLnFxcfh//+//3fE9uFnNd2V5eTm2b9+Ojz76CObm5njnnXcaVc7NsrOzcfDgQRw8eBBPPPEEIiMjsXv3bixYsACJiYlYtWqVdt0nn3wS69atw4QJE/DSSy/h6NGjWLZsGWJiYmp9b8fGxuKRRx7BU089hWnTpmHNmjV4+OGHsW3bNtx///11xnLt2jUMHz4cHTp0wK+//qptMS0pKcGAAQOQkpKCOXPmwNfXF4cPH8aCBQuQlpaG5cuXN7n+WoLqtGbNGgFA7Nq1S2RlZYlr166J9evXCycnJ2FhYSGuX78uhBCirKxMqNVqnW0TEhKEUqkUS5cu1S779ttvBQDx8ccf19qXRqPRbgdAfPjhh7XW6dSpkxgwYID2+d69ewUA4eXlJQoKCrTLf/31VwFAfPrpp9qyg4ODxbBhw7T7EUKIkpISERAQIO6///5a++rdu7cICwvTPs/KyhIAxKJFi7TLEhMThUKhEP/97391tj137pwwMTGptTw2NlYAEOvWrdMuW7Rokbj5I3jgwAEBQPz44486227btq3Wcj8/PzFy5Mhasc+dO1fc+rG+NfZXX31VuLq6isjISJ1j+v333wu5XC4OHDigs/2qVasEAHHo0KFa+7tZSUlJrWXLli0TMplMJCUlaZdNmzZNABDPPvusdplGoxEjR44UZmZmIisrSwghxPPPPy9sbW1FVVVVvft84YUXBACdmAsLC0VAQIDw9/fXfjZrPltr1qzRrjdgwACd+t8cn5+fn/Z5Xe9/jcGDB4vw8H
BRVlamU5fevXuL4ODgeuOuCwChVCpFQkKCdtlXX30lAAh3d3edz/mCBQsEAJ11G3r8w8PDhbe3tygsLNQu27dvnwC
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"# Wrap the target values of each split in a one-column frame.\n",
|
|||
|
"train_data = pd.DataFrame({'absolute_magnitude': y_train})\n",
|
|||
|
"val_data = pd.DataFrame({'absolute_magnitude': y_val})\n",
|
|||
|
"test_data = pd.DataFrame({'absolute_magnitude': y_test})\n",
|
|||
|
"\n",
|
|||
|
"# One histogram with a KDE overlay per split, each shown as its own figure.\n",
|
|||
|
"for split_frame, figure_title in (\n",
|
|||
|
"    (train_data, 'Распределение absolute_magnitude в обучающей выборке'),\n",
|
|||
|
"    (val_data, 'Распределение absolute_magnitude в контрольной выборке'),\n",
|
|||
|
"    (test_data, 'Распределение absolute_magnitude в тестовой выборке'),\n",
|
|||
|
"):\n",
|
|||
|
"    sns.histplot(split_frame['absolute_magnitude'], kde=True)\n",
|
|||
|
"    plt.title(figure_title)\n",
|
|||
|
"    plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Конструирование признаков\n",
|
|||
|
"\n",
|
|||
|
"**Унитарное кодирование**\n",
|
|||
|
"\n",
|
|||
|
"Унитарное кодирование категориальных признаков (one-hot encoding). Преобразуем категориальные признаки в бинарные векторы.\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 6,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" id name est_diameter_min est_diameter_max \\\n",
|
|||
|
"0 2162635 162635 (2000 SS164) 1.198271 2.679415 \n",
|
|||
|
"1 2277475 277475 (2005 WK4) 0.265800 0.594347 \n",
|
|||
|
"2 2512244 512244 (2015 YE18) 0.722030 1.614507 \n",
|
|||
|
"3 3596030 (2012 BV13) 0.096506 0.215794 \n",
|
|||
|
"4 3667127 (2014 GE35) 0.255009 0.570217 \n",
|
|||
|
"\n",
|
|||
|
" relative_velocity miss_distance absolute_magnitude hazardous \\\n",
|
|||
|
"0 13569.249224 5.483974e+07 16.73 False \n",
|
|||
|
"1 73588.726663 6.143813e+07 20.00 True \n",
|
|||
|
"2 114258.692129 4.979872e+07 17.83 False \n",
|
|||
|
"3 24764.303138 2.543497e+07 22.20 False \n",
|
|||
|
"4 42737.733765 4.627557e+07 20.09 True \n",
|
|||
|
"\n",
|
|||
|
" orbiting_body_Earth sentry_object_False \n",
|
|||
|
"0 True True \n",
|
|||
|
"1 True True \n",
|
|||
|
"2 True True \n",
|
|||
|
"3 True True \n",
|
|||
|
"4 True True \n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|||
|
"\n",
|
|||
|
"# NOTE(review): OneHotEncoder is imported but not used in this cell; the encoding\n",
|
|||
|
"# below is done with pd.get_dummies.\n",
|
|||
|
"# Reload the table from disk; this rebinds df to the freshly read data.\n",
|
|||
|
"df = pd.read_csv(\"..//..//static//csv//neo.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Columns to expand into one indicator column per distinct value.\n",
|
|||
|
"categorical_columns = ['orbiting_body', 'sentry_object']\n",
|
|||
|
"\n",
|
|||
|
"df_encoded = pd.get_dummies(data=df, columns=categorical_columns)\n",
|
|||
|
"\n",
|
|||
|
"print(df_encoded.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"**Дискретизация числовых признаков**\n",
|
|||
|
"\n",
|
|||
|
"Процесс преобразования непрерывных числовых значений в дискретные категории или интервалы (бины)."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" miss_distance miss_distance_binned\n",
|
|||
|
"0 5.483974e+07 (44881889.084, 59840270.268]\n",
|
|||
|
"1 6.143813e+07 (59840270.268, 74798651.452]\n",
|
|||
|
"2 4.979872e+07 (44881889.084, 59840270.268]\n",
|
|||
|
"3 2.543497e+07 (14965126.716, 29923507.9]\n",
|
|||
|
"4 4.627557e+07 (44881889.084, 59840270.268]\n",
|
|||
|
" absolute_magnitude absolute_magnitude_binned\n",
|
|||
|
"0 16.73 (9.229000000000001, 21.34]\n",
|
|||
|
"1 20.00 (9.229000000000001, 21.34]\n",
|
|||
|
"2 17.83 (9.229000000000001, 21.34]\n",
|
|||
|
"3 22.20 (21.34, 23.7]\n",
|
|||
|
"4 20.09 (9.229000000000001, 21.34]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"# Five equal-width intervals over the observed miss_distance range.\n",
|
|||
|
"distance_bins = pd.cut(df['miss_distance'], bins=5)\n",
|
|||
|
"df['miss_distance_binned'] = distance_bins\n",
|
|||
|
"\n",
|
|||
|
"# Four quantile-based intervals of absolute_magnitude.\n",
|
|||
|
"magnitude_bins = pd.qcut(df['absolute_magnitude'], q=4)\n",
|
|||
|
"df['absolute_magnitude_binned'] = magnitude_bins\n",
|
|||
|
"\n",
|
|||
|
"# Print each source column next to its bin label.\n",
|
|||
|
"for value_column in ('miss_distance', 'absolute_magnitude'):\n",
|
|||
|
"    print(df[[value_column, f'{value_column}_binned']].head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"**Ручной синтез**\n",
|
|||
|
"\n",
|
|||
|
"Создание новых признаков на основе экспертных знаний и логики предметной области. В нашем случае можно задействовать расстояние от объекта до Земли и скорость его движения, синтезировав новый бинарный признак `high_risk` («скорость в сравнении с расстоянием»). Этот признак отражает, что объект может быть более опасным, если его скорость велика, а расстояние до Земли мало."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Build the binary 'high_risk' feature: 1 when an object passes closer than\n",
|
|||
|
"# threshold_distance AND moves faster than threshold_velocity, otherwise 0.\n",
|
|||
|
"# NOTE(review): the cutoffs below (lower quartile of miss_distance, upper quartile\n",
|
|||
|
"# of relative_velocity) are a placeholder so the cell runs on a fresh kernel;\n",
|
|||
|
"# confirm the intended values.\n",
|
|||
|
"threshold_distance = df['miss_distance'].quantile(0.25)\n",
|
|||
|
"threshold_velocity = df['relative_velocity'].quantile(0.75)\n",
|
|||
|
"\n",
|
|||
|
"df['high_risk'] = (\n",
|
|||
|
"    (df['miss_distance'] < threshold_distance)\n",
|
|||
|
"    & (df['relative_velocity'] > threshold_velocity)\n",
|
|||
|
").astype(int)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"**Масштабирование признаков**\n",
|
|||
|
"\n",
|
|||
|
"Процесс преобразования числовых признаков таким образом, чтобы они имели одинаковый масштаб. Это важно для многих алгоритмов машинного обучения, которые чувствительны к масштабу признаков, таких как линейная регрессия, метод опорных векторов (SVM) и нейронные сети."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 6,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"ename": "NameError",
|
|||
|
"evalue": "name 'df_encoded' is not defined",
|
|||
|
"output_type": "error",
|
|||
|
"traceback": [
|
|||
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
|||
|
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
|
|||
|
"Cell \u001b[1;32mIn[6], line 7\u001b[0m\n\u001b[0;32m 4\u001b[0m numerical_features \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmiss_distance\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mabsolute_magnitude\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[0;32m 6\u001b[0m scaler \u001b[38;5;241m=\u001b[39m StandardScaler()\n\u001b[1;32m----> 7\u001b[0m df_encoded[numerical_features] \u001b[38;5;241m=\u001b[39m scaler\u001b[38;5;241m.\u001b[39mfit_transform(\u001b[43mdf_encoded\u001b[49m[numerical_features])\n\u001b[0;32m 8\u001b[0m df_encoded[numerical_features] \u001b[38;5;241m=\u001b[39m scaler\u001b[38;5;241m.\u001b[39mtransform(df_encoded[numerical_features])\n\u001b[0;32m 9\u001b[0m df_encoded[numerical_features] \u001b[38;5;241m=\u001b[39m scaler\u001b[38;5;241m.\u001b[39mtransform(df_encoded[numerical_features])\n",
|
|||
|
"\u001b[1;31mNameError\u001b[0m: name 'df_encoded' is not defined"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
|
|||
|
"\n",
|
|||
|
"# Standardize the numeric features with StandardScaler.\n",
|
|||
|
"numerical_features = ['miss_distance', 'absolute_magnitude']\n",
|
|||
|
"\n",
|
|||
|
"scaler = StandardScaler()\n",
|
|||
|
"# Fit and transform exactly once: calling scaler.transform() again on the\n",
|
|||
|
"# already-scaled columns would re-apply the fitted mean/std and distort the values.\n",
|
|||
|
"df_encoded[numerical_features] = scaler.fit_transform(df_encoded[numerical_features])"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 9,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"e:\\Aim\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"e:\\Aim\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" est_diameter_min est_diameter_max relative_velocity miss_distance \\\n",
|
|||
|
"id \n",
|
|||
|
"1 1.198271 2.679415 13569.249224 5.483974e+07 \n",
|
|||
|
"2 0.265800 0.594347 73588.726663 6.143813e+07 \n",
|
|||
|
"3 0.722030 1.614507 114258.692129 4.979872e+07 \n",
|
|||
|
"4 0.096506 0.215794 24764.303138 2.543497e+07 \n",
|
|||
|
"5 0.255009 0.570217 42737.733765 4.627557e+07 \n",
|
|||
|
"\n",
|
|||
|
" orbiting_body sentry_object absolute_magnitude hazardous \n",
|
|||
|
"id \n",
|
|||
|
"1 Earth False 16.73 False \n",
|
|||
|
"2 Earth False 20.00 True \n",
|
|||
|
"3 Earth False 17.83 False \n",
|
|||
|
"4 Earth False 22.20 False \n",
|
|||
|
"5 Earth False 20.09 True \n",
|
|||
|
" est_diameter_min est_diameter_max relative_velocity miss_distance \\\n",
|
|||
|
"id \n",
|
|||
|
"17465 0.265800 0.594347 6639.199305 7.248720e+07 \n",
|
|||
|
"10057 0.023150 0.051765 66065.475247 2.182677e+07 \n",
|
|||
|
"6905 0.148784 0.332690 35092.567329 6.261058e+07 \n",
|
|||
|
"40989 0.007321 0.016370 24301.494107 2.765938e+06 \n",
|
|||
|
"23499 0.044112 0.098637 33502.608133 7.025798e+07 \n",
|
|||
|
"\n",
|
|||
|
" orbiting_body sentry_object absolute_magnitude hazardous \n",
|
|||
|
"id \n",
|
|||
|
"17465 Earth False 20.00 False \n",
|
|||
|
"10057 Earth False 25.30 False \n",
|
|||
|
"6905 Earth False 21.26 False \n",
|
|||
|
"40989 Earth False 27.80 False \n",
|
|||
|
"23499 Earth False 23.90 False \n",
|
|||
|
" est_diameter_min est_diameter_max relative_velocity miss_distance \\\n",
|
|||
|
"id \n",
|
|||
|
"66148 0.020163 0.045086 24899.946486 7.427192e+06 \n",
|
|||
|
"68694 0.175612 0.392681 67322.863166 3.526971e+07 \n",
|
|||
|
"17013 0.031809 0.071128 20216.336390 5.832689e+07 \n",
|
|||
|
"69199 0.007321 0.016370 40616.528788 2.591562e+07 \n",
|
|||
|
"45632 0.199781 0.446725 86281.198262 6.763452e+07 \n",
|
|||
|
"\n",
|
|||
|
" orbiting_body sentry_object absolute_magnitude hazardous \n",
|
|||
|
"id \n",
|
|||
|
"66148 Earth False 25.60 False \n",
|
|||
|
"68694 Earth False 20.90 True \n",
|
|||
|
"17013 Earth False 24.61 False \n",
|
|||
|
"69199 Earth False 27.80 False \n",
|
|||
|
"45632 Earth False 20.62 True \n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import featuretools as ft\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"# Drop exact duplicate rows BEFORE assigning the surrogate key: once every row\n",
|
|||
|
"# carries a unique id, drop_duplicates() can no longer match anything.\n",
|
|||
|
"df = pd.read_csv(\"..//..//static//csv//neo.csv\").drop_duplicates()\n",
|
|||
|
"\n",
|
|||
|
"# Overwrite 'id' with a sequential key 1..len(df) used as the EntitySet index.\n",
|
|||
|
"# NOTE(review): presumably the source id repeats across rows - confirm.\n",
|
|||
|
"df['id'] = range(1, len(df) + 1)\n",
|
|||
|
"\n",
|
|||
|
"es = ft.EntitySet(id='objects_data')\n",
|
|||
|
"\n",
|
|||
|
"es = es.add_dataframe(\n",
|
|||
|
"    dataframe_name='objects',\n",
|
|||
|
"    dataframe=df,\n",
|
|||
|
"    index='id'\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Deep feature synthesis over the single 'objects' dataframe, depth 1.\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='objects', max_depth=1)\n",
|
|||
|
"\n",
|
|||
|
"print(feature_matrix.head())\n",
|
|||
|
"\n",
|
|||
|
"# 70% train, then the remaining 30% is halved into validation and test.\n",
|
|||
|
"train_data, test_data = train_test_split(df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Recompute the same feature definitions for the validation / test row ids only.\n",
|
|||
|
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_data['id'])\n",
|
|||
|
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_data['id'])\n",
|
|||
|
"\n",
|
|||
|
"print(val_feature_matrix.head())\n",
|
|||
|
"print(test_feature_matrix.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Оценка качества каждого набора признаков\n",
|
|||
|
"\n",
|
|||
|
"Представим основные оценки качества наборов признаков: \n",
|
|||
|
"\n",
|
|||
|
"* Предсказательная способность. Метрики: RMSE, MAE, R²\n",
|
|||
|
"\n",
|
|||
|
" Методы: Обучение модели на обучающей выборке и оценка на контрольной и тестовой выборках.\n",
|
|||
|
"\n",
|
|||
|
"* Скорость вычисления \n",
|
|||
|
"\n",
|
|||
|
" Методы: Измерение времени выполнения генерации признаков и обучения модели.\n",
|
|||
|
"\n",
|
|||
|
"* Надежность \n",
|
|||
|
"\n",
|
|||
|
" Методы: Кросс-валидация, анализ чувствительности модели к изменениям в данных.\n",
|
|||
|
"\n",
|
|||
|
"* Корреляция \n",
|
|||
|
"\n",
|
|||
|
" Методы: Анализ корреляционной матрицы признаков, удаление мультиколлинеарных признаков.\n",
|
|||
|
"\n",
|
|||
|
"* Цельность \n",
|
|||
|
"\n",
|
|||
|
" Методы: Проверка логической связи между признаками и целевой переменной, интерпретация результатов модели."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 10,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Время обучения модели: 0.06 секунд\n",
|
|||
|
"Среднеквадратичная ошибка: 5.08\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import time\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from sklearn.linear_model import LinearRegression\n",
|
|||
|
"from sklearn.metrics import mean_squared_error\n",
|
|||
|
"\n",
|
|||
|
"# Target is absolute_magnitude; every other feature-matrix column is a predictor.\n",
|
|||
|
"y = feature_matrix['absolute_magnitude']\n",
|
|||
|
"X = pd.get_dummies(\n",
|
|||
|
"    feature_matrix.drop('absolute_magnitude', axis=1),\n",
|
|||
|
"    drop_first=True,\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Fill missing values with the per-column median.\n",
|
|||
|
"X = X.fillna(X.median())\n",
|
|||
|
"\n",
|
|||
|
"# Hold out 20% of the rows for validation.\n",
|
|||
|
"X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Time only the fit call.\n",
|
|||
|
"model = LinearRegression()\n",
|
|||
|
"start_time = time.time()\n",
|
|||
|
"model.fit(X_train, y_train)\n",
|
|||
|
"train_time = time.time() - start_time\n",
|
|||
|
"\n",
|
|||
|
"# Mean squared error on the validation split.\n",
|
|||
|
"predictions = model.predict(X_val)\n",
|
|||
|
"mse = mean_squared_error(y_val, predictions)\n",
|
|||
|
"\n",
|
|||
|
"print(f'Время обучения модели: {train_time:.2f} секунд')\n",
|
|||
|
"print(f'Среднеквадратичная ошибка: {mse:.2f}')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 11,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"e:\\Aim\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"\n",
|
|||
|
"RMSE: 0.007747870644321186\n",
|
|||
|
"R²: 0.9999928256622078\n",
|
|||
|
"MAE: 0.00013519980189125583 \n",
|
|||
|
"\n",
|
|||
|
"Кросс-валидация RMSE: 0.010153168491376482 \n",
|
|||
|
"\n",
|
|||
|
"Train RMSE: 0.004358914935336195\n",
|
|||
|
"Train R²: 0.999997732046293\n",
|
|||
|
"Train MAE: 4.508435629289199e-05\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"e:\\Aim\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.ensemble import RandomForestRegressor\n",
|
|||
|
"from sklearn.metrics import r2_score, mean_absolute_error\n",
|
|||
|
"from sklearn.model_selection import cross_val_score\n",
|
|||
|
"\n",
|
|||
|
"# Build X / y from feature_matrix inside this cell instead of reusing the X / y\n",
|
|||
|
"# left over from the previous cell, so the cell does not depend on hidden state.\n",
|
|||
|
"# The earlier per-split X_train / X_val / X_test were overwritten by the split\n",
|
|||
|
"# below before being used, so they are no longer built here.\n",
|
|||
|
"features_clean = feature_matrix.dropna()\n",
|
|||
|
"X = pd.get_dummies(features_clean.drop('absolute_magnitude', axis=1), drop_first=True)\n",
|
|||
|
"y = features_clean['absolute_magnitude']\n",
|
|||
|
"\n",
|
|||
|
"# NOTE(review): the saved run reports R² of about 1.0; est_diameter_min/max may be\n",
|
|||
|
"# derived from absolute_magnitude (target leakage) - confirm before trusting it.\n",
|
|||
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"model = RandomForestRegressor(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"model.fit(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"y_pred = model.predict(X_test)\n",
|
|||
|
"\n",
|
|||
|
"# RMSE is taken as sqrt(MSE): the squared=False argument is deprecated since\n",
|
|||
|
"# scikit-learn 1.4 and scheduled for removal in 1.6.\n",
|
|||
|
"rmse = mean_squared_error(y_test, y_pred) ** 0.5\n",
|
|||
|
"r2 = r2_score(y_test, y_pred)\n",
|
|||
|
"mae = mean_absolute_error(y_test, y_pred)\n",
|
|||
|
"\n",
|
|||
|
"print()\n",
|
|||
|
"print(f\"RMSE: {rmse}\")\n",
|
|||
|
"print(f\"R²: {r2}\")\n",
|
|||
|
"print(f\"MAE: {mae} \\n\")\n",
|
|||
|
"\n",
|
|||
|
"# 5-fold cross-validation on the training split; the scorer returns negative MSE.\n",
|
|||
|
"scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n",
|
|||
|
"rmse_cv = (-scores.mean())**0.5\n",
|
|||
|
"print(f\"Кросс-валидация RMSE: {rmse_cv} \\n\")\n",
|
|||
|
"\n",
|
|||
|
"feature_importances = model.feature_importances_\n",
|
|||
|
"feature_names = X_train.columns\n",
|
|||
|
"\n",
|
|||
|
"# Training-set metrics, to compare against the held-out scores above.\n",
|
|||
|
"y_train_pred = model.predict(X_train)\n",
|
|||
|
"\n",
|
|||
|
"rmse_train = mean_squared_error(y_train, y_train_pred) ** 0.5\n",
|
|||
|
"r2_train = r2_score(y_train, y_train_pred)\n",
|
|||
|
"mae_train = mean_absolute_error(y_train, y_train_pred)\n",
|
|||
|
"\n",
|
|||
|
"print(f\"Train RMSE: {rmse_train}\")\n",
|
|||
|
"print(f\"Train R²: {r2_train}\")\n",
|
|||
|
"print(f\"Train MAE: {mae_train}\")\n",
|
|||
|
"print()"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "aimenv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.5"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|