842 lines
212 KiB
Plaintext
842 lines
212 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 4,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')\n",
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"RangeIndex: 5251 entries, 0 to 5250\n",
|
|||
|
"Data columns (total 8 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 Date 5251 non-null object \n",
|
|||
|
" 1 Open 5251 non-null float64 \n",
|
|||
|
" 2 High 5251 non-null float64 \n",
|
|||
|
" 3 Low 5251 non-null float64 \n",
|
|||
|
" 4 Close 5251 non-null float64 \n",
|
|||
|
" 5 Adj Close 5251 non-null float64 \n",
|
|||
|
" 6 Volume 5251 non-null int64 \n",
|
|||
|
" 7 date 5251 non-null datetime64[ns]\n",
|
|||
|
"dtypes: datetime64[ns](1), float64(5), int64(1), object(1)\n",
|
|||
|
"memory usage: 328.3+ KB\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pn\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import matplotlib\n",
|
|||
|
"import matplotlib.ticker as ticker\n",
|
|||
|
"from datetime import datetime\n",
|
|||
|
"import matplotlib.dates as md\n",
|
|||
|
"\n",
|
|||
|
"df = pn.read_csv(\".//static//csv//Yamana_Gold_Inc._AUY.csv\")\n",
|
|||
|
"print(df.columns)\n",
|
|||
|
"\n",
|
|||
|
"df[\"date\"] = df.apply(lambda row: datetime.strptime(row[\"Date\"], \"%m/%d/%Y\"), axis=1)\n",
|
|||
|
"df.info()\n",
|
|||
|
"#print(df['date'].head)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Разделим на 3 выборки\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 5,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 3360\n",
|
|||
|
"Размер контрольной выборки: 840\n",
|
|||
|
"Размер тестовой выборки: 1051\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"# Разделение данных на обучающую и тестовую выборки (80% - обучение, 20% - тест)\n",
|
|||
|
"train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную (80% - обучение, 20% - контроль)\n",
|
|||
|
"train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_data))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_data))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_data))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 6,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjwAAAHHCAYAAAC7soLdAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABgEklEQVR4nO3deVwU9f8H8NfsstwsyLmggIgHgmd44ZGmiBKafTWPssQ07esXzSOt6OttRZmpqaRZpvZTv5aVVlYqkFeKF4YX3qIoCojKKefu/P5ANlZABRcGx9fz8ZiH7sxnZt4zuwsvZj4zI4iiKIKIiIhIxhRSF0BERERU0xh4iIiISPYYeIiIiEj2GHiIiIhI9hh4iIiISPYYeIiIiEj2GHiIiIhI9hh4iIiISPYYeIiIakBGRgYuXLiA4uJiqUshIxJFEbdv38b58+elLoWqiIGHiMgIioqKMH/+fLRu3RpmZmaoV68emjRpgpiYGKlLeyKcPHkSW7Zs0b+Oj4/Hb7/9Jl1BZWRnZ2P69Olo1qwZTE1N4eDggKZNm+Ls2bNSl0ZVYCJ1AVTz1qxZg9dff13/2szMDB4eHggKCsKMGTPg4uIiYXVET76CggIEBQXhwIED+Pe//4158+bB0tISSqUS/v7+Upf3RMjOzsabb74JjUYDBwcHTJw4EcHBwQgJCZG0rlu3bqF79+5ISkrChAkT0KVLF5iamkKlUqFhw4aS1kZVw8DzFJk7dy68vLyQn5+Pv/76C8uXL8fvv/+OkydPwtLSUuryiJ5Yn3zyCQ4ePIjt27ejR48eUpfzRAoICNAPANC0aVOMGTNG4qqAadOm4caNG4iNjYWfn5/U5dBjYOB5igQHB6Ndu3YAgDfeeAMODg5YuHAhfv75Z7z88ssSV0f0ZCouLsbixYvx9ttvM+w8pi1btiAhIQF5eXlo2bIlTE1NJa0nLS0Na9euxYoVKxh2ZIB9eJ5iPXv2BAAkJiYCAG7fvo2pU6eiZcuWsLa2hlqtRnBwMI4dO1Zu3vz8fMyePRtNmzaFubk5XF1dMXDgQFy8eBEAcPnyZQiCUOlQ9hfDrl27IAgCvvvuO7z//vvQaDSwsrLCCy+8gKtXr5Zb98GDB9G3b1/Y2trC0tIS3bt3x759+yrcxh49elS4/tmzZ5dru27dOvj7+8PCwgL29vYYNmxYhet/0LaVpdPpsHjxYvj5+cHc3BwuLi548803cefOHYN2DRs2RL9+/cqtZ/z48eWWWVHtn376abl9CpScZpk1axYaN24MMzMzuLu745133kFBQUGF+6qsHj16oEWLFuXGL1iwAIIg4PLlywbjMzIyMGnSJLi7u8PMzAyNGzfGJ598Ap1Op29Tut8WLFhQbrktWrSo8DPxww8/VFrjyJEjH+mUQsOGDfXvj0KhgEajwdChQ5GUlPTQeQHgiy++gJ+fH8zMzODm5oawsDBkZGTop589exZ37tyBjY0NunfvDktLS9ja2qJfv344efKkvt3OnTshCAI2b95cbh0bNmyAIAiIjY3V1zxy5EiDNqX7ZNeuXfpxe/fuxeDBg+Hh4aF/jydPnoy8vDyDeWfPnl3us7R+/Xq0adMG5ubmcHBwwMsvv1xun4wcORLW1tYG43744YdydQCAtbV1uZqBR/te9ejRQ//++/r6wt/fH8eOHavwe1WR+7/njo6OCAkJMdj/QMn3Z/z48ZUuZ82aNQaf78OHD0On06GwsBDt2rV74L4CgD///BPdunWDlZUV7OzsMGDAAJw+fdqgTel7cebMGQwZMgRqtVp/Ci8/P79cvWW/78XFxXj++edhb2+PhIQEg7aP+vPracYjPE+x0nDi4OAAALh06RK2bNmCwYMHw8vLC6mpqfjyyy/RvXt3JCQkwM3NDQCg1WrRr18/xMTEYNiwYZg4cSKys7MRFRWFkydPwtvbW7+Ol19+Gc8//7zBesPDwyus58MPP4QgCHj33XeRlpaGxYsXIzAwEPHx8bCwsABQ8gMlODgY/v7+mDVrFhQKBVavXo2ePXti79696NChQ7nlNmjQABEREQCAnJwcjBs3rsJ1z5gxA0OGDMEbb7yBmzdvYunSpXj22Wfx999/w87Ortw8Y8eORbdu3QAAP/30U7lfZG+++aa+/9Rbb72FxMRELFu2DH///Tf27dsHlUpV4X6oioyMDP22laXT6fDCCy/gr7/+wtixY9G8eXOcOHECixYtwrlz5ww6hz6uu3fvonv37khOTsabb74JDw8P7N+/H+Hh4bhx4wYWL15stHVVV7du3TB27FjodDqcPHkSixcvxvXr17F3794Hzjd79mzMmTMHgYGBGDduHM6ePYvly5fj8OHD+vfw1q1bAEo+102aNMGcOXOQn5+PyMhIdOnSBYcPH0bTpk3Ro0cPuLu7Y/369fjXv/5lsJ7169fD29tbfzrnUW3atAl3797FuHHj4ODggEOHDmHp0qW4du0aNm3aVOl8GzZswKuvvorWrVsjIiICt27dwpIlS/DXX3/h77//hqOjY5XqqEx1vlel3n333Sqty8fHB//9738hiiIuXryIhQsX4vnnn3/kYFuR0vd2/Pjx8Pf3x8cff4ybN29WuK+io6MRHByMRo0aYfbs2cjLy8PSpUvRpUsXHD16tFw4HzJkCBo2bIiIiAgcOHAAS5YswZ07d/Dtt99WWs8bb7yBXbt2ISoqCr6+vvrxj7Ofnyoiyd7q1atFAGJ0dLR48+ZN8erVq+LGjRtFBwcH0cLCQrx27ZooiqKYn58varVag3kTExNFMzMzce7cufpx33zzjQhAXLhwYbl16XQ6/XwAxE8//bRcGz8/P7F79+761zt37hQBiPXr1xezsrL047///nsRgPj555/rl92kSROxT58++vWIoijevXtX9PLyEnv37l1uXZ07dxZbtGihf33z5k0RgDhr1iz9uMuXL4tKpVL88MMPDeY9ceKEaGJiUm78+fPnRQDi2rVr9eNmzZollv067d27VwQgrl+/3mDebdu2lRvv6ekphoSElKs9LCxMvP8ren/t77zzjujs7Cz6+/sb7NP/+7//ExUKhbh3716D+VesWCECEPft21dufWV1795d9PPzKzf+008/FQGIiYmJ+nHz5s0TraysxHPnzhm0fe+990SlUikmJSWJoli9z8SmTZsqrTE0NFT09PR84HaIYsn+DQ0NNRj3yiuviJaWlg+cLy0tTTQ1NRWDgoIMvhfLli0TAYjffPONQa2Ojo5ienq6vt25c+dElUolDho0SD8uPDxcNDMzEzMyMgzWY2JiYvC+enl5iSNGjDCop3Q9O3fu1I+7e/duubojIiJEQRDEK1eu6MeV/XwWFxeLLi4uore3t5iTk6Nvs2vXLhGA+Pbbb+vHhYaGilZWVgbL37RpU7k6RFEUraysDPZzVb5X3bt3N3j/f//9dxGA2Ldv33LfgYrcP78oiuL7778vAhDT0tL04wCIYWFhlS6n9Gdl6ee79LWvr6/Bvi59L8ruqzZt2ojOzs7irVu39OOOHTsmKhQKg/ey9L144YUXDNb9n//8RwQgHjt2zKDe0s9FeHi4qFQqxS1bthjMV9WfX08zntJ6igQGBsLJyQnu7u4YNmwYrK2tsXnzZtSvXx9AydVbCkXJR0Kr1eLWrVuwtrZGs2bNcPToUf1yfvzxRzg6OmLChAnl1vEoh58rM2LECNjY2Ohfv/TSS3B1dcXvv/8OoOQy1fPnz+OVV17BrVu3kJ6ejvT0dOTm5qJXr17Ys2ePwSkUoOTUm7m5+QPX+9NPP0Gn02HIkCH6Zaanp0Oj0aBJkybYuXOnQfvCwkIAJfurMps2bYKtrS169+5tsEx/f39YW1uXW2ZRUZFBu/T09HKHt++XnJyMpUuXYsaMGeVOO2zatAnNmzeHj4+PwTJLT2Pev/7HsWnTJnTr1g316tUzWFdgYCC0Wi327Nlj0P7u3bvltlWr1Va47OzsbKSnpxucQqqOgoICpKenIy0tDVFRUfjzzz/Rq1evB84THR2NwsJCTJo0Sf+9AIAxY8ZArVaXu2T69ddf1x8tBYAmTZrghRd
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjsAAAHHCAYAAABZbpmkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABkxklEQVR4nO3deVhUZf8G8PvMAMO+7wqCIrjigkvmvotLWaZZWmqW1YtWmlZU5tKiZpm+alm/cnvVTCu1VXNf0RTFFRUJBJFtRHYYmJnz+wOZHFkUBM5wuD/XdS6Yc555zncGZrh55jnnCKIoiiAiIiKSKYXUBRARERHVJoYdIiIikjWGHSIiIpI1hh0iIiKSNYYdIiIikjWGHSIiIpI1hh0iIiKSNYYdIiIikjUzqQsgIiKSg6KiImRkZECv18Pb21vqcuguHNkhIiKTtmHDBsTHxxtur127FklJSdIVdJdTp07h2WefhaurK1QqFby8vDBq1Cipy6J7MOzIyNq1ayEIgmGxtLREYGAgpk6ditTUVKnLIyKqlsOHD+Ott95CfHw8du3ahbCwMCgU0v/52rFjB3r06IFLly7h448/xu7du7F79258/fXXUpdG9+DHWDI0f/58+Pv7o7CwEEeOHMFXX32FP/74AxcuXIC1tbXU5RERVcn06dPRp08f+Pv7AwBmzJgBLy8vSWvKyMjAiy++iMGDB2Pr1q2wsLCQtB6qHMOODIWGhqJTp04AgBdffBEuLi5YsmQJduzYgWeeeUbi6oiIqqZFixaIjY3FhQsX4OrqimbNmkldEtasWYPCwkKsXbuWQacekH4ckGpdv379AABxcXEASv4jmTlzJtq2bQtbW1vY29sjNDQUZ8+eLXPfwsJCzJ07F4GBgbC0tISXlxeefPJJxMbGAgDi4+ONPjq7d+nTp4+hrwMHDkAQBPzwww9499134enpCRsbGzz22GNITEwss+8TJ05gyJAhcHBwgLW1NXr37o2jR4+W+xj79OlT7v7nzp1bpu2GDRsQEhICKysrODs7Y+zYseXuv7LHdje9Xo+lS5eidevWsLS0hIeHB15++WXcvn3bqJ2fnx+GDx9eZj9Tp04t02d5tS9evLjMcwoAGo0Gc+bMQUBAAFQqFXx8fPDWW29Bo9GU+1zdrU+fPmjTpk2Z9Z999hkEQTCaJwEAmZmZeOONN+Dj4wOVSoWAgAAsWrQIer3e0Kb0efvss8/K9NumTZtyfyd+/PHHCmucOHEi/Pz87vtY/Pz8DD8fhUIBT09PPP3000hISHig+06cONFo3ZQpU2BpaYkDBw4Yrf/yyy/RunVrqFQqeHt7IywsDJmZmUZtHvR5vbvm8pbSx333c/rFF1+gSZMmsLKyQu/evXHhwoUy+9m3bx969uwJGxsbODo64vHHH0d0dPR9n7e7l7sfd0W/u3erys8dANLS0jB58mR4eHjA0tIS7dq1w7p168rtc+3atbCxsUHXrl3RrFkzhIWFQRCEMj+zimoqXczNzeHn54dZs2ahqKjI0K50CsCpU6cq7KtPnz5Gj+H48eNo3749PvnkE8ProXnz5li4cKHR6wEAtFotPvzwQzRr1gwqlQp+fn549913y7xGS5/nv/76C+3bt4elpSVatWqFn3/+2ahdab13vz4vXrwIJycnDB8+HFqt1rD+QV6zDQFHdhqA0mDi4uICAPjnn3+wfft2jB49Gv7+/khNTcXXX3+N3r1749KlS4ajCHQ6HYYPH469e/di7NixeP3115GTk4Pdu3fjwoULRv9dPfPMMxg6dKjRfsPDw8ut5+OPP4YgCHj77beRlpaGpUuXYsCAAYiKioKVlRWAkjfr0NBQhISEYM6cOVAoFFizZg369euHw4cPo0uXLmX6bdy4MRYsWAAAyM3NxauvvlruvmfPno0xY8bgxRdfRHp6OpYvX45evXrhzJkzcHR0LHOfKVOmoGfPngCAn3/+Gdu2bTPa/vLLL2Pt2rWYNGkSXnvtNcTFxWHFihU4c+YMjh49CnNz83Kfh6rIzMw0PLa76fV6PPbYYzhy5AimTJmCli1b4vz58/jiiy9w9epVbN++/aH3XSo/Px+9e/dGUlISXn75Zfj6+uLYsWMIDw9HcnIyli5dWmP7qq6ePXtiypQp0Ov1uHDhApYuXYqbN2/i8OHDVepnzpw5+O677/DDDz8Y/YGbO3cu5s2bhwEDBuDVV1/FlStX8NVXX+HkyZPV+lkvXboUubm5AIDo6Gh88sknePfdd9GyZUsAgK2trVH79evXIycnB2FhYSgsLMSyZcvQr18/nD9/Hh4eHgCAPXv2IDQ0FE2bNsXcuXNRUFCA5cuXo3v37jh9+nS5wbH0ebu7jtpUUFCAPn364Nq1a5g6dSr8/f2xdetWTJw4EZmZmXj99dcrvO+1a9fwf//3f1XaX+lrWKPRYNeuXfjss89gaWmJDz/8sNqP4datWzhy5AiOHDmCF154ASEhIdi7dy/Cw8MRHx+PVatWGdq++OKLWLduHZ566im8+eabOHHiBBYsWIDo6Ogy7ycxMTF4+umn8corr2DChAlYs2YNRo8ejZ07d2LgwIHl1pKYmIghQ4agRYsW2LJlC8zMSv6014fXbJ0RSTbWrFkjAhD37Nkjpqeni4mJieLmzZtFFxcX0crKSrxx44YoiqJYWFgo6nQ6o/vGxcWJKpVKnD9/vmHd6tWrRQDikiVLyuxLr9cb7gdAXLx4cZk2rVu3Fnv37m24vX//fhGA2KhRIzE7O9uwfsuWLSIAcdmyZYa+mzdvLg4ePNiwH1EUxfz8fNHf318cOHBgmX09+uijYps2bQy309PTRQDinDlzDOvi4+NFpVIpfvzxx0b3PX/+vGhmZlZmfUxMjAhAXLdunWHdnDlzxLtfNocPHxYBiBs3bjS6786dO8usb9KkiThs2LAytYeFhYn3vhTvrf2tt94S3d3dxZCQEKPn9H//+5+oUCjEw4cPG91/1apVIgDx6NGjZfZ3t969e4utW7cus37x4sUiADEuLs6w7sMPPxRtbGzEq1evGrV95513RKVSKSYkJIiiWL3fia1bt1ZY44QJE8QmTZpU+jhEseT5nTBhgtG6Z599VrS2tq7Sfb/++msRgLh8+XKjNmlpaaKFhYU4aNAgo9fPihUrRADi6tWrDeuq8ryWKn0u9u/fX2Zb6XN69+tYFEXxxIkTIgBx+vTphnXt27cX3d3dxVu3bhnWnT17VlQoFOLzzz9fpu9GjRqJkyZNqrSOin53y6vxQX7uS5cuFQGIGzZsMKwrKioSu3XrJtra2hreH0r7XLNmjaHdmDFjxDZt2og+Pj5lft4V1XT3/UVRFL29vcWhQ4cabpe+d548ebLCvnr37m30GHr37i0CEOfOnWvUbuLEiSIA8fz586IoimJUVJQIQHzxxReN2s2cOVMEIO7bt8+wrkmTJiIA8aeffjKsy8rKEr28vMQOHTqUqTcuLk7MyMgQW7VqJQYFBYlqtdpoHw/6mm0I+DGWDA0YMABubm7w8fHB2LFjYWtri23btqFRo0YAAJVKZTiSQafT4datW7C1tUVQUBBOnz5t6Oenn36Cq6srpk2bVmYf937sUhXPP/887OzsDLefeuopeHl54Y8//gAAREVFISYmBs8++yxu3boFtVoNtVqNvLw89O/fH4cOHSozBFtYWAhLS8tK9/vzzz9Dr9djzJgxhj7VajU8PT3RvHlz7N+/36h96TC3SqWqsM+tW7fCwcEBAwcONOozJCQEtra2ZfosLi42aqdWq1FYWFhp3UlJSVi+fDlmz55d5j/9rVu3omXLlmjRooVRn6UfXd67/4exdetW9OzZE05OTkb7GjBgAHQ6HQ4dOmTUPj8/v8xj1el05fadk5MDtVpd5uOgqtJoNFCr1UhLS8Pu3buxb98+9O/f/4Hvv2PHDvznP//BrFmzMHXqVKNte/bsQVFREd544w2jI4Feeukl2Nvb4/fffzdqr9Ppyjz+/Pz8h3p8I0eONLyOAaBLly7o2rWr4bWTnJyMqKgoTJw4Ec7OzoZ2wcHBGDhwoKHd3YqKiir9HS9
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjsAAAHHCAYAAABZbpmkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABcJUlEQVR4nO3dd3wVVf7/8dfcm+SmJ6QnkEDoHZQmKALSxYq9omJZf2DDttho7rKKq7iCbVXQFdYVC3aUjiJdIy0gPbQ0QnrPnd8fIffLJaEkJNzk8n4+HvcR7syZM5+5JXkzc2bGME3TRERERMRNWVxdgIiIiEhdUtgRERERt6awIyIiIm5NYUdERETcmsKOiIiIuDWFHREREXFrCjsiIiLi1hR2RERExK0p7IiIyHmvtLSU1NRUkpKSXF2K1AGFHRERqTPffvstCQkJjufz589ny5YtrivoODt27OC+++4jOjoaLy8vIiMj6d27N7qxgPtR2BEns2fPxjAMx8Pb25vWrVszduxYUlJSXF2eiDQwmzZt4pFHHmHHjh2sXr2av/zlL+Tk5Li6LFavXk3Pnj1ZsmQJf/3rX/nxxx9ZuHAh8+fPxzAMV5cntczQvbHkeLNnz+buu+9m8uTJxMfHU1hYyC+//MJ//vMfmjZtyubNm/H19XV1mSLSQKSlpdGnTx927twJwMiRI/n8889dWlNxcTFdunQhMDCQn376iaCgIJfWI3XPw9UFSP00fPhwunfvDsC9995LaGgor776Kl999RW33HKLi6sTkYYiPDyczZs3O/6j1K5dO1eXxDfffMP27dvZtm2bgs55Qoex5IxcdtllAOzZsweAjIwMnnjiCTp16oS/vz+BgYEMHz6cP/74o9KyhYWFTJw4kdatW+Pt7U10dDQjR45k165dAOzdu9fp0NmJj/79+zv6WrZsGYZh8L///Y9nnnmGqKgo/Pz8uOqqq9i/f3+lda9Zs4Zhw4YRFBSEr68v/fr1Y+XKlVVuY//+/atc/8SJEyu1/fjjj+nWrRs+Pj6EhIRw8803V7n+U23b8ex2O9OnT6dDhw54e3sTGRnJAw88wNGjR53aNWvWjCuuuKLSesaOHVupz6pqnzZtWqXXFKCoqIgJEybQsmVLbDYbsbGxPPXUUxQVFVX5Wh2vf//+dOzYsdL0V155BcMw2Lt3r9P0zMxMHn30UWJjY7HZbLRs2ZKXXnoJu93uaFPxur3yyiuV+u3YsWOVn4nPPvvspDXeddddNGvW7LTb0qxZM8f7Y7FYiIqK4qabbjrtoNXjl6vqcfy6z/S9Bvjhhx/o168fAQEBBAYG0qNHD+bOnQuc/PNa1WestLSUKVOm0KJFC2w2G82aNeOZZ56p9P6e6fbn5eXx+OOPO97DNm3a8Morr1Qa61LxGbTZbHTr1o127dqd9DNYleO3xWq10rhxY+6//34yMzMdbWry/q9evZr4+Hg+//xzWrRogZeXF3FxcTz11FMUFBRUWv7NN9+kQ4cO2Gw2YmJiGDNmjFMN8H/fgw0bNtCnTx98fHyIj4/n7bffdmpXUe+yZcsc0w4dOkSzZs3o3r07ubm5juln870UZ9qzI2ekIpiEhoYCsHv3bubPn88NN9xAfHw8KSkpvPPOO/Tr14+tW7cSExMDQFlZGVdccQWLFy/m5ptv5pFHHiEnJ4eFCxeyefNmWrRo4VjHLbfcwuWXX+603vHjx1dZz9/+9jcMw+Dpp58mNTWV6dOnM2jQIBISEvDx8QFgyZIlDB8+nG7dujFhwgQsFguzZs3isssu4+eff6Znz56V+m3SpAlTp04FIDc3lwcffLDKdT///PPceOON3HvvvaSlpfHGG29w6aWX8vvvvxMcHFxpmfvvv5++ffsC8MUXX/Dll186zX/ggQcchxAffvhh9uzZw4wZM/j9999ZuXIlnp6eVb4O1ZGZmenYtuPZ7XauuuoqfvnlF+6//37atWvHpk2beO211/jzzz+ZP3/+Wa+7Qn5+Pv369ePgwYM88MADxMXF8euvvzJ+/HgOHz7M9OnTa21dNdW3b1/uv/9+7HY7mzdvZvr06Rw6dIiff/75pMtMnz7d8UcqMTGRv//97zzzzDOOvRj+/v6Otmf6Xs+ePZt77rmHDh06MH78eIKDg/n9999ZsGABt956K88++yz33nsvAOnp6Tz22GNOn7Pj3XvvvXz44Ydcf/31PP7446xZs4apU6eSmJhY6bN4uu03TZOrrrqKpUuXMnr0aLp27cqPP/7Ik08+ycGDB3nttddO+jqd7DN4Ktdeey0jR46ktLSUVatW8e6771JQUMB//vOfavVzvCNHjrB7926eeeYZRo4cyeOPP8769euZNm0amzdv5rvvvnOExYkTJzJp0iQGDRrEgw8+yPbt23nrrbdYt25dpe/m0aNHufzyy7nxxhu55ZZb+PTTT3nwwQfx8vLinnvuqbKWrKwshg8fjqenJ99//73js3Iuv5fnBVPkOLNmzTIBc9GiRWZaWpq5f/9+85NPPjFDQ0NNHx8f88CBA6ZpmmZhYaFZVlbmtOyePXtMm81mTp482THtgw8+MAHz1VdfrbQuu93uWA4wp02bVqlNhw4dzH79+jmeL1261ATMxo0bm9nZ2Y7pn376qQmYr7/+uqPvVq1amUOHDnWsxzRNMz8/34yPjzcHDx5caV19+vQxO3bs6HielpZmAuaECRMc0/bu3WtarVbzb3/7m9OymzZtMj08PCpN37FjhwmYH374oWPahAkTzOO/ej///LMJmHPmzHFadsGCBZWmN23a1BwxYkSl2seMGWOe+HU+sfannnrKjIiIMLt16+b0mv7nP/8xLRaL+fPPPzst//bbb5uAuXLlykrrO16/fv3MDh06VJo+bdo0EzD37NnjmDZlyhTTz8/P/PPPP53a/vWvfzWtVquZlJRkmmbNPhPz5s07aY2jRo0ymzZtesrtMM3y13fUqFFO02699VbT19f3tMueWM/SpUsrzTvT9zozM9MMCAgwe/XqZRYUFDi1Pf7zXKHi9Zo1a1aleQkJCSZg3nvvvU7Tn3jiCRMwlyxZ4ph2Jts/f/58EzBffPFFp3bXX3+9aRiGuXPnTse0M/0MnsyJy5tm+fe0ffv2juc1ef9HjRplAuZdd93l1K7iu/nNN9+YpmmaqampppeXlzlkyBCn33czZswwAfODDz5wTOvXr58JmP/85z8d04qKisyuXbuaERERZnFxsVO9S5cuNQsLC83+/fubERERTq+baZ7991Kc6TCWVGnQoEGEh4cTGxvLzTffjL+/P19++SWNGzcGwGazYbGUf3zKyso4cuQI/v7+tGnTht9++83Rz+eff05YWBgPPfRQpXWczRkPd955JwEBAY7n119/PdHR0Xz//fcAJCQksGPHDm699VaOHDlCeno66enp5OXlMXDgQFasWOF02ATKD7d5e3ufcr1ffPEFdrudG2+80dFneno6UVFRtGrViqVLlzq1Ly4uBspfr5OZN28eQUFBDB482KnPbt264e/vX6nPkpISp3bp6ekUFhaesu6DBw/yxhtv8PzzzzvtZahYf7t27Wjbtq1TnxWHLk9c/9mYN28effv2pVGjRk7rGjRoEGVlZaxYscKpfX5+fqVtLSsrq7LvnJwc0tPTKx1eqK6ioiLS09NJTU1l4cKFLFmyhIEDB55VnxXO9L1euHAhOTk5/PWvf630mazu96biOzFu3Din6Y8//jgA3333ndP0023/999/j9Vq5eGHH67Un2ma/PDDD1XWcarP4KlUfAaSk5P5/PPP+eOPP6p8P2ry/j/55JNOzx977DGsVqvjNVm0aBHFxcU8+uijjt93APfddx+BgYGVXjsPDw8eeOABx3MvLy8eeOABUlNT2bBhg1Nbu93OnXfeyerVq/n++++d9nLDuf1eng90GEuqNHPmTFq3bo2HhweRkZG0adPG6ctut9t5/fXXefPNN9mzZ4/TH6CKQ11QfvirTZs2eHjU7ketVatWTs8Nw6Bly5aO8SE7duwAYNSoUSftIysri0aNGjmep6enV+r3RDt27MA0zZO2O/F
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"# Гистограмма распределения объема в обучающей выборке\n",
|
|||
|
"sns.histplot(train_data[\"Volume\"], kde=True)\n",
|
|||
|
"plt.title('Распределение цены в обучающей выборке')\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Гистограмма распределения объема в контрольной выборке\n",
|
|||
|
"sns.histplot(val_data[\"Volume\"], kde=True)\n",
|
|||
|
"plt.title('Распределение цены в контрольной выборке')\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Гистограмма распределения объема в тестовой выборке\n",
|
|||
|
"sns.histplot(test_data[\"Volume\"], kde=True)\n",
|
|||
|
"plt.title('Распределение цены в тестовой выборке')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Процесс конструирования признаков\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"### Унитарное кодирование категориальных признаков (one-hot encoding)\n",
|
|||
|
"\n",
|
|||
|
"One-hot encoding: Преобразование категориальных признаков в бинарные векторы."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 7,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"# Пример категориальных признаков\n",
|
|||
|
"categorical_features = [\n",
|
|||
|
" \"Date\",\n",
|
|||
|
" \"date\"\n",
|
|||
|
"]\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding\n",
|
|||
|
"train_data_encoded = pd.get_dummies(train_data, columns=categorical_features)\n",
|
|||
|
"val_data_encoded = pd.get_dummies(val_data, columns=categorical_features)\n",
|
|||
|
"test_data_encoded = pd.get_dummies(test_data, columns=categorical_features)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Дискретизация числовых признаков "
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 8,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>High</th>\n",
|
|||
|
" <th>High</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5231</th>\n",
|
|||
|
" <td>5.83</td>\n",
|
|||
|
" <td>(1.143, 7.625]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5232</th>\n",
|
|||
|
" <td>5.93</td>\n",
|
|||
|
" <td>(1.143, 7.625]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5233</th>\n",
|
|||
|
" <td>6.06</td>\n",
|
|||
|
" <td>(1.143, 7.625]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5234</th>\n",
|
|||
|
" <td>6.04</td>\n",
|
|||
|
" <td>(1.143, 7.625]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5235</th>\n",
|
|||
|
" <td>6.15</td>\n",
|
|||
|
" <td>(1.143, 7.625]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5236</th>\n",
|
|||
|
" <td>6.23</td>\n",
|
|||
|
" <td>(1.143, 7.625]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5237</th>\n",
|
|||
|
" <td>6.40</td>\n",
|
|||
|
" <td>(1.143, 7.625]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5238</th>\n",
|
|||
|
" <td>6.25</td>\n",
|
|||
|
" <td>(1.143, 7.625]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5239</th>\n",
|
|||
|
" <td>6.33</td>\n",
|
|||
|
" <td>(1.143, 7.625]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5240</th>\n",
|
|||
|
" <td>6.25</td>\n",
|
|||
|
" <td>(1.143, 7.625]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5241</th>\n",
|
|||
|
" <td>6.03</td>\n",
|
|||
|
" <td>(1.143, 7.625]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5242</th>\n",
|
|||
|
" <td>5.72</td>\n",
|
|||
|
" <td>(1.143, 7.625]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5243</th>\n",
|
|||
|
" <td>5.65</td>\n",
|
|||
|
" <td>(1.143, 7.625]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5244</th>\n",
|
|||
|
" <td>5.44</td>\n",
|
|||
|
" <td>(1.143, 7.625]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5245</th>\n",
|
|||
|
" <td>5.61</td>\n",
|
|||
|
" <td>(1.143, 7.625]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5246</th>\n",
|
|||
|
" <td>5.69</td>\n",
|
|||
|
" <td>(1.143, 7.625]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5247</th>\n",
|
|||
|
" <td>5.39</td>\n",
|
|||
|
" <td>(1.143, 7.625]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5248</th>\n",
|
|||
|
" <td>5.53</td>\n",
|
|||
|
" <td>(1.143, 7.625]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5249</th>\n",
|
|||
|
" <td>5.61</td>\n",
|
|||
|
" <td>(1.143, 7.625]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5250</th>\n",
|
|||
|
" <td>5.66</td>\n",
|
|||
|
" <td>(1.143, 7.625]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" High High\n",
|
|||
|
"5231 5.83 (1.143, 7.625]\n",
|
|||
|
"5232 5.93 (1.143, 7.625]\n",
|
|||
|
"5233 6.06 (1.143, 7.625]\n",
|
|||
|
"5234 6.04 (1.143, 7.625]\n",
|
|||
|
"5235 6.15 (1.143, 7.625]\n",
|
|||
|
"5236 6.23 (1.143, 7.625]\n",
|
|||
|
"5237 6.40 (1.143, 7.625]\n",
|
|||
|
"5238 6.25 (1.143, 7.625]\n",
|
|||
|
"5239 6.33 (1.143, 7.625]\n",
|
|||
|
"5240 6.25 (1.143, 7.625]\n",
|
|||
|
"5241 6.03 (1.143, 7.625]\n",
|
|||
|
"5242 5.72 (1.143, 7.625]\n",
|
|||
|
"5243 5.65 (1.143, 7.625]\n",
|
|||
|
"5244 5.44 (1.143, 7.625]\n",
|
|||
|
"5245 5.61 (1.143, 7.625]\n",
|
|||
|
"5246 5.69 (1.143, 7.625]\n",
|
|||
|
"5247 5.39 (1.143, 7.625]\n",
|
|||
|
"5248 5.53 (1.143, 7.625]\n",
|
|||
|
"5249 5.61 (1.143, 7.625]\n",
|
|||
|
"5250 5.66 (1.143, 7.625]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 8,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"labels = [\"low hight price\", \"medium hight price\", \"big hight price\"]\n",
|
|||
|
"num_bins = 3\n",
|
|||
|
"\n",
|
|||
|
"hist1, bins1 = np.histogram(\n",
|
|||
|
" df[\"High\"].fillna(df[\"High\"].median()), bins=num_bins\n",
|
|||
|
")\n",
|
|||
|
"bins1, hist1\n",
|
|||
|
"\n",
|
|||
|
"pd.concat([df[\"High\"], pd.cut(df[\"High\"], list(bins1))], axis=1).tail(20)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 9,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>High</th>\n",
|
|||
|
" <th>High</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>3.428571</td>\n",
|
|||
|
" <td>low hight price</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>3.428571</td>\n",
|
|||
|
" <td>low hight price</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>3.714286</td>\n",
|
|||
|
" <td>low hight price</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>3.714286</td>\n",
|
|||
|
" <td>low hight price</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>3.714286</td>\n",
|
|||
|
" <td>low hight price</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>3.714286</td>\n",
|
|||
|
" <td>low hight price</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6</th>\n",
|
|||
|
" <td>3.714286</td>\n",
|
|||
|
" <td>low hight price</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7</th>\n",
|
|||
|
" <td>3.714286</td>\n",
|
|||
|
" <td>low hight price</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8</th>\n",
|
|||
|
" <td>3.428571</td>\n",
|
|||
|
" <td>low hight price</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9</th>\n",
|
|||
|
" <td>3.428571</td>\n",
|
|||
|
" <td>low hight price</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10</th>\n",
|
|||
|
" <td>3.428571</td>\n",
|
|||
|
" <td>low hight price</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>11</th>\n",
|
|||
|
" <td>3.428571</td>\n",
|
|||
|
" <td>low hight price</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12</th>\n",
|
|||
|
" <td>3.428571</td>\n",
|
|||
|
" <td>low hight price</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>13</th>\n",
|
|||
|
" <td>2.857143</td>\n",
|
|||
|
" <td>low hight price</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>14</th>\n",
|
|||
|
" <td>3.142857</td>\n",
|
|||
|
" <td>low hight price</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15</th>\n",
|
|||
|
" <td>3.142857</td>\n",
|
|||
|
" <td>low hight price</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16</th>\n",
|
|||
|
" <td>3.428571</td>\n",
|
|||
|
" <td>low hight price</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>17</th>\n",
|
|||
|
" <td>3.714286</td>\n",
|
|||
|
" <td>low hight price</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18</th>\n",
|
|||
|
" <td>3.142857</td>\n",
|
|||
|
" <td>low hight price</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>19</th>\n",
|
|||
|
" <td>3.142857</td>\n",
|
|||
|
" <td>low hight price</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" High High\n",
|
|||
|
"0 3.428571 low hight price\n",
|
|||
|
"1 3.428571 low hight price\n",
|
|||
|
"2 3.714286 low hight price\n",
|
|||
|
"3 3.714286 low hight price\n",
|
|||
|
"4 3.714286 low hight price\n",
|
|||
|
"5 3.714286 low hight price\n",
|
|||
|
"6 3.714286 low hight price\n",
|
|||
|
"7 3.714286 low hight price\n",
|
|||
|
"8 3.428571 low hight price\n",
|
|||
|
"9 3.428571 low hight price\n",
|
|||
|
"10 3.428571 low hight price\n",
|
|||
|
"11 3.428571 low hight price\n",
|
|||
|
"12 3.428571 low hight price\n",
|
|||
|
"13 2.857143 low hight price\n",
|
|||
|
"14 3.142857 low hight price\n",
|
|||
|
"15 3.142857 low hight price\n",
|
|||
|
"16 3.428571 low hight price\n",
|
|||
|
"17 3.714286 low hight price\n",
|
|||
|
"18 3.142857 low hight price\n",
|
|||
|
"19 3.142857 low hight price"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 9,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"pd.concat(\n",
|
|||
|
" [df[\"High\"], pd.cut(df[\"High\"], list(bins1), labels=labels)], axis=1\n",
|
|||
|
").head(20)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Ручной синтез"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 10,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Пример синтеза признака среднего значения в максимальной и минимальной цене\n",
|
|||
|
"train_data_encoded[\"medium\"] = train_data_encoded[\"High\"] / train_data_encoded[\"Low\"]\n",
|
|||
|
"val_data_encoded[\"medium\"] = val_data_encoded[\"High\"] / val_data_encoded[\"Low\"]\n",
|
|||
|
"test_data_encoded[\"medium\"] = test_data_encoded[\"High\"] / test_data_encoded[\"Low\"]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Масштабирование признаков - это процесс преобразования числовых признаков таким образом, чтобы они имели одинаковый масштаб. Это важно для многих алгоритмов машинного обучения, которые чувствительны к масштабу признаков, таких как линейная регрессия, метод опорных векторов (SVM) и нейронные сети."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 11,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
|
|||
|
"\n",
|
|||
|
"# Пример масштабирования числовых признаков\n",
|
|||
|
"numerical_features = [\"Open\", \"Close\"]\n",
|
|||
|
"\n",
|
|||
|
"scaler = StandardScaler()\n",
|
|||
|
"train_data_encoded[numerical_features] = scaler.fit_transform(train_data_encoded[numerical_features])\n",
|
|||
|
"val_data_encoded[numerical_features] = scaler.transform(val_data_encoded[numerical_features])\n",
|
|||
|
"test_data_encoded[numerical_features] = scaler.transform(test_data_encoded[numerical_features])"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Конструирование признаков с применением фреймворка Featuretools"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 12,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|||
|
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|||
|
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import featuretools as ft\n",
|
|||
|
"\n",
|
|||
|
"# Определение сущностей\n",
|
|||
|
"es = ft.EntitySet(id='gold_data')\n",
|
|||
|
"\n",
|
|||
|
"es = es.add_dataframe(dataframe_name='yamana', dataframe=train_data_encoded, index='id')\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# Генерация признаков\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(\n",
|
|||
|
" entityset=es, target_dataframe_name=\"yamana\", max_depth=2\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование признаков для контрольной и тестовой выборок\n",
|
|||
|
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_data_encoded.index)\n",
|
|||
|
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_data_encoded.index)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Оценка качества каждого набора признаков\n",
|
|||
|
"Предсказательная способность\n",
|
|||
|
"Метрики: RMSE, MAE, R²\n",
|
|||
|
"\n",
|
|||
|
"Методы: Обучение модели на обучающей выборке и оценка на контрольной и тестовой выборках.\n",
|
|||
|
"\n",
|
|||
|
"Скорость вычисления\n",
|
|||
|
"Методы: Измерение времени выполнения генерации признаков и обучения модели.\n",
|
|||
|
"\n",
|
|||
|
"Надежность\n",
|
|||
|
"Методы: Кросс-валидация, анализ чувствительности модели к изменениям в данных.\n",
|
|||
|
"\n",
|
|||
|
"Корреляция\n",
|
|||
|
"Методы: Анализ корреляционной матрицы признаков, удаление мультиколлинеарных признаков.\n",
|
|||
|
"\n",
|
|||
|
"Цельность\n",
|
|||
|
"Методы: Проверка логической связи между признаками и целевой переменной, интерпретация результатов модели."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 13,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:724: UserWarning: A Woodwork-initialized DataFrame was provided, so the following parameters were ignored: index\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|||
|
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|||
|
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import featuretools as ft\n",
|
|||
|
"\n",
|
|||
|
"# Определение сущностей\n",
|
|||
|
"es = ft.EntitySet(id='gold_data')\n",
|
|||
|
"es = es.add_dataframe(\n",
|
|||
|
" dataframe_name=\"yamana\", dataframe=train_data_encoded, index=\"id\"\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Генерация признаков\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(\n",
|
|||
|
" entityset=es, target_dataframe_name=\"yamana\", max_depth=2\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование признаков для контрольной и тестовой выборок\n",
|
|||
|
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_data_encoded.index)\n",
|
|||
|
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_data_encoded.index)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 14,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"RMSE: 1764152.3991770656\n",
|
|||
|
"R²: 0.942082609353535\n",
|
|||
|
"MAE: 1161195.586464497\n",
|
|||
|
"Cross-validated RMSE: 4835663.342127571\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Train RMSE: 1789275.1500045008\n",
|
|||
|
"Train R²: 0.944270691246794\n",
|
|||
|
"Train MAE: 1134044.475345238\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0EAAAIjCAYAAADFthA8AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAD64UlEQVR4nOzdd3hUZdoG8Puc6T2FFAg9NGkqomBFRUHFgg1XcBWsKCg2FHVVwF6wgm1dsK5ddu2IfqBrQxEBg1ISiInppEyfOXPK90fIyJBCJiSZlPt3XVwX856ZOc/MQHKeeZ/3eQVN0zQQERERERF1E2KiAyAiIiIiImpPTIKIiIiIiKhbYRJERERERETdCpMgIiIiIiLqVpgEERERERFRt8IkiIiIiIiIuhUmQURERERE1K0wCSIiIiIiom6FSRAREREREXUrTIKIiLoAQRCwcOHCRIeRcMcffzyOP/746O38/HwIgoCXXnopYTHta98Y28vMmTPRv3//dj8vEVFHxCSIiGgfzzzzDARBwLhx41r8HMXFxVi4cCE2btzYeoF1cGvXroUgCNE/BoMBAwcOxMUXX4ydO3cmOry4fPfdd1i4cCFqamra/dwbNmyAIAj4xz/+0eh9duzYAUEQcOONN7ZjZEREXQeTICKifbz++uvo378/fvzxR+Tm5rboOYqLi7Fo0aJulQTVue666/Dqq6/ihRdewJQpU/DWW2/h8MMPR3FxcbvH0q9fPwSDQfz973+P63HfffcdFi1alJAkaMyYMRg2bBjeeOONRu/z73//GwBw0UUXtVdYRERdCpMgIqK97Nq1C9999x0ee+wxpKWl4fXXX090SJ3Osccei4suugizZs3C008/jUcffRRVVVV4+eWXG32M3+9vk1gEQYDZbIZOp2uT528rM2bMwM6dO/HDDz80ePyNN97AsGHDMGbMmHaOjIioa2ASRES0l9dffx3JycmYMmUKzjvvvEaToJqaGtxwww3o378/TCYTevfujYsvvhi7d+/G2rVrcfjhhwMAZs2aFS0Pq1uX0r9/f8ycObPec+67VkSSJNx111047LDD4HK5YLPZcOyxx2LNmjVxv66ysjLo9XosWrSo3rFt27ZBEAQsXboUABCJRLBo0SIMHjwYZrMZqampOOaYY7B69eq4zwsAJ554IoDaBBMAFi5cCEEQ8Ntvv2H69OlITk7GMcccE73/a6+9hsMOOwwWiwUpKSn429/+hsLCwnrP+8ILLyA7OxsWiwVHHHEE/ve//9W7T2NrgrZu3Ypp06YhLS0NFosFQ4cOxR133BGNb/78+QCAAQMGRD+//Pz8NomxITNmzADw14zP3n7++Wds27Yteh+gtoRzxIgRMJlM6NWrF+bMmbPfWay68sW1a9fGjDf0ns2cORN2ux0FBQU4/fTTYbfbkZWVhWXLlgEAfv31V5x44omw2Wzo169fg3HX1NTg+uuvR58+fWAymTBo0CA89NBDUFW1We8JEVFrYhJERLSX119/Heeccw6MRiMuvPBC7NixAz/99FPMfXw+H4499lg8/fTTmDRpEp588knMnj0bW7duxZ9//omDDjoIixcvBgBceeWVePXVV/Hqq6/iuOOOiysWj8eDF198EccffzweeughLFy4EBUVFZg8eXLcZXYZGRmYMGEC3n777XrH3nrrLeh0Opx//vkAapOARYsW4YQTTsDSpUtxxx13oG/fvtiwYUNc56yTl5cHAEhNTY0ZP//88xEIBHD//ffjiiuuAADcd999uPjiizF48GA89thjuP766/Hll1/iuOOOi7mo/9e//oWrrroKmZmZePjhh3H00UfjzDPPbDAR2dfmzZsxbtw4/N///R+uuOIKPPnkk5g6dSo+/PBDAMA555yDCy+8EADw+OOPRz+/tLS0dotxwIABOOqoo/D2229DUZSYY3UJxvTp0wHUfl5z5sxBr169sGTJEpx77rl4/vnnMWnSJEQikf2eq7kURcGpp56KPn364OGHH0b//v0xd+5cvPTSSzjllFMwduxYPPTQQ3A4HLj44oujSS8ABAIBTJgwAa+99houvvhiPPXUUzj66KNx2223cV0TESWGRkREmqZp2vr16zUA2urVqzVN0zRVVbXevXtr8+bNi7nfXXfdpQHQ3n///XrPoaqqpmma9tNPP2kAtBUrVtS7T79+/bRLLrmk3viECRO0CRMmRG/LsqyFw+GY+1RXV2sZGRnapZdeGjMOQLv77rubfH3PP/+8BkD79ddfY8aHDx+unXjiidHbBx98sDZlypQmn6sha9as0QBoy5cv1yoqKrTi4mLt448/1vr3768JgqD99NNPmqZp2t13360B0C688MKYx+fn52s6nU677777YsZ//fVXTa/XR8clSdLS09O1Qw45JOb9eeGFFzQAMe/hrl276n0Oxx13nOZwOLQ//vgj5jx1n52madojjzyiAdB27drV5jE2ZtmyZRoAbdWqVdExRVG0rKws7cgjj9Q0TdPKy8s1o9GoTZo0SVMUJXq/pUuXRj+LOpdcconWr1+/6O26z2vNmjUx523oPbvkkks0ANr9998fHauurtYsFosmCIL25ptvRse3bt1a79/jPffco9lsNm379u0x51qwYIGm0+m0goKC/b4fREStiTNBRER7vP7668jIyMAJJ5wAoHY9yQUXXIA333wz5tv49957DwcffDDOPvvses8hCEKrxaPT6WA0GgEAqqqiqqoKsixj7NixLZqVOeecc6DX6/HWW29Fx3JycvDbb7/hggsuiI4lJSVhy5Yt2LFjR4vivvTSS5GWloZevXphypQp8Pv9ePnllzF27NiY+82ePTvm9vvvvw9VVTFt2jTs3r07+iczMxODBw+OlgGuX78e5eXlmD17dvT9AWpLtlwuV5OxVVRU4Ouvv8all16Kvn37xhxrzmfXHjHWueCCC2AwGGJKy7766isUFRVFS+G++OILSJKE66+/HqL416/0K664Ak6nEx9//HGzztVcl19+efTvSUlJGDp0KGw2G6ZNmxYdHzp0KJKSkmI6Ar7zzjs49thjkZycHPO+nXTSSVAUBV9//XWrxklEtD9dJgn6+uuvccYZZ6BXr14QBAH/+c9/4np8XY36vn9sNlvbBExEHYqiKHjzzTdxwgknYNeuXcjNzUVubi7GjRuHsrIyfPnll9H75uXlYeTIke0S18svv4zRo0dH1+akpaXh448/htvtjvu5evTogYkTJ8aUxL311lvQ6/U455xzomOLFy9GTU0NhgwZglGjRmH+/PnYvHlzs89z1113YfXq1fi///s/bN68GcXFxQ12ZxswYEDM7R07dkDTNAwePBhpaWkxf37//XeUl5cDAP744w8AwODBg2MeX9eSuyl1F+Yt/fzaI8Y6qampmDx5MlauXIlQKASgthROr9dHk4668wwdOjTmsUajEQMHDowebw1mszlaEljH5XKhd+/e9RJIl8uF6urq6O0dO3bgs88+q/eenXTSSQAQfd+IiNqLPtEBtBa/34+DDz4Yl156acwv8+a6+eab630rOXHixOjiZiLq2v7v//4PJSUlePPNN/Hmm2/WO/76669j0qRJrXKuxmYcFEWJ6WL22muvYebMmZg6dSrmz5+P9PR06HQ6PPDAA9F1NvH629/+hlmzZmHjxo045JBD8Pbbb2PixIno0aNH9D7HHXcc8vLy8N///heff/45XnzxRTz++ON47rnnYmYCGjNq1KjoxW1TLBZLzG1VVSEIAj799NMGu7nZ7fZmvMK21d4xXnTRRfjoo4/w0Ucf4cwzz8R7772HSZMm1UtGWqKpf4cNaazDXmPjmqZF/66qKk4++WTccsstDd53yJAhTYVKRNTqukwSdOqpp+LUU09t9Hg4HMYdd9yBN954AzU1NRg5ciQeeuihaCcmu90e88tr06ZN+O233/Dcc8+1dehE1AG8/vrrSE9Pj3a72tv777+PlStX4rnnnoPFYkF2djZycnKafL6mSquSk5Mb7Nz1xx9/xMwSvPvuuxg4cCDef//9mOe7++67m/GKGjZ16lRcddVV0ZK47du
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from sklearn.ensemble import RandomForestRegressor\n",
|
|||
|
"from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
|
|||
|
"from sklearn.model_selection import cross_val_score\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"# Удаление строк с NaN\n",
|
|||
|
"feature_matrix = feature_matrix.dropna()\n",
|
|||
|
"val_feature_matrix = val_feature_matrix.dropna()\n",
|
|||
|
"test_feature_matrix = test_feature_matrix.dropna()\n",
|
|||
|
"\n",
|
|||
|
"# Разделение данных на обучающую и тестовую выборки\n",
|
|||
|
"X_train = feature_matrix.drop(\"Volume\", axis=1)\n",
|
|||
|
"y_train = feature_matrix[\"Volume\"]\n",
|
|||
|
"X_val = val_feature_matrix.drop(\"Volume\", axis=1)\n",
|
|||
|
"y_val = val_feature_matrix[\"Volume\"]\n",
|
|||
|
"X_test = test_feature_matrix.drop(\"Volume\", axis=1)\n",
|
|||
|
"y_test = test_feature_matrix[\"Volume\"]\n",
|
|||
|
"\n",
|
|||
|
"# Выбор модели\n",
|
|||
|
"model = RandomForestRegressor(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Обучение модели\n",
|
|||
|
"model.fit(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Предсказание и оценка\n",
|
|||
|
"y_pred = model.predict(X_test)\n",
|
|||
|
"\n",
|
|||
|
"rmse = mean_squared_error(y_test, y_pred, squared=False)\n",
|
|||
|
"r2 = r2_score(y_test, y_pred)\n",
|
|||
|
"mae = mean_absolute_error(y_test, y_pred)\n",
|
|||
|
"\n",
|
|||
|
"print(f\"RMSE: {rmse}\")\n",
|
|||
|
"print(f\"R²: {r2}\")\n",
|
|||
|
"print(f\"MAE: {mae}\")\n",
|
|||
|
"\n",
|
|||
|
"# Кросс-валидация\n",
|
|||
|
"scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n",
|
|||
|
"rmse_cv = (-scores.mean())**0.5\n",
|
|||
|
"print(f\"Cross-validated RMSE: {rmse_cv}\")\n",
|
|||
|
"\n",
|
|||
|
"# Анализ важности признаков\n",
|
|||
|
"feature_importances = model.feature_importances_\n",
|
|||
|
"feature_names = X_train.columns\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# Проверка на переобучение\n",
|
|||
|
"y_train_pred = model.predict(X_train)\n",
|
|||
|
"\n",
|
|||
|
"rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)\n",
|
|||
|
"r2_train = r2_score(y_train, y_train_pred)\n",
|
|||
|
"mae_train = mean_absolute_error(y_train, y_train_pred)\n",
|
|||
|
"\n",
|
|||
|
"print(f\"Train RMSE: {rmse_train}\")\n",
|
|||
|
"print(f\"Train R²: {r2_train}\")\n",
|
|||
|
"print(f\"Train MAE: {mae_train}\")\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация результатов\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(y_test, y_pred, alpha=0.5)\n",
|
|||
|
"plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)\n",
|
|||
|
"plt.xlabel(\"Actual Volume\")\n",
|
|||
|
"plt.ylabel(\"Predicted Volume\")\n",
|
|||
|
"plt.title(\"Actual vs Predicted Volume\")\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Точность предсказаний: Модель показывает довольно высокий R² (0.944), что указывает на хорошее объяснение вариации распродаж. Значения RMSE и MAE довольно низки, что говорит о том, что модель достаточно точно предсказывает цены.\n",
|
|||
|
"\n",
|
|||
|
"Переобучение: Разница между RMSE на обучающей и тестовой выборках не очень большая, что указывает на то, что переобучение не является критическим. Однако, стоит быть осторожным и продолжать мониторинг этого показателя.\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "aisenv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.6"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|