1404 lines
320 KiB
Plaintext
1404 lines
320 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Набор данных с ценами на мобильные устройства"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Вывод названий всех столбцов набора данных"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 1,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['Unnamed: 0', 'Name', 'Rating', 'Spec_score', 'No_of_sim', 'Ram',\n",
|
|||
|
" 'Battery', 'Display', 'Camera', 'External_Memory', 'Android_version',\n",
|
|||
|
" 'Price', 'company', 'Inbuilt_memory', 'fast_charging',\n",
|
|||
|
" 'Screen_resolution', 'Processor', 'Processor_name'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"import pandas as pd\n",
"\n",
"# Load the raw dataset from its relative CSV path.\n",
"csv_path = \"..//static//csv//mobile phone price prediction.csv\"\n",
"df = pd.read_csv(csv_path)\n",
"\n",
"# Print the names of every column in the loaded frame.\n",
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Бизнес-цели:\n",
|
|||
|
"1. Классифицировать мобильные устройства по ценовым категориям (например, бюджетные, средний класс, флагманы).\n",
|
|||
|
"2. Определить, какие характеристики мобильных устройств наиболее сильно влияют на их рейтинг."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Выполним разбиение на три выборки: обучающую (≈49%), контрольную (≈21%) и тестовую (30%)."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 2,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 671\n",
|
|||
|
"Размер контрольной выборки: 288\n",
|
|||
|
"Размер тестовой выборки: 411\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Split parameters, named once so the proportions are easy to find and change.\n",
"TEST_SIZE = 0.3     # share of the whole dataset held out for testing\n",
"VAL_SIZE = 0.3      # share of the remaining rows held out for validation\n",
"RANDOM_STATE = 42   # fixed seed so the split is reproducible\n",
"\n",
"# Load the data.\n",
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
"\n",
"# Hold out the test set first ...\n",
"train_val_df, test_df = train_test_split(df, test_size=TEST_SIZE, random_state=RANDOM_STATE)\n",
"\n",
"# ... then carve the validation set out of the remaining rows.\n",
"# With 0.3 and 0.3 the result is roughly 49% train / 21% validation / 30% test.\n",
"train_df, val_df = train_test_split(train_val_df, test_size=VAL_SIZE, random_state=RANDOM_STATE)\n",
"\n",
"# Sanity check: the sizes of the three parts must add up to the full dataset.\n",
"assert len(train_df) + len(val_df) + len(test_df) == len(df)\n",
"\n",
"# Report the size of each split.\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 3,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение классов в company:\n",
|
|||
|
"company\n",
|
|||
|
"Vivo 186\n",
|
|||
|
"Realme 186\n",
|
|||
|
"Samsung 181\n",
|
|||
|
"Motorola 127\n",
|
|||
|
"Xiaomi 90\n",
|
|||
|
"Honor 88\n",
|
|||
|
"Poco 75\n",
|
|||
|
"OnePlus 75\n",
|
|||
|
"Huawei 62\n",
|
|||
|
"iQOO 57\n",
|
|||
|
"OPPO 38\n",
|
|||
|
"Oppo 27\n",
|
|||
|
"TCL 26\n",
|
|||
|
"Google 23\n",
|
|||
|
"Asus 21\n",
|
|||
|
"POCO 19\n",
|
|||
|
"Lava 19\n",
|
|||
|
"Nothing 15\n",
|
|||
|
"Lenovo 14\n",
|
|||
|
"Tecno 13\n",
|
|||
|
"itel 12\n",
|
|||
|
"LG 6\n",
|
|||
|
"Gionee 5\n",
|
|||
|
"Itel 3\n",
|
|||
|
"IQOO 1\n",
|
|||
|
"Coolpad 1\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAmEAAAHHCAYAAAD3WI8lAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACIwklEQVR4nOzdeXwN1//48ddNyM2+WRIhRFZBEGsJYm1CbS0NmiKkqF1Rmo9dEUtttVaL0NprLa211tiXWCqIEKL2LSEqiWR+f/iZrytBRLhZ3s/HYx7NnTlz5n3mJu67Z849R6MoioIQQgghhPigDPQdgBBCCCFEXiRJmBBCCCGEHkgSJoQQQgihB5KECSGEEELogSRhQgghhBB6IEmYEEIIIYQeSBImhBBCCKEHkoQJIYQQQuiBJGFCCCGEEHogSZgQQgghhB5IEibEK4SFhaHRaNTN2NgYd3d3evbsyc2bN/UdnhBCiBwun74DECK7GzVqFCVLluTJkyfs3buX2bNn8+eff3L69GlMTU31HZ4QQogcSpIwId6gUaNGVK5cGYCvvvqKAgUKMHnyZNatW0fbtm31HJ0QQoicSh5HCvGW6tWrB8ClS5cAuHfvHgMGDMDLywtzc3MsLS1p1KgRJ06cSHPukydPGDFiBO7u7hgbG1OkSBE+++wzoqOjAYiJidF5BPryVqdOHbWunTt3otFoWL58Of/73/+wt7fHzMyMZs2aERsbm+baBw8exN/fHysrK0xNTfH19SU8PDzdNtapUyfd648YMSJN2d9++41KlSphYmKCra0tbdq0Sff6r2vbi1JTU5k6dSplypTB2NgYOzs7unbtyv3793XKOTk50aRJkzTX6dmzZ5o604t94sSJae4pQGJiIsOHD8fV1RWtVoujoyMDBw4kMTEx3Xv1ojp16qSpb8yYMRgYGLBkyZJM3Y8ffviBGjVqUKBAAUxMTKhUqRK///57utf/7bffqFq1KqamptjY2FC7dm22bNmiU+avv/7C19cXCwsLLC0tqVKlSprYVq5cqb6nBQsW5Msvv+Tff//VKRMUFKQTs42NDXXq1GHPnj1vvE/vci7A2bNnCQgIoFChQpiYmODh4cHgwYN1yhw/fpxGjRphaWmJubk59evX58CBAzplng852Lt3L71796ZQoUJYW1vTtWtXkpKSePDgAe3bt8fGxgYbGxsGDhyIoijq+c/fwx9++IEpU6ZQokQJTExM8PX15fTp0zrXOnnyJEFBQTg7O2NsbIy9vT2dOnXi7t27OuVGjBiBRqPhwoULBAUFYW1tjZWVFR07duTx48dqOV9fX8qXL5/u/fHw8MDPzy9D91Lol/SECfGWnidMBQoUAODixYusXbuWzz//nJIlS3Lz5k1++uknfH19OXPmDA4ODgCkpKTQpEkTtm/fTps2bejTpw8PHz5k69atnD59GhcXF/Uabdu2pXHjxjrXDQkJSTeeMWPGoNFoGDRoELdu3WLq1Kk0aNCAiIgITExMAPj7779p1KgRlSpVYvjw4RgYGLBgwQLq1avHnj17qFq1app6ixUrRmhoKACPHj2iW7du6V576NChBAQE8NVXX3H79m2mT59O7dq1OX78ONbW1mnO6dKlC7Vq1QJg9erVrFmzRud4165dCQsLo2PHjvTu3ZtLly4xY8YMjh8/Tnh4OPnz50/3PryNBw8eqG17UWpqKs2aNWPv3r106dIFT09PTp06xZQpUzh//jxr1659q+ssWLCAIUOGMGnSJL744ot0y7zpfkybNo1mzZoRGBhIUlISy5Yt4/PPP2fDhg188sknarmRI0cyYsQIatSowahRozAyMuLgwYP8/ffffPzxx8CzpKNTp06UKVOGkJAQrK2tOX78OJs2bVLje37vq1SpQmhoKDdv3mTatGmEh4eneU8LFizIlClTALh69SrTpk2jcePGxMbGpvvevyiz5548eZJatWqRP39+unTpgpOTE9HR0f
zxxx+MGTMGgH/++YdatWphaWnJwIEDyZ8/Pz/99BN16tRh165dVKtWTafOXr16YW9vz8iRIzlw4ABz587F2tqaffv2Ubx4ccaOHcuff/7JxIkTKVu2LO3bt9c5f9GiRTx8+JAePXrw5MkTpk2bRr169Th16hR2dnYAbN26lYsXL9KxY0fs7e35559/mDt3Lv/88w8HDhxIk3wHBARQsmRJQkNDOXbsGL/88guFCxdm/PjxALRr147OnTtz+vRpypYtq553+PBhzp8/z5AhQ157/0U2oQgh0rVgwQIFULZt26bcvn1biY2NVZYtW6YUKFBAMTExUa5evaooiqI8efJESUlJ0Tn30qVLilarVUaNGqXumz9/vgIokydPTnOt1NRU9TxAmThxYpoyZcqUUXx9fdXXO3bsUAClaNGiSnx8vLp/xYoVCqBMmzZNrdvNzU3x8/NTr6MoivL48WOlZMmSSsOGDdNcq0aNGkrZsmXV17dv31YAZfjw4eq+mJgYxdDQUBkzZozOuadOnVLy5cuXZn9UVJQCKAsXLlT3DR8+XHnxn6E9e/YogLJ48WKdczdt2pRmf4kSJZRPPvkkTew9evRQXv6n7eXYBw4cqBQuXFipVKmSzj399ddfFQMDA2XPnj0658+ZM0cBlPDw8DTXe5Gvr69a38aNG5V8+fIp/fv3T7dsRu6Hojx7n16UlJSklC1bVqlXr55OXQYGBsqnn36a5nfx+Xv+4MEDxcLCQqlWrZry33//pVsmKSlJKVy4sFK2bFmdMhs2bFAAZdiwYeq+Dh06KCVKlNCpZ+7cuQqgHDp0KN02Z8W5tWvXViwsLJTLly+n2wZFUZQWLVooRkZGSnR0tLrv2rVrioWFhVK7dm113/O/8Zf/NqpXr65oNBrl66+/Vvc9ffpUKVasmM7vy/O/1xf/PVAURTl48KACKN9884267+X3UVEUZenSpQqg7N69W933/HegU6dOOmU//fRTpUCBAurrBw8eKMbGxsqgQYN0yvXu3VsxMzNTHj16lOZ6IvuRx5FCvEGDBg0oVKgQjo6OtGnTBnNzc9asWUPRokUB0Gq1GBg8+1NKSUnh7t27mJub4+HhwbFjx9R6Vq1aRcGCBenVq1eaa7z8f8Fvo3379lhYWKivW7VqRZEiRfjzzz8BiIiIICoqii+++IK7d+9y584d7ty5Q0JCAvXr12f37t2kpqbq1PnkyROMjY1fe93Vq1eTmppKQECAWuedO3ewt7fHzc2NHTt26JRPSkoCnt2vV1m5ciVWVlY0bNhQp85KlSphbm6eps7k5GSdcnfu3OHJkyevjfvff/9l+vTpDB06FHNz8zTX9/T0pFSpUjp1Pn8E/fL1X+XQoUMEBATQsmVLJk6cmG6ZjNwPQO3NBLh//z5xcXHUqlVL53dr7dq1pKamMmzYMPV38bnnv1tbt27l4cOHfPfdd2ne2+dljhw5wq1bt+jevbtOmU8++YRSpUqxceNGnfNSU1PVexQREcGiRYsoUqQInp6er21TZs+9ffs2u3fvplOnThQvXjzdNqSkpLBlyxZatGiBs7OzerxIkSJ88cUX7N27l/j4eJ1zg4ODdf4Gq1WrhqIoBAcHq/sMDQ2pXLkyFy9eTBNXixYt1H8PAKpWrUq1atXUv0HQfR+fPHnCnTt3+OijjwB03svnvv76a53XtWrV4u7du2rsVlZWNG/enKVLl6qPSFNSUli+fDktWrTAzMwsTZ0i+5HHkUK8wcyZM3F3dydfvnzY2dnh4eGh80GXmprKtGnTmDVrFpcuXSIlJUU99vyRJTx7jOnh4UG+fFn7Z+fm5qbzWqPR4OrqSkxMDABRUVEAdOjQ4ZV1xMXFYWNjo76+c+dOmnpfFhUVhaIoryz38mPDBw8eAKRJfF6uMy4ujsKFC6d7/NatWzqvt2zZQqFChV4b58uGDx+Og4MDXbt2TTO2KioqisjIyFfW+fL10/Pvv//yySefkJCQwN27d1+ZYG
fkfgBs2LCB0aNHExERoTMu7cV6o6OjMTAwoHTp0q+s5/lj9BcfXb3s8uXLwLMxRS8rVaoUe/fu1dkXGxurc6+KFCn
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение классов в Обучающей выборке:\n",
|
|||
|
"company\n",
|
|||
|
"Vivo 138\n",
|
|||
|
"Samsung 128\n",
|
|||
|
"Realme 125\n",
|
|||
|
"Motorola 89\n",
|
|||
|
"Xiaomi 66\n",
|
|||
|
"Honor 59\n",
|
|||
|
"OnePlus 56\n",
|
|||
|
"Poco 52\n",
|
|||
|
"Huawei 46\n",
|
|||
|
"iQOO 37\n",
|
|||
|
"Oppo 21\n",
|
|||
|
"OPPO 20\n",
|
|||
|
"Google 16\n",
|
|||
|
"Lava 16\n",
|
|||
|
"POCO 14\n",
|
|||
|
"TCL 14\n",
|
|||
|
"Asus 12\n",
|
|||
|
"Lenovo 12\n",
|
|||
|
"itel 10\n",
|
|||
|
"Nothing 8\n",
|
|||
|
"Tecno 8\n",
|
|||
|
"LG 5\n",
|
|||
|
"Gionee 4\n",
|
|||
|
"IQOO 1\n",
|
|||
|
"Itel 1\n",
|
|||
|
"Coolpad 1\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAmEAAAHHCAYAAAD3WI8lAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACOiklEQVR4nOzdd1gVR/vw8e8B5dBBLIAGRZqioqJYsTfsmmhQQ1SMUR+7sYbYjYoltlhjLKixd2MSscSKvWCJKIgSsRcUrICw7x++7M8joEhQ2v25rr307M7O3ruHcjMzZ0ajKIqCEEIIIYT4pPQyOwAhhBBCiNxIkjAhhBBCiEwgSZgQQgghRCaQJEwIIYQQIhNIEiaEEEIIkQkkCRNCCCGEyASShAkhhBBCZAJJwoQQQgghMoEkYUIIId7pyZMnRERE8OzZs8wORWSwx48fc+XKFV69epXZoeRKkoQJIYTQoSgKCxcupGrVqhgbG2Nubk7x4sX57bffMju0bOHGjRsEBASoryMiIli5cmXmBfSG+Ph4pkyZQrly5dBqteTLlw9nZ2f27NmT2aHlSpKEiQwREBCARqNRN0NDQ1xcXOjTpw93797N7PCEyNWCgoL4/PPPsba2RqvVYm9vT48ePbh+/XqK5b/66iv+97//4erqyooVK9i1axe7d+/miy+++MSRZ08ajYbevXsTGBhIREQEQ4cO5eDBg5kdFrGxsTRo0ICRI0dSp04d1q9fz65du/j777+pVq1aZoeXK+XJ7ABEzjJu3DiKFy/Oy5cvOXToEPPnz+fPP//kwoULGBsbZ3Z4QuQ6s2fPpn///jg4ONC3b19sbW0JCQlh0aJFrF27lj///JPq1aur5ZcvX87atWv57bff+OqrrzIx8uyrSJEidOvWjcaNGwNga2vLvn37MjcoYPLkyRw7dozAwEDq1KmT2eEIQCMLeIuMEBAQQJcuXThx4gQeHh7q/kGDBjF9+nRWrVpFhw4dMjFCIXKfoKAgatWqhaenJzt27ND5Qyg8PBxPT0/09PT4559/yJcvHwBubm6ULVs2y3SfZWfh4eE8ePCAMmXKYGJikqmxvHr1ikKFCtGzZ08mTJiQqbGI/yPdkeKjqlevHgDXrl0DICoqisGDB+Pm5oapqSnm5uY0adKEs2fPJjv35cuXjBkzBhcXFwwNDbG1teWLL74gPDwceD3O4s0u0Le3N//S27dvHxqNhrVr1/LDDz9gY2ODiYkJLVu2JDIyMtm1jx07RuPGjbGwsMDY2JjatWsTFBSU4j3WqVMnxeuPGTMmWdnffvuNihUrYmRkhJWVFe3bt0/x+u+6tzclJiYyc+ZMSpcujaGhIdbW1vTo0YNHjx7plLO3t6d58+bJrtOnT59kdaYU+9SpU5M9U3jdvTF69GicnJzQarXY2dkxdOhQYmNjU3xWb6pTp06y+iZMmICenh6rVq1K1/P46aefqF69Ovnz58fIyIiKFSuyYcOGFK//22+/UblyZYyNjcmXLx+1atVi586dOmX++usvateujZmZGebm5lSqVClZbOvXr1ff0wIFCvD1119z8+ZNnTK+vr46MefLl486deqkqYvqv5z7448/otFoWLZsWbKWaEdHR6ZMmcLt27f55ZdfAHj27BkXLlzAzs6OZs2aYW5ujomJSbLrXb16FY1Gw4wZM5Jd8/Dhw2g0GlavXg2k/D4nvZ9vjps6d+4cvr6+ODg4YGhoiI2NDd988w0PHz7UOTdp6ENERIS6LzAwkOrVq2NsbIyFhQXNmzfnwoULOueNGTMGjUbDgwcP1H0nT55MFgdAmTJlUmwp+uuvv6hZsyYmJiaYmZnRrFkz/vnnH50yvr6+2Nvbq8+4SpUqREVFYWRklCzulKT1/U7tezpJ0s+8pBa4y5cv8+jRI8zMzKhdu/Y7nxXAmTNnaNKkCebm5piamlK/fn2OHj2qUybpvThw4AA9evQgf/78mJub06lTpxR/Bv
n6+urs6969O4aGhslaCdPynHMK6Y4UH1VSwpQ/f37g9Q/vLVu28OWXX1K8eHHu3r3LL7/8Qu3atbl48SKFCxcGICEhgebNm7Nnzx7at29P//79efLkCbt27eLChQs4Ojqq1+jQoQNNmzbVua6fn1+K8UyYMAGNRsOwYcO4d+8eM2fOpEGDBgQHB2NkZATA33//TZMmTahYsSKjR49GT0+PpUuXUq9ePQ4ePEjlypWT1fvZZ5/h7+8PwNOnT+nZs2eK1x45ciTe3t58++233L9/n9mzZ1OrVi3OnDmDpaVlsnO6d+9OzZo1Adi0aRObN2/WOd6jRw+1FbJfv35cu3aNOXPmcObMGYKCgsibN2+Kz+FDPH78WL23NyUmJtKyZUsOHTpE9+7dcXV15fz588yYMYPQ0FC2bNnyQddZunQpI0aMYNq0aal2g73vecyaNYuWLVvi4+NDXFwca9as4csvv2T79u00a9ZMLTd27FjGjBlD9erVGTduHAYGBhw7doy///6bRo0aAa9/wXzzzTeULl0aPz8/LC0tOXPmDDt27FDjS3r2lSpVwt/fn7t37zJr1iyCgoKSvacFChRQk5YbN24wa9YsmjZtSmRkZIrv/ZvSc+7z58/Zs2cPNWvWpHjx4imWadeuHd27d2f79u18//33asIzefJkbGxsGDJkCIaGhvz66680aNCAXbt2UatWLRwcHPD09GTlypV89913OnWuXLkSMzMzWrVq9c57etuuXbu4evUqXbp0wcbGhn/++YeFCxfyzz//cPTo0WQJd5KDBw/StGlTihUrxujRo4mPj2fevHl4enpy4sQJXFxcPiiO1KxYsYLOnTvj5eXF5MmTef78OfPnz6dGjRqcOXNGTbxSMmrUKF6+fJnma/2Xr5XUJL23fn5+ODs7M3bsWF6+fMncuXOTPat//vmHmjVrYm5uztChQ8mbNy+//PILderUYf/+/VSpUkWn7j59+mBpacmYMWO4fPky8+fP599//1UTwZSMHj2axYsXs3btWp2E978852xJESIDLF26VAGU3bt3K/fv31ciIyOVNWvWKPnz51eMjIyUGzduKIqiKC9fvlQSEhJ0zr127Zqi1WqVcePGqfuWLFmiAMr06dOTXSsxMVE9D1CmTp2arEzp0qWV2rVrq6/37t2rAEqRIkWUmJgYdf+6desUQJk1a5Zat7Ozs+Ll5aVeR1EU5fnz50rx4sWVhg0bJrtW9erVlTJlyqiv79+/rwDK6NGj1X0RERGKvr6+MmHCBJ1zz58/r+TJkyfZ/rCwMAVQli1bpu4bPXq08ua37MGDBxVAWblypc65O3bsSLa/WLFiSrNmzZLF3rt3b+XtHwNvxz506FClUKFCSsWKFXWe6YoVKxQ9PT3l4MGDOucvWLBAAZSgoKBk13tT7dq11fr++OMPJU+ePMqgQYNSLJuW56Eor9+nN8XFxSllypRR6tWrp1OXnp6e8vnnnyf7Wkx6zx8/fqyYmZkpVapUUV68eJFimbi4OKVQoUJKmTJldMps375dAZRRo0ap+zp37qwUK1ZMp56FCxcqgHL8+PEU7/m/nhscHKwASv/+/d9Zf9myZRUrKytFUf7ve8rAwEAJDQ1Vy9y/f1/Jnz+/UrFiRXXfL7/8ogBKSEiIui8uLk4pUKCA0rlzZ3Vf3bp1lVq1aulcM+k6S5cuVfe9/d4piqKsXr1aAZQDBw6o+5J+1ly7dk1RFEWpWLGiYmFhody5c0ctExoaquTNm1dp06aNui/p6+X+/fvqvhMnTiSLQ1GS//x48uSJYmlpqXTr1k2n3J07dxQLCwud/W+/XxcuXFD09PSUJk2a6MSdmrS+36l9TydJ+pm3d+9endcFChRQHjx4oJZL6Vm1bt1aMTAwUMLDw9V9t27dUszMzHTey6T3omLFikpcXJy6f8qUKQqgbN26VSfepK+LpK+d2bNn68T8Ic85p5DuSJGhGjRoQMGCBbGzs6N9+/aYmpqyef
NmihQpAoBWq0VP7/WXXUJCAg8fPsTU1JQSJUpw+vRptZ6NGzdSoEAB+vbtm+waqf1llRadOnXCzMxMfd22bVtsbW3
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение классов в Контрольной выборке:\n",
|
|||
|
"company\n",
|
|||
|
"Realme 26\n",
|
|||
|
"Samsung 26\n",
|
|||
|
"Vivo 22\n",
|
|||
|
"Motorola 18\n",
|
|||
|
"Honor 15\n",
|
|||
|
"OPPO 13\n",
|
|||
|
"Poco 12\n",
|
|||
|
"Xiaomi 11\n",
|
|||
|
"iQOO 11\n",
|
|||
|
"OnePlus 8\n",
|
|||
|
"Huawei 7\n",
|
|||
|
"Asus 7\n",
|
|||
|
"TCL 6\n",
|
|||
|
"POCO 5\n",
|
|||
|
"Oppo 4\n",
|
|||
|
"Google 4\n",
|
|||
|
"Tecno 3\n",
|
|||
|
"Nothing 3\n",
|
|||
|
"itel 2\n",
|
|||
|
"Lava 1\n",
|
|||
|
"Lenovo 1\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAmEAAAHHCAYAAAD3WI8lAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB+u0lEQVR4nO3deVxN+f8H8NctdduvEi2ktAqVCEMoa9kNJkyDyDJkX6exr1nGnnUGZYbs25ghy1izb2FEiWjspJuKSp3fH36dr6ukErdbr+fjcR6653w+n/M+p3u7b5/P55wjEQRBABERERF9VWrKDoCIiIioNGISRkRERKQETMKIiIiIlIBJGBEREZESMAkjIiIiUgImYURERERKwCSMiIiISAmYhBEREREpQRllB0BERETFR3p6OhISEpCVlQVzc3Nlh1OisSeMiIjoK/jjjz8QFxcnvg4JCcGDBw+UF9B7Lly4gO+//x7GxsaQSqUwMzND586dlR1WicckjAokJCQEEolEXLS0tGBvb4/BgwfjyZMnyg6PqFSaMmUKJBIJnj9/nmPb+PHjIZFIMGTIECVERu87ceIExo4di7i4OISHhyMgIABqasr/Gt69ezcaNmyIGzduYObMmTh48CAOHjyIVatWKTu0Eo/DkVQo06ZNQ5UqVfDmzRucPHkSK1aswN9//43r169DR0dH2eEREYAlS5Zg1qxZ6Nq1KxYvXqzscEq9ESNGwNPTE1WqVAEAjBw5EmZmZkqNKSEhAX379oWXlxe2bt0KTU1NpcZT2jAJo0Jp1aoV3NzcAAB9+/ZFuXLlsGDBAuzevRvdu3dXcnREtHHjRgwfPhzNmzfH+vXri0WPS2lXtWpVxMbG4vr16zA2NoaNjY2yQ8K6devw5s0bhISEMAFTAn4qqUg0bdoUAHD37l0A7/53NXr0aDg5OUFPTw8GBgZo1aoVIiMjc9R98+YNpkyZAnt7e2hpacHMzAydOnVCbGwsACAuLk5hCPTDxdPTU2zr6NGjkEgk2Lx5M37++WeYmppCV1cX7du3R3x8fI59nz17Ft7e3pDJZNDR0YGHhwciIiJyPUZPT89c9z9lypQcZf/44w/Url0b2traMDIyQrdu3XLdf17H9r6srCwsWrQI1atXh5aWFkxMTDBgwAC8fPlSoZyVlRXatm2bYz+DBw/O0WZusc+bNy/HOQWAtLQ0TJ48Gba2tpBKpbCwsMDYsWORlpaW67l6n6enZ472Zs6cCTU1NWzcuLFQ5+OXX35BgwYNUK5cOWhra6N27drYtm1brvv/448/ULduXejo6MDQ0BCNGzfGgQMHFMrs27cPHh4e0NfXh4GBAerUqZMjtq1bt4q/U2NjY/zwww855vP4+fkpxGxoaAhPT0+cOHHik+fpc+p+KDw8HH5+fqhVqxZ27NiR65fr8uXLUb16dUilUpibmyMgIACJiYkKZTw9PVGjRo0cdX/55RdIJBJxfpOVlVWen1ErKysA//v9/vLLL1i4cCEsLS2hra0NDw8PXL9+Pcd+/vnnHzRq1Ai6urooW7YsOnTogKioqFyP+WMxHD16VKFMbp+P970f44dq1KiR47389OlT+Pv7w8TEBFpaWnBxcUFoaGiubYaEhEBXVxf16tWDjY0NAgICIJFI4Ofnl6+YshcNDQ1YWVlhzJgxSE9PF8tlTxe5cOHCR9v68PN45swZ1KxZE7NmzYKFhQWkUins7Owwe/ZsZGVlKdR9+/Ytpk+fDhsbG0ilUlhZWeHnn3/O8Xcg+zwfOHAANWvWhJaWFqpVq4YdO3YolMuO9/15cv/++y8MDQ3Rtm1bvH37VlyfmJiI4cOHizHa2tpizpw5OWJUNewJoyKRnTCVK1cOAHDnzh3s2rUL3333HapUqYInT55g1apV8PDwwI0bN8QrbjIzM9G2bVscPnwY3bp1w7Bhw/Dq1SscPHgQ169fV/ifYvfu3dG6dWuF/QYGBuYaz8yZMy
GRSDBu3Dg8ffoUixYtQvPmzXHlyhVoa2sDePcHvlWrVqhduzYmT54MNTU1rFu3Dk2bNsWJEydQt27dHO1WqlQJQUFBAIDk5GQMHDgw131PnDgRPj4+6Nu3L549e4alS5eicePGuHz5MsqWLZujTv/+/dGoUSMAwI4dO7Bz506F7QMGDEBISAh69+6NoUOH4u7duwgODsbly5cREREBDQ2NXM9DQSQmJorH9r6srCy0b98eJ0+eRP/+/eHo6Ihr165h4cKFiI6Oxq5duwq0n3Xr1mHChAmYP38+vv/++1zLfOp8LF68GO3bt4evry/S09OxadMmfPfdd9i7dy/atGkjlps6dSqmTJmCBg0aYNq0adDU1MTZs2fxzz//oGXLlgDefRH06dMH1atXR2BgIMqWLYvLly9j//79YnzZ575OnToICgrCkydPsHjxYkREROT4nRobG2PhwoUAgP/++w+LFy9G69atER8fn+vv/n2fUzfbuXPn0LlzZ1hZWWHfvn3Q19fPUWbKlCmYOnUqmjdvjoEDB+LWrVtYsWIFzp8/X6j306JFi5CcnAwAiIqKwqxZs/Dzzz/D0dERAKCnp6dQfv369Xj16hUCAgLw5s0bLF68GE2bNsW1a9dgYmICADh06BBatWoFa2trTJkyBa9fv8bSpUvh7u6OS5cuiYnd+xo1aoT+/fsrxPElvX79Gp6enrh9+zYGDx6MKlWqYOvWrfDz80NiYiKGDRv20bq3b9/Gr7/+WqD9ZX8u0tLSEB4ejl9++QVaWlqYPn16oY/hxYsXOHnyJE6ePIk+ffqgdu3aOHz4MAIDAxEXF4eVK1eKZfv27YvQ0FB06dIFo0aNwtmzZxEUFISoqKgcn9GYmBh07doVP/74I3r16oV169bhu+++w/79+9GiRYtcY4mPj4e3tzeqVq2KLVu2oEyZdylKamoqPDw88ODBAwwYMACVK1fGqVOnEBgYiEePHmHRokWFPn6lE4gKYN26dQIA4dChQ8KzZ8+E+Ph4YdOmTUK5cuUEbW1t4b///hMEQRDevHkjZGZmKtS9e/euIJVKhWnTponr1q5dKwAQFixYkGNfWVlZYj0Awrx583KUqV69uuDh4SG+PnLkiABAqFixopCUlCSu37JliwBAWLx4sdi2nZ2d4OXlJe5HEAQhNTVVqFKlitCiRYsc+2rQoIFQo0YN8fWzZ88EAMLkyZPFdXFxcYK6urowc+ZMhbrXrl0TypQpk2N9TEyMAEAIDQ0V102ePFl4/6N54sQJAYCwYcMGhbr79+/Psd7S0lJo06ZNjtgDAgKEDz/uH8Y+duxYoUKFCkLt2rUVzunvv/8uqKmpCSdOnFCov3LlSgGAEBERkWN/7/Pw8BDb++uvv4QyZcoIo0aNyrVsfs6HILz7Pb0vPT1dqFGjhtC0aVOFttTU1IRvv/02x3sx+3eemJgo6OvrC/Xq1RNev36da5n09HShQoUKQo0aNRTK7N27VwAgTJo0SVzXq1cvwdLSUqGd1atXCwCEc+fO5XrMRVE3+xydOHFCKFeunABA6N+/f65lnz59KmhqagotW7ZUOC/BwcECAGHt2rXiOg8PD6F69eo52pg3b54AQLh7926ObdmfwSNHjuTYlv1Zfv9vhSAIwtmzZwUAwogRI8R1NWvWFCpUqCC8ePFCXBcZGSmoqakJPXv2zNF2xYoVhd69e+cZx8c+H7nFmJ+/N4sWLRIACH/88Ye4Lj09Xahfv76gp6cn/g3KbnPdunViOR8fH6FGjRqChYWF0KtXr3zF9H59QRAEc3NzoXXr1uLr7L/P58+f/2hb738es18DEKZMmaJQzs/PTwAgXLt2TRAEQbhy5YoAQOjbt69CudGjRwsAhH/++UdcZ2lpKQAQtm/fLq6Ty+WCmZmZ4OrqmiPeu3fvCgkJCUK1atUEBwcH4fnz5wr7mD59uqCrqytER0crrP/pp58EdXV14f79+x893uKOw5FUKM2bN0f58uVhYW
GBbt26QU9PDzt37kTFihUBAFKpVJyDkpmZiRcvXkBPTw8ODg64dOmS2M727dthbGyc65VbHw5BFUTPnj0VegC6dOk
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение классов в Тестовой выборке:\n",
|
|||
|
"company\n",
|
|||
|
"Realme 35\n",
|
|||
|
"Samsung 27\n",
|
|||
|
"Vivo 26\n",
|
|||
|
"Motorola 20\n",
|
|||
|
"Honor 14\n",
|
|||
|
"Xiaomi 13\n",
|
|||
|
"Poco 11\n",
|
|||
|
"OnePlus 11\n",
|
|||
|
"Huawei 9\n",
|
|||
|
"iQOO 9\n",
|
|||
|
"TCL 6\n",
|
|||
|
"OPPO 5\n",
|
|||
|
"Nothing 4\n",
|
|||
|
"Google 3\n",
|
|||
|
"Lava 2\n",
|
|||
|
"Asus 2\n",
|
|||
|
"Oppo 2\n",
|
|||
|
"Tecno 2\n",
|
|||
|
"Itel 2\n",
|
|||
|
"Gionee 1\n",
|
|||
|
"Lenovo 1\n",
|
|||
|
"LG 1\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAmEAAAHHCAYAAAD3WI8lAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACAAUlEQVR4nO3deVxN+f8H8Ndtu+2baCFFm5BkHRoiS2UfTJimEQ3GFiPLNHaD7MvIMsYSxi7bMGSZbNmyZBlJIrKv3VRU6vz+8Ot8XS2Scrv1ej4e56F7zud8zvt87qn79vl87jkSQRAEEBEREdEXpaLoAIiIiIjKIyZhRERERArAJIyIiIhIAZiEERERESkAkzAiIiIiBWASRkRERKQATMKIiIiIFIBJGBEREZECqCk6ACIiIioeb968wYsXL6CmpoZKlSopOhz6CPaEERERfURISAiSkpLE1wsWLEBqaqriAnrPoUOH0KlTJxgaGkJLSwuVK1fGsGHDFB0WFQKTMFKI0NBQSCQScdHU1IS9vT2GDBmCx48fKzo8onIlISFB7vexoCUhIUHR4SrE33//jUmTJiExMRHr16/H+PHjoaWlpeiwsGTJEnh4eEAmk2HhwoU4ePAgDh48iClTpig6NCoEDkeSQk2ZMgXVqlXDmzdvcOLECSxduhT//PMPrl69Cm1tbUWHR1QuVKxYEevWrZNbN3fuXNy7dw/z58/PVbY8+vXXX9GpUycsXLgQKioqmDt3LlRUFNuPERcXhxEjRqB///5YsmQJJBKJQuOhTyfhA7xJEUJDQ9GnTx9ERUWhQYMG4vrAwEDMmzcPGzZsQK9evRQYIVH51qFDB1y9erXc9nzlJSkpCTExMbC0tESVKlUUHQ6GDh2Kv//+G3FxcVBXV1d0OFQEHI6kUsXd3R0AcPv2bQDAixcvMHLkSDg5OUFXVxf6+vrw8vLCpUuXcu375s0bTJo0Cfb29tDU1IS5uTm6du2K+Ph4AB8fcmnRooVY15EjRyCRSLB582b8+uuvMDMzg46ODjp16oTExMRcxz5z5gw8PT1hYGAAbW1tuLm5ITIyMs9zbNGiRZ7HnzRpUq6yf/31F+rXrw8tLS0YGxujZ8+eeR6/oHN7X3Z2NhYsWIBatWpBU1MTpqamGDBgAF6+fClXztraGh06dMh1nCFDhuSqM6/YZ8+enatNASA9PR0TJ06Era0tpFIpLC0tMXr0aKSnp+fZVu9r0aJFrvqmTZsGFRUVbNiwoUjtMWfOHDRt2hQVKlSAlpYW6tevj23btuV5/L/++guNGjWCtrY2jIyM0Lx5cxw4cECuzL59++Dm5gY9PT3o6+ujYcOGuWLbunWr+J6amJjg+++/x/379+XK+Pn5ycVsZGSEFi1a4Pjx4x9tp8/Z91MkJSVh+PDhsLS0hFQqha2tLWbOnIns7Gy5ctnZ2Vi4cCGcnJygqamJihUrwtPTE+fOnQOAjw5/vv+eP3nyBP7+/jA1NYWmpiacnZ2xZs0aueN9+N6rq6vD2toao0aNQkZGhlzZW7du4dtvv4WxsTG0tbXx1VdfYe/evXJlcv4WHDlyBIaGhmjSpAmqVKmC9u3b5/t7m9f+OYtUKoW9vT2Cg4Pxfh/IpEmTIJFI8OzZs3zrsra2hp+fn/j69OnTqF+/PgYNGgRTU1NIpVLUrl0bf/75Z659U1NTERgYKL5fDg4OmDNnDj7sh5FIJBgyZAjWr18PBwcHaGpqon79+jh27JhcuZx43xcREQGpVIqffvpJbv39+/fRt29fMcZatWph1apVBbZbecHhSCpVchKmChUqAHj3R3Lnzp349ttvUa1aNTx+/Bh//PEH3NzccO3aNVhYWAAAsrKy0KFDBxw+fBg9e/bEsGHD8OrVKxw8eBBXr16FjY2NeIxevXqhXbt2cscNCgrKM55p06ZBIpFgzJgxePLkCRYsWIDWrVsjOjpanA/y77//wsvLC/Xr18fEiROhoq
KC1atXw93dHcePH0ejRo1y1VulShUEBwcDAFJSUjBw4MA8jz1+/Hh4e3vjxx9/xNOnT7Fo0SI0b94cFy9ehKGhYa59+vfvj2bNmgEAtm/fjh07dshtHzBggNgLGRAQgNu3byMkJAQXL15EZGRksfxvOikpSTy392VnZ6NTp044ceIE+vfvD0dHR1y5cgXz58/HjRs3sHPnzk86zurVqzFu3DjMnTsX3333XZ5lPtYeCxcuRKdOneDj44OMjAxs2rQJ3377Lfbs2YP27duL5SZPnoxJkyahadOmmDJlCjQ0NHDmzBn8+++/aNu2LYB3vbt9+/ZFrVq1EBQUBENDQ1y8eBH79+8X48tp+4YNGyI4OBiPHz/GwoULERkZmes9NTExEYcC7927h4ULF6Jdu3ZITEzM871/3+fsWxhpaWlwc3PD/fv3MWDAAFStWhUnT55EUFAQHj58iAULFohl/f39ERoaCi8vL/z44494+/Ytjh8/jtOnT6NBgwZyw6DHjx/H8uXLMX/+fJiYmAAATE1NAQCvX79GixYtcPPmTQwZMgTVqlXD1q1b4efnh6SkpFwT0XPe+/T0dISHh2POnDnQ1NTEb7/9BgB4/PgxmjZtirS0NAQEBKBChQpYs2YNOnXqhG3btuGbb77J9/yPHTuGf/7555Pa7Ndff4WjoyNev34t/ueuUqVK8Pf3/6R63vf8+XOcO3cOampqGDx4MGxsbLBz5070798fz58/xy+//AIAEAQBnTp1QkREBPz9/VG3bl2Eh4dj1KhRuH//fq4h56NHj2Lz5s0ICAiAVCrFkiVL4OnpibNnz6J27dp5xnLp0iV06dIF7dq1w+LFi8X1jx8/xldffSUmdxUrVsS+ffvg7++P5ORkDB8+vMjnXyYIRAqwevVqAYBw6NAh4enTp0JiYqKwadMmoUKFCoKWlpZw7949QRAE4c2bN0JWVpbcvrdv3xakUqkwZcoUcd2qVasEAMK8efNyHSs7O1vcD4Awe/bsXGVq1aoluLm5ia8jIiIEAELlypWF5ORkcf2WLVsEAMLChQvFuu3s7AQPDw/xOIIgCGlpaUK1atWENm3a5DpW06ZNhdq1a4uvnz59KgAQJk6cKK5LSEgQVFVVhWnTpsnte+XKFUFNTS3X+ri4OAGAsGbNGnHdxIkThfd/xY8fPy4AENavXy+37/79+3Ott7KyEtq3b58r9sGDBwsf/tn4MPbRo0cLlSpVEurXry/XpuvWrRNUVFSE48ePy+2/bNkyAYAQGRmZ63jvc3NzE+vbu3evoKamJgQGBuZZtjDtIQjv3qf3ZWRkCLVr1xbc3d3l6lJRURG++eabXNdiznuelJQk6OnpCY0bNxZev36dZ5mMjAyhUqVKQu3ateXK7NmzRwAgTJgwQVzXu3dvwcrKSq6e5cuXCwCEs2fP5nnOxbHv+9q3b5+rnhy//faboKOjI9y4cUNu/S+//CKoqqoKd+/eFQRBEP79918BgBAQEJCrjvd/X3Lk/F24fft2rm0LFiwQAAh//fWXuC4jI0No0qSJoKurK/6e5vyer169Wm5/CwsLoV27duLr4cOHCwDkrsdXr14J1apVE6ytrcX3OudvQUREhFiucePGgpeXV65rPy957f/mzRtBRUVFGDRokLgu5/p8+vRpvnVZWVkJvXv3lnsNQAgNDRXXvX37VmjVqpUglUqFZ8+eCYIgCDt37hQACFOnTpWrr3v37oJEIhFu3rwprgMgABDOnTsnrrtz546gqakpfPPNN7niFYR3f6/Mzc2Fr7/+Otf17+/vL5ibm4ux5OjZs6dgYGCQ63ewvOFwJClU69atUbFiRVhaWqJnz57Q1dXFjh07ULlyZQCAVCoVJ79mZWXh+fPn0NXVhYODAy5cuCDWExYWBhMTEwwdOjTXMT5nsuoPP/wAPT098XX37t1hbm4u/i84OjoacXFx+O677/D8+XM8e/YMz549Q2pqKlq1aoVjx47lGp558+YNND
U1Czzu9u3bkZ2dDW9vb7HOZ8+ewczMDHZ2doiIiJArnzPMIpVK861z69atMDAwQJs2beTqrF+/PnR1dXPVmZmZKVf
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"\n",
"def check_balance(df, title, column='company'):\n",
"    \"\"\"Print and plot how many rows fall into each class of `column`.\n",
"\n",
"    Args:\n",
"        df: DataFrame that contains `column`.\n",
"        title: Text appended to the printed heading and to the plot title.\n",
"        column: Name of the categorical column to count; defaults to 'company'.\n",
"    \"\"\"\n",
"    class_distribution = df[column].value_counts()\n",
"    print(f\"Распределение классов в {title}:\")\n",
"    print(class_distribution)\n",
"    # Draw on an explicit Axes and order the bars by frequency so the plot\n",
"    # follows the same order as the printed counts.\n",
"    fig, ax = plt.subplots()\n",
"    sns.countplot(y=column, data=df, order=class_distribution.index, ax=ax)\n",
"    ax.set_title(f'Распределение классов в {title}')\n",
"    plt.show()\n",
"\n",
"\n",
"# Class distribution of the whole dataset (`df` is loaded in the cells above).\n",
"check_balance(df, 'company')\n",
"\n",
"# Class distribution inside each split. The splits created in the split cell\n",
"# above are reused on purpose: splitting again here with other proportions\n",
"# would overwrite train_df / val_df / test_df and describe different data.\n",
"check_balance(train_df, 'Обучающей выборке')\n",
"check_balance(val_df, 'Контрольной выборке')\n",
"check_balance(test_df, 'Тестовой выборке')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
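|
"Данные по столбцу company являются несбалансированными. У некоторых компаний, таких как Vivo, Realme и Samsung, значительно больше устройств, чем у других, например LG, Gionee и Itel. Кроме того, часть брендов записана в разных регистрах (OPPO/Oppo, POCO/Poco, iQOO/IQOO, itel/Itel) и поэтому учитывается как отдельные классы."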
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 4,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки до upsampling: 671\n",
|
|||
|
"Размер контрольной выборки: 288\n",
|
|||
|
"Размер тестовой выборки: 411\n",
|
|||
|
"\n",
|
|||
|
"Распределение классов в всем датасете:\n",
|
|||
|
"Класс Vivo: 186 (13.58%)\n",
|
|||
|
"Класс Realme: 186 (13.58%)\n",
|
|||
|
"Класс Samsung: 181 (13.21%)\n",
|
|||
|
"Класс Motorola: 127 (9.27%)\n",
|
|||
|
"Класс Xiaomi: 90 (6.57%)\n",
|
|||
|
"Класс Honor: 88 (6.42%)\n",
|
|||
|
"Класс Poco: 75 (5.47%)\n",
|
|||
|
"Класс OnePlus: 75 (5.47%)\n",
|
|||
|
"Класс Huawei: 62 (4.53%)\n",
|
|||
|
"Класс iQOO: 57 (4.16%)\n",
|
|||
|
"Класс OPPO: 38 (2.77%)\n",
|
|||
|
"Класс Oppo: 27 (1.97%)\n",
|
|||
|
"Класс TCL: 26 (1.90%)\n",
|
|||
|
"Класс Google: 23 (1.68%)\n",
|
|||
|
"Класс Asus: 21 (1.53%)\n",
|
|||
|
"Класс POCO: 19 (1.39%)\n",
|
|||
|
"Класс Lava: 19 (1.39%)\n",
|
|||
|
"Класс Nothing: 15 (1.09%)\n",
|
|||
|
"Класс Lenovo: 14 (1.02%)\n",
|
|||
|
"Класс Tecno: 13 (0.95%)\n",
|
|||
|
"Класс itel: 12 (0.88%)\n",
|
|||
|
"Класс LG: 6 (0.44%)\n",
|
|||
|
"Класс Gionee: 5 (0.36%)\n",
|
|||
|
"Класс Itel: 3 (0.22%)\n",
|
|||
|
"Класс IQOO: 1 (0.07%)\n",
|
|||
|
"Класс Coolpad: 1 (0.07%)\n",
|
|||
|
"\n",
|
|||
|
"Распределение классов в Обучающей выборке до upsampling:\n",
|
|||
|
"Класс Vivo: 94 (14.01%)\n",
|
|||
|
"Класс Samsung: 89 (13.26%)\n",
|
|||
|
"Класс Realme: 82 (12.22%)\n",
|
|||
|
"Класс Motorola: 66 (9.84%)\n",
|
|||
|
"Класс Xiaomi: 46 (6.86%)\n",
|
|||
|
"Класс Honor: 40 (5.96%)\n",
|
|||
|
"Класс OnePlus: 40 (5.96%)\n",
|
|||
|
"Класс Poco: 37 (5.51%)\n",
|
|||
|
"Класс Huawei: 35 (5.22%)\n",
|
|||
|
"Класс iQOO: 28 (4.17%)\n",
|
|||
|
"Класс OPPO: 15 (2.24%)\n",
|
|||
|
"Класс Oppo: 14 (2.09%)\n",
|
|||
|
"Класс Lava: 12 (1.79%)\n",
|
|||
|
"Класс Google: 12 (1.79%)\n",
|
|||
|
"Класс TCL: 10 (1.49%)\n",
|
|||
|
"Класс Lenovo: 9 (1.34%)\n",
|
|||
|
"Класс POCO: 9 (1.34%)\n",
|
|||
|
"Класс Asus: 8 (1.19%)\n",
|
|||
|
"Класс itel: 7 (1.04%)\n",
|
|||
|
"Класс Nothing: 5 (0.75%)\n",
|
|||
|
"Класс Tecno: 5 (0.75%)\n",
|
|||
|
"Класс LG: 3 (0.45%)\n",
|
|||
|
"Класс Gionee: 3 (0.45%)\n",
|
|||
|
"Класс Coolpad: 1 (0.15%)\n",
|
|||
|
"Класс Itel: 1 (0.15%)\n",
|
|||
|
"Размер обучающей выборки после upsampling: 2350\n",
|
|||
|
"\n",
|
|||
|
"Распределение классов в Обучающей выборке после upsampling:\n",
|
|||
|
"Класс Realme: 94 (4.00%)\n",
|
|||
|
"Класс Motorola: 94 (4.00%)\n",
|
|||
|
"Класс Vivo: 94 (4.00%)\n",
|
|||
|
"Класс Lava: 94 (4.00%)\n",
|
|||
|
"Класс Lenovo: 94 (4.00%)\n",
|
|||
|
"Класс TCL: 94 (4.00%)\n",
|
|||
|
"Класс OPPO: 94 (4.00%)\n",
|
|||
|
"Класс Honor: 94 (4.00%)\n",
|
|||
|
"Класс Poco: 94 (4.00%)\n",
|
|||
|
"Класс itel: 94 (4.00%)\n",
|
|||
|
"Класс Oppo: 94 (4.00%)\n",
|
|||
|
"Класс iQOO: 94 (4.00%)\n",
|
|||
|
"Класс Samsung: 94 (4.00%)\n",
|
|||
|
"Класс Xiaomi: 94 (4.00%)\n",
|
|||
|
"Класс LG: 94 (4.00%)\n",
|
|||
|
"Класс Huawei: 94 (4.00%)\n",
|
|||
|
"Класс OnePlus: 94 (4.00%)\n",
|
|||
|
"Класс Google: 94 (4.00%)\n",
|
|||
|
"Класс Tecno: 94 (4.00%)\n",
|
|||
|
"Класс Asus: 94 (4.00%)\n",
|
|||
|
"Класс Gionee: 94 (4.00%)\n",
|
|||
|
"Класс POCO: 94 (4.00%)\n",
|
|||
|
"Класс Nothing: 94 (4.00%)\n",
|
|||
|
"Класс Coolpad: 94 (4.00%)\n",
|
|||
|
"Класс Itel: 94 (4.00%)\n",
|
|||
|
"\n",
|
|||
|
"Распределение классов в Контрольной выборке:\n",
|
|||
|
"Класс Vivo: 44 (15.28%)\n",
|
|||
|
"Класс Realme: 43 (14.93%)\n",
|
|||
|
"Класс Samsung: 39 (13.54%)\n",
|
|||
|
"Класс Motorola: 23 (7.99%)\n",
|
|||
|
"Класс Xiaomi: 20 (6.94%)\n",
|
|||
|
"Класс Honor: 19 (6.60%)\n",
|
|||
|
"Класс OnePlus: 16 (5.56%)\n",
|
|||
|
"Класс Poco: 15 (5.21%)\n",
|
|||
|
"Класс Huawei: 11 (3.82%)\n",
|
|||
|
"Класс iQOO: 9 (3.12%)\n",
|
|||
|
"Класс Oppo: 7 (2.43%)\n",
|
|||
|
"Класс POCO: 5 (1.74%)\n",
|
|||
|
"Класс OPPO: 5 (1.74%)\n",
|
|||
|
"Класс Google: 4 (1.39%)\n",
|
|||
|
"Класс Asus: 4 (1.39%)\n",
|
|||
|
"Класс TCL: 4 (1.39%)\n",
|
|||
|
"Класс Lava: 4 (1.39%)\n",
|
|||
|
"Класс itel: 3 (1.04%)\n",
|
|||
|
"Класс Nothing: 3 (1.04%)\n",
|
|||
|
"Класс Tecno: 3 (1.04%)\n",
|
|||
|
"Класс Lenovo: 3 (1.04%)\n",
|
|||
|
"Класс LG: 2 (0.69%)\n",
|
|||
|
"Класс Gionee: 1 (0.35%)\n",
|
|||
|
"Класс IQOO: 1 (0.35%)\n",
|
|||
|
"\n",
|
|||
|
"Распределение классов в Тестовой выборке:\n",
|
|||
|
"Класс Realme: 61 (14.84%)\n",
|
|||
|
"Класс Samsung: 53 (12.90%)\n",
|
|||
|
"Класс Vivo: 48 (11.68%)\n",
|
|||
|
"Класс Motorola: 38 (9.25%)\n",
|
|||
|
"Класс Honor: 29 (7.06%)\n",
|
|||
|
"Класс Xiaomi: 24 (5.84%)\n",
|
|||
|
"Класс Poco: 23 (5.60%)\n",
|
|||
|
"Класс iQOO: 20 (4.87%)\n",
|
|||
|
"Класс OnePlus: 19 (4.62%)\n",
|
|||
|
"Класс OPPO: 18 (4.38%)\n",
|
|||
|
"Класс Huawei: 16 (3.89%)\n",
|
|||
|
"Класс TCL: 12 (2.92%)\n",
|
|||
|
"Класс Asus: 9 (2.19%)\n",
|
|||
|
"Класс Google: 7 (1.70%)\n",
|
|||
|
"Класс Nothing: 7 (1.70%)\n",
|
|||
|
"Класс Oppo: 6 (1.46%)\n",
|
|||
|
"Класс POCO: 5 (1.22%)\n",
|
|||
|
"Класс Tecno: 5 (1.22%)\n",
|
|||
|
"Класс Lava: 3 (0.73%)\n",
|
|||
|
"Класс Lenovo: 2 (0.49%)\n",
|
|||
|
"Класс itel: 2 (0.49%)\n",
|
|||
|
"Класс Itel: 2 (0.49%)\n",
|
|||
|
"Класс LG: 1 (0.24%)\n",
|
|||
|
"Класс Gionee: 1 (0.24%)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"\n",
"\n",
"def normalize_company(names):\n",
"    \"\"\"Merge brand spellings that differ only by letter case.\n",
"\n",
"    Every value is replaced by the most frequent spelling among the values\n",
"    that are equal after `str.casefold()` (for example 'OPPO' and 'Oppo'\n",
"    become one label). Missing values are left untouched.\n",
"\n",
"    Args:\n",
"        names: pandas Series of brand names.\n",
"\n",
"    Returns:\n",
"        Series with the same index and length, with unified spellings.\n",
"    \"\"\"\n",
"    canonical = {}\n",
"    # value_counts() lists spellings from most to least frequent, so the first\n",
"    # spelling seen for a case-folded key is the one that is kept.\n",
"    for spelling in names.value_counts().index:\n",
"        canonical.setdefault(spelling.casefold(), spelling)\n",
"    return names.map(lambda name: canonical[name.casefold()], na_action='ignore')\n",
"\n",
"\n",
"# NOTE: this redefines check_balance from the earlier plotting cell as a\n",
"# text-only version that also prints percentages.\n",
"def check_balance(df, title):\n",
"    \"\"\"Print the count and percentage share of every class in df['company'].\n",
"\n",
"    Args:\n",
"        df: DataFrame with a 'company' column.\n",
"        title: Text appended to the printed heading.\n",
"    \"\"\"\n",
"    class_distribution = df['company'].value_counts()\n",
"    print(f\"\\nРаспределение классов в {title}:\")\n",
"    for cls, count in class_distribution.items():\n",
"        print(f\"Класс {cls}: {count} ({count / len(df) * 100:.2f}%)\")\n",
"\n",
"\n",
"# Load the data. The raw 'company' column holds the same brand under several\n",
"# spellings ('OPPO'/'Oppo', 'POCO'/'Poco', 'iQOO'/'IQOO', 'itel'/'Itel');\n",
"# unify them so each brand is balanced as a single class.\n",
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
"df = df.assign(company=normalize_company(df['company']))\n",
"\n",
"# Hold out 30% of the data for testing ...\n",
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
"\n",
"# ... then hold out 30% of the remaining rows for validation.\n",
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
"\n",
"# Report the size of each split.\n",
"print(\"Размер обучающей выборки до upsampling:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))\n",
"\n",
"# Class balance of the whole dataset and of the training split before upsampling.\n",
"check_balance(df, 'всем датасете')\n",
"check_balance(train_df, 'Обучающей выборке до upsampling')\n",
"\n",
"# Oversample only the training split, using 'company' as the class label.\n",
"X_train = train_df.drop('company', axis=1)  # every column except the label\n",
"y_train = train_df['company']  # class label used for balancing\n",
"\n",
"ros = RandomOverSampler(random_state=42)\n",
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
"\n",
"# Put features and label back into one frame ('company' becomes the last column).\n",
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
"\n",
"print(\"Размер обучающей выборки после upsampling:\", len(train_df_resampled))\n",
"\n",
"# Class balance after upsampling.\n",
"check_balance(train_df_resampled, 'Обучающей выборке после upsampling')\n",
"\n",
"# Validation and test splits are not resampled, so their balance is unchanged.\n",
"check_balance(val_df, 'Контрольной выборке')\n",
"check_balance(test_df, 'Тестовой выборке')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Данные были сбалансированы. Теперь можно перейти к конструированию признаков. Поставлены следующие задачи:\n",
|
|||
|
"1. Классифицировать мобильные устройства по ценовым категориям (например, бюджетные, средний класс, флагманы).\n",
|
|||
|
"2. Определить, какие характеристики мобильных устройств наиболее сильно влияют на их рейтинг."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 5,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"\n",
|
|||
|
"# Определение категориальных признаков\n",
|
|||
|
"categorical_features = [\n",
|
|||
|
" 'Rating', 'Ram',\n",
|
|||
|
" 'Battery', 'Display', 'Camera', 'External_Memory', 'Android_version',\n",
|
|||
|
" 'Price', 'company', 'Inbuilt_memory', 'fast_charging',\n",
|
|||
|
" 'Screen_resolution', 'Processor'\n",
|
|||
|
"]\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к обучающей выборке\n",
|
|||
|
"train_df_resampled_encoded = pd.get_dummies(train_df_resampled, columns=categorical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к контрольной выборке\n",
|
|||
|
"val_df_encoded = pd.get_dummies(val_df, columns=categorical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к тестовой выборке\n",
|
|||
|
"test_df_encoded = pd.get_dummies(test_df, columns=categorical_features)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Дискретизация числовых признаков"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 6,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки после балансировки: (5600, 22)\n",
|
|||
|
"Размер контрольной выборки: (288, 22)\n",
|
|||
|
"Размер тестовой выборки: (411, 22)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"import re\n",
|
|||
|
"\n",
|
|||
|
"# Загрузка данных\n",
|
|||
|
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Извлечение числовых значений из столбца Battery\n",
|
|||
|
"df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n",
|
|||
|
"df['Ram'] = df['Ram'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n",
|
|||
|
"df['Camera'] = df['Camera'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
|
|||
|
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling к обучающей выборке (если это необходимо)\n",
|
|||
|
"X_train = train_df.drop('Price', axis=1) # Отделяем признаки от целевой переменной\n",
|
|||
|
"y_train = train_df['Price'] # Целевая переменная\n",
|
|||
|
"\n",
|
|||
|
"# Инициализация RandomOverSampler\n",
|
|||
|
"ros = RandomOverSampler(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling\n",
|
|||
|
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового DataFrame с балансированными данными\n",
|
|||
|
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"# Определение числовых признаков для дискретизации\n",
|
|||
|
"numerical_features = ['Spec_score', 'Battery', 'Ram', 'Camera' ]\n",
|
|||
|
"\n",
|
|||
|
"# Функция для дискретизации числовых признаков\n",
|
|||
|
"def discretize_features(df, features, bins=5, labels=False):\n",
|
|||
|
" for feature in features:\n",
|
|||
|
" try:\n",
|
|||
|
" # Заполнение NaN значений, если они есть\n",
|
|||
|
" df[feature] = df[feature].fillna(df[feature].median())\n",
|
|||
|
" df[f'{feature}_bin'] = pd.cut(df[feature], bins=bins, labels=labels)\n",
|
|||
|
" except Exception as e:\n",
|
|||
|
" print(f\"Ошибка при дискретизации признака {feature}: {e}\")\n",
|
|||
|
" return df\n",
|
|||
|
"\n",
|
|||
|
"# Применение дискретизации к обучающей, контрольной и тестовой выборкам\n",
|
|||
|
"train_df_resampled = discretize_features(train_df_resampled, numerical_features)\n",
|
|||
|
"val_df = discretize_features(val_df, numerical_features)\n",
|
|||
|
"test_df = discretize_features(test_df, numerical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Вывод размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки после балансировки:\", train_df_resampled.shape)\n",
|
|||
|
"print(\"Размер контрольной выборки:\", val_df.shape)\n",
|
|||
|
"print(\"Размер тестовой выборки:\", test_df.shape)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Ручной синтез. Создание новых признаков на основе экспертных знаний и логики предметной области."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 7,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки после балансировки: (5600, 19)\n",
|
|||
|
"Размер контрольной выборки: (288, 19)\n",
|
|||
|
"Размер тестовой выборки: (411, 19)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"\n",
|
|||
|
"# Загрузка данных\n",
|
|||
|
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование столбца Battery в числовой формат\n",
|
|||
|
"df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование столбцов Camera и Display в числовой формат\n",
|
|||
|
"df['Camera'] = pd.to_numeric(df['Camera'], errors='coerce')\n",
|
|||
|
"df['Display'] = pd.to_numeric(df['Display'], errors='coerce')\n",
|
|||
|
"\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
|
|||
|
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling к обучающей выборке (если это необходимо)\n",
|
|||
|
"X_train = train_df.drop('Price', axis=1) # Отделяем признаки от целевой переменной\n",
|
|||
|
"y_train = train_df['Price'] # Целевая переменная\n",
|
|||
|
"\n",
|
|||
|
"# Инициализация RandomOverSampler\n",
|
|||
|
"ros = RandomOverSampler(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling\n",
|
|||
|
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового DataFrame с балансированными данными\n",
|
|||
|
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового признака \"Camera_to_Display_Ratio\" на основе признаков \"Camera\" и \"Display\"\n",
|
|||
|
"train_df_resampled['Camera_to_Display_Ratio'] = train_df_resampled['Camera'] / train_df_resampled['Display']\n",
|
|||
|
"val_df['Camera_to_Display_Ratio'] = val_df['Camera'] / val_df['Display']\n",
|
|||
|
"test_df['Camera_to_Display_Ratio'] = test_df['Camera'] / test_df['Display']\n",
|
|||
|
"\n",
|
|||
|
"# Вывод размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки после балансировки:\", train_df_resampled.shape)\n",
|
|||
|
"print(\"Размер контрольной выборки:\", val_df.shape)\n",
|
|||
|
"print(\"Размер тестовой выборки:\", test_df.shape)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Масштабирование признаков - это процесс преобразования числовых признаков таким образом, чтобы они имели одинаковый масштаб. Это важно для многих алгоритмов машинного обучения, которые чувствительны к масштабу признаков, таких как линейная регрессия, метод опорных векторов (SVM) и нейронные сети."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 8,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки после балансировки: (5600, 19)\n",
|
|||
|
"Размер контрольной выборки: (288, 19)\n",
|
|||
|
"Размер тестовой выборки: (411, 19)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\utils\\extmath.py:1137: RuntimeWarning: invalid value encountered in divide\n",
|
|||
|
" updated_mean = (last_sum + new_sum) / updated_sample_count\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\utils\\extmath.py:1142: RuntimeWarning: invalid value encountered in divide\n",
|
|||
|
" T = new_sum / new_sample_count\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\utils\\extmath.py:1162: RuntimeWarning: invalid value encountered in divide\n",
|
|||
|
" new_unnormalized_variance -= correction**2 / new_sample_count\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"import re\n",
|
|||
|
"\n",
|
|||
|
"# Загрузка данных\n",
|
|||
|
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование столбца Battery в числовой формат\n",
|
|||
|
"df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование столбцов Camera и Display в числовой формат\n",
|
|||
|
"df['Camera'] = pd.to_numeric(df['Camera'], errors='coerce')\n",
|
|||
|
"df['Display'] = pd.to_numeric(df['Display'], errors='coerce')\n",
|
|||
|
"\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
|
|||
|
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling к обучающей выборке (если это необходимо)\n",
|
|||
|
"X_train = train_df.drop('Price', axis=1) # Отделяем признаки от целевой переменной\n",
|
|||
|
"y_train = train_df['Price'] # Целевая переменная\n",
|
|||
|
"\n",
|
|||
|
"# Инициализация RandomOverSampler\n",
|
|||
|
"ros = RandomOverSampler(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling\n",
|
|||
|
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового DataFrame с балансированными данными\n",
|
|||
|
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового признака \"Camera_to_Display_Ratio\" на основе признаков \"Camera\" и \"Display\"\n",
|
|||
|
"train_df_resampled['Camera_to_Display_Ratio'] = train_df_resampled['Camera'] / train_df_resampled['Display']\n",
|
|||
|
"val_df['Camera_to_Display_Ratio'] = val_df['Camera'] / val_df['Display']\n",
|
|||
|
"test_df['Camera_to_Display_Ratio'] = test_df['Camera'] / test_df['Display']\n",
|
|||
|
"\n",
|
|||
|
"# Определение числовых признаков для масштабирования\n",
|
|||
|
"numerical_features_to_scale = ['Spec_score', 'No_of_sim', 'Ram', 'Battery', 'Display', 'Camera', 'Inbuilt_memory', 'Screen_resolution', 'Camera_to_Display_Ratio']\n",
|
|||
|
"\n",
|
|||
|
"# Удаление строковых значений из числовых признаков\n",
|
|||
|
"for feature in numerical_features_to_scale:\n",
|
|||
|
" train_df_resampled[feature] = pd.to_numeric(train_df_resampled[feature], errors='coerce')\n",
|
|||
|
" val_df[feature] = pd.to_numeric(val_df[feature], errors='coerce')\n",
|
|||
|
" test_df[feature] = pd.to_numeric(test_df[feature], errors='coerce')\n",
|
|||
|
"\n",
|
|||
|
"# Инициализация StandardScaler\n",
|
|||
|
"scaler = StandardScaler()\n",
|
|||
|
"\n",
|
|||
|
"# Масштабирование числовых признаков в обучающей выборке\n",
|
|||
|
"train_df_resampled[numerical_features_to_scale] = scaler.fit_transform(train_df_resampled[numerical_features_to_scale])\n",
|
|||
|
"\n",
|
|||
|
"# Масштабирование числовых признаков в контрольной и тестовой выборках\n",
|
|||
|
"val_df[numerical_features_to_scale] = scaler.transform(val_df[numerical_features_to_scale])\n",
|
|||
|
"test_df[numerical_features_to_scale] = scaler.transform(test_df[numerical_features_to_scale])\n",
|
|||
|
"\n",
|
|||
|
"# Вывод размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки после балансировки:\", train_df_resampled.shape)\n",
|
|||
|
"print(\"Размер контрольной выборки:\", val_df.shape)\n",
|
|||
|
"print(\"Размер тестовой выборки:\", test_df.shape)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Конструирование признаков с применением фреймворка Featuretools"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 9,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Обучающая выборка после конструирования признаков:\n",
|
|||
|
" Unnamed: 0 Rating Spec_score No_of_sim Ram \\\n",
|
|||
|
"id \n",
|
|||
|
"0 305 4.70 86 Dual Sim, 3G, 4G, 5G, VoLTE, 12 GB RAM \n",
|
|||
|
"1 941 4.45 71 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM \n",
|
|||
|
"2 800 4.20 68 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM \n",
|
|||
|
"3 97 4.25 69 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM \n",
|
|||
|
"4 1339 4.30 74 Dual Sim, 3G, 4G, VoLTE, 6 GB RAM \n",
|
|||
|
"\n",
|
|||
|
" Battery External_Memory Android_version company \\\n",
|
|||
|
"id \n",
|
|||
|
"0 5000 Android v12 NaN Realme \n",
|
|||
|
"1 5000 Memory Card Supported, upto 1 TB 12 Motorola \n",
|
|||
|
"2 5000 Memory Card Supported 12 Vivo \n",
|
|||
|
"3 5000 Memory Card Supported 12 Vivo \n",
|
|||
|
"4 5000 Memory Card Supported, upto 256 GB 12 Lava \n",
|
|||
|
"\n",
|
|||
|
" Inbuilt_memory fast_charging \\\n",
|
|||
|
"id \n",
|
|||
|
"0 256 GB inbuilt 65W Fast Charging \n",
|
|||
|
"1 64 GB inbuilt 10W Fast Charging \n",
|
|||
|
"2 64 GB inbuilt 10W Fast Charging \n",
|
|||
|
"3 128 GB inbuilt 10W Fast Charging \n",
|
|||
|
"4 128 GB inbuilt NaN \n",
|
|||
|
"\n",
|
|||
|
" Screen_resolution Processor \n",
|
|||
|
"id \n",
|
|||
|
"0 1080 x 2400 px Octa Core \n",
|
|||
|
"1 720 x 1600 px Octa Core \n",
|
|||
|
"2 720 x 1600 px Display with Water Drop Notch Octa Core \n",
|
|||
|
"3 720 x 1600 px Display with Water Drop Notch Octa Core \n",
|
|||
|
"4 1600 x 720 px Octa Core \n",
|
|||
|
"Контрольная выборка после конструирования признаков:\n",
|
|||
|
" Unnamed: 0 Rating Spec_score No_of_sim Ram \\\n",
|
|||
|
"id \n",
|
|||
|
"1028 <NA> NaN <NA> NaN NaN \n",
|
|||
|
"825 <NA> NaN <NA> NaN NaN \n",
|
|||
|
"900 <NA> NaN <NA> NaN NaN \n",
|
|||
|
"702 <NA> NaN <NA> NaN NaN \n",
|
|||
|
"230 1050 4.05 90 Dual Sim, 3G, 4G, 5G, VoLTE, 8 GB RAM \n",
|
|||
|
"\n",
|
|||
|
" Battery External_Memory Android_version company Inbuilt_memory \\\n",
|
|||
|
"id \n",
|
|||
|
"1028 <NA> NaN NaN NaN NaN \n",
|
|||
|
"825 <NA> NaN NaN NaN NaN \n",
|
|||
|
"900 <NA> NaN NaN NaN NaN \n",
|
|||
|
"702 <NA> NaN NaN NaN NaN \n",
|
|||
|
"230 4500 Android v12 NaN Motorola 128 GB inbuilt \n",
|
|||
|
"\n",
|
|||
|
" fast_charging Screen_resolution Processor \n",
|
|||
|
"id \n",
|
|||
|
"1028 NaN NaN NaN \n",
|
|||
|
"825 NaN NaN NaN \n",
|
|||
|
"900 NaN NaN NaN \n",
|
|||
|
"702 NaN NaN NaN \n",
|
|||
|
"230 125W Fast Charging 1080 x 2400 px Octa Core \n",
|
|||
|
"Тестовая выборка после конструирования признаков:\n",
|
|||
|
" Unnamed: 0 Rating Spec_score No_of_sim \\\n",
|
|||
|
"id \n",
|
|||
|
"427 187 4.40 91 Dual Sim, 3G, 4G, 5G, VoLTE, \n",
|
|||
|
"1088 <NA> NaN <NA> NaN \n",
|
|||
|
"668 592 4.45 91 Dual Sim, 3G, 4G, 5G, VoLTE, \n",
|
|||
|
"572 1130 4.60 75 Dual Sim, 3G, 4G, VoLTE, \n",
|
|||
|
"115 117 4.60 72 Dual Sim, 3G, 4G, VoLTE, \n",
|
|||
|
"\n",
|
|||
|
" Ram Battery External_Memory Android_version \\\n",
|
|||
|
"id \n",
|
|||
|
"427 12 GB RAM 5000 Memory Card Not Supported 14 \n",
|
|||
|
"1088 NaN <NA> NaN NaN \n",
|
|||
|
"668 12 GB RAM 4500 Android v12 NaN \n",
|
|||
|
"572 6 GB RAM 5000 Memory Card Supported, upto 1 TB 13 \n",
|
|||
|
"115 4 GB RAM 5000 Memory Card Supported, upto 1 TB 12 \n",
|
|||
|
"\n",
|
|||
|
" company Inbuilt_memory fast_charging \\\n",
|
|||
|
"id \n",
|
|||
|
"427 Vivo 256 GB inbuilt 120W Fast Charging \n",
|
|||
|
"1088 NaN NaN NaN \n",
|
|||
|
"668 Honor 256 GB inbuilt 100W Fast Charging \n",
|
|||
|
"572 Xiaomi 128 GB inbuilt 18W Fast Charging \n",
|
|||
|
"115 Vivo 64 GB inbuilt 18W Fast Charging \n",
|
|||
|
"\n",
|
|||
|
" Screen_resolution Processor \n",
|
|||
|
"id \n",
|
|||
|
"427 1260 x 2800 px Octa Core \n",
|
|||
|
"1088 NaN NaN \n",
|
|||
|
"668 1200 x 2652 px Octa Core \n",
|
|||
|
"572 720 x 1600 px Octa Core \n",
|
|||
|
"115 720 x 1612 px Display with Water Drop Notch Octa Core \n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|||
|
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|||
|
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"import featuretools as ft\n",
|
|||
|
"import re\n",
|
|||
|
"\n",
|
|||
|
"# Загрузка данных\n",
|
|||
|
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование столбца Battery в числовой формат\n",
|
|||
|
"df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование столбцов Camera и Display в числовой формат\n",
|
|||
|
"df['Camera'] = pd.to_numeric(df['Camera'], errors='coerce')\n",
|
|||
|
"df['Display'] = pd.to_numeric(df['Display'], errors='coerce')\n",
|
|||
|
"\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
|
|||
|
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового признака \"Camera_to_Display_Ratio\" на основе признаков \"Camera\" и \"Display\"\n",
|
|||
|
"train_df['Camera_to_Display_Ratio'] = train_df['Camera'] / train_df['Display']\n",
|
|||
|
"val_df['Camera_to_Display_Ratio'] = val_df['Camera'] / val_df['Display']\n",
|
|||
|
"test_df['Camera_to_Display_Ratio'] = test_df['Camera'] / test_df['Display']\n",
|
|||
|
"\n",
|
|||
|
"# Определение сущностей\n",
|
|||
|
"es = ft.EntitySet(id='mobile_data')\n",
|
|||
|
"es = es.add_dataframe(dataframe_name='train', dataframe=train_df, index='id')\n",
|
|||
|
"\n",
|
|||
|
"# Генерация признаков\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='train', max_depth=2)\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование признаков для контрольной и тестовой выборок\n",
|
|||
|
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_df.index)\n",
|
|||
|
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_df.index)\n",
|
|||
|
"\n",
|
|||
|
"# Вывод первых нескольких строк для проверки\n",
|
|||
|
"print(\"Обучающая выборка после конструирования признаков:\")\n",
|
|||
|
"print(feature_matrix.head())\n",
|
|||
|
"print(\"Контрольная выборка после конструирования признаков:\")\n",
|
|||
|
"print(val_feature_matrix.head())\n",
|
|||
|
"print(\"Тестовая выборка после конструирования признаков:\")\n",
|
|||
|
"print(test_feature_matrix.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Оценка качества каждого набора признаков\n",
|
|||
|
"\n",
|
|||
|
"Предсказательная способность Метрики: RMSE, MAE, R²\n",
|
|||
|
"\n",
|
|||
|
"Методы: Обучение модели на обучающей выборке и оценка на контрольной и тестовой выборках.\n",
|
|||
|
"\n",
|
|||
|
"Скорость вычисления Методы: Измерение времени выполнения генерации признаков и обучения модели.\n",
|
|||
|
"\n",
|
|||
|
"Надежность Методы: Кросс-валидация, анализ чувствительности модели к изменениям в данных.\n",
|
|||
|
"\n",
|
|||
|
"Корреляция Методы: Анализ корреляционной матрицы признаков, удаление мультиколлинеарных признаков.\n",
|
|||
|
"\n",
|
|||
|
"Цельность Методы: Проверка логической связи между признаками и целевой переменной, интерпретация результатов модели."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 29,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 671\n",
|
|||
|
"Размер контрольной выборки: 288\n",
|
|||
|
"Размер тестовой выборки: 411\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Feature Importance:\n",
|
|||
|
" feature importance\n",
|
|||
|
"4 Price 0.999443\n",
|
|||
|
"2 Spec_score 0.000227\n",
|
|||
|
"3 Battery 0.000146\n",
|
|||
|
"0 Unnamed: 0 0.000146\n",
|
|||
|
"1 Rating 0.000039\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"import featuretools as ft\n",
|
|||
|
"from sklearn.ensemble import RandomForestRegressor\n",
|
|||
|
"import re\n",
|
|||
|
"\n",
|
|||
|
"# Загрузка данных\n",
|
|||
|
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование столбца Battery в числовой формат\n",
|
|||
|
"df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование столбца Display в числовой формат\n",
|
|||
|
"df['Camera'] = pd.to_numeric(df['Camera'], errors='coerce')\n",
|
|||
|
"df['Display'] = pd.to_numeric(df['Display'], errors='coerce')\n",
|
|||
|
"df['Inbuilt_memory'] = pd.to_numeric(df['Inbuilt_memory'], errors='coerce')\n",
|
|||
|
"df['fast_charging'] = pd.to_numeric(df['fast_charging'], errors='coerce')\n",
|
|||
|
"\n",
|
|||
|
"# Удаление запятых из столбца Price и преобразование в числовой формат\n",
|
|||
|
"df['Price'] = df['Price'].str.replace(',', '').astype(float)\n",
|
|||
|
"\n",
|
|||
|
"# Удаление столбцов с текстовыми значениями, которые не могут быть преобразованы в числа\n",
|
|||
|
"df = df.drop(columns=['Name', 'company', 'Android_version', 'Processor_name', 'External_Memory', 'No_of_sim', 'Ram', 'Screen_resolution', 'Processor' ])\n",
|
|||
|
"\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
|
|||
|
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Вывод размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling к обучающей выборке (если это необходимо)\n",
|
|||
|
"X_train = train_df.drop('Price', axis=1) # Отделяем признаки от целевой переменной\n",
|
|||
|
"y_train = train_df['Price'] # Целевая переменная\n",
|
|||
|
"\n",
|
|||
|
"# Инициализация RandomOverSampler\n",
|
|||
|
"ros = RandomOverSampler(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling\n",
|
|||
|
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового DataFrame с балансированными данными\n",
|
|||
|
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"# Определение сущностей\n",
|
|||
|
"es = ft.EntitySet(id='mobile_data')\n",
|
|||
|
"es = es.add_dataframe(dataframe_name='mobile', dataframe=train_df_resampled, index='id')\n",
|
|||
|
"\n",
|
|||
|
"# Генерация признаков\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='mobile', max_depth=2)\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование признаков для контрольной и тестовой выборок\n",
|
|||
|
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_df.index)\n",
|
|||
|
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_df.index)\n",
|
|||
|
"\n",
|
|||
|
"# Оценка важности признаков\n",
|
|||
|
"X = feature_matrix\n",
|
|||
|
"y = train_df_resampled['Price']\n",
|
|||
|
"\n",
|
|||
|
"# Разделение данных на обучающую и тестовую выборки\n",
|
|||
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Обучение модели\n",
|
|||
|
"model = RandomForestRegressor(n_estimators=100, random_state=42)\n",
|
|||
|
"model.fit(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Получение важности признаков\n",
|
|||
|
"importances = model.feature_importances_\n",
|
|||
|
"feature_names = feature_matrix.columns\n",
|
|||
|
"\n",
|
|||
|
"# Сортировка признаков по важности\n",
|
|||
|
"feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importances})\n",
|
|||
|
"feature_importance = feature_importance.sort_values(by='importance', ascending=False)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Feature Importance:\")\n",
|
|||
|
"print(feature_importance)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 31,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 66\n",
|
|||
|
"Размер контрольной выборки: 29\n",
|
|||
|
"Размер тестовой выборки: 42\n",
|
|||
|
"Mean Squared Error: 13048795.366100002\n",
|
|||
|
"R2 Score: -0.23881710583662308\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|||
|
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|||
|
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Cross-validated Mean Squared Error: 394482934.1724652\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA5gAAAIjCAYAAABmsrS/AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABGCElEQVR4nO3deVxV1f7/8fcB5ICMTgwagoRjaYam4axZOORQXc0hpxxuOZR59aopKk6Z6XXIpHKiwTRtHtQy0yy1LKfUHBGHTK0cQDRBYf/+6Of5dkJNcMERfD0fj/N4cNZee+3P3isu9+3aZx+bZVmWAAAAAAC4QW6uLgAAAAAAUDgQMAEAAAAARhAwAQAAAABGEDABAAAAAEYQMAEAAAAARhAwAQAAAABGEDABAAAAAEYQMAEAAAAARhAwAQAAAABGEDABAAAAAEYQMAEABU5iYqJsNtsVX8OGDcuTY65fv15jxozRmTNn8mT8G3H5evzwww+uLiXXZs+ercTERFeXAQC4QR6uLgAAgNwaO3asypUr59R255135smx1q9fr/j4eHXv3l2BgYF5coxb2ezZs1WyZEl1797d1aUAAG4AARMAUGA1b95cNWvWdHUZN+TcuXPy8fFxdRkuc/78eRUtWtTVZQAADOEWWQBAobV8+XLVr19fPj4+8vPzU8uWLbVz506nPj/++KO6d++uyMhIeXl5KSQkRI8//rhOnjzp6DNmzBgNGTJEklSuXDnH7bgHDx7UwYMHZbPZrnh7p81m05gxY5zGsdls+umnn9SpUycVK1ZM9erVc2x/8803VaNGDXl7e6t48eLq0KGDjhw5kqtz7969u3x9fXX48GE9+OCD8vX1VZkyZfTSSy9JkrZv364mTZrIx8dH4eHheuutt5z2v3zb7dq1a/Xvf/9bJUqUkL+/v7p27arTp09nO97s2bN1xx13yG63q3Tp0urXr1+224kbNWqkO++8U5s2bVKDBg1UtGhRPfvss4qIiNDOnTv11VdfOa5to0aNJEmnTp3S4MGDVbVqVfn6+srf31/NmzfXtm3bnMZes2aNbDablixZogkTJui2226Tl5eX7rvvPu3fvz9bvd99951atGihYsWKycfHR9WqVdOMGTOc+uzevVv/+te/VLx4cXl5ealmzZr66KOPcjoVAHBLYQUTAFBgpaSk6Pfff3dqK1mypCTpjTfeULdu3RQbG6vnn39e58+fV0JCgurVq6ctW7YoIiJCkrRy5UodOHBAPXr0UEhIiHbu3KlXX31VO3fu1LfffiubzaaHH35Ye/fu1aJFizRt2jTHMUqVKqXffvstx3W3a9dO5cuX18SJE2VZliRpwoQJiouLU/v27dWrVy/99ttvevHFF9WgQQNt2bIlV7flZmZmqnnz5mrQoIEmT56shQsXqn///vLx8dGIESPUuXNnPfzww3r55ZfVtWtXxcTEZLvluH///goMDNSYMWO0Z88eJSQk6NChQ45AJ/0ZnOPj49W0aVM9+eSTjn7ff/+91q1bpyJFijjGO3nypJo3b64OHTroscceU3BwsBo1aqQBAwbI19dXI0aMkCQFBwdLkg4cOKAPPvhA7dq1U7ly5XTixAm98soratiwoX766SeVLl3aqd5JkybJzc1NgwcPVkpKiiZPnqzOnTvru+++c/RZuXKlHnzwQYWGhurpp59WSEiIdu3apU8++URPP/20JGnnzp2qW7euypQpo2HDhsnHx0dLlixR27Zt9e677+qhhx7K8XwAwC3BAgCggFmwYIEl6Yovy7Kss2fPWoGBgVbv3r2d9jt+/LgVEBDg1H7+/Pls4y9atMiSZK1du9bR9sILL1iSrOTkZKe+ycnJliRrwYIF2caRZI0ePdrxfvTo0ZYkq2PHjk79Dh48aLm7u1sTJkxwat++fbvl4eGRrf1q1+P77793tHXr1s2SZE2cONHRdvr0acvb29uy2WzW4sWLHe27d+/OVuvlMW
vUqGFlZGQ42idPnmxJsj788EPLsizr119/tTw9Pa0HHnjAyszMdPSbNWuWJcmaP3++o61hw4aWJOvll1/Odg533HGH1bBhw2ztFy5ccBrXsv685na73Ro7dqyjbfXq1ZYkq3LlylZ6erqjfcaMGZYka/v27ZZlWdalS5escuXKWeHh4dbp06edxs3KynL8fN9991lVq1a1Lly44LS9Tp06Vvny5bPVCQD4E7fIAgAKrJdeekkrV650ekl/rlCdOXNGHTt21O+//+54ubu7q3bt2lq9erVjDG9vb8fPFy5c0O+//657771XkrR58+Y8qfuJJ55wev/ee+8pKytL7du3d6o3JCRE5cuXd6o3p3r16uX4OTAwUBUrVpSPj4/at2/vaK9YsaICAwN14MCBbPv36dPHaQXyySeflIeHh5YtWyZJ+uKLL5SRkaGBAwfKze3//m9F79695e/vr08//dRpPLvdrh49elx3/Xa73TFuZmamTp48KV9fX1WsWPGK89OjRw95eno63tevX1+SHOe2ZcsWJScna+DAgdlWhS+vyJ46dUpffvml2rdvr7Nnzzrm4+TJk4qNjdW+fft09OjR6z4HALiVcIssAKDAqlWr1hUf8rNv3z5JUpMmTa64n7+/v+PnU6dOKT4+XosXL9avv/7q1C8lJcVgtf/n77eh7tu3T5ZlqXz58lfs/9eAlxNeXl4qVaqUU1tAQIBuu+02R5j6a/uVPlv595p8fX0VGhqqgwcPSpIOHTok6c+Q+leenp6KjIx0bL+sTJkyTgHwn2RlZWnGjBmaPXu2kpOTlZmZ6dhWokSJbP3Lli3r9L5YsWKS5Di3pKQkSdd+2vD+/ftlWZbi4uIUFxd3xT6//vqrypQpc93nAQC3CgImAKDQycrKkvTn5zBDQkKybffw+L8/f+3bt9f69es1ZMgQVa9eXb6+vsrKylKzZs0c41zL34PaZX8NQn/311XTy/XabDYtX75c7u7u2fr7+vr+Yx1XcqWxrtVu/f/Pg+alv5/7P5k4caLi4uL0+OOPa9y4cSpevLjc3Nw0cODAK86PiXO7PO7gwYMVGxt7xT5RUVHXPR4A3EoImACAQuf222+XJAUFBalp06ZX7Xf69GmtWrVK8fHxGjVqlKP98groX10tSF5eIfv7E1P/vnL3T/ValqVy5cqpQoUK171ffti3b58aN27seJ+WlqZjx46pRYsWkqTw8HBJ0p49exQZGenol5GRoeTk5Gte/7+62vV955131LhxY82bN8+p/cyZM46HLeXE5f82duzYcdXaLp9HkSJFrrt+AMCf+AwmAKDQiY2Nlb+/vyZOnKiLFy9m2375ya+XV7v+vro1ffr0bPtc/q7KvwdJf39/lSxZUmvXrnVqnz179nXX+/DDD8vd3V3x8fHZarEsy+krU/Lbq6++6nQNExISdOnSJTVv3lyS1LRpU3l6emrmzJlOtc+bN08pKSlq2bLldR3Hx8cn27WV/pyjv1+TpUuX5vozkNHR0SpXrpymT5+e7XiXjxMUFKRGjRrplVde0bFjx7KNkZsnBwPArYIVTABAoePv76+EhAR16dJF0dHR6tChg0qVKqXDhw/r008/Vd26dTVr1iz5+/s7vsLj4sWLKlOmjD7//HMlJydnG7NGjRqSpBEjRqhDhw4qUqSIWrVqJR8fH/Xq1UuTJk1Sr169VLNmTa1du1Z79+697npvv/12jR8/XsOHD9fBgwfVtm1b+fn5KTk5We+//7769OmjwYMHG7s+OZGRkaH77rtP7du31549ezR79mzVq1dPrVu3lvTnV7UMHz5c8fHxatasmVq3bu3od8899+ixxx67ruPUqFFDCQkJGj9+vKKiohQUFKQmTZrowQcf1NixY9WjRw/VqVNH27dv18KFC51WS3PCzc1NCQkJatWqlapXr64ePXooNDRUu3fv1s6dO/XZZ59J+vMBUvXq1VPVqlXVu3dvRUZG6sSJE9qwYYN+/vnnbN
/DCQD4EwETAFAoderUSaVLl9akSZP0wgsvKD09XWXKlFH9+vWdnmL61ltvacCAAXrppZdkWZYeeOABLV++PNv3K95
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Train Mean Squared Error: 46662951.69621668\n",
|
|||
|
"Train R2 Score: 0.9411587287387594\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2QAAAIjCAYAAABswtioAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABxu0lEQVR4nO3deVxV1f7/8fdhRkZREXEAU1NxzBnNKU0stBwCbTC1wSyt1DLz3iabbM5uk9mgzcbJMWdzLilTQ8VZwyEVwQwQHJj27w9/7K8nSEGBDfh6Ph48rnutdfb57LM7XN+uvde2GYZhCAAAAABQ6pysLgAAAAAArlYEMgAAAACwCIEMAAAAACxCIAMAAAAAixDIAAAAAMAiBDIAAAAAsAiBDAAAAAAsQiADAAAAAIsQyAAAAADAIgQyAIBlbDabnnvuOavLsFy3bt3UrVs3c/vAgQOy2WyaMWOGZTX90z9rLCll8dgBoCQRyACggvjggw9ks9nUvn37y97H0aNH9dxzzykuLq74CivjVq9eLZvNZv64urrqmmuu0d13360//vjD6vKKZP369XruueeUkpJiWQ2hoaEOn2dgYKA6d+6sOXPmWFYTAJRlLlYXAAAoHl9//bVCQ0O1YcMG7du3T/Xr1y/yPo4ePapJkyYpNDRULVu2LP4iy7BHHnlEbdu2VVZWljZv3qxp06Zp4cKF2rZtm4KDg0u1lpCQEJ05c0aurq5Fet369es1adIkDRs2TP7+/iVTXCG0bNlSjz32mKTz/0199NFHGjBggD788EONHDnyoq+93GMHgPKKGTIAqAASEhK0fv16vfXWW6pWrZq+/vprq0sqdzp37qy77rpLw4cP17vvvqs33nhDJ0+e1Oeff/6vr8nIyCiRWmw2mzw8POTs7Fwi+y9pNWvW1F133aW77rpLTzzxhH7++Wd5eXnp7bff/tfXZGdnKzMzs9wfOwAUFYEMACqAr7/+WpUrV1ZkZKRuu+22fw1kKSkpGjt2rEJDQ+Xu7q5atWrp7rvv1okTJ7R69Wq1bdtWkjR8+HDzkrO8e3lCQ0M1bNiwfPv8571FmZmZeuaZZ9S6dWv5+fnJy8tLnTt31qpVq4p8XMePH5eLi4smTZqUr2/37t2y2Wx67733JElZWVmaNGmSGjRoIA8PD1WpUkXXX3+9li9fXuT3laQbbrhB0vmwK0nPPfecbDabduzYoTvuuEOVK1fW9ddfb47/6quv1Lp1a3l6eiogIECDBw/W4cOH8+132rRpqlevnjw9PdWuXTutW7cu35h/u49q165dio6OVrVq1eTp6amGDRvqv//9r1nf+PHjJUl169Y1z9+BAwdKpMaiCAoKUuPGjc3PMu/43njjDU2ZMkX16tWTu7u7duzYcVnHnufIkSO65557VL16dbm7u6tJkyb67LPPrqh2AChpXLIIABXA119/rQEDBsjNzU233367PvzwQ/32229mwJKk9PR0de7cWTt37tQ999yjVq1a6cSJE5o/f77+/PNPNW7cWM8//7yeeeYZjRgxQp07d5YkdezYsUi1pKWl6ZNPPtHtt9+u+++/X6dOndKnn36qiIgIbdiwoUiXQlavXl1du3ZVTEyMnn32WYe+7777Ts7OzoqKipJ0PpBMnjxZ9913n9q1a6e0tDRt3LhRmzdv1o033likY5Ck/fv3S5KqVKni0B4VFaUGDRro5ZdflmEYkqSXXnpJTz/9tKKjo3XfffcpOTlZ7777rrp06aLff//dvHzw008/1QMPPKCOHTtqzJgx+uOPP3TLLbcoICBAtWvXvmg9W7duVefOneXq6qoRI0YoNDRU+/fv1w8//KCXXnpJAwYM0J49e/Ttt9/q7bffVtWqVSVJ1apVK7Ua/01WVpYOHz6c77OcPn26zp49qxEjRsjd3V0BAQHKzc0t8rFL58N7hw4dZLPZNHr0aFWrVk2LFy/Wvffeq7S0NI0ZM+ayageAEmcAAM
q1jRs3GpKM5cuXG4ZhGLm5uUatWrWMRx991GHcM888Y0gyZs+enW8fubm5hmEYxm+//WZIMqZPn55vTEhIiDF06NB87V27djW6du1qbmdnZxvnzp1zGPP3338b1atXN+655x6HdknGs88+e9Hj++ijjwxJxrZt2xzaw8LCjBtuuMHcbtGihREZGXnRfRVk1apVhiTjs88+M5KTk42jR48aCxcuNEJDQw2bzWb89ttvhmEYxrPPPmtIMm6//XaH1x84cMBwdnY2XnrpJYf2bdu2GS4uLmZ7ZmamERgYaLRs2dLh85k2bZohyeEzTEhIyHceunTpYvj4+BgHDx50eJ+8c2cYhvH6668bkoyEhIQSr/HfhISEGL169TKSk5ON5ORkY8uWLcbgwYMNScbDDz/scHy+vr5GUlKSw+sv99jvvfdeo0aNGsaJEyccxgwePNjw8/MzTp8+fcnaAcAKXLIIAOXc119/rerVq6t79+6Szt9/NGjQIM2cOVM5OTnmuFmzZqlFixbq379/vn3YbLZiq8fZ2Vlubm6SpNzcXJ08eVLZ2dlq06aNNm/eXOT9DRgwQC4uLvruu+/Mtvj4eO3YsUODBg0y2/z9/bV9+3bt3bv3suq+5557VK1aNQUHBysyMlIZGRn6/PPP1aZNG4dx/1yUYvbs2crNzVV0dLROnDhh/gQFBalBgwbmpZobN25UUlKSRo4caX4+kjRs2DD5+fldtLbk5GStXbtW99xzj+rUqePQV5hzVxo1XmjZsmWqVq2aqlWrphYtWshut2vIkCF69dVXHcYNHDjQnMH7N4U5dsMwNGvWLPXt21eGYTgcY0REhFJTUy/rvz0AKA1csggA5VhOTo5mzpyp7t27m/fnSFL79u315ptvasWKFerVq5ek85fgDRw4sFTq+vzzz/Xmm29q165dysrKMtvr1q1b5H1VrVpVPXr0UExMjF544QVJ5y9XdHFx0YABA8xxzz//vG699VZde+21atq0qXr37q0hQ4aoefPmhXqfZ555Rp07d5azs7OqVq2qxo0by8Ul//9N/vMY9u7dK8Mw1KBBgwL3m7da4MGDByUp37i8ZfYvJm/5/aZNmxbqWP6pNGq8UPv27fXiiy/KZrOpUqVKaty4cYGrPhbmv4fCHHtycrJSUlI0bdo0TZs2rcAxSUlJhSseAEoZgQwAyrGVK1fq2LFjmjlzpmbOnJmv/+uvvzYD2ZX6t5mYnJwchxXxvvrqKw0bNkz9+vXT+PHjFRgYKGdnZ02ePNm8L6uoBg8erOHDhysuLk4tW7ZUTEyMevToYd4nJUldunTR/v37NW/ePC1btkyffPKJ3n77bU2dOlX33XffJd+jWbNm6tmz5yXHeXp6Omzn5ubKZrNp8eLFBa4M6O3tXYgjLFmlXWPVqlUv67O8XHn3nd11110aOnRogWMKG8wBoLQRyACgHPv6668VGBio999/P1/f7NmzNWfOHE2dOlWenp6qV6+e4uPjL7q/i13+Vrly5QIfOHzw4EGH2ZPvv/9e11xzjWbPnu2wv38uylEU/fr10wMPPGBetrhnzx5NnDgx37iAgAANHz5cw4cPV3p6urp06aLnnnuuUIHsctWrV0+GYahu3bq69tpr/3VcSEiIpPOzVXkrOErnF7xISEhQixYt/vW1eZ/v5Z6/0qixpBTm2KtVqyYfHx/l5OQUKggCQFnCPWQAUE6dOXNGs2fPVp8+fXTbbbfl+xk9erROnTql+fPnSzp/v86WLVs0Z86cfPsy/v9qgV5eXpJUYPCqV6+efvnlF2VmZpptCxYsyLdset4MTN4+JenXX39VbGzsZR+rv7+/IiIiFBMTo5kzZ8rNzU39+vVzGPPXX385bHt7e6t+/fo6d+7cZb9vYQwYMEDOzs6aNGmSwzFL5z+DvLratGmjatWqaerUqQ6f4YwZMwr8vC9UrVo1denSRZ999pkOHTqU7z3y/Nv5K40aS0phjt3Z2VkDBw7UrFmzCg
xuycnJpVIrAFwOZsgAoJyaP3++Tp06pVtuuaXA/g4dOpgPiR40aJDGjx+v77//XlFRUbrnnnvUunVrnTx5UvPnz9f
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from sklearn.ensemble import RandomForestRegressor\n",
|
|||
|
"from sklearn.metrics import mean_squared_error, r2_score\n",
|
|||
|
"from sklearn.model_selection import cross_val_score\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"import featuretools as ft\n",
|
|||
|
"import re\n",
|
|||
|
"\n",
|
|||
|
"# Загрузка данных\n",
|
|||
|
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Уменьшение размера выборки для ускорения работы (опционально)\n",
|
|||
|
"df = df.sample(frac=0.1, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование столбца Battery в числовой формат\n",
|
|||
|
"df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование столбца Display в числовой формат\n",
|
|||
|
"df['Camera'] = pd.to_numeric(df['Camera'], errors='coerce')\n",
|
|||
|
"df['Display'] = pd.to_numeric(df['Display'], errors='coerce')\n",
|
|||
|
"df['Inbuilt_memory'] = pd.to_numeric(df['Inbuilt_memory'], errors='coerce')\n",
|
|||
|
"df['fast_charging'] = pd.to_numeric(df['fast_charging'], errors='coerce')\n",
|
|||
|
"\n",
|
|||
|
"# Удаление запятых из столбца Price и преобразование в числовой формат\n",
|
|||
|
"df['Price'] = df['Price'].str.replace(',', '').astype(float)\n",
|
|||
|
"\n",
|
|||
|
"# Удаление столбцов с текстовыми значениями, которые не могут быть преобразованы в числа\n",
|
|||
|
"df = df.drop(columns=['Name', 'company', 'Android_version', 'Processor_name', 'External_Memory', 'No_of_sim', 'Ram', 'Screen_resolution', 'Processor' ])\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
|
|||
|
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Вывод размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))\n",
|
|||
|
"\n",
|
|||
|
"# Определение сущностей\n",
|
|||
|
"es = ft.EntitySet(id='mobile_data')\n",
|
|||
|
"es = es.add_dataframe(dataframe_name='mobile', dataframe=train_df, index='id')\n",
|
|||
|
"\n",
|
|||
|
"# Генерация признаков с уменьшенной глубиной\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='mobile', max_depth=1)\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование признаков для контрольной и тестовой выборок\n",
|
|||
|
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_df.index)\n",
|
|||
|
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_df.index)\n",
|
|||
|
"\n",
|
|||
|
"# Удаление строк с NaN\n",
|
|||
|
"feature_matrix = feature_matrix.dropna()\n",
|
|||
|
"val_feature_matrix = val_feature_matrix.dropna()\n",
|
|||
|
"test_feature_matrix = test_feature_matrix.dropna()\n",
|
|||
|
"\n",
|
|||
|
"# Разделение данных на обучающую и тестовую выборки\n",
|
|||
|
"X_train = feature_matrix.drop('Price', axis=1)\n",
|
|||
|
"y_train = feature_matrix['Price']\n",
|
|||
|
"X_val = val_feature_matrix.drop('Price', axis=1)\n",
|
|||
|
"y_val = val_feature_matrix['Price']\n",
|
|||
|
"X_test = test_feature_matrix.drop('Price', axis=1)\n",
|
|||
|
"y_test = test_feature_matrix['Price']\n",
|
|||
|
"\n",
|
|||
|
"# Выбор модели\n",
|
|||
|
"model = RandomForestRegressor(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Обучение модели\n",
|
|||
|
"model.fit(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Предсказание и оценка\n",
|
|||
|
"y_pred = model.predict(X_test)\n",
|
|||
|
"\n",
|
|||
|
"mse = mean_squared_error(y_test, y_pred)\n",
|
|||
|
"r2 = r2_score(y_test, y_pred)\n",
|
|||
|
"\n",
|
|||
|
"print(f\"Mean Squared Error: {mse}\")\n",
|
|||
|
"print(f\"R2 Score: {r2}\")\n",
|
|||
|
"\n",
|
|||
|
"# Кросс-валидация\n",
|
|||
|
"scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n",
|
|||
|
"mse_cv = -scores.mean()\n",
|
|||
|
"print(f\"Cross-validated Mean Squared Error: {mse_cv}\")\n",
|
|||
|
"\n",
|
|||
|
"# Анализ важности признаков\n",
|
|||
|
"feature_importances = model.feature_importances_\n",
|
|||
|
"feature_names = X_train.columns\n",
|
|||
|
"\n",
|
|||
|
"importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})\n",
|
|||
|
"importance_df = importance_df.sort_values(by='Importance', ascending=False)\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.barplot(x='Importance', y='Feature', data=importance_df)\n",
|
|||
|
"plt.title('Feature Importance')\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Проверка на переобучение\n",
|
|||
|
"y_train_pred = model.predict(X_train)\n",
|
|||
|
"\n",
|
|||
|
"mse_train = mean_squared_error(y_train, y_train_pred)\n",
|
|||
|
"r2_train = r2_score(y_train, y_train_pred)\n",
|
|||
|
"\n",
|
|||
|
"print(f\"Train Mean Squared Error: {mse_train}\")\n",
|
|||
|
"print(f\"Train R2 Score: {r2_train}\")\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация результатов\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(y_test, y_pred, alpha=0.5)\n",
|
|||
|
"plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)\n",
|
|||
|
"plt.xlabel('Actual Price')\n",
|
|||
|
"plt.ylabel('Predicted Price')\n",
|
|||
|
"plt.title('Actual vs Predicted Price')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "aimenv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.5"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|