AIM-PIbd-32-Puchkina-A-A/lab_3/lab3.ipynb

1404 lines
320 KiB
Plaintext
Raw Normal View History

2024-11-09 11:43:06 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Набор данных с ценами на мобильные устройства"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Вывод всех столбцов"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Unnamed: 0', 'Name', 'Rating', 'Spec_score', 'No_of_sim', 'Ram',\n",
" 'Battery', 'Display', 'Camera', 'External_Memory', 'Android_version',\n",
" 'Price', 'company', 'Inbuilt_memory', 'fast_charging',\n",
" 'Screen_resolution', 'Processor', 'Processor_name'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import pandas as pd \n",
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
"print(df.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Бизнес-цели:\n",
"1. Классифицировать мобильные устройства по ценовым категориям (например, бюджетные, средний класс, флагманы).\n",
"2. Определить, какие характеристики мобильных устройств наиболее сильно влияют на их рейтинг."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выполним разбиение на 3 выборки: обучающую, контрольную и тестовую"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 671\n",
"Размер контрольной выборки: 288\n",
"Размер тестовой выборки: 411\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Загрузка данных\n",
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
"\n",
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
"\n",
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
"\n",
"# Вывод размеров выборок\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение классов в company:\n",
"company\n",
"Vivo 186\n",
"Realme 186\n",
"Samsung 181\n",
"Motorola 127\n",
"Xiaomi 90\n",
"Honor 88\n",
"Poco 75\n",
"OnePlus 75\n",
"Huawei 62\n",
"iQOO 57\n",
"OPPO 38\n",
"Oppo 27\n",
"TCL 26\n",
"Google 23\n",
"Asus 21\n",
"POCO 19\n",
"Lava 19\n",
"Nothing 15\n",
"Lenovo 14\n",
"Tecno 13\n",
"itel 12\n",
"LG 6\n",
"Gionee 5\n",
"Itel 3\n",
"IQOO 1\n",
"Coolpad 1\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAmEAAAHHCAYAAAD3WI8lAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACIwklEQVR4nOzdeXwN1//48ddNyM2+WRIhRFZBEGsJYm1CbS0NmiKkqF1Rmo9dEUtttVaL0NprLa211tiXWCqIEKL2LSEqiWR+f/iZrytBRLhZ3s/HYx7NnTlz5n3mJu67Z849R6MoioIQQgghhPigDPQdgBBCCCFEXiRJmBBCCCGEHkgSJoQQQgihB5KECSGEEELogSRhQgghhBB6IEmYEEIIIYQeSBImhBBCCKEHkoQJIYQQQuiBJGFCCCGEEHogSZgQQgghhB5IEibEK4SFhaHRaNTN2NgYd3d3evbsyc2bN/UdnhBCiBwun74DECK7GzVqFCVLluTJkyfs3buX2bNn8+eff3L69GlMTU31HZ4QQogcSpIwId6gUaNGVK5cGYCvvvqKAgUKMHnyZNatW0fbtm31HJ0QQoicSh5HCvGW6tWrB8ClS5cAuHfvHgMGDMDLywtzc3MsLS1p1KgRJ06cSHPukydPGDFiBO7u7hgbG1OkSBE+++wzoqOjAYiJidF5BPryVqdOHbWunTt3otFoWL58Of/73/+wt7fHzMyMZs2aERsbm+baBw8exN/fHysrK0xNTfH19SU8PDzdNtapUyfd648YMSJN2d9++41KlSphYmKCra0tbdq0Sff6r2vbi1JTU5k6dSplypTB2NgYOzs7unbtyv3793XKOTk50aRJkzTX6dmzZ5o604t94sSJae4pQGJiIsOHD8fV1RWtVoujoyMDBw4kMTEx3Xv1ojp16qSpb8yYMRgYGLBkyZJM3Y8ffviBGjVqUKBAAUxMTKhUqRK///57utf/7bffqFq1KqamptjY2FC7dm22bNmiU+avv/7C19cXCwsLLC0tqVKlSprYVq5cqb6nBQsW5Msvv+Tff//VKRMUFKQTs42NDXXq1GHPnj1vvE/vci7A2bNnCQgIoFChQpiYmODh4cHgwYN1yhw/fpxGjRphaWmJubk59evX58CBAzplng852Lt3L71796ZQoUJYW1vTtWtXkpKSePDgAe3bt8fGxgYbGxsGDhyIoijq+c/fwx9++IEpU6ZQokQJTExM8PX15fTp0zrXOnnyJEFBQTg7O2NsbIy9vT2dOnXi7t27OuVGjBiBRqPhwoULBAUFYW1tjZWVFR07duTx48dqOV9fX8qXL5/u/fHw8MDPzy9D91Lol/SECfGWnidMBQoUAODixYusXbuWzz//nJIlS3Lz5k1++uknfH19OXPmDA4ODgCkpKTQpEkTtm/fTps2bejTpw8PHz5k69atnD59GhcXF/Uabdu2pXHjxjrXDQkJSTeeMWPGoNFoGDRoELdu3WLq1Kk0aNCAiIgITExMAPj7779p1KgRlSpVYvjw4RgYGLBgwQLq1avHnj17qFq1app6ixUrRmhoKACPHj2iW7du6V576NChBAQE8NVXX3H79m2mT59O7dq1OX78ONbW1mnO6dKlC7Vq1QJg9erVrFmzRud4165dCQsLo2PHjvTu3ZtLly4xY8YMjh8/Tnh4OPnz50/3PryNBw8eqG17UWpqKs2aNWPv3r106dIFT09PTp06xZQpUzh//jxr1659q+ssWLCAIUOGMGnSJL744ot0y7zpfkybNo1mzZoRGBhIUlISy5Yt4/PPP2fDhg188sknarmRI0cyYsQIatSowahRozAyMuLgwYP8/ffffPzxx8CzpKNTp06UKVOGkJAQrK2tOX78OJs2bVLje37vq1SpQmhoKDdv3mTatGmEh4eneU8LFizIlClTALh69SrTpk2jcePGxMbGpvvevyiz5548eZJatWqRP39+unTpgpOTE9HR0fzxxx+MGTMGgH/++YdatWphaWnJwIEDyZ8/Pz/99BN16tRh165dVKtWTafOXr16YW9vz8iRIzlw4ABz587F2tqaffv2Ubx4ccaOHcuff/7JxIkTKVu2LO3bt9c5f9GiRTx8+JAePXrw5MkTpk2bRr169Th16hR2dnYAbN26lYsXL9KxY0fs7e35559/mDt3Lv/88w8HDhxIk3wHBARQsmRJQkNDOXbsGL/88guFCxdm/PjxALRr147OnTtz+vRpypYtq553+PBhzp8/z5AhQ157/0U2oQgh0rVgwQIFULZt26bcvn1biY2NVZYtW6YUKFBAMTExUa5evaooiqI8efJESUlJ0Tn30qVLilarVUaNGqXumz9/vgIokydPTnOt1NRU9TxAmThxYpoyZcqUUXx9fdXXO3bsUAClaNGiSnx8vLp/xYoVCqBMmzZNrdvNzU3x8/NTr6MoivL48WOlZMmSSsOGDdNcq0aNGkrZsmXV17dv31YAZfjw4eq+mJgYxdDQUBkzZozOuadOnVLy5cuXZn9UVJQCKAsXLlT3DR8+XHnxn6E9e/YogLJ48WKdczdt2pRmf4kSJZRPPvkkTew9evRQXv6n7eXYBw4cqBQuXFipVKmSzj399ddfFQMDA2XPnj0658+ZM0cBlPDw8DTXe5Gvr69a38aNG5V8+fIp/fv3T7dsRu6Hojx7n16UlJSklC1bVqlXr55OXQYGBsqnn36a5nfx+Xv+4MEDxcLCQqlWrZry33//pVsmKSlJKVy4sFK2bFmdMhs2bFAAZdiwYeq+Dh06KCVKlNCpZ+7cuQqgHDp0KN02Z8W5tWvXViwsLJTLly+n2wZFUZQWLVooRkZGSnR0tLrv2rVrioWFhVK7dm113/O/8Zf/NqpXr65oNBrl66+/Vvc9ffpUKVasmM7vy/O/1xf/PVAURTl48KACKN9884267+X3UVEUZenSpQqg7N69W933/HegU6dOOmU//fRTpUCBAurrBw8eKMbGxsqgQYN0yvXu3VsxMzNTHj16lOZ6IvuRx5FCvEGDBg0oVKgQjo6OtGnTBnNzc9asWUPRokUB0Gq1GBg8+1NKSUnh7t27mJub4+HhwbFjx9R6Vq1aRcGCBenVq1eaa7z8f8Fvo3379lhYWKivW7VqRZEiRfjzzz8BiIiIICoqii+++IK7d+9y584d7ty5Q0JCAvXr12f37t2kpqbq1PnkyROMjY1fe93Vq1eTmppKQECAWuedO3ewt7fHzc2NHTt26JRPSkoCnt2vV1m5ciVWVlY0bNhQp85KlSphbm6eps7k5GSdcnfu3OHJkyevjfvff/9l+vTpDB06FHNz8zTX9/T0pFSpUjp1Pn8E/fL1X+XQoUMEBATQsmVLJk6cmG6ZjNwPQO3NBLh//z5xcXHUqlVL53dr7dq1pKamMmzYMPV38bnnv1tbt27l4cOHfPfdd2ne2+dljhw5wq1bt+jevbtOmU8++YRSpUqxceNGnfNSU1PVexQREcGiRYsoUqQInp6er21TZs+9ffs2u3fvplOnThQvXjzdNqSkpLBlyxZatGiBs7OzerxIkSJ88cUX7N27l/j4eJ1zg4ODdf4Gq1WrhqIoBAcHq/sMDQ2pXLkyFy9eTBNXixYt1H8PAKpWrUq1atXUv0HQfR+fPHnCnTt3+OijjwB03svnvv76a53XtWrV4u7du2rsVlZWNG/enKVLl6qPSFNSUli+fDktWrTAzMwsTZ0i+5HHkUK8wcyZM3F3dydfvnzY2dnh4eGh80GXmprKtGnTmDVrFpcuXSIlJUU99vyRJTx7jOnh4UG+fFn7Z+fm5qbzWqPR4OrqSkxMDABRUVEAdOjQ4ZV1xMXFYWNjo76+c+dOmnpfFhUVhaIoryz38mPDBw8eAKRJfF6uMy4ujsKFC6d7/NatWzqvt2zZQqFChV4b58uGDx+Og4MDXbt2TTO2KioqisjIyFfW+fL10/Pvv//yySefkJCQwN27d1+ZYGfkfgBs2LCB0aNHExERoTMu7cV6o6OjMTAwoHTp0q+s5/lj9BcfXb3s8uXLwLMxRS8rVaoUe/fu1dkXGxurc6+KFCn
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение классов в Обучающей выборке:\n",
"company\n",
"Vivo 138\n",
"Samsung 128\n",
"Realme 125\n",
"Motorola 89\n",
"Xiaomi 66\n",
"Honor 59\n",
"OnePlus 56\n",
"Poco 52\n",
"Huawei 46\n",
"iQOO 37\n",
"Oppo 21\n",
"OPPO 20\n",
"Google 16\n",
"Lava 16\n",
"POCO 14\n",
"TCL 14\n",
"Asus 12\n",
"Lenovo 12\n",
"itel 10\n",
"Nothing 8\n",
"Tecno 8\n",
"LG 5\n",
"Gionee 4\n",
"IQOO 1\n",
"Itel 1\n",
"Coolpad 1\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAmEAAAHHCAYAAAD3WI8lAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACOiklEQVR4nOzdd1gVR/vw8e8B5dBBLIAGRZqioqJYsTfsmmhQQ1SMUR+7sYbYjYoltlhjLKixd2MSscSKvWCJKIgSsRcUrICw7x++7M8joEhQ2v25rr307M7O3ruHcjMzZ0ajKIqCEEIIIYT4pPQyOwAhhBBCiNxIkjAhhBBCiEwgSZgQQgghRCaQJEwIIYQQIhNIEiaEEEIIkQkkCRNCCCGEyASShAkhhBBCZAJJwoQQQgghMoEkYUIIId7pyZMnRERE8OzZs8wORWSwx48fc+XKFV69epXZoeRKkoQJIYTQoSgKCxcupGrVqhgbG2Nubk7x4sX57bffMju0bOHGjRsEBASoryMiIli5cmXmBfSG+Ph4pkyZQrly5dBqteTLlw9nZ2f27NmT2aHlSpKEiQwREBCARqNRN0NDQ1xcXOjTpw93797N7PCEyNWCgoL4/PPPsba2RqvVYm9vT48ePbh+/XqK5b/66iv+97//4erqyooVK9i1axe7d+/miy+++MSRZ08ajYbevXsTGBhIREQEQ4cO5eDBg5kdFrGxsTRo0ICRI0dSp04d1q9fz65du/j777+pVq1aZoeXK+XJ7ABEzjJu3DiKFy/Oy5cvOXToEPPnz+fPP//kwoULGBsbZ3Z4QuQ6s2fPpn///jg4ONC3b19sbW0JCQlh0aJFrF27lj///JPq1aur5ZcvX87atWv57bff+OqrrzIx8uyrSJEidOvWjcaNGwNga2vLvn37MjcoYPLkyRw7dozAwEDq1KmT2eEIQCMLeIuMEBAQQJcuXThx4gQeHh7q/kGDBjF9+nRWrVpFhw4dMjFCIXKfoKAgatWqhaenJzt27ND5Qyg8PBxPT0/09PT4559/yJcvHwBubm6ULVs2y3SfZWfh4eE8ePCAMmXKYGJikqmxvHr1ikKFCtGzZ08mTJiQqbGI/yPdkeKjqlevHgDXrl0DICoqisGDB+Pm5oapqSnm5uY0adKEs2fPJjv35cuXjBkzBhcXFwwNDbG1teWLL74gPDwceD3O4s0u0Le3N//S27dvHxqNhrVr1/LDDz9gY2ODiYkJLVu2JDIyMtm1jx07RuPGjbGwsMDY2JjatWsTFBSU4j3WqVMnxeuPGTMmWdnffvuNihUrYmRkhJWVFe3bt0/x+u+6tzclJiYyc+ZMSpcujaGhIdbW1vTo0YNHjx7plLO3t6d58+bJrtOnT59kdaYU+9SpU5M9U3jdvTF69GicnJzQarXY2dkxdOhQYmNjU3xWb6pTp06y+iZMmICenh6rVq1K1/P46aefqF69Ovnz58fIyIiKFSuyYcOGFK//22+/UblyZYyNjcmXLx+1atVi586dOmX++usvateujZmZGebm5lSqVClZbOvXr1ff0wIFCvD1119z8+ZNnTK+vr46MefLl486deqkqYvqv5z7448/otFoWLZsWbKWaEdHR6ZMmcLt27f55ZdfAHj27BkXLlzAzs6OZs2aYW5ujomJSbLrXb16FY1Gw4wZM5Jd8/Dhw2g0GlavXg2k/D4nvZ9vjps6d+4cvr6+ODg4YGhoiI2NDd988w0PHz7UOTdp6ENERIS6LzAwkOrVq2NsbIyFhQXNmzfnwoULOueNGTMGjUbDgwcP1H0nT55MFgdAmTJlUmwp+uuvv6hZsyYmJiaYmZnRrFkz/vnnH50yvr6+2Nvbq8+4SpUqREVFYWRklCzulKT1/U7tezpJ0s+8pBa4y5cv8+jRI8zMzKhdu/Y7nxXAmTNnaNKkCebm5piamlK/fn2OHj2qUybpvThw4AA9evQgf/78mJub06lTpxR/Bvn6+urs6969O4aGhslaCdPynHMK6Y4UH1VSwpQ/f37g9Q/vLVu28OWXX1K8eHHu3r3LL7/8Qu3atbl48SKFCxcGICEhgebNm7Nnzx7at29P//79efLkCbt27eLChQs4Ojqq1+jQoQNNmzbVua6fn1+K8UyYMAGNRsOwYcO4d+8eM2fOpEGDBgQHB2NkZATA33//TZMmTahYsSKjR49GT0+PpUuXUq9ePQ4ePEjlypWT1fvZZ5/h7+8PwNOnT+nZs2eK1x45ciTe3t58++233L9/n9mzZ1OrVi3OnDmDpaVlsnO6d+9OzZo1Adi0aRObN2/WOd6jRw+1FbJfv35cu3aNOXPmcObMGYKCgsibN2+Kz+FDPH78WL23NyUmJtKyZUsOHTpE9+7dcXV15fz588yYMYPQ0FC2bNnyQddZunQpI0aMYNq0aal2g73vecyaNYuWLVvi4+NDXFwca9as4csvv2T79u00a9ZMLTd27FjGjBlD9erVGTduHAYGBhw7doy///6bRo0aAa9/wXzzzTeULl0aPz8/LC0tOXPmDDt27FDjS3r2lSpVwt/fn7t37zJr1iyCgoKSvacFChRQk5YbN24wa9YsmjZtSmRkZIrv/ZvSc+7z58/Zs2cPNWvWpHjx4imWadeuHd27d2f79u18//33asIzefJkbGxsGDJkCIaGhvz66680aNCAXbt2UatWLRwcHPD09GTlypV89913OnWuXLkSMzMzWrVq9c57etuuXbu4evUqXbp0wcbGhn/++YeFCxfyzz//cPTo0WQJd5KDBw/StGlTihUrxujRo4mPj2fevHl4enpy4sQJXFxcPiiO1KxYsYLOnTvj5eXF5MmTef78OfPnz6dGjRqcOXNGTbxSMmrUKF6+fJnma/2Xr5XUJL23fn5+ODs7M3bsWF6+fMncuXOTPat//vmHmjVrYm5uztChQ8mbNy+//PILderUYf/+/VSpUkWn7j59+mBpacmYMWO4fPky8+fP599//1UTwZSMHj2axYsXs3btWp2E978852xJESIDLF26VAGU3bt3K/fv31ciIyOVNWvWKPnz51eMjIyUGzduKIqiKC9fvlQSEhJ0zr127Zqi1WqVcePGqfuWLFmiAMr06dOTXSsxMVE9D1CmTp2arEzp0qWV2rVrq6/37t2rAEqRIkWUmJgYdf+6desUQJk1a5Zat7Ozs+Ll5aVeR1EU5fnz50rx4sWVhg0bJrtW9erVlTJlyqiv79+/rwDK6NGj1X0RERGKvr6+MmHCBJ1zz58/r+TJkyfZ/rCwMAVQli1bpu4bPXq08ua37MGDBxVAWblypc65O3bsSLa/WLFiSrNmzZLF3rt3b+XtHwNvxz506FClUKFCSsWKFXWe6YoVKxQ9PT3l4MGDOucvWLBAAZSgoKBk13tT7dq11fr++OMPJU+ePMqgQYNSLJuW56Eor9+nN8XFxSllypRR6tWrp1OXnp6e8vnnnyf7Wkx6zx8/fqyYmZkpVapUUV68eJFimbi4OKVQoUJKmTJldMps375dAZRRo0ap+zp37qwUK1ZMp56FCxcqgHL8+PEU7/m/nhscHKwASv/+/d9Zf9myZRUrKytFUf7ve8rAwEAJDQ1Vy9y/f1/Jnz+/UrFiRXXfL7/8ogBKSEiIui8uLk4pUKCA0rlzZ3Vf3bp1lVq1aulcM+k6S5cuVfe9/d4piqKsXr1aAZQDBw6o+5J+1ly7dk1RFEWpWLGiYmFhody5c0ctExoaquTNm1dp06aNui/p6+X+/fvqvhMnTiSLQ1GS//x48uSJYmlpqXTr1k2n3J07dxQLCwud/W+/XxcuXFD09PSUJk2a6MSdmrS+36l9TydJ+pm3d+9endcFChRQHjx4oJZL6Vm1bt1aMTAwUMLDw9V9t27dUszMzHTey6T3omLFikpcXJy6f8qUKQqgbN26VSfepK+LpK+d2bNn68T8Ic85p5DuSJGhGjRoQMGCBbGzs6N9+/aYmpqyefNmihQpAoBWq0VP7/WXXUJCAg8fPsTU1JQSJUpw+vRptZ6NGzdSoEAB+vbtm+waqf1llRadOnXCzMxMfd22bVtsbW3
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение классов в Контрольной выборке:\n",
"company\n",
"Realme 26\n",
"Samsung 26\n",
"Vivo 22\n",
"Motorola 18\n",
"Honor 15\n",
"OPPO 13\n",
"Poco 12\n",
"Xiaomi 11\n",
"iQOO 11\n",
"OnePlus 8\n",
"Huawei 7\n",
"Asus 7\n",
"TCL 6\n",
"POCO 5\n",
"Oppo 4\n",
"Google 4\n",
"Tecno 3\n",
"Nothing 3\n",
"itel 2\n",
"Lava 1\n",
"Lenovo 1\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAmEAAAHHCAYAAAD3WI8lAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB+u0lEQVR4nO3deVxN+f8H8NctdduvEi2ktAqVCEMoa9kNJkyDyDJkX6exr1nGnnUGZYbs25ghy1izb2FEiWjspJuKSp3fH36dr6ukErdbr+fjcR6653w+n/M+p3u7b5/P55wjEQRBABERERF9VWrKDoCIiIioNGISRkRERKQETMKIiIiIlIBJGBEREZESMAkjIiIiUgImYURERERKwCSMiIiISAmYhBEREREpQRllB0BERETFR3p6OhISEpCVlQVzc3Nlh1OisSeMiIjoK/jjjz8QFxcnvg4JCcGDBw+UF9B7Lly4gO+//x7GxsaQSqUwMzND586dlR1WicckjAokJCQEEolEXLS0tGBvb4/BgwfjyZMnyg6PqFSaMmUKJBIJnj9/nmPb+PHjIZFIMGTIECVERu87ceIExo4di7i4OISHhyMgIABqasr/Gt69ezcaNmyIGzduYObMmTh48CAOHjyIVatWKTu0Eo/DkVQo06ZNQ5UqVfDmzRucPHkSK1aswN9//43r169DR0dH2eEREYAlS5Zg1qxZ6Nq1KxYvXqzscEq9ESNGwNPTE1WqVAEAjBw5EmZmZkqNKSEhAX379oWXlxe2bt0KTU1NpcZT2jAJo0Jp1aoV3NzcAAB9+/ZFuXLlsGDBAuzevRvdu3dXcnREtHHjRgwfPhzNmzfH+vXri0WPS2lXtWpVxMbG4vr16zA2NoaNjY2yQ8K6devw5s0bhISEMAFTAn4qqUg0bdoUAHD37l0A7/53NXr0aDg5OUFPTw8GBgZo1aoVIiMjc9R98+YNpkyZAnt7e2hpacHMzAydOnVCbGwsACAuLk5hCPTDxdPTU2zr6NGjkEgk2Lx5M37++WeYmppCV1cX7du3R3x8fI59nz17Ft7e3pDJZNDR0YGHhwciIiJyPUZPT89c9z9lypQcZf/44w/Url0b2traMDIyQrdu3XLdf17H9r6srCwsWrQI1atXh5aWFkxMTDBgwAC8fPlSoZyVlRXatm2bYz+DBw/O0WZusc+bNy/HOQWAtLQ0TJ48Gba2tpBKpbCwsMDYsWORlpaW67l6n6enZ472Zs6cCTU1NWzcuLFQ5+OXX35BgwYNUK5cOWhra6N27drYtm1brvv/448/ULduXejo6MDQ0BCNGzfGgQMHFMrs27cPHh4e0NfXh4GBAerUqZMjtq1bt4q/U2NjY/zwww855vP4+fkpxGxoaAhPT0+cOHHik+fpc+p+KDw8HH5+fqhVqxZ27NiR65fr8uXLUb16dUilUpibmyMgIACJiYkKZTw9PVGjRo0cdX/55RdIJBJxfpOVlVWen1ErKysA//v9/vLLL1i4cCEsLS2hra0NDw8PXL9+Pcd+/vnnHzRq1Ai6urooW7YsOnTogKioqFyP+WMxHD16VKFMbp+P970f44dq1KiR47389OlT+Pv7w8TEBFpaWnBxcUFoaGiubYaEhEBXVxf16tWDjY0NAgICIJFI4Ofnl6+YshcNDQ1YWVlhzJgxSE9PF8tlTxe5cOHCR9v68PN45swZ1KxZE7NmzYKFhQWkUins7Owwe/ZsZGVlKdR9+/Ytpk+fDhsbG0ilUlhZWeHnn3/O8Xcg+zwfOHAANWvWhJaWFqpVq4YdO3YolMuO9/15cv/++y8MDQ3Rtm1bvH37VlyfmJiI4cOHizHa2tpizpw5OWJUNewJoyKRnTCVK1cOAHDnzh3s2rUL3333HapUqYInT55g1apV8PDwwI0bN8QrbjIzM9G2bVscPnwY3bp1w7Bhw/Dq1SscPHgQ169fV/ifYvfu3dG6dWuF/QYGBuYaz8yZMyGRSDBu3Dg8ffoUixYtQvPmzXHlyhVoa2sDePcHvlWrVqhduzYmT54MNTU1rFu3Dk2bNsWJEydQt27dHO1WqlQJQUFBAIDk5GQMHDgw131PnDgRPj4+6Nu3L549e4alS5eicePGuHz5MsqWLZujTv/+/dGoUSMAwI4dO7Bz506F7QMGDEBISAh69+6NoUOH4u7duwgODsbly5cREREBDQ2NXM9DQSQmJorH9r6srCy0b98eJ0+eRP/+/eHo6Ihr165h4cKFiI6Oxq5duwq0n3Xr1mHChAmYP38+vv/++1zLfOp8LF68GO3bt4evry/S09OxadMmfPfdd9i7dy/atGkjlps6dSqmTJmCBg0aYNq0adDU1MTZs2fxzz//oGXLlgDefRH06dMH1atXR2BgIMqWLYvLly9j//79YnzZ575OnToICgrCkydPsHjxYkREROT4nRobG2PhwoUAgP/++w+LFy9G69atER8fn+vv/n2fUzfbuXPn0LlzZ1hZWWHfvn3Q19fPUWbKlCmYOnUqmjdvjoEDB+LWrVtYsWIFzp8/X6j306JFi5CcnAwAiIqKwqxZs/Dzzz/D0dERAKCnp6dQfv369Xj16hUCAgLw5s0bLF68GE2bNsW1a9dgYmICADh06BBatWoFa2trTJkyBa9fv8bSpUvh7u6OS5cuiYnd+xo1aoT+/fsrxPElvX79Gp6enrh9+zYGDx6MKlWqYOvWrfDz80NiYiKGDRv20bq3b9/Gr7/+WqD9ZX8u0tLSEB4ejl9++QVaWlqYPn16oY/hxYsXOHnyJE6ePIk+ffqgdu3aOHz4MAIDAxEXF4eVK1eKZfv27YvQ0FB06dIFo0aNwtmzZxEUFISoqKgcn9GYmBh07doVP/74I3r16oV169bhu+++w/79+9GiRYtcY4mPj4e3tzeqVq2KLVu2oEyZdylKamoqPDw88ODBAwwYMACVK1fGqVOnEBgYiEePHmHRokWFPn6lE4gKYN26dQIA4dChQ8KzZ8+E+Ph4YdOmTUK5cuUEbW1t4b///hMEQRDevHkjZGZmKtS9e/euIJVKhWnTponr1q5dKwAQFixYkGNfWVlZYj0Awrx583KUqV69uuDh4SG+PnLkiABAqFixopCUlCSu37JliwBAWLx4sdi2nZ2d4OXlJe5HEAQhNTVVqFKlitCiRYsc+2rQoIFQo0YN8fWzZ88EAMLkyZPFdXFxcYK6urowc+ZMhbrXrl0TypQpk2N9TEyMAEAIDQ0V102ePFl4/6N54sQJAYCwYcMGhbr79+/Psd7S0lJo06ZNjtgDAgKEDz/uH8Y+duxYoUKFCkLt2rUVzunvv/8uqKmpCSdOnFCov3LlSgGAEBERkWN/7/Pw8BDb++uvv4QyZcoIo0aNyrVsfs6HILz7Pb0vPT1dqFGjhtC0aVOFttTU1IRvv/02x3sx+3eemJgo6OvrC/Xq1RNev36da5n09HShQoUKQo0aNRTK7N27VwAgTJo0SVzXq1cvwdLSUqGd1atXCwCEc+fO5XrMRVE3+xydOHFCKFeunABA6N+/f65lnz59KmhqagotW7ZUOC/BwcECAGHt2rXiOg8PD6F69eo52pg3b54AQLh7926ObdmfwSNHjuTYlv1Zfv9vhSAIwtmzZwUAwogRI8R1NWvWFCpUqCC8ePFCXBcZGSmoqakJPXv2zNF2xYoVhd69e+cZx8c+H7nFmJ+/N4sWLRIACH/88Ye4Lj09Xahfv76gp6cn/g3KbnPdunViOR8fH6FGjRqChYWF0KtXr3zF9H59QRAEc3NzoXXr1uLr7L/P58+f/2hb738es18DEKZMmaJQzs/PTwAgXLt2TRAEQbhy5YoAQOjbt69CudGjRwsAhH/++UdcZ2lpKQAQtm/fLq6Ty+WCmZmZ4OrqmiPeu3fvCgkJCUK1atUEBwcH4fnz5wr7mD59uqCrqytER0crrP/pp58EdXV14f79+x893uKOw5FUKM2bN0f58uVhYWGBbt26QU9PDzt37kTFihUBAFKpVJyDkpmZiRcvXkBPTw8ODg64dOmS2M727dthbGyc65VbHw5BFUTPnj0VegC6dOk
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение классов в Тестовой выборке:\n",
"company\n",
"Realme 35\n",
"Samsung 27\n",
"Vivo 26\n",
"Motorola 20\n",
"Honor 14\n",
"Xiaomi 13\n",
"Poco 11\n",
"OnePlus 11\n",
"Huawei 9\n",
"iQOO 9\n",
"TCL 6\n",
"OPPO 5\n",
"Nothing 4\n",
"Google 3\n",
"Lava 2\n",
"Asus 2\n",
"Oppo 2\n",
"Tecno 2\n",
"Itel 2\n",
"Gionee 1\n",
"Lenovo 1\n",
"LG 1\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAmEAAAHHCAYAAAD3WI8lAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACAAUlEQVR4nO3deVxN+f8H8Ndtu+2baCFFm5BkHRoiS2UfTJimEQ3GFiPLNHaD7MvIMsYSxi7bMGSZbNmyZBlJIrKv3VRU6vz+8Ot8XS2Scrv1ej4e56F7zud8zvt87qn79vl87jkSQRAEEBEREdEXpaLoAIiIiIjKIyZhRERERArAJIyIiIhIAZiEERERESkAkzAiIiIiBWASRkRERKQATMKIiIiIFIBJGBEREZECqCk6ACIiIioeb968wYsXL6CmpoZKlSopOhz6CPaEERERfURISAiSkpLE1wsWLEBqaqriAnrPoUOH0KlTJxgaGkJLSwuVK1fGsGHDFB0WFQKTMFKI0NBQSCQScdHU1IS9vT2GDBmCx48fKzo8onIlISFB7vexoCUhIUHR4SrE33//jUmTJiExMRHr16/H+PHjoaWlpeiwsGTJEnh4eEAmk2HhwoU4ePAgDh48iClTpig6NCoEDkeSQk2ZMgXVqlXDmzdvcOLECSxduhT//PMPrl69Cm1tbUWHR1QuVKxYEevWrZNbN3fuXNy7dw/z58/PVbY8+vXXX9GpUycsXLgQKioqmDt3LlRUFNuPERcXhxEjRqB///5YsmQJJBKJQuOhTyfhA7xJEUJDQ9GnTx9ERUWhQYMG4vrAwEDMmzcPGzZsQK9evRQYIVH51qFDB1y9erXc9nzlJSkpCTExMbC0tESVKlUUHQ6GDh2Kv//+G3FxcVBXV1d0OFQEHI6kUsXd3R0AcPv2bQDAixcvMHLkSDg5OUFXVxf6+vrw8vLCpUuXcu375s0bTJo0Cfb29tDU1IS5uTm6du2K+Ph4AB8fcmnRooVY15EjRyCRSLB582b8+uuvMDMzg46ODjp16oTExMRcxz5z5gw8PT1hYGAAbW1tuLm5ITIyMs9zbNGiRZ7HnzRpUq6yf/31F+rXrw8tLS0YGxujZ8+eeR6/oHN7X3Z2NhYsWIBatWpBU1MTpqamGDBgAF6+fClXztraGh06dMh1nCFDhuSqM6/YZ8+enatNASA9PR0TJ06Era0tpFIpLC0tMXr0aKSnp+fZVu9r0aJFrvqmTZsGFRUVbNiwoUjtMWfOHDRt2hQVKlSAlpYW6tevj23btuV5/L/++guNGjWCtrY2jIyM0Lx5cxw4cECuzL59++Dm5gY9PT3o6+ujYcOGuWLbunWr+J6amJjg+++/x/379+XK+Pn5ycVsZGSEFi1a4Pjx4x9tp8/Z91MkJSVh+PDhsLS0hFQqha2tLWbOnIns7Gy5ctnZ2Vi4cCGcnJygqamJihUrwtPTE+fOnQOAjw5/vv+eP3nyBP7+/jA1NYWmpiacnZ2xZs0aueN9+N6rq6vD2toao0aNQkZGhlzZW7du4dtvv4WxsTG0tbXx1VdfYe/evXJlcv4WHDlyBIaGhmjSpAmqVKmC9u3b5/t7m9f+OYtUKoW9vT2Cg4Pxfh/IpEmTIJFI8OzZs3zrsra2hp+fn/j69OnTqF+/PgYNGgRTU1NIpVLUrl0bf/75Z659U1NTERgYKL5fDg4OmDNnDj7sh5FIJBgyZAjWr18PBwcHaGpqon79+jh27JhcuZx43xcREQGpVIqffvpJbv39+/fRt29fMcZatWph1apVBbZbecHhSCpVchKmChUqAHj3R3Lnzp349ttvUa1aNTx+/Bh//PEH3NzccO3aNVhYWAAAsrKy0KFDBxw+fBg9e/bEsGHD8OrVKxw8eBBXr16FjY2NeIxevXqhXbt2cscNCgrKM55p06ZBIpFgzJgxePLkCRYsWIDWrVsjOjpanA/y77//wsvLC/Xr18fEiROhoqKC1atXw93dHcePH0ejRo1y1VulShUEBwcDAFJSUjBw4MA8jz1+/Hh4e3vjxx9/xNOnT7Fo0SI0b94cFy9ehKGhYa59+vfvj2bNmgEAtm/fjh07dshtHzBggNgLGRAQgNu3byMkJAQXL15EZGRksfxvOikpSTy392VnZ6NTp044ceIE+vfvD0dHR1y5cgXz58/HjRs3sHPnzk86zurVqzFu3DjMnTsX3333XZ5lPtYeCxcuRKdOneDj44OMjAxs2rQJ3377Lfbs2YP27duL5SZPnoxJkyahadOmmDJlCjQ0NHDmzBn8+++/aNu2LYB3vbt9+/ZFrVq1EBQUBENDQ1y8eBH79+8X48tp+4YNGyI4OBiPHz/GwoULERkZmes9NTExEYcC7927h4ULF6Jdu3ZITEzM871/3+fsWxhpaWlwc3PD/fv3MWDAAFStWhUnT55EUFAQHj58iAULFohl/f39ERoaCi8vL/z44494+/Ytjh8/jtOnT6NBgwZyw6DHjx/H8uXLMX/+fJiYmAAATE1NAQCvX79GixYtcPPmTQwZMgTVqlXD1q1b4efnh6SkpFwT0XPe+/T0dISHh2POnDnQ1NTEb7/9BgB4/PgxmjZtirS0NAQEBKBChQpYs2YNOnXqhG3btuGbb77J9/yPHTuGf/7555Pa7Ndff4WjoyNev34t/ueuUqVK8Pf3/6R63vf8+XOcO3cOampqGDx4MGxsbLBz5070798fz58/xy+//AIAEAQBnTp1QkREBPz9/VG3bl2Eh4dj1KhRuH//fq4h56NHj2Lz5s0ICAiAVCrFkiVL4OnpibNnz6J27dp5xnLp0iV06dIF7dq1w+LFi8X1jx8/xldffSUmdxUrVsS+ffvg7++P5ORkDB8+vMjnXyYIRAqwevVqAYBw6NAh4enTp0JiYqKwadMmoUKFCoKWlpZw7949QRAE4c2bN0JWVpbcvrdv3xakUqkwZcoUcd2qVasEAMK8efNyHSs7O1vcD4Awe/bsXGVq1aoluLm5ia8jIiIEAELlypWF5ORkcf2WLVsEAMLChQvFuu3s7AQPDw/xOIIgCGlpaUK1atWENm3a5DpW06ZNhdq1a4uvnz59KgAQJk6cKK5LSEgQVFVVhWnTpsnte+XKFUFNTS3X+ri4OAGAsGbNGnHdxIkThfd/xY8fPy4AENavXy+37/79+3Ott7KyEtq3b58r9sGDBwsf/tn4MPbRo0cLlSpVEurXry/XpuvWrRNUVFSE48ePy+2/bNkyAYAQGRmZ63jvc3NzE+vbu3evoKamJgQGBuZZtjDtIQjv3qf3ZWRkCLVr1xbc3d3l6lJRURG++eabXNdiznuelJQk6OnpCY0bNxZev36dZ5mMjAyhUqVKQu3ateXK7NmzRwAgTJgwQVzXu3dvwcrKSq6e5cuXCwCEs2fP5nnOxbHv+9q3b5+rnhy//faboKOjI9y4cUNu/S+//CKoqqoKd+/eFQRBEP79918BgBAQEJCrjvd/X3Lk/F24fft2rm0LFiwQAAh//fWXuC4jI0No0qSJoKurK/6e5vyer169Wm5/CwsLoV27duLr4cOHCwDkrsdXr14J1apVE6ytrcX3OudvQUREhFiucePGgpeXV65rPy957f/mzRtBRUVFGDRokLgu5/p8+vRpvnVZWVkJvXv3lnsNQAgNDRXXvX37VmjVqpUglUqFZ8+eCYIgCDt37hQACFOnTpWrr3v37oJEIhFu3rwprgMgABDOnTsnrrtz546gqakpfPPNN7niFYR3f6/Mzc2Fr7/+Otf17+/vL5ibm4ux5OjZs6dgYGCQ63ewvOFwJClU69atUbFiRVhaWqJnz57Q1dXFjh07ULlyZQCAVCoVJ79mZWXh+fPn0NXVhYODAy5cuCDWExYWBhMTEwwdOjTXMT5nsuoPP/wAPT098XX37t1hbm4u/i84OjoacXFx+O677/D8+XM8e/YMz549Q2pqKlq1aoVjx47lGp558+YNNDU1Czzu9u3bkZ2dDW9vb7HOZ8+ewczMDHZ2doiIiJArnzPMIpVK861z69atMDAwQJs2beTqrF+/PnR1dXPVmZmZKVf
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"# Загрузка данных\n",
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
"\n",
"# Проверка распределения классов в столбце company\n",
"class_distribution = df['company'].value_counts()\n",
"print(\"Распределение классов в company:\")\n",
"print(class_distribution)\n",
"\n",
"# Визуализация распределения классов\n",
"sns.countplot(y='company', data=df, order=class_distribution.index)\n",
"plt.title('Распределение классов в company')\n",
"plt.show()\n",
"\n",
"# Проверка сбалансированности для каждой выборки\n",
"def check_balance(df, title):\n",
" class_distribution = df['company'].value_counts()\n",
" print(f\"Распределение классов в {title}:\")\n",
" print(class_distribution)\n",
" sns.countplot(y='company', data=df, order=class_distribution.index)\n",
" plt.title(f'Распределение классов в {title}')\n",
" plt.show()\n",
"\n",
"# Разделение данных на обучающую, контрольную и тестовую выборки\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)\n",
"val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n",
"\n",
"# Проверка сбалансированности для обучающей, контрольной и тестовой выборок\n",
"check_balance(train_df, 'Обучающей выборке')\n",
"check_balance(val_df, 'Контрольной выборке')\n",
"check_balance(test_df, 'Тестовой выборке')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
" Данные по столбцу company являются несбалансированными. Некоторые компании, такие как Vivo, Realme, и Samsung, имеют значительно больше устройств, чем другие, такие как LG, Gionee, и Itel."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки до upsampling: 671\n",
"Размер контрольной выборки: 288\n",
"Размер тестовой выборки: 411\n",
"\n",
"Распределение классов в всем датасете:\n",
"Класс Vivo: 186 (13.58%)\n",
"Класс Realme: 186 (13.58%)\n",
"Класс Samsung: 181 (13.21%)\n",
"Класс Motorola: 127 (9.27%)\n",
"Класс Xiaomi: 90 (6.57%)\n",
"Класс Honor: 88 (6.42%)\n",
"Класс Poco: 75 (5.47%)\n",
"Класс OnePlus: 75 (5.47%)\n",
"Класс Huawei: 62 (4.53%)\n",
"Класс iQOO: 57 (4.16%)\n",
"Класс OPPO: 38 (2.77%)\n",
"Класс Oppo: 27 (1.97%)\n",
"Класс TCL: 26 (1.90%)\n",
"Класс Google: 23 (1.68%)\n",
"Класс Asus: 21 (1.53%)\n",
"Класс POCO: 19 (1.39%)\n",
"Класс Lava: 19 (1.39%)\n",
"Класс Nothing: 15 (1.09%)\n",
"Класс Lenovo: 14 (1.02%)\n",
"Класс Tecno: 13 (0.95%)\n",
"Класс itel: 12 (0.88%)\n",
"Класс LG: 6 (0.44%)\n",
"Класс Gionee: 5 (0.36%)\n",
"Класс Itel: 3 (0.22%)\n",
"Класс IQOO: 1 (0.07%)\n",
"Класс Coolpad: 1 (0.07%)\n",
"\n",
"Распределение классов в Обучающей выборке до upsampling:\n",
"Класс Vivo: 94 (14.01%)\n",
"Класс Samsung: 89 (13.26%)\n",
"Класс Realme: 82 (12.22%)\n",
"Класс Motorola: 66 (9.84%)\n",
"Класс Xiaomi: 46 (6.86%)\n",
"Класс Honor: 40 (5.96%)\n",
"Класс OnePlus: 40 (5.96%)\n",
"Класс Poco: 37 (5.51%)\n",
"Класс Huawei: 35 (5.22%)\n",
"Класс iQOO: 28 (4.17%)\n",
"Класс OPPO: 15 (2.24%)\n",
"Класс Oppo: 14 (2.09%)\n",
"Класс Lava: 12 (1.79%)\n",
"Класс Google: 12 (1.79%)\n",
"Класс TCL: 10 (1.49%)\n",
"Класс Lenovo: 9 (1.34%)\n",
"Класс POCO: 9 (1.34%)\n",
"Класс Asus: 8 (1.19%)\n",
"Класс itel: 7 (1.04%)\n",
"Класс Nothing: 5 (0.75%)\n",
"Класс Tecno: 5 (0.75%)\n",
"Класс LG: 3 (0.45%)\n",
"Класс Gionee: 3 (0.45%)\n",
"Класс Coolpad: 1 (0.15%)\n",
"Класс Itel: 1 (0.15%)\n",
"Размер обучающей выборки после upsampling: 2350\n",
"\n",
"Распределение классов в Обучающей выборке после upsampling:\n",
"Класс Realme: 94 (4.00%)\n",
"Класс Motorola: 94 (4.00%)\n",
"Класс Vivo: 94 (4.00%)\n",
"Класс Lava: 94 (4.00%)\n",
"Класс Lenovo: 94 (4.00%)\n",
"Класс TCL: 94 (4.00%)\n",
"Класс OPPO: 94 (4.00%)\n",
"Класс Honor: 94 (4.00%)\n",
"Класс Poco: 94 (4.00%)\n",
"Класс itel: 94 (4.00%)\n",
"Класс Oppo: 94 (4.00%)\n",
"Класс iQOO: 94 (4.00%)\n",
"Класс Samsung: 94 (4.00%)\n",
"Класс Xiaomi: 94 (4.00%)\n",
"Класс LG: 94 (4.00%)\n",
"Класс Huawei: 94 (4.00%)\n",
"Класс OnePlus: 94 (4.00%)\n",
"Класс Google: 94 (4.00%)\n",
"Класс Tecno: 94 (4.00%)\n",
"Класс Asus: 94 (4.00%)\n",
"Класс Gionee: 94 (4.00%)\n",
"Класс POCO: 94 (4.00%)\n",
"Класс Nothing: 94 (4.00%)\n",
"Класс Coolpad: 94 (4.00%)\n",
"Класс Itel: 94 (4.00%)\n",
"\n",
"Распределение классов в Контрольной выборке:\n",
"Класс Vivo: 44 (15.28%)\n",
"Класс Realme: 43 (14.93%)\n",
"Класс Samsung: 39 (13.54%)\n",
"Класс Motorola: 23 (7.99%)\n",
"Класс Xiaomi: 20 (6.94%)\n",
"Класс Honor: 19 (6.60%)\n",
"Класс OnePlus: 16 (5.56%)\n",
"Класс Poco: 15 (5.21%)\n",
"Класс Huawei: 11 (3.82%)\n",
"Класс iQOO: 9 (3.12%)\n",
"Класс Oppo: 7 (2.43%)\n",
"Класс POCO: 5 (1.74%)\n",
"Класс OPPO: 5 (1.74%)\n",
"Класс Google: 4 (1.39%)\n",
"Класс Asus: 4 (1.39%)\n",
"Класс TCL: 4 (1.39%)\n",
"Класс Lava: 4 (1.39%)\n",
"Класс itel: 3 (1.04%)\n",
"Класс Nothing: 3 (1.04%)\n",
"Класс Tecno: 3 (1.04%)\n",
"Класс Lenovo: 3 (1.04%)\n",
"Класс LG: 2 (0.69%)\n",
"Класс Gionee: 1 (0.35%)\n",
"Класс IQOO: 1 (0.35%)\n",
"\n",
"Распределение классов в Тестовой выборке:\n",
"Класс Realme: 61 (14.84%)\n",
"Класс Samsung: 53 (12.90%)\n",
"Класс Vivo: 48 (11.68%)\n",
"Класс Motorola: 38 (9.25%)\n",
"Класс Honor: 29 (7.06%)\n",
"Класс Xiaomi: 24 (5.84%)\n",
"Класс Poco: 23 (5.60%)\n",
"Класс iQOO: 20 (4.87%)\n",
"Класс OnePlus: 19 (4.62%)\n",
"Класс OPPO: 18 (4.38%)\n",
"Класс Huawei: 16 (3.89%)\n",
"Класс TCL: 12 (2.92%)\n",
"Класс Asus: 9 (2.19%)\n",
"Класс Google: 7 (1.70%)\n",
"Класс Nothing: 7 (1.70%)\n",
"Класс Oppo: 6 (1.46%)\n",
"Класс POCO: 5 (1.22%)\n",
"Класс Tecno: 5 (1.22%)\n",
"Класс Lava: 3 (0.73%)\n",
"Класс Lenovo: 2 (0.49%)\n",
"Класс itel: 2 (0.49%)\n",
"Класс Itel: 2 (0.49%)\n",
"Класс LG: 1 (0.24%)\n",
"Класс Gionee: 1 (0.24%)\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"\n",
"# Загрузка данных\n",
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
"\n",
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
"\n",
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
"\n",
"# Вывод размеров выборок\n",
"print(\"Размер обучающей выборки до upsampling:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))\n",
"\n",
"# Функция для проверки балансировки данных\n",
"def check_balance(df, title):\n",
" class_distribution = df['company'].value_counts()\n",
" print(f\"\\nРаспределение классов в {title}:\")\n",
" for cls, count in class_distribution.items():\n",
" print(f\"Класс {cls}: {count} ({count / len(df) * 100:.2f}%)\")\n",
"\n",
"# Проверка балансировки для всего датасета\n",
"check_balance(df, 'всем датасете')\n",
"\n",
"# Проверка балансировки для обучающей выборки до upsampling\n",
"check_balance(train_df, 'Обучающей выборке до upsampling')\n",
"\n",
"# Применение upsampling к обучающей выборке\n",
"X_train = train_df.drop('company', axis=1) # Отделяем признаки от целевой переменной\n",
"y_train = train_df['company'] # Целевая переменная\n",
"\n",
"# Инициализация RandomOverSampler\n",
"ros = RandomOverSampler(random_state=42)\n",
"\n",
"# Применение upsampling\n",
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
"\n",
"# Создание нового DataFrame с балансированными данными\n",
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
"\n",
"# Вывод размеров выборок после upsampling\n",
"print(\"Размер обучающей выборки после upsampling:\", len(train_df_resampled))\n",
"\n",
"# Проверка балансировки для обучающей выборки после upsampling\n",
"check_balance(train_df_resampled, 'Обучающей выборке после upsampling')\n",
"\n",
"# Проверка балансировки для контрольной и тестовой выборок (они не должны измениться)\n",
"check_balance(val_df, 'Контрольной выборке')\n",
"check_balance(test_df, 'Тестовой выборке')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Данные были сбалансированы. Теперь можно перейти к конструированию признаков. Поставлены следующие задачи:\n",
"1. Классифицировать мобильные устройства по ценовым категориям (например, бюджетные, средний класс, флагманы).\n",
"2. Определить, какие характеристики мобильных устройств наиболее сильно влияют на их рейтинг."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"\n",
"# Определение категориальных признаков\n",
"categorical_features = [\n",
" 'Rating', 'Ram',\n",
" 'Battery', 'Display', 'Camera', 'External_Memory', 'Android_version',\n",
" 'Price', 'company', 'Inbuilt_memory', 'fast_charging',\n",
" 'Screen_resolution', 'Processor'\n",
"]\n",
"\n",
"# Применение one-hot encoding к обучающей выборке\n",
"train_df_resampled_encoded = pd.get_dummies(train_df_resampled, columns=categorical_features)\n",
"\n",
"# Применение one-hot encoding к контрольной выборке\n",
"val_df_encoded = pd.get_dummies(val_df, columns=categorical_features)\n",
"\n",
"# Применение one-hot encoding к тестовой выборке\n",
"test_df_encoded = pd.get_dummies(test_df, columns=categorical_features)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Дискретизация числовых признаков"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки после балансировки: (5600, 22)\n",
"Размер контрольной выборки: (288, 22)\n",
"Размер тестовой выборки: (411, 22)\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"import re\n",
"\n",
"# Загрузка данных\n",
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
"\n",
"# Извлечение числовых значений из столбца Battery\n",
"df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n",
"df['Ram'] = df['Ram'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n",
"df['Camera'] = df['Camera'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n",
"\n",
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
"\n",
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
"\n",
"# Применение upsampling к обучающей выборке (если это необходимо)\n",
"X_train = train_df.drop('Price', axis=1) # Отделяем признаки от целевой переменной\n",
"y_train = train_df['Price'] # Целевая переменная\n",
"\n",
"# Инициализация RandomOverSampler\n",
"ros = RandomOverSampler(random_state=42)\n",
"\n",
"# Применение upsampling\n",
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
"\n",
"# Создание нового DataFrame с балансированными данными\n",
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
"\n",
"# Определение числовых признаков для дискретизации\n",
"numerical_features = ['Spec_score', 'Battery', 'Ram', 'Camera' ]\n",
"\n",
"# Функция для дискретизации числовых признаков\n",
"def discretize_features(df, features, bins=5, labels=False):\n",
" for feature in features:\n",
" try:\n",
" # Заполнение NaN значений, если они есть\n",
" df[feature] = df[feature].fillna(df[feature].median())\n",
" df[f'{feature}_bin'] = pd.cut(df[feature], bins=bins, labels=labels)\n",
" except Exception as e:\n",
" print(f\"Ошибка при дискретизации признака {feature}: {e}\")\n",
" return df\n",
"\n",
"# Применение дискретизации к обучающей, контрольной и тестовой выборкам\n",
"train_df_resampled = discretize_features(train_df_resampled, numerical_features)\n",
"val_df = discretize_features(val_df, numerical_features)\n",
"test_df = discretize_features(test_df, numerical_features)\n",
"\n",
"# Вывод размеров выборок\n",
"print(\"Размер обучающей выборки после балансировки:\", train_df_resampled.shape)\n",
"print(\"Размер контрольной выборки:\", val_df.shape)\n",
"print(\"Размер тестовой выборки:\", test_df.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Ручной синтез. Создание новых признаков на основе экспертных знаний и логики предметной области."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки после балансировки: (5600, 19)\n",
"Размер контрольной выборки: (288, 19)\n",
"Размер тестовой выборки: (411, 19)\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"\n",
"# Загрузка данных\n",
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
"\n",
"# Преобразование столбца Battery в числовой формат\n",
"df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n",
"\n",
"# Преобразование столбцов Camera и Display в числовой формат\n",
"df['Camera'] = pd.to_numeric(df['Camera'], errors='coerce')\n",
"df['Display'] = pd.to_numeric(df['Display'], errors='coerce')\n",
"\n",
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
"\n",
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
"\n",
"# Применение upsampling к обучающей выборке (если это необходимо)\n",
"X_train = train_df.drop('Price', axis=1) # Отделяем признаки от целевой переменной\n",
"y_train = train_df['Price'] # Целевая переменная\n",
"\n",
"# Инициализация RandomOverSampler\n",
"ros = RandomOverSampler(random_state=42)\n",
"\n",
"# Применение upsampling\n",
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
"\n",
"# Создание нового DataFrame с балансированными данными\n",
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
"\n",
"# Создание нового признака \"Camera_to_Display_Ratio\" на основе признаков \"Camera\" и \"Display\"\n",
"train_df_resampled['Camera_to_Display_Ratio'] = train_df_resampled['Camera'] / train_df_resampled['Display']\n",
"val_df['Camera_to_Display_Ratio'] = val_df['Camera'] / val_df['Display']\n",
"test_df['Camera_to_Display_Ratio'] = test_df['Camera'] / test_df['Display']\n",
"\n",
"# Вывод размеров выборок\n",
"print(\"Размер обучающей выборки после балансировки:\", train_df_resampled.shape)\n",
"print(\"Размер контрольной выборки:\", val_df.shape)\n",
"print(\"Размер тестовой выборки:\", test_df.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Масштабирование признаков - это процесс преобразования числовых признаков таким образом, чтобы они имели одинаковый масштаб. Это важно для многих алгоритмов машинного обучения, которые чувствительны к масштабу признаков, таких как линейная регрессия, метод опорных векторов (SVM) и нейронные сети."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки после балансировки: (5600, 19)\n",
"Размер контрольной выборки: (288, 19)\n",
"Размер тестовой выборки: (411, 19)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\utils\\extmath.py:1137: RuntimeWarning: invalid value encountered in divide\n",
" updated_mean = (last_sum + new_sum) / updated_sample_count\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\utils\\extmath.py:1142: RuntimeWarning: invalid value encountered in divide\n",
" T = new_sum / new_sample_count\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\utils\\extmath.py:1162: RuntimeWarning: invalid value encountered in divide\n",
" new_unnormalized_variance -= correction**2 / new_sample_count\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"from sklearn.preprocessing import StandardScaler\n",
"import re\n",
"\n",
"# Загрузка данных\n",
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
"\n",
"# Преобразование столбца Battery в числовой формат\n",
"df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n",
"\n",
"# Преобразование столбцов Camera и Display в числовой формат\n",
"df['Camera'] = pd.to_numeric(df['Camera'], errors='coerce')\n",
"df['Display'] = pd.to_numeric(df['Display'], errors='coerce')\n",
"\n",
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
"\n",
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
"\n",
"# Применение upsampling к обучающей выборке (если это необходимо)\n",
"X_train = train_df.drop('Price', axis=1) # Отделяем признаки от целевой переменной\n",
"y_train = train_df['Price'] # Целевая переменная\n",
"\n",
"# Инициализация RandomOverSampler\n",
"ros = RandomOverSampler(random_state=42)\n",
"\n",
"# Применение upsampling\n",
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
"\n",
"# Создание нового DataFrame с балансированными данными\n",
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
"\n",
"# Создание нового признака \"Camera_to_Display_Ratio\" на основе признаков \"Camera\" и \"Display\"\n",
"train_df_resampled['Camera_to_Display_Ratio'] = train_df_resampled['Camera'] / train_df_resampled['Display']\n",
"val_df['Camera_to_Display_Ratio'] = val_df['Camera'] / val_df['Display']\n",
"test_df['Camera_to_Display_Ratio'] = test_df['Camera'] / test_df['Display']\n",
"\n",
"# Определение числовых признаков для масштабирования\n",
"numerical_features_to_scale = ['Spec_score', 'No_of_sim', 'Ram', 'Battery', 'Display', 'Camera', 'Inbuilt_memory', 'Screen_resolution', 'Camera_to_Display_Ratio']\n",
"\n",
"# Удаление строковых значений из числовых признаков\n",
"for feature in numerical_features_to_scale:\n",
" train_df_resampled[feature] = pd.to_numeric(train_df_resampled[feature], errors='coerce')\n",
" val_df[feature] = pd.to_numeric(val_df[feature], errors='coerce')\n",
" test_df[feature] = pd.to_numeric(test_df[feature], errors='coerce')\n",
"\n",
"# Инициализация StandardScaler\n",
"scaler = StandardScaler()\n",
"\n",
"# Масштабирование числовых признаков в обучающей выборке\n",
"train_df_resampled[numerical_features_to_scale] = scaler.fit_transform(train_df_resampled[numerical_features_to_scale])\n",
"\n",
"# Масштабирование числовых признаков в контрольной и тестовой выборках\n",
"val_df[numerical_features_to_scale] = scaler.transform(val_df[numerical_features_to_scale])\n",
"test_df[numerical_features_to_scale] = scaler.transform(test_df[numerical_features_to_scale])\n",
"\n",
"# Вывод размеров выборок\n",
"print(\"Размер обучающей выборки после балансировки:\", train_df_resampled.shape)\n",
"print(\"Размер контрольной выборки:\", val_df.shape)\n",
"print(\"Размер тестовой выборки:\", test_df.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Конструирование признаков с применением фреймворка Featuretools"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Обучающая выборка после конструирования признаков:\n",
" Unnamed: 0 Rating Spec_score No_of_sim Ram \\\n",
"id \n",
"0 305 4.70 86 Dual Sim, 3G, 4G, 5G, VoLTE, 12 GB RAM \n",
"1 941 4.45 71 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM \n",
"2 800 4.20 68 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM \n",
"3 97 4.25 69 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM \n",
"4 1339 4.30 74 Dual Sim, 3G, 4G, VoLTE, 6 GB RAM \n",
"\n",
" Battery External_Memory Android_version company \\\n",
"id \n",
"0 5000 Android v12 NaN Realme \n",
"1 5000 Memory Card Supported, upto 1 TB 12 Motorola \n",
"2 5000 Memory Card Supported 12 Vivo \n",
"3 5000 Memory Card Supported 12 Vivo \n",
"4 5000 Memory Card Supported, upto 256 GB 12 Lava \n",
"\n",
" Inbuilt_memory fast_charging \\\n",
"id \n",
"0 256 GB inbuilt 65W Fast Charging \n",
"1 64 GB inbuilt 10W Fast Charging \n",
"2 64 GB inbuilt 10W Fast Charging \n",
"3 128 GB inbuilt 10W Fast Charging \n",
"4 128 GB inbuilt NaN \n",
"\n",
" Screen_resolution Processor \n",
"id \n",
"0 1080 x 2400 px Octa Core \n",
"1 720 x 1600 px Octa Core \n",
"2 720 x 1600 px Display with Water Drop Notch Octa Core \n",
"3 720 x 1600 px Display with Water Drop Notch Octa Core \n",
"4 1600 x 720 px Octa Core \n",
"Контрольная выборка после конструирования признаков:\n",
" Unnamed: 0 Rating Spec_score No_of_sim Ram \\\n",
"id \n",
"1028 <NA> NaN <NA> NaN NaN \n",
"825 <NA> NaN <NA> NaN NaN \n",
"900 <NA> NaN <NA> NaN NaN \n",
"702 <NA> NaN <NA> NaN NaN \n",
"230 1050 4.05 90 Dual Sim, 3G, 4G, 5G, VoLTE, 8 GB RAM \n",
"\n",
" Battery External_Memory Android_version company Inbuilt_memory \\\n",
"id \n",
"1028 <NA> NaN NaN NaN NaN \n",
"825 <NA> NaN NaN NaN NaN \n",
"900 <NA> NaN NaN NaN NaN \n",
"702 <NA> NaN NaN NaN NaN \n",
"230 4500 Android v12 NaN Motorola 128 GB inbuilt \n",
"\n",
" fast_charging Screen_resolution Processor \n",
"id \n",
"1028 NaN NaN NaN \n",
"825 NaN NaN NaN \n",
"900 NaN NaN NaN \n",
"702 NaN NaN NaN \n",
"230 125W Fast Charging 1080 x 2400 px Octa Core \n",
"Тестовая выборка после конструирования признаков:\n",
" Unnamed: 0 Rating Spec_score No_of_sim \\\n",
"id \n",
"427 187 4.40 91 Dual Sim, 3G, 4G, 5G, VoLTE, \n",
"1088 <NA> NaN <NA> NaN \n",
"668 592 4.45 91 Dual Sim, 3G, 4G, 5G, VoLTE, \n",
"572 1130 4.60 75 Dual Sim, 3G, 4G, VoLTE, \n",
"115 117 4.60 72 Dual Sim, 3G, 4G, VoLTE, \n",
"\n",
" Ram Battery External_Memory Android_version \\\n",
"id \n",
"427 12 GB RAM 5000 Memory Card Not Supported 14 \n",
"1088 NaN <NA> NaN NaN \n",
"668 12 GB RAM 4500 Android v12 NaN \n",
"572 6 GB RAM 5000 Memory Card Supported, upto 1 TB 13 \n",
"115 4 GB RAM 5000 Memory Card Supported, upto 1 TB 12 \n",
"\n",
" company Inbuilt_memory fast_charging \\\n",
"id \n",
"427 Vivo 256 GB inbuilt 120W Fast Charging \n",
"1088 NaN NaN NaN \n",
"668 Honor 256 GB inbuilt 100W Fast Charging \n",
"572 Xiaomi 128 GB inbuilt 18W Fast Charging \n",
"115 Vivo 64 GB inbuilt 18W Fast Charging \n",
"\n",
" Screen_resolution Processor \n",
"id \n",
"427 1260 x 2800 px Octa Core \n",
"1088 NaN NaN \n",
"668 1200 x 2652 px Octa Core \n",
"572 720 x 1600 px Octa Core \n",
"115 720 x 1612 px Display with Water Drop Notch Octa Core \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n",
" warnings.warn(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
" warnings.warn(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"import featuretools as ft\n",
"import re\n",
"\n",
"# Загрузка данных\n",
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
"\n",
"# Преобразование столбца Battery в числовой формат\n",
"df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n",
"\n",
"# Преобразование столбцов Camera и Display в числовой формат\n",
"df['Camera'] = pd.to_numeric(df['Camera'], errors='coerce')\n",
"df['Display'] = pd.to_numeric(df['Display'], errors='coerce')\n",
"\n",
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
"\n",
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
"\n",
"# Создание нового признака \"Camera_to_Display_Ratio\" на основе признаков \"Camera\" и \"Display\"\n",
"train_df['Camera_to_Display_Ratio'] = train_df['Camera'] / train_df['Display']\n",
"val_df['Camera_to_Display_Ratio'] = val_df['Camera'] / val_df['Display']\n",
"test_df['Camera_to_Display_Ratio'] = test_df['Camera'] / test_df['Display']\n",
"\n",
"# Определение сущностей\n",
"es = ft.EntitySet(id='mobile_data')\n",
"es = es.add_dataframe(dataframe_name='train', dataframe=train_df, index='id')\n",
"\n",
"# Генерация признаков\n",
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='train', max_depth=2)\n",
"\n",
"# Преобразование признаков для контрольной и тестовой выборок\n",
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_df.index)\n",
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_df.index)\n",
"\n",
"# Вывод первых нескольких строк для проверки\n",
"print(\"Обучающая выборка после конструирования признаков:\")\n",
"print(feature_matrix.head())\n",
"print(\"Контрольная выборка после конструирования признаков:\")\n",
"print(val_feature_matrix.head())\n",
"print(\"Тестовая выборка после конструирования признаков:\")\n",
"print(test_feature_matrix.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Оценка качества каждого набора признаков\n",
"\n",
"Предсказательная способность Метрики: RMSE, MAE, R²\n",
"\n",
"Методы: Обучение модели на обучающей выборке и оценка на контрольной и тестовой выборках.\n",
"\n",
"Скорость вычисления Методы: Измерение времени выполнения генерации признаков и обучения модели.\n",
"\n",
"Надежность Методы: Кросс-валидация, анализ чувствительности модели к изменениям в данных.\n",
"\n",
"Корреляция Методы: Анализ корреляционной матрицы признаков, удаление мультиколлинеарных признаков.\n",
"\n",
"Цельность Методы: Проверка логической связи между признаками и целевой переменной, интерпретация результатов модели."
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 671\n",
"Размер контрольной выборки: 288\n",
"Размер тестовой выборки: 411\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n",
" warnings.warn(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Feature Importance:\n",
" feature importance\n",
"4 Price 0.999443\n",
"2 Spec_score 0.000227\n",
"3 Battery 0.000146\n",
"0 Unnamed: 0 0.000146\n",
"1 Rating 0.000039\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"import featuretools as ft\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"import re\n",
"\n",
"# Загрузка данных\n",
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
"\n",
"# Преобразование столбца Battery в числовой формат\n",
"df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n",
"\n",
"# Преобразование столбца Display в числовой формат\n",
"df['Camera'] = pd.to_numeric(df['Camera'], errors='coerce')\n",
"df['Display'] = pd.to_numeric(df['Display'], errors='coerce')\n",
"df['Inbuilt_memory'] = pd.to_numeric(df['Inbuilt_memory'], errors='coerce')\n",
"df['fast_charging'] = pd.to_numeric(df['fast_charging'], errors='coerce')\n",
"\n",
"# Удаление запятых из столбца Price и преобразование в числовой формат\n",
"df['Price'] = df['Price'].str.replace(',', '').astype(float)\n",
"\n",
"# Удаление столбцов с текстовыми значениями, которые не могут быть преобразованы в числа\n",
"df = df.drop(columns=['Name', 'company', 'Android_version', 'Processor_name', 'External_Memory', 'No_of_sim', 'Ram', 'Screen_resolution', 'Processor' ])\n",
"\n",
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
"\n",
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
"\n",
"# Вывод размеров выборок\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))\n",
"\n",
"# Применение upsampling к обучающей выборке (если это необходимо)\n",
"X_train = train_df.drop('Price', axis=1) # Отделяем признаки от целевой переменной\n",
"y_train = train_df['Price'] # Целевая переменная\n",
"\n",
"# Инициализация RandomOverSampler\n",
"ros = RandomOverSampler(random_state=42)\n",
"\n",
"# Применение upsampling\n",
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
"\n",
"# Создание нового DataFrame с балансированными данными\n",
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
"\n",
"# Определение сущностей\n",
"es = ft.EntitySet(id='mobile_data')\n",
"es = es.add_dataframe(dataframe_name='mobile', dataframe=train_df_resampled, index='id')\n",
"\n",
"# Генерация признаков\n",
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='mobile', max_depth=2)\n",
"\n",
"# Преобразование признаков для контрольной и тестовой выборок\n",
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_df.index)\n",
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_df.index)\n",
"\n",
"# Оценка важности признаков\n",
"X = feature_matrix\n",
"y = train_df_resampled['Price']\n",
"\n",
"# Разделение данных на обучающую и тестовую выборки\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"# Обучение модели\n",
"model = RandomForestRegressor(n_estimators=100, random_state=42)\n",
"model.fit(X_train, y_train)\n",
"\n",
"# Получение важности признаков\n",
"importances = model.feature_importances_\n",
"feature_names = feature_matrix.columns\n",
"\n",
"# Сортировка признаков по важности\n",
"feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importances})\n",
"feature_importance = feature_importance.sort_values(by='importance', ascending=False)\n",
"\n",
"print(\"Feature Importance:\")\n",
"print(feature_importance)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 66\n",
"Размер контрольной выборки: 29\n",
"Размер тестовой выборки: 42\n",
"Mean Squared Error: 13048795.366100002\n",
"R2 Score: -0.23881710583662308\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n",
" warnings.warn(\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cross-validated Mean Squared Error: 394482934.1724652\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA5gAAAIjCAYAAABmsrS/AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABGCElEQVR4nO3deVxV1f7/8fcB5ICMTgwagoRjaYam4axZOORQXc0hpxxuOZR59aopKk6Z6XXIpHKiwTRtHtQy0yy1LKfUHBGHTK0cQDRBYf/+6Of5dkJNcMERfD0fj/N4cNZee+3P3isu9+3aZx+bZVmWAAAAAAC4QW6uLgAAAAAAUDgQMAEAAAAARhAwAQAAAABGEDABAAAAAEYQMAEAAAAARhAwAQAAAABGEDABAAAAAEYQMAEAAAAARhAwAQAAAABGEDABAAAAAEYQMAEABU5iYqJsNtsVX8OGDcuTY65fv15jxozRmTNn8mT8G3H5evzwww+uLiXXZs+ercTERFeXAQC4QR6uLgAAgNwaO3asypUr59R255135smx1q9fr/j4eHXv3l2BgYF5coxb2ezZs1WyZEl1797d1aUAAG4AARMAUGA1b95cNWvWdHUZN+TcuXPy8fFxdRkuc/78eRUtWtTVZQAADOEWWQBAobV8+XLVr19fPj4+8vPzU8uWLbVz506nPj/++KO6d++uyMhIeXl5KSQkRI8//rhOnjzp6DNmzBgNGTJEklSuXDnH7bgHDx7UwYMHZbPZrnh7p81m05gxY5zGsdls+umnn9SpUycVK1ZM9erVc2x/8803VaNGDXl7e6t48eLq0KGDjhw5kqtz7969u3x9fXX48GE9+OCD8vX1VZkyZfTSSy9JkrZv364mTZrIx8dH4eHheuutt5z2v3zb7dq1a/Xvf/9bJUqUkL+/v7p27arTp09nO97s2bN1xx13yG63q3Tp0urXr1+224kbNWqkO++8U5s2bVKDBg1UtGhRPfvss4qIiNDOnTv11VdfOa5to0aNJEmnTp3S4MGDVbVqVfn6+srf31/NmzfXtm3bnMZes2aNbDablixZogkTJui2226Tl5eX7rvvPu3fvz9bvd99951atGihYsWKycfHR9WqVdOMGTOc+uzevVv/+te/VLx4cXl5ealmzZr66KOPcjoVAHBLYQUTAFBgpaSk6Pfff3dqK1mypCTpjTfeULdu3RQbG6vnn39e58+fV0JCgurVq6ctW7YoIiJCkrRy5UodOHBAPXr0UEhIiHbu3KlXX31VO3fu1LfffiubzaaHH35Ye/fu1aJFizRt2jTHMUqVKqXffvstx3W3a9dO5cuX18SJE2VZliRpwoQJiouLU/v27dWrVy/99ttvevHFF9WgQQNt2bIlV7flZmZmqnnz5mrQoIEmT56shQsXqn///vLx8dGIESPUuXNnPfzww3r55ZfVtWtXxcTEZLvluH///goMDNSYMWO0Z88eJSQk6NChQ45AJ/0ZnOPj49W0aVM9+eSTjn7ff/+91q1bpyJFijjGO3nypJo3b64OHTroscceU3BwsBo1aqQBAwbI19dXI0aMkCQFBwdLkg4cOKAPPvhA7dq1U7ly5XTixAm98soratiwoX766SeVLl3aqd5JkybJzc1NgwcPVkpKiiZPnqzOnTvru+++c/RZuXKlHnzwQYWGhurpp59WSEiIdu3apU8++URPP/20JGnnzp2qW7euypQpo2HDhsnHx0dLlixR27Zt9e677+qhhx7K8XwAwC3BAgCggFmwYIEl6Yovy7Kss2fPWoGBgVbv3r2d9jt+/LgVEBDg1H7+/Pls4y9atMiSZK1du9bR9sILL1iSrOTkZKe+ycnJliRrwYIF2caRZI0ePdrxfvTo0ZYkq2PHjk79Dh48aLm7u1sTJkxwat++fbvl4eGRrf1q1+P77793tHXr1s2SZE2cONHRdvr0acvb29uy2WzW4sWLHe27d+/OVuvlMWvUqGFlZGQ42idPnmxJsj788EPLsizr119/tTw9Pa0HHnjAyszMdPSbNWuWJcmaP3++o61hw4aWJOvll1/Odg533HGH1bBhw2ztFy5ccBrXsv685na73Ro7dqyjbfXq1ZYkq3LlylZ6erqjfcaMGZYka/v27ZZlWdalS5escuXKWeHh4dbp06edxs3KynL8fN9991lVq1a1Lly44LS9Tp06Vvny5bPVCQD4E7fIAgAKrJdeekkrV650ekl/rlCdOXNGHTt21O+//+54ubu7q3bt2lq9erVjDG9vb8fPFy5c0O+//657771XkrR58+Y8qfuJJ55wev/ee+8pKytL7du3d6o3JCRE5cuXd6o3p3r16uX4OTAwUBUrVpSPj4/at2/vaK9YsaICAwN14MCBbPv36dPHaQXyySeflIeHh5YtWyZJ+uKLL5SRkaGBAwfKze3//m9F79695e/vr08//dRpPLvdrh49elx3/Xa73TFuZmamTp48KV9fX1WsWPGK89OjRw95eno63tevX1+SHOe2ZcsWJScna+DAgdlWhS+vyJ46dUpffvml2rdvr7Nnzzrm4+TJk4qNjdW+fft09OjR6z4HALiVcIssAKDAqlWr1hUf8rNv3z5JUpMmTa64n7+/v+PnU6dOKT4+XosXL9avv/7q1C8lJcVgtf/n77eh7tu3T5ZlqXz58lfs/9eAlxNeXl4qVaqUU1tAQIBuu+02R5j6a/uVPlv595p8fX0VGhqqgwcPSpIOHTok6c+Q+leenp6KjIx0bL+sTJkyTgHwn2RlZWnGjBmaPXu2kpOTlZmZ6dhWokSJbP3Lli3r9L5YsWKS5Di3pKQkSdd+2vD+/ftlWZbi4uIUFxd3xT6//vqrypQpc93nAQC3CgImAKDQycrKkvTn5zBDQkKybffw+L8/f+3bt9f69es1ZMgQVa9eXb6+vsrKylKzZs0c41zL34PaZX8NQn/311XTy/XabDYtX75c7u7u2fr7+vr+Yx1XcqWxrtVu/f/Pg+alv5/7P5k4caLi4uL0+OOPa9y4cSpevLjc3Nw0cODAK86PiXO7PO7gwYMVGxt7xT5RUVHXPR4A3EoImACAQuf222+XJAUFBalp06ZX7Xf69GmtWrVK8fHxGjVqlKP98groX10tSF5eIfv7E1P/vnL3T/ValqVy5cqpQoUK171ffti3b58aN27seJ+WlqZjx46pRYsWkqTw8HBJ0p49exQZGenol5GRoeTk5Gte/7+62vV955131LhxY82bN8+p/cyZM46HLeXE5f82duzYcdXaLp9HkSJFrrt+AMCf+AwmAKDQiY2Nlb+/vyZOnKiLFy9m2375ya+XV7v+vro1ffr0bPtc/q7KvwdJf39/lSxZUmvXrnVqnz179nXX+/DDD8vd3V3x8fHZarEsy+krU/Lbq6++6nQNExISdOnSJTVv3lyS1LRpU3l6emrmzJlOtc+bN08pKSlq2bLldR3Hx8cn27WV/pyjv1+TpUuX5vozkNHR0SpXrpymT5+e7XiXjxMUFKRGjRrplVde0bFjx7KNkZsnBwPArYIVTABAoePv76+EhAR16dJF0dHR6tChg0qVKqXDhw/r008/Vd26dTVr1iz5+/s7vsLj4sWLKlOmjD7//HMlJydnG7NGjRqSpBEjRqhDhw4qUqSIWrVqJR8fH/Xq1UuTJk1Sr169VLNmTa1du1Z79+697npvv/12jR8/XsOHD9fBgwfVtm1b+fn5KTk5We+//7769OmjwYMHG7s+OZGRkaH77rtP7du31549ezR79mzVq1dPrVu3lvTnV7UMHz5c8fHxatasmVq3bu3od8899+ixxx67ruPUqFFDCQkJGj9+vKKiohQUFKQmTZrowQcf1NixY9WjRw/VqVNH27dv18KFC51WS3PCzc1NCQkJatWqlapXr64ePXooNDRUu3fv1s6dO/XZZ59J+vMBUvXq1VPVqlXVu3dvRUZG6sSJE9qwYYN+/vnnbN/DCQD4EwETAFAoderUSaVLl9akSZP0wgsvKD09XWXKlFH9+vWdnmL61ltvacCAAXrppZdkWZYeeOABLV++PNv3K95
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Mean Squared Error: 46662951.69621668\n",
"Train R2 Score: 0.9411587287387594\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2QAAAIjCAYAAABswtioAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABxu0lEQVR4nO3deVxV1f7/8fdhRkZREXEAU1NxzBnNKU0stBwCbTC1wSyt1DLz3iabbM5uk9mgzcbJMWdzLilTQ8VZwyEVwQwQHJj27w9/7K8nSEGBDfh6Ph48rnutdfb57LM7XN+uvde2GYZhCAAAAABQ6pysLgAAAAAArlYEMgAAAACwCIEMAAAAACxCIAMAAAAAixDIAAAAAMAiBDIAAAAAsAiBDAAAAAAsQiADAAAAAIsQyAAAAADAIgQyAIBlbDabnnvuOavLsFy3bt3UrVs3c/vAgQOy2WyaMWOGZTX90z9rLCll8dgBoCQRyACggvjggw9ks9nUvn37y97H0aNH9dxzzykuLq74CivjVq9eLZvNZv64urrqmmuu0d13360//vjD6vKKZP369XruueeUkpJiWQ2hoaEOn2dgYKA6d+6sOXPmWFYTAJRlLlYXAAAoHl9//bVCQ0O1YcMG7du3T/Xr1y/yPo4ePapJkyYpNDRULVu2LP4iy7BHHnlEbdu2VVZWljZv3qxp06Zp4cKF2rZtm4KDg0u1lpCQEJ05c0aurq5Fet369es1adIkDRs2TP7+/iVTXCG0bNlSjz32mKTz/0199NFHGjBggD788EONHDnyoq+93GMHgPKKGTIAqAASEhK0fv16vfXWW6pWrZq+/vprq0sqdzp37qy77rpLw4cP17vvvqs33nhDJ0+e1Oeff/6vr8nIyCiRWmw2mzw8POTs7Fwi+y9pNWvW1F133aW77rpLTzzxhH7++Wd5eXnp7bff/tfXZGdnKzMzs9wfOwAUFYEMACqAr7/+WpUrV1ZkZKRuu+22fw1kKSkpGjt2rEJDQ+Xu7q5atWrp7rvv1okTJ7R69Wq1bdtWkjR8+HDzkrO8e3lCQ0M1bNiwfPv8571FmZmZeuaZZ9S6dWv5+fnJy8tLnTt31qpVq4p8XMePH5eLi4smTZqUr2/37t2y2Wx67733JElZWVmaNGmSGjRoIA8PD1WpUkXXX3+9li9fXuT3laQbbrhB0vmwK0nPPfecbDabduzYoTvuuEOVK1fW9ddfb47/6quv1Lp1a3l6eiogIECDBw/W4cOH8+132rRpqlevnjw9PdWuXTutW7cu35h/u49q165dio6OVrVq1eTp6amGDRvqv//9r1nf+PHjJUl169Y1z9+BAwdKpMaiCAoKUuPGjc3PMu/43njjDU2ZMkX16tWTu7u7duzYcVnHnufIkSO65557VL16dbm7u6tJkyb67LPPrqh2AChpXLIIABXA119/rQEDBsjNzU233367PvzwQ/32229mwJKk9PR0de7cWTt37tQ999yjVq1a6cSJE5o/f77+/PNPNW7cWM8//7yeeeYZjRgxQp07d5YkdezYsUi1pKWl6ZNPPtHtt9+u+++/X6dOndKnn36qiIgIbdiwoUiXQlavXl1du3ZVTEyMnn32WYe+7777Ts7OzoqKipJ0PpBMnjxZ9913n9q1a6e0tDRt3LhRmzdv1o033likY5Ck/fv3S5KqVKni0B4VFaUGDRro5ZdflmEYkqSXXnpJTz/9tKKjo3XfffcpOTlZ7777rrp06aLff//dvHzw008/1QMPPKCOHTtqzJgx+uOPP3TLLbcoICBAtWvXvmg9W7duVefOneXq6qoRI0YoNDRU+/fv1w8//KCXXnpJAwYM0J49e/Ttt9/q7bffVtWqVSVJ1apVK7Ua/01WVpYOHz6c77OcPn26zp49qxEjRsjd3V0BAQHKzc0t8rFL58N7hw4dZLPZNHr0aFWrVk2LFy/Wvffeq7S0NI0ZM+ayageAEmcAAMq1jRs3GpKM5cuXG4ZhGLm5uUatWrWMRx991GHcM888Y0gyZs+enW8fubm5hmEYxm+//WZIMqZPn55vTEhIiDF06NB87V27djW6du1qbmdnZxvnzp1zGPP3338b1atXN+655x6HdknGs88+e9Hj++ijjwxJxrZt2xzaw8LCjBtuuMHcbtGihREZGXnRfRVk1apVhiTjs88+M5KTk42jR48aCxcuNEJDQw2bzWb89ttvhmEYxrPPPmtIMm6//XaH1x84cMBwdnY2XnrpJYf2bdu2GS4uLmZ7ZmamERgYaLRs2dLh85k2bZohyeEzTEhIyHceunTpYvj4+BgHDx50eJ+8c2cYhvH6668bkoyEhIQSr/HfhISEGL169TKSk5ON5ORkY8uWLcbgwYMNScbDDz/scHy+vr5GUlKSw+sv99jvvfdeo0aNGsaJEyccxgwePNjw8/MzTp8+fcnaAcAKXLIIAOXc119/rerVq6t79+6Szt9/NGjQIM2cOVM5OTnmuFmzZqlFixbq379/vn3YbLZiq8fZ2Vlubm6SpNzcXJ08eVLZ2dlq06aNNm/eXOT9DRgwQC4uLvruu+/Mtvj4eO3YsUODBg0y2/z9/bV9+3bt3bv3suq+5557VK1aNQUHBysyMlIZGRn6/PPP1aZNG4dx/1yUYvbs2crNzVV0dLROnDhh/gQFBalBgwbmpZobN25UUlKSRo4caX4+kjRs2DD5+fldtLbk5GStXbtW99xzj+rUqePQV5hzVxo1XmjZsmWqVq2aqlWrphYtWshut2vIkCF69dVXHcYNHDjQnMH7N4U5dsMwNGvWLPXt21eGYTgcY0REhFJTUy/rvz0AKA1csggA5VhOTo5mzpyp7t27m/fnSFL79u315ptvasWKFerVq5ek85fgDRw4sFTq+vzzz/Xmm29q165dysrKMtvr1q1b5H1VrVpVPXr0UExMjF544QVJ5y9XdHFx0YABA8xxzz//vG699VZde+21atq0qXr37q0hQ4aoefPmhXqfZ555Rp07d5azs7OqVq2qxo0by8Ul//9N/vMY9u7dK8Mw1KBBgwL3m7da4MGDByUp37i8ZfYvJm/5/aZNmxbqWP6pNGq8UPv27fXiiy/KZrOpUqVKaty4cYGrPhbmv4fCHHtycrJSUlI0bdo0TZs2rcAxSUlJhSseAEoZgQwAyrGVK1fq2LFjmjlzpmbOnJmv/+uvvzYD2ZX6t5mYnJwchxXxvvrqKw0bNkz9+vXT+PHjFRgYKGdnZ02ePNm8L6uoBg8erOHDhysuLk4tW7ZUTEyMevToYd4nJUldunTR/v37NW/ePC1btkyffPKJ3n77bU2dOlX33XffJd+jWbNm6tmz5yXHeXp6Omzn5ubKZrNp8eLFBa4M6O3tXYgjLFmlXWPVqlUv67O8XHn3nd11110aOnRogWMKG8wBoLQRyACgHPv6668VGBio999/P1/f7NmzNWfOHE2dOlWenp6qV6+e4uPjL7q/i13+Vrly5QIfOHzw4EGH2ZPvv/9e11xzjWbPnu2wv38uylEU/fr10wMPPGBetrhnzx5NnDgx37iAgAANHz5cw4cPV3p6urp06aLnnnuuUIHsctWrV0+GYahu3bq69tpr/3VcSEiIpPOzVXkrOErnF7xISEhQixYt/vW1eZ/v5Z6/0qixpBTm2KtVqyYfHx/l5OQUKggCQFnCPWQAUE6dOXNGs2fPVp8+fXTbbbfl+xk9erROnTql+fPnSzp/v86WLVs0Z86cfPsy/v9qgV5eXpJUYPCqV6+efvnlF2VmZpptCxYsyLdset4MTN4+JenXX39VbGzsZR+rv7+/IiIiFBMTo5kzZ8rNzU39+vVzGPPXX385bHt7e6t+/fo6d+7cZb9vYQwYMEDOzs6aNGmSwzFL5z+DvLratGmjatWqaerUqQ6f4YwZMwr8vC9UrVo1denSRZ999pkOHTqU7z3y/Nv5K40aS0phjt3Z2VkDBw7UrFmzCgxuycnJpVIrAFwOZsgAoJyaP3++Tp06pVtuuaXA/g4dOpgPiR40aJDGjx+v77//XlFRUbrnnnvUunVrnTx5UvPnz9f
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.metrics import mean_squared_error, r2_score\n",
"from sklearn.model_selection import cross_val_score\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import featuretools as ft\n",
"import re\n",
"\n",
"# Загрузка данных\n",
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
"\n",
"# Уменьшение размера выборки для ускорения работы (опционально)\n",
"df = df.sample(frac=0.1, random_state=42)\n",
"\n",
"# Преобразование столбца Battery в числовой формат\n",
"df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n",
"\n",
"# Преобразование столбца Display в числовой формат\n",
"df['Camera'] = pd.to_numeric(df['Camera'], errors='coerce')\n",
"df['Display'] = pd.to_numeric(df['Display'], errors='coerce')\n",
"df['Inbuilt_memory'] = pd.to_numeric(df['Inbuilt_memory'], errors='coerce')\n",
"df['fast_charging'] = pd.to_numeric(df['fast_charging'], errors='coerce')\n",
"\n",
"# Удаление запятых из столбца Price и преобразование в числовой формат\n",
"df['Price'] = df['Price'].str.replace(',', '').astype(float)\n",
"\n",
"# Удаление столбцов с текстовыми значениями, которые не могут быть преобразованы в числа\n",
"df = df.drop(columns=['Name', 'company', 'Android_version', 'Processor_name', 'External_Memory', 'No_of_sim', 'Ram', 'Screen_resolution', 'Processor' ])\n",
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
"\n",
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
"\n",
"# Вывод размеров выборок\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))\n",
"\n",
"# Определение сущностей\n",
"es = ft.EntitySet(id='mobile_data')\n",
"es = es.add_dataframe(dataframe_name='mobile', dataframe=train_df, index='id')\n",
"\n",
"# Генерация признаков с уменьшенной глубиной\n",
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='mobile', max_depth=1)\n",
"\n",
"# Преобразование признаков для контрольной и тестовой выборок\n",
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_df.index)\n",
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_df.index)\n",
"\n",
"# Удаление строк с NaN\n",
"feature_matrix = feature_matrix.dropna()\n",
"val_feature_matrix = val_feature_matrix.dropna()\n",
"test_feature_matrix = test_feature_matrix.dropna()\n",
"\n",
"# Разделение данных на обучающую и тестовую выборки\n",
"X_train = feature_matrix.drop('Price', axis=1)\n",
"y_train = feature_matrix['Price']\n",
"X_val = val_feature_matrix.drop('Price', axis=1)\n",
"y_val = val_feature_matrix['Price']\n",
"X_test = test_feature_matrix.drop('Price', axis=1)\n",
"y_test = test_feature_matrix['Price']\n",
"\n",
"# Выбор модели\n",
"model = RandomForestRegressor(random_state=42)\n",
"\n",
"# Обучение модели\n",
"model.fit(X_train, y_train)\n",
"\n",
"# Предсказание и оценка\n",
"y_pred = model.predict(X_test)\n",
"\n",
"mse = mean_squared_error(y_test, y_pred)\n",
"r2 = r2_score(y_test, y_pred)\n",
"\n",
"print(f\"Mean Squared Error: {mse}\")\n",
"print(f\"R2 Score: {r2}\")\n",
"\n",
"# Кросс-валидация\n",
"scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n",
"mse_cv = -scores.mean()\n",
"print(f\"Cross-validated Mean Squared Error: {mse_cv}\")\n",
"\n",
"# Анализ важности признаков\n",
"feature_importances = model.feature_importances_\n",
"feature_names = X_train.columns\n",
"\n",
"importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})\n",
"importance_df = importance_df.sort_values(by='Importance', ascending=False)\n",
"\n",
"plt.figure(figsize=(10, 6))\n",
"sns.barplot(x='Importance', y='Feature', data=importance_df)\n",
"plt.title('Feature Importance')\n",
"plt.show()\n",
"\n",
"# Проверка на переобучение\n",
"y_train_pred = model.predict(X_train)\n",
"\n",
"mse_train = mean_squared_error(y_train, y_train_pred)\n",
"r2_train = r2_score(y_train, y_train_pred)\n",
"\n",
"print(f\"Train Mean Squared Error: {mse_train}\")\n",
"print(f\"Train R2 Score: {r2_train}\")\n",
"\n",
"# Визуализация результатов\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(y_test, y_pred, alpha=0.5)\n",
"plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)\n",
"plt.xlabel('Actual Price')\n",
"plt.ylabel('Predicted Price')\n",
"plt.title('Actual vs Predicted Price')\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "aimenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}