2024-09-29 13:00:33 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Выгрузка в датафрейм первый набор (игры в Steam)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://www.kaggle.com/datasets/wajihulhassan369/steam-games-dataset. Набор представляет собой данные об экшенах, доступных в Steam. Эта информация полезна для изучения игровых паттернов, моделирования цен и исследования корреляции между игровыми тегами и методами ценообразования. Этот набор позволяет провести предварительный анализ данных, построить модели машинного обучения или исследовать игровую индустрию. В наборе представлены дата выпуска, различные теги, рейтинг отзывов. Так можно понять, какие теги популярнее, что в играх людям нравится больше, изменилось ли качество игр со временем и т. д. Для бизнеса такой набор данных может быть полезен для прогнозирования того, в разработку каких игр целесообразнее вкладываться. Так компания не потеряет деньги.\n",
"Пример цели: Разработка игры на пк в нужную фазу рынка\n",
"Входные данные: год выпуска, сумма продаж\n",
"Целевой признак: продаваемость игр в текущей фазе рынка в сравнении с предыдущими."
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Name', 'Price', 'Release_date', 'Review_no', 'Review_type', 'Tags',\n",
" 'Description'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"# Load the Steam games dataset and list the columns it provides.\n",
"steam_csv_path = \".//static//csv//steam_cleaned.csv\"\n",
"df = pd.read_csv(steam_csv_path)\n",
"print(df.columns)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0EAAAIjCAYAAADFthA8AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABj3klEQVR4nO3dd3gU5f7//9emF5JQQwApAUQ6SFOkKxgQEVRAUY6AHjwiguLRc8QCAiJiwYIe2zkiIiqCICiKgIBSP4B0A0ivoUNCS4Dk/v3Bd/eXJW032ZJkno/rynWxs7Mz79mZXea19z332IwxRgAAAABgEQH+LgAAAAAAfIkQBAAAAMBSCEEAAAAALIUQBAAAAMBSCEEAAAAALIUQBAAAAMBSCEEAAAAALIUQBAAAAMBSCEEAAAAALIUQBMAS9u7dK5vNps8//9zfpTiZN2+eGjdurLCwMNlsNp05c8bfJWXx+eefy2azae/evf4updB6+eWXZbPZdOLECX+XUqTY3zcA8DVCEFDEbd68WT179lTVqlUVFhamSpUqqVOnTpo4caLX1vnVV1/pnXfeyTL98OHDevnll7VhwwavrftaS5Yskc1mc/wFBwerevXqeuihh7R7926PrGPFihV6+eWXPR5QTp48qd69eys8PFwffPCBpkyZosjIyGzntQcR+19QUJAqVaqk/v3769ChQx6tq7Dq37+/bDabGjZsKGNMludtNpueeOIJP1RmDfb33/4XGhqqWrVqacSIEUpNTfV3eX5z7XdQaGioypcvr/bt2+vVV1/V8ePH873sxMREvfzyy/wAAXhBkL8LAJB/K1asUIcOHVSlShUNHDhQcXFxOnDggFatWqV3331XQ4YM8cp6v/rqK23ZskVPPfWU0/TDhw9r1KhRqlatmho3buyVdedk6NChat68uS5fvqx169bpk08+0dy5c7V582ZVrFixQMtesWKFRo0apf79+6tkyZKeKVjSmjVrdPbsWY0ZM0YdO3Z06TWjR49WfHy8UlNTtWrVKn3++edatmyZtmzZorCwMI/Vltnf/vY33X///QoNDfXK8t21efNmzZw5U/fee6+/S7Gc0NBQ/fe//5UkJScna/bs2RozZox27dqlqVOn+rk6/7J/B6Wnp+v48eNasWKFRo4cqQkTJujbb7/Vrbfe6vYyExMTNWrUKLVv317VqlXzfNGAhRGCgCJs7NixiomJ0Zo1a7KcnB87dsw/RXnB+fPnc2whsWvTpo169uwpSRowYIBq1aqloUOHavLkyRo+fLgvynSbfR+5E6y6dOmiZs2aSZL+/ve/q2zZsho/frzmzJmj3r17e6NMBQYGKjAw0CvLdld4eLgqV66s0aNH65577rFcV6oLFy4oIiLCb+sPCgpS3759HY8ff/xx3XLLLfr66681YcIElS9f3m+1+Vvm7yC7jRs36vbbb9e9996rxMREVahQwU/VAbgW3eGAImzXrl2qV69etifRsbGxWaZ9+eWXatGihSIiIlSqVCm1bdtW8+fPdzw/e/Zsde3aVRUrVlRoaKhq1KihMWPGKD093TFP+/btNXfuXO3bt8/R/aNatWpasmSJmjdvLulqCLE/l/kanP/7v/9T586dFRMTo4iICLVr107Lly93qtF+jUBiYqIeeOABlSpVSq1bt3b7vbH/6rpnz55c51u0aJHatGmjyMhIlSxZUt27d9fWrVud6nn22WclSfHx8Y7tyqt7yvTp09W0aVOFh4erbNmy6tu3r1O3tfbt26tfv36SpObNm8tms6l///5ub2ebNm0kXT0WMtu2bZt69uyp0qVLKywsTM2aNdOcOXMcz69du1Y2m02TJ0/OssxffvlFNptNP/74o6Scrwn6+eefHe9dVFSUunbtqj///NPx/Jw5c2Sz2bRp0ybHtO+++042m0333HOP07Lq1Kmj++67L8/tDQgI0IsvvqhNmzZp1qxZuc6bU9327ktLlixxTG
vfvr3q16+vTZs2qV27doqIiFDNmjU1Y8YMSdJvv/2mm266SeHh4brhhhu0cOHCbNd54sQJ9e7dW9HR0SpTpoyefPLJbLuKffnll47jo3Tp0rr//vt14MABp3nsNf3xxx9q27atIiIi9Pzzz2e73jfffFM2m0379u3L8tzw4cMVEhKi06dPS5J27Nihe++9V3FxcQoLC9N1112n+++/X8nJyTm+lzmx2Wxq3bq1jDFZup/mdXzkxpX3Z+nSperVq5eqVKmi0NBQVa5cWcOGDdPFixed5jty5IgGDBig6667TqGhoapQoYK6d+/u9vGcH40aNdI777yjM2fO6P3333dM37dvnx5//HHdcMMNCg8PV5kyZdSrVy+nmj7//HP16tVLktShQwfHd0/m49YbNQNWQQgCirCqVavqjz/+0JYtW/Kcd9SoUfrb3/6m4OBgjR49WqNGjVLlypW1aNEixzyff/65SpQooaefflrvvvuumjZtqhEjRui5555zzPPCCy+ocePGKlu2rKZMmaIpU6bonXfeUZ06dTR69GhJ0qOPPup4rm3btpKuho22bdsqJSVFI0eO1KuvvqozZ87o1ltv1erVq7PU26tXL124cEGvvvqqBg4c6PZ7Yw8FZcqUyXGehQsXKiEhQceOHdPLL7+sp59+WitWrFCrVq0cJyP33HOP+vTpI0l6++23HdtVrly5HJf7+eefq3fv3goMDNS4ceM0cOBAzZw5U61bt3ZcV/TCCy/o0UcflXS1i9uUKVP0j3/8w+3ttNdZqlQpx7Q///xTN998s7Zu3arnnntOb731liIjI9WjRw9HcGjWrJmqV6+ub7/9Nssyp02bplKlSikhISHH9U6ZMkVdu3ZViRIlNH78eL300ktKTExU69atHTW1bt1aNptNv//+u+N1S5cuVUBAgJYtW+aYdvz4cW3bts1xrOTlgQce0PXXX6/Ro0dne21Qfp0+fVp33nmnbrrpJr3++usKDQ3V/fffr2nTpun+++/XHXfcoddee03nz59Xz549dfbs2SzL6N27t1JTUzVu3Djdcccdeu+99xz72W7s2LF66KGHdP3112vChAl66qmn9Ouvv6pt27ZZrjs7efKkunTposaNG+udd95Rhw4dsq29d+/estls2e7Pb7/9VrfffrtKlSqlS5cuKSEhQatWrdKQIUP0wQcf6NFHH9Xu3bvzfc1bdsegK8dHTlx9f6ZPn64LFy5o0KBBmjhxohISEjRx4kQ99NBDTsu79957NWvWLA0YMED/+c9/NHToUJ09e1b79+/3SL156dmzp8LDw51+cFqzZo1WrFih+++/X++9954ee+wx/frrr2rfvr0uXLggSWrbtq2GDh0qSXr++ecd3z116tTxes2AJRgARdb8+fNNYGCgCQwMNC1btjT/+te/zC+//GIuXbrkNN+OHTtMQECAufvuu016errTcxkZGY5/X7hwIcs6/vGPf5iIiAiTmprqmNa1a1dTtWrVLPOuWbPGSDKTJk3Kso7rr7/eJCQkZFlffHy86dSpk2PayJEjjSTTp08fl96DxYsXG0nms88+M8ePHzeHDx82c+fONdWqVTM2m82sWbPGGGPMnj17stTWuHFjExsba06ePOmYtnHjRhMQEGAeeughx7Q33njDSDJ79uzJs55Lly6Z2NhYU79+fXPx4kXH9B9//NFIMiNGjHBMmzRpkpHkqDE39nkXLlxojh8/bg4cOGBmzJhhypUrZ0JDQ82BAwcc8952222mQYMGTvssIyPD3HLLLeb66693TBs+fLgJDg42p06dckxLS0szJUuWNA8//HCWddu3/+zZs6ZkyZJm4MCBTjUeOXLExMTEOE2vV6+e6d27t+NxkyZNTK9evYwks3XrVmOMMTNnzjSSzMaNG3N9D/r162ciIyONMcZMnjzZSDIzZ850PC/JDB48OMe67ezHzOLFix3T2rVrZySZr776yjFt27ZtRpIJCAgwq1atckz/5Z
dfshxL9uP2rrvuclrX448/7rRte/fuNYGBgWbs2LFO823evNkEBQU5TbfX9NFHH+X6vti1bNnSNG3a1Gna6tWrjST
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Parse the release dates into datetime values so they plot on a real time axis.\n",
"df['Release_date'] = pd.to_datetime(df['Release_date'])\n",
"\n",
"# Review count against release date: a quick visual check for noise and outliers.\n",
"fig, ax = plt.subplots(figsize=(10, 6))\n",
"ax.scatter(df['Release_date'], df['Review_no'])\n",
"ax.set(\n",
"    xlabel='Release Date',\n",
"    ylabel='Review Number',\n",
"    title='Scatter Plot of Review Number vs Release Date',\n",
")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"При проверке на шум можно заметить выброс в 2014 году: количество обзоров там запредельное."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Все выбросы устранены: пороги определяются квантилями (1,5·IQR), а значения за этими порогами заменяются медианой. Зашумленность не очень высокая. Покрытие данных высокое и подошло бы для поставленной задачи по актуальности."
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Выбросы:\n",
" Name Price Release_date Review_no \\\n",
"18 GUNDAM BREAKER 4 59.99 2024-08-29 1846.0 \n",
"22 LOCKDOWN Protocol 5.49 2024-07-22 2192.0 \n",
"34 CarX Street 19.99 2024-08-29 4166.0 \n",
"45 Harry Potter: Quidditch Champions 25.99 2024-09-03 1216.0 \n",
"61 SMITE 2 18.00 2024-08-27 1633.0 \n",
"... ... ... ... ... \n",
"7695 Dude Simulator 2 2.99 2018-07-28 1734.0 \n",
"7717 Golfing Over It with Alva Majo 2.39 2018-03-28 1367.0 \n",
"7740 Dungeon Siege II 4.99 2005-08-16 2274.0 \n",
"7765 Phantom Doctrine 12.99 2018-08-14 3538.0 \n",
"7768 NECROPOLIS: BRUTAL EDITION 19.99 2016-07-12 3668.0 \n",
"\n",
" Review_type Tags \\\n",
"18 Very Positive Action,Robots,Hack and Slash,RPG,Mechs,Action ... \n",
"22 Very Positive Multiplayer,Social Deduction,Conversation,Acti... \n",
"34 Mixed Racing,Open World,Automobile Sim,PvP,Multiplay... \n",
"45 Mostly Positive Action,Sports,Flight,Arcade,Third Person,Magic... \n",
"61 Mixed Action,MOBA,Third Person,Strategy,Adventure,Ca... \n",
"... ... ... \n",
"7695 Mixed Life Sim,Indie,Simulation,Racing,Action,Advent... \n",
"7717 Mostly Positive Difficult,Physics,Golf,Platformer,Precision Pl... \n",
"7740 Mostly Positive RPG,Fantasy,Action RPG,Hack and Slash,Singlepl... \n",
"7765 Mostly Positive Turn-Based Tactics,Strategy,Cold War,Stealth,R... \n",
"7768 Mixed Souls-like,Action Roguelike,Co-op,Adventure,Ro... \n",
"\n",
" Description \n",
"18 Create your own ultimate Gundam in the newest ... \n",
"22 A first person social deduction game, combinin... \n",
"34 Conquer mountain roads, highways, and city str... \n",
"45 Your next chapter takes flight! Immerse yourse... \n",
"61 Become a god and wage war in SMITE 2, the Unre... \n",
"... ... \n",
"7695 Dude Simulator 2 is an open world sandbox game... \n",
"7717 The higher you climb, the bigger the fall. \n",
"7740 NaN \n",
"7765 The year is 1983. The world teeters on the ver... \n",
"7768 NECROPOLIS: BRUTAL EDITION is a major update f... \n",
"\n",
"[1049 rows x 7 columns]\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1sAAAIjCAYAAAD1OgEdAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOyde3yO9f/HX/c9O7OTYSNsZmKGIafm3EQUJWmoUD/0DUUnVEQq6YTim1Kpr1M6EEUra/o6zSEMM9Vo4xsbbbONbbbZff3+mOvuPlyHz3W4T/N+Ph4eD7vv6/C5Ptfnuu7P+/N+v19vA8dxHAiCIAiCIAiCIAhdMbq6AQRBEARBEARBEHURMrYIgiAIgiAIgiAcABlbBEEQBEEQBEEQDoCMLYIgCIIgCIIgCAdAxhZBEARBEARBEIQDIGOLIAiCIAiCIAjCAZCxRRAEQRAEQRAE4QDI2CIIgiAIgiAIgnAAZGwRBEEQBEEQBEE4ADK2CMLDyM3NhcFgwGeffebqpliRkpKChIQE+Pn5wWAwoLi42NVNsuOzzz6DwWBAbm6uq5vitsyfPx8GgwEFBQWubopHwfcb4Rq+/PJLhIWF4erVq6r2v3jxIkaNGoWGDRvCYDBg6dKl+jawDjBhwgRERUVZfWYwGDB//nzz3zfjO1bo2Y+KisKECROc3paVK1eiRYsWqKysdPq5CXHI2CLchhMnTmDUqFFo2bIl/Pz80KxZMwwaNAjvv/++w865fv16wR/VCxcuYP78+cjIyHDYuW355ZdfYDAYzP+8vb3RqlUrPPLII/jzzz91Oce+ffswf/583Q2hwsJCjB49Gv7+/lixYgXWrFmDwMBAwW35H2P+X7169dCsWTNMmDAB58+f17Vd7sqECRNgMBjQsWNHcBxn973BYMC0adNc0LKbA77/+X++vr5o06YN5s2bh2vXrrm6eS7D9h3k6+uLJk2aoH///nj99dfx999/qz52VlYW5s+f75BJeE1NDV5++WVMnz4d9evXF/y+adOmMBgM+OGHHwSPMXPmTPz444+YM2cO1qxZgyFDhmD79u1WhoQzmDBhguA18NC7QZ69e/fivvvuQ5MmTeDr64uoqChMmTIF586dU33M8vJyzJ8/H7/88ot+DXUAEyZMQFVVFT788ENXN4WwgIwtwi3Yt28fbrvtNhw7dgyTJk3C8uXL8X//938wGo1YtmyZw84rZWwtWLDAqcYWz5NPPok1a9bgo48+wrBhw7Bx40Z069YNFy5c0Hzsffv2YcGCBbobW4cOHcKVK1ewcOFCPPbYY3jooYfg7e0tuc8rr7yCNWvWYOXKlbjrrruwdu1a9OvXz6GT3YcffhgVFRVo2bKlw86hhBMnTmDTpk2ubsZNia+vL9asWYM1a9bg3XffRVRUlHn83uxYvoOee+45hIWF4eWXX0a7du2Qlpam6phZWVlYsGCBQ4yt7777Dr///jsmT54s+H1aWhry8vIQFRWFdevWiW4zYsQIPPvss3jooYfQtm1bbN++HQsWLNC9vXUZV79j33//ffTp0wcnTpzA9OnT8e9//xujRo3Cxo0b0bFjR+zbt0/VccvLy7FgwQJmY+v333/HqlWrVJ1LC35+fhg/fjzeffddwYU8wjXUc3UDCAIAXnvtNQQHB+PQoUMICQmx+u7SpUuuaZQDKCsrE/X48PTp0wejRo0CAEycOBFt2rTBk08+ic8//xxz5sxxRjMVw98j23snxV133YXbbrsNAPB///d/CA8Px+LFi7F161aMHj3aEc2El5cXvLy8HHJspfj7+6N58+Z45ZVXMHLkyJsuBK28vBwBAQEuO3+9evXw0EMPmf9+4okncPvtt2PDhg1499130aRJE5e1zdVYvoN4jh07hjvvvBP3338/srKyEBkZ6aLW2bN69WokJiaiWbNmgt+vXbsWXbp0wfjx4/HCCy8IvocvXbqk6P2lFo7jcO3aNfj7+zv8XK7Ale
/YvXv3YsaMGejduzdSUlKs3i//+te/kJiYiFGjRuHkyZMIDQ11aFt8fX11O9b169dhMpng4+PDtP3o0aPx5ptvYufOnRg4cKBu7SDUQ54twi04c+YM2rdvL/hj17hxY7vP1q5di+7duyMgIAChoaHo27cvfvrpJ/P3W7ZswbBhw9C0aVP4+voiJiYGCxcuRE1NjXmb/v37Y9u2bTh79qw5bCYqKgq//PILunXrBqDW2OG/s8yROnDgAIYMGYLg4GAEBASgX79+2Lt3r1Ub+TjurKwsjB07FqGhoejdu7fivuFfljk5OZLbpaWloU+fPggMDERISAhGjBiBU6dOWbXnueeeAwBER0ebr0tupfmrr75C165d4e/vj/DwcDz00ENW4X79+/fH+PHjAQDdunWDwWBQFavep08fALVjwZLffvsNo0aNQlhYGPz8/HDbbbdh69at5u9//fVXGAwGfP7553bH/PHHH2EwGPD9998DEM8n+OGHH8x916BBAwwbNgwnT540f79161YYDAYcP37c/Nk333wDg8GAkSNHWh2rXbt2ePDBB2Wv12g04qWXXsLx48exefNmyW3F2s2HfVmutvbv3x/x8fE4fvw4+vXrh4CAALRu3Rpff/01AOC///0vevToAX9/f9x6661ITU0VPGdBQQFGjx6NoKAgNGzYEE899ZSg13Ht2rXm8REWFobk5GT873//s9qGb9Phw4fRt29fBAQE4IUXXhA879tvvw2DwYCzZ8/afTdnzhz4+Pjg8uXLAIDs7Gzcf//9iIiIgJ+fH2655RYkJyejpKREtC/FMBgM6N27NziOswvblRsfUrD0z+7du/HAAw+gRYsW8PX1RfPmzTFz5kxUVFRYbZefn4+JEyfilltuga+vLyIjIzFixAjF41kNnTp1wtKlS1FcXIzly5ebPz979iyeeOIJ3HrrrfD390fDhg3xwAMPWLXps88+wwMPPAAAGDBggPndYzlu1bb52rVrSElJQVJSkuD3FRUV2Lx5M5KTkzF69GhUVFRgy5YtVm0zGAzgOA4rVqwwt23ChAlYsWIFAFiFVvKYTCYsXboU7du3h5+fH5o0aYIpU6aYxyZPVFQU7r77bvz444+47bbb4O/vr2uIV1VVFebNm4euXbsiODgYgYGB6NOnD3bu3Gm1HZ/r+/bbb+Ojjz5CTEwMfH190a1bNxw6dMjuuN9++y3i4+Ph5+eH+Ph42XcUj9C7iu+DPXv2oHv37vDz80OrVq3wn//8x25//r3l7++PW265Ba+++ipWr17N9Fu1cOFC82+B7UJOTEwM3nzzTeTl5Vn1f//+/dG/f3+7Y1nmp+Xm5qJRo0YAgAULFpjHglSIqVDOVnFxMWbMmIHmzZvD19cXrVu3xuLFi2EymczbWN6npUuXmu9TVlYWgFrPXfv27c1zn9tuuw3r16+3Ok/Xrl0RFhZmNc4J10LGFuEWtGzZEocPH0ZmZqbstgsWLMDDDz8Mb29vvPLKK1iwYAGaN29uFd7y2WefoX79+nj66aexbNkydO3aFfPmzcPs2bPN27z44otISEhAeHi4OZxo6dKlaNeuHV555RUAwOTJk83f9e3bF0CtUdO3b1+Ulpbi5Zdfxuuvv47i4mIMHDgQBw8etGvvAw88gPLycrz++uuYNGmS4r7hjY+GDRuKbpOamorBgwfj0qVLmD9/Pp5++mns27cPiYmJ5h+okSNHYsyYMQCAJUuWmK+L/xER4rPPPsPo0aPh5eWFRYsWYdKkSdi0aRN69+5tDkV88cUXzeE7fGjglClTFF8n307LFceTJ0+iZ8+eOHXqFGbPno133nkHgYGBuPfee80//rfddhtatWqFL7/80u6YGzduRGhoKAYPHix63jVr1mDYsGGoX78+Fi9ejLlz5yIrKwu9e/c2t6l3794wGAzYtWuXeb/du3fDaDRiz5495s/+/vtv/Pbbb+axIsfYsWMRGxuLV155RdeQj8uXL+Puu+9Gjx
498Oabb8LX1xfJycnYuHEjkpOTMXToULzxxhsoKyvDqFGjcOXKFbtjjB49GteuXcOiRYswdOhQvPfee3ZhWq+99ho
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"\n",
"# Keep the release date as datetime (converting an already-converted column changes nothing).\n",
"df['Release_date'] = pd.to_datetime(df['Release_date'])\n",
"\n",
"# Quartiles and interquartile range of the review counts.\n",
"Q1, Q3 = df['Review_no'].quantile([0.25, 0.75])\n",
"IQR = Q3 - Q1\n",
"\n",
"# Fences sit 1.5 * IQR beyond the quartiles; values outside them are flagged.\n",
"threshold = 1.5 * IQR\n",
"lower_fence = Q1 - threshold\n",
"upper_fence = Q3 + threshold\n",
"outliers = df['Review_no'].lt(lower_fence) | df['Review_no'].gt(upper_fence)\n",
"\n",
"# Show the flagged rows.\n",
"print(\"Выбросы:\")\n",
"print(df[outliers])\n",
"\n",
"# The flagged review counts are overwritten with the column median (rows are kept, not dropped).\n",
"median_review_no = df['Review_no'].median()\n",
"df.loc[outliers, 'Review_no'] = median_review_no\n",
"\n",
"# Same scatter as before, now with the outliers replaced.\n",
"fig, ax = plt.subplots(figsize=(10, 6))\n",
"ax.scatter(df['Release_date'], df['Review_no'])\n",
"ax.set(\n",
"    xlabel='Release Date',\n",
"    ylabel='Review Number',\n",
"    title='Scatter Plot of Review Number vs Release Date (After Handling Outliers)',\n",
")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Очистим от строк с пустыми значениями наш датасет"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Количество удаленных строк: 515\n",
"\n",
"DataFrame после удаления строк с пропущенными значениями:\n",
" Name Price Release_date \\\n",
"0 Black Myth: Wukong 59.99 2024-08-20 \n",
"2 Counter-Strike 2 0.00 2012-08-21 \n",
"4 Grand Theft Auto V 10.48 2015-04-14 \n",
"5 Red Dead Redemption 2 17.99 2019-12-05 \n",
"6 PUBG: BATTLEGROUNDS 0.00 2017-12-21 \n",
"... ... ... ... \n",
"7807 Monster Hunter World: Iceborne - MHW:I Monster... 2.99 2020-02-06 \n",
"7808 Gene Shift Auto: Deluxe Edition 8.99 2022-11-28 \n",
"7809 Run Ralph Run 0.45 2021-03-03 \n",
"7810 Quadroids 6.19 2024-02-22 \n",
"7811 Divekick 4.99 2013-08-20 \n",
"\n",
" Review_no Review_type \\\n",
"0 270.0 Overwhelmingly Positive \n",
"2 270.0 Very Positive \n",
"4 270.0 Very Positive \n",
"5 270.0 Very Positive \n",
"6 270.0 Mixed \n",
"... ... ... \n",
"7807 39.0 Positive \n",
"7808 16.0 Positive \n",
"7809 26.0 Mostly Positive \n",
"7810 15.0 Positive \n",
"7811 1118.0 Very Positive \n",
"\n",
" Tags \\\n",
"0 Mythology,Action RPG,Action,Souls-like,RPG,Com... \n",
"2 FPS,Shooter,Multiplayer,Competitive,Action,Tea... \n",
"4 Open World,Action,Multiplayer,Crime,Automobile... \n",
"5 Open World,Story Rich,Western,Adventure,Multip... \n",
"6 Survival,Shooter,Battle Royale,Multiplayer,FPS... \n",
"... ... \n",
"7807 Action \n",
"7808 Indie,Action,Free to Play,Battle Royale,Roguel... \n",
"7809 Adventure,Action,Puzzle,Arcade,Platformer,Shoo... \n",
"7810 Precision Platformer,Puzzle Platformer,2D Plat... \n",
"7811 Fighting,Indie,2D Fighter,Parody ,Local Multip... \n",
"\n",
" Description \n",
"0 Black Myth: Wukong is an action RPG rooted in ... \n",
"2 For over two decades, Counter-Strike has offer... \n",
"4 Grand Theft Auto V for PC offers players the o... \n",
"5 Winner of over 175 Game of the Year Awards and... \n",
"6 Play PUBG: BATTLEGROUNDS for free.\\n\\nLand on ... \n",
"... ... \n",
"7807 A monster figure you can use to decorate your ... \n",
"7808 Gene Shift Auto is a roguelike-inspired battle... \n",
"7809 Ralph is a smart dinosaur, and a great shooter. \n",
"7810 Quadroids is a single-player puzzle platformer... \n",
"7811 Divekick is the world’ s first two-button fight... \n",
"\n",
"[7297 rows x 7 columns]\n"
]
}
],
"source": [
"# Drop every row that has at least one missing value; `df` itself is not modified.\n",
"df_dropna = df.dropna()\n",
"\n",
"# Report how many rows the cleanup removed.\n",
"num_deleted_rows = len(df) - len(df_dropna)\n",
"print(f\"\\nКоличество удаленных строк: {num_deleted_rows}\")\n",
"\n",
"print(\"\\nDataFrame после удаления строк с пропущенными значениями:\")\n",
"print(df_dropna)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Теперь создадим выборки."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 4687\n",
"Размер контрольной выборки: 1562\n",
"Размер тестовой выборки: 1563\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"# Split the cleaned frame (outliers replaced, rows with missing values dropped).\n",
"# Re-reading the raw CSV here would silently discard both cleaning steps above.\n",
"# First cut: 60% train, 40% held out.\n",
"train_df, temp_df = train_test_split(df_dropna, test_size=0.4, random_state=42)\n",
"\n",
"# Second cut: the held-out part is halved into validation and test (20% / 20% overall).\n",
"val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n",
"\n",
"# Every cleaned row must land in exactly one split.\n",
"assert len(train_df) + len(val_df) + len(test_df) == len(df_dropna)\n",
"\n",
"# Report the split sizes.\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))\n",
"\n",
2024-09-29 19:24:48 +04:00
"# Сохранение выборок в файлы\n",
2024-09-29 13:00:33 +04:00
"# Write each split to its own CSV, without the index column.\n",
"for split_df, csv_path in (\n",
"    (train_df, \".//static//csv//train_data.csv\"),\n",
"    (val_df, \".//static//csv//val_data.csv\"),\n",
"    (test_df, \".//static//csv//test_data.csv\"),\n",
"):\n",
"    split_df.to_csv(csv_path, index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проанализируем сбалансированность выборок"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение Review_type в обучающей выборке:\n",
"Review_type\n",
"Very Positive 2117\n",
"Mostly Positive 810\n",
"Mixed 797\n",
"Positive 710\n",
"Overwhelmingly Positive 209\n",
"Mostly Negative 15\n",
"Very Negative 2\n",
"Overwhelmingly Negative 1\n",
"Name: count, dtype: int64\n",
"Процент положительных отзывов: 17.28%\n",
"Процент отрицательных отзывов: 4.46%\n",
"\n",
"Распределение Review_type в контрольной выборке:\n",
"Review_type\n",
"Very Positive 708\n",
"Mostly Positive 290\n",
"Mixed 241\n",
"Positive 224\n",
"Overwhelmingly Positive 78\n",
"Mostly Negative 6\n",
"Very Negative 2\n",
"Name: count, dtype: int64\n",
"Процент положительных отзывов: 18.57%\n",
"Процент отрицательных отзывов: 4.99%\n",
"\n",
"Распределение Review_type в тестовой выборке:\n",
"Review_type\n",
"Very Positive 713\n",
"Mostly Positive 276\n",
"Mixed 253\n",
"Positive 240\n",
"Overwhelmingly Positive 67\n",
"Mostly Negative 5\n",
"Very Negative 1\n",
"Name: count, dtype: int64\n",
"Процент положительных отзывов: 17.66%\n",
"Процент отрицательных отзывов: 4.29%\n",
"\n",
"Необходима аугментация данных для балансировки классов.\n",
"Необходима аугментация данных для балансировки классов.\n",
"Необходима аугментация данных для балансировки классов.\n"
]
}
],
"source": [
"# Read the three saved splits back from disk.\n",
"train_df, val_df, test_df = (\n",
"    pd.read_csv(csv_path)\n",
"    for csv_path in (\n",
"        \".//static//csv//train_data.csv\",\n",
"        \".//static//csv//val_data.csv\",\n",
"        \".//static//csv//test_data.csv\",\n",
"    )\n",
")\n",
"\n",
"# Оценка сбалансированности\n",
"def check_balance(df, name):\n",
"    \"\"\"Print the Review_type class counts of `df` and the share of positive and negative reviews.\n",
"\n",
"    Args:\n",
"        df: DataFrame with a 'Review_type' column.\n",
"        name: Split name inserted into the printed header.\n",
"\n",
"    A label counts as positive when it contains 'Positive' and as negative\n",
"    when it contains 'Negative'; 'Mixed' belongs to neither group.\n",
"    \"\"\"\n",
"    counts = df['Review_type'].value_counts()\n",
"    print(f\"Распределение Review_type в {name}:\")\n",
"    print(counts)\n",
"    # Sum over every matching label instead of reading one hand-picked class,\n",
"    # so a class that is absent from this split cannot raise KeyError.\n",
"    positive_total = sum(n for label, n in counts.items() if 'Positive' in str(label))\n",
"    negative_total = sum(n for label, n in counts.items() if 'Negative' in str(label))\n",
"    total_rows = max(len(df), 1)  # avoid dividing by zero on an empty frame\n",
"    print(f\"Процент положительных отзывов: {positive_total / total_rows * 100:.2f}%\")\n",
"    print(f\"Процент отрицательных отзывов: {negative_total / total_rows * 100:.2f}%\")\n",
"    print()\n",
"\n",
"# Определение необходимости аугментации данных\n",
"def need_augmentation(df, max_ratio=1.5):\n",
"    \"\"\"Print whether the Review_type classes of `df` are imbalanced enough to need resampling.\n",
"\n",
"    Args:\n",
"        df: DataFrame with a 'Review_type' column.\n",
"        max_ratio: Largest tolerated ratio between the most and the least\n",
"            frequent class; above it, augmentation is reported as needed.\n",
"    \"\"\"\n",
"    counts = df['Review_type'].value_counts()\n",
"    # Compare the extremes so that every class is taken into account,\n",
"    # not just one arbitrary pair of labels.\n",
"    ratio = counts.max() / counts.min()\n",
"    if ratio > max_ratio:\n",
"        print(\"Необходима аугментация данных для балансировки классов.\")\n",
"    else:\n",
"        print(\"Аугментация данных не требуется.\")\n",
" \n",
"# Class distribution of each split first, then the augmentation verdicts, in the same split order.\n",
"named_splits = [\n",
"    (train_df, \"обучающей выборке\"),\n",
"    (val_df, \"контрольной выборке\"),\n",
"    (test_df, \"тестовой выборке\"),\n",
"]\n",
"for split_df, split_name in named_splits:\n",
"    check_balance(split_df, split_name)\n",
"\n",
"for split_df, split_name in named_splits:\n",
"    need_augmentation(split_df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"По результатам анализа требуется аугментация данных: соотношение классов отзывов выходит за пределы допустимого диапазона."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Оверсэмплинг:\n",
"Распределение Review_type в обучающей выборке:\n",
"Review_type\n",
"Mostly Positive 2117\n",
"Mixed 2117\n",
"Very Positive 2117\n",
"Positive 2117\n",
"Overwhelmingly Positive 2117\n",
"Mostly Negative 2117\n",
"Very Negative 2117\n",
"Overwhelmingly Negative 2117\n",
"Name: count, dtype: int64\n",
"Отсутствуют один или о б а класса (Positive/Negative).\n",
"\n",
"Распределение Review_type в контрольной выборке:\n",
"Review_type\n",
"Very Negative 708\n",
"Mostly Positive 708\n",
"Mixed 708\n",
"Overwhelmingly Positive 708\n",
"Overwhelmingly Negative 708\n",
"Positive 708\n",
"Mostly Negative 708\n",
"Very Positive 708\n",
"Name: count, dtype: int64\n",
"Отсутствуют один или о б а класса (Positive/Negative).\n",
"\n",
"Распределение Review_type в тестовой выборке:\n",
"Review_type\n",
"Very Negative 713\n",
"Mostly Positive 713\n",
"Overwhelmingly Positive 713\n",
"Mixed 713\n",
"Overwhelmingly Negative 713\n",
"Very Positive 713\n",
"Mostly Negative 713\n",
"Positive 713\n",
"Name: count, dtype: int64\n",
"Отсутствуют один или о б а класса (Positive/Negative).\n",
"\n",
"Андерсэмплинг:\n",
"Распределение Review_type в обучающей выборке:\n",
"Review_type\n",
"Mixed 1\n",
"Mostly Negative 1\n",
"Mostly Positive 1\n",
"Overwhelmingly Negative 1\n",
"Overwhelmingly Positive 1\n",
"Positive 1\n",
"Very Negative 1\n",
"Very Positive 1\n",
"Name: count, dtype: int64\n",
"Отсутствуют один или о б а класса (Positive/Negative).\n",
"\n",
"Распределение Review_type в контрольной выборке:\n",
"Review_type\n",
"Mixed 2\n",
"Mostly Negative 2\n",
"Mostly Positive 2\n",
"Overwhelmingly Negative 2\n",
"Overwhelmingly Positive 2\n",
"Positive 2\n",
"Very Negative 2\n",
"Very Positive 2\n",
"Name: count, dtype: int64\n",
"Отсутствуют один или о б а класса (Positive/Negative).\n",
"\n",
"Распределение Review_type в тестовой выборке:\n",
"Review_type\n",
"Mixed 1\n",
"Mostly Negative 1\n",
"Mostly Positive 1\n",
"Overwhelmingly Negative 1\n",
"Overwhelmingly Positive 1\n",
"Positive 1\n",
"Very Negative 1\n",
"Very Positive 1\n",
"Name: count, dtype: int64\n",
"Отсутствуют один или о б а класса (Positive/Negative).\n",
"\n"
]
}
],
"source": [
"import pandas as pd\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"# Загрузка данных\n",
"# Read the three saved splits back from disk.\n",
"train_df, val_df, test_df = (\n",
"    pd.read_csv(csv_path)\n",
"    for csv_path in (\n",
"        \".//static//csv//train_data.csv\",\n",
"        \".//static//csv//val_data.csv\",\n",
"        \".//static//csv//test_data.csv\",\n",
"    )\n",
")\n",
"\n",
"# Преобразование категориальных признаков в числовые\n",
"def encode(df):\n",
"    \"\"\"Label-encode every object-dtype column of `df` in place, except 'Review_type'.\n",
"\n",
"    Returns a dict mapping each encoded column name to its fitted LabelEncoder.\n",
"    \"\"\"\n",
"    feature_columns = [\n",
"        column\n",
"        for column in df.select_dtypes(include=['object']).columns\n",
"        if column != 'Review_type'  # the target is encoded separately\n",
"    ]\n",
"    label_encoders = {}\n",
"    for column in feature_columns:\n",
"        encoder = LabelEncoder()\n",
"        df[column] = encoder.fit_transform(df[column])\n",
"        label_encoders[column] = encoder\n",
"    return label_encoders\n",
"\n",
"# Преобразование целевой переменной в числовые значения\n",
"def encode_target(df, le=None):\n",
"    \"\"\"Replace the 'Review_type' labels of `df` with integer codes, in place.\n",
"\n",
"    Args:\n",
"        df: DataFrame whose 'Review_type' column is encoded.\n",
"        le: Optional already-fitted LabelEncoder. When given, it is reused so\n",
"            `df` gets the same label-to-code mapping as other frames; when\n",
"            omitted, a new encoder is fitted on `df` alone.\n",
"\n",
"    Returns:\n",
"        The LabelEncoder that produced the codes.\n",
"    \"\"\"\n",
"    if le is None:\n",
"        le = LabelEncoder()\n",
"        le.fit(df['Review_type'])\n",
"    df['Review_type'] = le.transform(df['Review_type'])\n",
"    return le\n",
"\n",
"# Feature columns: each split is encoded on its own; these codes are never decoded in this cell.\n",
"label_encoders = encode(train_df)\n",
"encode(val_df)\n",
"encode(test_df)\n",
"\n",
"# Target column: fit ONE encoder on the labels of all three splits and reuse it.\n",
"# A separate encoder per split assigns different codes whenever a split lacks a class,\n",
"# and decoding those codes with the train encoder then silently relabels the rows.\n",
"le_target = LabelEncoder().fit(\n",
"    pd.concat([train_df['Review_type'], val_df['Review_type'], test_df['Review_type']])\n",
")\n",
"for split_df in (train_df, val_df, test_df):\n",
"    encode_target(split_df, le_target)\n",
"\n",
"# Проверка типов данных\n",
"def check_data_types(df):\n",
"    \"\"\"Print a notice for every column of `df` whose dtype is still 'object'.\"\"\"\n",
"    object_columns = [column for column in df.columns if df[column].dtype == 'object']\n",
"    for column in object_columns:\n",
"        print(f\"Столбец '{column}' содержит строковые данные.\")\n",
"\n",
"# Run the dtype check on every split.\n",
"for split_df in (train_df, val_df, test_df):\n",
"    check_data_types(split_df)\n",
"\n",
"# Функция для выполнения oversampling\n",
"def _resample(df, sampler):\n",
"    \"\"\"Resample `df` by its 'Review_type' classes with the given imbalanced-learn sampler.\n",
"\n",
"    Returns a new DataFrame holding the resampled feature columns followed by\n",
"    'Review_type'. When `df` has no 'Review_type' column a notice is printed\n",
"    and `df` is returned unchanged.\n",
"    \"\"\"\n",
"    if 'Review_type' not in df.columns:\n",
"        print(\"Столбец 'Review_type' отсутствует.\")\n",
"        return df\n",
"\n",
"    X = df.drop('Review_type', axis=1)\n",
"    y = df['Review_type']\n",
"    X_resampled, y_resampled = sampler.fit_resample(X, y)\n",
"    return pd.concat([X_resampled, y_resampled], axis=1)\n",
"\n",
"\n",
"def oversample(df, random_state=42):\n",
"    \"\"\"Balance the classes of `df` with a seeded RandomOverSampler (default settings).\"\"\"\n",
"    return _resample(df, RandomOverSampler(random_state=random_state))\n",
"\n",
"\n",
"def undersample(df, random_state=42):\n",
"    \"\"\"Balance the classes of `df` with a seeded RandomUnderSampler (default settings).\"\"\"\n",
"    return _resample(df, RandomUnderSampler(random_state=random_state))\n",
"\n",
"# Применение oversampling и undersampling к каждой выборке\n",
"# NOTE(review): the validation and test splits are resampled too, which changes the\n",
"# class distribution a model would be evaluated on - confirm this is intended.\n",
"train_df_oversampled, val_df_oversampled, test_df_oversampled = (\n",
"    oversample(split_df) for split_df in (train_df, val_df, test_df)\n",
")\n",
"\n",
"train_df_undersampled, val_df_undersampled, test_df_undersampled = (\n",
"    undersample(split_df) for split_df in (train_df, val_df, test_df)\n",
")\n",
"\n",
"# Обратное преобразование целевой переменной в строковые метки\n",
"def decode_target(df, le_target):\n",
"    \"\"\"Turn the integer codes in df['Review_type'] back into labels, in place.\n",
"\n",
"    `le_target` is the encoder whose `inverse_transform` maps codes to labels.\n",
"    \"\"\"\n",
"    original_labels = le_target.inverse_transform(df['Review_type'])\n",
"    df['Review_type'] = original_labels\n",
"\n",
"# Decode the target of every resampled frame: oversampled splits first, then undersampled.\n",
"resampled_frames = (\n",
"    train_df_oversampled, val_df_oversampled, test_df_oversampled,\n",
"    train_df_undersampled, val_df_undersampled, test_df_undersampled,\n",
")\n",
"for resampled_frame in resampled_frames:\n",
"    decode_target(resampled_frame, le_target)\n",
"\n",
"# Проверка результатов\n",
"def check_balance(df, name):\n",
"    \"\"\"Print the Review_type class counts of `df` and the share of positive and negative reviews.\n",
"\n",
"    Args:\n",
"        df: DataFrame expected to have a 'Review_type' column of text labels.\n",
"        name: Split name inserted into the printed messages.\n",
"\n",
"    A label counts as positive when it contains 'Positive' and as negative\n",
"    when it contains 'Negative'; 'Mixed' belongs to neither group.\n",
"    \"\"\"\n",
"    if 'Review_type' not in df.columns:\n",
"        print(f\"Столбец 'Review_type' отсутствует в {name}.\")\n",
"        return\n",
"\n",
"    counts = df['Review_type'].value_counts()\n",
"    print(f\"Распределение Review_type в {name}:\")\n",
"    print(counts)\n",
"\n",
"    # No label is literally 'Negative', so group the labels by the word they contain\n",
"    # instead of looking up 'Positive' / 'Negative' as exact class names.\n",
"    positive_total = sum(n for label, n in counts.items() if 'Positive' in str(label))\n",
"    negative_total = sum(n for label, n in counts.items() if 'Negative' in str(label))\n",
"    if positive_total and negative_total:\n",
"        print(f\"Процент положительных отзывов: {positive_total / len(df) * 100:.2f}%\")\n",
"        print(f\"Процент отрицательных отзывов: {negative_total / len(df) * 100:.2f}%\")\n",
"    else:\n",
"        print(\"Отсутствуют один или оба класса (Positive/Negative).\")\n",
"    print()\n",
"\n",
"# Проверка сбалансированности после oversampling\n",
"# One report per resampling strategy, each covering the three splits in the same order.\n",
"split_names = (\"обучающей выборке\", \"контрольной выборке\", \"тестовой выборке\")\n",
"reports = (\n",
"    (\"Оверсэмплинг:\", (train_df_oversampled, val_df_oversampled, test_df_oversampled)),\n",
"    (\"Андерсэмплинг:\", (train_df_undersampled, val_df_undersampled, test_df_undersampled)),\n",
")\n",
"for heading, frames in reports:\n",
"    print(heading)\n",
"    for frame, split_name in zip(frames, split_names):\n",
"        check_balance(frame, split_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 14,400 Classic Rock Tracks (with Spotify Data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://www.kaggle.com/datasets/thebumpkin/14400-classic-rock-tracks-with-spotify-data Этот набор данных, содержащий 1200 уникальных альбомов и 14 400 треков, представляет собой не просто коллекцию — это хроника эволюции классического рока. Каждый трек тщательно каталогизирован с 18 столбцами данных, включая ключевые метаданные, такие как название трека, исполнитель, альбом и год выпуска, наряду с аудиохарактеристиками Spotify, которые позволяют получить представление о звуковом ландшафте этих неподвластных времени мелодий. Бизнес-цель может заключаться в улучшении стратегии маркетинга и продвижения музыкальных треков. Предположим, как этот набор может быть полезен для бизнеса:\n",
"Персонализированные рекомендации: Создание алгоритмов, которые будут рекомендовать пользователям музыку на основе их предпочтений.\n",
"Цель технического проекта: Разработать и внедрить систему рекомендаций, которая будет предсказывать и рекомендовать пользователям музыкальные треки на основе их предпочтений и поведения.\n",
"Входные данные:\n",
"Данные о пользователях: Идентификатор пользователя, история прослушиваний, оценки треков, время прослушивания, частота прослушивания.\n",
"Данные о треках: Атрибуты треков (название, исполнитель, альбом, год, длительность, танцевальность, энергичность, акустичность и т.д.).\n",
"Данные о взаимодействии: Время и частота взаимодействия пользователя с определенными треками.\n",
"Целевой признак:\n",
"Рекомендации: Булева переменная, указывающая, должен ли конкретный трек быть рекомендован пользователю (1 - рекомендуется, 0 - не рекомендуется)."
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Track', 'Artist', 'Album', 'Year', 'Duration', 'Time_Signature',\n",
" 'Danceability', 'Energy', 'Key', 'Loudness', 'Mode', 'Speechiness',\n",
" 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo',\n",
" 'Popularity'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"# Load the classic-rock tracks dataset and list the columns it provides.\n",
"classic_rock_csv_path = \".//static//csv//UltimateClassicRock.csv\"\n",
"df = pd.read_csv(classic_rock_csv_path)\n",
"print(df.columns)"
]
2024-09-29 19:24:48 +04:00
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Анализируем датафрейм при помощи \"ящика с усами\". Есть смещение в сторону меньших значений, это можно исправить при помощи oversampling и undersampling."
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx8AAAIjCAYAAABia6bHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAzVUlEQVR4nO3dd5RV5dnw4XsGptBRUYoCgg3ELqhAFFQs2DXqp6KIvUAUjTEaC9iisb72khcBA/ZEUINRFEskRikLSxTUCKJB5FXpAgPM/v5wzQnDDFV8BsbrWosVdjvnmT07zvmxy+RlWZYFAADATyy/qgcAAAD8PIgPAAAgCfEBAAAkIT4AAIAkxAcAAJCE+AAAAJIQHwAAQBLiAwAASEJ8AAAASYgPgCo0ZcqUyMvLi0GDBlX1UFiBXr16xZZbbrnOXzcvLy/69++/zl8XYH0mPoBqYdCgQZGXl1fuz2abbRb77rtvvPDCC8nH89prr5UbS0FBQbRu3Tp69uwZn3322Tp5j3/84x/Rv3//mDVr1jp5vaqQYj9tKKrD9xNgVWpW9QAA1qVrr702WrVqFVmWxddffx2DBg2KQw45JJ577rk47LDDko/nggsuiA4dOsTixYtj/Pjx8dBDD8Vf//rXeP/996NZs2Y/6rX/8Y9/xDXXXBO9evWKhg0brpsBV5Gfcj+trxYsWBA1a/73x3B1+n4CrIj4AKqV7t27R/v27XPTZ5xxRjRu3Dgee+yxKomPvffeO4499tiIiDjttNNi2223jQsuuCAGDx4cl19+efLxrK9+LvuptLQ0SkpKori4OIqLi6t6OADJuewKqNYaNmwYtWrVKvcvzBER8+fPj1//+tfRvHnzKCoqiu222y5uvfXWyLIsIn74V+k2bdpEmzZtYsGCBbntvvvuu2jatGl06tQpli5dusbj2W+//SIiYvLkyStdb9SoUbH33ntHnTp1omHDhnHkkUfGRx99lFvev3//+M1vfhMREa1atcpdtjRlypRVjmH5y9PK/rz22msV1u3Vq1el6y5/r8LTTz8d7du3j3r16pVb79Zbb13leCpT2X667777ol27dlFUVBTNmjWL3r17V7hEqWvXrrHDDjvEuHHjolOnTlGrVq1o1apVPPDAA+XWK7tMb/n9VXYZWGX7Ylm33nprdOrUKTbZZJOoVatW7L777vH0009XWC8vLy/69OkTQ4cOzY39b3/7W25Z2X5c2fezS5cusfPOO1c6ju222y4OOuiglY4VYH3izAdQrcyePTu++eabyLIsZsyYEXfffXfMmzcvTj755Nw6WZbFEUccEa+++mqcccYZscsuu8SLL74Yv/nNb+I///lP3HHHHVGrVq0YPHhwdO7cOa644oq4/fbbIyKid+/eMXv27Bg0aFDUqFFjjcf373//OyIiNtlkkxWu8/LLL0f37t2jdevW0b9//1iwYEHcfffd0blz5xg/fnxsueWWccwxx8THH38cjz32WNxxxx3RqFGjiIjYdNNNV2scBxxwQPTs2TMiIsaMGRN33XXXCtdt1KhR3HHHHbnpU045pdzyt956K44//vjYeeed46abbooGDRrEN998ExdddNFqjaUyy++n/v37xzXXXBPdunWL8847LyZNmhT3339/jBkzJkaPHh0FBQW5bWfOnBmHHHJIHH/88XHiiSfGk08+Geedd14UFhbG6aefvtZjWtadd94ZRxxxRPTo0SNKSkri8ccfj+OOOy6ef/75OPTQQ8utO2rUqHjyySejT58+0ahRo0pvXl/Z9/OUU06Js846Kz744IPYYYcdctuMGTMmPv7447jyyivXydcEkEQGUA0MHDgwi4gKf4qKirJBgwaVW3fYsGFZRGTXX399ufnHHntslpeXl3366ae5eZdffnmWn5+fvfHGG9lTTz2VRUT2P//zP6scz6uvvppFRPbwww9n//d//5dNmzYt++tf/5ptueWWWV5eXjZmzJgsy7
Js8uTJWURkAwcOzG27yy67ZJtttln27bff5ua9++67WX5+ftazZ8/cvFtuuSWLiGzy5MmrvZ9KSkqyiMj69OmTm1f2db366qsV1u/Ro0fWqlWrcvMiIuvXr19u+vLLL88iIvvqq69y88q+rltuuWWl41md/TRjxoyssLAwO/DAA7OlS5fmtr3nnnty25bp0qVLFhHZbbfdlpu3aNGi3D4tKSnJsuy/x8vy+65sPMvui1NPPTVr2bJlufW+//77ctMlJSXZDjvskO23334V9lV+fn72r3/9q8LXvvx+XNH3c9asWVlxcXH229/+ttz8Cy64IKtTp042b968Cq8NsL5y2RVQrdx7770xcuTIGDlyZAwZMiT23XffOPPMM+Mvf/lLbp0RI0ZEjRo14oILLii37a9//evIsqzc07H69+8f7dq1i1NPPTXOP//86NKlS4XtVub000+PTTfdNJo1axaHHnpozJ8/PwYPHlzuvpRlffXVVzFhwoTo1atXbLzxxrn5O+20UxxwwAExYsSI1X7vyixcuDAiYrXvNygpKYmioqKVrjN37tzIz8//UTdJr2w/vfzyy1FSUhJ9+/aN/Pz//tg666yzon79+vHXv/613GvVrFkzzjnnnNx0YWFhnHPOOTFjxowYN27cWo9xWbVq1cr9febMmTF79uzYe++9Y/z48RXW7dKlS2y//fZr/V4NGjSII488Mh577LHcZYFLly6NJ554Io466qioU6fOWr82QGouuwKqlT322KPcB/sTTzwxdt111+jTp08cdthhUVhYGJ9//nk0a9Ys6tWrV27btm3bRkTE559/nptXWFgYDz/8cHTo0CGKi4tj4MCBkZeXt9rjufrqq2PvvfeOGjVqRKNGjaJt27YV7j9ZVtl7b7fddhWWtW3bNl588cWYP3/+Wn/g/OabbyLihw+0q2PWrFlRt27dla7TsWPHuOeee+LCCy+MSy+9NBo0aBAzZ85co3GtbD+taJ8UFhZG69aty32/IiKaNWtWYf9su+22EfHD71XZa6+91mhslXn++efj+uuvjwkTJsSiRYty8ys7Nlq1avWj369nz57xxBNPxN///vfYZ5994uWXX46vv/66wiVwAOs78QFUa/n5+bHvvvvGnXfeGZ988km0a9dujV/jxRdfjIgfzhp88skna/Rhcscdd4xu3bqt8Xv+VMpusF7dX5o3ffr0aNmy5UrXOeGEE2L8+PFx9913x0MPPbRW40q9n1YUkKvzEIG///3vccQRR8Q+++wT9913XzRt2jQKCgpi4MCB8eijj1ZYf9mzJGvroIMOisaNG8eQIUNin332iSFDhkSTJk3Wq2MLYHW47Aqo9pYsWRIREfPmzYuIiJYtW8a0adNi7ty55dabOHFibnmZ9957L6699to47bTTYtddd40zzzwzZs+e/ZONtey9J02aVGHZxIkTo1GjRrl/1V+TMzBlxo4dGxGxwsu+lrV48eL49NNPc2eEViQ/Pz9uvfXW6Nq1a2yzzTa5S97WlRXtk5KSkpg8eXKFOJo2bVrMnz+/3LyPP/44Iv4bXRtttFFERIWnZS1/FqUyf/7zn6O4uDhefPHFOP3006N79+7rJAJW9v2sUaNGnHTSSfH000/HzJkzY9iwYXHiiSeu1UMPAKqS+ACqtcWLF8dLL70UhYWFuQ/RhxxySCxdujTuueeecuvecccdkZeXF927d89t26tXr2jWrFnceeedMWjQoPj6669/1FOcVqVp06axyy67xODBg8t9MP7ggw/ipZdeikMOOSQ3ryxC1uQ3Yj/99NOx3XbbRZs2bVa57vDhw2PBggW5x96uzN133x2jRo2KoUOHRrdu3aJz586rPaZV6datWxQWFsZdd92Vu+chImLAgAExe/bsCk+XWrJkSTz44IO56ZKSknjwwQdj0003jd133z0iIrbaaquIiHjjjTdy6y1dunS1ztzUqFEj8vLyyp0lmTJlSgwbNmytvr4yq/p+nn
LKKTFz5sw455xzKjzBDWBD4bIroFp54YUXcmcwZsyYEY8++mh88skncdlll0X9+vUjIuLwww+PfffdN6644oqYMmV
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"# Box plot of the 'Popularity' column, drawn on an explicit Axes object.\n",
"fig, ax = plt.subplots(figsize=(10, 6))\n",
"sns.boxplot(x=df['Popularity'], ax=ax)\n",
"ax.set_title('Box Plot для Popularity')\n",
"ax.set_xlabel('Popularity')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Решим проблему пустых значений при помощи удаления таких строк."
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# Drop every row that has at least one missing value (the dropna defaults, spelled out).\n",
"df_cleaned = df.dropna(axis=0, how='any')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 8650\n",
"Размер контрольной выборки: 2884\n",
"Размер тестовой выборки: 2884\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Step 1: hold out 20% of the cleaned rows as the test set.\n",
"train_val_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)\n",
"\n",
"# Step 2: carve 25% of the remaining rows out as the validation set\n",
"# (0.25 * 0.8 = 0.2 of the whole), i.e. roughly a 60/20/20 split.\n",
"train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42)\n",
"\n",
"print(\"Размер обучающей выборки:\", train_df.shape[0])\n",
"print(\"Размер контрольной выборки:\", val_df.shape[0])\n",
"print(\"Размер тестовой выборки:\", test_df.shape[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Оценка сбалансированности выборок: по результатам видно, что баланса тут мало."
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение Popularity в обучающей выборке:\n",
"Popularity\n",
"23 258\n",
"15 250\n",
"26 246\n",
"21 245\n",
"14 245\n",
" ... \n",
"84 1\n",
"87 1\n",
"91 1\n",
"79 1\n",
"86 1\n",
"Name: count, Length: 88, dtype: int64\n",
"\n",
"Распределение Popularity в контрольной выборке:\n",
"Popularity\n",
"17 90\n",
"26 86\n",
"21 83\n",
"24 83\n",
"28 80\n",
" ..\n",
"85 1\n",
"83 1\n",
"84 1\n",
"80 1\n",
"77 1\n",
"Name: count, Length: 85, dtype: int64\n",
"\n",
"Распределение Popularity в тестовой выборке:\n",
"Popularity\n",
"22 86\n",
"21 85\n",
"12 84\n",
"20 82\n",
"26 81\n",
" ..\n",
"76 2\n",
"71 2\n",
"79 1\n",
"82 1\n",
"80 1\n",
"Name: count, Length: 80, dtype: int64\n",
"\n"
]
}
],
"source": [
"def check_balance(df, name):\n",
"    \"\"\"Print how many rows of ``df`` carry each distinct 'Popularity' value.\n",
"\n",
"    ``name`` is a label for the subset; it is inserted into the printed header.\n",
"    A blank line is printed after the counts.\n",
"    \"\"\"\n",
"    popularity_counts = df['Popularity'].value_counts()\n",
"    # One print call with a newline separator emits the header, the counts and a trailing blank line.\n",
"    print(f\"Распределение Popularity в {name}:\", popularity_counts, \"\", sep=\"\\n\")\n",
"\n",
"\n",
"# Report the class distribution of each subset in turn.\n",
"for subset, subset_label in (\n",
"    (train_df, \"обучающей выборке\"),\n",
"    (val_df, \"контрольной выборке\"),\n",
"    (test_df, \"тестовой выборке\"),\n",
"):\n",
"    check_balance(subset, subset_label)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выполним оверсэмплинг и андерсэмплинг."
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение Popularity в обучающей выборке после oversampling:\n",
"Popularity\n",
"44 258\n",
"20 258\n",
"30 258\n",
"27 258\n",
"8 258\n",
" ... \n",
"78 258\n",
"79 258\n",
"74 258\n",
"81 258\n",
"86 258\n",
"Name: count, Length: 88, dtype: int64\n",
"\n",
"Распределение Popularity в контрольной выборке после oversampling:\n",
"Popularity\n",
"21 90\n",
"11 90\n",
"28 90\n",
"23 90\n",
"37 90\n",
" ..\n",
"61 90\n",
"84 90\n",
"80 90\n",
"77 90\n",
"0 90\n",
"Name: count, Length: 85, dtype: int64\n",
"\n",
"Распределение Popularity в тестовой выборке после oversampling:\n",
"Popularity\n",
"14 86\n",
"47 86\n",
"27 86\n",
"13 86\n",
"66 86\n",
" ..\n",
"63 86\n",
"79 86\n",
"71 86\n",
"82 86\n",
"80 86\n",
"Name: count, Length: 80, dtype: int64\n",
"\n"
]
}
],
"source": [
"from imblearn.over_sampling import RandomOverSampler\n",
"\n",
"def oversample(df):\n",
"    \"\"\"Return a new DataFrame balanced on 'Popularity' by random over-sampling.\n",
"\n",
"    Each distinct 'Popularity' value is treated as a class label. The resampled\n",
"    feature columns and the label are glued back together, so 'Popularity'\n",
"    becomes the last column of the result.\n",
"    \"\"\"\n",
"    features = df.drop(columns='Popularity')\n",
"    target = df['Popularity']\n",
"    features_balanced, target_balanced = RandomOverSampler(random_state=42).fit_resample(features, target)\n",
"    return pd.concat([features_balanced, target_balanced], axis=1)\n",
"\n",
"\n",
"# NOTE(review): the validation and test subsets are resampled too, which changes the\n",
"# distribution a model would be evaluated on; resampling is usually limited to the\n",
"# training subset - confirm this is intended.\n",
"train_df_oversampled, val_df_oversampled, test_df_oversampled = (\n",
"    oversample(subset) for subset in (train_df, val_df, test_df)\n",
")\n",
"\n",
"for balanced_subset, subset_label in (\n",
"    (train_df_oversampled, \"обучающей выборке после oversampling\"),\n",
"    (val_df_oversampled, \"контрольной выборке после oversampling\"),\n",
"    (test_df_oversampled, \"тестовой выборке после oversampling\"),\n",
"):\n",
"    check_balance(balanced_subset, subset_label)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение Popularity в обучающей выборке после undersampling:\n",
"Popularity\n",
"0 1\n",
"1 1\n",
"2 1\n",
"3 1\n",
"4 1\n",
" ..\n",
"84 1\n",
"85 1\n",
"86 1\n",
"87 1\n",
"91 1\n",
"Name: count, Length: 88, dtype: int64\n",
"\n",
"Распределение Popularity в контрольной выборке после undersampling:\n",
"Popularity\n",
"0 1\n",
"1 1\n",
"2 1\n",
"3 1\n",
"4 1\n",
" ..\n",
"82 1\n",
"83 1\n",
"84 1\n",
"85 1\n",
"87 1\n",
"Name: count, Length: 85, dtype: int64\n",
"\n",
"Распределение Popularity в тестовой выборке после undersampling:\n",
"Popularity\n",
"0 1\n",
"1 1\n",
"2 1\n",
"3 1\n",
"4 1\n",
" ..\n",
"76 1\n",
"77 1\n",
"79 1\n",
"80 1\n",
"82 1\n",
"Name: count, Length: 80, dtype: int64\n",
"\n"
]
}
],
"source": [
"from imblearn.under_sampling import RandomUnderSampler\n",
"\n",
"def undersample(df):\n",
"    \"\"\"Return a new DataFrame balanced on 'Popularity' by random under-sampling.\n",
"\n",
"    Each distinct 'Popularity' value is treated as a class label. The resampled\n",
"    feature columns and the label are glued back together, so 'Popularity'\n",
"    becomes the last column of the result.\n",
"    \"\"\"\n",
"    features = df.drop(columns='Popularity')\n",
"    target = df['Popularity']\n",
"    features_balanced, target_balanced = RandomUnderSampler(random_state=42).fit_resample(features, target)\n",
"    return pd.concat([features_balanced, target_balanced], axis=1)\n",
"\n",
"\n",
"# NOTE(review): the recorded output shows every class shrinking to a single row, because\n",
"# some 'Popularity' values occur only once; the validation and test subsets are resampled\n",
"# as well - confirm both effects are intended.\n",
"train_df_undersampled, val_df_undersampled, test_df_undersampled = (\n",
"    undersample(subset) for subset in (train_df, val_df, test_df)\n",
")\n",
"\n",
"for balanced_subset, subset_label in (\n",
"    (train_df_undersampled, \"обучающей выборке после undersampling\"),\n",
"    (val_df_undersampled, \"контрольной выборке после undersampling\"),\n",
"    (test_df_undersampled, \"тестовой выборке после undersampling\"),\n",
"):\n",
"    check_balance(balanced_subset, subset_label)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Police Shootings in the United States: 2015-2024"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"В этом наборе данных, составленном The Washington Post, регистрируется каждый человек, застреленный дежурным полицейским в Соединенных Штатах с 2015 по 2024 год. Он решает проблему занижения органами власти статистики реальных инцидентов. Это может быть использовано в журналистской работе, например, для прогнозирования или выявления закономерностей преступлений. Цель технического проекта — установить закономерность в убийствах полицейскими определённых групп граждан. Входные данные: возраст, пол, штат, вооружённость. Целевой признак: общий портрет убитого гражданина."
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['date', 'name', 'age', 'gender', 'armed', 'race', 'city', 'state',\n",
" 'flee', 'body_camera', 'signs_of_mental_illness',\n",
" 'police_departments_involved'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"# Load the Washington Post police-shootings export and list its columns.\n",
"# Note: this rebinds ``df``, which held a different dataset in the cells above.\n",
"df = pd.read_csv(filepath_or_buffer=\".//static//csv//2024-07-23-washington-post-police-shootings-export.csv\")\n",
"print(df.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"При помощи ящика с усами и колонки возраста проверим набор на баланс. Он достаточно сбалансирован."
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx8AAAIjCAYAAABia6bHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAuRklEQVR4nO3deZhWdd348c8MywwKM4gLOwhmYCZpCoXkgxioiKkRmokpiY9omICWSaRQuSWWlahAoegjuWDmkpiioKaSIobYj0DKcQOBTBkWGYGZ8/uDa+4YhmVE/A7L63Vdc+V9tvs70wHOe85y52VZlgUAAMCnLL+2BwAAAOwexAcAAJCE+AAAAJIQHwAAQBLiAwAASEJ8AAAASYgPAAAgCfEBAAAkIT4AAIAkxAfAbuCNN96IvLy8mDhxYm0PBYDdmPgA+BgmTpwYeXl5Vb7222+/6NGjRzz66KPJx/PUU09VGUu9evWiffv2cdZZZ8Xrr7++Xd7j+eefj1GjRsWyZcu2y/YA2H3Vre0BAOyMfvrTn0a7du0iy7JYsmRJTJw4MU444YR4+OGH48QTT0w+nosuuig6d+4ca9eujZdffjnGjx8fjzzySLz66qvRokWLT7Tt559/Pn7yk5/EgAEDonHjxttnwADslsQHwDbo3bt3HHHEEbnXAwcOjKZNm8Zdd91VK/Fx1FFHRb9+/SIi4jvf+U589rOfjYsuuihuv/32GD58ePLxAMCmuOwKYDto3LhxNGjQIOrWrfo7nVWrVsUll1wSrVu3joKCgujQoUNcf/31kWVZRESsXr06OnbsGB07dozVq1fn1nv//fejefPmceSRR0Z5efnHHs8xxxwTERElJSVbXG7atGlx1FFHxZ577hmNGzeOk08+Of7xj3/k5o8aNSp+8IMfREREu3btcpd3vfHGG1sdw8aXp1V+PfXUU9WWHTBgwCaXHTVqVJXl7rvvvjjiiCOiUaNGVZa7/vrrtziW999/P77//e/HIYccEg0bNoyioqLo3bt3vPLKK9WWffPNN+Okk06KPffcM/bbb78YNmxYPPbYY5sc+wsvvBDHH398FBcXxx577BHdu3eP5557bqs/G4DdlTMfANugtLQ03nvvvciyLJYuXRo33nhjrFy5Ms4888zcMlmWxUknnRTTp0+PgQMHxqGHHhqPPfZY/OAHP4iFCxfGDTfcEA0aNIjbb789unXrFiNGjIhf/vKXERExePDgKC0tjYkTJ0adOnU+9vj+9a9/RUTE3nvvvdllnnjiiejdu3e0b98+Ro0aFatXr44bb7wxunXrFi+//HLsv//+0bdv33jttdfirrvuihtuuCH22WefiIjYd999azSOXr16xVlnnRURETNnzozf/OY3m112n332iRtuuCH3+tvf/naV+TNmzIjTTjstvvCFL8S1114bxcXF8d5778WwYcO2Oo7XX389HnjggTj11FOjXbt2sWTJkhg3blx079495s6dm7s0bdWqVXHMMcfEu+++G0OGDIlmzZrF73//+5g+fXq1bU6bNi169+4dhx9+eIwcOTLy8/Pjtttui2OOOSb+8pe/RJcuXWr0MwLYrWQA1Nhtt92WRUS1r4KCgmzixIlVln3ggQeyiMiuvPLKKtP79euX5eXlZf/85z9z04YPH57l5+dnzzzzTDZ58uQsIrJf/epXWx3P9OnTs4jIbr311uzf//53tmjRouyRRx7J9t9//ywvLy+bOXNmlmVZVlJSkkVEdtttt+XWPfTQQ7P99tsv+89//pOb9sorr2T5+fnZWWedlZs2evToLCKykpKSGv+c1qxZk0VEduGFF+amVX5f06dPr7Z8//79s3bt2lWZFhHZyJEjc6+HDx+eRUT27rvv5qZVfl+jR4/e4njKysqy8vLyKtNKSkqygoKC7Kc//Wlu2i9+8YssIrIHHnggN2316tVZx44dq4y9oqIiO/DAA7Pjjjsuq6ioyC374YcfZu
3atct69eq1xfEA7K5cdgWwDW666aaYOnVqTJ06Ne68887o0aNHnHvuuXH//ffnlpkyZUrUqVMnLrrooirrXnLJJZFlWZWnY40aNSoOPvjgOPvss+O73/1udO/evdp6W3LOOefEvvvuGy1atIg+ffrEqlWr4vbbb69yX8qG3n333Zg9e3YMGDAgmjRpkpveqVOn6NWrV0yZMqXG770pZWVlERFRWFhYo+XXrFkTBQUFW1xmxYoVkZ+fv003vRcUFER+/vp/8srLy+M///lPNGzYMDp06BAvv/xybrk///nP0bJlyzjppJNy0woLC+N///d/q2xv9uzZsWDBgjjjjDPiP//5T7z33nvx3nvvxapVq+KrX/1qPPPMM1FRUfGxxwmwq3PZFcA26NKlS5UD+29961tx2GGHxYUXXhgnnnhi1K9fP958881o0aJFNGrUqMq6Bx10UESsv7egUv369ePWW2+Nzp07R2FhYdx2222Rl5dX4/FcccUVcdRRR0WdOnVin332iYMOOqja/ScbqnzvDh06VJt30EEHxWOPPRarVq2KPffcs8Zj2NB7770XERHFxcU1Wn7ZsmXRsGHDLS7TtWvXGDNmTAwZMiQuvfTSKC4ujg8++KBG26+oqIhf//rXcfPNN0dJSUmV+2g2vDTtzTffjAMOOKDaz/4zn/lMldcLFiyIiIizzz57s+9ZWloae+21V43GB7C7EB8A20F+fn706NEjfv3rX8eCBQvi4IMP/tjbeOyxxyJi/VmDBQsWRLt27Wq87iGHHBI9e/b82O/5aam8IX3//fev0fKLFy+Otm3bbnGZ008/PV5++eW48cYbY/z48R9rPFdffXVcfvnlcc4558TPfvazaNKkSeTn58fQoUO36QxF5TqjR4+OQw89dJPLbC2mAHZH4gNgO1m3bl1ERKxcuTIiItq2bRtPPPFErFixosrZj3nz5uXmV5ozZ0789Kc/je985zsxe/bsOPfcc+PVV1+t8ZmDj6vyvefPn19t3rx582KfffbJnfX4OGdgKr300ksREZu97GtDa9eujX/+859x/PHHb3G5/Pz8uP766+PVV1+NkpKSuPnmm2PJkiVVbvLfnPvuuy969OgREyZMqDJ92bJluZvoI9b/XObOnRtZllX5vv/5z39WWe+AAw6IiIiioqIdKvoAdnTu+QDYDtauXRuPP/541K9fP3dZ1QknnBDl5eUxZsyYKsvecMMNkZeXF717986tO2DAgGjRokX8+te/jokTJ8aSJUtq9BSnbdW8efM49NBD4/bbb6/yyeV///vf4/HHH48TTjghN60yQj7OJ5zfd9990aFDh+jYseNWl33wwQdj9erVuccDb8mNN94Y06ZNi0mTJkXPnj2jW7duNRpPnTp1co83rjR58uRYuHBhlWnHHXdcLFy4MB566KHctLKysvjtb39bZbnDDz88DjjggLj++utzsbmhf//73zUaF8DuxpkPgG3w6KOP5s5gLF26NH7/+9/HggUL4rLLLouioqKIiPja174WPXr0iBEjRsQbb7wRX/jCF+Lxxx+PBx98MIYOHZr77fmVV14Zs2fPjieffDIaNWoUnTp1iiuuuCJ+/OMfR79+/aqEwPY0evTo6N27d3Tt2jUGDhyYe9RucXFxlc/XOPzwwyMiYsSIEXH66adHvXr14mtf+9om7wd5/fXX47rrrosXX3wx+vbtG3feeWdu3syZMyMiYurUqdGmTZto1qxZjBw5Mm6++eY48sgj49hjj93ieP/f//t/cemll8aoUaOic+fOH+t7PfHEE3Nnlo488sh49dVXY9KkSdG+ffsqyw0aNCjGjBkT3/rWt2LIkCHRvHnzmDRpUu7G+cqzIfn5+fG73/0uevfuHQcffHB85zvfiZYtW8bChQtj+vTpUVRUFA8//PDHGiPAbqGWn7YFsFPZ1KN2CwsLs0MPPTS75ZZbqjx2NcuybMWKFdmwYcOyFi1aZPXq1csOPPDAbPTo0b
nlZs2aldWtWzf73ve+V2W9devWZZ07d85atGiRffDBB5sdT+WjdidPnrzFcW/qUbtZlmVPPPFE1q1bt6xBgwZZUVF
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"# Box plot of the 'age' column, drawn on an explicit Axes object.\n",
"fig, ax = plt.subplots(figsize=(10, 6))\n",
"sns.boxplot(x=df['age'], ax=ax)\n",
"ax.set_title('Box Plot для age')\n",
"ax.set_xlabel('Age')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Теперь проверим на шум: здесь тоже особо проблем нет, однако смущает сочетание white и black; вероятно, это мулаты."
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABAIAAAIjCAYAAACZALkcAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACg80lEQVR4nOzdeVhU9f4H8PcIDNswAzIqiwvk4A5hagsqi1a4pqhZpBnazcq0qMglLbfcKkvT22IpWLmkKdxuXlNTcEFzKXC/hSaupIEwI/t2fn94mZ/jMHOQwziD8349zzyPns/5fs/nfOfMmXM+nDlHJgiCACIiIiIiIiKyC02snQARERERERER3T0sBBARERERERHZERYCiIiIiIiIiOwICwFEREREREREdoSFACIiIiIiIiI7wkIAERERERERkR1hIYCIiIiIiIjIjrAQQERERERERGRHWAggIiIiIiIisiMsBBARERHdoezsbMhkMiQlJVk7FSIiojvGQgARERHpHT9+HCNGjECbNm3g4uICf39/PPbYY1i2bJnFlrl27VosWbLEaPqVK1cwa9YsZGZmWmzZt0tLS4NMJtO/nJyccN9992HMmDH4888/G2QZ+/fvx6xZs1BQUNAg/REREd0pFgKIiIgIwM0T1O7du+Po0aN44YUXsHz5cvzjH/9AkyZNsHTpUost11whYPbs2Xe1EFDj1VdfxTfffIMVK1Zg4MCB+O6779CjRw9cuXJFct/79+/H7NmzWQggIiKrcbR2AkRERGQb5s2bB5VKhcOHD8PT09Mgdu3aNeskZQFFRUVwd3c3O0/v3r0xYsQIAMDYsWPRrl07vPrqq1i9ejWmTZt2N9IkKystLYVcLkeTJvy7GRHde7hnIyIiIgDA2bNn0blzZ6MiAAA0b97caNq3336LBx98EG5ubvDy8kJ4eDi2b9+uj//rX//CwIED4efnB2dnZ7Rt2xZz585FVVWVfp7IyEhs2bIF58+f11+OHxAQgLS0NPTo0QPAzRPxmtitv8k/ePAg+vXrB5VKBTc3N0RERCA9Pd0gx1mzZkEmk+HUqVN45pln4OXlhV69et3x2PTp0wcAcO7cObPz7dq1C71794a7uzs8PT0xZMgQnD592iCft956CwAQGBioX6/s7GzRHG79ycKtr7S0NKN54+Liap131qxZBvN9//336N69Ozw8PAzm+/DDD83mcv36dSQkJCA4OBgKhQJKpRL9+/fH0aNHjeY9f/48nnjiCbi7u6N58+Z4/fXXsW3btlpzr8t7Wpuan3Tc3t/AgQNrXW9T7devX48ZM2bA398fbm5u0Ol0d7SupaWlmDVrFtq1awcXFxf4+vpi2LBhOHv2rH6e6upqLFmyBJ07d4aLiwtatGiBF198Efn5+aLrSUTUUHhFABEREQEA2rRpgwMHDuDEiRPo0qWL2Xlnz56NWbNmISwsDHPmzIFcLsfBgwexa9cuPP744wCApKQkKBQKvPHGG1AoFNi1axfeffdd6HQ6fPDBBwCA6dOnQ6vV4tKlS/j4448BAAqFAh07dsScOXPw7rvvYvz48ejduzcAICwsDMDNE+7+/fujW7dumDlzJpo0aYLExET06dMHe/fuxYMPPmiQ75NPPomgoCDMnz8fgiDc8djUnMh5e3ubnOfnn39G//79cd9992HWrFkoKSnBsmXL0LNnT/z2228ICAjAsGHD8Mcff2DdunX4+OOPoVarAQDNmjWrUx6PPfYYxowZAwA4fPgwPvnkE5PzqtVq/ZgCwLPPPmsQP3DgAEaOHIn7778fCxcuhEqlQm5uLl5//XXRPP7880+kpKTgySefRGBgIK5evYovvvgCEREROHXqFPz8/ADcvPqiT58+yMnJwWuvvQYfHx+sXbsWqampRn3e6XsqZs+ePfjPf/5zR23mzp0LuVyOhIQElJWVQS6X49SpU3Va16qqKgwaNAg7d+7E008/jddeew03btzAjh
07cOLECbRt2xYA8OKLLyIpKQljx47Fq6++inPnzmH58uXIyMhAeno6nJyc7ihnIqJ6EYiIiIgEQdi+fbvg4OAgODg4CI888ogwefJkYdu2bUJ5ebnBfFlZWUKTJk2EmJgYoaqqyiBWXV2t/3dxcbHRMl588UXBzc1NKC0t1U8bOHCg0KZNG6N5Dx8+LAAQEhMTjZYRFBQkREdHGy0vMDBQeOyxx/TTZs6cKQAQYmNj6zQGqampAgBh1apVwt9//y1cuXJF2LJlixAQECDIZDLh8OHDgiAIwrlz54xyCw0NFZo3by7k5eXppx09elRo0qSJMGbMGP20Dz74QAAgnDt3rk45CYIglJeXCwCEiRMn6qdt3LhRACCkpqYazT9q1CghMDDQYBoAYebMmfr/T5s2TQAg5OTk6KfVrNcHH3xgNp/S0lKj9/7cuXOCs7OzMGfOHP20xYsXCwCElJQU/bSSkhKhQ4cOBrnfyXtam5r37daxeOihh4T+/fsbrbe59vfdd5/RdlvXdV21apUAQPjoo4+M+q9Zp7179woAhDVr1hjEf/rpp1qnExFZCn8aQERERABu/rX5wIEDeOKJJ3D06FG8//77iI6Ohr+/P3744Qf9fCkpKaiursa7775r9PtpmUym/7erq6v+3zdu3EBubi569+6N4uJi/Pe//613npmZmcjKysIzzzyDvLw85ObmIjc3F0VFRejbty/27NmD6upqgzYvvfTSHS1j3LhxaNasGfz8/DBw4EAUFRVh9erV6N69e63z5+TkIDMzE3FxcWjatKl+ekhICB577LE7/sv07UpLSwEALi4udZq/vLwczs7OZue5ceMGmjRpUutPQcQ4Ozvr3/uqqirk5eVBoVCgffv2+O233/Tz/fTTT/D398cTTzyhn+bi4oIXXnjBoL/6vKfmbN68GYcPH8bChQvvaL2ee+45g+32TtZ106ZNUKvVmDRpklG/NZ+LjRs3QqVS4bHHHtOvY25uLrp16waFQlHrlRJERJbAnwYQERGRXo8ePbB582aUl5fj6NGjSE5Oxscff4wRI0YgMzMTnTp1wtmzZ9GkSRN06tTJbF8nT57EjBkzsGvXLuh0OoOYVqutd45ZWVkAbp60maLVauHl5aX/f2Bg4B0t491330Xv3r3h4OAAtVqNjh07wtHR9GHT+fPnAQDt27c3inXs2BHbtm2r000KTcnNzQUAqFSqOs1fUFAAhUJhdp5HHnkEy5cvx2uvvYbJkydDpVLV+Xfq1dXVWLp0KT799FOcO3fO4L4Pt/584vz582jbtq1BgQgANBqNwf/r856aUlVVhbfffhujRo1CSEhIndanRm3bSV3X9ezZs2jfvr3Z7SQrKwtarbbWe24A99ZNOYnItrEQQEREREbkcjl69OiBHj16oF27dhg7diw2btyImTNn1ql9QUEBIiIioFQqMWfOHLRt2xYuLi747bffMGXKlDv66+7tatp+8MEHCA0NrXWe20+Cb/8rr5jg4GA8+uij9crPEmpuJhgQEFCn+f/66y+0adPG7DxPP/00fvvtNyxbtgwrVqy4o3zmz5+Pd955B+PGjcPcuXPRtGlTNGnSBPHx8fV6b+vznpqycuVKZGdnY9u2bXecR23bSUOua3V1NZo3b441a9bUGq/rvSKIiKRiIYCIiIjMqrkcPicnBwDQtm1bVFdX49SpUyZP2tLS0pCXl4fNmzcjPDxcP722u+7f/tdisek1N11TKpU2c7Jec9L9+++/G8X++9//Qq1W668GMLVe5hw5cgQATP404VYVFRU4c+YM+vXrZ3a+Jk2a4MMPP8Tx48dx7tw5fPrpp7h69SpGjx4tuozvv/8eUVFRWLlypcH0goIC/Q0QgZvjcurUKQiCYLDeZ86cMWjXUO9pcXExZs+ejQkTJogWQuqqruvatm1bHDx4EBUVFSZv+Ne2bVv8/PPP6Nmz5x0Xp4iIGhLvEUBEREQAgNTU1FrvqF/z+/aay96HDh
2KJk2aYM6cOUZ/Ea1p7+DgYPB/4Obv1j/99FOj/t3d3Wv9qUDNiXNBQYHB9G7duqFt27b48MMPUVhYaNTu77//Nrm
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"# Scatter plot of 'race' against 'age', drawn on an explicit Axes object.\n",
"fig, ax = plt.subplots(figsize=(10, 6))\n",
"sns.scatterplot(data=df, x='age', y='race', ax=ax)\n",
"ax.set_title('Scatter Plot для age и race')\n",
"ax.set_xlabel('Age')\n",
"ax.set_ylabel('Race')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Удаление строк с пустыми значениями"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"# Drop every row that has at least one missing value (the dropna defaults, spelled out).\n",
"df_cleaned = df.dropna(axis=0, how='any')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 4770\n",
"Размер контрольной выборки: 1591\n",
"Размер тестовой выборки: 1591\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Step 1: hold out 20% of the cleaned rows as the test set.\n",
"train_val_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)\n",
"\n",
"# Step 2: carve 25% of the remaining rows out as the validation set\n",
"# (0.25 * 0.8 = 0.2 of the whole), i.e. roughly a 60/20/20 split.\n",
"train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42)\n",
"\n",
"print(\"Размер обучающей выборки:\", train_df.shape[0])\n",
"print(\"Размер контрольной выборки:\", val_df.shape[0])\n",
"print(\"Размер тестовой выборки:\", test_df.shape[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Применение методов приращения данных (аугментации)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение reace в обучающей выборке после oversampling:\n",
"race\n",
"Black 2187\n",
"White 2187\n",
"Hispanic 2187\n",
"Unknown 2187\n",
"Native American 2187\n",
"Asian 2187\n",
"White,Black,Native American 2187\n",
"Other 2187\n",
"White,Black 2187\n",
"Name: count, dtype: int64\n",
"\n",
"Распределение reace в контрольной выборке после oversampling:\n",
"race\n",
"White 718\n",
"Black 718\n",
"Unknown 718\n",
"Hispanic 718\n",
"Asian 718\n",
"Native American 718\n",
"Other 718\n",
"Name: count, dtype: int64\n",
"\n",
"Распределение reace в тестовой выборке после oversampling:\n",
"race\n",
"Unknown 750\n",
"White 750\n",
"Black 750\n",
"Hispanic 750\n",
"Asian 750\n",
"Native American 750\n",
"Black,Hispanic 750\n",
"Other 750\n",
"White,Black 750\n",
"Native American,Hispanic 750\n",
"Name: count, dtype: int64\n",
"\n",
"Распределение reace в обучающей выборке после undersampling:\n",
"race\n",
"Asian 1\n",
"Black 1\n",
"Hispanic 1\n",
"Native American 1\n",
"Other 1\n",
"Unknown 1\n",
"White 1\n",
"White,Black 1\n",
"White,Black,Native American 1\n",
"Name: count, dtype: int64\n",
"\n",
"Распределение reace в контрольной выборке после undersampling:\n",
"race\n",
"Asian 7\n",
"Black 7\n",
"Hispanic 7\n",
"Native American 7\n",
"Other 7\n",
"Unknown 7\n",
"White 7\n",
"Name: count, dtype: int64\n",
"\n",
"Распределение reace в тестовой выборке после undersampling:\n",
"race\n",
"Asian 1\n",
"Black 1\n",
"Black,Hispanic 1\n",
"Hispanic 1\n",
"Native American 1\n",
"Native American,Hispanic 1\n",
"Other 1\n",
"Unknown 1\n",
"White 1\n",
"White,Black 1\n",
"Name: count, dtype: int64\n",
"\n"
]
}
],
"source": [
"from imblearn.over_sampling import RandomOverSampler\n",
"# Imported here as well so the cell does not depend on an earlier cell for this name.\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"\n",
"# NOTE: check_balance, oversample and undersample below replace the 'Popularity'\n",
"# versions defined earlier in this notebook; from here on they work on the 'race' column.\n",
"\n",
"\n",
"def check_balance(df, name):\n",
"    \"\"\"Print how many rows of ``df`` carry each distinct 'race' value.\n",
"\n",
"    ``name`` is a label for the subset; it is inserted into the printed header.\n",
"    \"\"\"\n",
"    counts = df['race'].value_counts()\n",
"    print(f\"Распределение race в {name}:\")\n",
"    print(counts)\n",
"    print()\n",
"\n",
"\n",
"def _resample_by_race(df, sampler):\n",
"    \"\"\"Resample ``df`` with ``sampler`` using 'race' as the class label.\n",
"\n",
"    Returns a new DataFrame in which 'race' is the last column.\n",
"    \"\"\"\n",
"    X = df.drop('race', axis=1)\n",
"    y = df['race']\n",
"    X_resampled, y_resampled = sampler.fit_resample(X, y)\n",
"    return pd.concat([X_resampled, y_resampled], axis=1)\n",
"\n",
"\n",
"def oversample(df):\n",
"    \"\"\"Balance ``df`` on 'race' by random over-sampling (fixed seed 42).\"\"\"\n",
"    return _resample_by_race(df, RandomOverSampler(random_state=42))\n",
"\n",
"\n",
"def undersample(df):\n",
"    \"\"\"Balance ``df`` on 'race' by random under-sampling (fixed seed 42).\"\"\"\n",
"    return _resample_by_race(df, RandomUnderSampler(random_state=42))\n",
"\n",
"\n",
"# NOTE(review): the validation and test subsets are resampled too, which changes the\n",
"# distribution a model would be evaluated on; resampling is usually limited to the\n",
"# training subset - confirm this is intended.\n",
"train_df_oversampled = oversample(train_df)\n",
"val_df_oversampled = oversample(val_df)\n",
"test_df_oversampled = oversample(test_df)\n",
"\n",
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n",
"check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n",
"check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")\n",
"\n",
"train_df_undersampled = undersample(train_df)\n",
"val_df_undersampled = undersample(val_df)\n",
"test_df_undersampled = undersample(test_df)\n",
"\n",
"check_balance(train_df_undersampled, \"обучающей выборке после undersampling\")\n",
"check_balance(val_df_undersampled, \"контрольной выборке после undersampling\")\n",
"check_balance(test_df_undersampled, \"тестовой выборке после undersampling\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "aimenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}