4 punkt

2024-11-01 13:05:34 +04:00 · 2024-11-01 13:05:34 +04:00 · dc5d1ac892
commit dc5d1ac892
parent 0a65d77a16
1 changed files with 111 additions and 0 deletions
--- a/lab_4/lab4.ipynb
+++ b/lab_4/lab4.ipynb
@ -72,6 +72,117 @@
    "print(df.head())\n",
    "print(df.columns)"
   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Предобработка данных"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "ID                  0\n",
+      "Price               0\n",
+      "Levy                0\n",
+      "Manufacturer        0\n",
+      "Model               0\n",
+      "Prod. year          0\n",
+      "Category            0\n",
+      "Leather interior    0\n",
+      "Fuel type           0\n",
+      "Engine volume       0\n",
+      "Mileage             0\n",
+      "Cylinders           0\n",
+      "Gear box type       0\n",
+      "Drive wheels        0\n",
+      "Doors               0\n",
+      "Wheel               0\n",
+      "Color               0\n",
+      "Airbags             0\n",
+      "dtype: int64\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\Egor\\AppData\\Local\\Temp\\ipykernel_16964\\3618217033.py:9: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
+      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
+      "\n",
+      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
+      "\n",
+      "\n",
+      "  df['Levy'].fillna(df['Levy'].median(), inplace=True)\n",
+      "C:\\Users\\Egor\\AppData\\Local\\Temp\\ipykernel_16964\\3618217033.py:10: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
+      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
+      "\n",
+      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
+      "\n",
+      "\n",
+      "  df['Mileage'].fillna(df['Mileage'].median(), inplace=True)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Count missing values per column before any cleaning\n",
+    "print(df.isnull().sum())\n",
+    "\n",
+    "# Coerce 'Levy' and 'Mileage' to numbers; entries that cannot be parsed become NaN\n",
+    "df['Levy'] = pd.to_numeric(df['Levy'], errors='coerce')\n",
+    "# Fix: 'Mileage' was previously overwritten with the converted 'Levy' values.\n",
+    "# NOTE(review): assumes mileage strings may carry a 'km' suffix - confirm against the raw data.\n",
+    "mileage_text = df['Mileage'].astype(str).str.replace('km', '', regex=False).str.strip()\n",
+    "df['Mileage'] = pd.to_numeric(mileage_text, errors='coerce')\n",
+    "\n",
+    "# Fill the gaps with the column median. The result is assigned back because\n",
+    "# df[col].fillna(..., inplace=True) is chained assignment and raises a pandas FutureWarning.\n",
+    "df['Levy'] = df['Levy'].fillna(df['Levy'].median())\n",
+    "df['Mileage'] = df['Mileage'].fillna(df['Mileage'].median())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Column groups: categorical columns get one-hot encoded, numeric ones are listed for reference\n",
+    "categorical_features = [\n",
+    "    'Manufacturer', 'Model', 'Category', 'Leather interior', 'Fuel type',\n",
+    "    'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color',\n",
+    "]\n",
+    "numeric_features = [\n",
+    "    'Levy', 'Prod. year', 'Engine volume',\n",
+    "    'Mileage', 'Cylinders', 'Airbags',\n",
+    "]\n",
+    "\n",
+    "# One-hot encode the categorical columns; drop_first=True omits the first level of each\n",
+    "df = pd.get_dummies(data=df, columns=categorical_features, drop_first=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Разделение данных на тренировочный и тестовый наборы"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Feature matrix shared by both tasks: everything except the identifier, the price and\n",
+    "# the price-derived class label. errors='ignore' keeps the cell re-runnable: on a second\n",
+    "# run df already holds 'Category', which must not leak into the regression features.\n",
+    "features = df.drop(columns=['ID', 'Price', 'Category'], errors='ignore')\n",
+    "\n",
+    "# Regression task: predict the raw price\n",
+    "X_reg = features\n",
+    "y_reg = df['Price']\n",
+    "\n",
+    "# Classification task: predict the price band.\n",
+    "# include_lowest=True puts Price == 0 into the first band instead of turning it into NaN.\n",
+    "df['Category'] = pd.cut(df['Price'], bins=[0, 10000, 20000, np.inf], labels=['Эконом', 'Средний', 'Премиум'], include_lowest=True)\n",
+    "X_class = features.copy()\n",
+    "y_class = df['Category']\n",
+    "\n",
+    "# Same test_size and random_state for both tasks\n",
+    "X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
+    "X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)"
+   ]
  }
 ],
 "metadata": {