diff --git a/lab_4/lab4.ipynb b/lab_4/lab4.ipynb index b677942..11a3ea3 100644 --- a/lab_4/lab4.ipynb +++ b/lab_4/lab4.ipynb @@ -72,6 +72,117 @@ "print(df.head())\n", "print(df.columns)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Предобработка данных" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ID 0\n", + "Price 0\n", + "Levy 0\n", + "Manufacturer 0\n", + "Model 0\n", + "Prod. year 0\n", + "Category 0\n", + "Leather interior 0\n", + "Fuel type 0\n", + "Engine volume 0\n", + "Mileage 0\n", + "Cylinders 0\n", + "Gear box type 0\n", + "Drive wheels 0\n", + "Doors 0\n", + "Wheel 0\n", + "Color 0\n", + "Airbags 0\n", + "dtype: int64\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Egor\\AppData\\Local\\Temp\\ipykernel_16964\\3618217033.py:9: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df['Levy'].fillna(df['Levy'].median(), inplace=True)\n", + "C:\\Users\\Egor\\AppData\\Local\\Temp\\ipykernel_16964\\3618217033.py:10: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. 
def _extract_numeric(series):
    """Return ``series`` as numbers, with unparseable entries set to NaN.

    A column that already has a numeric dtype is returned unchanged.
    Otherwise the first (optionally signed, optionally decimal) number found
    in each value's string form is parsed, so a value made of a number plus
    trailing text still yields its number.

    Parameters
    ----------
    series : pandas.Series
        Column to convert.

    Returns
    -------
    pandas.Series
        Numeric Series aligned with the input index.
    """
    if pd.api.types.is_numeric_dtype(series):
        return series
    # expand=False returns the single capture group as a Series, not a frame.
    first_number = series.astype(str).str.extract(r'(-?\d+(?:\.\d+)?)', expand=False)
    return pd.to_numeric(first_number, errors='coerce')


# --- Preprocessing ---------------------------------------------------------

# Missing values per column. This runs BEFORE the numeric coercion below, so
# non-numeric placeholder strings are not counted as missing here.
print(df.isnull().sum())

# Coerce 'Levy' to numbers; anything non-numeric becomes NaN.
df['Levy'] = pd.to_numeric(df['Levy'], errors='coerce')
# Fix: 'Mileage' was previously built from df['Levy'], which silently replaced
# every mileage with the levy value. It is now derived from its own column.
# NOTE(review): the raw format of 'Mileage' is not visible here; the helper
# tolerates a numeric column as well as "<number><text>" strings - confirm.
df['Mileage'] = _extract_numeric(df['Mileage'])

# Fill gaps with the column median. Assign the result back instead of calling
# fillna(..., inplace=True) on df[col]: that chained form raises a
# FutureWarning and has no effect on df from pandas 3.0 onwards.
df['Levy'] = df['Levy'].fillna(df['Levy'].median())
df['Mileage'] = df['Mileage'].fillna(df['Mileage'].median())

# --- Feature definitions and encoding ---------------------------------------

# Numeric and categorical feature names.
# NOTE(review): only 'Levy' and 'Mileage' are coerced to numbers above; the
# other numeric_features are assumed to be numeric already - confirm.
numeric_features = ['Levy', 'Prod. year', 'Engine volume', 'Mileage', 'Cylinders', 'Airbags']
categorical_features = ['Manufacturer', 'Model', 'Category', 'Leather interior', 'Fuel type', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color']

# One-hot encode the categorical columns (the first level of each is dropped).
# This replaces the original columns, including the original 'Category'.
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)

# --- Train / test split ------------------------------------------------------

# Regression task: predict 'Price'.
# 'Category' is dropped defensively (errors='ignore'): on a first run the
# column does not exist yet, but when this cell is re-run the price-derived
# label created below would otherwise leak into the regression features.
X_reg = df.drop(columns=['ID', 'Price', 'Category'], errors='ignore')
y_reg = df['Price']

# Classification task: price band derived from 'Price'.
# include_lowest=True keeps Price == 0 inside the first band; with the default
# right-closed bins it would fall outside every interval and become NaN.
df['Category'] = pd.cut(
    df['Price'],
    bins=[0, 10000, 20000, np.inf],
    labels=['Эконом', 'Средний', 'Премиум'],
    include_lowest=True,
)
X_class = df.drop(['ID', 'Price', 'Category'], axis=1)
y_class = df['Category']

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)