ПИбд-32 Петрушин Егор Лабораторная работа 4 #5
111
lab_4/lab4.ipynb
111
lab_4/lab4.ipynb
@ -72,6 +72,117 @@
|
||||
"print(df.head())\n",
|
||||
"print(df.columns)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Предобработка данных"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"ID 0\n",
|
||||
"Price 0\n",
|
||||
"Levy 0\n",
|
||||
"Manufacturer 0\n",
|
||||
"Model 0\n",
|
||||
"Prod. year 0\n",
|
||||
"Category 0\n",
|
||||
"Leather interior 0\n",
|
||||
"Fuel type 0\n",
|
||||
"Engine volume 0\n",
|
||||
"Mileage 0\n",
|
||||
"Cylinders 0\n",
|
||||
"Gear box type 0\n",
|
||||
"Drive wheels 0\n",
|
||||
"Doors 0\n",
|
||||
"Wheel 0\n",
|
||||
"Color 0\n",
|
||||
"Airbags 0\n",
|
||||
"dtype: int64\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\Users\\Egor\\AppData\\Local\\Temp\\ipykernel_16964\\3618217033.py:9: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
|
||||
"The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
|
||||
"\n",
|
||||
"For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" df['Levy'].fillna(df['Levy'].median(), inplace=True)\n",
|
||||
"C:\\Users\\Egor\\AppData\\Local\\Temp\\ipykernel_16964\\3618217033.py:10: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
|
||||
"The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
|
||||
"\n",
|
||||
"For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" df['Mileage'].fillna(df['Mileage'].median(), inplace=True)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Проверка наличия пропущенных значений\n",
|
||||
"print(df.isnull().sum())\n",
|
||||
"\n",
|
||||
"# Очистка столбца 'Levy' от нечисловых значений\n",
|
||||
"df['Levy'] = pd.to_numeric(df['Levy'], errors='coerce')\n",
|
||||
"df['Mileage'] = pd.to_numeric(df['Mileage'].astype(str).str.replace(r'[^0-9.]', '', regex=True), errors='coerce')  # keep only digits and the decimal point of 'Mileage', then parse\n",
|
||||
"\n",
|
||||
"# Заполнение пропущенных значений\n",
|
||||
"df['Levy'] = df['Levy'].fillna(df['Levy'].median())  # reassign instead of chained inplace fillna\n",
|
||||
"df['Mileage'] = df['Mileage'].fillna(df['Mileage'].median())  # reassign instead of chained inplace fillna"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Определение числовых и категориальных признаков\n",
|
||||
"numeric_features = ['Levy', 'Prod. year', 'Engine volume', 'Mileage', 'Cylinders', 'Airbags']\n",
|
||||
"categorical_features = ['Manufacturer', 'Model', 'Category', 'Leather interior', 'Fuel type', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color']\n",
|
||||
"\n",
|
||||
"# Преобразование категориальных признаков в числовые\n",
|
||||
"df = pd.get_dummies(df, columns=categorical_features, drop_first=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Разделение данных на тренировочный и тестовый наборы"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Задача регрессии\n",
|
||||
"X_reg = df.drop(['ID', 'Price', 'Category'], axis=1, errors='ignore')  # 'Category' (price bins) exists only after this cell has run once; never use it as a regression feature\n",
|
||||
"y_reg = df['Price']\n",
|
||||
"\n",
|
||||
"# Задача классификации\n",
|
||||
"df['Category'] = pd.cut(df['Price'], bins=[0, 10000, 20000, np.inf], labels=['Эконом', 'Средний', 'Премиум'])\n",
|
||||
"X_class = df.drop(['ID', 'Price', 'Category'], axis=1)\n",
|
||||
"y_class = df['Category']\n",
|
||||
"\n",
|
||||
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
|
||||
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
Loading…
x
Reference in New Issue
Block a user