This commit is contained in:
GokaPek 2024-11-01 13:05:34 +04:00
parent 0a65d77a16
commit dc5d1ac892

View File

@ -72,6 +72,117 @@
"print(df.head())\n",
"print(df.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Предобработка данных"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ID 0\n",
"Price 0\n",
"Levy 0\n",
"Manufacturer 0\n",
"Model 0\n",
"Prod. year 0\n",
"Category 0\n",
"Leather interior 0\n",
"Fuel type 0\n",
"Engine volume 0\n",
"Mileage 0\n",
"Cylinders 0\n",
"Gear box type 0\n",
"Drive wheels 0\n",
"Doors 0\n",
"Wheel 0\n",
"Color 0\n",
"Airbags 0\n",
"dtype: int64\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Egor\\AppData\\Local\\Temp\\ipykernel_16964\\3618217033.py:9: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
"The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
"\n",
"For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
"\n",
"\n",
" df['Levy'].fillna(df['Levy'].median(), inplace=True)\n",
"C:\\Users\\Egor\\AppData\\Local\\Temp\\ipykernel_16964\\3618217033.py:10: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
"The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
"\n",
"For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
"\n",
"\n",
" df['Mileage'].fillna(df['Mileage'].median(), inplace=True)\n"
]
}
],
"source": [
"# Preprocessing: report missing values per column, coerce 'Levy' and 'Mileage'\n",
"# to numbers, then impute the resulting gaps with each column's median.\n",
"print(df.isnull().sum())\n",
"\n",
"# Non-numeric 'Levy' entries become NaN so they can be imputed below.\n",
"df['Levy'] = pd.to_numeric(df['Levy'], errors='coerce')\n",
"\n",
"# Parse 'Mileage' from its own column; it used to be overwritten with the\n",
"# values parsed from 'Levy', which discarded the real mileage.\n",
"mileage = df['Mileage']\n",
"if not pd.api.types.is_numeric_dtype(mileage):\n",
"    # Drop a trailing non-numeric suffix (e.g. a unit) before parsing.\n",
"    # NOTE(review): confirm the raw 'Mileage' format against the dataset.\n",
"    mileage = mileage.astype(str).str.replace(r'[^0-9.,+-]+$', '', regex=True)\n",
"df['Mileage'] = pd.to_numeric(mileage, errors='coerce')\n",
"\n",
"# Assign the filled column back instead of calling fillna(inplace=True) on\n",
"# df[col]: the chained in-place form emits a FutureWarning and will no longer\n",
"# modify df in pandas 3.0.\n",
"df['Levy'] = df['Levy'].fillna(df['Levy'].median())\n",
"df['Mileage'] = df['Mileage'].fillna(df['Mileage'].median())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Column groups used by the models: numeric inputs and categorical inputs.\n",
"numeric_features = [\n",
"    'Levy',\n",
"    'Prod. year',\n",
"    'Engine volume',\n",
"    'Mileage',\n",
"    'Cylinders',\n",
"    'Airbags',\n",
"]\n",
"categorical_features = [\n",
"    'Manufacturer',\n",
"    'Model',\n",
"    'Category',\n",
"    'Leather interior',\n",
"    'Fuel type',\n",
"    'Gear box type',\n",
"    'Drive wheels',\n",
"    'Doors',\n",
"    'Wheel',\n",
"    'Color',\n",
"]\n",
"\n",
"# One-hot encode the categorical columns; drop_first=True omits the first\n",
"# level of each feature. df is rebound to the encoded frame.\n",
"df = pd.get_dummies(data=df, columns=categorical_features, drop_first=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Разделение данных на тренировочный и тестовый наборы"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Columns that must never reach a feature matrix: the row ID, the regression\n",
"# target, and the price-derived class label stored in df['Category'] below.\n",
"non_feature_columns = ['ID', 'Price', 'Category']\n",
"\n",
"# Regression task: predict 'Price'.\n",
"# errors='ignore' lets the drop succeed before df['Category'] exists and, when\n",
"# this cell is re-run, keeps the price-derived label out of X_reg.\n",
"X_reg = df.drop(columns=non_feature_columns, errors='ignore')\n",
"y_reg = df['Price']\n",
"\n",
"# Classification task: bucket 'Price' into three classes.\n",
"# include_lowest=True keeps a price of exactly 0 in the first bucket instead\n",
"# of turning it into NaN.\n",
"df['Category'] = pd.cut(\n",
"    df['Price'],\n",
"    bins=[0, 10000, 20000, np.inf],\n",
"    labels=['Эконом', 'Средний', 'Премиум'],\n",
"    include_lowest=True,\n",
")\n",
"X_class = df.drop(columns=non_feature_columns)\n",
"y_class = df['Category']\n",
"\n",
"# Both tasks use the same test fraction and the same seed.\n",
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)"
]
}
],
"metadata": {