правки
This commit is contained in:
parent
264d0a4de8
commit
19073f816e
236
Lab_3/lab3.ipynb
236
Lab_3/lab3.ipynb
@ -0,0 +1,236 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import seaborn as sns\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder\n",
|
||||
"from sklearn.impute import SimpleImputer\n",
|
||||
"from imblearn.over_sampling import SMOTE\n",
|
||||
"import featuretools as ft\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<>:1: SyntaxWarning: invalid escape sequence '\\j'\n",
|
||||
"<>:1: SyntaxWarning: invalid escape sequence '\\j'\n",
|
||||
"C:\\Users\\MaD\\AppData\\Local\\Temp\\ipykernel_6188\\750029597.py:1: SyntaxWarning: invalid escape sequence '\\j'\n",
|
||||
" df = pd.read_csv(\"../data\\jio_mart_items.csv\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"RangeIndex: 162313 entries, 0 to 162312\n",
|
||||
"Data columns (total 5 columns):\n",
|
||||
" # Column Non-Null Count Dtype \n",
|
||||
"--- ------ -------------- ----- \n",
|
||||
" 0 category 162313 non-null object \n",
|
||||
" 1 sub_category 162313 non-null object \n",
|
||||
" 2 href 162313 non-null object \n",
|
||||
" 3 items 162280 non-null object \n",
|
||||
" 4 price 162282 non-null float64\n",
|
||||
"dtypes: float64(1), object(4)\n",
|
||||
"memory usage: 6.2+ MB\n",
|
||||
"None\n",
|
||||
"Пропущенные значения:\n",
|
||||
" category 0\n",
|
||||
"sub_category 0\n",
|
||||
"href 0\n",
|
||||
"items 33\n",
|
||||
"price 31\n",
|
||||
"dtype: int64\n",
|
||||
" price\n",
|
||||
"count 1.622820e+05\n",
|
||||
"mean 1.991633e+03\n",
|
||||
"std 1.593479e+04\n",
|
||||
"min 5.000000e+00\n",
|
||||
"25% 2.840000e+02\n",
|
||||
"50% 4.990000e+02\n",
|
||||
"75% 9.990000e+02\n",
|
||||
"max 3.900000e+06\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Forward slashes are portable and avoid the invalid-escape SyntaxWarning raised by a backslash path\n",
|
||||
"df = pd.read_csv(\"../data/jio_mart_items.csv\")\n",
|
||||
"\n",
|
||||
"# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print()\n",
|
||||
"df.info()\n",
|
||||
"# print(df.head())\n",
|
||||
"\n",
|
||||
"# Number of missing values per column\n",
|
||||
"print(\"Пропущенные значения:\\n\", df.isnull().sum())\n",
|
||||
"\n",
|
||||
"# Summary statistics of the numeric columns\n",
|
||||
"print(df.describe())\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Бизнес-цели:\n",
|
||||
"1. Предсказать категорию продукта (классификация), чтобы рекомендовать новые товары на основе текущей базы.\n",
|
||||
"2. Определить ценовой диапазон (дискретизация + регрессия), чтобы лучше сегментировать продукты.\n",
|
||||
"\n",
|
||||
"Технические цели:\n",
|
||||
"- Для цели 1: разработка модели классификации для предсказания категории продукта.\n",
|
||||
"- Для цели 2: разработка модели, предсказывающей ценовой диапазон продукта.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Drop service columns when present; errors=\"ignore\" makes this a no-op if they are absent\n",
|
||||
"df = df.drop(columns=[\"Product_ID\", \"Unnamed: 0\"], errors=\"ignore\")\n",
|
||||
"\n",
|
||||
"# Detect column kinds BEFORE imputing: one imputer over the whole mixed-type frame returns an\n",
|
||||
"# object array, which would turn \"price\" into an object column that later gets label-encoded\n",
|
||||
"numeric_cols = df.select_dtypes(include=[\"float64\", \"int64\"]).columns\n",
|
||||
"categorical_cols = df.select_dtypes(include=[\"object\"]).columns\n",
|
||||
"\n",
|
||||
"# Impute missing values per column kind so numeric columns keep a numeric dtype\n",
|
||||
"imputer = SimpleImputer(strategy=\"most_frequent\")  # categorical columns: most frequent value\n",
|
||||
"df[categorical_cols] = imputer.fit_transform(df[categorical_cols])\n",
|
||||
"numeric_imputer = SimpleImputer(strategy=\"median\")  # numeric columns: median is robust to the heavy price tail\n",
|
||||
"df[numeric_cols] = numeric_imputer.fit_transform(df[numeric_cols])\n",
|
||||
"\n",
|
||||
"# Discretize the price into 10 quantile-based bins (integer bin codes)\n",
|
||||
"df[\"Price_Range\"] = pd.qcut(df[\"price\"], q=10, labels=False)\n",
|
||||
"\n",
|
||||
"# Label-encode categorical columns; one fitted encoder is kept per column so that\n",
|
||||
"# the integer codes can be mapped back to the original values later\n",
|
||||
"label_encoders = {}\n",
|
||||
"for col in categorical_cols:\n",
|
||||
"    encoder = LabelEncoder()\n",
|
||||
"    df[col] = encoder.fit_transform(df[col])\n",
|
||||
"    label_encoders[col] = encoder\n",
|
||||
"\n",
|
||||
"# Check the result\n",
|
||||
"# print(df.head())\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# # Построим график распределения категорий, чтобы убедиться в верной дискретизации\n",
|
||||
"# sns.countplot(data=df, x=\"Price_Range\", palette=\"viridis\")\n",
|
||||
"# plt.title(\"Распределение значений Price_Range\")\n",
|
||||
"# plt.xlabel(\"Диапазон цен (Price_Range)\")\n",
|
||||
"# plt.ylabel(\"Количество товаров\")\n",
|
||||
"# plt.show()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Train shape (classification): (97387, 4)\n",
|
||||
"Validation shape (classification): (32463, 4)\n",
|
||||
"Test shape (classification): (32463, 4)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Split the frame into the feature matrix and the two targets\n",
|
||||
"X = df.drop(columns=[\"category\", \"Price_Range\"])  # features\n",
|
||||
"y_classification = df[\"category\"]  # goal 1: product category\n",
|
||||
"y_regression = df[\"Price_Range\"]  # goal 2: price range\n",
|
||||
"\n",
|
||||
"# Price_Range is computed from price, so price must not be a feature for goal 2 (target leakage)\n",
|
||||
"X_reg = X.drop(columns=[\"price\"])\n",
|
||||
"\n",
|
||||
"# Stratified 60/20/20 train/validation/test split for the classification goal\n",
|
||||
"X_train, X_temp, y_train_class, y_temp_class = train_test_split(X, y_classification, test_size=0.4, stratify=y_classification, random_state=42)\n",
|
||||
"X_val, X_test, y_val_class, y_test_class = train_test_split(X_temp, y_temp_class, test_size=0.5, stratify=y_temp_class, random_state=42)\n",
|
||||
"\n",
|
||||
"# Same stratified split for the price-range goal, on the feature set without price\n",
|
||||
"X_train_reg, X_temp_reg, y_train_reg, y_temp_reg = train_test_split(X_reg, y_regression, test_size=0.4, stratify=y_regression, random_state=42)\n",
|
||||
"X_val_reg, X_test_reg, y_val_reg, y_test_reg = train_test_split(X_temp_reg, y_temp_reg, test_size=0.5, stratify=y_temp_reg, random_state=42)\n",
|
||||
"\n",
|
||||
"# Report the sizes of the resulting subsets\n",
|
||||
"print(\"Train shape (classification):\", X_train.shape)\n",
|
||||
"print(\"Validation shape (classification):\", X_val.shape)\n",
|
||||
"print(\"Test shape (classification):\", X_test.shape)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Распределение классов (Classification):\n",
|
||||
" category\n",
|
||||
"4 36201\n",
|
||||
"3 27626\n",
|
||||
"2 15660\n",
|
||||
"1 11413\n",
|
||||
"0 6445\n",
|
||||
"5 42\n",
|
||||
"Name: count, dtype: int64\n",
|
||||
"Распределение классов после балансировки:\n",
|
||||
" category\n",
|
||||
"4 36201\n",
|
||||
"3 36201\n",
|
||||
"1 36201\n",
|
||||
"2 36201\n",
|
||||
"0 36201\n",
|
||||
"5 36201\n",
|
||||
"Name: count, dtype: int64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Class distribution of the training target before resampling\n",
|
||||
"class_counts_before = y_train_class.value_counts()\n",
|
||||
"print(\"Распределение классов (Classification):\\n\", class_counts_before)\n",
|
||||
"\n",
|
||||
"# Oversample the minority classes with SMOTE; the fixed seed keeps the result reproducible\n",
|
||||
"smote = SMOTE(random_state=42)\n",
|
||||
"X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train_class)\n",
|
||||
"\n",
|
||||
"# Class distribution after resampling\n",
|
||||
"class_counts_after = pd.Series(y_train_balanced).value_counts()\n",
|
||||
"print(\"Распределение классов после балансировки:\\n\", class_counts_after)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
Loading…
Reference in New Issue
Block a user