2024-12-15 19:50:26 +04:00

237 lines
8.5 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder\n",
"from sklearn.impute import SimpleImputer\n",
"from imblearn.over_sampling import SMOTE\n",
"import featuretools as ft\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<>:1: SyntaxWarning: invalid escape sequence '\\j'\n",
"<>:1: SyntaxWarning: invalid escape sequence '\\j'\n",
"C:\\Users\\MaD\\AppData\\Local\\Temp\\ipykernel_6188\\750029597.py:1: SyntaxWarning: invalid escape sequence '\\j'\n",
" df = pd.read_csv(\"../data\\jio_mart_items.csv\")\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 162313 entries, 0 to 162312\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 category 162313 non-null object \n",
" 1 sub_category 162313 non-null object \n",
" 2 href 162313 non-null object \n",
" 3 items 162280 non-null object \n",
" 4 price 162282 non-null float64\n",
"dtypes: float64(1), object(4)\n",
"memory usage: 6.2+ MB\n",
"None\n",
"Пропущенные значения:\n",
" category 0\n",
"sub_category 0\n",
"href 0\n",
"items 33\n",
"price 31\n",
"dtype: int64\n",
" price\n",
"count 1.622820e+05\n",
"mean 1.991633e+03\n",
"std 1.593479e+04\n",
"min 5.000000e+00\n",
"25% 2.840000e+02\n",
"50% 4.990000e+02\n",
"75% 9.990000e+02\n",
"max 3.900000e+06\n"
]
}
],
"source": [
"df = pd.read_csv(\"../data\\jio_mart_items.csv\")\n",
"\n",
"print(df.info())\n",
"# print(df.head())\n",
"\n",
"print(\"Пропущенные значения:\\n\", df.isnull().sum())\n",
"\n",
"print(df.describe())\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Бизнес-цели:\n",
"1. Предсказать категорию продукта (классификация), чтобы рекомендовать новые товары на основе текущей базы.\n",
"2. Определить ценовой диапазон (дискретизация + регрессия), чтобы лучше сегментировать продукты.\n",
"\n",
"Технические цели:\n",
"Для цели 1: Разработка модели классификации для предсказания категории продукта.\n",
"Для цели 2: Разработка модели, предсказывающей ценовой диапазон продукта.\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Удаление бесполезных столбцов\n",
"df = df.drop(columns=[\"Product_ID\", \"Unnamed: 0\"], errors=\"ignore\")\n",
"\n",
"# Обработка пропущенных значений\n",
"imputer = SimpleImputer(strategy=\"most_frequent\") # Для категориальных данных\n",
"df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)\n",
"\n",
"# Преобразование числовых столбцов\n",
"numeric_cols = df.select_dtypes(include=[\"float64\", \"int64\"]).columns\n",
"categorical_cols = df.select_dtypes(include=[\"object\"]).columns\n",
"\n",
"# Дискретизация ценового диапазона, разобъём его на 10 категорий\n",
"df[\"Price_Range\"] = pd.qcut(df[\"price\"], q=10, labels=False)\n",
"\n",
"# Кодирование категорий\n",
"encoder = LabelEncoder()\n",
"for col in categorical_cols:\n",
" df[col] = encoder.fit_transform(df[col])\n",
"\n",
"# Проверяем результат\n",
"# print(df.head())\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# # Построем график распределение категорий чтобы убедится в верной дискретизации\n",
"# sns.countplot(data=df, x=\"Price_Range\", palette=\"viridis\")\n",
"# plt.title(\"Распределение значений Price_Range\")\n",
"# plt.xlabel(\"Диапазон цен (Price_Range)\")\n",
"# plt.ylabel(\"Количество товаров\")\n",
"# plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train shape (classification): (97387, 4)\n",
"Validation shape (classification): (32463, 4)\n",
"Test shape (classification): (32463, 4)\n"
]
}
],
"source": [
"# Разделение данных на X и y для каждой задачи\n",
"X = df.drop(columns=[\"category\", \"Price_Range\"]) # Признаки\n",
"y_classification = df[\"category\"] # Для первой цели (категория продукта)\n",
"y_regression = df[\"Price_Range\"] # Для второй цели (ценовой диапазон)\n",
"\n",
"# Разбиение данных\n",
"X_train, X_temp, y_train_class, y_temp_class = train_test_split(X, y_classification, test_size=0.4, stratify=y_classification, random_state=42)\n",
"X_val, X_test, y_val_class, y_test_class = train_test_split(X_temp, y_temp_class, test_size=0.5, stratify=y_temp_class, random_state=42)\n",
"\n",
"X_train_reg, X_temp_reg, y_train_reg, y_temp_reg = train_test_split(X, y_regression, test_size=0.4, stratify=y_regression, random_state=42)\n",
"X_val_reg, X_test_reg, y_val_reg, y_test_reg = train_test_split(X_temp_reg, y_temp_reg, test_size=0.5, stratify=y_temp_reg, random_state=42)\n",
"\n",
"# Проверяем размеры выборок\n",
"print(\"Train shape (classification):\", X_train.shape)\n",
"print(\"Validation shape (classification):\", X_val.shape)\n",
"print(\"Test shape (classification):\", X_test.shape)\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение классов (Classification):\n",
" category\n",
"4 36201\n",
"3 27626\n",
"2 15660\n",
"1 11413\n",
"0 6445\n",
"5 42\n",
"Name: count, dtype: int64\n",
"Распределение классов после балансировки:\n",
" category\n",
"4 36201\n",
"3 36201\n",
"1 36201\n",
"2 36201\n",
"0 36201\n",
"5 36201\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"# Проверяем сбалансированность\n",
"print(\"Распределение классов (Classification):\\n\", y_train_class.value_counts())\n",
"\n",
"# Применяем SMOTE для балансировки классов\n",
"smote = SMOTE(random_state=42)\n",
"X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train_class)\n",
"\n",
"# Проверяем результат\n",
"print(\"Распределение классов после балансировки:\\n\", pd.Series(y_train_balanced).value_counts())\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}