{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Подготовка данных JioMart\n",
    "\n",
    "Загрузка, очистка и разбиение набора данных `jio_mart_items.csv` для двух задач: предсказания категории товара и предсказания ценового диапазона."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder\n",
    "from sklearn.impute import SimpleImputer\n",
    "from imblearn.over_sampling import SMOTE, RandomOverSampler\n",
    "import featuretools as ft\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Settings a reader may want to tune; every cell below reads them from here.\n",
    "# The path is built with pathlib so that no backslash ends up inside a string literal\n",
    "# (\"\\j\" is an invalid escape sequence) and the notebook also runs outside Windows.\n",
    "DATA_PATH = Path(\"../data\") / \"jio_mart_items.csv\"\n",
    "SEED = 42          # shared random_state for the splits and the oversampler\n",
    "N_PRICE_BINS = 10  # number of quantile bins used to build Price_Range\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Загрузка данных"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_df = pd.read_csv(DATA_PATH)\n",
    "\n",
    "# DataFrame.info() prints its report itself and returns None,\n",
    "# so wrapping it in print() would only add a stray \"None\" line.\n",
    "raw_df.info()\n",
    "\n",
    "print(\"Пропущенные значения:\\n\", raw_df.isnull().sum())\n",
    "\n",
    "print(raw_df.describe())\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Бизнес-цели\n",
    "\n",
    "1. Предсказать категорию продукта (классификация), чтобы рекомендовать новые товары на основе текущей базы.\n",
    "2. Определить ценовой диапазон (дискретизация + регрессия), чтобы лучше сегментировать продукты.\n",
    "\n",
    "### Технические цели\n",
    "\n",
    "- Для цели 1: разработка модели классификации для предсказания категории продукта.\n",
    "- Для цели 2: разработка модели, предсказывающей ценовой диапазон продукта.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Предобработка"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Work on a copy so that raw_df stays untouched and this cell can be re-run safely.\n",
    "# The listed columns are dropped only when present (errors=\"ignore\").\n",
    "df = raw_df.drop(columns=[\"Product_ID\", \"Unnamed: 0\"], errors=\"ignore\").copy()\n",
    "\n",
    "# Column roles are decided BEFORE imputing. Running a single SimpleImputer over the\n",
    "# whole mixed frame returns an object array, which turns `price` into an object\n",
    "# column and gets it label-encoded together with the text columns.\n",
    "numeric_cols = df.select_dtypes(include=\"number\").columns\n",
    "categorical_cols = df.select_dtypes(include=[\"object\"]).columns\n",
    "\n",
    "# Missing values: median for numeric columns, most frequent value for text columns.\n",
    "# NOTE(review): imputers and encoders are fitted on the full data set before the split;\n",
    "# fit them on the training part only if a strictly leak-free evaluation is required.\n",
    "if len(numeric_cols) > 0:\n",
    "    df[numeric_cols] = SimpleImputer(strategy=\"median\").fit_transform(df[numeric_cols])\n",
    "if len(categorical_cols) > 0:\n",
    "    df[categorical_cols] = SimpleImputer(strategy=\"most_frequent\").fit_transform(df[categorical_cols])\n",
    "\n",
    "# Discretise the price into N_PRICE_BINS quantile bins (integer codes starting at 0).\n",
    "# duplicates=\"drop\" merges bins whose edges coincide instead of raising an error.\n",
    "df[\"Price_Range\"] = pd.qcut(df[\"price\"], q=N_PRICE_BINS, labels=False, duplicates=\"drop\")\n",
    "\n",
    "# Encode text columns as integer codes. One fitted encoder is kept per column so that\n",
    "# codes (for example predicted categories) can be mapped back with inverse_transform.\n",
    "label_encoders = {}\n",
    "for col in categorical_cols:\n",
    "    label_encoders[col] = LabelEncoder()\n",
    "    df[col] = label_encoders[col].fit_transform(df[col])\n",
    "\n",
    "# Invariants the following cells rely on.\n",
    "assert not df.isnull().any().any(), \"missing values remain after imputation\"\n",
    "assert pd.api.types.is_numeric_dtype(df[\"price\"]), \"price must stay numeric\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the bin sizes to confirm that the quantile discretisation is roughly balanced.\n",
    "fig, ax = plt.subplots()\n",
    "sns.countplot(data=df, x=\"Price_Range\", ax=ax)\n",
    "ax.set(\n",
    "    title=\"Распределение значений Price_Range\",\n",
    "    xlabel=\"Диапазон цен (Price_Range)\",\n",
    "    ylabel=\"Количество товаров\",\n",
    ")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Разбиение на выборки"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_train_val_test(features, target, val_test_size=0.4, seed=SEED):\n",
    "    \"\"\"Split features and target into stratified train, validation and test parts.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    features : DataFrame with the feature columns.\n",
    "    target : Series with the labels; also used for stratification.\n",
    "    val_test_size : fraction of rows held out of the training part; the held-out\n",
    "        rows are then divided equally between validation and test.\n",
    "    seed : random_state passed to both train_test_split calls.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    (X_train, X_val, X_test, y_train, y_val, y_test)\n",
    "    \"\"\"\n",
    "    X_tr, X_rest, y_tr, y_rest = train_test_split(\n",
    "        features, target, test_size=val_test_size, stratify=target, random_state=seed\n",
    "    )\n",
    "    X_va, X_te, y_va, y_te = train_test_split(\n",
    "        X_rest, y_rest, test_size=0.5, stratify=y_rest, random_state=seed\n",
    "    )\n",
    "    return X_tr, X_va, X_te, y_tr, y_va, y_te\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Features and targets for both tasks.\n",
    "# NOTE(review): sub_category and href may reveal the category almost directly;\n",
    "# confirm that they are legitimate inputs for the category model.\n",
    "X = df.drop(columns=[\"category\", \"Price_Range\"])\n",
    "y_classification = df[\"category\"]   # goal 1: product category\n",
    "y_regression = df[\"Price_Range\"]    # goal 2: price range\n",
    "\n",
    "# Price_Range is computed from price, so price must not be a feature of the\n",
    "# price-range model (it would leak the target).\n",
    "X_reg = X.drop(columns=[\"price\"])\n",
    "\n",
    "# 60 / 20 / 20 stratified splits for each task.\n",
    "X_train, X_val, X_test, y_train_class, y_val_class, y_test_class = split_train_val_test(\n",
    "    X, y_classification\n",
    ")\n",
    "X_train_reg, X_val_reg, X_test_reg, y_train_reg, y_val_reg, y_test_reg = split_train_val_test(\n",
    "    X_reg, y_regression\n",
    ")\n",
    "\n",
    "assert len(X_train) + len(X_val) + len(X_test) == len(X)\n",
    "assert \"price\" not in X_train_reg.columns\n",
    "\n",
    "# Report the split sizes.\n",
    "print(\"Train shape (classification):\", X_train.shape)\n",
    "print(\"Validation shape (classification):\", X_val.shape)\n",
    "print(\"Test shape (classification):\", X_test.shape)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Балансировка классов"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Class balance of the training part before resampling.\n",
    "print(\"Распределение классов (Classification):\\n\", y_train_class.value_counts())\n",
    "\n",
    "# All features except price are integer codes of nominal values. SMOTE interpolates\n",
    "# between neighbouring rows, so its synthetic rows would carry codes that refer to\n",
    "# unrelated values. Minority classes are therefore balanced by re-sampling existing\n",
    "# rows. Only the training part is resampled; validation and test stay untouched.\n",
    "oversampler = RandomOverSampler(random_state=SEED)\n",
    "X_train_balanced, y_train_balanced = oversampler.fit_resample(X_train, y_train_class)\n",
    "\n",
    "# Class balance after resampling.\n",
    "print(\"Распределение классов после балансировки:\\n\", pd.Series(y_train_balanced).value_counts())\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}