AIM-PIbd-31-Medvedkov-A-D/Lab_3/lab3.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder\n",
    "from sklearn.impute import SimpleImputer\n",
    "from imblearn.over_sampling import SMOTE\n",
    "import featuretools as ft\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<>:1: SyntaxWarning: invalid escape sequence '\\j'\n",
      "<>:1: SyntaxWarning: invalid escape sequence '\\j'\n",
      "C:\\Users\\MaD\\AppData\\Local\\Temp\\ipykernel_6188\\750029597.py:1: SyntaxWarning: invalid escape sequence '\\j'\n",
      "  df = pd.read_csv(\"../data\\jio_mart_items.csv\")\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 162313 entries, 0 to 162312\n",
      "Data columns (total 5 columns):\n",
      " #   Column        Non-Null Count   Dtype  \n",
      "---  ------        --------------   -----  \n",
      " 0   category      162313 non-null  object \n",
      " 1   sub_category  162313 non-null  object \n",
      " 2   href          162313 non-null  object \n",
      " 3   items         162280 non-null  object \n",
      " 4   price         162282 non-null  float64\n",
      "dtypes: float64(1), object(4)\n",
      "memory usage: 6.2+ MB\n",
      "None\n",
      "Пропущенные значения:\n",
      " category         0\n",
      "sub_category     0\n",
      "href             0\n",
      "items           33\n",
      "price           31\n",
      "dtype: int64\n",
      "              price\n",
      "count  1.622820e+05\n",
      "mean   1.991633e+03\n",
      "std    1.593479e+04\n",
      "min    5.000000e+00\n",
      "25%    2.840000e+02\n",
      "50%    4.990000e+02\n",
      "75%    9.990000e+02\n",
      "max    3.900000e+06\n"
     ]
    }
   ],
   "source": [
    "df = pd.read_csv(\"../data\\jio_mart_items.csv\")\n",
    "\n",
    "print(df.info())\n",
    "# print(df.head())\n",
    "\n",
    "print(\"Пропущенные значения:\\n\", df.isnull().sum())\n",
    "\n",
    "print(df.describe())\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Бизнес-цели:\n",
    "1. Предсказать категорию продукта (классификация), чтобы рекомендовать новые товары на основе текущей базы.\n",
    "2. Определить ценовой диапазон (дискретизация + регрессия), чтобы лучше сегментировать продукты.\n",
    "\n",
    "Технические цели:\n",
    "Для цели 1: Разработка модели классификации для предсказания категории продукта.\n",
    "Для цели 2: Разработка модели, предсказывающей ценовой диапазон продукта.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Удаление бесполезных столбцов\n",
    "df = df.drop(columns=[\"Product_ID\", \"Unnamed: 0\"], errors=\"ignore\")\n",
    "\n",
    "# Обработка пропущенных значений\n",
    "imputer = SimpleImputer(strategy=\"most_frequent\")  # Для категориальных данных\n",
    "df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)\n",
    "\n",
    "# Преобразование числовых столбцов\n",
    "numeric_cols = df.select_dtypes(include=[\"float64\", \"int64\"]).columns\n",
    "categorical_cols = df.select_dtypes(include=[\"object\"]).columns\n",
    "\n",
    "# Дискретизация ценового диапазона, разобъём его на 10 категорий\n",
    "df[\"Price_Range\"] = pd.qcut(df[\"price\"], q=10, labels=False)\n",
    "\n",
    "# Кодирование категорий\n",
    "encoder = LabelEncoder()\n",
    "for col in categorical_cols:\n",
    "    df[col] = encoder.fit_transform(df[col])\n",
    "\n",
    "# Проверяем результат\n",
    "# print(df.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # Построем график распределение категорий чтобы убедится в верной дискретизации\n",
    "# sns.countplot(data=df, x=\"Price_Range\", palette=\"viridis\")\n",
    "# plt.title(\"Распределение значений Price_Range\")\n",
    "# plt.xlabel(\"Диапазон цен (Price_Range)\")\n",
    "# plt.ylabel(\"Количество товаров\")\n",
    "# plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train shape (classification): (97387, 4)\n",
      "Validation shape (classification): (32463, 4)\n",
      "Test shape (classification): (32463, 4)\n"
     ]
    }
   ],
   "source": [
    "# Разделение данных на X и y для каждой задачи\n",
    "X = df.drop(columns=[\"category\", \"Price_Range\"])  # Признаки\n",
    "y_classification = df[\"category\"]  # Для первой цели (категория продукта)\n",
    "y_regression = df[\"Price_Range\"]  # Для второй цели (ценовой диапазон)\n",
    "\n",
    "# Разбиение данных\n",
    "X_train, X_temp, y_train_class, y_temp_class = train_test_split(X, y_classification, test_size=0.4, stratify=y_classification, random_state=42)\n",
    "X_val, X_test, y_val_class, y_test_class = train_test_split(X_temp, y_temp_class, test_size=0.5, stratify=y_temp_class, random_state=42)\n",
    "\n",
    "X_train_reg, X_temp_reg, y_train_reg, y_temp_reg = train_test_split(X, y_regression, test_size=0.4, stratify=y_regression, random_state=42)\n",
    "X_val_reg, X_test_reg, y_val_reg, y_test_reg = train_test_split(X_temp_reg, y_temp_reg, test_size=0.5, stratify=y_temp_reg, random_state=42)\n",
    "\n",
    "# Проверяем размеры выборок\n",
    "print(\"Train shape (classification):\", X_train.shape)\n",
    "print(\"Validation shape (classification):\", X_val.shape)\n",
    "print(\"Test shape (classification):\", X_test.shape)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Распределение классов (Classification):\n",
      " category\n",
      "4    36201\n",
      "3    27626\n",
      "2    15660\n",
      "1    11413\n",
      "0     6445\n",
      "5       42\n",
      "Name: count, dtype: int64\n",
      "Распределение классов после балансировки:\n",
      " category\n",
      "4    36201\n",
      "3    36201\n",
      "1    36201\n",
      "2    36201\n",
      "0    36201\n",
      "5    36201\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Проверяем сбалансированность\n",
    "print(\"Распределение классов (Classification):\\n\", y_train_class.value_counts())\n",
    "\n",
    "# Применяем SMOTE для балансировки классов\n",
    "smote = SMOTE(random_state=42)\n",
    "X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train_class)\n",
    "\n",
    "# Проверяем результат\n",
    "print(\"Распределение классов после балансировки:\\n\", pd.Series(y_train_balanced).value_counts())\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}