правки

2024-12-15 19:50:26 +04:00 · 2024-12-15 19:50:26 +04:00 · 19073f816e
commit 19073f816e
parent 264d0a4de8
1 changed files with 236 additions and 0 deletions
--- a/Lab_3/lab3.ipynb
+++ b/Lab_3/lab3.ipynb
@ -0,0 +1,236 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder\n",
+    "from sklearn.impute import SimpleImputer\n",
+    "from imblearn.over_sampling import SMOTE\n",
+    "import featuretools as ft\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<>:1: SyntaxWarning: invalid escape sequence '\\j'\n",
+      "<>:1: SyntaxWarning: invalid escape sequence '\\j'\n",
+      "C:\\Users\\MaD\\AppData\\Local\\Temp\\ipykernel_6188\\750029597.py:1: SyntaxWarning: invalid escape sequence '\\j'\n",
+      "  df = pd.read_csv(\"../data\\jio_mart_items.csv\")\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 162313 entries, 0 to 162312\n",
+      "Data columns (total 5 columns):\n",
+      " #   Column        Non-Null Count   Dtype  \n",
+      "---  ------        --------------   -----  \n",
+      " 0   category      162313 non-null  object \n",
+      " 1   sub_category  162313 non-null  object \n",
+      " 2   href          162313 non-null  object \n",
+      " 3   items         162280 non-null  object \n",
+      " 4   price         162282 non-null  float64\n",
+      "dtypes: float64(1), object(4)\n",
+      "memory usage: 6.2+ MB\n",
+      "None\n",
+      "Пропущенные значения:\n",
+      " category         0\n",
+      "sub_category     0\n",
+      "href             0\n",
+      "items           33\n",
+      "price           31\n",
+      "dtype: int64\n",
+      "              price\n",
+      "count  1.622820e+05\n",
+      "mean   1.991633e+03\n",
+      "std    1.593479e+04\n",
+      "min    5.000000e+00\n",
+      "25%    2.840000e+02\n",
+      "50%    4.990000e+02\n",
+      "75%    9.990000e+02\n",
+      "max    3.900000e+06\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Forward slashes are portable across operating systems and avoid the invalid escape\n",
+    "# sequence that a backslash followed by 'j' produces inside a regular string literal.\n",
+    "df = pd.read_csv(\"../data/jio_mart_items.csv\")\n",
+    "\n",
+    "# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().\n",
+    "df.info()\n",
+    "\n",
+    "print(\"Пропущенные значения:\\n\", df.isnull().sum())\n",
+    "\n",
+    "print(df.describe())\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Бизнес-цели:\n",
+    "1. Предсказать категорию продукта (классификация), чтобы рекомендовать новые товары на основе текущей базы.\n",
+    "2. Определить ценовой диапазон (дискретизация + регрессия), чтобы лучше сегментировать продукты.\n",
+    "\n",
+    "Технические цели:\n",
+    "1. Для цели 1: разработка модели классификации для предсказания категории продукта.\n",
+    "2. Для цели 2: разработка модели, предсказывающей ценовой диапазон продукта.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Drop identifier-like columns; errors=\"ignore\" makes this a no-op when they are absent.\n",
+    "df = df.drop(columns=[\"Product_ID\", \"Unnamed: 0\"], errors=\"ignore\")\n",
+    "\n",
+    "# Split columns by dtype BEFORE imputing: SimpleImputer returns an object array for mixed\n",
+    "# input, so imputing the whole frame at once turns \"price\" into an object column, leaves\n",
+    "# numeric_cols empty and gets the price label-encoded together with the text columns.\n",
+    "numeric_cols = df.select_dtypes(include=[\"float64\", \"int64\"]).columns\n",
+    "categorical_cols = df.select_dtypes(include=[\"object\"]).columns\n",
+    "\n",
+    "# Missing values: most frequent value for categorical columns, median for numeric ones\n",
+    "# (the median is robust to the heavy right tail of the price distribution).\n",
+    "imputer = SimpleImputer(strategy=\"most_frequent\")\n",
+    "if len(categorical_cols) > 0:\n",
+    "    df[categorical_cols] = imputer.fit_transform(df[categorical_cols])\n",
+    "numeric_imputer = SimpleImputer(strategy=\"median\")\n",
+    "if len(numeric_cols) > 0:\n",
+    "    df[numeric_cols] = numeric_imputer.fit_transform(df[numeric_cols])\n",
+    "\n",
+    "# Discretize the price into (up to) 10 quantile bins; duplicates=\"drop\" merges bins whose\n",
+    "# edges coincide instead of raising when many items share the same price.\n",
+    "df[\"Price_Range\"] = pd.qcut(df[\"price\"], q=10, labels=False, duplicates=\"drop\")\n",
+    "\n",
+    "# Label-encode the categorical columns, keeping one fitted encoder per column so the\n",
+    "# integer codes can be mapped back to the original values later.\n",
+    "label_encoders = {}\n",
+    "for col in categorical_cols:\n",
+    "    encoder = LabelEncoder()\n",
+    "    df[col] = encoder.fit_transform(df[col])\n",
+    "    label_encoders[col] = encoder\n",
+    "\n",
+    "# Sanity check: nothing should be missing after imputation and binning.\n",
+    "assert df.isnull().sum().sum() == 0, \"unexpected missing values after preprocessing\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # Построим график распределения категорий, чтобы убедиться в верной дискретизации\n",
+    "# sns.countplot(data=df, x=\"Price_Range\", palette=\"viridis\")\n",
+    "# plt.title(\"Распределение значений Price_Range\")\n",
+    "# plt.xlabel(\"Диапазон цен (Price_Range)\")\n",
+    "# plt.ylabel(\"Количество товаров\")\n",
+    "# plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train shape (classification): (97387, 4)\n",
+      "Validation shape (classification): (32463, 4)\n",
+      "Test shape (classification): (32463, 4)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Features and targets for the two tasks\n",
+    "X = df.drop(columns=[\"category\", \"Price_Range\"])  # features for the category classifier\n",
+    "y_classification = df[\"category\"]  # goal 1: product category\n",
+    "y_regression = df[\"Price_Range\"]  # goal 2: price range\n",
+    "\n",
+    "# Price_Range is computed directly from \"price\", so \"price\" must not be among the\n",
+    "# features used to predict it; otherwise the target leaks into the inputs.\n",
+    "X_reg = X.drop(columns=[\"price\"])\n",
+    "\n",
+    "# Stratified 60/20/20 split into train / validation / test for the classification task\n",
+    "X_train, X_temp, y_train_class, y_temp_class = train_test_split(X, y_classification, test_size=0.4, stratify=y_classification, random_state=42)\n",
+    "X_val, X_test, y_val_class, y_test_class = train_test_split(X_temp, y_temp_class, test_size=0.5, stratify=y_temp_class, random_state=42)\n",
+    "\n",
+    "# Same 60/20/20 scheme for the price-range task, on the leakage-free feature set\n",
+    "X_train_reg, X_temp_reg, y_train_reg, y_temp_reg = train_test_split(X_reg, y_regression, test_size=0.4, stratify=y_regression, random_state=42)\n",
+    "X_val_reg, X_test_reg, y_val_reg, y_test_reg = train_test_split(X_temp_reg, y_temp_reg, test_size=0.5, stratify=y_temp_reg, random_state=42)\n",
+    "\n",
+    "# Report the sizes of the classification splits\n",
+    "print(\"Train shape (classification):\", X_train.shape)\n",
+    "print(\"Validation shape (classification):\", X_val.shape)\n",
+    "print(\"Test shape (classification):\", X_test.shape)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Распределение классов (Classification):\n",
+      " category\n",
+      "4    36201\n",
+      "3    27626\n",
+      "2    15660\n",
+      "1    11413\n",
+      "0     6445\n",
+      "5       42\n",
+      "Name: count, dtype: int64\n",
+      "Распределение классов после балансировки:\n",
+      " category\n",
+      "4    36201\n",
+      "3    36201\n",
+      "1    36201\n",
+      "2    36201\n",
+      "0    36201\n",
+      "5    36201\n",
+      "Name: count, dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Class frequencies in the training split before resampling\n",
+    "counts_before = y_train_class.value_counts()\n",
+    "print(\"Распределение классов (Classification):\\n\", counts_before)\n",
+    "\n",
+    "# Oversample the training split with SMOTE (fixed seed for reproducibility).\n",
+    "# NOTE(review): the categorical features are integer label codes at this point, and SMOTE\n",
+    "# builds synthetic rows by interpolating feature values; consider SMOTENC. TODO confirm.\n",
+    "smote = SMOTE(random_state=42)\n",
+    "X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train_class)\n",
+    "\n",
+    "# Class frequencies after resampling\n",
+    "counts_after = pd.Series(y_train_balanced).value_counts()\n",
+    "print(\"Распределение классов после балансировки:\\n\", counts_after)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}