diff --git a/.gitignore b/.gitignore
index 7124522..f0028f5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 data/jio_mart_items.csv
 /data
+/Lab_2/lab_2.ipynb
+/Lab_3/lab_3.ipynb
\ No newline at end of file
diff --git a/Lab_2/lab2.ipynb b/Lab_2/lab2.ipynb
new file mode 100644
index 0000000..e69de29
diff --git a/Lab_3/lab3.ipynb b/Lab_3/lab3.ipynb
new file mode 100644
index 0000000..b129952
--- /dev/null
+++ b/Lab_3/lab3.ipynb
@@ -0,0 +1,172 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder\n",
+    "from sklearn.impute import SimpleImputer\n",
+    "from imblearn.over_sampling import SMOTE\n",
+    "import featuretools as ft\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Forward slashes are portable and avoid the invalid backslash escape\n",
+    "# that the previous Windows-style path literal produced (SyntaxWarning).\n",
+    "df = pd.read_csv(\"../data/jio_mart_items.csv\")\n",
+    "\n",
+    "# DataFrame.info() prints its report itself and returns None,\n",
+    "# so wrapping it in print() only appended a stray \"None\" line.\n",
+    "df.info()\n",
+    "\n",
+    "print(\"Пропущенные значения:\\n\", df.isnull().sum())\n",
+    "\n",
+    "print(df.describe())\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Бизнес-цели:\n",
+    "1. Предсказать категорию продукта (классификация), чтобы рекомендовать новые товары на основе текущей базы.\n",
+    "2. Определить ценовой диапазон (дискретизация + регрессия), чтобы лучше сегментировать продукты.\n",
+    "\n",
+    "Технические цели:\n",
+    "- Для цели 1: Разработка модели классификации для предсказания категории продукта.\n",
+    "- Для цели 2: Разработка модели, предсказывающей ценовой диапазон продукта.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Drop columns that carry no information (silently skipped if absent)\n",
+    "df = df.drop(columns=[\"Product_ID\", \"Unnamed: 0\"], errors=\"ignore\")\n",
+    "\n",
+    "# Detect column types BEFORE imputing. A single most_frequent SimpleImputer\n",
+    "# over the whole mixed-type frame returns an object array, which turned\n",
+    "# price into an object column that was later label-encoded with the rest.\n",
+    "numeric_cols = df.select_dtypes(include=[\"float64\", \"int64\"]).columns\n",
+    "categorical_cols = df.select_dtypes(include=[\"object\"]).columns\n",
+    "\n",
+    "# Fill gaps per column type so numeric dtypes are preserved:\n",
+    "# median for numbers, most frequent value for categorical text\n",
+    "numeric_imputer = SimpleImputer(strategy=\"median\")\n",
+    "imputer = SimpleImputer(strategy=\"most_frequent\")  # for categorical data\n",
+    "if len(numeric_cols) > 0:\n",
+    "    df[numeric_cols] = numeric_imputer.fit_transform(df[numeric_cols])\n",
+    "if len(categorical_cols) > 0:\n",
+    "    df[categorical_cols] = imputer.fit_transform(df[categorical_cols])\n",
+    "\n",
+    "# Discretise the price into 10 equal-frequency bins (deciles)\n",
+    "df[\"Price_Range\"] = pd.qcut(df[\"price\"], q=10, labels=False)\n",
+    "\n",
+    "# Encode categorical columns as integers. Every column gets its own encoder,\n",
+    "# kept in label_encoders, so each mapping can be inverted later; one shared\n",
+    "# LabelEncoder only remembers the classes of the last column it was fitted on.\n",
+    "label_encoders = {}\n",
+    "for col in categorical_cols:\n",
+    "    encoder = LabelEncoder()\n",
+    "    df[col] = encoder.fit_transform(df[col])\n",
+    "    label_encoders[col] = encoder\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # Plot the Price_Range distribution to check that the discretisation is correct\n",
+    "# sns.countplot(data=df, x=\"Price_Range\", palette=\"viridis\")\n",
+    "# plt.title(\"Распределение значений Price_Range\")\n",
+    "# plt.xlabel(\"Диапазон цен (Price_Range)\")\n",
+    "# plt.ylabel(\"Количество товаров\")\n",
+    "# plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Split the data into features and a target for each task\n",
+    "X = df.drop(columns=[\"category\", \"Price_Range\"])  # features for the category task\n",
+    "y_classification = df[\"category\"]  # goal 1: product category\n",
+    "y_regression = df[\"Price_Range\"]  # goal 2: price range\n",
+    "\n",
+    "# Price_Range is computed directly from price, so price must not be offered\n",
+    "# as a feature for the price-range task (target leakage).\n",
+    "X_reg = X.drop(columns=[\"price\"])\n",
+    "\n",
+    "# Stratified 60/20/20 train/validation/test split for each task\n",
+    "X_train, X_temp, y_train_class, y_temp_class = train_test_split(X, y_classification, test_size=0.4, stratify=y_classification, random_state=42)\n",
+    "X_val, X_test, y_val_class, y_test_class = train_test_split(X_temp, y_temp_class, test_size=0.5, stratify=y_temp_class, random_state=42)\n",
+    "\n",
+    "X_train_reg, X_temp_reg, y_train_reg, y_temp_reg = train_test_split(X_reg, y_regression, test_size=0.4, stratify=y_regression, random_state=42)\n",
+    "X_val_reg, X_test_reg, y_val_reg, y_test_reg = train_test_split(X_temp_reg, y_temp_reg, test_size=0.5, stratify=y_temp_reg, random_state=42)\n",
+    "\n",
+    "# Check the sizes of the resulting sets\n",
+    "print(\"Train shape (classification):\", X_train.shape)\n",
+    "print(\"Validation shape (classification):\", X_val.shape)\n",
+    "print(\"Test shape (classification):\", X_test.shape)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check how balanced the classes are\n",
+    "print(\"Распределение классов (Classification):\\n\", y_train_class.value_counts())\n",
+    "\n",
+    "# Balance the training classes with SMOTE (the training split only, so that\n",
+    "# validation and test data keep the real class distribution).\n",
+    "# NOTE(review): SMOTE interpolates between neighbouring samples, so the\n",
+    "# label-encoded columns are treated as ordinary numbers here - confirm this\n",
+    "# is acceptable or switch to an oversampler that supports categorical features.\n",
+    "smote = SMOTE(random_state=42)\n",
+    "X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train_class)\n",
+    "\n",
+    "# Check the result\n",
+    "print(\"Распределение классов после балансировки:\\n\", pd.Series(y_train_balanced).value_counts())\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}