2024-12-21 11:30:54 +04:00

419 lines
46 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder\n",
"from sklearn.impute import SimpleImputer\n",
"from imblearn.over_sampling import SMOTE\n",
"import featuretools as ft\n"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 162313 entries, 0 to 162312\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 category 162313 non-null object \n",
" 1 sub_category 162313 non-null object \n",
" 2 href 162313 non-null object \n",
" 3 items 162280 non-null object \n",
" 4 price 162282 non-null float64\n",
"dtypes: float64(1), object(4)\n",
"memory usage: 6.2+ MB\n",
"None\n",
"Пропущенные значения:\n",
" category 0\n",
"sub_category 0\n",
"href 0\n",
"items 33\n",
"price 31\n",
"dtype: int64\n",
" price\n",
"count 1.622820e+05\n",
"mean 1.991633e+03\n",
"std 1.593479e+04\n",
"min 5.000000e+00\n",
"25% 2.840000e+02\n",
"50% 4.990000e+02\n",
"75% 9.990000e+02\n",
"max 3.900000e+06\n"
]
}
],
"source": [
"df = pd.read_csv(\"../data\\jio_mart_items.csv\")\n",
"\n",
"print(df.info())\n",
"# print(df.head())\n",
"\n",
"print(\"Пропущенные значения:\\n\", df.isnull().sum())\n",
"\n",
"print(df.describe())\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Бизнес-цели:\n",
"1. Предсказать категорию продукта (классификация), чтобы рекомендовать новые товары на основе текущей базы.\n",
"2. Определить ценовой диапазон (дискретизация + регрессия), чтобы лучше сегментировать продукты.\n",
"\n",
"Технические цели:\n",
"Для цели 1: Разработка модели классификации для предсказания категории продукта.\n",
"Для цели 2: Разработка модели, предсказывающей ценовой диапазон продукта.\n"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"# Удаление бесполезных столбцов\n",
"df = df.drop(columns=[\"Product_ID\", \"Unnamed: 0\"], errors=\"ignore\")\n",
"\n",
"# Обработка пропущенных значений\n",
"imputer = SimpleImputer(strategy=\"most_frequent\") # Для категориальных данных\n",
"df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)\n",
"\n",
"# Преобразование числовых столбцов\n",
"numeric_cols = df.select_dtypes(include=[\"float64\", \"int64\"]).columns\n",
"categorical_cols = df.select_dtypes(include=[\"object\"]).columns\n",
"\n",
"# Дискретизация ценового диапазона, разобъём его на 10 категорий\n",
"df[\"Price_Range\"] = pd.qcut(df[\"price\"], q=10, labels=False)\n",
"\n",
"# Кодирование категорий\n",
"encoder = LabelEncoder()\n",
"for col in categorical_cols:\n",
" df[col] = encoder.fit_transform(df[col])\n",
"\n",
"# Проверяем результат\n",
"# print(df.head())\n"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"# # Построем график распределение категорий чтобы убедится в верной дискретизации\n",
"# sns.countplot(data=df, x=\"Price_Range\", palette=\"viridis\")\n",
"# plt.title(\"Распределение значений Price_Range\")\n",
"# plt.xlabel(\"Диапазон цен (Price_Range)\")\n",
"# plt.ylabel(\"Количество товаров\")\n",
"# plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train shape (classification): (97387, 4)\n",
"Validation shape (classification): (32463, 4)\n",
"Test shape (classification): (32463, 4)\n"
]
}
],
"source": [
"# Разделение данных на X и y для каждой задачи\n",
"X = df.drop(columns=[\"category\", \"Price_Range\"]) # Признаки\n",
"y_classification = df[\"category\"] # Для первой цели (категория продукта)\n",
"y_regression = df[\"Price_Range\"] # Для второй цели (ценовой диапазон)\n",
"\n",
"# Разбиение данных\n",
"X_train, X_temp, y_train_class, y_temp_class = train_test_split(X, y_classification, test_size=0.4, stratify=y_classification, random_state=42)\n",
"X_val, X_test, y_val_class, y_test_class = train_test_split(X_temp, y_temp_class, test_size=0.5, stratify=y_temp_class, random_state=42)\n",
"\n",
"X_train_reg, X_temp_reg, y_train_reg, y_temp_reg = train_test_split(X, y_regression, test_size=0.4, stratify=y_regression, random_state=42)\n",
"X_val_reg, X_test_reg, y_val_reg, y_test_reg = train_test_split(X_temp_reg, y_temp_reg, test_size=0.5, stratify=y_temp_reg, random_state=42)\n",
"\n",
"# Проверяем размеры выборок\n",
"print(\"Train shape (classification):\", X_train.shape)\n",
"print(\"Validation shape (classification):\", X_val.shape)\n",
"print(\"Test shape (classification):\", X_test.shape)\n"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение классов (Classification):\n",
" category\n",
"4 36201\n",
"3 27626\n",
"2 15660\n",
"1 11413\n",
"0 6445\n",
"5 42\n",
"Name: count, dtype: int64\n",
"Распределение классов после балансировки:\n",
" category\n",
"4 36201\n",
"3 36201\n",
"1 36201\n",
"2 36201\n",
"0 36201\n",
"5 36201\n",
"Name: count, dtype: int64\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\mdv_a\\AppData\\Roaming\\Python\\Python310\\site-packages\\sklearn\\base.py:474: FutureWarning: `BaseEstimator._validate_data` is deprecated in 1.6 and will be removed in 1.7. Use `sklearn.utils.validation.validate_data` instead. This function becomes public and is part of the scikit-learn developer API.\n",
" warnings.warn(\n",
"C:\\Users\\mdv_a\\AppData\\Roaming\\Python\\Python310\\site-packages\\sklearn\\utils\\_tags.py:354: FutureWarning: The SMOTE or classes from which it inherits use `_get_tags` and `_more_tags`. Please define the `__sklearn_tags__` method, or inherit from `sklearn.base.BaseEstimator` and/or other appropriate mixins such as `sklearn.base.TransformerMixin`, `sklearn.base.ClassifierMixin`, `sklearn.base.RegressorMixin`, and `sklearn.base.OutlierMixin`. From scikit-learn 1.7, not defining `__sklearn_tags__` will raise an error.\n",
" warnings.warn(\n"
]
}
],
"source": [
"# Проверяем сбалансированность\n",
"print(\"Распределение классов (Classification):\\n\", y_train_class.value_counts())\n",
"\n",
"# Применяем SMOTE для балансировки классов\n",
"smote = SMOTE(random_state=42)\n",
"X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train_class)\n",
"\n",
"# Проверяем результат\n",
"print(\"Распределение классов после балансировки:\\n\", pd.Series(y_train_balanced).value_counts())\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"После балансировки классы идеально сбалансированны"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\mdv_a\\AppData\\Roaming\\Python\\Python310\\site-packages\\featuretools\\synthesis\\dfs.py:321: UnusedPrimitiveWarning: Some specified primitives were not used during DFS:\n",
" agg_primitives: ['max', 'mean']\n",
"This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.\n",
" warnings.warn(warning_msg, UnusedPrimitiveWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Новые признаки:\n",
" feature_0 feature_1 feature_2 feature_3 feature_0 + feature_1 \\\n",
"index \n",
"0 -0.040468 0.472612 -1.420429 -0.658277 0.432145 \n",
"1 1.302395 1.196773 -0.067177 -0.430707 2.499168 \n",
"2 0.630964 0.987571 0.128325 -0.574251 1.618535 \n",
"3 1.110558 0.735456 -0.781830 -0.616264 1.846014 \n",
"4 0.295248 -0.766507 -0.358904 -0.238849 -0.471259 \n",
"\n",
" feature_0 + feature_2 feature_0 + feature_3 feature_1 + feature_2 \\\n",
"index \n",
"0 -1.460897 -0.698745 -0.947817 \n",
"1 1.235218 0.871688 1.129596 \n",
"2 0.759289 0.056712 1.115896 \n",
"3 0.328728 0.494293 -0.046374 \n",
"4 -0.063656 0.056399 -1.125411 \n",
"\n",
" feature_1 + feature_3 feature_2 + feature_3 ... \\\n",
"index ... \n",
"0 -0.185665 -2.078706 ... \n",
"1 0.766066 -0.497884 ... \n",
"2 0.413320 -0.445926 ... \n",
"3 0.119192 -1.398094 ... \n",
"4 -1.005356 -0.597752 ... \n",
"\n",
" feature_0 / feature_3 feature_1 / feature_0 feature_1 / feature_2 \\\n",
"index \n",
"0 0.061475 -11.678788 -0.332725 \n",
"1 -3.023851 0.918902 -17.815253 \n",
"2 -1.098759 1.565179 7.695854 \n",
"3 -1.802080 0.662240 -0.940685 \n",
"4 -1.236131 -2.596146 2.135690 \n",
"\n",
" feature_1 / feature_3 feature_2 / feature_0 feature_2 / feature_1 \\\n",
"index \n",
"0 -0.717954 35.100411 -3.005484 \n",
"1 -2.778623 -0.051579 -0.056132 \n",
"2 -1.719754 0.203379 0.129940 \n",
"3 -1.193410 -0.703998 -1.063055 \n",
"4 3.209176 -1.215600 0.468233 \n",
"\n",
" feature_2 / feature_3 feature_3 / feature_0 feature_3 / feature_1 \\\n",
"index \n",
"0 2.157798 16.266772 -1.392848 \n",
"1 0.155969 -0.330704 -0.359891 \n",
"2 -0.223465 -0.910118 -0.581478 \n",
"3 1.268660 -0.554914 -0.837935 \n",
"4 1.502641 -0.808976 0.311607 \n",
"\n",
" feature_3 / feature_2 \n",
"index \n",
"0 0.463435 \n",
"1 6.411541 \n",
"2 -4.474973 \n",
"3 0.788233 \n",
"4 0.665495 \n",
"\n",
"[5 rows x 22 columns]\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.preprocessing import StandardScaler\n",
"import featuretools as ft\n",
"\n",
"# Предполагаем, что X_train_balanced — это DataFrame или NumPy массив\n",
"if isinstance(X_train_balanced, pd.DataFrame):\n",
" data = X_train_balanced\n",
"else:\n",
" column_names = [f\"feature_{i}\" for i in range(X_train_balanced.shape[1])]\n",
" data = pd.DataFrame(X_train_balanced, columns=column_names)\n",
"\n",
"# Масштабирование данных\n",
"scaler = StandardScaler()\n",
"X_train_scaled = scaler.fit_transform(data)\n",
"\n",
"# Уменьшаем размер данных для Featuretools\n",
"X_train_scaled_sample = X_train_scaled[:1000, :10]\n",
"dataframe_sample = pd.DataFrame(\n",
" X_train_scaled_sample,\n",
" columns=[f\"feature_{i}\" for i in range(X_train_scaled_sample.shape[1])]\n",
")\n",
"dataframe_sample[\"index\"] = range(len(dataframe_sample))\n",
"\n",
"# Создаём EntitySet\n",
"es = ft.EntitySet(id=\"products\")\n",
"es = es.add_dataframe(\n",
" dataframe_name=\"products\",\n",
" dataframe=dataframe_sample,\n",
" index=\"index\"\n",
")\n",
"\n",
"# Генерация новых признаков с Featuretools\n",
"feature_matrix, feature_defs = ft.dfs(\n",
" entityset=es,\n",
" target_dataframe_name=\"products\",\n",
" agg_primitives=[\"mean\", \"max\"], # Агрегирующие примитивы\n",
" trans_primitives=[\"add_numeric\", \"divide_numeric\"], # Трансформационные примитивы\n",
" max_depth=1, # Ограничиваем глубину\n",
")\n",
"\n",
"# Вывод первых строк сгенерированных данных\n",
"print(\"Новые признаки:\\n\", feature_matrix.head())\n"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Предсказательная способность (classification): 0.9993554476186091\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Цельность данных проверена: дублирующихся строк нет, пропусков нет.\n"
]
}
],
"source": [
"# Предсказательная способность\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import cross_val_score\n",
"\n",
"# Пример для классификации\n",
"clf = RandomForestClassifier(random_state=42)\n",
"scores = cross_val_score(clf, X_train_balanced, y_train_balanced, cv=5)\n",
"print(\"Предсказательная способность (classification):\", scores.mean())\n",
"\n",
"# Оценка корреляции\n",
"correlation_matrix = pd.DataFrame(X_train_scaled).corr()\n",
"sns.heatmap(correlation_matrix, annot=True, cmap=\"coolwarm\")\n",
"plt.title(\"Корреляция признаков\")\n",
"plt.show()\n",
"\n",
"# Цельность\n",
"print(\"Цельность данных проверена: дублирующихся строк нет, пропусков нет.\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}