diff --git a/.gitignore b/.gitignore index 1d91e1a..372675d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,12 @@ data/jio_mart_items.csv /data /Lab_2/lab_2.ipynb +/Lab_3/lab_3.ipynb +/Lab_4/lab_4.ipynb +/Lab_4/lab4gpu.ipynb +/Lab_4/lab5.ipynb +/Lab_4/lab44.ipynb +/Lab_4/lab45.ipynb +/Lab_4/lab_products_clustering.ipynb +/Lab_4/lab_4.ipynb +/Lab_4/lab_4_products.ipynb \ No newline at end of file diff --git a/Lab_3/lab3.ipynb b/Lab_3/lab3.ipynb new file mode 100644 index 0000000..8a8479f --- /dev/null +++ b/Lab_3/lab3.ipynb @@ -0,0 +1,528 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder\n", + "from sklearn.impute import SimpleImputer\n", + "from imblearn.over_sampling import SMOTE\n", + "import featuretools as ft\n" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 162313 entries, 0 to 162312\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 category 162313 non-null object \n", + " 1 sub_category 162313 non-null object \n", + " 2 href 162313 non-null object \n", + " 3 items 162280 non-null object \n", + " 4 price 162282 non-null float64\n", + "dtypes: float64(1), object(4)\n", + "memory usage: 6.2+ MB\n", + "None\n", + "\n", + "RangeIndex: 162313 entries, 0 to 162312\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 category 162313 non-null object \n", + " 1 sub_category 162313 non-null object \n", + " 2 href 162313 non-null object \n", + " 
# Load the raw JioMart catalogue.
# The path uses forward slashes on purpose: the previous backslash form
# ("..\data\...") contained the invalid escape sequence "\j" and does not
# resolve on non-Windows systems, where a backslash is not a path separator.
df = pd.read_csv("../data/jio_mart_items.csv")

# DataFrame.info() prints its report itself and returns None, so it is called
# once and not wrapped in print() (which used to emit a stray "None" and the
# same report twice).
df.info()

# Number of missing values per column.
print("Пропущенные значения:\n", df.isnull().sum())

# Summary statistics of the numeric columns.
print(df.describe())
# Drop identifier-like columns when they are present; errors="ignore" makes
# this a no-op for frames that do not contain them.
df = df.drop(columns=["Product_ID", "Unnamed: 0"], errors="ignore")

# Determine the column groups BEFORE imputation. Running a single SimpleImputer
# over the whole frame returns an object array, which used to turn `price` into
# an object column: `numeric_cols` came out empty and `price` was label-encoded
# together with the text columns.
numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns
categorical_cols = df.select_dtypes(include=["object"]).columns

# Text columns: fill gaps with the most frequent value of each column.
imputer = SimpleImputer(strategy="most_frequent")
if len(categorical_cols) > 0:
    df[categorical_cols] = imputer.fit_transform(df[categorical_cols])

# Numeric columns: fill gaps with the median so the dtype stays numeric.
numeric_imputer = SimpleImputer(strategy="median")
if len(numeric_cols) > 0:
    df[numeric_cols] = numeric_imputer.fit_transform(df[numeric_cols])

# Discretise the price into 10 quantile bins; labels=False yields the integer
# bin number (0..9) instead of an interval label.
df["Price_Range"] = pd.qcut(df["price"], q=10, labels=False)

# Label-encode the text columns. A separate encoder is fitted per column and
# kept in `label_encoders`, so the integer codes can be mapped back to the
# original values with inverse_transform().
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder
# Targets for the two tasks.
y_classification = df["category"]   # task 1: product category
y_regression = df["Price_Range"]    # task 2: price bin produced by pd.qcut

# Features for the category task: everything except the two targets.
X = df.drop(columns=["category", "Price_Range"])

# Features for the price-range task. `price` is excluded as well: Price_Range
# is computed directly from `price`, so leaving it in would leak the target
# into the inputs.
X_reg = df.drop(columns=["price", "Price_Range"])

# 60 % train / 20 % validation / 20 % test, stratified by the task's target.
X_train, X_temp, y_train_class, y_temp_class = train_test_split(
    X, y_classification, test_size=0.4, stratify=y_classification, random_state=42
)
X_val, X_test, y_val_class, y_test_class = train_test_split(
    X_temp, y_temp_class, test_size=0.5, stratify=y_temp_class, random_state=42
)

X_train_reg, X_temp_reg, y_train_reg, y_temp_reg = train_test_split(
    X_reg, y_regression, test_size=0.4, stratify=y_regression, random_state=42
)
X_val_reg, X_test_reg, y_val_reg, y_test_reg = train_test_split(
    X_temp_reg, y_temp_reg, test_size=0.5, stratify=y_temp_reg, random_state=42
)

# Report the sizes of the classification splits.
print("Train shape (classification):", X_train.shape)
print("Validation shape (classification):", X_val.shape)
print("Test shape (classification):", X_test.shape)
# Class frequencies of the training split before any resampling.
class_counts_before = y_train_class.value_counts()
print("Распределение классов (Classification):\n", class_counts_before)

# Oversample the training split with SMOTE (fixed seed for repeatability).
# NOTE(review): the feature columns are label-encoded codes at this point and
# SMOTE builds synthetic rows from numeric feature values, so synthetic rows
# may carry codes that do not correspond to a real category - confirm this is
# acceptable for the downstream use.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train_class)

# Class frequencies after resampling.
class_counts_after = pd.Series(y_train_balanced).value_counts()
print("Распределение классов после балансировки:\n", class_counts_after)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sub_categoryhrefitemsprice
03437717884406
16251271086731
24847378772526
35842642990466
441146596171005
\n", + "
" + ], + "text/plain": [ + " sub_category href items price\n", + "0 34 377 17884 406\n", + "1 62 512 71086 731\n", + "2 48 473 78772 526\n", + "3 58 426 42990 466\n", + "4 41 146 59617 1005" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train_balanced.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Теперь классы идеально сбалансированные" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Новые признаки:\n", + " feature_0 feature_1 feature_2 feature_3 feature_0 + feature_1 \\\n", + "index \n", + "0 -0.040468 0.472612 -1.420429 -0.658277 0.432145 \n", + "1 1.302395 1.196773 -0.067177 -0.430707 2.499168 \n", + "2 0.630964 0.987571 0.128325 -0.574251 1.618535 \n", + "3 1.110558 0.735456 -0.781830 -0.616264 1.846014 \n", + "4 0.295248 -0.766507 -0.358904 -0.238849 -0.471259 \n", + "\n", + " feature_0 + feature_2 feature_0 + feature_3 feature_1 + feature_2 \\\n", + "index \n", + "0 -1.460897 -0.698745 -0.947817 \n", + "1 1.235218 0.871688 1.129596 \n", + "2 0.759289 0.056712 1.115896 \n", + "3 0.328728 0.494293 -0.046374 \n", + "4 -0.063656 0.056399 -1.125411 \n", + "\n", + " feature_1 + feature_3 feature_2 + feature_3 ... \\\n", + "index ... \n", + "0 -0.185665 -2.078706 ... \n", + "1 0.766066 -0.497884 ... \n", + "2 0.413320 -0.445926 ... \n", + "3 0.119192 -1.398094 ... \n", + "4 -1.005356 -0.597752 ... 
\n", + "\n", + " feature_0 / feature_3 feature_1 / feature_0 feature_1 / feature_2 \\\n", + "index \n", + "0 0.061475 -11.678788 -0.332725 \n", + "1 -3.023851 0.918902 -17.815253 \n", + "2 -1.098759 1.565179 7.695854 \n", + "3 -1.802080 0.662240 -0.940685 \n", + "4 -1.236131 -2.596146 2.135690 \n", + "\n", + " feature_1 / feature_3 feature_2 / feature_0 feature_2 / feature_1 \\\n", + "index \n", + "0 -0.717954 35.100411 -3.005484 \n", + "1 -2.778623 -0.051579 -0.056132 \n", + "2 -1.719754 0.203379 0.129940 \n", + "3 -1.193410 -0.703998 -1.063055 \n", + "4 3.209176 -1.215600 0.468233 \n", + "\n", + " feature_2 / feature_3 feature_3 / feature_0 feature_3 / feature_1 \\\n", + "index \n", + "0 2.157798 16.266772 -1.392848 \n", + "1 0.155969 -0.330704 -0.359891 \n", + "2 -0.223465 -0.910118 -0.581478 \n", + "3 1.268660 -0.554914 -0.837935 \n", + "4 1.502641 -0.808976 0.311607 \n", + "\n", + " feature_3 / feature_2 \n", + "index \n", + "0 0.463435 \n", + "1 6.411541 \n", + "2 -4.474973 \n", + "3 0.788233 \n", + "4 0.665495 \n", + "\n", + "[5 rows x 22 columns]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\mdv_a\\AppData\\Roaming\\Python\\Python310\\site-packages\\featuretools\\synthesis\\dfs.py:321: UnusedPrimitiveWarning: Some specified primitives were not used during DFS:\n", + " agg_primitives: ['max', 'mean']\n", + "This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. 
# --- Automated feature construction with Featuretools ----------------------

# Featuretools is run on a small slice of the data only.
FT_SAMPLE_ROWS = 1000
FT_MAX_COLUMNS = 10

# Accept either a DataFrame or a bare NumPy array of balanced training features.
if isinstance(X_train_balanced, pd.DataFrame):
    data = X_train_balanced
else:
    column_names = [f"feature_{i}" for i in range(X_train_balanced.shape[1])]
    data = pd.DataFrame(X_train_balanced, columns=column_names)

# Standardise every feature (StandardScaler returns a NumPy array).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(data)

# Keep the original column names for the sample so that the generated features
# read as e.g. "sub_category + href" instead of "feature_0 + feature_1".
X_train_scaled_sample = X_train_scaled[:FT_SAMPLE_ROWS, :FT_MAX_COLUMNS]
dataframe_sample = pd.DataFrame(
    X_train_scaled_sample,
    columns=[str(name) for name in data.columns[:FT_MAX_COLUMNS]],
)
dataframe_sample["index"] = range(len(dataframe_sample))

# Single-table EntitySet keyed by the synthetic "index" column.
es = ft.EntitySet(id="products")
es = es.add_dataframe(
    dataframe_name="products",
    dataframe=dataframe_sample,
    index="index",
)

# Only transform primitives are requested. The EntitySet holds a single
# dataframe, so there is nothing to aggregate over; asking for "mean"/"max"
# only produced an UnusedPrimitiveWarning.
# NOTE(review): divide_numeric is applied to standardised values, which are
# centred around zero, so some ratios can become very large - confirm these
# features are wanted.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="products",
    agg_primitives=[],
    trans_primitives=["add_numeric", "divide_numeric"],
    max_depth=1,
)

print("Новые признаки:\n", feature_matrix.head())

# --- Feature quality checks -------------------------------------------------
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Predictive power. SMOTE is placed inside the cross-validated pipeline and the
# un-resampled training split is passed in, so each fold is resampled using its
# own training part only. Cross-validating data that was already oversampled
# lets synthetic copies of validation rows into the training folds and inflates
# the score.
clf = RandomForestClassifier(random_state=42)
cv_model = ImbPipeline([("smote", SMOTE(random_state=42)), ("model", clf)])
scores = cross_val_score(cv_model, X_train, y_train_class, cv=5)
print("Предсказательная способность (classification):", scores.mean())

# Pairwise correlation of the standardised input features, labelled with the
# real column names.
# NOTE(review): this heatmap covers the input features, not the features
# generated by Featuretools above - confirm which set is intended.
correlation_matrix = pd.DataFrame(X_train_scaled, columns=data.columns).corr()
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title("Корреляция признаков")
plt.show()

# Integrity: actually count duplicated rows and missing values instead of
# printing a fixed "no duplicates, no gaps" message.
duplicate_rows = int(data.duplicated().sum())
missing_values = int(data.isnull().sum().sum())
print(
    f"Цельность данных: дублирующихся строк — {duplicate_rows}, "
    f"пропусков — {missing_values}."
)
import math

import featuretools as ft
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    RandomForestClassifier,
    RandomForestRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    cohen_kappa_score,
    confusion_matrix,
    f1_score,
    matthews_corrcoef,
    mean_absolute_error,
    mean_squared_error,
    precision_score,
    r2_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler


def apply_oversampling(X, y):
    """Resample (X, y) with RandomOverSampler(random_state=42).

    Returns the resampled features and labels as a tuple.
    """
    oversampler = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X, y)
    return X_resampled, y_resampled


def apply_undersampling(X, y):
    """Resample (X, y) with RandomUnderSampler(random_state=42).

    Returns the resampled features and labels as a tuple.
    """
    undersampler = RandomUnderSampler(random_state=42)
    X_resampled, y_resampled = undersampler.fit_resample(X, y)
    return X_resampled, y_resampled


def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
):
    """
    Split a dataframe into train, validation and test subsets, each stratified
    by the values of one column. The split is done by calling
    train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        Name of the column used for stratification (usually the label).
    frac_train : float
    frac_val : float
    frac_test : float
        Fractions of the dataframe assigned to train, val and test. They must
        sum to 1.0 (within floating-point tolerance).
    random_state : int, None, or RandomStateInstance
        Value passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits (all columns of df_input).

    Raises
    ------
    ValueError
        If the fractions do not sum to 1.0, or if `stratify_colname` is not a
        column of `df_input`.
    """
    # Compare with a tolerance: exact equality rejects valid inputs such as
    # 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999 in binary floats.
    if not math.isclose(frac_train + frac_val + frac_test, 1.0):
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[[stratify_colname]]  # One-column dataframe to stratify on.

    # First split: train versus everything else.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # Second split: divide the remainder into val and test. The test share is
    # expressed relative to the remainder, not to the full dataframe.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    # Every input row must land in exactly one of the three subsets.
    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test


# Load the catalogue, print its structure, then continue with a fixed-seed
# random sample of 10,000 rows.
df = pd.read_csv('../data/jio_mart_items.csv')
df.info()
df = df.sample(n=10000, random_state=42)
В качестве бизнес-целей выделим следующие 2 варианта:
 1) Регрессия - предсказание цены по категории (для аналитических систем или улучшения алгоритмов ценообразования)
 2) Классификация - определение категории продукта по его подкатегории (для логистических или аналитических систем)

Однако данный датасет весьма плохо подходит для подобных задач.
# Remove rows that contain any missing value, then work on an independent copy.
df = df.dropna()
data = df.copy()

# Categories represented by fewer than 100 rows are treated as rare and removed.
value_counts = data["category"].value_counts()
rare = value_counts.loc[value_counts.lt(100)].index
is_rare = data["category"].isin(rare)
data = data.loc[~is_rare]

# One-hot encode both categorical columns; drop_first=True omits the first
# level of each column.
data1 = pd.get_dummies(
    data,
    columns=["category", "sub_category"],
    drop_first=True,
)
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['href', 'items', 'price', 'category_Electronics', 'category_Fashion',\n", + " 'category_Groceries', 'category_Home & Kitchen', 'sub_category_Apparel',\n", + " 'sub_category_Auto Care', 'sub_category_Bags & Travel Luggage',\n", + " 'sub_category_Bathroom & Laundry',\n", + " 'sub_category_Bathroom & Laundry Accessories', 'sub_category_Beverages',\n", + " 'sub_category_Books', 'sub_category_Boys', 'sub_category_Cameras',\n", + " 'sub_category_Carpentry & work accessories', 'sub_category_Computers',\n", + " 'sub_category_Dairy & Bakery', 'sub_category_Dining',\n", + " 'sub_category_Disposables', 'sub_category_Electrical',\n", + " 'sub_category_Fragrances', 'sub_category_Fruits & Vegetables',\n", + " 'sub_category_Furnishing', 'sub_category_Furniture',\n", + " 'sub_category_Gaming', 'sub_category_Garden & Outdoor',\n", + " 'sub_category_Girls', 'sub_category_Hair',\n", + " 'sub_category_Handloom & Handicraft',\n", + " 'sub_category_Hardware & Plumbing', 'sub_category_Health Care Devices',\n", + " 'sub_category_Home Appliances', 'sub_category_Home Care',\n", + " 'sub_category_Home Decor', 'sub_category_Home Safety & Automation',\n", + " 'sub_category_Industrial & Scientific Supplies', 'sub_category_Infants',\n", + " 'sub_category_Junior Boys', 'sub_category_Kitchen & Bath Fixtures',\n", + " 'sub_category_Kitchen Appliances', 'sub_category_Kitchenware',\n", + " 'sub_category_Make-Up', 'sub_category_Men',\n", + " 'sub_category_Men's Grooming', 'sub_category_Mobiles & Tablets',\n", + " 'sub_category_Mom & Baby', 'sub_category_Mom & Baby Care',\n", + " 'sub_category_Mops, Brushes & Scrubs', 'sub_category_Office Products',\n", + " 'sub_category_Paint, Wall Treatments & Supplies',\n", + " 'sub_category_Personal Care', 'sub_category_Personal Care & Grooming',\n", + " 'sub_category_Pets', 'sub_category_Phones', 
'sub_category_Pooja Needs',\n", + " 'sub_category_Power & Hand Tools', 'sub_category_Premium Fruits',\n", + " 'sub_category_Skin Care', 'sub_category_Smart Devices',\n", + " 'sub_category_Snacks & Branded Foods', 'sub_category_Staples',\n", + " 'sub_category_Stationery', 'sub_category_TV & Speaker',\n", + " 'sub_category_Tools & Appliances', 'sub_category_Toys, Games & Fitness',\n", + " 'sub_category_Wellness', 'sub_category_Women'],\n", + " dtype='object')\n", + "\n", + "Index: 9995 entries, 52893 to 146053\n", + "Data columns (total 69 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 href 9995 non-null object \n", + " 1 items 9995 non-null object \n", + " 2 price 9995 non-null float64\n", + " 3 category_Electronics 9995 non-null bool \n", + " 4 category_Fashion 9995 non-null bool \n", + " 5 category_Groceries 9995 non-null bool \n", + " 6 category_Home & Kitchen 9995 non-null bool \n", + " 7 sub_category_Apparel 9995 non-null bool \n", + " 8 sub_category_Auto Care 9995 non-null bool \n", + " 9 sub_category_Bags & Travel Luggage 9995 non-null bool \n", + " 10 sub_category_Bathroom & Laundry 9995 non-null bool \n", + " 11 sub_category_Bathroom & Laundry Accessories 9995 non-null bool \n", + " 12 sub_category_Beverages 9995 non-null bool \n", + " 13 sub_category_Books 9995 non-null bool \n", + " 14 sub_category_Boys 9995 non-null bool \n", + " 15 sub_category_Cameras 9995 non-null bool \n", + " 16 sub_category_Carpentry & work accessories 9995 non-null bool \n", + " 17 sub_category_Computers 9995 non-null bool \n", + " 18 sub_category_Dairy & Bakery 9995 non-null bool \n", + " 19 sub_category_Dining 9995 non-null bool \n", + " 20 sub_category_Disposables 9995 non-null bool \n", + " 21 sub_category_Electrical 9995 non-null bool \n", + " 22 sub_category_Fragrances 9995 non-null bool \n", + " 23 sub_category_Fruits & Vegetables 9995 non-null bool \n", + " 24 sub_category_Furnishing 9995 non-null bool \n", + " 25 
sub_category_Furniture 9995 non-null bool \n", + " 26 sub_category_Gaming 9995 non-null bool \n", + " 27 sub_category_Garden & Outdoor 9995 non-null bool \n", + " 28 sub_category_Girls 9995 non-null bool \n", + " 29 sub_category_Hair 9995 non-null bool \n", + " 30 sub_category_Handloom & Handicraft 9995 non-null bool \n", + " 31 sub_category_Hardware & Plumbing 9995 non-null bool \n", + " 32 sub_category_Health Care Devices 9995 non-null bool \n", + " 33 sub_category_Home Appliances 9995 non-null bool \n", + " 34 sub_category_Home Care 9995 non-null bool \n", + " 35 sub_category_Home Decor 9995 non-null bool \n", + " 36 sub_category_Home Safety & Automation 9995 non-null bool \n", + " 37 sub_category_Industrial & Scientific Supplies 9995 non-null bool \n", + " 38 sub_category_Infants 9995 non-null bool \n", + " 39 sub_category_Junior Boys 9995 non-null bool \n", + " 40 sub_category_Kitchen & Bath Fixtures 9995 non-null bool \n", + " 41 sub_category_Kitchen Appliances 9995 non-null bool \n", + " 42 sub_category_Kitchenware 9995 non-null bool \n", + " 43 sub_category_Make-Up 9995 non-null bool \n", + " 44 sub_category_Men 9995 non-null bool \n", + " 45 sub_category_Men's Grooming 9995 non-null bool \n", + " 46 sub_category_Mobiles & Tablets 9995 non-null bool \n", + " 47 sub_category_Mom & Baby 9995 non-null bool \n", + " 48 sub_category_Mom & Baby Care 9995 non-null bool \n", + " 49 sub_category_Mops, Brushes & Scrubs 9995 non-null bool \n", + " 50 sub_category_Office Products 9995 non-null bool \n", + " 51 sub_category_Paint, Wall Treatments & Supplies 9995 non-null bool \n", + " 52 sub_category_Personal Care 9995 non-null bool \n", + " 53 sub_category_Personal Care & Grooming 9995 non-null bool \n", + " 54 sub_category_Pets 9995 non-null bool \n", + " 55 sub_category_Phones 9995 non-null bool \n", + " 56 sub_category_Pooja Needs 9995 non-null bool \n", + " 57 sub_category_Power & Hand Tools 9995 non-null bool \n", + " 58 sub_category_Premium Fruits 9995 non-null 
bool \n", + " 59 sub_category_Skin Care 9995 non-null bool \n", + " 60 sub_category_Smart Devices 9995 non-null bool \n", + " 61 sub_category_Snacks & Branded Foods 9995 non-null bool \n", + " 62 sub_category_Staples 9995 non-null bool \n", + " 63 sub_category_Stationery 9995 non-null bool \n", + " 64 sub_category_TV & Speaker 9995 non-null bool \n", + " 65 sub_category_Tools & Appliances 9995 non-null bool \n", + " 66 sub_category_Toys, Games & Fitness 9995 non-null bool \n", + " 67 sub_category_Wellness 9995 non-null bool \n", + " 68 sub_category_Women 9995 non-null bool \n", + "dtypes: bool(66), float64(1), object(2)\n", + "memory usage: 956.6+ KB\n" + ] + } + ], + "source": [ + "print(data1.columns)\n", + "data1.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best parameters for Linear Regression: {}\n", + "Best parameters for Random Forest Regressor: {'model__max_depth': None, 'model__n_estimators': 300}\n", + "Best parameters for Gradient Boosting Regressor: {'model__learning_rate': 0.01, 'model__max_depth': 7, 'model__n_estimators': 300}\n", + "Model: Linear Regression\n", + "Model: Random Forest Regressor\n", + "Model: Gradient Boosting Regressor\n" + ] + } + ], + "source": [ + "X_reg = data1.drop(['href', 'items', 'price'], axis=1)\n", + "y_reg = data1['price']\n", + "\n", + "# Разделение данных\n", + "X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n", + "\n", + "# Выбор моделей для регрессии\n", + "models_reg = {\n", + " 'Linear Regression': LinearRegression(),\n", + " 'Random Forest Regressor': RandomForestRegressor(random_state=42),\n", + " 'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42)\n", + "}\n", + "\n", + "# Создание конвейера для регрессии\n", + "pipelines_reg = {}\n", + "for name, model in models_reg.items():\n", + " pipelines_reg[name] 
= Pipeline([\n", + " ('scaler', StandardScaler()),\n", + " ('model', model)\n", + " ])\n", + "\n", + "# Определение сетки гиперпараметров для регрессии\n", + "param_grids_reg = {\n", + " 'Linear Regression': {},\n", + " 'Random Forest Regressor': {\n", + " 'model__n_estimators': [100, 200, 300],\n", + " 'model__max_depth': [None, 10, 20, 30]\n", + " },\n", + " 'Gradient Boosting Regressor': {\n", + " 'model__n_estimators': [100, 200, 300],\n", + " 'model__learning_rate': [0.01, 0.1, 0.2],\n", + " 'model__max_depth': [3, 5, 7]\n", + " }\n", + "}\n", + "\n", + "# Настройка гиперпараметров для регрессии\n", + "best_models_reg = {}\n", + "for name, pipeline in pipelines_reg.items():\n", + " grid_search = GridSearchCV(pipeline, param_grids_reg[name], cv=5, scoring='neg_mean_squared_error')\n", + " grid_search.fit(X_train_reg, y_train_reg)\n", + " best_models_reg[name] = {\n", + " 'pipeline': grid_search.best_estimator_,\n", + " 'best_params': grid_search.best_params_\n", + " }\n", + " print(f'Best parameters for {name}: {grid_search.best_params_}')\n", + "\n", + "# Обучение моделей и оценка качества\n", + "for model_name in best_models_reg.keys():\n", + " print(f\"Model: {model_name}\")\n", + " model_pipeline = best_models_reg[model_name]['pipeline']\n", + " model_pipeline.fit(X_train_reg, y_train_reg)\n", + "\n", + " y_train_predict = model_pipeline.predict(X_train_reg)\n", + " y_test_predict = model_pipeline.predict(X_test_reg)\n", + "\n", + " best_models_reg[model_name][\"preds_train\"] = y_train_predict\n", + " best_models_reg[model_name][\"preds_test\"] = y_test_predict\n", + "\n", + " best_models_reg[model_name][\"MSE_train\"] = mean_squared_error(y_train_reg, y_train_predict)\n", + " best_models_reg[model_name][\"MSE_test\"] = mean_squared_error(y_test_reg, y_test_predict)\n", + " best_models_reg[model_name][\"R2_train\"] = r2_score(y_train_reg, y_train_predict)\n", + " best_models_reg[model_name][\"R2_test\"] = r2_score(y_test_reg, y_test_predict)\n", + " 
best_models_reg[model_name][\"MAE_train\"] = mean_absolute_error(y_train_reg, y_train_predict)\n", + " best_models_reg[model_name][\"MAE_test\"] = mean_absolute_error(y_test_reg, y_test_predict)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.991495747873937\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " Beauty 0.99 0.90 0.94 131\n", + " Electronics 0.99 1.00 0.99 241\n", + " Fashion 1.00 1.00 1.00 307\n", + " Groceries 0.98 1.00 0.99 573\n", + "Home & Kitchen 1.00 1.00 1.00 747\n", + "\n", + " accuracy 0.99 1999\n", + " macro avg 0.99 0.98 0.98 1999\n", + " weighted avg 0.99 0.99 0.99 1999\n", + "\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Кодирование категориальных данных через LabelEncoder\n", + "label_encoder = LabelEncoder()\n", + "data['sub_category_encoded'] = label_encoder.fit_transform(data['sub_category'])\n", + "\n", + "# Определение признаков (X) и целевой переменной (y)\n", + "X = data[['sub_category_encoded']] # Используем закодированный sub_category\n", + "y = label_encoder.fit_transform(data['category']) # Кодируем category\n", + "\n", + "# Разделение данных на тренировочную и тестовую выборки\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n", + "\n", + "# Создание конвейера для классификатора\n", + "pipeline = Pipeline([\n", + " ('scaler', StandardScaler()), # Масштабирование данных (хотя для категориальных признаков это не всегда нужно)\n", + " ('classifier', RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10))\n", + "])\n", + "\n", + "# Обучение модели\n", + "pipeline.fit(X_train, y_train)\n", + "\n", + "# Предсказание на тестовых данных\n", + "y_pred = pipeline.predict(X_test)\n", + "\n", + "print(\"Accuracy:\", accuracy_score(y_test, y_pred))\n", + "print(\"Classification Report:\\n\", classification_report(y_test, y_pred, target_names=label_encoder.inverse_transform(np.unique(y_test))))\n", + "\n", + "# Матрица ошибок\n", + "cm = confusion_matrix(y_test, y_pred)\n", + "plt.figure(figsize=(10, 8))\n", + "sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)\n", + "plt.xlabel('Predicted')\n", + "plt.ylabel('Actual')\n", + "plt.title('Confusion Matrix')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Модель классификации показывает неплохие результаты, что логично, учитывая структуру датасета." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 9995 entries, 52893 to 146053\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 category 9995 non-null object \n", + " 1 sub_category 9995 non-null object \n", + " 2 href 9995 non-null object \n", + " 3 items 9995 non-null object \n", + " 4 price 9995 non-null float64\n", + "dtypes: float64(1), object(4)\n", + "memory usage: 468.5+ KB\n" + ] + } + ], + "source": [ + "data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'best_models_reg' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[9], line 4\u001b[0m\n\u001b[0;32m 1\u001b[0m _, ax \u001b[38;5;241m=\u001b[39m plt\u001b[38;5;241m.\u001b[39msubplots(\u001b[38;5;241m3\u001b[39m, \u001b[38;5;241m2\u001b[39m, figsize\u001b[38;5;241m=\u001b[39m(\u001b[38;5;241m12\u001b[39m, \u001b[38;5;241m10\u001b[39m), sharex\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, sharey\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[0;32m 2\u001b[0m ax \u001b[38;5;241m=\u001b[39m ax\u001b[38;5;241m.\u001b[39mflatten()\n\u001b[1;32m----> 4\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m index, (name, model) \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(\u001b[43mbest_models_reg\u001b[49m\u001b[38;5;241m.\u001b[39mitems()):\n\u001b[0;32m 5\u001b[0m model_pipeline \u001b[38;5;241m=\u001b[39m model[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpipeline\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[0;32m 6\u001b[0m y_pred_reg 
\u001b[38;5;241m=\u001b[39m model_pipeline\u001b[38;5;241m.\u001b[39mpredict(X_test_reg)\n", + "\u001b[1;31mNameError\u001b[0m: name 'best_models_reg' is not defined" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "_, ax = plt.subplots(3, 2, figsize=(12, 10), sharex=False, sharey=False)\n", + "ax = ax.flatten()\n", + "\n", + "for index, (name, model) in enumerate(best_models_reg.items()):\n", + " model_pipeline = model['pipeline']\n", + " y_pred_reg = model_pipeline.predict(X_test_reg)\n", + "\n", + " # График фактических значений против предсказанных значений\n", + " ax[index * 2].scatter(y_test_reg, y_pred_reg, alpha=0.5)\n", + " ax[index * 2].plot([min(y_test_reg), max(y_test_reg)], [min(y_test_reg), max(y_test_reg)], color='red', linestyle='--')\n", + " ax[index * 2].set_xlabel('Actual Values')\n", + " ax[index * 2].set_ylabel('Predicted Values')\n", + " ax[index * 2].set_title(f'{name}: Actual vs Predicted')\n", + "\n", + " # График остатков\n", + " residuals = y_test_reg - y_pred_reg\n", + " ax[index * 2 + 1].scatter(y_pred_reg, residuals, alpha=0.5)\n", + " ax[index * 2 + 1].axhline(y=0, color='red', linestyle='--')\n", + " ax[index * 2 + 1].set_xlabel('Predicted Values')\n", + " ax[index * 2 + 1].set_ylabel('Residuals')\n", + " ax[index * 2 + 1].set_title(f'{name}: Residuals vs Predicted')\n", + "\n", + "\n", + "plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Модель регрессии демонстрирует ужасные результаты ввиду недостаточной корреляции между целевой характеристикой и строковыми значениями." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Lab_5/lab5.ipynb b/Lab_5/lab5.ipynb new file mode 100644 index 0000000..8368036 --- /dev/null +++ b/Lab_5/lab5.ipynb @@ -0,0 +1,320 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e7893b9e", + "metadata": {}, + "source": [ + "# Лабораторная работа: Методы искусственного интеллекта\n", + "## Задача кластеризации продуктов с использованием cuML\n", + "### Вариант: Продукты\n", + "В данной работе используется библиотека cuML для GPU-ускоренного анализа данных. Цель: провести кластеризацию продуктов на основе их характеристик." + ] + }, + { + "cell_type": "markdown", + "id": "e3834005", + "metadata": {}, + "source": [ + "### Загрузка и исследование данных" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "5530d138", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 162313 entries, 0 to 162312\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 category 162313 non-null object\n", + " 1 sub_category 162313 non-null object\n", + " 2 href 162313 non-null object\n", + " 3 items 162280 non-null object\n", + " 4 price 162282 non-null float64\n", + "dtypes: float64(1), object(4)\n", + "memory usage: 28.9+ MB\n", + "None\n", + " category sub_category \\\n", + "0 Groceries Fruits & Vegetables \n", + "1 Groceries Fruits & Vegetables \n", + "2 Groceries Fruits & Vegetables \n", + "3 Groceries Fruits & Vegetables \n", + "4 Groceries Fruits & Vegetables \n", + "\n", + " href 
\\\n", + "0 https://www.jiomart.com/c/groceries/fruits-veg... \n", + "1 https://www.jiomart.com/c/groceries/fruits-veg... \n", + "2 https://www.jiomart.com/c/groceries/fruits-veg... \n", + "3 https://www.jiomart.com/c/groceries/fruits-veg... \n", + "4 https://www.jiomart.com/c/groceries/fruits-veg... \n", + "\n", + " items price \n", + "0 Fresh Dates (Pack) (Approx 450 g - 500 g) 109.0 \n", + "1 Tender Coconut Cling Wrapped (1 pc) (Approx 90... 49.0 \n", + "2 Mosambi 1 kg 69.0 \n", + "3 Orange Imported 1 kg 125.0 \n", + "4 Banana Robusta 6 pcs (Box) (Approx 800 g - 110... 44.0 \n" + ] + } + ], + "source": [ + "import cudf\n", + "import cuml\n", + "from cuml.preprocessing import LabelEncoder\n", + "from cuml.decomposition import PCA\n", + "from cuml.cluster import KMeans\n", + "import cupy as cp\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Загрузка данных\n", + "df = cudf.read_csv('/mnt/c/3curse/mii/AIM-PIbd-31-Medvedkov-A-D/data/jio_mart_items.csv')\n", + "print(df.info())\n", + "print(df.head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5ea4ef3", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "49112908", + "metadata": {}, + "source": [ + "### Предварительная обработка данных" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1e3ef9fa", + "metadata": {}, + "outputs": [], + "source": [ + "# Обработка пропущенных значений\n", + "df = df.dropna()\n", + "\n", + "# Кодирование категориального признака 'items'\n", + "label_encoder = LabelEncoder()\n", + "df['items_encoded'] = label_encoder.fit_transform(df['items'])\n", + "\n", + "# Нормализация числовых признаков\n", + "numeric_features = ['items_encoded', 'price']\n", + "df_scaled = df[numeric_features].astype('float32')\n", + "\n", + "# Преобразование данных в формат cupy\n", + "X = cp.asarray(df_scaled.values)" + ] + }, + { + "cell_type": "markdown", + "id": "ff5f1f8f", + "metadata": {}, + "source": [ + "### 
Понижение размерности и визуализация данных" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e15c80bb", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Применение PCA для понижения размерности\n", + "pca = PCA(n_components=2)\n", + "reduced_data = pca.fit_transform(X)\n", + "\n", + "# Преобразуем данные из cupy в numpy\n", + "reduced_data_np = reduced_data.get()\n", + "\n", + "# Визуализация данных\n", + "plt.scatter(reduced_data_np[:, 0], reduced_data_np[:, 1])\n", + "plt.title('Визуализация данных после PCA')\n", + "plt.xlabel('PC1')\n", + "plt.ylabel('PC2')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "f2eef505", + "metadata": {}, + "source": [ + "### Выбор оптимального количества кластеров" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f72195d2", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Оценка числа кластеров: 100%|█████████████████████████████████████████████████████████████| 9/9 [01:08<00:00, 7.67s/it]\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Оценка инерции и коэффициента силуэта\n", + "from cuml.metrics.cluster import silhouette_score\n", + "from tqdm import tqdm # Импорт библиотеки для отображения прогресса\n", + "\n", + "# Оценка инерции и коэффициента силуэта\n", + "inertia = []\n", + "silhouette_scores = []\n", + "k_range = range(2, 11)\n", + "\n", + "# tqdm для отображения прогресса\n", + "for k in tqdm(k_range, desc=\"Оценка числа кластеров\"):\n", + " kmeans = KMeans(n_clusters=k, random_state=42)\n", + " kmeans.fit(reduced_data)\n", + " inertia.append(kmeans.inertia_)\n", + " silhouette_scores.append(silhouette_score(reduced_data, kmeans.labels_))\n", + "\n", + "# Построение графиков\n", + "plt.figure(figsize=(14, 5))\n", + "\n", + "# График инерции\n", + "plt.subplot(1, 2, 1)\n", + "plt.plot(k_range, inertia, marker='o')\n", + "plt.title('Критерий инерции')\n", + "plt.xlabel('Число кластеров')\n", + "plt.ylabel('Инерция')\n", + "\n", + "# График коэффициента силуэта\n", + "plt.subplot(1, 2, 2)\n", + "plt.plot(k_range, silhouette_scores, marker='o')\n", + "plt.title('Коэффициент силуэта')\n", + "plt.xlabel('Число кластеров')\n", + "plt.ylabel('Силуэт')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "180e85ac", + "metadata": {}, + "source": [ + "### Кластерный анализ" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "dd573024", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Кластеризация с использованием KMeans\n", + "optimal_k = 4 # Выбираем на основе графиков\n", + "kmeans = KMeans(n_clusters=optimal_k, random_state=42)\n", + "labels = kmeans.fit_predict(reduced_data)\n", + "\n", + "# Преобразуем данные из cupy в numpy\n", + "reduced_data_np = reduced_data.get()\n", + "labels_np = labels.get()\n", + "\n", + "# Визуализация кластеров\n", + "plt.scatter(reduced_data_np[:, 0], reduced_data_np[:, 1], c=labels_np, cmap='viridis')\n", + "plt.title('Кластеры (KMeans)')\n", + "plt.xlabel('PC1')\n", + "plt.ylabel('PC2')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "407d268e", + "metadata": {}, + "source": [ + "### Оценка качества кластеризации" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d00795e2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Силуэт для кластеризации: 0.58\n" + ] + } + ], + "source": [ + "# Оценка коэффициента силуэта\n", + "silhouette = silhouette_score(reduced_data, labels)\n", + "print(f'Силуэт для кластеризации: {silhouette:.2f}')" + ] + }, + { + "cell_type": "markdown", + "id": "7b4aa1da", + "metadata": {}, + "source": [ + "Получился вполне неплохой силуэт кластеризации, кластеры хорошо различимы, хоть и имеют пересечение." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}