588 lines
322 KiB
Plaintext
Raw Normal View History

2024-12-20 14:14:59 +04:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 79,
2024-12-20 14:14:59 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-12-21 02:12:15 +04:00
"<class 'pandas.core.frame.DataFrame'>\n",
2024-12-20 14:14:59 +04:00
"RangeIndex: 162313 entries, 0 to 162312\n",
"Data columns (total 5 columns):\n",
2024-12-21 02:12:15 +04:00
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 category 162313 non-null object \n",
" 1 sub_category 162313 non-null object \n",
" 2 href 162313 non-null object \n",
" 3 items 162280 non-null object \n",
2024-12-20 14:14:59 +04:00
" 4 price 162282 non-null float64\n",
"dtypes: float64(1), object(4)\n",
2024-12-21 02:12:15 +04:00
"memory usage: 6.2+ MB\n"
2024-12-20 14:14:59 +04:00
]
}
],
"source": [
2024-12-21 02:12:15 +04:00
"import pandas as pd\n",
"import seaborn as sns\n",
2024-12-20 14:14:59 +04:00
"import matplotlib.pyplot as plt\n",
"from sklearn.preprocessing import LabelEncoder\n",
2024-12-21 02:12:15 +04:00
"from sklearn import metrics\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"from sklearn.metrics import ConfusionMatrixDisplay\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier\n",
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
"from sklearn.metrics import (\n",
" precision_score, recall_score, accuracy_score, roc_auc_score, f1_score,\n",
" matthews_corrcoef, cohen_kappa_score, confusion_matrix\n",
")\n",
"from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
"import numpy as np\n",
"import featuretools as ft\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
2024-12-20 14:14:59 +04:00
"\n",
2024-12-21 02:12:15 +04:00
"def apply_oversampling(X, y):\n",
"    \"\"\"Resample (X, y) with RandomOverSampler (random_state=42) and return the resampled pair.\"\"\"\n",
"    balanced_X, balanced_y = RandomOverSampler(random_state=42).fit_resample(X, y)\n",
"    return balanced_X, balanced_y\n",
"\n",
"def apply_undersampling(X, y):\n",
"    \"\"\"Resample (X, y) with RandomUnderSampler (random_state=42) and return the resampled pair.\"\"\"\n",
"    reduced_X, reduced_y = RandomUnderSampler(random_state=42).fit_resample(X, y)\n",
"    return reduced_X, reduced_y\n",
2024-12-20 14:14:59 +04:00
"\n",
2024-12-21 02:12:15 +04:00
"def split_stratified_into_train_val_test(\n",
" df_input,\n",
" stratify_colname=\"y\",\n",
" frac_train=0.6,\n",
" frac_val=0.15,\n",
" frac_test=0.25,\n",
" random_state=None,\n",
"):\n",
" \"\"\"\n",
" Splits a Pandas dataframe into three subsets (train, val, and test)\n",
" following fractional ratios provided by the user, where each subset is\n",
" stratified by the values in a specific column (that is, each subset has\n",
" the same relative frequency of the values in the column). It performs this\n",
" splitting by running train_test_split() twice.\n",
2024-12-20 14:14:59 +04:00
"\n",
2024-12-21 02:12:15 +04:00
" Parameters\n",
" ----------\n",
" df_input : Pandas dataframe\n",
" Input dataframe to be split.\n",
" stratify_colname : str\n",
" The name of the column that will be used for stratification. Usually\n",
" this column would be for the label.\n",
" frac_train : float\n",
" frac_val : float\n",
" frac_test : float\n",
" The ratios with which the dataframe will be split into train, val, and\n",
" test data. The values should be expressed as float fractions and should\n",
" sum to 1.0.\n",
" random_state : int, None, or RandomStateInstance\n",
" Value to be passed to train_test_split().\n",
2024-12-20 14:14:59 +04:00
"\n",
2024-12-21 02:12:15 +04:00
" Returns\n",
" -------\n",
" df_train, df_val, df_test :\n",
" Dataframes containing the three splits.\n",
" \"\"\"\n",
"\n",
" if frac_train + frac_val + frac_test != 1.0:\n",
" raise ValueError(\n",
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
" % (frac_train, frac_val, frac_test)\n",
" )\n",
"\n",
" if stratify_colname not in df_input.columns:\n",
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
"\n",
" X = df_input # Contains all columns.\n",
" y = df_input[\n",
" [stratify_colname]\n",
" ] # Dataframe of just the column on which to stratify.\n",
"\n",
" # Split original dataframe into train and temp dataframes.\n",
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
" )\n",
"\n",
" # Split the temp dataframe into val and test dataframes.\n",
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
" df_val, df_test, y_val, y_test = train_test_split(\n",
" df_temp,\n",
" y_temp,\n",
" stratify=y_temp,\n",
" test_size=relative_frac_test,\n",
" random_state=random_state,\n",
" )\n",
"\n",
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"\n",
" return df_train, df_val, df_test\n",
"\n",
"\n",
"df = pd.read_csv('/mnt/c/3curse/mii/AIM-PIbd-31-Medvedkov-A-D/data/jio_mart_items.csv')\n",
"df.info()\n",
"df = df.sample(n=10000 , random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Dining' 'Toys, Games & Fitness' 'Fragrances' 'Bags & Travel Luggage'\n",
" 'Girls' 'Home Decor' 'Boys' 'Stationery' 'Beverages' 'Staples' 'Men'\n",
" 'Mobiles & Tablets' 'Personal Care' 'Dairy & Bakery' 'Mom & Baby Care'\n",
" 'Snacks & Branded Foods' 'Women' 'Books' 'Auto Care' 'Electrical'\n",
" 'Furnishing' 'Accessories' 'Pets' 'Home Care' 'Mops, Brushes & Scrubs'\n",
" 'Furniture' 'Computers' 'Kitchen Appliances' 'Home Appliances' 'Cameras'\n",
" 'Make-Up' 'Garden & Outdoor' 'Disposables' 'Carpentry & work accessories'\n",
" 'Mom & Baby' 'Kitchenware' 'Power & Hand Tools' 'Pooja Needs'\n",
" 'Bathroom & Laundry Accessories' 'Office Products' 'TV & Speaker'\n",
" 'Personal Care & Grooming' 'Hair' 'Skin Care'\n",
" 'Paint, Wall Treatments & Supplies' 'Industrial & Scientific Supplies'\n",
" 'Infants' 'Kitchen & Bath Fixtures' 'Home Safety & Automation'\n",
" 'Fine Jewellery' 'Fruits & Vegetables' 'Apparel' 'Premium Fruits'\n",
" 'Phones' 'Bathroom & Laundry' 'Junior Boys' 'Tools & Appliances'\n",
" 'Smart Devices' \"Men's Grooming\" 'Gaming' 'Health Care Devices'\n",
" 'Handloom & Handicraft' 'Hardware & Plumbing' 'Wellness' 'Treatments']\n"
]
}
],
"source": [
"print(df['sub_category'].unique())"
2024-12-20 14:14:59 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-12-21 02:12:15 +04:00
"Как бизнес-цели выделим следующие 2 варианта:\n",
" 1) Регрессия - предсказание цены по категории (для аналитических систем или улучшения алгоритмов ценообразования)\n",
" 2) Классификация - определение категории продукта по его подкатегории (для логистических или аналитических систем)\n",
2024-12-21 02:12:15 +04:00
" \n",
"Однако данный датасет весьма плохо подходит для подобных задач."
2024-12-20 14:14:59 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-12-21 02:12:15 +04:00
"Выполним подготовку данных"
2024-12-20 14:14:59 +04:00
]
},
{
"cell_type": "code",
"execution_count": 81,
2024-12-20 14:14:59 +04:00
"metadata": {},
"outputs": [],
2024-12-20 14:14:59 +04:00
"source": [
"# df.fillna({\"category\": \"NaN\", \"sub_category\": \"NaN\", \"href\" : \"NaN\", \"items\" : \"NaN\", \"price\" : \"NaN\" }, inplace=True)\n",
2024-12-21 02:12:15 +04:00
"df = df.dropna()\n",
"data = df.copy()\n",
2024-12-20 14:14:59 +04:00
"\n",
2024-12-21 02:12:15 +04:00
"value_counts = data[\"category\"].value_counts()\n",
"rare = value_counts[value_counts < 100].index\n",
"data = data[~data[\"category\"].isin(rare)]\n",
"\n",
"data1 = pd.get_dummies(data, columns=['category', 'sub_category'], drop_first=True)\n",
"\n",
"# label_encoder = LabelEncoder()\n",
"# data1['sub_category'] = label_encoder.fit_transform(data['sub_category'])\n",
"# data1['category'] = label_encoder.fit_transform(data['category'])\n",
"# data1['items'] = label_encoder.fit_transform(data['items'])\n"
2024-12-20 14:14:59 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Определить достижимый уровень качества модели для каждой задачи. На основе имеющихся данных уровень качества моделей регрессии будет низким, поскольку цена слабо коррелирует с категорией (кроме некоторых исключений)."
2024-12-20 14:14:59 +04:00
]
},
{
2024-12-21 02:12:15 +04:00
"cell_type": "markdown",
2024-12-20 14:14:59 +04:00
"metadata": {},
"source": [
"Построим конвейер."
2024-12-20 14:14:59 +04:00
]
},
{
"cell_type": "code",
"execution_count": 82,
2024-12-20 14:14:59 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['href', 'items', 'price', 'category_Electronics', 'category_Fashion',\n",
" 'category_Groceries', 'category_Home & Kitchen', 'sub_category_Apparel',\n",
" 'sub_category_Auto Care', 'sub_category_Bags & Travel Luggage',\n",
" 'sub_category_Bathroom & Laundry',\n",
" 'sub_category_Bathroom & Laundry Accessories', 'sub_category_Beverages',\n",
" 'sub_category_Books', 'sub_category_Boys', 'sub_category_Cameras',\n",
" 'sub_category_Carpentry & work accessories', 'sub_category_Computers',\n",
" 'sub_category_Dairy & Bakery', 'sub_category_Dining',\n",
" 'sub_category_Disposables', 'sub_category_Electrical',\n",
" 'sub_category_Fragrances', 'sub_category_Fruits & Vegetables',\n",
" 'sub_category_Furnishing', 'sub_category_Furniture',\n",
" 'sub_category_Gaming', 'sub_category_Garden & Outdoor',\n",
" 'sub_category_Girls', 'sub_category_Hair',\n",
" 'sub_category_Handloom & Handicraft',\n",
" 'sub_category_Hardware & Plumbing', 'sub_category_Health Care Devices',\n",
" 'sub_category_Home Appliances', 'sub_category_Home Care',\n",
" 'sub_category_Home Decor', 'sub_category_Home Safety & Automation',\n",
" 'sub_category_Industrial & Scientific Supplies', 'sub_category_Infants',\n",
" 'sub_category_Junior Boys', 'sub_category_Kitchen & Bath Fixtures',\n",
" 'sub_category_Kitchen Appliances', 'sub_category_Kitchenware',\n",
" 'sub_category_Make-Up', 'sub_category_Men',\n",
" 'sub_category_Men's Grooming', 'sub_category_Mobiles & Tablets',\n",
" 'sub_category_Mom & Baby', 'sub_category_Mom & Baby Care',\n",
" 'sub_category_Mops, Brushes & Scrubs', 'sub_category_Office Products',\n",
" 'sub_category_Paint, Wall Treatments & Supplies',\n",
" 'sub_category_Personal Care', 'sub_category_Personal Care & Grooming',\n",
" 'sub_category_Pets', 'sub_category_Phones', 'sub_category_Pooja Needs',\n",
" 'sub_category_Power & Hand Tools', 'sub_category_Premium Fruits',\n",
" 'sub_category_Skin Care', 'sub_category_Smart Devices',\n",
" 'sub_category_Snacks & Branded Foods', 'sub_category_Staples',\n",
" 'sub_category_Stationery', 'sub_category_TV & Speaker',\n",
" 'sub_category_Tools & Appliances', 'sub_category_Toys, Games & Fitness',\n",
" 'sub_category_Wellness', 'sub_category_Women'],\n",
" dtype='object')\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 9995 entries, 52893 to 146053\n",
"Data columns (total 69 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 href 9995 non-null object \n",
" 1 items 9995 non-null object \n",
" 2 price 9995 non-null float64\n",
" 3 category_Electronics 9995 non-null bool \n",
" 4 category_Fashion 9995 non-null bool \n",
" 5 category_Groceries 9995 non-null bool \n",
" 6 category_Home & Kitchen 9995 non-null bool \n",
" 7 sub_category_Apparel 9995 non-null bool \n",
" 8 sub_category_Auto Care 9995 non-null bool \n",
" 9 sub_category_Bags & Travel Luggage 9995 non-null bool \n",
" 10 sub_category_Bathroom & Laundry 9995 non-null bool \n",
" 11 sub_category_Bathroom & Laundry Accessories 9995 non-null bool \n",
" 12 sub_category_Beverages 9995 non-null bool \n",
" 13 sub_category_Books 9995 non-null bool \n",
" 14 sub_category_Boys 9995 non-null bool \n",
" 15 sub_category_Cameras 9995 non-null bool \n",
" 16 sub_category_Carpentry & work accessories 9995 non-null bool \n",
" 17 sub_category_Computers 9995 non-null bool \n",
" 18 sub_category_Dairy & Bakery 9995 non-null bool \n",
" 19 sub_category_Dining 9995 non-null bool \n",
" 20 sub_category_Disposables 9995 non-null bool \n",
" 21 sub_category_Electrical 9995 non-null bool \n",
" 22 sub_category_Fragrances 9995 non-null bool \n",
" 23 sub_category_Fruits & Vegetables 9995 non-null bool \n",
" 24 sub_category_Furnishing 9995 non-null bool \n",
" 25 sub_category_Furniture 9995 non-null bool \n",
" 26 sub_category_Gaming 9995 non-null bool \n",
" 27 sub_category_Garden & Outdoor 9995 non-null bool \n",
" 28 sub_category_Girls 9995 non-null bool \n",
" 29 sub_category_Hair 9995 non-null bool \n",
" 30 sub_category_Handloom & Handicraft 9995 non-null bool \n",
" 31 sub_category_Hardware & Plumbing 9995 non-null bool \n",
" 32 sub_category_Health Care Devices 9995 non-null bool \n",
" 33 sub_category_Home Appliances 9995 non-null bool \n",
" 34 sub_category_Home Care 9995 non-null bool \n",
" 35 sub_category_Home Decor 9995 non-null bool \n",
" 36 sub_category_Home Safety & Automation 9995 non-null bool \n",
" 37 sub_category_Industrial & Scientific Supplies 9995 non-null bool \n",
" 38 sub_category_Infants 9995 non-null bool \n",
" 39 sub_category_Junior Boys 9995 non-null bool \n",
" 40 sub_category_Kitchen & Bath Fixtures 9995 non-null bool \n",
" 41 sub_category_Kitchen Appliances 9995 non-null bool \n",
" 42 sub_category_Kitchenware 9995 non-null bool \n",
" 43 sub_category_Make-Up 9995 non-null bool \n",
" 44 sub_category_Men 9995 non-null bool \n",
" 45 sub_category_Men's Grooming 9995 non-null bool \n",
" 46 sub_category_Mobiles & Tablets 9995 non-null bool \n",
" 47 sub_category_Mom & Baby 9995 non-null bool \n",
" 48 sub_category_Mom & Baby Care 9995 non-null bool \n",
" 49 sub_category_Mops, Brushes & Scrubs 9995 non-null bool \n",
" 50 sub_category_Office Products 9995 non-null bool \n",
" 51 sub_category_Paint, Wall Treatments & Supplies 9995 non-null bool \n",
" 52 sub_category_Personal Care 9995 non-null bool \n",
" 53 sub_category_Personal Care & Grooming 9995 non-null bool \n",
" 54 sub_category_Pets 9995 non-null bool \n",
" 55 sub_category_Phones 9995 non-null bool \n",
" 56 sub_category_Pooja Needs 9995 non-null bool \n",
" 57 sub_category_Power & Hand Tools 9995 non-null bool \n",
" 58 sub_category_Premium Fruits 9995 non-null bool \n",
" 59 sub_category_Skin Care 9995 non-null bool \n",
" 60 sub_category_Smart Devices 9995 non-null bool \n",
" 61 sub_category_Snacks & Branded Foods 9995 non-null bool \n",
" 62 sub_category_Staples 9995 non-null bool \n",
" 63 sub_category_Stationery 9995 non-null bool \n",
" 64 sub_category_TV & Speaker 9995 non-null bool \n",
" 65 sub_category_Tools & Appliances 9995 non-null bool \n",
" 66 sub_category_Toys, Games & Fitness 9995 non-null bool \n",
" 67 sub_category_Wellness 9995 non-null bool \n",
" 68 sub_category_Women 9995 non-null bool \n",
"dtypes: bool(66), float64(1), object(2)\n",
"memory usage: 956.6+ KB\n"
]
}
],
2024-12-20 14:14:59 +04:00
"source": [
"print(data1.columns)\n",
"data1.info()"
2024-12-20 14:14:59 +04:00
]
},
{
"cell_type": "code",
"execution_count": 83,
2024-12-20 14:14:59 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters for Linear Regression: {}\n",
"Best parameters for Random Forest Regressor: {'model__max_depth': None, 'model__n_estimators': 300}\n",
"Best parameters for Gradient Boosting Regressor: {'model__learning_rate': 0.01, 'model__max_depth': 7, 'model__n_estimators': 300}\n",
"Model: Linear Regression\n",
"Model: Random Forest Regressor\n",
"Model: Gradient Boosting Regressor\n"
2024-12-20 14:14:59 +04:00
]
}
],
"source": [
"# Features are the one-hot encoded category columns; the target is the price\n",
"X_reg = data1.drop(['href', 'items', 'price'], axis=1)\n",
"y_reg = data1['price']\n",
"\n",
"# Hold out 20% of the rows for testing\n",
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
"\n",
"# Candidate regression models\n",
"models_reg = {\n",
"    'Linear Regression': LinearRegression(),\n",
"    'Random Forest Regressor': RandomForestRegressor(random_state=42),\n",
"    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42)\n",
"}\n",
"\n",
"# Wrap every model in a pipeline that scales the features first\n",
"pipelines_reg = {}\n",
"for name, model in models_reg.items():\n",
"    pipelines_reg[name] = Pipeline([\n",
"        ('scaler', StandardScaler()),\n",
"        ('model', model)\n",
"    ])\n",
"\n",
"# Hyperparameter grids, keyed by model name (empty grid = no tuning)\n",
"param_grids_reg = {\n",
"    'Linear Regression': {},\n",
"    'Random Forest Regressor': {\n",
"        'model__n_estimators': [100, 200, 300],\n",
"        'model__max_depth': [None, 10, 20, 30]\n",
"    },\n",
"    'Gradient Boosting Regressor': {\n",
"        'model__n_estimators': [100, 200, 300],\n",
"        'model__learning_rate': [0.01, 0.1, 0.2],\n",
"        'model__max_depth': [3, 5, 7]\n",
"    }\n",
"}\n",
"\n",
"# Tune each pipeline with 5-fold cross-validation, scored by negative MSE\n",
"best_models_reg = {}\n",
"for name, pipeline in pipelines_reg.items():\n",
"    grid_search = GridSearchCV(pipeline, param_grids_reg[name], cv=5, scoring='neg_mean_squared_error')\n",
"    grid_search.fit(X_train_reg, y_train_reg)\n",
"    best_models_reg[name] = {\n",
"        'pipeline': grid_search.best_estimator_,\n",
"        'best_params': grid_search.best_params_\n",
"    }\n",
"    print(f'Best parameters for {name}: {grid_search.best_params_}')\n",
"\n",
"# Evaluate the tuned models on the train and test sets\n",
"for model_name in best_models_reg.keys():\n",
"    print(f\"Model: {model_name}\")\n",
"    # best_estimator_ was already refitted on the whole training set by\n",
"    # GridSearchCV (refit=True is the default), so it is not fitted again here.\n",
"    model_pipeline = best_models_reg[model_name]['pipeline']\n",
"\n",
"    y_train_predict = model_pipeline.predict(X_train_reg)\n",
"    y_test_predict = model_pipeline.predict(X_test_reg)\n",
"\n",
"    best_models_reg[model_name][\"preds_train\"] = y_train_predict\n",
"    best_models_reg[model_name][\"preds_test\"] = y_test_predict\n",
"\n",
"    best_models_reg[model_name][\"MSE_train\"] = mean_squared_error(y_train_reg, y_train_predict)\n",
"    best_models_reg[model_name][\"MSE_test\"] = mean_squared_error(y_test_reg, y_test_predict)\n",
"    best_models_reg[model_name][\"R2_train\"] = r2_score(y_train_reg, y_train_predict)\n",
"    best_models_reg[model_name][\"R2_test\"] = r2_score(y_test_reg, y_test_predict)\n",
"    best_models_reg[model_name][\"MAE_train\"] = mean_absolute_error(y_train_reg, y_train_predict)\n",
"    best_models_reg[model_name][\"MAE_test\"] = mean_absolute_error(y_test_reg, y_test_predict)"
2024-12-20 14:14:59 +04:00
]
2024-12-21 02:12:15 +04:00
},
{
"cell_type": "code",
"execution_count": 84,
2024-12-21 02:12:15 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.991495747873937\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 0 0.99 0.90 0.94 131\n",
" 1 0.99 1.00 0.99 241\n",
" 2 1.00 1.00 1.00 307\n",
" 3 0.98 1.00 0.99 573\n",
" 4 1.00 1.00 1.00 747\n",
"\n",
" accuracy 0.99 1999\n",
" macro avg 0.99 0.98 0.98 1999\n",
"weighted avg 0.99 0.99 0.99 1999\n",
"\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAxQAAAK9CAYAAAC95yoDAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjAsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvlHJYcgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAjKVJREFUeJzs3Xl8TOf7//H3JLJIyCIkoSX2JUQpPsRS1L6V0kWphupCLSWKatXaSutTpa2tK6p7FZ+W2ilF7PtaS0iVCLHEEgnJ+f3hZ76ZJshMJWcir2cf51Fzn+2aOZlJrrnu+9wWwzAMAQAAAIADXMwOAAAAAEDuRUIBAAAAwGEkFAAAAAAcRkIBAAAAwGEkFAAAAAAcRkIBAAAAwGEkFAAAAAAcRkIBAAAAwGEkFAAAAAAcRkIBAJk4dOiQmjdvLl9fX1ksFs2fP/+eHv/YsWOyWCyaOXPmPT1ubtaoUSM1atTI7DAAAHYioQDgtI4cOaKXX35ZpUuXlqenp3x8fFSvXj19+OGHSkpKytZzR0REaPfu3XrnnXc0e/Zs1axZM1vPl5O6d+8ui8UiHx+fTF/HQ4cOyWKxyGKx6P3337f7+CdPntSoUaO0Y8eOexAtAMDZ5TM7AADIzMKFC/Xkk0/Kw8NDzz33nKpUqaKUlBStXbtWgwcP1t69e/Xpp59my7mTkpIUHR2tN998U3379s2Wc4SEhCgpKUlubm7Zcvy7yZcvn65evapff/1VTz31lM26b775Rp6enrp27ZpDxz558qRGjx6tkiVLqlq1alneb+nSpQ6dDwBgLhIKAE4nJiZGnTt3VkhIiFauXKmiRYta1/Xp00eHDx/WwoULs+38Z86ckST5+fll2zksFos8PT2z7fh34+HhoXr16um7777LkFB8++23atOmjX7++eccieXq1avy8vKSu7t7jpwPAHBv0eUJgNMZP368Ll++rC+++MImmbilbNmyevXVV62Pb9y4obFjx6pMmTLy8PBQyZIl9cYbbyg5Odlmv5IlS6pt27Zau3at/vOf/8jT01OlS5fWV199Zd1m1KhRCgkJkSQNHjxYFotFJUuWlHSzq9Ctf6c3atQoWSwWm7Zly5apfv368vPzU4ECBVShQgW98cYb1vW3G0OxcuVKNWjQQN7e3vLz81P79u21f//+TM93+PBhde/eXX5+fvL19VWPHj109erV27+w/9ClSxctWrRIFy5csLZt3rxZhw4dUpcuXTJsf+7cOb322msKCwtTgQIF5OPjo1atWmnnzp3WbX7//XfVqlVLktSjRw9r16lbz7NRo0aqUqWKtm7dqkceeUReXl7W1+WfYygiIiLk6emZ4fm3aNFC/v7+OnnyZJafKwAg+5BQAHA6v/76q0qXLq26detmafsXXnhBI0aM0MMPP6yJEyeqYcOGioqKUufOnTNse/jwYT3xxBNq1qyZJkyYIH9/f3Xv3l179+6VJHXs2FETJ06UJD3zzDOaPXu2Jk2aZFf8e/fuVdu2bZWcnKwxY8ZowoQJeuyxx7Ru3bo77rd8+XK1aNFC8fHxGjVqlCIjI7V+/XrVq1dPx44dy7D9U089pUuXLikqKkpPPfWUZs6cqdGjR2c5zo4dO8pisWju3LnWtm+//VYVK1bUww8/nGH7o0ePav78+Wrbtq0++OADDR48WLt371bDhg2tf9xXqlRJY8aMkSS99NJLmj17tmbPnq1HHnnEepyEhAS1atVK1apV06RJk9S4ceNM4/vwww9VpEgRRUREKDU1VZL0ySefaOnSpfr4449VrFixLD9XAEA2MgDAiVy8eNGQZLRv3z5L2+/YscOQZLzwwgs27a+99pohyVi5cqW1LSQkxJBkrFmzxtoWHx9veHh4GIMGDbK2xcTEGJKM//73vzbHjIiIMEJCQjLEMHLkSCP9x+nEiRMNScaZM2duG/etc8yYMcPaVq1aNSMwMNBISEiwtu3cudNwcXExnnvuuQ
zne/75522O+fjjjxsBAQG3PWf65+Ht7W0YhmE88cQTRpMmTQzDMIzU1FQjODjYGD16dKavwbVr14zU1NQMz8PDw8MYM2aMtW3z5s0ZntstDRs2NCQZ06dPz3Rdw4YNbdqWLFliSDLefvtt4+jRo0aBAgWMDh063PU5AgByDhUKAE4lMTFRklSwYMEsbf/bb79JkiIjI23aBw0aJEkZxlqEhoaqQYMG1sdFihRRhQoVdPToUYdj/qdbYy/+97//KS0tLUv7nDp1Sjt27FD37t1VqFAha3vVqlXVrFkz6/NMr1evXjaPGzRooISEBOtrmBVdunTR77//rri4OK1cuVJxcXGZdneSbo67cHG5+WsjNTVVCQkJ1u5c27Zty/I5PTw81KNHjyxt27x5c7388ssaM2aMOnbsKE9PT33yySdZPhcAIPuRUABwKj4+PpKkS5cuZWn748ePy8XFRWXLlrVpDw4Olp+fn44fP27TXqJEiQzH8Pf31/nz5x2MOKOnn35a9erV0wsvvKCgoCB17txZP/744x2Ti1txVqhQIcO6SpUq6ezZs7py5YpN+z+fi7+/vyTZ9Vxat26tggUL6ocfftA333yjWrVqZXgtb0lLS9PEiRNVrlw5eXh4qHDhwipSpIh27dqlixcvZvmcDzzwgF0DsN9//30VKlRIO3bs0EcffaTAwMAs7wsAyH4kFACcio+Pj4oVK6Y9e/bYtd8/B0Xfjqura6bthmE4fI5b/ftvyZ8/v9asWaPly5erW7du2rVrl55++mk1a9Ysw7b/xr95Lrd4eHioY8eOmjVrlubNm3fb6oQkjRs3TpGRkXrkkUf09ddfa8mSJVq2bJkqV66c5UqMdPP1scf27dsVHx8vSdq9e7dd+wIAsh8JBQCn07ZtWx05ckTR0dF33TYkJERpaWk6dOiQTfvp06d14cIF6x2b7gV/f3+bOyLd8s8qiCS5uLioSZMm+uCDD7Rv3z698847WrlypVatWpXpsW/FefDgwQzrDhw4oMKFC8vb2/vfPYHb6NKli7Zv365Lly5lOpD9ljlz5qhx48b64osv1LlzZzVv3lxNmzbN8JpkNbnLiitXrqhHjx4KDQ3VSy+9pPHjx2vz5s337PgAgH+PhAKA0xkyZIi8vb31wgsv6PTp0xnWHzlyRB9++KGkm112JGW4E9MHH3wgSWrTps09i6tMmTK6ePGidu3aZW07deqU5s2bZ7PduXPnMux7a4K3f97K9paiRYuqWrVqmjVrls0f6Hv27NHSpUutzzM7NG7cWGPHjtXkyZMVHBx82+1cXV0zVD9++ukn/f333zZttxKfzJIvew0dOlSxsbGaNWuWPvjgA5UsWVIRERG3fR0BADmPie0AOJ0yZcro22+/1dNPP61KlSrZzJS9fv16/fTTT+revbsk6aGHHlJERIQ+/fRTXbhwQQ0bNtSmTZs0a9YsdejQ4ba3JHVE586dNXToUD3++OPq37+/rl69qmnTpql8+fI2g5LHjBmjNWvWqE2bNgoJCVF8fLymTp2qBx98UPXr17/t8f/73/+qVatWCg8PV8+ePZWUlKSPP/5Yvr6+GjVq1D17Hv/k4uKi4cOH33W7tm3basyYMerRo4fq1q2r3bt365tvvlHp0qVttitTpoz8/Pw0ffp0FSxYUN7e3qpdu7ZKlSplV1wrV67U1KlTNXLkSOttbGfMmKFGjRrprbfe0vjx4+06HgAge1ChAOCUHnvsMe3atUtPPPGE/ve//6lPnz56/fXXdezYMU2YMEEfffSRddvPP/9co0eP1ubNmzVgwACtXLlSw4YN0/fff39PYwoICNC8efPk5eWlIUOGaNasWYqKilK7du0yxF6iRAl9+eWX6tOnj6ZMmaJHHnlEK1eulK+v722P37RpUy1evFgBAQEaMWKE3n//fdWpU0fr1q2z+4/x7PDGG29o0KBBWrJkiV599VVt27ZNCxcuVPHixW22c3Nz06xZs+Tq6qpevX
rpmWee0erVq+0616VLl/T888+revXqevPNN63tDRo00KuvvqoJEyZow4YN9+R5AQD+HYthz+g9AAAAAEiHCgUAAAA
"text/plain": [
"<Figure size 1000x800 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Encode the categorical feature as integer labels\n",
"label_encoder = LabelEncoder()\n",
"data['sub_category_encoded'] = label_encoder.fit_transform(data['sub_category'])\n",
"\n",
"# Feature matrix (X) is the encoded sub_category; target (y) is the encoded category.\n",
"# The encoder is re-fitted on category here, so label_encoder.classes_ holds category names below.\n",
"X = data[['sub_category_encoded']]\n",
"y = label_encoder.fit_transform(data['category'])\n",
"\n",
"# Stratified 80/20 train/test split\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.2, random_state=42, stratify=y\n",
")\n",
"\n",
"# Build and fit the classifier\n",
"classifier = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10)\n",
"classifier.fit(X_train, y_train)\n",
"\n",
"# Predict on the held-out test set\n",
"y_pred = classifier.predict(X_test)\n",
"\n",
"# Report model quality\n",
"print(\"Accuracy:\", accuracy_score(y_test, y_pred))\n",
"print(\"Classification Report:\\n\", classification_report(y_test, y_pred))\n",
"\n",
"# Confusion matrix heatmap, labelled with the category names\n",
"cm = confusion_matrix(y_test, y_pred)\n",
"plt.figure(figsize=(10, 8))\n",
"sns.heatmap(\n",
"    cm,\n",
"    annot=True,\n",
"    fmt='d',\n",
"    cmap='Blues',\n",
"    xticklabels=label_encoder.classes_,\n",
"    yticklabels=label_encoder.classes_,\n",
")\n",
"plt.xlabel('Predicted')\n",
"plt.ylabel('Actual')\n",
"plt.title('Confusion Matrix')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Модель классификации показывает неплохие результаты, что логично, учитывая структуру датасета."
]
2024-12-21 02:12:15 +04:00
},
{
"cell_type": "code",
"execution_count": 85,
2024-12-21 02:12:15 +04:00
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABBgAAAQ9CAYAAADzgP7sAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjAsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvlHJYcgAAAAlwSFlzAAAPYQAAD2EBqD+naQABAABJREFUeJzs3Xl4TGf7wPHvTJKZyR5bNoKI2IOi0thbqahU3xS1176W2mpt7bW0VJUWqbbo22qLVr39oSqCKlK7Ei0VtZNYk5F9mfP7Y5rDSJCQ3f25rnPJOeeZc+4zk3hm7nnO/WgURVEQQgghhBBCCCGEeALawg5ACCGEEEIIIYQQxZ8kGIQQQgghhBBCCPHEJMEghBBCCCGEEEKIJyYJBiGEEEIIIYQQQjwxSTAIIYQQQgghhBDiiUmCQQghhBBCCCGEEE9MEgxCCCGEEEIIIYR4YpJgEEIIIYQQQgghxBOTBIMQQgghhBBCCCGemCQYRJFy7tw5NBoNq1atKuxQxBPo06cPlStXLuwwSgyNRsP06dMLO4xC16pVK1q1aqWuF8X/L+6PUYiipij+3Yjck37WbPr06Wg0mhy1LYi+VPqAnJM+veSSBIMoMKtWrUKj0XDw4MHCDiXfZHZ0mYuNjQ2VK1dmxIgRxMbGFnZ4Ali6dCkajQZ/f//HPsaVK1eYPn06R48ezbvAiridO3dm+d2uUqUKvXr14p9//ins8HJl7969TJ8+Xf4mRYkj/WxsYYf31Mr83ctcrK2tKV++PH369OHy5cuFHZ64j/TpIj9ZF3YAQtyrUqVKJCUlYWNjU9ihPJFly5bh4OBAQkIC4eHhfPzxxxw+fJjdu3cXdmgF4rPPPsNkMhV2GNlavXo1lStXZv/+/URFRVG1atVcH+PKlSvMmDGDypUrU79+/bwPsggbMWIEzz77LGlpaRw+fJjly5ezadMmjh8/jqenZ4HG8rj/X+zdu5cZM2bQp08fXFxc8ic4IYoo6WdLhqLaz86cORNvb2+Sk5P5/fffWbVqFbt37yYyMhKDwZDn55s8eTITJ07M8+M+LaRPF/lBRjCIIkWj0WAwGLCysirsUB4oMTHxkW06depEz549GTx4MGvXrqVLly7s2bOH/fv3F0CEd5lMJpKTkwv0nAA2Njbo9foCP++jnD17lr179/Lhhx9Srlw5Vq9eXdghFTvNmzenZ8+e9O3bl48//pgPPviAW7du8eWXXz7wMQkJCfkSS3H4/0KIoqY4/N1IP/toRbWffemll+jZsycDBgzg888/Z+zYsZw5c4affvopX85nbW2dL4mLp4X06SI/SIJBFCnZ3X/Vp08fHBwcuHz5MiEhITg4OFCuXDnGjh1LRkaGxeNNJhMfffQRtWvXxmAw4ObmxuDBg7l9+7ZFu//9738EBwfj6emJXq/Hx8eHd999N8vxWrVqRZ06dTh06BAtWrTAzs6Ot99+O9fX1bx5cwDOnDljsX3fvn20bdsWZ2dn7OzsaNmyJXv27Mny+J07d9KoUSMMBgM+Pj58+umn2d53qNFoGD58OKtXr6Z27dro9Xq2bNkCwOXLl+nXrx9ubm7o9Xpq167NihUrspzr448/pnbt2tjZ2VGqVCkaNWrEN998o+6/c+cOo0aNonLlyuj1elxdXXnxxRc5fPiw2ia7e0MTEhJ466238PLyQq/XU716dT744AMURcn2GjZs2ECdOnXUWDOv414nT57kwoUL2T3l2Vq9ejWlSpUiODiYTp06PTDBEBsby+jRo9VrrFChAr169eLGjRvs3LmTZ599FoC+ffuqwwszf2crV65Mnz59shzz/vv4UlNTmTp1Kg0bNsTZ2Rl7e3uaN2/Ojh07cnw9mWJiYrC2tmbGjBlZ9p06dQqNRsMnn3wCQFpaGjNmzMDX1xeDwUCZMmVo1qwZYWFhuT4vwAsvvACYkzdwd/jyn3/+Sffu3SlVqhTNmjVT23/99dc0bNgQW1tbSp
cuTdeuXbl48WKW4y5fvhwfHx9sbW1p3Lgxv/32W5Y2D7pf8+TJk3Tu3Jly5cpha2tL9erVeeedd9T4xo0bB4C3t7f6+p07dy5fYhSiqJF+VvrZ/Oxn7/eg1+XkyZN06tSJ0qVLYzAYaNSoUZYkRE76q+xeo5SUFEaPHk25cuVwdHTklVde4dKlS1lie1Adi+yOuXLlSl544QVcXV3R6/XUqlWLZcuW5eg5eNTrfT/p01dZbJc+vfiRWyREsZCRkUFQUBD+/v588MEHbNu2jQULFuDj48PQoUPVdoMHD2bVqlX07duXESNGcPbsWT755BOOHDnCnj171GFXq1atwsHBgTFjxuDg4MD27duZOnUqRqOR+fPnW5z75s2bvPTSS3Tt2pWePXvi5uaW6/gz/6MrVaqUum379u289NJLNGzYkGnTpqHVatUO7LfffqNx48YAHDlyhLZt2+Lh4cGMGTPIyMhg5syZlCtXLttzbd++nbVr1zJ8+HDKli1L5cqViYmJ4bnnnlPfVJQrV46ff/6Z/v37YzQaGTVqFGAecjlixAg6derEyJEjSU5O5tixY+zbt4/u3bsDMGTIEL7//nuGDx9OrVq1uHnzJrt37+avv/6iQYMG2cakKAqvvPIKO3bsoH///tSvX59ffvmFcePGcfnyZRYuXGjRfvfu3axfv5433ngDR0dHFi9eTMeOHblw4QJlypRR29WsWZOWLVuyc+fOHL0Oq1evpkOHDuh0Orp168ayZcs4cOCAmjAAiI+Pp3nz5vz111/069ePBg0acOPGDX766ScuXbpEzZo1mTlzJlOnTmXQoEHqm6cmTZrkKIZMRqORzz//nG7dujFw4EDu3LnDF198QVBQEPv378/VrRdubm60bNmStWvXMm3aNIt9a9aswcrKitdeew0wd8Zz585lwIABNG7cGKPRyMGDBzl8+DAvvvhirq4B7r5pvPd1AXjttdfw9fVlzpw56pvb2bNnM2XKFDp37syAAQO4fv06H3/8MS1atODIkSPq0MYvvviCwYMH06RJE0aNGsU///zDK6+8QunSpfHy8npoPMeOHaN58+bY2NgwaNAgKleuzJkzZ/i///s/Zs+eTYcOHfj777/59ttvWbhwIWXLlgVQ/54KIkYhiiLpZ6WfzYt+9n7ZvS4nTpygadOmlC9fnokTJ2Jvb8/atWsJCQnhhx9+4NVXXwUev78aMGAAX3/9Nd27d6dJkyZs376d4ODgx4o/07Jly6hduzavvPIK1tbW/N///R9vvPEGJpOJYcOGPfBxOXm97yd9+l3SpxdTihAFZOXKlQqgHDhw4IFtzp49qwDKypUr1W29e/dWAGXmzJkWbZ955hmlYcOG6vpvv/2mAMrq1ast2m3ZsiXL9sTExCznHjx4sGJnZ6ckJyer21q2bKkASmhoaI6ucdq0aQqgnDp1Srl+/bpy7tw5ZcWKFYqtra1Srlw5JSEhQVEURTGZTIqvr68SFBSkmEwmi7i8vb2VF198Ud3Wvn17xc7OTrl8+bK67fTp04q1tbVy/58woGi1WuXEiRMW2/v37694eHgoN27csNjetWtXxdnZWX0+/vOf/yi1a9d+6DU6Ozsrw4YNe2ib3r17K5UqVVLXN2zYoADKrFmzLNp16tRJ0Wg0SlRUlMU16HQ6i21//PGHAigff/xxlutt2bLlQ2PJdPDgQQVQwsLCFEUxvwYVKlRQRo4cadFu6tSpCqCsX78+yzEyX6sDBw5k+T3NVKlSJaV3795Ztrds2dIi1vT0dCUlJcWize3btxU3NzelX79+FtsBZdq0aQ+9vk8//VQBlOPHj1tsr1WrlvLCCy+o6/Xq1VOCg4Mfeqzs7NixQwGUFStWKNevX1euXLmibNq0SalcubKi0WjUv+vMv4Fu3bpZPP7cuXOKlZWVMnv2bIvtx48fV6ytrdXtqampiqurq1K/fn2L52f58uVZXu/s/r9o0aKF4ujoqJw/f97iPPf+nc2fP1
8BlLNnz+Z7jEIUJOlnpZ8trH4283dv27ZtyvXr15WLFy8q33//vVKuXDlFr9crFy9eVNu2bt1a8fPzs/g9MJlMSpM
"text/plain": [
"<Figure size 1200x1000 with 6 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# One row per tuned model: actual-vs-predicted on the left, residuals on the right.\n",
"# The row count follows the number of models instead of being hard-coded to 3.\n",
"_, ax = plt.subplots(len(best_models_reg), 2, figsize=(12, 10), sharex=False, sharey=False)\n",
"ax = ax.flatten()\n",
"\n",
"# The test target is the same for every model, so compute the end points of the\n",
"# reference diagonal once instead of rescanning the series on each iteration.\n",
"actual_range = [y_test_reg.min(), y_test_reg.max()]\n",
"\n",
"for index, (name, model) in enumerate(best_models_reg.items()):\n",
"    model_pipeline = model['pipeline']\n",
"    y_pred_reg = model_pipeline.predict(X_test_reg)\n",
"\n",
"    # Actual values against predicted values; the dashed line marks a perfect fit\n",
"    ax[index * 2].scatter(y_test_reg, y_pred_reg, alpha=0.5)\n",
"    ax[index * 2].plot(actual_range, actual_range, color='red', linestyle='--')\n",
"    ax[index * 2].set_xlabel('Actual Values')\n",
"    ax[index * 2].set_ylabel('Predicted Values')\n",
"    ax[index * 2].set_title(f'{name}: Actual vs Predicted')\n",
"\n",
"    # Residuals (actual minus predicted) against predicted values\n",
"    residuals = y_test_reg - y_pred_reg\n",
"    ax[index * 2 + 1].scatter(y_pred_reg, residuals, alpha=0.5)\n",
"    ax[index * 2 + 1].axhline(y=0, color='red', linestyle='--')\n",
"    ax[index * 2 + 1].set_xlabel('Predicted Values')\n",
"    ax[index * 2 + 1].set_ylabel('Residuals')\n",
"    ax[index * 2 + 1].set_title(f'{name}: Residuals vs Predicted')\n",
"\n",
"\n",
"plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Модель регрессии демонстрирует ужасные результаты ввиду недостаточной корреляции между целевой характеристикой и строковыми значениями."
]
2024-12-20 14:14:59 +04:00
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
2024-12-21 02:12:15 +04:00
"version": "3.12.3"
2024-12-20 14:14:59 +04:00
}
},
"nbformat": 4,
2024-12-21 02:12:15 +04:00
"nbformat_minor": 2
2024-12-20 14:14:59 +04:00
}