{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 162313 entries, 0 to 162312\n", "Data columns (total 5 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 category 162313 non-null object \n", " 1 sub_category 162313 non-null object \n", " 2 href 162313 non-null object \n", " 3 items 162280 non-null object \n", " 4 price 162282 non-null float64\n", "dtypes: float64(1), object(4)\n", "memory usage: 6.2+ MB\n" ] } ], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "from cuml.preprocessing import LabelEncoder\n", "from sklearn import metrics\n", "from imblearn.over_sampling import RandomOverSampler\n", "from imblearn.under_sampling import RandomUnderSampler\n", "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", "from sklearn.metrics import ConfusionMatrixDisplay\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.linear_model import LinearRegression, LogisticRegression\n", "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier\n", "from sklearn.model_selection import train_test_split, GridSearchCV\n", "from sklearn.metrics import (\n", " precision_score, recall_score, accuracy_score, roc_auc_score, f1_score,\n", " matthews_corrcoef, cohen_kappa_score, confusion_matrix\n", ")\n", "from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n", "import numpy as np\n", "import featuretools as ft\n", "from sklearn.metrics import accuracy_score, classification_report\n", "\n", "# Функция для применения oversampling\n", "def apply_oversampling(X, y):\n", " oversampler = RandomOverSampler(random_state=42)\n", " X_resampled, y_resampled = oversampler.fit_resample(X, y)\n", " return X_resampled, y_resampled\n", "\n", "# Функция для применения undersampling\n", "def apply_undersampling(X, y):\n", " undersampler = RandomUnderSampler(random_state=42)\n", " X_resampled, y_resampled = undersampler.fit_resample(X, y)\n", " return X_resampled, y_resampled\n", "\n", "def split_stratified_into_train_val_test(\n", " df_input,\n", " stratify_colname=\"y\",\n", " frac_train=0.6,\n", " frac_val=0.15,\n", " frac_test=0.25,\n", " random_state=None,\n", "):\n", " \"\"\"\n", " Splits a Pandas dataframe into three subsets (train, val, and test)\n", " following fractional ratios provided by the user, where each subset is\n", " stratified by the values in a specific column (that is, each subset has\n", " the same relative frequency of the values in the column). It performs this\n", " splitting by running train_test_split() twice.\n", "\n", " Parameters\n", " ----------\n", " df_input : Pandas dataframe\n", " Input dataframe to be split.\n", " stratify_colname : str\n", " The name of the column that will be used for stratification. Usually\n", " this column would be for the label.\n", " frac_train : float\n", " frac_val : float\n", " frac_test : float\n", " The ratios with which the dataframe will be split into train, val, and\n", " test data. The values should be expressed as float fractions and should\n", " sum to 1.0.\n", " random_state : int, None, or RandomStateInstance\n", " Value to be passed to train_test_split().\n", "\n", " Returns\n", " -------\n", " df_train, df_val, df_test :\n", " Dataframes containing the three splits.\n", " \"\"\"\n", "\n", " if frac_train + frac_val + frac_test != 1.0:\n", " raise ValueError(\n", " \"fractions %f, %f, %f do not add up to 1.0\"\n", " % (frac_train, frac_val, frac_test)\n", " )\n", "\n", " if stratify_colname not in df_input.columns:\n", " raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n", "\n", " X = df_input # Contains all columns.\n", " y = df_input[\n", " [stratify_colname]\n", " ] # Dataframe of just the column on which to stratify.\n", "\n", " # Split original dataframe into train and temp dataframes.\n", " df_train, df_temp, y_train, y_temp = train_test_split(\n", " X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n", " )\n", "\n", " # Split the temp dataframe into val and test dataframes.\n", " relative_frac_test = frac_test / (frac_val + frac_test)\n", " df_val, df_test, y_val, y_test = train_test_split(\n", " df_temp,\n", " y_temp,\n", " stratify=y_temp,\n", " test_size=relative_frac_test,\n", " random_state=random_state,\n", " )\n", "\n", " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n", "\n", " return df_train, df_val, df_test\n", "\n", "\n", "df = pd.read_csv('/mnt/c/3curse/mii/AIM-PIbd-31-Medvedkov-A-D/data/jio_mart_items.csv')\n", "df.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Как бизнес-цели выделим следующие 2 варианта:\n", " 1) GameDev. Создание игры про конкретного персонажа, живущего в конкретном временном промежутке в конкретной стране. \n", " 2) Исследование зависимости длительности жизни от страны проживания.\n", " \n", "Поскольку именно эти бизнес-цели были выбраны в предыдущей лабораторной работе, будем их использовать.\n", "Но возникает проблема с 1 целью: её невозможно использовать для задачи классификации. Заменим ее на классификацию людей по возрастным группам, что может быть полезно для рекламных целей." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Выполним подготовку данных" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_833/3539008564.py:1: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", " df.fillna({\"category\": \"NaN\", \"sub_category\": \"NaN\", \"href\" : \"NaN\", \"items\" : \"NaN\", \"price\" : \"NaN\" }, inplace=True)\n" ] } ], "source": [ "df.fillna({\"category\": \"NaN\", \"sub_category\": \"NaN\", \"href\" : \"NaN\", \"items\" : \"NaN\", \"price\" : \"NaN\" }, inplace=True)\n", "df = df.dropna()\n", "data = df.copy()\n", "\n", "value_counts = data[\"category\"].value_counts()\n", "rare = value_counts[value_counts < 100].index\n", "data = data[~data[\"category\"].isin(rare)]\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Определить достижимый уровень качества модели для каждой задачи. На основе имеющихся данных уровень качества моделей не будет высоким, поскольку все таки длительность жизни лишь примерная и точно ее угадать невозможно." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Выберем ориентиры для наших 2х задач:\n", " 1)Регрессии - средний возраст человека\n", " 2)Классификации - аиболее часто встречающаяся возрастная группа" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Построим конвейер." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['category', 'sub_category', 'href', 'items', 'price'], dtype='object')\n" ] } ], "source": [ "print(data.columns)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 2 }