{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 162313 entries, 0 to 162312\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 category 162313 non-null object \n",
" 1 sub_category 162313 non-null object \n",
" 2 href 162313 non-null object \n",
" 3 items 162280 non-null object \n",
" 4 price 162282 non-null float64\n",
"dtypes: float64(1), object(4)\n",
"memory usage: 6.2+ MB\n"
]
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from cuml.preprocessing import LabelEncoder\n",
"from sklearn import metrics\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"from sklearn.metrics import ConfusionMatrixDisplay\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier\n",
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
"from sklearn.metrics import (\n",
" precision_score, recall_score, accuracy_score, roc_auc_score, f1_score,\n",
" matthews_corrcoef, cohen_kappa_score, confusion_matrix\n",
")\n",
"from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
"import numpy as np\n",
"import featuretools as ft\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"\n",
"# Function for applying oversampling\n",
"def apply_oversampling(X, y):\n",
" oversampler = RandomOverSampler(random_state=42)\n",
" X_resampled, y_resampled = oversampler.fit_resample(X, y)\n",
" return X_resampled, y_resampled\n",
"\n",
"# Функция для применения undersampling\n",
"def apply_undersampling(X, y):\n",
" undersampler = RandomUnderSampler(random_state=42)\n",
" X_resampled, y_resampled = undersampler.fit_resample(X, y)\n",
" return X_resampled, y_resampled\n",
"\n",
"def split_stratified_into_train_val_test(\n",
" df_input,\n",
" stratify_colname=\"y\",\n",
" frac_train=0.6,\n",
" frac_val=0.15,\n",
" frac_test=0.25,\n",
" random_state=None,\n",
"):\n",
" \"\"\"\n",
" Splits a Pandas dataframe into three subsets (train, val, and test)\n",
" following fractional ratios provided by the user, where each subset is\n",
" stratified by the values in a specific column (that is, each subset has\n",
" the same relative frequency of the values in the column). It performs this\n",
" splitting by running train_test_split() twice.\n",
"\n",
" Parameters\n",
" ----------\n",
" df_input : Pandas dataframe\n",
" Input dataframe to be split.\n",
" stratify_colname : str\n",
" The name of the column that will be used for stratification. Usually\n",
" this column would be for the label.\n",
" frac_train : float\n",
" frac_val : float\n",
" frac_test : float\n",
" The ratios with which the dataframe will be split into train, val, and\n",
" test data. The values should be expressed as float fractions and should\n",
" sum to 1.0.\n",
" random_state : int, None, or RandomStateInstance\n",
" Value to be passed to train_test_split().\n",
"\n",
" Returns\n",
" -------\n",
" df_train, df_val, df_test :\n",
" Dataframes containing the three splits.\n",
" \"\"\"\n",
"\n",
" if frac_train + frac_val + frac_test != 1.0:\n",
" raise ValueError(\n",
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
" % (frac_train, frac_val, frac_test)\n",
" )\n",
"\n",
" if stratify_colname not in df_input.columns:\n",
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
"\n",
" X = df_input # Contains all columns.\n",
" y = df_input[\n",
" [stratify_colname]\n",
" ] # Dataframe of just the column on which to stratify.\n",
"\n",
" # Split original dataframe into train and temp dataframes.\n",
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
" )\n",
"\n",
" # Split the temp dataframe into val and test dataframes.\n",
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
" df_val, df_test, y_val, y_test = train_test_split(\n",
" df_temp,\n",
" y_temp,\n",
" stratify=y_temp,\n",
" test_size=relative_frac_test,\n",
" random_state=random_state,\n",
" )\n",
"\n",
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"\n",
" return df_train, df_val, df_test\n",
"\n",
"\n",
"df = pd.read_csv('/mnt/c/3curse/mii/AIM-PIbd-31-Medvedkov-A-D/data/jio_mart_items.csv')\n",
"df.info()"
]
},
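{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below is a minimal usage sketch of the split_stratified_into_train_val_test helper defined above. It assumes that category is the stratification column and that every category occurs often enough for stratified splitting to succeed; the 60/15/25 fractions simply mirror the function's defaults."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative use of the stratified split helper (assumption: stratify on 'category',\n",
"# and every category has enough rows for a stratified split to succeed).\n",
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
"    df,\n",
"    stratify_colname=\"category\",\n",
"    frac_train=0.6,\n",
"    frac_val=0.15,\n",
"    frac_test=0.25,\n",
"    random_state=42,\n",
")\n",
"print(len(df_train), len(df_val), len(df_test))"
]
},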
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We single out the following two business goals:\n",
" 1) GameDev: creating a game about a specific character who lives in a specific time period in a specific country.\n",
" 2) Studying how life expectancy depends on the country of residence.\n",
" \n",
"Since these are the business goals chosen in the previous lab, we will keep using them.\n",
"However, there is a problem with the first goal: it cannot be used for a classification task. We replace it with classifying people into age groups, which can be useful for advertising purposes."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's prepare the data."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
2024-12-21 02:12:15 +04:00
"/tmp/ipykernel_833/3539008564.py:1: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
" df.fillna({\"category\": \"NaN\", \"sub_category\": \"NaN\", \"href\" : \"NaN\", \"items\" : \"NaN\", \"price\" : \"NaN\" }, inplace=True)\n"
2024-12-20 14:14:59 +04:00
]
}
],
"source": [
"df.fillna({\"category\": \"NaN\", \"sub_category\": \"NaN\", \"href\": \"NaN\", \"items\": \"NaN\"}, inplace=True)\n",
"df = df.dropna()  # drops the rows where price is missing, keeping price numeric\n",
"data = df.copy()\n",
"\n",
"value_counts = data[\"category\"].value_counts()\n",
"rare = value_counts[value_counts < 100].index\n",
"data = data[~data[\"category\"].isin(rare)]\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's determine the achievable level of model quality for each task. Given the available data, model quality will not be high, since life expectancy is only approximate and cannot be predicted exactly."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's choose baselines for our two tasks:\n",
" 1) Regression: the mean age of a person.\n",
" 2) Classification: the most frequently occurring age group."
]
},
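{
"cell_type": "markdown",
"metadata": {},
"source": [
"This dataset contains product prices and categories rather than ages, so as an illustration the sketch below computes the analogous baselines on the available columns: the mean price for the regression task and the most frequent category for the classification task. Treating price and category as the targets here is an assumption made for the example only."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Regression baseline (assumption: price is the numeric target):\n",
"# always predict the mean and report the resulting MAE.\n",
"mean_price = data[\"price\"].mean()\n",
"baseline_mae = (data[\"price\"] - mean_price).abs().mean()\n",
"print(f\"Mean-prediction baseline MAE: {baseline_mae:.2f}\")\n",
"\n",
"# Classification baseline (assumption: category is the class label):\n",
"# always predict the most frequent class and report its accuracy.\n",
"majority_class = data[\"category\"].value_counts().idxmax()\n",
"majority_accuracy = (data[\"category\"] == majority_class).mean()\n",
"print(f\"Majority-class baseline accuracy: {majority_accuracy:.3f}\")"
]
},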
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's build the pipeline."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['category', 'sub_category', 'href', 'items', 'price'], dtype='object')\n"
]
}
],
"source": [
"print(data.columns)"
]
},
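{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below is a minimal sketch of such a preprocessing and modelling pipeline, assuming price is the regression target and the two categorical columns are the features; the column list, the random forest model and the 75/25 split are illustrative assumptions rather than choices fixed by the notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative pipeline sketch (assumption: predict price from the categorical columns).\n",
"cat_features = [\"category\", \"sub_category\"]\n",
"\n",
"# One-hot encode the categorical features, imputing any missing values first.\n",
"preprocessor = ColumnTransformer(\n",
"    transformers=[\n",
"        (\"cat\", Pipeline(steps=[\n",
"            (\"imputer\", SimpleImputer(strategy=\"most_frequent\")),\n",
"            (\"onehot\", OneHotEncoder(handle_unknown=\"ignore\")),\n",
"        ]), cat_features),\n",
"    ]\n",
")\n",
"\n",
"model = Pipeline(steps=[\n",
"    (\"preprocessor\", preprocessor),\n",
"    (\"regressor\", RandomForestRegressor(n_estimators=50, random_state=42)),\n",
"])\n",
"\n",
"X = data[cat_features]\n",
"y = data[\"price\"]\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)\n",
"\n",
"model.fit(X_train, y_train)\n",
"y_pred = model.predict(X_test)\n",
"print(\"MAE:\", mean_absolute_error(y_test, y_pred))\n",
"print(\"R^2:\", r2_score(y_test, y_pred))"
]
},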
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}