214 lines
8.5 KiB
Plaintext
214 lines
8.5 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Вариант: Список людей. "
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 1,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"RangeIndex: 100000 entries, 0 to 99999\n",
|
|||
|
"Data columns (total 10 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 Id 100000 non-null object \n",
|
|||
|
" 1 Name 100000 non-null object \n",
|
|||
|
" 2 Short description 99923 non-null object \n",
|
|||
|
" 3 Gender 98015 non-null object \n",
|
|||
|
" 4 Country 94533 non-null object \n",
|
|||
|
" 5 Occupation 97299 non-null object \n",
|
|||
|
" 6 Birth year 100000 non-null int64 \n",
|
|||
|
" 7 Death year 99999 non-null float64\n",
|
|||
|
" 8 Manner of death 14821 non-null object \n",
|
|||
|
" 9 Age of death 99999 non-null float64\n",
|
|||
|
"dtypes: float64(2), int64(1), object(7)\n",
|
|||
|
"memory usage: 7.6+ MB\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from sklearn.preprocessing import LabelEncoder\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"from imblearn.under_sampling import RandomUnderSampler\n",
|
|||
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"import featuretools as ft\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# Функция для применения oversampling\n",
|
|||
|
"def apply_oversampling(X, y):\n",
|
|||
|
" oversampler = RandomOverSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = oversampler.fit_resample(X, y)\n",
|
|||
|
" return X_resampled, y_resampled\n",
|
|||
|
"\n",
|
|||
|
"# Функция для применения undersampling\n",
|
|||
|
"def apply_undersampling(X, y):\n",
|
|||
|
" undersampler = RandomUnderSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = undersampler.fit_resample(X, y)\n",
|
|||
|
" return X_resampled, y_resampled\n",
|
|||
|
"\n",
|
|||
|
"def split_stratified_into_train_val_test(\n",
|
|||
|
" df_input,\n",
|
|||
|
" stratify_colname=\"y\",\n",
|
|||
|
" frac_train=0.6,\n",
|
|||
|
" frac_val=0.15,\n",
|
|||
|
" frac_test=0.25,\n",
|
|||
|
" random_state=None,\n",
|
|||
|
"):\n",
|
|||
|
" \"\"\"\n",
|
|||
|
" Splits a Pandas dataframe into three subsets (train, val, and test)\n",
|
|||
|
" following fractional ratios provided by the user, where each subset is\n",
|
|||
|
" stratified by the values in a specific column (that is, each subset has\n",
|
|||
|
" the same relative frequency of the values in the column). It performs this\n",
|
|||
|
" splitting by running train_test_split() twice.\n",
|
|||
|
"\n",
|
|||
|
" Parameters\n",
|
|||
|
" ----------\n",
|
|||
|
" df_input : Pandas dataframe\n",
|
|||
|
" Input dataframe to be split.\n",
|
|||
|
" stratify_colname : str\n",
|
|||
|
" The name of the column that will be used for stratification. Usually\n",
|
|||
|
" this column would be for the label.\n",
|
|||
|
" frac_train : float\n",
|
|||
|
" frac_val : float\n",
|
|||
|
" frac_test : float\n",
|
|||
|
" The ratios with which the dataframe will be split into train, val, and\n",
|
|||
|
" test data. The values should be expressed as float fractions and should\n",
|
|||
|
" sum to 1.0.\n",
|
|||
|
" random_state : int, None, or RandomStateInstance\n",
|
|||
|
" Value to be passed to train_test_split().\n",
|
|||
|
"\n",
|
|||
|
" Returns\n",
|
|||
|
" -------\n",
|
|||
|
" df_train, df_val, df_test :\n",
|
|||
|
" Dataframes containing the three splits.\n",
|
|||
|
" \"\"\"\n",
|
|||
|
"\n",
|
|||
|
" if frac_train + frac_val + frac_test != 1.0:\n",
|
|||
|
" raise ValueError(\n",
|
|||
|
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
|
|||
|
" % (frac_train, frac_val, frac_test)\n",
|
|||
|
" )\n",
|
|||
|
"\n",
|
|||
|
" if stratify_colname not in df_input.columns:\n",
|
|||
|
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
|
|||
|
"\n",
|
|||
|
" X = df_input # Contains all columns.\n",
|
|||
|
" y = df_input[\n",
|
|||
|
" [stratify_colname]\n",
|
|||
|
" ] # Dataframe of just the column on which to stratify.\n",
|
|||
|
"\n",
|
|||
|
" # Split original dataframe into train and temp dataframes.\n",
|
|||
|
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
|
|||
|
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
|
|||
|
" )\n",
|
|||
|
"\n",
|
|||
|
" # Split the temp dataframe into val and test dataframes.\n",
|
|||
|
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
|
|||
|
" df_val, df_test, y_val, y_test = train_test_split(\n",
|
|||
|
" df_temp,\n",
|
|||
|
" y_temp,\n",
|
|||
|
" stratify=y_temp,\n",
|
|||
|
" test_size=relative_frac_test,\n",
|
|||
|
" random_state=random_state,\n",
|
|||
|
" )\n",
|
|||
|
"\n",
|
|||
|
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
|
|||
|
"\n",
|
|||
|
" return df_train, df_val, df_test\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"df = pd.read_csv(\"../data/age.csv\", nrows=100000)\n",
|
|||
|
"df.info()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Как бизнес-цели выделим следующие 2 варианта:\n",
|
|||
|
" 1) GameDev. Создание игры про конкретного персонажа, живущего в конкретном временном промежутке в конкретной стране. \n",
|
|||
|
" 2) Исследование зависимости длительности жизни от страны проживания.\n",
|
|||
|
" \n",
|
|||
|
"Поскольку именно эти бизнес-цели были выбраны в предыдущей лабораторной работе, будем их использовать.\n",
|
|||
|
"Но с первой целью возникает проблема: её невозможно использовать как задачу машинного обучения. Заменим её на следующую:\n",
|
|||
|
" Прогнозирование страны. Необходимо, не имея такого параметра, как страна, приблизительно её определить для дальнейшей рекламы."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Выполним подготовку данных"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 2,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"df.fillna({\"Gender\": \"NaN\", \"Country\": \"NaN\", \"Occupation\" : \"NaN\", \"Manner of death\" : \"NaN\"}, inplace=True)\n",
|
|||
|
"df = df.dropna()\n",
|
|||
|
"df['Country'] = df['Country'].str.split('; ')\n",
|
|||
|
"df = df.explode('Country')\n",
|
|||
|
"data = df.copy()\n",
|
|||
|
"\n",
|
|||
|
"value_counts = data[\"Country\"].value_counts()\n",
|
|||
|
"rare = value_counts[value_counts < 50].index\n",
|
|||
|
"data = data[~data[\"Country\"].isin(rare)]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Определим достижимый уровень качества модели для каждой задачи. На имеющихся данных качество моделей не будет высоким: длительность жизни всё-таки можно предсказать лишь приблизительно, точно угадать её невозможно. Угадывание страны также является трудной задачей, поскольку данные людей, живущих в разных странах, могут совпадать."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"data['Age_Category'] = pd.cut(data['Age'], bins=[0, 29, 59, float('inf')], labels=[\"young\", \"middle-aged\", \"old\"])"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "aimvenv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.5"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|