diff --git a/lab_4/lab_4.ipynb b/lab_4/lab_4.ipynb new file mode 100644 index 0000000..b1cf291 --- /dev/null +++ b/lab_4/lab_4.ipynb @@ -0,0 +1,213 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Вариант: Список людей. " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 100000 entries, 0 to 99999\n", + "Data columns (total 10 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Id 100000 non-null object \n", + " 1 Name 100000 non-null object \n", + " 2 Short description 99923 non-null object \n", + " 3 Gender 98015 non-null object \n", + " 4 Country 94533 non-null object \n", + " 5 Occupation 97299 non-null object \n", + " 6 Birth year 100000 non-null int64 \n", + " 7 Death year 99999 non-null float64\n", + " 8 Manner of death 14821 non-null object \n", + " 9 Age of death 99999 non-null float64\n", + "dtypes: float64(2), int64(1), object(7)\n", + "memory usage: 7.6+ MB\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from imblearn.over_sampling import RandomOverSampler\n", + "from imblearn.under_sampling import RandomUnderSampler\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "import numpy as np\n", + "import featuretools as ft\n", + "\n", + "\n", + "# Функция для применения oversampling\n", + "def apply_oversampling(X, y):\n", + " oversampler = RandomOverSampler(random_state=42)\n", + " X_resampled, y_resampled = oversampler.fit_resample(X, y)\n", + " return X_resampled, y_resampled\n", + "\n", + "# Функция для применения undersampling\n", + "def apply_undersampling(X, y):\n", + " undersampler = RandomUnderSampler(random_state=42)\n", + " X_resampled, y_resampled = undersampler.fit_resample(X, y)\n", + " return X_resampled, y_resampled\n", + "\n", + "def split_stratified_into_train_val_test(\n", + " df_input,\n", + " stratify_colname=\"y\",\n", + " frac_train=0.6,\n", + " frac_val=0.15,\n", + " frac_test=0.25,\n", + " random_state=None,\n", + "):\n", + " \"\"\"\n", + " Splits a Pandas dataframe into three subsets (train, val, and test)\n", + " following fractional ratios provided by the user, where each subset is\n", + " stratified by the values in a specific column (that is, each subset has\n", + " the same relative frequency of the values in the column). It performs this\n", + " splitting by running train_test_split() twice.\n", + "\n", + " Parameters\n", + " ----------\n", + " df_input : Pandas dataframe\n", + " Input dataframe to be split.\n", + " stratify_colname : str\n", + " The name of the column that will be used for stratification. Usually\n", + " this column would be for the label.\n", + " frac_train : float\n", + " frac_val : float\n", + " frac_test : float\n", + " The ratios with which the dataframe will be split into train, val, and\n", + " test data. The values should be expressed as float fractions and should\n", + " sum to 1.0.\n", + " random_state : int, None, or RandomStateInstance\n", + " Value to be passed to train_test_split().\n", + "\n", + " Returns\n", + " -------\n", + " df_train, df_val, df_test :\n", + " Dataframes containing the three splits.\n", + " \"\"\"\n", + "\n", + " if frac_train + frac_val + frac_test != 1.0:\n", + " raise ValueError(\n", + " \"fractions %f, %f, %f do not add up to 1.0\"\n", + " % (frac_train, frac_val, frac_test)\n", + " )\n", + "\n", + " if stratify_colname not in df_input.columns:\n", + " raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n", + "\n", + " X = df_input # Contains all columns.\n", + " y = df_input[\n", + " [stratify_colname]\n", + " ] # Dataframe of just the column on which to stratify.\n", + "\n", + " # Split original dataframe into train and temp dataframes.\n", + " df_train, df_temp, y_train, y_temp = train_test_split(\n", + " X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n", + " )\n", + "\n", + " # Split the temp dataframe into val and test dataframes.\n", + " relative_frac_test = frac_test / (frac_val + frac_test)\n", + " df_val, df_test, y_val, y_test = train_test_split(\n", + " df_temp,\n", + " y_temp,\n", + " stratify=y_temp,\n", + " test_size=relative_frac_test,\n", + " random_state=random_state,\n", + " )\n", + "\n", + " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n", + "\n", + " return df_train, df_val, df_test\n", + "\n", + "\n", + "df = pd.read_csv(\"../data/age.csv\", nrows=100000)\n", + "df.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Как бизнес-цели выделим следующие 2 варианта:\n", + " 1) GameDev. Создание игры про конкретного персонажа, живущего в конкретном временном промежутке в конкретной стране. \n", + " 2) Исследование зависимости длительности жизни от страны проживания.\n", + " \n", + "Поскольку именно эти бизнес-цели были выбраны в предыдущей лабораторной работе, будем их использовать.\n", + "Но возникает проблема с 1 целью: её невозможно использовать для машинного обучения. Заменим ее на следующую:\n", + " Прогнозирование страны. Необходимо не имея такой параметр как страна примерно ее угадать для дальнейшей рекламы." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Выполним подготовку данных" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "df.fillna({\"Gender\": \"NaN\", \"Country\": \"NaN\", \"Occupation\" : \"NaN\", \"Manner of death\" : \"NaN\"}, inplace=True)\n", + "df = df.dropna()\n", + "df['Country'] = df['Country'].str.split('; ')\n", + "df = df.explode('Country')\n", + "data = df.copy()\n", + "\n", + "value_counts = data[\"Country\"].value_counts()\n", + "rare = value_counts[value_counts < 50].index\n", + "data = data[~data[\"Country\"].isin(rare)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Определить достижимый уровень качества модели для каждой задачи. На основе имеющихся данных уровень качества моделей не будет высоким, поскольку все таки длительность жизни лишь примерная и точно ее угадать невозможно. А угадывание страны является трудной задачей, поскольку данные между людьми, живущими в разных странах, могут совпадать между собой." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data['Age_Category'] = pd.cut(data['Age'], bins=[0, 29, 59, float('inf')], labels=[\"young\", \"middle-aged\", \"old\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "aimvenv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}