From a006a5f4523198b04980624011adbc30ad996d77 Mon Sep 17 00:00:00 2001 From: Serxiolog Date: Fri, 8 Nov 2024 15:59:46 +0400 Subject: [PATCH] Lab_3 --- lab_3/lab_3.ipynb | 1300 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1300 insertions(+) create mode 100644 lab_3/lab_3.ipynb diff --git a/lab_3/lab_3.ipynb b/lab_3/lab_3.ipynb new file mode 100644 index 0000000..6c083fc --- /dev/null +++ b/lab_3/lab_3.ipynb @@ -0,0 +1,1300 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Вариант: Список людей. " + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 100000 entries, 0 to 99999\n", + "Data columns (total 10 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Id 100000 non-null object \n", + " 1 Name 100000 non-null object \n", + " 2 Short description 99923 non-null object \n", + " 3 Gender 98015 non-null object \n", + " 4 Country 94533 non-null object \n", + " 5 Occupation 97299 non-null object \n", + " 6 Birth year 100000 non-null int64 \n", + " 7 Death year 99999 non-null float64\n", + " 8 Manner of death 14821 non-null object \n", + " 9 Age of death 99999 non-null float64\n", + "dtypes: float64(2), int64(1), object(7)\n", + "memory usage: 7.6+ MB\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from imblearn.over_sampling import RandomOverSampler\n", + "from imblearn.under_sampling import RandomUnderSampler\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "import numpy as np\n", + "import featuretools as ft\n", + "\n", + "\n", + "# Функция для применения oversampling\n", + "def apply_oversampling(X, y):\n", + " oversampler = RandomOverSampler(random_state=42)\n", + " 
X_resampled, y_resampled = oversampler.fit_resample(X, y)\n", + " return X_resampled, y_resampled\n", + "\n", + "# Функция для применения undersampling\n", + "def apply_undersampling(X, y):\n", + " undersampler = RandomUnderSampler(random_state=42)\n", + " X_resampled, y_resampled = undersampler.fit_resample(X, y)\n", + " return X_resampled, y_resampled\n", + "\n", + "def split_stratified_into_train_val_test(\n", + " df_input,\n", + " stratify_colname=\"y\",\n", + " frac_train=0.6,\n", + " frac_val=0.15,\n", + " frac_test=0.25,\n", + " random_state=None,\n", + "):\n", + " \"\"\"\n", + " Splits a Pandas dataframe into three subsets (train, val, and test)\n", + " following fractional ratios provided by the user, where each subset is\n", + " stratified by the values in a specific column (that is, each subset has\n", + " the same relative frequency of the values in the column). It performs this\n", + " splitting by running train_test_split() twice.\n", + "\n", + " Parameters\n", + " ----------\n", + " df_input : Pandas dataframe\n", + " Input dataframe to be split.\n", + " stratify_colname : str\n", + " The name of the column that will be used for stratification. Usually\n", + " this column would be for the label.\n", + " frac_train : float\n", + " frac_val : float\n", + " frac_test : float\n", + " The ratios with which the dataframe will be split into train, val, and\n", + " test data. 
The values should be expressed as float fractions and should\n", + " sum to 1.0.\n", + " random_state : int, None, or RandomStateInstance\n", + " Value to be passed to train_test_split().\n", + "\n", + " Returns\n", + " -------\n", + " df_train, df_val, df_test :\n", + " Dataframes containing the three splits.\n", + " \"\"\"\n", + "\n", + " if frac_train + frac_val + frac_test != 1.0:\n", + " raise ValueError(\n", + " \"fractions %f, %f, %f do not add up to 1.0\"\n", + " % (frac_train, frac_val, frac_test)\n", + " )\n", + "\n", + " if stratify_colname not in df_input.columns:\n", + " raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n", + "\n", + " X = df_input # Contains all columns.\n", + " y = df_input[\n", + " [stratify_colname]\n", + " ] # Dataframe of just the column on which to stratify.\n", + "\n", + " # Split original dataframe into train and temp dataframes.\n", + " df_train, df_temp, y_train, y_temp = train_test_split(\n", + " X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n", + " )\n", + "\n", + " # Split the temp dataframe into val and test dataframes.\n", + " relative_frac_test = frac_test / (frac_val + frac_test)\n", + " df_val, df_test, y_val, y_test = train_test_split(\n", + " df_temp,\n", + " y_temp,\n", + " stratify=y_temp,\n", + " test_size=relative_frac_test,\n", + " random_state=random_state,\n", + " )\n", + "\n", + " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n", + "\n", + " return df_train, df_val, df_test\n", + "\n", + "\n", + "df = pd.read_csv(\"../data/age.csv\", nrows=100000)\n", + "df.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Такую информацию могут использовать компании связанные с историей/культурой, с GameDev-ом, с созданием кинематографа. Реальные имена могут сделать тот же фильм более историчным. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Как бизнес-цели выделим следующие 2 варианта:\n", + " 1) GameDev. Создание игры про конкретного персонажа, живущего в конкретном временном промежутке в конкретной стране. \n", + " 2) Исследование зависимости длительности жизни от страны проживания.\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Поскольку данные не полные, их необходимо заполнить стандартными значениями:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Id 0\n", + "Name 0\n", + "Short description 77\n", + "Gender 1985\n", + "Country 5467\n", + "Occupation 2701\n", + "Birth year 0\n", + "Death year 1\n", + "Manner of death 85179\n", + "Age of death 1\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "print(df.isnull().sum())" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 99922 entries, 0 to 99999\n", + "Data columns (total 10 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Id 99922 non-null object \n", + " 1 Name 99922 non-null object \n", + " 2 Short description 99922 non-null object \n", + " 3 Gender 99922 non-null object \n", + " 4 Country 99922 non-null object \n", + " 5 Occupation 99922 non-null object \n", + " 6 Birth year 99922 non-null int64 \n", + " 7 Death year 99922 non-null float64\n", + " 8 Manner of death 99922 non-null object \n", + " 9 Age of death 99922 non-null float64\n", + "dtypes: float64(2), int64(1), object(7)\n", + "memory usage: 8.4+ MB\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdNameShort descriptionGenderCountryOccupationBirth yearDeath yearManner of deathAge of death
99995Q729652Jacques-Joseph MoreauFrench psychiatristMaleFrancePsychiatrist; psychologist18041884.0NaN80.0
99996Q729661Jerome WiesnerAmerican academic engineerMaleUnited States of AmericaResearcher19151994.0NaN79.0
99997Q729662Westmoreland DavisAmerican politician (1859-1942)MaleUnited States of AmericaPolitician18591942.0NaN83.0
99998Q729674John NeedhamEnglish biologist and Roman Catholic priestMaleEnglandReligious figure17131810.0NaN97.0
99999Q729679Francis BourneCatholic cardinalMaleUnited KingdomReligious figure18611934.0NaN73.0
\n", + "
" + ], + "text/plain": [ + " Id Name \\\n", + "99995 Q729652 Jacques-Joseph Moreau \n", + "99996 Q729661 Jerome Wiesner \n", + "99997 Q729662 Westmoreland Davis \n", + "99998 Q729674 John Needham \n", + "99999 Q729679 Francis Bourne \n", + "\n", + " Short description Gender \\\n", + "99995 French psychiatrist Male \n", + "99996 American academic engineer Male \n", + "99997 American politician (1859-1942) Male \n", + "99998 English biologist and Roman Catholic priest Male \n", + "99999 Catholic cardinal Male \n", + "\n", + " Country Occupation Birth year \\\n", + "99995 France Psychiatrist; psychologist 1804 \n", + "99996 United States of America Researcher 1915 \n", + "99997 United States of America Politician 1859 \n", + "99998 England Religious figure 1713 \n", + "99999 United Kingdom Religious figure 1861 \n", + "\n", + " Death year Manner of death Age of death \n", + "99995 1884.0 NaN 80.0 \n", + "99996 1994.0 NaN 79.0 \n", + "99997 1942.0 NaN 83.0 \n", + "99998 1810.0 NaN 97.0 \n", + "99999 1934.0 NaN 73.0 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.fillna({\"Gender\": \"NaN\", \"Country\": \"NaN\", \"Occupation\" : \"NaN\", \"Manner of death\" : \"NaN\"}, inplace=True)\n", + "df = df.dropna()\n", + "df.info()\n", + "df.tail()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Данные приращены, удалены только те строки, в которых не было даты смерти или короткого описания" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.plot.hist(column=[\"Birth year\"], xlim=(1000, 2000), bins=4000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Помимо этого обработаем колонку страны таким образом, что каждый человек, который жил не в одной стране, будет занимать более одной строки, в соответствии с количеством стран в которых он жил." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 116555 entries, 0 to 99999\n", + "Data columns (total 10 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Id 116555 non-null object \n", + " 1 Name 116555 non-null object \n", + " 2 Short description 116555 non-null object \n", + " 3 Gender 116555 non-null object \n", + " 4 Country 116555 non-null object \n", + " 5 Occupation 116555 non-null object \n", + " 6 Birth year 116555 non-null int64 \n", + " 7 Death year 116555 non-null float64\n", + " 8 Manner of death 116555 non-null object \n", + " 9 Age of death 116555 non-null float64\n", + "dtypes: float64(2), int64(1), object(7)\n", + "memory usage: 9.8+ MB\n" + ] + } + ], + "source": [ + "df['Country'] = df['Country'].str.split('; ')\n", + "df = df.explode('Country')\n", + "df.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Далее выполним разбиение на обучающую, контрольную и тестовую выборки." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "121\n", + "Обучающая выборка: (67038, 10)\n", + "Country\n", + "Germany 15128\n", + "United States of America 8946\n", + "France 4715\n", + "NaN 3248\n", + "United Kingdom 2796\n", + " ... 
\n", + "Song dynasty 32\n", + "Paraguay 31\n", + "Kingdom of Sardinia 31\n", + "Confederation of the Rhine 30\n", + "Kingdom of Saxony 30\n", + "Name: count, Length: 121, dtype: int64\n", + "Контрольная выборка: (22346, 10)\n", + "Country\n", + "Germany 5043\n", + "United States of America 2982\n", + "France 1572\n", + "NaN 1082\n", + "United Kingdom 932\n", + " ... \n", + "Vietnam 11\n", + "Paraguay 10\n", + "Kingdom of Saxony 10\n", + "Confederation of the Rhine 10\n", + "Kingdom of Sardinia 10\n", + "Name: count, Length: 121, dtype: int64\n", + "Тестовая выборка: (22347, 10)\n", + "Country\n", + "Germany 5043\n", + "United States of America 2982\n", + "France 1572\n", + "NaN 1083\n", + "United Kingdom 933\n", + " ... \n", + "England 11\n", + "Confederation of the Rhine 10\n", + "Paraguay 10\n", + "Kingdom of Sardinia 10\n", + "Kingdom of Saxony 10\n", + "Name: count, Length: 121, dtype: int64\n" + ] + } + ], + "source": [ + "data = df.copy()\n", + "\n", + "value_counts = data[\"Country\"].value_counts()\n", + "rare = value_counts[value_counts < 50].index\n", + "data = data[~data[\"Country\"].isin(rare)]\n", + "\n", + "print(len(data[\"Country\"].unique()))\n", + "\n", + " \n", + "df_train, df_val, df_test = split_stratified_into_train_val_test(\n", + " data, stratify_colname=\"Country\", frac_train=0.60, frac_val=0.20, frac_test=0.20)\n", + "\n", + "print(\"Обучающая выборка: \", df_train.shape)\n", + "print(df_train[\"Country\"].value_counts())\n", + "\n", + "print(\"Контрольная выборка: \", df_val.shape)\n", + "print(df_val[\"Country\"].value_counts())\n", + "\n", + "print(\"Тестовая выборка: \", df_test.shape)\n", + "print(df_test[\"Country\"].value_counts())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В данных были удалены строки, у которых были \"редкие\" страны. 
Данные наращивать не будем, поскольку в этом нет необходимости.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Выполним конструирование признаков. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Начнем с унитарного кодирования категориальных признаков. Для этого подходит столбец «Country» (страна)." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Country_AlbaniaCountry_ArgentinaCountry_AustraliaCountry_AustriaCountry_Austria-HungaryCountry_Austrian EmpireCountry_BelgiumCountry_BoliviaCountry_BrazilCountry_British Raj...Country_United Kingdom of Great Britain and IrelandCountry_United States of AmericaCountry_UruguayCountry_VenezuelaCountry_VietnamCountry_WalesCountry_Weimar RepublicCountry_West GermanyCountry_YugoslaviaCountry_ancient Rome
00.00.00.00.00.00.00.00.00.00.0...0.01.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
30.00.00.00.00.00.00.00.00.00.0...0.01.00.00.00.00.00.00.00.00.0
40.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
..................................................................
1117260.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
1117270.00.00.00.00.00.00.00.00.00.0...0.01.00.00.00.00.00.00.00.00.0
1117280.00.00.00.00.00.00.00.00.00.0...0.01.00.00.00.00.00.00.00.00.0
1117290.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
1117300.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
\n", + "

111731 rows × 120 columns

\n", + "
" + ], + "text/plain": [ + " Country_Albania Country_Argentina Country_Australia \\\n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "111726 0.0 0.0 0.0 \n", + "111727 0.0 0.0 0.0 \n", + "111728 0.0 0.0 0.0 \n", + "111729 0.0 0.0 0.0 \n", + "111730 0.0 0.0 0.0 \n", + "\n", + " Country_Austria Country_Austria-Hungary Country_Austrian Empire \\\n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "111726 0.0 0.0 0.0 \n", + "111727 0.0 0.0 0.0 \n", + "111728 0.0 0.0 0.0 \n", + "111729 0.0 0.0 0.0 \n", + "111730 0.0 0.0 0.0 \n", + "\n", + " Country_Belgium Country_Bolivia Country_Brazil Country_British Raj \\\n", + "0 0.0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 0.0 \n", + "... ... ... ... ... \n", + "111726 0.0 0.0 0.0 0.0 \n", + "111727 0.0 0.0 0.0 0.0 \n", + "111728 0.0 0.0 0.0 0.0 \n", + "111729 0.0 0.0 0.0 0.0 \n", + "111730 0.0 0.0 0.0 0.0 \n", + "\n", + " ... Country_United Kingdom of Great Britain and Ireland \\\n", + "0 ... 0.0 \n", + "1 ... 0.0 \n", + "2 ... 0.0 \n", + "3 ... 0.0 \n", + "4 ... 0.0 \n", + "... ... ... \n", + "111726 ... 0.0 \n", + "111727 ... 0.0 \n", + "111728 ... 0.0 \n", + "111729 ... 0.0 \n", + "111730 ... 0.0 \n", + "\n", + " Country_United States of America Country_Uruguay Country_Venezuela \\\n", + "0 1.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 \n", + "3 1.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "111726 0.0 0.0 0.0 \n", + "111727 1.0 0.0 0.0 \n", + "111728 1.0 0.0 0.0 \n", + "111729 0.0 0.0 0.0 \n", + "111730 0.0 0.0 0.0 \n", + "\n", + " Country_Vietnam Country_Wales Country_Weimar Republic \\\n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 \n", + "... ... ... ... 
\n", + "111726 0.0 0.0 0.0 \n", + "111727 0.0 0.0 0.0 \n", + "111728 0.0 0.0 0.0 \n", + "111729 0.0 0.0 0.0 \n", + "111730 0.0 0.0 0.0 \n", + "\n", + " Country_West Germany Country_Yugoslavia Country_ancient Rome \n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "111726 0.0 0.0 0.0 \n", + "111727 0.0 0.0 0.0 \n", + "111728 0.0 0.0 0.0 \n", + "111729 0.0 0.0 0.0 \n", + "111730 0.0 0.0 0.0 \n", + "\n", + "[111731 rows x 120 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n", + "\n", + "encoded_values = encoder.fit_transform(data[[\"Country\"]])\n", + "\n", + "encoded_columns = encoder.get_feature_names_out([\"Country\"])\n", + "\n", + "encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n", + "\n", + "encoded_values_df\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Далее выполним дискретизацию числовых признаков" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Age of deathAge of death
067.0middle-aged
067.0middle-aged
149.0middle-aged
256.0middle-aged
457.0middle-aged
457.0middle-aged
542.0middle-aged
688.0old
786.0old
861.0middle-aged
973.0middle-aged
973.0middle-aged
1042.0middle-aged
1298.0old
1356.0middle-aged
1456.0middle-aged
1456.0middle-aged
1456.0middle-aged
1663.0middle-aged
1791.0old
\n", + "
" + ], + "text/plain": [ + " Age of death Age of death\n", + "0 67.0 middle-aged\n", + "0 67.0 middle-aged\n", + "1 49.0 middle-aged\n", + "2 56.0 middle-aged\n", + "4 57.0 middle-aged\n", + "4 57.0 middle-aged\n", + "5 42.0 middle-aged\n", + "6 88.0 old\n", + "7 86.0 old\n", + "8 61.0 middle-aged\n", + "9 73.0 middle-aged\n", + "9 73.0 middle-aged\n", + "10 42.0 middle-aged\n", + "12 98.0 old\n", + "13 56.0 middle-aged\n", + "14 56.0 middle-aged\n", + "14 56.0 middle-aged\n", + "14 56.0 middle-aged\n", + "16 63.0 middle-aged\n", + "17 91.0 old" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labels = [\"young\", \"middle-aged\", \"old\"]\n", + "num_bins = 3\n", + "hist1, bins1 = np.histogram(data[\"Age of death\"].fillna(data[\"Age of death\"].median()), bins=num_bins)\n", + "pd.concat([data[\"Age of death\"], pd.cut(data[\"Age of death\"], list(bins1), labels=labels)], axis=1).head(20)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Выполнить «ручной» синтез признаков в рамках данного набора данных не является возможным." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Масштабирование признаков на основе нормировки и стандартизации в рамках данного набора данных не является необходимым." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Выполним конструирование признаков с применением фреймворка Featuretools. " + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Built 7 features\n", + "Elapsed: 00:00 | Progress: 100%|██████████\n", + " Gender Country Occupation Birth year Death year \\\n", + "Id \n", + "Q23 Male United States of America Politician 1732 1799.0 \n", + "Q42 Male United Kingdom Artist 1952 2001.0 \n", + "Q91 Male United States of America Politician 1809 1865.0 \n", + "Q255 Male Holy Roman Empire Artist 1770 1827.0 \n", + "Q260 Male Kingdom of France Egyptologist 1790 1832.0 \n", + "\n", + " Manner of death Age of death \n", + "Id \n", + "Q23 natural causes 67.0 \n", + "Q42 natural causes 49.0 \n", + "Q91 homicide 56.0 \n", + "Q255 NaN 57.0 \n", + "Q260 natural causes 42.0 \n" + ] + } + ], + "source": [ + "data1 = data.drop_duplicates(subset=\"Id\", keep=\"first\")\n", + "\n", + "df_train = pd.DataFrame(data1)\n", + "\n", + "# Создание EntitySet\n", + "es = ft.EntitySet(id='death_data')\n", + "\n", + "# Добавление DataFrame в EntitySet\n", + "es = es.add_dataframe(\n", + " dataframe_name='deaths',\n", + " dataframe=df_train,\n", + " index='Id',\n", + " make_index=False\n", + ")\n", + "\n", + "# Определение примитивов (операций) для конструирования признаков\n", + "feature_matrix, feature_defs = ft.dfs(\n", + " entityset=es,\n", + " target_dataframe_name='deaths',\n", + " max_depth=2,\n", + " verbose=1,\n", + " n_jobs=1\n", + ")\n", + "\n", + "# Вывод сгенерированных признаков\n", + "print(feature_matrix.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Все наборы признаков имеют плохую предсказательную способность, высокую 
скорость вычисления, низкую надежность, корреляцию и цельность. Они не являются информативными, как и сам набор данных." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "aimvenv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}