# Variant: Economics of countries

import featuretools as ft
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()


def apply_oversampling(X, y):
    """Resample ``(X, y)`` with imblearn's ``RandomOverSampler``.

    The sampler is created with ``random_state=42`` so repeated runs are
    reproducible.

    Returns
    -------
    X_resampled, y_resampled :
        Whatever ``RandomOverSampler.fit_resample(X, y)`` returns.
    """
    oversampler = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X, y)
    return X_resampled, y_resampled


def apply_undersampling(X, y):
    """Resample ``(X, y)`` with imblearn's ``RandomUnderSampler``.

    The sampler is created with ``random_state=42`` so repeated runs are
    reproducible.

    Returns
    -------
    X_resampled, y_resampled :
        Whatever ``RandomUnderSampler.fit_resample(X, y)`` returns.
    """
    undersampler = RandomUnderSampler(random_state=42)
    X_resampled, y_resampled = undersampler.fit_resample(X, y)
    return X_resampled, y_resampled


def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
):
    """
    Splits a Pandas dataframe into three subsets (train, val, and test)
    following fractional ratios provided by the user, where each subset is
    stratified by the values in a specific column (that is, each subset has
    the same relative frequency of the values in the column). It performs this
    splitting by running train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val : float
    frac_test : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as positive float fractions
        and should sum to 1.0 (compared with a floating-point tolerance).
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits.

    Raises
    ------
    ValueError
        If any fraction is not positive, if the fractions do not add up to
        1.0, or if ``stratify_colname`` is not a column of ``df_input``.
    """

    # A zero fraction would later surface as a ZeroDivisionError or as an
    # opaque error from train_test_split(); report it up front instead.
    if min(frac_train, frac_val, frac_test) <= 0:
        raise ValueError(
            "fractions %f, %f, %f must all be positive"
            % (frac_train, frac_val, frac_test)
        )

    # Exact `!= 1.0` rejects valid inputs such as 0.7 + 0.2 + 0.1, whose
    # binary floating-point sum is 0.9999999999999999.
    if not np.isclose(frac_train + frac_val + frac_test, 1.0):
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    # Every input row must land in exactly one of the three subsets.
    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test


data = pd.read_csv("../data/Economic.csv")
data.info()

# Business goals
# 1. Forecast GDP per capita for each of the 9 countries for the next 5 years.
#    This lets businesses and governments make informed decisions on economic
#    policy and investment.
# 2. Identify the factors that most strongly affect GDP per capita. This helps
#    to find the areas that need special attention to stimulate economic growth.
#
# Technical goals
# 1. Build a machine-learning model able to forecast GDP per capita with high
#    accuracy from historical data.
# 2. Analyse the correlation between the economic indicators and GDP per
#    capita and single out the most significant factors.
#
# Data augmentation: count the missing values of every column first.
missing_per_column = data.isna().sum()
print(missing_per_column)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stock indexcountryyearindex pricelog_indexpriceinflationrateoil pricesexchange_rategdppercentpercapitaincomeunemploymentratemanufacturingoutputtradebalanceUSTreasury
0NASDAQUnited States of America1980.0168.612.230.1421.591.000.0912575.00.071.00-13.060.11
1NASDAQUnited States of America1981.0203.152.310.1031.771.000.1213976.00.081.00-12.520.14
2NASDAQUnited States of America1982.0188.982.280.0628.521.000.0414434.00.101.00-19.970.13
3NASDAQUnited States of America1983.0285.432.460.0326.191.000.0915544.00.101.00-51.640.11
4NASDAQUnited States of America1984.0248.892.400.0425.881.000.1117121.00.081.00-102.730.12
.............................................
364IEX 35Spain2016.09352.103.970.0051.971.110.0326523.00.20139.0149.160.02
365IEX 35Spain2017.010043.904.000.0257.881.130.0328170.00.17148.8047.330.02
366IEX 35Spain2018.08539.903.930.0249.521.180.0230389.00.15158.3338.700.03
367IEX 35Spain2019.09549.203.980.0159.881.120.0229565.00.14155.4941.940.02
368IEX 35Spain2020.08073.703.910.0047.021.14-0.1127057.00.16143.0519.100.01
\n", + "

369 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " stock index country year index price \\\n", + "0 NASDAQ United States of America 1980.0 168.61 \n", + "1 NASDAQ United States of America 1981.0 203.15 \n", + "2 NASDAQ United States of America 1982.0 188.98 \n", + "3 NASDAQ United States of America 1983.0 285.43 \n", + "4 NASDAQ United States of America 1984.0 248.89 \n", + ".. ... ... ... ... \n", + "364 IEX 35 Spain 2016.0 9352.10 \n", + "365 IEX 35 Spain 2017.0 10043.90 \n", + "366 IEX 35 Spain 2018.0 8539.90 \n", + "367 IEX 35 Spain 2019.0 9549.20 \n", + "368 IEX 35 Spain 2020.0 8073.70 \n", + "\n", + " log_indexprice inflationrate oil prices exchange_rate gdppercent \\\n", + "0 2.23 0.14 21.59 1.00 0.09 \n", + "1 2.31 0.10 31.77 1.00 0.12 \n", + "2 2.28 0.06 28.52 1.00 0.04 \n", + "3 2.46 0.03 26.19 1.00 0.09 \n", + "4 2.40 0.04 25.88 1.00 0.11 \n", + ".. ... ... ... ... ... \n", + "364 3.97 0.00 51.97 1.11 0.03 \n", + "365 4.00 0.02 57.88 1.13 0.03 \n", + "366 3.93 0.02 49.52 1.18 0.02 \n", + "367 3.98 0.01 59.88 1.12 0.02 \n", + "368 3.91 0.00 47.02 1.14 -0.11 \n", + "\n", + " percapitaincome unemploymentrate manufacturingoutput tradebalance \\\n", + "0 12575.0 0.07 1.00 -13.06 \n", + "1 13976.0 0.08 1.00 -12.52 \n", + "2 14434.0 0.10 1.00 -19.97 \n", + "3 15544.0 0.10 1.00 -51.64 \n", + "4 17121.0 0.08 1.00 -102.73 \n", + ".. ... ... ... ... \n", + "364 26523.0 0.20 139.01 49.16 \n", + "365 28170.0 0.17 148.80 47.33 \n", + "366 30389.0 0.15 158.33 38.70 \n", + "367 29565.0 0.14 155.49 41.94 \n", + "368 27057.0 0.16 143.05 19.10 \n", + "\n", + " USTreasury \n", + "0 0.11 \n", + "1 0.14 \n", + "2 0.13 \n", + "3 0.11 \n", + "4 0.12 \n", + ".. ... 
# Replacement values for the gaps, keyed by the exact column names reported by
# data.info() ("index price" contains a space, the GDP column is "gdppercent").
# NOTE(review): "exchange_rate" also has missing values and is not filled here,
# as in the original cell - confirm that this is intended.
FILL_VALUES = {
    "index price": 1,
    "inflationrate": 0,
    "gdppercent": 0,
    "percapitaincome": 100,
    "unemploymentrate": 0,
    "manufacturingoutput": 1,
    "tradebalance": -350,
}

# DataFrame.fillna returns a new frame; without the assignment the filled copy
# is only displayed and `data` keeps all of its NaN values.
data = data.fillna(FILL_VALUES)
assert not data[list(FILL_VALUES)].isna().any().any()

# Splitting the data into train / validation / test subsets and checking how
# balanced the subsets are with respect to the country.
main_data = data.copy()

value_counts = data["country"].value_counts()

# A fixed random_state makes the split reproducible on Restart & Run All.
df_train, df_val, df_test = split_stratified_into_train_val_test(
    data,
    stratify_colname="country",
    frac_train=0.60,
    frac_val=0.20,
    frac_test=0.20,
    random_state=42,
)

for title, subset in (
    ("Обучающая выборка: ", df_train),
    ("Контрольная выборка: ", df_val),
    ("Тестовая выборка: ", df_test),
):
    print(title, subset.shape)
    print(subset["country"].value_counts())
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stock indexyearindex pricelog_indexpriceinflationrateoil pricesexchange_rategdppercentpercapitaincomeunemploymentrate...USTreasurycountry_Chinacountry_Francecountry_Germanycountry_Hong Kongcountry_Indiacountry_Japancountry_Spaincountry_United Kingdomcountry_United States of America
0NASDAQ1980.0168.612.230.1421.591.00.0912575.00.07...0.11FalseFalseFalseFalseFalseFalseFalseFalseTrue
1NASDAQ1981.0203.152.310.1031.771.00.1213976.00.08...0.14FalseFalseFalseFalseFalseFalseFalseFalseTrue
2NASDAQ1982.0188.982.280.0628.521.00.0414434.00.10...0.13FalseFalseFalseFalseFalseFalseFalseFalseTrue
3NASDAQ1983.0285.432.460.0326.191.00.0915544.00.10...0.11FalseFalseFalseFalseFalseFalseFalseFalseTrue
4NASDAQ1984.0248.892.400.0425.881.00.1117121.00.08...0.12FalseFalseFalseFalseFalseFalseFalseFalseTrue
\n", + "

5 rows × 22 columns

# Feature engineering
# 1. One-hot encoding of categorical features
# Example: every country becomes its own boolean "country_<name>" column.
categorical_columns = ["country"]
data = pd.get_dummies(data, columns=categorical_columns, prefix="country")
data.head()
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stock indexyearindex pricelog_indexpriceinflationrateoil pricesexchange_rategdppercentpercapitaincomeunemploymentrate...USTreasurycountry_Chinacountry_Francecountry_Germanycountry_Hong Kongcountry_Indiacountry_Japancountry_Spaincountry_United Kingdomcountry_United States of America
364IEX 352016.09352.13.97NaN51.971.110.0326523.00.20...0.02FalseFalseFalseFalseFalseFalseTrueFalseFalse
365IEX 352017.010043.94.000.0257.881.130.0328170.00.17...0.02FalseFalseFalseFalseFalseFalseTrueFalseFalse
366IEX 352018.08539.93.930.0249.521.180.0230389.00.15...0.03FalseFalseFalseFalseFalseFalseTrueFalseFalse
367IEX 352019.09549.23.980.0159.881.120.0229565.00.14...0.02FalseFalseFalseFalseFalseFalseTrueFalseFalse
368IEX 352020.08073.73.91NaN47.021.14-0.1127057.00.16...0.01FalseFalseFalseFalseFalseFalseTrueFalseFalse
\n", + "

5 rows × 22 columns

# Look at the last rows of the one-hot encoded frame.
data.tail(n=5)
# 2. Discretization of numeric features
# Example: bucket the oil price (not the year) into three ordered categories.
# pd.cut uses right-closed intervals here: (0, 40], (40, 70], (70, inf).
# The names avoid shadowing the builtin `bin`.
OIL_PRICE_BINS = [0, 40, 70, float("inf")]
OIL_PRICE_LABELS = ["cheap", "normal", "rich"]  # "rich" is the most expensive bucket

data["oil_price_category"] = pd.cut(
    data["oil prices"], bins=OIL_PRICE_BINS, labels=OIL_PRICE_LABELS
)
print(data[["oil prices", "oil_price_category"]].head(30))
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stock indexyearindex pricelog_indexpriceinflationrateoil pricesexchange_rategdppercentpercapitaincomeunemploymentrate...country_Francecountry_Germanycountry_Hong Kongcountry_Indiacountry_Japancountry_Spaincountry_United Kingdomcountry_United States of Americaoil_price_categoryEconomic_Growth
0NASDAQ1980.0168.612.230.1421.591.00.0912575.00.07...FalseFalseFalseFalseFalseFalseFalseTruecheapNaN
1NASDAQ1981.0203.152.310.1031.771.00.1213976.00.08...FalseFalseFalseFalseFalseFalseFalseTruecheap0.03
2NASDAQ1982.0188.982.280.0628.521.00.0414434.00.10...FalseFalseFalseFalseFalseFalseFalseTruecheap-0.08
3NASDAQ1983.0285.432.460.0326.191.00.0915544.00.10...FalseFalseFalseFalseFalseFalseFalseTruecheap0.05
4NASDAQ1984.0248.892.400.0425.881.00.1117121.00.08...FalseFalseFalseFalseFalseFalseFalseTruecheap0.02
\n", + "

5 rows × 24 columns

# 3. Manual feature synthesis
# Example: "Economic growth" = year-over-year change of gdppercent.
#
# The frame stacks the countries one after another, so a plain .diff() over the
# whole column subtracts the last year of the previous country from the first
# year of the next one. The difference is therefore taken inside each country.
country_columns = [col for col in data.columns if col.startswith("country_")]
if country_columns:
    # "country" was one-hot encoded earlier: rebuild one key per row from the
    # dummy columns (exactly one of them is True in every row).
    country_key = data[country_columns].astype(int).idxmax(axis=1)
else:
    country_key = data["country"]

# Sort by year so the difference is always "this year minus the previous one";
# the assignment aligns the result back to the original rows by index.
data["Economic_Growth"] = (
    data.sort_values("year")
    .groupby(country_key, sort=False)["gdppercent"]
    .diff()
)
data.head()
Масштабирование признаков" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Масштабирование признаков на основе нормировки и стандартизации в рамках данного набора данных не является необходимым" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Конструирование признаков с использованием Featuretools" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\laba_MII\\aimvenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index index not found in dataframe, creating new integer column\n", + " warnings.warn(\n", + "d:\\laba_MII\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\laba_MII\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\laba_MII\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\laba_MII\\aimvenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Built 24 features\n", + "Elapsed: 00:00 | Progress: 100%|██████████\n", + " stock index year index price log_indexprice inflationrate \\\n", + "index \n", + "0 NASDAQ 1980.0 168.61 2.23 0.14 \n", + "1 NASDAQ 1981.0 203.15 2.31 0.10 \n", + "2 NASDAQ 1982.0 188.98 2.28 0.06 \n", + "3 NASDAQ 1983.0 285.43 2.46 0.03 \n", + "4 NASDAQ 1984.0 248.89 2.40 0.04 \n", + "5 NASDAQ 1985.0 290.25 2.46 0.04 \n", + "6 NASDAQ 1986.0 366.97 2.56 0.02 \n", + "7 NASDAQ 1987.0 402.57 2.60 0.04 \n", + "8 NASDAQ 1988.0 374.43 2.57 0.04 \n", + "9 NASDAQ 1989.0 437.80 2.64 0.05 \n", + "\n", + " oil prices exchange_rate gdppercent percapitaincome \\\n", + "index \n", + "0 21.59 1.0 0.09 12575 \n", + "1 31.77 1.0 0.12 13976 \n", + "2 28.52 1.0 0.04 14434 \n", + "3 26.19 1.0 0.09 15544 \n", + "4 25.88 1.0 0.11 17121 \n", + "5 24.09 1.0 0.07 18237 \n", + "6 12.51 1.0 0.06 19071 \n", + "7 15.40 1.0 0.06 20039 \n", + "8 12.58 1.0 0.08 21417 \n", + "9 15.86 1.0 0.08 22857 \n", + "\n", + " unemploymentrate ... country_France country_Germany \\\n", + "index ... \n", + "0 0.07 ... False False \n", + "1 0.08 ... False False \n", + "2 0.10 ... False False \n", + "3 0.10 ... False False \n", + "4 0.08 ... False False \n", + "5 0.07 ... False False \n", + "6 0.07 ... False False \n", + "7 0.06 ... False False \n", + "8 0.05 ... False False \n", + "9 0.05 ... 
# Feature engineering with Featuretools
# Define the entity set. `data` has no "index" column, so Featuretools is asked
# to create it (make_index=True) instead of relying on its warning fallback.
# The check keeps the cell re-runnable if the column already exists.
es = ft.EntitySet(id="economic")
es = es.add_dataframe(
    dataframe_name="dataEconomic",
    dataframe=data,
    index="index",
    make_index="index" not in data.columns,
)

# Automatic feature construction. With a single dataframe in the entity set
# Featuretools reduces max_depth to 1 itself (with a warning), so 1 is
# requested explicitly.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="dataEconomic",
    max_depth=1,
    verbose=1,
    n_jobs=1,
)
print(feature_matrix.head(10))

# Assessment of the feature sets
# Author's conclusion: all feature sets have decent predictive power, fast
# computation, high reliability (given correct preprocessing), high
# correlation and integrity.
# NOTE(review): none of these properties is measured anywhere in the notebook.
Данные могут быть использованы для дальнейшего улучшения модели и принятия обоснованных бизнес-решений в области экономики." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "aimvenv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}