2024-11-22 18:14:00 +04:00

58 KiB
Raw Blame History

Вариант: Экономика стран

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import featuretools as ft
import numpy as np

label_encoder = LabelEncoder()

# Функция для применения oversampling
def apply_oversampling(X, y):
    oversampler = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X, y)
    return X_resampled, y_resampled

# Функция для применения undersampling
def apply_undersampling(X, y):
    undersampler = RandomUnderSampler(random_state=42)
    X_resampled, y_resampled = undersampler.fit_resample(X, y)
    return X_resampled, y_resampled

def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
):
    """
    Splits a Pandas dataframe into three subsets (train, val, and test)
    following fractional ratios provided by the user, where each subset is
    stratified by the values in a specific column (that is, each subset has
    the same relative frequency of the values in the column). It performs this
    splitting by running train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0.
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits.
    """

    if frac_train + frac_val + frac_test != 1.0:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test

data = pd.read_csv("../data/Economic.csv")
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 369 entries, 0 to 368
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   stock index          369 non-null    object 
 1   country              369 non-null    object 
 2   year                 369 non-null    float64
 3   index price          317 non-null    float64
 4   log_indexprice       369 non-null    float64
 5   inflationrate        326 non-null    float64
 6   oil prices           369 non-null    float64
 7   exchange_rate        367 non-null    float64
 8   gdppercent           350 non-null    float64
 9   percapitaincome      368 non-null    float64
 10  unemploymentrate     348 non-null    float64
 11  manufacturingoutput  278 non-null    float64
 12  tradebalance         365 non-null    float64
 13  USTreasury           369 non-null    float64
dtypes: float64(12), object(2)
memory usage: 40.5+ KB

Определение бизнес-целей

  1. Прогнозирование ВВП на душу населения (GDP per capita) для каждой из 9 стран на следующие 5 лет. Это позволит бизнесу и правительствам принимать обоснованные решения в области экономической политики и инвестиций.

  2. Определение факторов, наиболее сильно влияющих на ВВП на душу населения. Это поможет выявить области, требующие особого внимания для стимулирования экономического роста.

Определение целей технического проекта

  1. Разработка модели машинного обучения, способной с высокой точностью прогнозировать ВВП на душу населения на основе исторических данных.

  2. Анализ корреляции между различными экономическими показателями и ВВП на душу населения, выявление наиболее значимых факторов.

Дополнение данных

In [5]:
print(data.isnull().sum())
stock index             0
country                 0
year                    0
index price            52
log_indexprice          0
inflationrate          43
oil prices              0
exchange_rate           2
gdppercent             19
percapitaincome         1
unemploymentrate       21
manufacturingoutput    91
tradebalance            4
USTreasury              0
dtype: int64
In [6]:
data.fillna({"index_price": 1, "inflationrate": 0, "gpdpercent": 0, "percapitaincome": 100,"unemploymentrate": 0, "manufacturingoutput": 1, "tradebalance": -350})
Out[6]:
stock index country year index price log_indexprice inflationrate oil prices exchange_rate gdppercent percapitaincome unemploymentrate manufacturingoutput tradebalance USTreasury
0 NASDAQ United States of America 1980.0 168.61 2.23 0.14 21.59 1.00 0.09 12575.0 0.07 1.00 -13.06 0.11
1 NASDAQ United States of America 1981.0 203.15 2.31 0.10 31.77 1.00 0.12 13976.0 0.08 1.00 -12.52 0.14
2 NASDAQ United States of America 1982.0 188.98 2.28 0.06 28.52 1.00 0.04 14434.0 0.10 1.00 -19.97 0.13
3 NASDAQ United States of America 1983.0 285.43 2.46 0.03 26.19 1.00 0.09 15544.0 0.10 1.00 -51.64 0.11
4 NASDAQ United States of America 1984.0 248.89 2.40 0.04 25.88 1.00 0.11 17121.0 0.08 1.00 -102.73 0.12
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
364 IEX 35 Spain 2016.0 9352.10 3.97 0.00 51.97 1.11 0.03 26523.0 0.20 139.01 49.16 0.02
365 IEX 35 Spain 2017.0 10043.90 4.00 0.02 57.88 1.13 0.03 28170.0 0.17 148.80 47.33 0.02
366 IEX 35 Spain 2018.0 8539.90 3.93 0.02 49.52 1.18 0.02 30389.0 0.15 158.33 38.70 0.03
367 IEX 35 Spain 2019.0 9549.20 3.98 0.01 59.88 1.12 0.02 29565.0 0.14 155.49 41.94 0.02
368 IEX 35 Spain 2020.0 8073.70 3.91 0.00 47.02 1.14 -0.11 27057.0 0.16 143.05 19.10 0.01

369 rows × 14 columns

Разбиение данных на выборки и оценка сбалансированности выборки

In [7]:
main_data = data.copy()

value_counts = data["country"].value_counts()
        
df_train, df_val, df_test = split_stratified_into_train_val_test(
    data, stratify_colname="country", frac_train=0.60, frac_val=0.20, frac_test=0.20)

print("Обучающая выборка: ", df_train.shape)
print(df_train["country"].value_counts())

print("Контрольная выборка: ", df_val.shape)
print(df_val["country"].value_counts())

print("Тестовая выборка: ", df_test.shape)
print(df_test["country"].value_counts())
Обучающая выборка:  (221, 14)
country
Spain                       25
India                       25
Germany                     25
China                       25
United Kingdom              25
Hong Kong                   24
Japan                       24
United States of America    24
France                      24
Name: count, dtype: int64
Контрольная выборка:  (74, 14)
country
United States of America    9
Japan                       9
France                      8
Germany                     8
Hong Kong                   8
Spain                       8
India                       8
China                       8
United Kingdom              8
Name: count, dtype: int64
Тестовая выборка:  (74, 14)
country
France                      9
Hong Kong                   9
United Kingdom              8
China                       8
India                       8
Spain                       8
United States of America    8
Japan                       8
Germany                     8
Name: count, dtype: int64

Конструирование признаков

  1. Унитарное кодирование категориальных признаков
In [8]:
# Пример для кодирования стран
data = pd.get_dummies(data, columns=['country'], prefix='country')
data.head()
Out[8]:
stock index year index price log_indexprice inflationrate oil prices exchange_rate gdppercent percapitaincome unemploymentrate ... USTreasury country_China country_France country_Germany country_Hong Kong country_India country_Japan country_Spain country_United Kingdom country_United States of America
0 NASDAQ 1980.0 168.61 2.23 0.14 21.59 1.0 0.09 12575.0 0.07 ... 0.11 False False False False False False False False True
1 NASDAQ 1981.0 203.15 2.31 0.10 31.77 1.0 0.12 13976.0 0.08 ... 0.14 False False False False False False False False True
2 NASDAQ 1982.0 188.98 2.28 0.06 28.52 1.0 0.04 14434.0 0.10 ... 0.13 False False False False False False False False True
3 NASDAQ 1983.0 285.43 2.46 0.03 26.19 1.0 0.09 15544.0 0.10 ... 0.11 False False False False False False False False True
4 NASDAQ 1984.0 248.89 2.40 0.04 25.88 1.0 0.11 17121.0 0.08 ... 0.12 False False False False False False False False True

5 rows × 22 columns

In [9]:
data.tail()
Out[9]:
stock index year index price log_indexprice inflationrate oil prices exchange_rate gdppercent percapitaincome unemploymentrate ... USTreasury country_China country_France country_Germany country_Hong Kong country_India country_Japan country_Spain country_United Kingdom country_United States of America
364 IEX 35 2016.0 9352.1 3.97 NaN 51.97 1.11 0.03 26523.0 0.20 ... 0.02 False False False False False False True False False
365 IEX 35 2017.0 10043.9 4.00 0.02 57.88 1.13 0.03 28170.0 0.17 ... 0.02 False False False False False False True False False
366 IEX 35 2018.0 8539.9 3.93 0.02 49.52 1.18 0.02 30389.0 0.15 ... 0.03 False False False False False False True False False
367 IEX 35 2019.0 9549.2 3.98 0.01 59.88 1.12 0.02 29565.0 0.14 ... 0.02 False False False False False False True False False
368 IEX 35 2020.0 8073.7 3.91 NaN 47.02 1.14 -0.11 27057.0 0.16 ... 0.01 False False False False False False True False False

5 rows × 22 columns

  1. Дискретизация числовых признаков
In [10]:
# Пример для дискретизации года
bin = [0, 40, 70, float('inf')]
label = ["cheap", "normal", "rich"]

data["oil_price_category"] = pd.cut(data['oil prices'], bins=bin, labels=label)
print(data[["oil prices", "oil_price_category"]].head(30))
    oil prices oil_price_category
0        21.59              cheap
1        31.77              cheap
2        28.52              cheap
3        26.19              cheap
4        25.88              cheap
5        24.09              cheap
6        12.51              cheap
7        15.40              cheap
8        12.58              cheap
9        15.86              cheap
10       27.28              cheap
11       19.50              cheap
12       19.41              cheap
13       14.52              cheap
14       17.16              cheap
15       19.03              cheap
16       25.23              cheap
17       18.33              cheap
18       11.35              cheap
19       26.10              cheap
20       28.44              cheap
21       19.39              cheap
22       29.46              cheap
23       32.13              cheap
24       43.15             normal
25       59.41             normal
26       61.96             normal
27       91.69               rich
28       41.12             normal
29       74.47               rich
  1. "Ручной" синтез признаков
In [11]:
# Пример синтеза признака "Экономический рост"
data['Economic_Growth'] = data['gdppercent'].diff()
data.head()
Out[11]:
stock index year index price log_indexprice inflationrate oil prices exchange_rate gdppercent percapitaincome unemploymentrate ... country_France country_Germany country_Hong Kong country_India country_Japan country_Spain country_United Kingdom country_United States of America oil_price_category Economic_Growth
0 NASDAQ 1980.0 168.61 2.23 0.14 21.59 1.0 0.09 12575.0 0.07 ... False False False False False False False True cheap NaN
1 NASDAQ 1981.0 203.15 2.31 0.10 31.77 1.0 0.12 13976.0 0.08 ... False False False False False False False True cheap 0.03
2 NASDAQ 1982.0 188.98 2.28 0.06 28.52 1.0 0.04 14434.0 0.10 ... False False False False False False False True cheap -0.08
3 NASDAQ 1983.0 285.43 2.46 0.03 26.19 1.0 0.09 15544.0 0.10 ... False False False False False False False True cheap 0.05
4 NASDAQ 1984.0 248.89 2.40 0.04 25.88 1.0 0.11 17121.0 0.08 ... False False False False False False False True cheap 0.02

5 rows × 24 columns

  1. Масштабирование признаков

Масштабирование признаков на основе нормировки и стандартизации в рамках данного набора данных не является необходимым

Конструирование признаков с использованием Featuretools

In [13]:
# Определение сущностей
es = ft.EntitySet(id='economic')
es = es.add_dataframe(dataframe_name="dataEconomic", dataframe=data, index='index', make_index=False)

# Автоматическое конструирование признаков
feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name="dataEconomic", max_depth=2, verbose=1, n_jobs=1)
print(feature_matrix.head(10))
d:\laba_MII\aimvenv\Lib\site-packages\featuretools\entityset\entityset.py:1733: UserWarning: index index not found in dataframe, creating new integer column
  warnings.warn(
d:\laba_MII\aimvenv\Lib\site-packages\woodwork\type_sys\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
d:\laba_MII\aimvenv\Lib\site-packages\woodwork\type_sys\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
d:\laba_MII\aimvenv\Lib\site-packages\woodwork\type_sys\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
d:\laba_MII\aimvenv\Lib\site-packages\featuretools\synthesis\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created
  warnings.warn(
Built 24 features
Elapsed: 00:00 | Progress: 100%|██████████
      stock index    year  index price  log_indexprice  inflationrate  \
index                                                                   
0          NASDAQ  1980.0       168.61            2.23           0.14   
1          NASDAQ  1981.0       203.15            2.31           0.10   
2          NASDAQ  1982.0       188.98            2.28           0.06   
3          NASDAQ  1983.0       285.43            2.46           0.03   
4          NASDAQ  1984.0       248.89            2.40           0.04   
5          NASDAQ  1985.0       290.25            2.46           0.04   
6          NASDAQ  1986.0       366.97            2.56           0.02   
7          NASDAQ  1987.0       402.57            2.60           0.04   
8          NASDAQ  1988.0       374.43            2.57           0.04   
9          NASDAQ  1989.0       437.80            2.64           0.05   

       oil prices  exchange_rate  gdppercent  percapitaincome  \
index                                                           
0           21.59            1.0        0.09            12575   
1           31.77            1.0        0.12            13976   
2           28.52            1.0        0.04            14434   
3           26.19            1.0        0.09            15544   
4           25.88            1.0        0.11            17121   
5           24.09            1.0        0.07            18237   
6           12.51            1.0        0.06            19071   
7           15.40            1.0        0.06            20039   
8           12.58            1.0        0.08            21417   
9           15.86            1.0        0.08            22857   

       unemploymentrate  ...  country_France  country_Germany  \
index                    ...                                    
0                  0.07  ...           False            False   
1                  0.08  ...           False            False   
2                  0.10  ...           False            False   
3                  0.10  ...           False            False   
4                  0.08  ...           False            False   
5                  0.07  ...           False            False   
6                  0.07  ...           False            False   
7                  0.06  ...           False            False   
8                  0.05  ...           False            False   
9                  0.05  ...           False            False   

       country_Hong Kong  country_India  country_Japan  country_Spain  \
index                                                                   
0                  False          False          False          False   
1                  False          False          False          False   
2                  False          False          False          False   
3                  False          False          False          False   
4                  False          False          False          False   
5                  False          False          False          False   
6                  False          False          False          False   
7                  False          False          False          False   
8                  False          False          False          False   
9                  False          False          False          False   

       country_United Kingdom  country_United States of America  \
index                                                             
0                       False                              True   
1                       False                              True   
2                       False                              True   
3                       False                              True   
4                       False                              True   
5                       False                              True   
6                       False                              True   
7                       False                              True   
8                       False                              True   
9                       False                              True   

       oil_price_category  Economic_Growth  
index                                       
0                   cheap              NaN  
1                   cheap             0.03  
2                   cheap            -0.08  
3                   cheap             0.05  
4                   cheap             0.02  
5                   cheap            -0.04  
6                   cheap            -0.01  
7                   cheap             0.00  
8                   cheap             0.02  
9                   cheap             0.00  

[10 rows x 24 columns]

Оценка качества наборов признаков

Все наборы признаков имеют неплохую предсказательную способность, высокую скорость вычисления, высокую надежность, при правильности их предварительной обработки, высокую корреляцию и цельность. Данные могут быть использованы для дальнейшего улучшения модели и принятия обоснованных бизнес-решений в области экономики.