2024-12-20 16:38:38 +04:00

332 KiB

Вариант: Экономика стран

In [53]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn import metrics
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, roc_auc_score, f1_score,
    matthews_corrcoef, cohen_kappa_score, confusion_matrix
)
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import featuretools as ft
from sklearn.metrics import accuracy_score, classification_report

# Функция для применения oversampling
def apply_oversampling(X, y):
    oversampler = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X, y)
    return X_resampled, y_resampled

# Функция для применения undersampling
def apply_undersampling(X, y):
    undersampler = RandomUnderSampler(random_state=42)
    X_resampled, y_resampled = undersampler.fit_resample(X, y)
    return X_resampled, y_resampled

def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
):
    """
    Splits a Pandas dataframe into three subsets (train, val, and test)
    following fractional ratios provided by the user, where each subset is
    stratified by the values in a specific column (that is, each subset has
    the same relative frequency of the values in the column). It performs this
    splitting by running train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0.
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits.
    """

    if frac_train + frac_val + frac_test != 1.0:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test


df = pd.read_csv("../data/Economic.csv")
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 369 entries, 0 to 368
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   stock index          369 non-null    object 
 1   country              369 non-null    object 
 2   year                 369 non-null    float64
 3   index price          317 non-null    float64
 4   log_indexprice       369 non-null    float64
 5   inflationrate        326 non-null    float64
 6   oil prices           369 non-null    float64
 7   exchange_rate        367 non-null    float64
 8   gdppercent           350 non-null    float64
 9   percapitaincome      368 non-null    float64
 10  unemploymentrate     348 non-null    float64
 11  manufacturingoutput  278 non-null    float64
 12  tradebalance         365 non-null    float64
 13  USTreasury           369 non-null    float64
dtypes: float64(12), object(2)
memory usage: 40.5+ KB

Бизнес-цель: сегментировать страны на основе экономических показателей для определения схожих групп стран и последующего анализа каждой группы.

In [54]:
data = df.copy()
data_cleaned = df.dropna()

if not np.issubdtype(data_cleaned.dtypes.iloc[1], np.number):
    print("Данные содержат текстовые значения. Убедитесь, что только числовые данные используются для анализа.")
    cleaned_data = data_cleaned.select_dtypes(include=[np.number])

print(f"Исходный размер датасета: {df.shape[0]}")
print(f"Очищенный размер датасета: {data_cleaned.shape[0]}")

data1 = pd.get_dummies(data_cleaned, columns=['country'], drop_first=True)
Данные содержат текстовые значения. Убедитесь, что только числовые данные используются для анализа.
Исходный размер датасета: 369
Очищенный размер датасета: 219
In [55]:
print(data1.columns)
print(data1.isnull().sum())
data2 = data1.drop(['stock index'], axis = 1)
data2.info()
Index(['stock index', 'year', 'index price', 'log_indexprice', 'inflationrate',
       'oil prices', 'exchange_rate', 'gdppercent', 'percapitaincome',
       'unemploymentrate', 'manufacturingoutput', 'tradebalance', 'USTreasury',
       'country_France', 'country_Germany', 'country_Hong Kong',
       'country_India', 'country_Japan', 'country_Spain',
       'country_United Kingdom', 'country_United States of America'],
      dtype='object')
stock index                         0
year                                0
index price                         0
log_indexprice                      0
inflationrate                       0
oil prices                          0
exchange_rate                       0
gdppercent                          0
percapitaincome                     0
unemploymentrate                    0
manufacturingoutput                 0
tradebalance                        0
USTreasury                          0
country_France                      0
country_Germany                     0
country_Hong Kong                   0
country_India                       0
country_Japan                       0
country_Spain                       0
country_United Kingdom              0
country_United States of America    0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 219 entries, 10 to 367
Data columns (total 20 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   year                              219 non-null    float64
 1   index price                       219 non-null    float64
 2   log_indexprice                    219 non-null    float64
 3   inflationrate                     219 non-null    float64
 4   oil prices                        219 non-null    float64
 5   exchange_rate                     219 non-null    float64
 6   gdppercent                        219 non-null    float64
 7   percapitaincome                   219 non-null    float64
 8   unemploymentrate                  219 non-null    float64
 9   manufacturingoutput               219 non-null    float64
 10  tradebalance                      219 non-null    float64
 11  USTreasury                        219 non-null    float64
 12  country_France                    219 non-null    bool   
 13  country_Germany                   219 non-null    bool   
 14  country_Hong Kong                 219 non-null    bool   
 15  country_India                     219 non-null    bool   
 16  country_Japan                     219 non-null    bool   
 17  country_Spain                     219 non-null    bool   
 18  country_United Kingdom            219 non-null    bool   
 19  country_United States of America  219 non-null    bool   
dtypes: bool(8), float64(12)
memory usage: 24.0 KB
In [56]:
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(data2)

print(reduced_data)

plt.figure(figsize=(10,6))
plt.scatter(reduced_data[:, 0], reduced_data[:, 1], alpha=0.7, cmap='viridis')
plt.title("Данные после PCA")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()
[[  -175.11730098  -8125.91541511]
 [  1359.15430649  -7976.83866444]
 [  2330.12842909  -7886.62363629]
 [  3638.58468227  -7885.49439407]
 [  4638.69365316  -7738.97353013]
 [  5921.57827905  -7533.60974468]
 [  7420.03426153  -7269.3752748 ]
 [  8823.38406687  -6981.62904291]
 [ 10508.03737208  -6093.36153206]
 [ 12357.01890855  -5087.16328305]
 [ 13108.1576432   -6856.08050495]
 [ 13984.76420629  -7375.23274241]
 [ 15460.23414568  -7307.74768791]
 [ 17685.72825995  -7028.50596781]
 [ 20090.21526684  -6980.26253392]
 [ 22277.89668445  -6874.90441336]
 [ 23962.33912597  -6605.01263864]
 [ 24446.64571772  -6845.1952535 ]
 [ 25871.01649677  -6556.01887636]
 [ 27597.98078369  -6314.0423994 ]
 [ 29116.38461377  -5778.79720501]
 [ 31080.78776834  -4997.72117578]
 [ 34067.26159619  -4464.16933609]
 [ 36188.91962302  -3273.16517374]
 [ 39173.66351113  -2162.39563038]
 [ 41402.60872282  -1707.33871091]
 [ -4920.26128698  -6264.09433006]
 [ -4107.33309919  -5936.17563897]
 [ -5591.9683601   -4970.64964709]
 [ -4284.01907339  -5358.91854133]
 [  -851.8323232   -4827.08675426]
 [   369.10904029  -4430.5671039 ]
 [  2778.18409795  -3478.03512091]
 [  4317.56600567  -2772.06424501]
 [  4775.4295974   -1736.3805783 ]
 [  4243.84270452  -2429.91924498]
 [  3781.33828765  -3422.66358249]
 [  6009.39340531  -4759.76856132]
 [ 10424.87883391  -4341.57001167]
 [ 16256.5577047   -4160.49272194]
 [ 18000.35381788  -3402.49405369]
 [ 20556.36944866  -2868.79479514]
 [ 26534.29033635  -2792.87710749]
 [ 14799.28968455  -3522.03466609]
 [ 15612.92493655  -3056.85953865]
 [ 18114.06439047  -3451.64906502]
 [ 18523.60322894  -3137.05524082]
 [ 19497.88792183  -2311.83435969]
 [ 23542.33878862  -2603.44082121]
 [ 17156.41944477  -1855.11336425]
 [ 16429.21734114  -1290.51296177]
 [ 19092.49490231  -2321.82962639]
 [ 18472.52923296  -1490.57697639]
 [ 16375.47555831  -2516.33360096]
 [-23669.6107605   -6856.0257174 ]
 [-23678.77675534  -5288.17745923]
 [-23675.29226616  -4557.37607353]
 [-23614.87756504  -3978.16434834]
 [-23608.70369197  -4795.08252579]
 [-23583.41444535  -4821.11540317]
 [-23553.11851947  -4248.00207401]
 [-23571.19840587  -4851.19514063]
 [-23490.20520886  -2902.51879176]
 [-23516.8183412   -3935.81513361]
 [-23526.75429279  -4645.50718999]
 [-23504.72354379  -4531.19715956]
 [-23363.19551277  -2072.91896281]
 [-23261.92618182  -1311.98677798]
 [-23100.48320798   1479.32071988]
 [-22891.64152145   5863.3059029 ]
 [-22497.55043967  12353.76223468]
 [-22810.24606437   1719.80676484]
 [-22498.91033109   9530.51743361]
 [-22162.14605795  12565.71566922]
 [-22196.83923531   7511.01242735]
 [-22104.88476049  11481.38894531]
 [-22052.6046314   13224.64916476]
 [-21760.03118667  19546.72612234]
 [-21765.08108727  18164.36724379]
 [-21624.70393447  18669.39338085]
 [-21178.78608654  26088.80277007]
 [-21109.11279888  28098.84971046]
 [-20866.86464368  33279.39393894]
 [-20893.51896865  39780.06254589]
 [-14423.1457247   -1282.18684029]
 [-13509.2003846    -655.76348716]
 [-14296.15572734   -755.96193511]
 [-13413.69439115    639.71572953]
 [-12809.29884004   2375.19409329]
 [-12158.52959914   4348.17715487]
 [ -6522.44157462   8033.5362674 ]
 [  1704.70099935  18471.27282034]
 [  1655.56058562  25476.71405478]
 [  2080.3623691   20850.54443176]
 [  5485.77754575  15616.59302296]
 [  7819.78947585   9366.29074685]
 [ 12111.55694469  10242.68841252]
 [ 15650.51181433  10984.00736822]
 [ 11433.19601313   9557.94037665]
 [  8234.20852471   6601.43492437]
 [ 14908.39218447   8212.21166551]
 [ 15582.44630853   3213.09254948]
 [ 17022.83119681    354.98642592]
 [ 20691.76843817    917.30352414]
 [ 10957.19855236  10377.0947459 ]
 [  2024.91216378   4271.80296384]
 [  1577.53768178   7262.90870814]
 [   923.30323464   3888.13700397]
 [   177.88519727   1781.94675355]
 [  2920.96982312   4376.20609959]
 [  4532.07472006   5747.27132176]
 [  6973.62358473   8398.96029259]
 [  8070.77035599  14953.33034889]
 [  7169.62495215  11882.96240347]
 [  8957.83841078   9430.82539374]
 [ 11632.4269169   12493.44565284]
 [ 13222.91784378  12528.81546068]
 [ 14875.88602712  11756.77156727]
 [ 16838.60967462  13672.27804775]
 [ 18971.65627197  14248.82756954]
 [ 20293.03610972  15071.21521838]
 [ 22656.89308108  12373.65973606]
 [ 25164.7880265   17249.89060492]
 [ 25038.51948028  19604.32898337]
 [-22521.51647532  -6475.6488675 ]
 [-22287.15333873  -6806.44040052]
 [-21928.10308673  -6275.45882841]
 [-21264.78463135  -3653.78528522]
 [-20531.00831341  -5095.02609565]
 [-20172.14558233  -5282.85209575]
 [-19454.7374316   -5247.74237248]
 [-18394.16059996  -5440.24957641]
 [-17708.66878744  -5890.61030489]
 [-16977.19895608  -5942.37154953]
 [-16348.31897325  -5865.65397511]
 [-15924.38194624  -4498.96322609]
 [-15860.82110486  -5179.59873881]
 [-15124.35357677  -4924.63281227]
 [-14037.81314916  -5297.21871369]
 [-13797.61834264  -5294.07509501]
 [-13510.87104888  -5120.83918239]
 [-12087.007858    -6638.79003576]
 [-10512.58841775  -6714.07353917]
 [-10962.01711103  -5979.83585746]
 [-10171.64621343  -6161.24259993]
 [ -7826.84801297  -6077.1684473 ]
 [ -8407.60947409  -5426.43845034]
 [-10173.15559564  -4042.87459349]
 [ -9985.51047281  -3269.68499945]
 [  2842.53923731  -1659.64644782]
 [  -210.01838598  -2102.09484993]
 [  -311.0406242   -3373.08487142]
 [  6335.70060224  -4747.4495941 ]
 [ 10138.40650878  -4559.05113457]
 [ 10581.93058082  -3418.56352793]
 [ 12446.58907491  -2280.15757639]
 [ 17768.95074632   -953.007551  ]
 [ 21653.34615304  -4315.39855815]
 [ 17670.70161619  -2103.24709782]
 [ 22775.09988097  -3257.17802075]
 [ 24196.31624589    612.69839918]
 [ 17303.57975039   1736.2821211 ]
 [ 20809.83874293   3816.64931923]
 [ 24003.4803278    1371.27597591]
 [ 22733.16145132   4096.77689857]
 [ 22002.33518214   4587.07826629]
 [-22527.05006009  -7381.85374308]
 [-22612.5967173   -7288.92543614]
 [-22552.96608493  -7003.45088673]
 [-21964.59843912  -6545.73342438]
 [-21554.53662783  -6960.49818877]
 [-21329.32661106  -6392.47113847]
 [-21315.93410183  -5965.58018732]
 [-20704.3226911   -6474.22835092]
 [-20715.47475303  -6217.16016385]
 [-20387.16579041  -6133.85309515]
 [-20594.13107878  -5717.76992433]
 [-20434.51858193  -6109.14576725]
 [-19917.99313807  -6132.23909419]
 [-19908.15323203  -5688.67632044]
 [-20292.77585504  -4994.97321597]
 [-20154.66080781  -4054.69817659]
 [   764.91853433  -2600.79072445]
 [ -1496.89053334  -2571.96787734]
 [ -1498.61057888  -3874.0102535 ]
 [   298.9782736   -5483.28126325]
 [  5650.9006054   -5132.90198485]
 [  9825.18879465  -4981.65429224]
 [ 10818.74371737  -4113.98676633]
 [ 12541.14641327  -3333.40223951]
 [ 17627.97654687  -3397.70050337]
 [ 16697.34233815  -5182.04865092]
 [ 19848.87251026  -5911.89362344]
 [ 18636.58173303  -4742.88385811]
 [ 19099.76425172  -4778.5442203 ]
 [ 14841.29503493  -3624.0984356 ]
 [ 17590.64652191  -4279.85387892]
 [ 16457.40721459  -3001.71826466]
 [-23875.42539397  -4266.38628493]
 [-23830.7846687   -2743.19109952]
 [-23782.78332812   -643.28924727]
 [-23709.95844212   1936.59508244]
 [ -8040.24745841   3320.76379591]
 [ -9078.32541232    816.39865948]
 [ -8478.58542565     87.95138959]
 [ -6804.24833781  -2318.23626326]
 [ -2356.71756272   -736.88616465]
 [  1073.78794125    514.96450948]
 [  2639.31978814   2126.46003002]
 [  4689.57935304   5484.86147412]
 [  8917.53112497   6407.45915828]
 [ 11676.61475759    345.44254243]
 [  7828.232595     -180.49504026]
 [  4463.92176667   -489.00527974]
 [  5255.16254195   1239.45767925]
 [  1920.63195671    956.24415893]
 [  4360.91701569   1390.62029681]
 [  6538.93427087   -172.17625484]
 [  5742.17430744    858.73932849]]
C:\Users\mitat\AppData\Local\Temp\ipykernel_19512\3444879312.py:7: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  plt.scatter(reduced_data[:, 0], reduced_data[:, 1], alpha=0.7, cmap='viridis')
No description has been provided for this image

Выбор количества кластеров

In [57]:
inertia = []
k_values = range(1, 10)

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(reduced_data)
    inertia.append(kmeans.inertia_)

plt.figure(figsize=(10, 6))
plt.plot(k_values, inertia, marker='o')
plt.title("Метод локтя")
plt.xlabel("Количество кластеров")
plt.ylabel("Инерция")
plt.show()

# Коэффициент силуэта для выбора оптимального количества кластеров
silhouette_scores = []

for k in range(2, 10):
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(reduced_data)
    score = silhouette_score(reduced_data, labels)
    silhouette_scores.append(score)

plt.figure(figsize=(10, 6))
plt.plot(range(2, 10), silhouette_scores, marker='o')
plt.title("Коэффициент силуэта")
plt.xlabel("Количество кластеров")
plt.ylabel("Силуэт")
plt.show()
No description has been provided for this image
No description has been provided for this image

Кластерны анализ

In [58]:
# Оптимальное количество кластеров
optimal_k = 4

# Неиерархическая кластеризация (K-Means)
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
labels_kmeans = kmeans.fit_predict(reduced_data)

plt.figure(figsize=(10, 6))
plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=labels_kmeans, cmap='viridis', alpha=0.5)
plt.title("K-Means Кластеры")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()

# Агломеративная кластеризация
agglomerative = AgglomerativeClustering(n_clusters=optimal_k)
agglomerative_labels = agglomerative.fit_predict(reduced_data)

plt.figure(figsize=(10, 6))
plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=agglomerative_labels, cmap='viridis', alpha=0.5)
plt.title("Агломеративная кластеризация")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()
No description has been provided for this image
No description has been provided for this image

Оценка качества решения

In [59]:
# Коэффициент силуэта для K-Means
kmeans_inertia = kmeans.inertia_
kmeans_silhouette = silhouette_score(reduced_data, labels_kmeans)
print(f'K-Means - Инерция: {kmeans_inertia}, Коэффициент силуэта: {kmeans_silhouette}')

# Коэффициент силуэта для иерархической кластеризации
agglomerative_silhouette = silhouette_score(reduced_data, agglomerative_labels)
print(f"Коэффициент силуэта для иерархической кластеризации: {agglomerative_silhouette:.2f}")
K-Means - Инерция: 14420617136.370611, Коэффициент силуэта: 0.48226441542888965
Коэффициент силуэта для иерархической кластеризации: 0.46