Загрузка данных в DataFrame

In [13]:
import pandas as pd

df = pd.read_csv("data/population.csv", index_col="no")

df.info()

print(df.shape)

df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 235 entries, 1 to 235
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Country (or dependency)  235 non-null    object
 1   Population 2020          235 non-null    object
 2   Yearly Change            235 non-null    object
 3   Net Change               235 non-null    object
 4   Density(P/Km²)           235 non-null    object
 5   Land Area (Km²)          235 non-null    object
 6   Migrants (net)           201 non-null    object
 7   Fert. Rate               235 non-null    object
 8   MedAge                   235 non-null    object
 9   Urban Pop %              235 non-null    object
 10  World Share              235 non-null    object
dtypes: object(11)
memory usage: 22.0+ KB
(235, 11)


Unnamed: 0_level_0,Country (or dependency),Population 2020,Yearly Change,Net Change,Density(P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,MedAge,Urban Pop %,World Share
no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,China,1439323776,0.39%,5540090,153,9388211,-348399,1.7,38,61%,18.47%
2,India,1380004385,0.99%,13586631,464,2973190,-532687,2.2,28,35%,17.70%
3,United States,331002651,0.59%,1937734,36,9147420,954806,1.8,38,83%,4.25%
4,Indonesia,273523615,1.07%,2898047,151,1811570,-98955,2.3,30,56%,3.51%
5,Pakistan,220892340,2.00%,4327022,287,770880,-233379,3.6,23,35%,2.83%


Получение сведений о пропущенных данных

Типы пропущенных данных:
- None - представление пустых данных в Python
- NaN - представление пустых данных в Pandas
- '' - пустая строка

In [6]:


# Количество пустых значений признаков
print(df.isnull().sum())

print()

# Есть ли пустые значения признаков
print(df.isnull().any())

print()

# Процент пустых значений признаков
for i in df.columns:
    null_rate = df[i].isnull().sum() / len(df) * 100
    if null_rate > 0:
        print(f"{i} процент пустых значений: %{null_rate:.2f}")

Country (or dependency)     0
Population 2020             0
Yearly Change               0
Net Change                  0
Density  (P/Km²)            0
Land Area (Km²)             0
Migrants (net)             34
Fert. Rate                  0
Med. Age                    0
Urban Pop %                 0
World Share                 0
dtype: int64

Country (or dependency)    False
Population 2020            False
Yearly Change              False
Net Change                 False
Density  (P/Km²)           False
Land Area (Km²)            False
Migrants (net)              True
Fert. Rate                 False
Med. Age                   False
Urban Pop %                False
World Share                False
dtype: bool

Migrants (net) процент пустых значений: %14.47


Заполнение пропущенных данных

https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values

https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/

In [11]:
fillna_df = df.fillna(0)

print(fillna_df.shape) # размеры

print(fillna_df.isnull().any())

# Замена пустых данных на 0
df["MigrantsFill"] = df["Migrants (net)"].fillna(0)

# Замена пустых данных на медиану
df["MigrantsMedian"] = df["Migrants (net)"].fillna(df["Migrants (net)"].median())

df.tail()

(235, 12)
Country (or dependency)    False
Population 2020            False
Yearly Change              False
Net Change                 False
Density  (P/Km²)           False
Land Area (Km²)            False
Migrants (net)             False
Fert. Rate                 False
Med. Age                   False
Urban Pop %                False
World Share                False
MigrantsFill               False
dtype: bool


TypeError: Cannot convert ['-348,399' '-532,687' '954,806' '-98,955' '-233,379' '21,200' '-60,000'
 '-369,501' '182,456' '-60,000' '71,560' '30,000' '-67,152' '-38,033'
 '-80,000' '23,861' '283,922' '-55,000' '543,822' '19,444' '260,650'
 '36,527' '148,943' '-40,076' '145,405' '-163,313' '-10,000' '11,731'
 '204,796' '40,000' '168,694' '4,800' '-10,000' '-50,000' '10,000' '7,834'
 '-62,920' '-29,395' '242,032' '-51,419' '134,979' '-8,863' '99,069'
 '6,413' '50,000' '-5,000' '-10,000' '-30,000' '41,710' '-653,249'
 '-1,500' '-4,800' '-8,000' '-5,403' '158,246' '4,000' '30,001' '-97,986'
 '-25,000' '-40,000' '-73,999' '-16,053' '111,708' '-18,000' '-8,000'
 '-9,215' '36,400' '-427,391' '16,000' '-20,000' '-30,000' '2,000'
 '-40,000' '-116,858' '-4,000' '-9,000' '-2,000' '2,001' '-4,000' '-9,504'
 '48,000' '-35,000' '-14,400' '-174,200' '-30,000' '22,011' '-16,000'
 '10,220' '-6,000' '1,200' '40,000' '-6,800' '40,000' '6,000' '-20,000'
 '8,730' '65,000' '-800' '4,000' '10,000' '52,000' '-2,000' '-4,200'
 '29,308' '-14,704' '-16,556' '-4,800' '-1,999' '-30,012' '-21,272'
 '-4,000' '-40,539' '-5,000' '27,028' '15,200' '14,000' '-4,000' '1,485'
 '28,000' '87,400' '-10,563' '4,200' '-5,000' '23,604' '-40,000' '14,881'
 '5,000' '11,200' '39,520' '-8,001' '-1,387' '-10,000' '-39,858' '-3,000'
 '-21,585' '-852' '-4,998' '-11,332' '40,000' '-14,000' '-97,986'
 '-32,780' '-4,806' '-3,087' '3,000' '3,260' '-10,047' '-1,000' '2,000'
 '-1,399' '-14,837' '47,800' '16,000' '-800' '3,911' '-5,385' '0' '5,000'
 '-8,353' '900' '-6,202' '-1,256' '-2,000' '-6,000' '320' '-1,600' '5,000'
 '-480' '9,741' '5,582' '-1,000' '-1,342' '-2,957' '11,370' '900' '0'
 '-1,440' '1,200' '1,000' '-960' '380' '120' '1,200' '-79' '502' '-1,000'
 '0' '-1,680' '-2,803' '0' '1,351' '-506' '515' '-800' '-200' '-200' '201'
 '-800' '-451' '-200' '0' nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan] to numeric

In [13]:
df["MigrantsCopy"] = df["Migrants (net)"]

# Замена данных сразу в DataFrame без копирования
df.fillna({"MigrantsCopy": 0}, inplace=True)

df.tail()

Unnamed: 0_level_0,Country (or dependency),Population 2020,Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share,MigrantsFill,MigrantsCopy
no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
231,Montserrat,4992,0.06%,3,50,100,,N.A.,N.A.,10%,0.00%,0,0
232,Falkland Islands,3480,3.05%,103,0,12170,,N.A.,N.A.,66%,0.00%,0,0
233,Niue,1626,0.68%,11,6,260,,N.A.,N.A.,46%,0.00%,0,0
234,Tokelau,1357,1.27%,17,136,10,,N.A.,N.A.,0%,0.00%,0,0
235,Holy See,801,0.25%,2,2003,0,,N.A.,N.A.,N.A.,0.00%,0,0


Удаление наблюдений с пропусками

In [14]:
dropna_df = df.dropna()

print(dropna_df.shape)

print(fillna_df.isnull().any())

(201, 13)
Country (or dependency)    False
Population 2020            False
Yearly Change              False
Net Change                 False
Density  (P/Km²)           False
Land Area (Km²)            False
Migrants (net)             False
Fert. Rate                 False
Med. Age                   False
Urban Pop %                False
World Share                False
MigrantsFill               False
dtype: bool


Создание выборок данных

Библиотека scikit-learn

https://scikit-learn.org/stable/index.html

<img src="assets/lec2-split.png" width="600" style="background-color: white">

In [16]:
# Функция для создания выборок
from sklearn.model_selection import train_test_split


def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
):
    """
    Splits a Pandas dataframe into three subsets (train, val, and test)
    following fractional ratios provided by the user, where each subset is
    stratified by the values in a specific column (that is, each subset has
    the same relative frequency of the values in the column). It performs this
    splitting by running train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0.
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits.
    """

    if frac_train + frac_val + frac_test != 1.0:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test

In [17]:
# Вывод распределения количества наблюдений по меткам (классам)
print(df.MedAge.value_counts())

data = df[["MedAge", "Fert. Rate", "Density(P/Km²)"]].copy()

df_train, df_val, df_test = split_stratified_into_train_val_test(
    data,
    stratify_colname="MedAge",
    frac_train=0.60,
    frac_val=0.20,
    frac_test=0.20,
)

print("Обучающая выборка: ", df_train.shape)
print(df_train.MedAge.value_counts())

print("Контрольная выборка: ", df_val.shape)
print(df_val.MedAge.value_counts())

print("Тестовая выборка: ", df_test.shape)
print(df_test.MedAge.value_counts())

MedAge
N.A.    34
19      14
28      12
43      11
32      11
42      10
18      10
20       9
30       8
38       7
26       7
40       7
22       7
31       6
34       6
24       6
17       6
44       5
29       5
41       5
33       5
21       5
45       5
23       4
37       4
36       4
25       4
27       4
39       3
46       3
35       3
47       2
48       1
15       1
16       1
Name: count, dtype: int64


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

Выборка с избытком (oversampling)

https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/

https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/

Выборка с недостатком (undersampling)

https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/

Библиотека imbalanced-learn

https://imbalanced-learn.org/stable/

In [1]:
from imblearn.over_sampling import ADASYN

ada = ADASYN()

print("Обучающая выборка: ", df_train.shape)
print(df_train.Pclass.value_counts())

X_resampled, y_resampled = ada.fit_resample(df_train, df_train["Pclass"])
df_train_adasyn = pd.DataFrame(X_resampled)

print("Обучающая выборка после oversampling: ", df_train_adasyn.shape)
print(df_train_adasyn.Pclass.value_counts())

df_train_adasyn

ModuleNotFoundError: No module named 'imblearn'