Загрузка данных в DataFrame

In [13]:
import pandas as pd

df = pd.read_csv("data/population.csv", index_col="no")



<class 'pandas.core.frame.DataFrame'>
Index: 235 entries, 1 to 235
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Country (or dependency)  235 non-null    object
 1   Population 2020          235 non-null    object
 2   Yearly Change            235 non-null    object
 3   Net Change               235 non-null    object
 4   Density(P/Km²)           235 non-null    object
 5   Land Area (Km²)          235 non-null    object
 6   Migrants (net)           201 non-null    object
 7   Fert. Rate               235 non-null    object
 8   MedAge                   235 non-null    object
 9   Urban Pop %              235 non-null    object
 10  World Share              235 non-null    object
dtypes: object(11)
memory usage: 22.0+ KB
(235, 11)
Country (or dependency) Population 2020 Yearly Change Net Change Density(P/Km²) Land Area (Km²) Migrants (net) Fert. Rate MedAge Urban Pop % World Share
1 China 1,439,323,776 0.39% 5,540,090 153 9,388,211 -348,399 1.7 38 61% 18.47%
2 India 1,380,004,385 0.99% 13,586,631 464 2,973,190 -532,687 2.2 28 35% 17.70%
3 United States 331,002,651 0.59% 1,937,734 36 9,147,420 954,806 1.8 38 83% 4.25%
4 Indonesia 273,523,615 1.07% 2,898,047 151 1,811,570 -98,955 2.3 30 56% 3.51%
5 Pakistan 220,892,340 2.00% 4,327,022 287 770,880 -233,379 3.6 23 35% 2.83%

Получение сведений о пропущенных данных

Типы пропущенных данных:

  • None - представление пустых данных в Python
  • NaN - представление пустых данных в Pandas
  • '' - пустая строка
In [6]:
# Количество пустых значений признаков


# Есть ли пустые значения признаков


# Процент пустых значений признаков
for i in df.columns:
    null_rate = df[i].isnull().sum() / len(df) * 100
    if null_rate > 0:
        print(f"{i} процент пустых значений: %{null_rate:.2f}")
Country (or dependency)     0
Population 2020             0
Yearly Change               0
Net Change                  0
Density  (P/Km²)            0
Land Area (Km²)             0
Migrants (net)             34
Fert. Rate                  0
Med. Age                    0
Urban Pop %                 0
World Share                 0
dtype: int64

Country (or dependency)    False
Population 2020            False
Yearly Change              False
Net Change                 False
Density  (P/Km²)           False
Land Area (Km²)            False
Migrants (net)              True
Fert. Rate                 False
Med. Age                   False
Urban Pop %                False
World Share                False
dtype: bool

Migrants (net) процент пустых значений: %14.47
In [11]:
fillna_df = df.fillna(0)

print(fillna_df.shape) # размеры


# Замена пустых данных на 0
df["MigrantsFill"] = df["Migrants (net)"].fillna(0)

# Замена пустых данных на медиану
df["MigrantsMedian"] = df["Migrants (net)"].fillna(df["Migrants (net)"].median())

(235, 12)
Country (or dependency)    False
Population 2020            False
Yearly Change              False
Net Change                 False
Density  (P/Km²)           False
Land Area (Km²)            False
Migrants (net)             False
Fert. Rate                 False
Med. Age                   False
Urban Pop %                False
World Share                False
MigrantsFill               False
dtype: bool
In [13]:
df["MigrantsCopy"] = df["Migrants (net)"]

# Замена данных сразу в DataFrame без копирования
df.fillna({"MigrantsCopy": 0}, inplace=True)

Country (or dependency) Population 2020 Yearly Change Net Change Density (P/Km²) Land Area (Km²) Migrants (net) Fert. Rate Med. Age Urban Pop % World Share MigrantsFill MigrantsCopy
231 Montserrat 4,992 0.06% 3 50 100 NaN N.A. N.A. 10% 0.00% 0 0
232 Falkland Islands 3,480 3.05% 103 0 12,170 NaN N.A. N.A. 66% 0.00% 0 0
233 Niue 1,626 0.68% 11 6 260 NaN N.A. N.A. 46% 0.00% 0 0
234 Tokelau 1,357 1.27% 17 136 10 NaN N.A. N.A. 0% 0.00% 0 0
235 Holy See 801 0.25% 2 2,003 0 NaN N.A. N.A. N.A. 0.00% 0 0

Удаление наблюдений с пропусками

In [14]:
dropna_df = df.dropna()


(201, 13)
Country (or dependency)    False
Population 2020            False
Yearly Change              False
Net Change                 False
Density  (P/Km²)           False
Land Area (Km²)            False
Migrants (net)             False
Fert. Rate                 False
Med. Age                   False
Urban Pop %                False
World Share                False
MigrantsFill               False
dtype: bool

Создание выборок данных

Библиотека scikit-learn


No description has been provided for this image
In [16]:
# Функция для создания выборок
from sklearn.model_selection import train_test_split

def split_stratified_into_train_val_test(
    Splits a Pandas dataframe into three subsets (train, val, and test)
    following fractional ratios provided by the user, where each subset is
    stratified by the values in a specific column (that is, each subset has
    the same relative frequency of the values in the column). It performs this
    splitting by running train_test_split() twice.

    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0.
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    df_train, df_val, df_test :
        Dataframes containing the three splits.

    if frac_train + frac_val + frac_test != 1.0:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state

    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test
In [17]:
# Вывод распределения количества наблюдений по меткам (классам)

data = df[["MedAge", "Fert. Rate", "Density(P/Km²)"]].copy()

df_train, df_val, df_test = split_stratified_into_train_val_test(

print("Обучающая выборка: ", df_train.shape)

print("Контрольная выборка: ", df_val.shape)

print("Тестовая выборка: ", df_test.shape)
N.A.    34
19      14
28      12
43      11
32      11
42      10
18      10
20       9
30       8
38       7
26       7
40       7
22       7
31       6
34       6
24       6
17       6
44       5
29       5
41       5
33       5
21       5
45       5
23       4
37       4
36       4
25       4
27       4
39       3
46       3
35       3
47       2
48       1
15       1
16       1
Name: count, dtype: int64
In [1]:
from imblearn.over_sampling import ADASYN

ada = ADASYN()

print("Обучающая выборка: ", df_train.shape)

X_resampled, y_resampled = ada.fit_resample(df_train, df_train["Pclass"])
df_train_adasyn = pd.DataFrame(X_resampled)

print("Обучающая выборка после oversampling: ", df_train_adasyn.shape)

