MII/lec2.ipynb
2024-10-24 21:50:12 +04:00

52 KiB
Raw Permalink Blame History

Загрузка данных в DataFrame

In [13]:
import pandas as pd

df = pd.read_csv("data/population.csv", index_col="no")

df.info()

print(df.shape)

df.head()
<class 'pandas.core.frame.DataFrame'>
Index: 235 entries, 1 to 235
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Country (or dependency)  235 non-null    object
 1   Population 2020          235 non-null    object
 2   Yearly Change            235 non-null    object
 3   Net Change               235 non-null    object
 4   Density(P/Km²)           235 non-null    object
 5   Land Area (Km²)          235 non-null    object
 6   Migrants (net)           201 non-null    object
 7   Fert. Rate               235 non-null    object
 8   MedAge                   235 non-null    object
 9   Urban Pop %              235 non-null    object
 10  World Share              235 non-null    object
dtypes: object(11)
memory usage: 22.0+ KB
(235, 11)
Out[13]:
Country (or dependency) Population 2020 Yearly Change Net Change Density(P/Km²) Land Area (Km²) Migrants (net) Fert. Rate MedAge Urban Pop % World Share
no
1 China 1,439,323,776 0.39% 5,540,090 153 9,388,211 -348,399 1.7 38 61% 18.47%
2 India 1,380,004,385 0.99% 13,586,631 464 2,973,190 -532,687 2.2 28 35% 17.70%
3 United States 331,002,651 0.59% 1,937,734 36 9,147,420 954,806 1.8 38 83% 4.25%
4 Indonesia 273,523,615 1.07% 2,898,047 151 1,811,570 -98,955 2.3 30 56% 3.51%
5 Pakistan 220,892,340 2.00% 4,327,022 287 770,880 -233,379 3.6 23 35% 2.83%

Получение сведений о пропущенных данных

Типы пропущенных данных:

  • None - представление пустых данных в Python
  • NaN - представление пустых данных в Pandas
  • '' - пустая строка
In [6]:
# Количество пустых значений признаков
print(df.isnull().sum())

print()

# Есть ли пустые значения признаков
print(df.isnull().any())

print()

# Процент пустых значений признаков
for i in df.columns:
    null_rate = df[i].isnull().sum() / len(df) * 100
    if null_rate > 0:
        print(f"{i} процент пустых значений: %{null_rate:.2f}")
Country (or dependency)     0
Population 2020             0
Yearly Change               0
Net Change                  0
Density  (P/Km²)            0
Land Area (Km²)             0
Migrants (net)             34
Fert. Rate                  0
Med. Age                    0
Urban Pop %                 0
World Share                 0
dtype: int64

Country (or dependency)    False
Population 2020            False
Yearly Change              False
Net Change                 False
Density  (P/Km²)           False
Land Area (Km²)            False
Migrants (net)              True
Fert. Rate                 False
Med. Age                   False
Urban Pop %                False
World Share                False
dtype: bool

Migrants (net) процент пустых значений: %14.47
In [11]:
fillna_df = df.fillna(0)

print(fillna_df.shape) # размеры

print(fillna_df.isnull().any())

# Замена пустых данных на 0
df["MigrantsFill"] = df["Migrants (net)"].fillna(0)

# Замена пустых данных на медиану
df["MigrantsMedian"] = df["Migrants (net)"].fillna(df["Migrants (net)"].median())

df.tail()
(235, 12)
Country (or dependency)    False
Population 2020            False
Yearly Change              False
Net Change                 False
Density  (P/Km²)           False
Land Area (Km²)            False
Migrants (net)             False
Fert. Rate                 False
Med. Age                   False
Urban Pop %                False
World Share                False
MigrantsFill               False
dtype: bool
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[11], line 11
      8 df["MigrantsFill"] = df["Migrants (net)"].fillna(0)
     10 # Замена пустых данных на медиану
---> 11 df["MigrantsMedian"] = df["Migrants (net)"].fillna(df["Migrants (net)"].median())
     13 df.tail()

File c:\Users\1\Desktop\улгту\3 курс\МИИ\mai\.venv\Lib\site-packages\pandas\core\series.py:6559, in Series.median(self, axis, skipna, numeric_only, **kwargs)
   6551 @doc(make_doc("median", ndim=1))
   6552 def median(
   6553     self,
   (...)
   6557     **kwargs,
   6558 ):
-> 6559     return NDFrame.median(self, axis, skipna, numeric_only, **kwargs)

File c:\Users\1\Desktop\улгту\3 курс\МИИ\mai\.venv\Lib\site-packages\pandas\core\generic.py:12431, in NDFrame.median(self, axis, skipna, numeric_only, **kwargs)
  12424 def median(
  12425     self,
  12426     axis: Axis | None = 0,
   (...)
  12429     **kwargs,
  12430 ) -> Series | float:
> 12431     return self._stat_function(
  12432         "median", nanops.nanmedian, axis, skipna, numeric_only, **kwargs
  12433     )

File c:\Users\1\Desktop\улгту\3 курс\МИИ\mai\.venv\Lib\site-packages\pandas\core\generic.py:12377, in NDFrame._stat_function(self, name, func, axis, skipna, numeric_only, **kwargs)
  12373 nv.validate_func(name, (), kwargs)
  12375 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
> 12377 return self._reduce(
  12378     func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
  12379 )

File c:\Users\1\Desktop\улгту\3 курс\МИИ\mai\.venv\Lib\site-packages\pandas\core\series.py:6457, in Series._reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
   6452     # GH#47500 - change to TypeError to match other methods
   6453     raise TypeError(
   6454         f"Series.{name} does not allow {kwd_name}={numeric_only} "
   6455         "with non-numeric dtypes."
   6456     )
-> 6457 return op(delegate, skipna=skipna, **kwds)

File c:\Users\1\Desktop\улгту\3 курс\МИИ\mai\.venv\Lib\site-packages\pandas\core\nanops.py:147, in bottleneck_switch.__call__.<locals>.f(values, axis, skipna, **kwds)
    145         result = alt(values, axis=axis, skipna=skipna, **kwds)
    146 else:
--> 147     result = alt(values, axis=axis, skipna=skipna, **kwds)
    149 return result

File c:\Users\1\Desktop\улгту\3 курс\МИИ\mai\.venv\Lib\site-packages\pandas\core\nanops.py:787, in nanmedian(values, axis, skipna, mask)
    785     inferred = lib.infer_dtype(values)
    786     if inferred in ["string", "mixed"]:
--> 787         raise TypeError(f"Cannot convert {values} to numeric")
    788 try:
    789     values = values.astype("f8")

TypeError: Cannot convert ['-348,399' '-532,687' '954,806' '-98,955' '-233,379' '21,200' '-60,000'
 '-369,501' '182,456' '-60,000' '71,560' '30,000' '-67,152' '-38,033'
 '-80,000' '23,861' '283,922' '-55,000' '543,822' '19,444' '260,650'
 '36,527' '148,943' '-40,076' '145,405' '-163,313' '-10,000' '11,731'
 '204,796' '40,000' '168,694' '4,800' '-10,000' '-50,000' '10,000' '7,834'
 '-62,920' '-29,395' '242,032' '-51,419' '134,979' '-8,863' '99,069'
 '6,413' '50,000' '-5,000' '-10,000' '-30,000' '41,710' '-653,249'
 '-1,500' '-4,800' '-8,000' '-5,403' '158,246' '4,000' '30,001' '-97,986'
 '-25,000' '-40,000' '-73,999' '-16,053' '111,708' '-18,000' '-8,000'
 '-9,215' '36,400' '-427,391' '16,000' '-20,000' '-30,000' '2,000'
 '-40,000' '-116,858' '-4,000' '-9,000' '-2,000' '2,001' '-4,000' '-9,504'
 '48,000' '-35,000' '-14,400' '-174,200' '-30,000' '22,011' '-16,000'
 '10,220' '-6,000' '1,200' '40,000' '-6,800' '40,000' '6,000' '-20,000'
 '8,730' '65,000' '-800' '4,000' '10,000' '52,000' '-2,000' '-4,200'
 '29,308' '-14,704' '-16,556' '-4,800' '-1,999' '-30,012' '-21,272'
 '-4,000' '-40,539' '-5,000' '27,028' '15,200' '14,000' '-4,000' '1,485'
 '28,000' '87,400' '-10,563' '4,200' '-5,000' '23,604' '-40,000' '14,881'
 '5,000' '11,200' '39,520' '-8,001' '-1,387' '-10,000' '-39,858' '-3,000'
 '-21,585' '-852' '-4,998' '-11,332' '40,000' '-14,000' '-97,986'
 '-32,780' '-4,806' '-3,087' '3,000' '3,260' '-10,047' '-1,000' '2,000'
 '-1,399' '-14,837' '47,800' '16,000' '-800' '3,911' '-5,385' '0' '5,000'
 '-8,353' '900' '-6,202' '-1,256' '-2,000' '-6,000' '320' '-1,600' '5,000'
 '-480' '9,741' '5,582' '-1,000' '-1,342' '-2,957' '11,370' '900' '0'
 '-1,440' '1,200' '1,000' '-960' '380' '120' '1,200' '-79' '502' '-1,000'
 '0' '-1,680' '-2,803' '0' '1,351' '-506' '515' '-800' '-200' '-200' '201'
 '-800' '-451' '-200' '0' nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan] to numeric
In [13]:
df["MigrantsCopy"] = df["Migrants (net)"]

# Замена данных сразу в DataFrame без копирования
df.fillna({"MigrantsCopy": 0}, inplace=True)

df.tail()
Out[13]:
Country (or dependency) Population 2020 Yearly Change Net Change Density (P/Km²) Land Area (Km²) Migrants (net) Fert. Rate Med. Age Urban Pop % World Share MigrantsFill MigrantsCopy
no
231 Montserrat 4,992 0.06% 3 50 100 NaN N.A. N.A. 10% 0.00% 0 0
232 Falkland Islands 3,480 3.05% 103 0 12,170 NaN N.A. N.A. 66% 0.00% 0 0
233 Niue 1,626 0.68% 11 6 260 NaN N.A. N.A. 46% 0.00% 0 0
234 Tokelau 1,357 1.27% 17 136 10 NaN N.A. N.A. 0% 0.00% 0 0
235 Holy See 801 0.25% 2 2,003 0 NaN N.A. N.A. N.A. 0.00% 0 0

Удаление наблюдений с пропусками

In [14]:
dropna_df = df.dropna()

print(dropna_df.shape)

print(fillna_df.isnull().any())
(201, 13)
Country (or dependency)    False
Population 2020            False
Yearly Change              False
Net Change                 False
Density  (P/Km²)           False
Land Area (Km²)            False
Migrants (net)             False
Fert. Rate                 False
Med. Age                   False
Urban Pop %                False
World Share                False
MigrantsFill               False
dtype: bool

Создание выборок данных

Библиотека scikit-learn

https://scikit-learn.org/stable/index.html

No description has been provided for this image
In [16]:
# Функция для создания выборок
from sklearn.model_selection import train_test_split


def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
):
    """
    Splits a Pandas dataframe into three subsets (train, val, and test)
    following fractional ratios provided by the user, where each subset is
    stratified by the values in a specific column (that is, each subset has
    the same relative frequency of the values in the column). It performs this
    splitting by running train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0.
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits.
    """

    if frac_train + frac_val + frac_test != 1.0:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test
In [17]:
# Вывод распределения количества наблюдений по меткам (классам)
print(df.MedAge.value_counts())

data = df[["MedAge", "Fert. Rate", "Density(P/Km²)"]].copy()

df_train, df_val, df_test = split_stratified_into_train_val_test(
    data,
    stratify_colname="MedAge",
    frac_train=0.60,
    frac_val=0.20,
    frac_test=0.20,
)

print("Обучающая выборка: ", df_train.shape)
print(df_train.MedAge.value_counts())

print("Контрольная выборка: ", df_val.shape)
print(df_val.MedAge.value_counts())

print("Тестовая выборка: ", df_test.shape)
print(df_test.MedAge.value_counts())
MedAge
N.A.    34
19      14
28      12
43      11
32      11
42      10
18      10
20       9
30       8
38       7
26       7
40       7
22       7
31       6
34       6
24       6
17       6
44       5
29       5
41       5
33       5
21       5
45       5
23       4
37       4
36       4
25       4
27       4
39       3
46       3
35       3
47       2
48       1
15       1
16       1
Name: count, dtype: int64
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[17], line 6
      2 print(df.MedAge.value_counts())
      4 data = df[["MedAge", "Fert. Rate", "Density(P/Km²)"]].copy()
----> 6 df_train, df_val, df_test = split_stratified_into_train_val_test(
      7     data,
      8     stratify_colname="MedAge",
      9     frac_train=0.60,
     10     frac_val=0.20,
     11     frac_test=0.20,
     12 )
     14 print("Обучающая выборка: ", df_train.shape)
     15 print(df_train.MedAge.value_counts())

Cell In[16], line 57, in split_stratified_into_train_val_test(df_input, stratify_colname, frac_train, frac_val, frac_test, random_state)
     52 y = df_input[
     53     [stratify_colname]
     54 ]  # Dataframe of just the column on which to stratify.
     56 # Split original dataframe into train and temp dataframes.
---> 57 df_train, df_temp, y_train, y_temp = train_test_split(
     58     X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
     59 )
     61 # Split the temp dataframe into val and test dataframes.
     62 relative_frac_test = frac_test / (frac_val + frac_test)

File c:\Users\1\Desktop\улгту\3 курс\МИИ\mai\.venv\Lib\site-packages\sklearn\utils\_param_validation.py:213, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    207 try:
    208     with config_context(
    209         skip_parameter_validation=(
    210             prefer_skip_nested_validation or global_skip_validation
    211         )
    212     ):
--> 213         return func(*args, **kwargs)
    214 except InvalidParameterError as e:
    215     # When the function is just a wrapper around an estimator, we allow
    216     # the function to delegate validation to the estimator, but we replace
    217     # the name of the estimator by the name of the function in the error
    218     # message to avoid confusion.
    219     msg = re.sub(
    220         r"parameter of \w+ must be",
    221         f"parameter of {func.__qualname__} must be",
    222         str(e),
    223     )

File c:\Users\1\Desktop\улгту\3 курс\МИИ\mai\.venv\Lib\site-packages\sklearn\model_selection\_split.py:2806, in train_test_split(test_size, train_size, random_state, shuffle, stratify, *arrays)
   2802         CVClass = ShuffleSplit
   2804     cv = CVClass(test_size=n_test, train_size=n_train, random_state=random_state)
-> 2806     train, test = next(cv.split(X=arrays[0], y=stratify))
   2808 train, test = ensure_common_namespace_device(arrays[0], train, test)
   2810 return list(
   2811     chain.from_iterable(
   2812         (_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays
   2813     )
   2814 )

File c:\Users\1\Desktop\улгту\3 курс\МИИ\mai\.venv\Lib\site-packages\sklearn\model_selection\_split.py:1843, in BaseShuffleSplit.split(self, X, y, groups)
   1813 """Generate indices to split data into training and test set.
   1814 
   1815 Parameters
   (...)
   1840 to an integer.
   1841 """
   1842 X, y, groups = indexable(X, y, groups)
-> 1843 for train, test in self._iter_indices(X, y, groups):
   1844     yield train, test

File c:\Users\1\Desktop\улгту\3 курс\МИИ\mai\.venv\Lib\site-packages\sklearn\model_selection\_split.py:2252, in StratifiedShuffleSplit._iter_indices(self, X, y, groups)
   2250 class_counts = np.bincount(y_indices)
   2251 if np.min(class_counts) < 2:
-> 2252     raise ValueError(
   2253         "The least populated class in y has only 1"
   2254         " member, which is too few. The minimum"
   2255         " number of groups for any class cannot"
   2256         " be less than 2."
   2257     )
   2259 if n_train < n_classes:
   2260     raise ValueError(
   2261         "The train_size = %d should be greater or "
   2262         "equal to the number of classes = %d" % (n_train, n_classes)
   2263     )

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
In [1]:
from imblearn.over_sampling import ADASYN

ada = ADASYN()

print("Обучающая выборка: ", df_train.shape)
print(df_train.Pclass.value_counts())

X_resampled, y_resampled = ada.fit_resample(df_train, df_train["Pclass"])
df_train_adasyn = pd.DataFrame(X_resampled)

print("Обучающая выборка после oversampling: ", df_train_adasyn.shape)
print(df_train_adasyn.Pclass.value_counts())

df_train_adasyn
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[1], line 1
----> 1 from imblearn.over_sampling import ADASYN
      3 ada = ADASYN()
      5 print("Обучающая выборка: ", df_train.shape)

ModuleNotFoundError: No module named 'imblearn'