MII_Salin_Oleg_PIbd-33/lec2unicorns.ipynb
2024-10-22 18:54:39 +04:00

23 KiB
Raw Blame History

Загрузка данных в DataFrame

In [76]:
import pandas as pd

# Load the raw table: the CSV is semicolon-separated and rows are keyed by company name.
df = pd.read_csv("data/unicorns.csv", sep=";", index_col="Company")

# Print the column overview while the money columns are still raw strings.
df.info()

# Valuation: cut the trailing four characters, turn the decimal comma into a
# dot and parse the remainder as a float.
df["Valuation"] = df["Valuation"].map(
    lambda raw: float(raw[:-4].replace(",", "."))
)

# TotalFunding: strip any leading/trailing "$" and "M" characters, remove the
# thousands separators and parse the remainder as a float.
df["TotalFunding"] = df["TotalFunding"].map(
    lambda raw: float(raw.strip("$M").replace(",", ""))
)

# Binary label: 1 when the company's country is China, 0 otherwise.
df["IsChina"] = [1 if country == "China" else 0 for country in df["Country"]]
print(df.shape)

df.head()
<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, Bytedance to iCapital Network
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Valuation            100 non-null    object
 1   Country              100 non-null    object
 2   State                79 non-null     object
 3   City                 99 non-null     object
 4   Industries           99 non-null     object
 5   FoundedYear          100 non-null    int64 
 6   Name of Founders     100 non-null    object
 7   TotalFunding         100 non-null    object
 8   Number of Employees  100 non-null    object
dtypes: int64(1), object(8)
memory usage: 7.8+ KB
(100, 10)
Out[76]:
Valuation Country State City Industries FoundedYear Name of Founders TotalFunding Number of Employees IsChina
Company
Bytedance 140.0 China Beijing Beijing Content, Data Mining, Internet 2012 Yiming Zhang 7440.00 10.000 1
SpaceX 100.3 United States California Hawthorne Aerospace, Manufacturing, Space Travel, Transp... 2002 Elon Musk 383.02 5,000-10,000 0
Stripe 95.0 United States California San Francisco Finance, FinTech, Mobile Payments, SaaS 2010 John Collison, Patrick Collison 300.00 1,000-5,000 0
Klarna 45.6 Sweden NaN Stockholm E-Commerce, FinTech, Payments, Shopping 2005 Niklas Adalberth, Sebastian Siemiatkowski, Vic... 3471.72 5,000-10,000 0
Epic Games 42.0 United States North Carolina Cary Developer Platform, Gaming, Software, Video Games 1991 Mark Rein, Tim Sweeney 544.93 1,000-5,000 0

Получение сведений о пропущенных данных

Типы пропущенных данных:

  • None - представление пустых данных в Python
  • NaN - представление пустых данных в Pandas
  • '' - пустая строка
In [77]:
# Boolean mask of missing cells, computed once and reused below.
missing_mask = df.isnull()

# Number of missing values per feature
missing_counts = missing_mask.sum()
print(missing_counts)

print()

# Whether each feature contains any missing value
print(missing_mask.any())

print()

# Percentage of missing values, reported only for features that have some
total_rows = len(df)
for column_name, n_missing in missing_counts.items():
    null_rate = n_missing / total_rows * 100
    if null_rate <= 0:
        continue
    print(f"{column_name} процент пустых значений: %{null_rate:.2f}")
Valuation               0
Country                 0
State                  21
City                    1
Industries              1
FoundedYear             0
Name of Founders        0
TotalFunding            0
Number of Employees     0
IsChina                 0
dtype: int64

Valuation              False
Country                False
State                   True
City                    True
Industries              True
FoundedYear            False
Name of Founders       False
TotalFunding           False
Number of Employees    False
IsChina                False
dtype: bool

State процент пустых значений: %21.00
City процент пустых значений: %1.00
Industries процент пустых значений: %1.00
In [78]:
# NOTE(review): disabled example. It refers to an "Age" column, which is not
# among the columns listed by df.info() for this dataset, so it cannot be
# enabled as is.

# fillna_df = df.fillna(0)

# print(fillna_df.shape)

# print(fillna_df.isnull().any())

# # Replace missing values with 0
# df["AgeFillNA"] = df["Age"].fillna(0)

# # Replace missing values with the median
# df["AgeFillMedian"] = df["Age"].fillna(df["Age"].median())

# df.tail()
In [79]:
# NOTE(review): disabled example. It refers to an "Age" column, which is not
# among the columns listed by df.info() for this dataset, so it cannot be
# enabled as is.

# df["AgeCopy"] = df["Age"]

# # Replace values directly in the DataFrame, without making a copy
# df.fillna({"AgeCopy": 0}, inplace=True)

# df.tail()

Удаление признаков (столбцов) с пропусками

In [80]:
# Drop every column (feature) that contains at least one missing value;
# all rows are kept.
df = df.dropna(axis="columns")

print(df.shape)

# Confirm that no remaining column has missing values.
print(df.isna().any())
(100, 7)
Valuation              False
Country                False
FoundedYear            False
Name of Founders       False
TotalFunding           False
Number of Employees    False
IsChina                False
dtype: bool

Создание выборок данных

Библиотека scikit-learn

https://scikit-learn.org/stable/index.html

No description has been provided for this image
In [81]:
# Функция для создания выборок
from sklearn.model_selection import train_test_split


def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
):
    """
    Split a Pandas dataframe into stratified train, val and test subsets.

    The split follows the fractional ratios provided by the user, and each
    subset is stratified by the values in a specific column (that is, each
    subset has the same relative frequency of the values in the column). The
    splitting is performed by running train_test_split() twice: first into
    train and a temporary remainder, then the remainder into val and test.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0 (checked with a small tolerance for rounding error).
    random_state : int, None, or RandomStateInstance
        Value to be passed to both train_test_split() calls.

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits. Each keeps all columns of
        df_input, including the stratification column.

    Raises
    ------
    ValueError
        If the fractions do not add up to 1.0, or if stratify_colname is not
        a column of df_input.
    """

    # Compare with a tolerance instead of `!=`: valid inputs such as
    # 0.7 + 0.2 + 0.1 evaluate to 0.9999999999999999 in binary floating point
    # and would otherwise be rejected.
    frac_total = frac_train + frac_val + frac_test
    if abs(frac_total - 1.0) > 1e-9:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # Split the temp dataframe into val and test dataframes. The test share is
    # re-expressed relative to the size of the temp dataframe.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    # Sanity check: every input row ended up in exactly one of the subsets.
    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test
In [82]:
# Show how the observations are distributed across the labels (classes)
print(df.IsChina.value_counts())

# Keep only the two numeric features and the label; copy so the splits do not
# share data with df.
data = df[["TotalFunding", "Valuation", "IsChina"]].copy()

# A fixed random_state makes the split identical on every run of the notebook.
df_train, df_val, df_test = split_stratified_into_train_val_test(
    data,
    stratify_colname="IsChina",
    frac_train=0.60,
    frac_val=0.20,
    frac_test=0.20,
    random_state=42,
)

print("Обучающая выборка: ", df_train.shape)
print(df_train.IsChina.value_counts())

print("Контрольная выборка: ", df_val.shape)
print(df_val.IsChina.value_counts())

print("Тестовая выборка: ", df_test.shape)
print(df_test.IsChina.value_counts())
IsChina
0    86
1    14
Name: count, dtype: int64
Обучающая выборка:  (60, 3)
IsChina
0    52
1     8
Name: count, dtype: int64
Контрольная выборка:  (20, 3)
IsChina
0    17
1     3
Name: count, dtype: int64
Тестовая выборка:  (20, 3)
IsChina
0    17
1     3
Name: count, dtype: int64
In [83]:
from imblearn.over_sampling import ADASYN

# A fixed random_state makes the generated synthetic samples reproducible.
ada = ADASYN(random_state=42)

print("Обучающая выборка: ", df_train.shape)
print(df_train.IsChina.value_counts())

# Resample on the feature columns only: the label must not be part of the
# feature matrix, otherwise it takes part in the nearest-neighbour distances
# the sampler computes.
features_train = df_train.drop(columns=["IsChina"])
X_resampled, y_resampled = ada.fit_resample(features_train, df_train["IsChina"])  # type: ignore

# Re-attach the resampled label by position so the result has the same columns
# as df_train (TotalFunding, Valuation, IsChina).
df_train_adasyn = pd.DataFrame(X_resampled).assign(IsChina=list(y_resampled))

print("Обучающая выборка после oversampling: ", df_train_adasyn.shape)
print(df_train_adasyn.IsChina.value_counts())

df_train_adasyn
Обучающая выборка:  (60, 3)
IsChina
0    52
1     8
Name: count, dtype: int64
Обучающая выборка после oversampling:  (105, 3)
IsChina
1    53
0    52
Name: count, dtype: int64
Out[83]:
TotalFunding Valuation IsChina
0 208.000000 9.500000 0
1 4044.200000 15.500000 1
2 447.120000 6.500000 0
3 2121.000000 6.600000 1
4 2686.010000 39.000000 0
... ... ... ...
100 1306.334794 14.179790 1
101 1492.220325 10.610196 1
102 1125.438822 16.887502 1
103 1728.312129 7.708914 1
104 1785.708076 7.004370 1

105 rows × 3 columns