MII_Yunusov_Niyaz/notebooks/lec2.ipynb
2024-09-30 23:02:17 +04:00

50 KiB
Raw Permalink Blame History

Загрузка данных в DataFrame

In [83]:
import pandas as pd

df1 = pd.read_csv("../data/mobile_phone_price_prediction.csv", index_col="ID")
df1["Spec_score"] = df1["Spec_score"].replace({"": None})
df1["Rating_index"] = df1["Rating"].apply(lambda x: 1 if float(x) > 4.5 else 0)

df1.info()

print(df1.shape)

df1.head()
<class 'pandas.core.frame.DataFrame'>
Index: 1370 entries, 0 to 1369
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               1370 non-null   object 
 1   Rating             1370 non-null   float64
 2   Spec_score         1367 non-null   float64
 3   No_of_sim          1370 non-null   object 
 4   Ram                1370 non-null   object 
 5   Battery            1370 non-null   object 
 6   Display            1370 non-null   object 
 7   Camera             1370 non-null   object 
 8   External_Memory    1370 non-null   object 
 9   Android_version    927 non-null    object 
 10  Price              1370 non-null   object 
 11  company            1370 non-null   object 
 12  Inbuilt_memory     1351 non-null   object 
 13  fast_charging      1281 non-null   object 
 14  Screen_resolution  1368 non-null   object 
 15  Processor          1342 non-null   object 
 16  Processor_name     1370 non-null   object 
 17  Rating_index       1370 non-null   int64  
dtypes: float64(2), int64(1), object(15)
memory usage: 203.4+ KB
(1370, 18)
Out[83]:
Name Rating Spec_score No_of_sim Ram Battery Display Camera External_Memory Android_version Price company Inbuilt_memory fast_charging Screen_resolution Processor Processor_name Rating_index
ID
0 Samsung Galaxy F14 5G 4.65 68.0 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 6000 mAh Battery 6.6 inches 50 MP + 2 MP Dual Rear &amp; 13 MP Front Camera Memory Card Supported, upto 1 TB 13 9.999 Samsung 128 GB inbuilt 25W Fast Charging 2408 x 1080 px Display with Water Drop Notch Octa Core Processor Exynos 1330 1
1 Samsung Galaxy A11 4.20 63.0 Dual Sim, 3G, 4G, VoLTE, 2 GB RAM 4000 mAh Battery 6.4 inches 13 MP + 5 MP + 2 MP Triple Rear &amp; 8 MP Fro... Memory Card Supported, upto 512 GB 10 9,990 Samsung 32 GB inbuilt 15W Fast Charging 720 x 1560 px Display with Punch Hole 1.8 GHz Processor Octa Core 0
2 Samsung Galaxy A13 4.30 NaN Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 5000 mAh Battery 6.6 inches 50 MP Quad Rear &amp; 8 MP Front Camera Memory Card Supported, upto 1 TB 12 11,999 Samsung 64 GB inbuilt 25W Fast Charging 1080 x 2408 px Display with Water Drop Notch 2 GHz Processor Octa Core 0
3 Samsung Galaxy F23 4.10 73.0 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 6000 mAh Battery 6.4 inches 48 MP Quad Rear &amp; 13 MP Front Camera Memory Card Supported, upto 1 TB 12 11,999 Samsung 64 GB inbuilt NaN 720 x 1600 px Octa Core Helio G88 0
4 Samsung Galaxy A03s (4GB RAM + 64GB) 4.10 69.0 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 5000 mAh Battery 6.5 inches 13 MP + 2 MP + 2 MP Triple Rear &amp; 5 MP Fro... Memory Card Supported, upto 1 TB 11 11,999 Samsung 64 GB inbuilt 15W Fast Charging 720 x 1600 px Display with Water Drop Notch Octa Core Helio P35 0

Получение сведений о пропущенных данных

Типы пропущенных данных:

  • None - представление пустых данных в Python
  • NaN - представление пустых данных в Pandas
  • '' - пустая строка
In [68]:
# Количество пустых значений признаков
print(df1.isnull().sum())

print()

# Есть ли пустые значения признаков
print(df1.isnull().any())

print()

# Процент пустых значений признаков
for i in df1.columns:
    null_rate = df1[i].isnull().sum() / len(df1) * 100
    if null_rate > 0:
        print(f"{i} процент пустых значений: %{null_rate:.2f}")
Name                   0
Rating                 0
Spec_score             3
No_of_sim              0
Ram                    0
Battery                0
Display                0
Camera                 0
External_Memory        0
Android_version      443
Price                  0
company                0
Inbuilt_memory        19
fast_charging         89
Screen_resolution      2
Processor             28
Processor_name         0
company_index          0
dtype: int64

Name                 False
Rating               False
Spec_score            True
No_of_sim            False
Ram                  False
Battery              False
Display              False
Camera               False
External_Memory      False
Android_version       True
Price                False
company              False
Inbuilt_memory        True
fast_charging         True
Screen_resolution     True
Processor             True
Processor_name       False
company_index        False
dtype: bool

Spec_score процент пустых значений: %0.22
Android_version процент пустых значений: %32.34
Inbuilt_memory процент пустых значений: %1.39
fast_charging процент пустых значений: %6.50
Screen_resolution процент пустых значений: %0.15
Processor процент пустых значений: %2.04
In [86]:
fillna_df = df1.fillna(0)

print(fillna_df.shape)

print(fillna_df.isnull().any())

# Замена пустых данных на 0
df1["Spec_scoreFillNA"] = df1["Spec_score"].fillna(0)

# Замена пустых данных на медиану
df1["Spec_scoreFillMedian"] = df1["Spec_score"].fillna(df1["Spec_scoreFillNA"].median())

df1.tail()
(1370, 18)
Name                 False
Rating               False
Spec_score           False
No_of_sim            False
Ram                  False
Battery              False
Display              False
Camera               False
External_Memory      False
Android_version      False
Price                False
company              False
Inbuilt_memory       False
fast_charging        False
Screen_resolution    False
Processor            False
Processor_name       False
Rating_index         False
dtype: bool
Out[86]:
Name Rating Spec_score No_of_sim Ram Battery Display Camera External_Memory Android_version Price company Inbuilt_memory fast_charging Screen_resolution Processor Processor_name Rating_index Spec_scoreFillNA Spec_scoreFillMedian
ID
1365 TCL 40R 4.05 75.0 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 5000 mAh Battery 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear &amp; 8 MP Fro... Memory Card (Hybrid) 12 18,999 TCL 64 GB inbuilt 15W Fast Charging 720 x 1612 px Octa Core Dimensity 700 5G 0 75.0 75.0
1366 TCL 50 XL NxtPaper 5G 4.10 80.0 Dual Sim, 3G, 4G, VoLTE, 8 GB RAM 5000 mAh Battery 6.8 inches 50 MP + 2 MP Dual Rear &amp; 16 MP Front Camera Memory Card (Hybrid) 14 24,990 TCL 128 GB inbuilt 33W Fast Charging 1200 x 2400 px Octa Core Dimensity 7050 0 80.0 80.0
1367 TCL 50 XE NxtPaper 5G 4.00 80.0 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery 6.6 inches 50 MP + 2 MP Dual Rear &amp; 16 MP Front Camera Memory Card Supported, upto 1 TB 13 23,990 TCL 256 GB inbuilt 18W Fast Charging 720 x 1612 px Octa Core Dimensity 6080 0 80.0 80.0
1368 TCL 40 NxtPaper 5G 4.50 79.0 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear &amp; 8 MP Fro... Memory Card Supported, upto 1 TB 13 22,499 TCL 256 GB inbuilt 15W Fast Charging 720 x 1612 px Octa Core Dimensity 6020 0 79.0 79.0
1369 TCL Trifold 4.65 93.0 Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G, 12 GB RAM 4600 mAh Battery 10 inches Foldable Display, Dual Display 50 MP + 48 MP + 8 MP Triple Rear &amp; 32 MP F... 13 1,19,990 TCL 256 GB inbuilt 67W Fast Charging 1916 x 2160 px Octa Core Snapdragon 8 Gen2 1 93.0 93.0
In [70]:
df1["Android_versionCopy"] = df1["Android_version"]

# Замена данных сразу в DataFrame без копирования
df1.fillna({"Android_versionCopy": 0}, inplace=True)

df1.tail()
Out[70]:
Name Rating Spec_score No_of_sim Ram Battery Display Camera External_Memory Android_version ... company Inbuilt_memory fast_charging Screen_resolution Processor Processor_name company_index Spec_scoreFillNA Spec_scoreFillMedian Android_versionCopy
ID
1365 TCL 40R 4.05 75.0 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 5000 mAh Battery 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear &amp; 8 MP Fro... Memory Card (Hybrid) 12 ... TCL 64 GB inbuilt 15W Fast Charging 720 x 1612 px Octa Core Dimensity 700 5G TCL 75.0 75.0 12
1366 TCL 50 XL NxtPaper 5G 4.10 80.0 Dual Sim, 3G, 4G, VoLTE, 8 GB RAM 5000 mAh Battery 6.8 inches 50 MP + 2 MP Dual Rear &amp; 16 MP Front Camera Memory Card (Hybrid) 14 ... TCL 128 GB inbuilt 33W Fast Charging 1200 x 2400 px Octa Core Dimensity 7050 TCL 80.0 80.0 14
1367 TCL 50 XE NxtPaper 5G 4.00 80.0 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery 6.6 inches 50 MP + 2 MP Dual Rear &amp; 16 MP Front Camera Memory Card Supported, upto 1 TB 13 ... TCL 256 GB inbuilt 18W Fast Charging 720 x 1612 px Octa Core Dimensity 6080 TCL 80.0 80.0 13
1368 TCL 40 NxtPaper 5G 4.50 79.0 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear &amp; 8 MP Fro... Memory Card Supported, upto 1 TB 13 ... TCL 256 GB inbuilt 15W Fast Charging 720 x 1612 px Octa Core Dimensity 6020 TCL 79.0 79.0 13
1369 TCL Trifold 4.65 93.0 Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G, 12 GB RAM 4600 mAh Battery 10 inches Foldable Display, Dual Display 50 MP + 48 MP + 8 MP Triple Rear &amp; 32 MP F... 13 ... TCL 256 GB inbuilt 67W Fast Charging 1916 x 2160 px Octa Core Snapdragon 8 Gen2 TCL 93.0 93.0 13

5 rows × 21 columns

Удаление наблюдений с пропусками

In [71]:
dropna_df = df1.dropna()

print(dropna_df.shape)

print(fillna_df.isnull().any())
(814, 21)
Name                 False
Rating               False
Spec_score           False
No_of_sim            False
Ram                  False
Battery              False
Display              False
Camera               False
External_Memory      False
Android_version      False
Price                False
company              False
Inbuilt_memory       False
fast_charging        False
Screen_resolution    False
Processor            False
Processor_name       False
company_index        False
dtype: bool

Создание выборок данных

Библиотека scikit-learn

https://scikit-learn.org/stable/index.html

No description has been provided for this image
In [72]:
# Функция для создания выборок
from sklearn.model_selection import train_test_split


def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
):
    """
    Splits a Pandas dataframe into three subsets (train, val, and test)
    following fractional ratios provided by the user, where each subset is
    stratified by the values in a specific column (that is, each subset has
    the same relative frequency of the values in the column). It performs this
    splitting by running train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0.
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits.
    """

    if frac_train + frac_val + frac_test != 1.0:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test
In [89]:
# Вывод распределения количества наблюдений по меткам (классам)
print(df1.Rating_index.value_counts())

data = df1[["Rating_index", "Spec_scoreFillMedian"]].copy()

df_train, df_val, df_test = split_stratified_into_train_val_test(
    data,
    stratify_colname="Rating_index",
    frac_train=0.60,
    frac_val=0.20,
    frac_test=0.20,
)

print("Обучающая выборка: ", df_train.shape)
print(df_train.Rating_index.value_counts())

print("Контрольная выборка: ", df_val.shape)
print(df_val.Rating_index.value_counts())

print("Тестовая выборка: ", df_test.shape)
print(df_test.Rating_index.value_counts())
Rating_index
0    942
1    428
Name: count, dtype: int64
Обучающая выборка:  (822, 2)
Rating_index
0    565
1    257
Name: count, dtype: int64
Контрольная выборка:  (274, 2)
Rating_index
0    189
1     85
Name: count, dtype: int64
Тестовая выборка:  (274, 2)
Rating_index
0    188
1     86
Name: count, dtype: int64
In [90]:
from imblearn.over_sampling import ADASYN

ada = ADASYN()

print("Обучающая выборка: ", df_train.shape)
print(df_train.Rating_index.value_counts())

X_resampled, y_resampled = ada.fit_resample(df_train, df_train["Rating_index"])
df_train_adasyn = pd.DataFrame(X_resampled)

print("Обучающая выборка после oversampling: ", df_train_adasyn.shape)
print(df_train_adasyn.Rating_index.value_counts())

df_train_adasyn
Обучающая выборка:  (822, 2)
Rating_index
0    565
1    257
Name: count, dtype: int64
Обучающая выборка после oversampling:  (1127, 2)
Rating_index
0    565
1    562
Name: count, dtype: int64
Out[90]:
Rating_index Spec_scoreFillMedian
0 0 75.000000
1 0 83.000000
2 0 76.000000
3 0 80.000000
4 0 65.000000
... ... ...
1122 1 64.392639
1123 1 63.805529
1124 1 64.607184
1125 1 64.688453
1126 1 64.376356

1127 rows × 2 columns