AIM-PIbd-31-Yaruskin-S-A/lab_3/laba3.ipynb

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


df = pd.read_csv("..//static//csv//DiamondsPrices2022.csv", index_col="Unnamed: 0")

print(df.columns, "\n")

print(df, "\n")
Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object') 

       carat        cut color clarity  depth  table  price     x     y     z
1       0.23      Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43
2       0.21    Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31
3       0.23       Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31
4       0.29    Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63
5       0.31       Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75
...      ...        ...   ...     ...    ...    ...    ...   ...   ...   ...
53939   0.86    Premium     H     SI2   61.0   58.0   2757  6.15  6.12  3.74
53940   0.75      Ideal     D     SI2   62.2   55.0   2757  5.83  5.87  3.64
53941   0.71    Premium     E     SI1   60.5   55.0   2756  5.79  5.74  3.49
53942   0.71    Premium     F     SI1   59.8   62.0   2756  5.74  5.73  3.43
53943   0.70  Very Good     E     VS2   60.5   59.0   2757  5.71  5.76  3.47

[53943 rows x 10 columns] 

Business goals

  1. Pricing optimization. Analyze the relationship between a diamond's characteristics and its price, so that a stone's price can be set according to its attributes.
  2. Segmenting stones for different market segments. Depending on a stone's characteristics, position it as a more affordable or a premium product.

Technical project goals

For the first business goal

  1. Build a model that predicts the price of a diamond
  2. Analyze the factors that influence the price

For the second business goal

  1. Build a diamond clustering system

We need to identify what problems the data has. Let's start by looking for noise.

In [2]:
for column in df.select_dtypes(include=['float64', 'int64']).columns:
    plt.figure(figsize=(6, 4))
    sns.histplot(df[column], kde=True)
    plt.title(f'Noise in {column}')
    plt.show()
[Output: histograms with KDE for each numeric column]
In [3]:
for column in ['cut', 'color', 'clarity']:
    plt.figure(figsize=(6,4))
    sns.countplot(data=df, x=column)
    plt.title(f'Distribution of {column}')
    plt.show()
[Output: count plots for cut, color and clarity]

Looking for outliers

In [4]:
from scipy.stats import zscore

outliers = df[(zscore(df.select_dtypes(include=['float64', 'int64'])) > 3).any(axis=1)]
print(f"Number of outliers (z-score): {len(outliers)}")

# Replace only the high-end outliers (z-score > 3) with the column median
df_copy = df.copy()
for column in df.select_dtypes(include=['float64', 'int64']).columns:
    median = df[column].median()
    df_copy[column] = np.where(zscore(df[column]) > 3, median, df[column])

outliers_after = df_copy[(zscore(df_copy.select_dtypes(include=['float64', 'int64'])) > 3).any(axis=1)]
print(f"Number of outliers after replacing with the median: {len(outliers_after)}")
Number of outliers (z-score): 2077
Number of outliers after replacing with the median: 1532

Most likely these are genuine data points, so we remove only the most extreme ones and keep the rest.
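
As an alternative (a minimal sketch of my own, not used in this notebook), the IQR rule can cap extreme values on both sides instead of replacing only the high z-score values with the median:

# Sketch: IQR-based capping (winsorizing) of the numeric columns.
# Values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are clipped to the nearest bound.
df_iqr = df.copy()
for column in df_iqr.select_dtypes(include=['float64', 'int64']).columns:
    q1 = df_iqr[column].quantile(0.25)
    q3 = df_iqr[column].quantile(0.75)
    iqr = q3 - q1
    df_iqr[column] = df_iqr[column].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)

print(df_iqr.describe())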

Looking for correlations

In [5]:
correlations = df.select_dtypes(include=['float64', 'int64']).corr()['price'].sort_values(ascending=False)
print("Корреляция признаков с целевой переменной 'price':")
print(correlations)
Correlation of features with the target variable 'price':
price    1.000000
carat    0.921591
x        0.884433
y        0.865419
z        0.861249
table    0.127118
depth   -0.010630
Name: price, dtype: float64
In [6]:
plt.figure(figsize=(10, 6))
sns.heatmap(df.select_dtypes(include=['float64', 'int64']).corr(), annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Корреляционная матрица")
plt.show()
[Output: correlation-matrix heatmap of the numeric features]
In [7]:
df_dummies = pd.get_dummies(df, columns=['cut', 'color', 'clarity'], drop_first=True)

selected_columns = ['price'] + [col for col in df_dummies.columns if 'cut_' in col or 'color_' in col or 'clarity_' in col]
correlation_matrix = df_dummies[selected_columns].corr()

correlations = df_dummies[selected_columns].corr()['price'].sort_values(ascending=False)
print("Корреляция признаков с целевой переменной 'price' для 'cut', 'color', 'clarity':")
print(correlations)

plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Корреляционная матрица для 'cut', 'color', 'clarity' и 'price'")
plt.show()
Correlation of features with the target variable 'price' for 'cut', 'color', 'clarity':
price            1.000000
clarity_SI2      0.128427
color_I          0.097130
cut_Premium      0.095685
color_J          0.081714
color_H          0.059229
clarity_SI1      0.008940
color_G          0.008564
cut_Very Good    0.006589
cut_Good        -0.000307
clarity_VS2     -0.001066
clarity_VS1     -0.009879
color_F         -0.024166
clarity_IF      -0.049593
clarity_VVS2    -0.052375
clarity_VVS1    -0.095261
cut_Ideal       -0.097160
color_E         -0.101101
Name: price, dtype: float64
[Output: correlation-matrix heatmap for the one-hot encoded cut, color, clarity features and price]

Conclusions:
The features 'carat', 'x', 'y' and 'z' have a strong influence on price.
'table', 'cut', 'color' and 'clarity' have only a weak influence.
The 'depth' feature has practically no influence at all.
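
The one-hot correlations above can understate ordered categories, because each dummy captures only a single level. A minimal sketch (my own addition, reusing the ordinal mappings from the manual synthesis cell below) is to encode cut, color and clarity as ordered scores and look at their Spearman correlation with price:

# Sketch: ordinal-encode the ordered categories and check their rank correlation with price.
ordinal = pd.DataFrame({
    'cut_score': df['cut'].map({'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5}),
    'color_score': df['color'].map({'J': 1, 'I': 2, 'H': 3, 'G': 4, 'F': 5, 'E': 6, 'D': 7}),
    'clarity_score': df['clarity'].map({'I1': 1, 'SI2': 2, 'SI1': 3, 'VS2': 4, 'VS1': 5, 'VVS2': 6, 'VVS1': 7, 'IF': 8}),
    'price': df['price'],
})
print(ordinal.corr(method='spearman')['price'].sort_values(ascending=False))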

Now split the dataset into training, validation and test sets to avoid data leakage.

In [8]:
train_data, temp_data = train_test_split(df, test_size=0.3, random_state=42)
validation_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

train_size = len(train_data)
validation_size = len(validation_data)
test_size = len(test_data)

train_size, validation_size, test_size
Out[8]:
(37760, 8091, 8092)

Assess how balanced the splits are.

In [9]:
def plot_category_balance(train_data, validation_data, test_data, column, title):
    fig, ax = plt.subplots(1, 3, figsize=(18, 5), sharey=True)
    
    train_data[column].value_counts(normalize=True).plot(kind='bar', ax=ax[0])
    ax[0].set_title(f'Training {title}')
    ax[0].set_ylabel('Proportion')
    
    validation_data[column].value_counts(normalize=True).plot(kind='bar', ax=ax[1])
    ax[1].set_title(f'Validation {title}')
    
    test_data[column].value_counts(normalize=True).plot(kind='bar', ax=ax[2])
    ax[2].set_title(f'Test {title}')
    
    plt.suptitle(f'Category Balance for {title}')
    plt.show()

plot_category_balance(train_data, validation_data, test_data, 'carat', 'Carat')
plot_category_balance(train_data, validation_data, test_data, 'cut', 'Cut')
plot_category_balance(train_data, validation_data, test_data, 'color', 'Color')
plot_category_balance(train_data, validation_data, test_data, 'clarity', 'Clarity')
[Output: category balance bar charts for carat, cut, color and clarity across the three splits]
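
If the proportions needed to match the original data even more closely, the split itself could be stratified. A minimal sketch, assuming stratification on 'cut' (not what was done above):

# Sketch: 70/15/15 split that preserves the 'cut' class proportions in every part.
train_s, temp_s = train_test_split(df, test_size=0.3, random_state=42, stratify=df['cut'])
val_s, test_s = train_test_split(temp_s, test_size=0.5, random_state=42, stratify=temp_s['cut'])

print(train_s['cut'].value_counts(normalize=True))
print(val_s['cut'].value_counts(normalize=True))
print(test_s['cut'].value_counts(normalize=True))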

I will use oversampling to balance 'clarity'.

In [10]:
from imblearn.over_sampling import RandomOverSampler

print("До Oversampling: ", train_data['clarity'].value_counts())

ros = RandomOverSampler(sampling_strategy='auto', random_state=42)

X_train = train_data.drop(columns=['clarity'])  # all columns except 'clarity', i.e. the features used for prediction
y_train = train_data['clarity']  # the target column 'clarity', whose classes we want to balance

X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

train_data_resampled = X_resampled.copy()
train_data_resampled['clarity'] = y_resampled

print("После Oversampling: ", train_data_resampled['clarity'].value_counts())
Before oversampling:  clarity
SI1     9256
VS2     8590
SI2     6355
VS1     5694
VVS2    3551
VVS1    2547
IF      1254
I1       513
Name: count, dtype: int64
After oversampling:  clarity
VVS1    9256
SI1     9256
SI2     9256
VVS2    9256
VS2     9256
VS1     9256
IF      9256
I1      9256
Name: count, dtype: int64

Use undersampling for 'cut'.

In [11]:
from imblearn.under_sampling import RandomUnderSampler

print("До Undersampling: ", train_data_resampled['cut'].value_counts())

undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_train_cut = train_data_resampled.drop(columns=['cut'])
y_train_cut = train_data_resampled['cut']
X_resampled_cut, y_resampled_cut = undersampler.fit_resample(X_train_cut, y_train_cut)

train_data_resampled_cut = X_resampled_cut.copy()
train_data_resampled_cut['cut'] = y_resampled_cut

print("После Undersampling: ", train_data_resampled_cut['cut'].value_counts())
print("Столбцы после Undersampling:", train_data_resampled_cut.columns)
Before undersampling:  cut
Ideal        31497
Premium      17342
Very Good    14994
Good          6338
Fair          3877
Name: count, dtype: int64
After undersampling:  cut
Fair         3877
Good         3877
Ideal        3877
Premium      3877
Very Good    3877
Name: count, dtype: int64
Columns after undersampling: Index(['carat', 'color', 'depth', 'table', 'price', 'x', 'y', 'z', 'clarity',
       'cut'],
      dtype='object')

And increase the number of rows in the under-represented 'carat' ranges with oversampling.

In [12]:
train_data_resampled_cut['carat_binned'] = pd.cut(train_data_resampled_cut['carat'], bins=5, labels=False)

X_train_carat = train_data_resampled_cut.drop(columns=['carat_binned'])
y_train_carat = train_data_resampled_cut['carat_binned']

oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_resampled_carat, y_resampled_carat = oversampler.fit_resample(X_train_carat, y_train_carat)

train_data_resampled_carat = X_resampled_carat.copy()
train_data_resampled_carat['carat_binned'] = y_resampled_carat

train_data_resampled_carat['carat_binned'].value_counts().plot(kind='bar')
plt.title("Distribution of Carat Bins after Oversampling")
plt.xlabel("Carat Bin")
plt.ylabel("Frequency")
plt.show()

print("Столбцы после Oversampling:", train_data_resampled_carat.columns)
[Output: bar chart of the carat bin distribution after oversampling]
Columns after oversampling: Index(['carat', 'color', 'depth', 'table', 'price', 'x', 'y', 'z', 'clarity',
       'cut', 'carat_binned'],
      dtype='object')

Feature engineering

In [13]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler

categorical_features = ['cut', 'color', 'clarity']

encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_data = pd.DataFrame(encoder.fit_transform(train_data_resampled_carat[categorical_features]))
encoded_data.columns = encoder.get_feature_names_out(categorical_features)

train_data_encoded = pd.concat([train_data_resampled_carat.reset_index(drop=True), encoded_data], axis=1)

print(train_data_encoded.head())
   carat color  depth  table  price     x     y     z clarity   cut  ...  \
0   1.50     G   64.5   57.0  10352  7.15  7.09  4.59     SI1  Fair  ...   
1   0.60     G   65.7   55.0   1197  5.31  5.23  3.46     SI1  Fair  ...   
2   1.83     J   70.0   58.0   5083  7.34  7.28  5.12      I1  Fair  ...   
3   0.90     D   63.8   61.0   4252  6.07  5.99  3.85     SI1  Fair  ...   
4   0.71     G   65.7   56.0   2274  5.51  5.54  3.63     VS2  Fair  ...   

   color_H  color_I  color_J  clarity_IF  clarity_SI1  clarity_SI2  \
0      0.0      0.0      0.0         0.0          1.0          0.0   
1      0.0      0.0      0.0         0.0          1.0          0.0   
2      0.0      0.0      1.0         0.0          0.0          0.0   
3      0.0      0.0      0.0         0.0          1.0          0.0   
4      0.0      0.0      0.0         0.0          0.0          0.0   

   clarity_VS1  clarity_VS2  clarity_VVS1  clarity_VVS2  
0          0.0          0.0           0.0           0.0  
1          0.0          0.0           0.0           0.0  
2          0.0          0.0           0.0           0.0  
3          0.0          0.0           0.0           0.0  
4          0.0          1.0           0.0           0.0  

[5 rows x 28 columns]
In [14]:
num_bins = 5

train_data_encoded['carat_binned'] = pd.cut(train_data_encoded['carat'], bins=num_bins, labels=False)
train_data_encoded['depth_binned'] = pd.cut(train_data_encoded['depth'], bins=num_bins, labels=False)
train_data_encoded['table_binned'] = pd.cut(train_data_encoded['table'], bins=num_bins, labels=False)
train_data_encoded['price_binned'] = pd.cut(train_data_encoded['price'], bins=num_bins, labels=False)

print(train_data_encoded[['carat', 'carat_binned', 'depth', 'depth_binned', 
                         'table', 'table_binned', 'price', 'price_binned']].head())
   carat  carat_binned  depth  depth_binned  table  table_binned  price  \
0   1.50             1   64.5             2   57.0             1  10352   
1   0.60             0   65.7             3   55.0             1   1197   
2   1.83             1   70.0             3   58.0             2   5083   
3   0.90             0   63.8             2   61.0             2   4252   
4   0.71             0   65.7             3   56.0             1   2274   

   price_binned  
0             2  
1             0  
2             1  
3             1  
4             0  
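
pd.cut builds equal-width bins, so for a skewed column like price most rows end up in the lowest bins. A quantile-based alternative (a sketch, not applied above) would give bins of roughly equal size:

# Sketch: quantile-based binning of price into num_bins roughly equal-sized bins.
price_qbinned = pd.qcut(train_data_encoded['price'], q=num_bins, labels=False)
print(price_qbinned.value_counts().sort_index())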

Manual feature synthesis

In [15]:
data = train_data_encoded.copy()

data['price_per_carat'] = data['price'] / data['carat']  # note: derived from the target 'price'
data['volume'] = data['x'] * data['y'] * data['z']
data['surface_area'] = data['table'] * data['depth'] / 100

# Ordinal scores for the ordered categories plus an aggregate quality score
data['cut_score'] = data['cut'].map({'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5})
data['color_score'] = data['color'].map({'J': 1, 'I': 2, 'H': 3, 'G': 4, 'F': 5, 'E': 6, 'D': 7})
data['clarity_score'] = data['clarity'].map({'I1': 1, 'SI2': 2, 'SI1': 3, 'VS2': 4, 'VS1': 5, 'VVS2': 6, 'VVS1': 7, 'IF': 8})
data['quality_score'] = data['cut_score'] + data['color_score'] + data['clarity_score']

Feature scaling

In [16]:
features_to_scale = ['carat', 'price', 'price_per_carat', 'volume', 'surface_area', 'quality_score']

# Standardize the features (zero mean, unit variance)
scaler_standard = StandardScaler()
data_standardized = pd.DataFrame(scaler_standard.fit_transform(data[features_to_scale]), columns=[f"{col}_standard" for col in features_to_scale])

# Min-max normalize the features to [0, 1]
scaler_minmax = MinMaxScaler()
data_normalized = pd.DataFrame(scaler_minmax.fit_transform(data[features_to_scale]), columns=[f"{col}_norm" for col in features_to_scale])

data = pd.concat([data.reset_index(drop=True), data_standardized, data_normalized], axis=1)  # now combine them

print(data.head())
   carat color  depth  table  price     x     y     z clarity   cut  ...  \
0   1.50     G   64.5   57.0  10352  7.15  7.09  4.59     SI1  Fair  ...   
1   0.60     G   65.7   55.0   1197  5.31  5.23  3.46     SI1  Fair  ...   
2   1.83     J   70.0   58.0   5083  7.34  7.28  5.12      I1  Fair  ...   
3   0.90     D   63.8   61.0   4252  6.07  5.99  3.85     SI1  Fair  ...   
4   0.71     G   65.7   56.0   2274  5.51  5.54  3.63     VS2  Fair  ...   

   price_per_carat_standard  volume_standard  surface_area_standard  \
0                  1.486375        -0.636594               0.011271   
1                 -1.179201        -1.340819              -0.335758   
2                 -0.754022        -0.425704               2.123745   
3                  0.303687        -1.114517               1.197231   
4                 -0.523003        -1.264937               0.026144   

   quality_score_standard  carat_norm  price_norm  price_per_carat_norm  \
0                0.046561    0.302326    0.542508              0.348688   
1                0.046561    0.093023    0.047080              0.056256   
2               -1.288747    0.379070    0.257373              0.102901   
3                0.847745    0.162791    0.212403              0.218939   
4                0.313622    0.118605    0.105363              0.128245   

   volume_norm  surface_area_norm  quality_score_norm  
0     0.333140           0.391412            0.294118  
1     0.137573           0.373071            0.294118  
2     0.391705           0.503057            0.000000  
3     0.200418           0.454090            0.470588  
4     0.158646           0.392198            0.352941  

[5 rows x 50 columns]
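
Both scalers are fitted on the training-derived data only, which is the right side of the leakage boundary; when the validation and test sets are scaled later, the same fitted scalers should be reused rather than refitted. A minimal sketch, restricted to columns that exist in every split since the engineered features were only built for the training data:

# Sketch: fit the scaler on the training data, then only transform the other splits.
shared_features = ['carat', 'price']  # columns present in train, validation and test
scaler = StandardScaler().fit(data[shared_features])
validation_scaled = scaler.transform(validation_data[shared_features])
test_scaled = scaler.transform(test_data[shared_features])
print(validation_scaled[:3])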
In [17]:
import featuretools as ft

data = train_data_encoded.copy()  # use the preprocessed data (this re-assignment drops the manually engineered columns)

es = ft.EntitySet(id="diamonds")

es = es.add_dataframe(dataframe_name="diamonds_data", dataframe=data, index="index")

feature_matrix, feature_defs = ft.dfs(
    entityset=es, 
    target_dataframe_name="diamonds_data",
    max_depth=2
)

print(feature_matrix.head())
c:\Users\salih\OneDrive\Рабочий стол\3 курас\МИИ\laba1\AIM-PIbd-31-Yaruskin-S-A\aimenv\Lib\site-packages\featuretools\entityset\entityset.py:1733: UserWarning: index index not found in dataframe, creating new integer column
c:\Users\salih\OneDrive\Рабочий стол\3 курас\МИИ\laba1\AIM-PIbd-31-Yaruskin-S-A\aimenv\Lib\site-packages\woodwork\type_sys\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format. (this warning appeared six times)
c:\Users\salih\OneDrive\Рабочий стол\3 курас\МИИ\laba1\AIM-PIbd-31-Yaruskin-S-A\aimenv\Lib\site-packages\featuretools\synthesis\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created
       carat color  depth  table  price     x     y     z clarity   cut  ...  \
index                                                                    ...   
0       1.50     G   64.5   57.0  10352  7.15  7.09  4.59     SI1  Fair  ...   
1       0.60     G   65.7   55.0   1197  5.31  5.23  3.46     SI1  Fair  ...   
2       1.83     J   70.0   58.0   5083  7.34  7.28  5.12      I1  Fair  ...   
3       0.90     D   63.8   61.0   4252  6.07  5.99  3.85     SI1  Fair  ...   
4       0.71     G   65.7   56.0   2274  5.51  5.54  3.63     VS2  Fair  ...   

       clarity_IF  clarity_SI1  clarity_SI2  clarity_VS1  clarity_VS2  \
index                                                                   
0             0.0          1.0          0.0          0.0          0.0   
1             0.0          1.0          0.0          0.0          0.0   
2             0.0          0.0          0.0          0.0          0.0   
3             0.0          1.0          0.0          0.0          0.0   
4             0.0          0.0          0.0          0.0          1.0   

       clarity_VVS1  clarity_VVS2  depth_binned  table_binned  price_binned  
index                                                                        
0               0.0           0.0             2             1             2  
1               0.0           0.0             3             1             0  
2               0.0           0.0             3             2             1  
3               0.0           0.0             2             2             1  
4               0.0           0.0             3             1             0  

[5 rows x 31 columns]

Evaluate the quality of each feature set.
The comments in the code indicate what is being evaluated at each step.

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score
import time

categorical_features = ['cut', 'color', 'clarity']
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_data = pd.DataFrame(encoder.fit_transform(data[categorical_features]))
encoded_data.columns = encoder.get_feature_names_out(categorical_features)

# Note: train_data_encoded already contains one-hot columns from In [13], so this concat
# duplicates them (visible as repeated rows in the correlation matrix below).
data_encoded = pd.concat([data.drop(columns=categorical_features), encoded_data], axis=1)

X = data_encoded.drop(columns=['price'])  # features (price_binned, derived from price, is still among them)
y = data_encoded['price']  # target variable


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# 1. Predictive ability
model = LinearRegression()

cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
mean_mse = -np.mean(cv_scores)
print("Predictive ability (MSE):", mean_mse)


# 2. Computation speed
start_time = time.time()
model.fit(X_train, y_train)
train_time = time.time() - start_time

start_time = time.time()
y_pred = model.predict(X_test)
predict_time = time.time() - start_time

print("Скорость обучения:", train_time, "секунд")
print("Скорость предсказания:", predict_time, "секунд")

# 3. Reliability
std_mse = np.std(-cv_scores)
print("Reliability (MSE stability):", std_mse)

# 4. Feature correlation
correlation_matrix = X.corr()
print("Feature correlation matrix:\n", correlation_matrix)

# 5. Data integrity (missing values and summary statistics)
print("Missing values:\n", data.isnull().sum())
print("Data summary:\n", data.describe())
Predictive ability (MSE): 769447.425412744
Training time: 0.9312052726745605 seconds
Prediction time: 0.009320497512817383 seconds
Reliability (MSE stability): 9415.05041335384
Feature correlation matrix:
                   index     carat     depth     table         x         y  \
index          1.000000  0.918976  0.256560 -0.012994  0.881745  0.881997   
carat          0.918976  1.000000  0.317029  0.011338  0.965007  0.964633   
depth          0.256560  0.317029  1.000000 -0.523388  0.214227  0.212123   
table         -0.012994  0.011338 -0.523388  1.000000  0.068165  0.061429   
x              0.881745  0.965007  0.214227  0.068165  1.000000  0.998661   
y              0.881997  0.964633  0.212123  0.061429  0.998661  1.000000   
z              0.874093  0.962961  0.422732 -0.067003  0.966991  0.966578   
carat_binned   0.936744  0.980069  0.299586 -0.006563  0.954800  0.954348   
cut_Good      -0.197076 -0.178975 -0.244465  0.324476 -0.140993 -0.137115   
cut_Ideal     -0.266915 -0.293529 -0.159223 -0.225245 -0.308059 -0.303008   
cut_Premium   -0.106465 -0.150300 -0.286254  0.108195 -0.104058 -0.105738   
cut_Very Good -0.086479 -0.163226 -0.164405 -0.010754 -0.162451 -0.152861   
color_E       -0.174812 -0.199907 -0.120239  0.132333 -0.204134 -0.202557   
color_F       -0.253824 -0.269185 -0.048395 -0.001821 -0.268046 -0.268242   
color_G       -0.264394 -0.282227 -0.076425 -0.032431 -0.262968 -0.263331   
color_H        0.260031  0.235182  0.080885 -0.015329  0.229047  0.223693   
color_I        0.117366  0.102229 -0.113178  0.080852  0.144121  0.146060   
color_J        0.202485  0.292015  0.215944 -0.024922  0.241304  0.244249   
clarity_IF    -0.181797 -0.220384 -0.112810 -0.049651 -0.255666 -0.252669   
clarity_SI1   -0.212170 -0.222121 -0.108779  0.010751 -0.204318 -0.201694   
clarity_SI2   -0.162937 -0.168896 -0.141779  0.073969 -0.108325 -0.104168   
clarity_VS1   -0.202684 -0.220126 -0.116339  0.007504 -0.220583 -0.218858   
clarity_VS2   -0.190586 -0.206457 -0.112282  0.012328 -0.194913 -0.192109   
clarity_VVS1  -0.187562 -0.229651 -0.101446 -0.032463 -0.271730 -0.270343   
clarity_VVS2  -0.193781 -0.224968 -0.091511 -0.030425 -0.254022 -0.251780   
depth_binned   0.245417  0.295187  0.870892 -0.566471  0.205631  0.206883   
table_binned   0.074231  0.114544 -0.377101  0.817051  0.134186  0.131247   
price_binned   0.747669  0.815253  0.160678  0.042235  0.811691  0.817555   
cut_Good      -0.197076 -0.178975 -0.244465  0.324476 -0.140993 -0.137115   
cut_Ideal     -0.266915 -0.293529 -0.159223 -0.225245 -0.308059 -0.303008   
cut_Premium   -0.106465 -0.150300 -0.286254  0.108195 -0.104058 -0.105738   
cut_Very Good -0.086479 -0.163226 -0.164405 -0.010754 -0.162451 -0.152861   
color_E       -0.174812 -0.199907 -0.120239  0.132333 -0.204134 -0.202557   
color_F       -0.253824 -0.269185 -0.048395 -0.001821 -0.268046 -0.268242   
color_G       -0.264394 -0.282227 -0.076425 -0.032431 -0.262968 -0.263331   
color_H        0.260031  0.235182  0.080885 -0.015329  0.229047  0.223693   
color_I        0.117366  0.102229 -0.113178  0.080852  0.144121  0.146060   
color_J        0.202485  0.292015  0.215944 -0.024922  0.241304  0.244249   
clarity_IF    -0.181797 -0.220384 -0.112810 -0.049651 -0.255666 -0.252669   
clarity_SI1   -0.212170 -0.222121 -0.108779  0.010751 -0.204318 -0.201694   
clarity_SI2   -0.162937 -0.168896 -0.141779  0.073969 -0.108325 -0.104168   
clarity_VS1   -0.202684 -0.220126 -0.116339  0.007504 -0.220583 -0.218858   
clarity_VS2   -0.190586 -0.206457 -0.112282  0.012328 -0.194913 -0.192109   
clarity_VVS1  -0.187562 -0.229651 -0.101446 -0.032463 -0.271730 -0.270343   
clarity_VVS2  -0.193781 -0.224968 -0.091511 -0.030425 -0.254022 -0.251780   

                      z  carat_binned  cut_Good  cut_Ideal  ...   color_H  \
index          0.874093      0.936744 -0.197076  -0.266915  ...  0.260031   
carat          0.962961      0.980069 -0.178975  -0.293529  ...  0.235182   
depth          0.422732      0.299586 -0.244465  -0.159223  ...  0.080885   
table         -0.067003     -0.006563  0.324476  -0.225245  ... -0.015329   
x              0.966991      0.954800 -0.140993  -0.308059  ...  0.229047   
y              0.966578      0.954348 -0.137115  -0.303008  ...  0.223693   
z              1.000000      0.948784 -0.191932  -0.311613  ...  0.226114   
carat_binned   0.948784      1.000000 -0.175908  -0.285435  ...  0.273119   
cut_Good      -0.191932     -0.175908  1.000000  -0.131819  ... -0.135046   
cut_Ideal     -0.311613     -0.285435 -0.131819   1.000000  ... -0.071273   
cut_Premium   -0.161545     -0.138476 -0.180437  -0.141673  ... -0.101136   
cut_Very Good -0.178681     -0.160568 -0.158199  -0.124212  ... -0.066590   
color_E       -0.210177     -0.199353  0.180753   0.043481  ... -0.192503   
color_F       -0.256080     -0.267207  0.020500   0.079422  ... -0.199995   
color_G       -0.268521     -0.273979  0.029838   0.102871  ... -0.255060   
color_H        0.226114      0.273119 -0.135046  -0.071273  ...  1.000000   
color_I        0.102372      0.129941  0.064618  -0.038568  ... -0.309138   
color_J        0.279789      0.219450 -0.098352  -0.072643  ... -0.264988   
clarity_IF    -0.253221     -0.213292 -0.008462   0.203602  ... -0.054606   
clarity_SI1   -0.210001     -0.227317  0.081039   0.051568  ... -0.024638   
clarity_SI2   -0.130004     -0.162642  0.110280   0.019205  ... -0.037742   
clarity_VS1   -0.225411     -0.224820  0.045321   0.096886  ... -0.065186   
clarity_VS2   -0.200809     -0.208082  0.030034   0.081414  ... -0.071251   
clarity_VVS1  -0.266684     -0.221868  0.002891   0.158572  ... -0.052581   
clarity_VVS2  -0.248962     -0.224522  0.018263   0.129101  ... -0.068533   
depth_binned   0.393501      0.276906 -0.352815  -0.126332  ... -0.037413   
table_binned   0.034246      0.076420  0.218950  -0.309130  ... -0.059176   
price_binned   0.792153      0.798011 -0.109808  -0.163178  ...  0.151799   
cut_Good      -0.191932     -0.175908  1.000000  -0.131819  ... -0.135046   
cut_Ideal     -0.311613     -0.285435 -0.131819   1.000000  ... -0.071273   
cut_Premium   -0.161545     -0.138476 -0.180437  -0.141673  ... -0.101136   
cut_Very Good -0.178681     -0.160568 -0.158199  -0.124212  ... -0.066590   
color_E       -0.210177     -0.199353  0.180753   0.043481  ... -0.192503   
color_F       -0.256080     -0.267207  0.020500   0.079422  ... -0.199995   
color_G       -0.268521     -0.273979  0.029838   0.102871  ... -0.255060   
color_H        0.226114      0.273119 -0.135046  -0.071273  ...  1.000000   
color_I        0.102372      0.129941  0.064618  -0.038568  ... -0.309138   
color_J        0.279789      0.219450 -0.098352  -0.072643  ... -0.264988   
clarity_IF    -0.253221     -0.213292 -0.008462   0.203602  ... -0.054606   
clarity_SI1   -0.210001     -0.227317  0.081039   0.051568  ... -0.024638   
clarity_SI2   -0.130004     -0.162642  0.110280   0.019205  ... -0.037742   
clarity_VS1   -0.225411     -0.224820  0.045321   0.096886  ... -0.065186   
clarity_VS2   -0.200809     -0.208082  0.030034   0.081414  ... -0.071251   
clarity_VVS1  -0.266684     -0.221868  0.002891   0.158572  ... -0.052581   
clarity_VVS2  -0.248962     -0.224522  0.018263   0.129101  ... -0.068533   

                color_I   color_J  clarity_IF  clarity_SI1  clarity_SI2  \
index          0.117366  0.202485   -0.181797    -0.212170    -0.162937   
carat          0.102229  0.292015   -0.220384    -0.222121    -0.168896   
depth         -0.113178  0.215944   -0.112810    -0.108779    -0.141779   
table          0.080852 -0.024922   -0.049651     0.010751     0.073969   
x              0.144121  0.241304   -0.255666    -0.204318    -0.108325   
y              0.146060  0.244249   -0.252669    -0.201694    -0.104168   
z              0.102372  0.279789   -0.253221    -0.210001    -0.130004   
carat_binned   0.129941  0.219450   -0.213292    -0.227317    -0.162642   
cut_Good       0.064618 -0.098352   -0.008462     0.081039     0.110280   
cut_Ideal     -0.038568 -0.072643    0.203602     0.051568     0.019205   
cut_Premium    0.105213 -0.084217    0.012019     0.073734     0.076342   
cut_Very Good  0.062781 -0.077472    0.062992     0.077184     0.071627   
color_E       -0.146245 -0.125359    0.006989     0.048760     0.060037   
color_F       -0.151937 -0.130238    0.067814     0.026771     0.056315   
color_G       -0.193770 -0.166096    0.118431     0.009543     0.012232   
color_H       -0.309138 -0.264988   -0.054606    -0.024638    -0.037742   
color_I        1.000000 -0.201312   -0.056505    -0.029603    -0.010066   
color_J       -0.201312  1.000000   -0.042570    -0.032981    -0.082478   
clarity_IF    -0.056505 -0.042570    1.000000    -0.049155    -0.065849   
clarity_SI1   -0.029603 -0.032981   -0.049155     1.000000    -0.099840   
clarity_SI2   -0.010066 -0.082478   -0.065849    -0.099840     1.000000   
clarity_VS1   -0.008767 -0.024548   -0.041445    -0.062839    -0.084180   
clarity_VS2   -0.001448  0.005467   -0.044484    -0.067447    -0.090353   
clarity_VVS1  -0.038316 -0.052719   -0.033043    -0.050101    -0.067116   
clarity_VVS2  -0.051227 -0.036439   -0.034892    -0.052904    -0.070871   
depth_binned  -0.075038  0.318499   -0.079937    -0.107482    -0.137301   
table_binned   0.017798  0.061547   -0.052251     0.019997     0.046550   
price_binned   0.080443  0.194458   -0.116621    -0.094520     0.030333   
cut_Good       0.064618 -0.098352   -0.008462     0.081039     0.110280   
cut_Ideal     -0.038568 -0.072643    0.203602     0.051568     0.019205   
cut_Premium    0.105213 -0.084217    0.012019     0.073734     0.076342   
cut_Very Good  0.062781 -0.077472    0.062992     0.077184     0.071627   
color_E       -0.146245 -0.125359    0.006989     0.048760     0.060037   
color_F       -0.151937 -0.130238    0.067814     0.026771     0.056315   
color_G       -0.193770 -0.166096    0.118431     0.009543     0.012232   
color_H       -0.309138 -0.264988   -0.054606    -0.024638    -0.037742   
color_I        1.000000 -0.201312   -0.056505    -0.029603    -0.010066   
color_J       -0.201312  1.000000   -0.042570    -0.032981    -0.082478   
clarity_IF    -0.056505 -0.042570    1.000000    -0.049155    -0.065849   
clarity_SI1   -0.029603 -0.032981   -0.049155     1.000000    -0.099840   
clarity_SI2   -0.010066 -0.082478   -0.065849    -0.099840     1.000000   
clarity_VS1   -0.008767 -0.024548   -0.041445    -0.062839    -0.084180   
clarity_VS2   -0.001448  0.005467   -0.044484    -0.067447    -0.090353   
clarity_VVS1  -0.038316 -0.052719   -0.033043    -0.050101    -0.067116   
clarity_VVS2  -0.051227 -0.036439   -0.034892    -0.052904    -0.070871   

               clarity_VS1  clarity_VS2  clarity_VVS1  clarity_VVS2  
index            -0.202684    -0.190586     -0.187562     -0.193781  
carat            -0.220126    -0.206457     -0.229651     -0.224968  
depth            -0.116339    -0.112282     -0.101446     -0.091511  
table             0.007504     0.012328     -0.032463     -0.030425  
x                -0.220583    -0.194913     -0.271730     -0.254022  
y                -0.218858    -0.192109     -0.270343     -0.251780  
z                -0.225411    -0.200809     -0.266684     -0.248962  
carat_binned     -0.224820    -0.208082     -0.221868     -0.224522  
cut_Good          0.045321     0.030034      0.002891      0.018263  
cut_Ideal         0.096886     0.081414      0.158572      0.129101  
cut_Premium       0.058620     0.089749      0.029293      0.012928  
cut_Very Good     0.059445     0.073481      0.070278      0.087436  
color_E           0.028625     0.023490      0.045116      0.055769  
color_F           0.046575     0.032305      0.065653      0.059278  
color_G           0.064229     0.036578      0.072317      0.078458  
color_H          -0.065186    -0.071251     -0.052581     -0.068533  
color_I          -0.008767    -0.001448     -0.038316     -0.051227  
color_J          -0.024548     0.005467     -0.052719     -0.036439  
clarity_IF       -0.041445    -0.044484     -0.033043     -0.034892  
clarity_SI1      -0.062839    -0.067447     -0.050101     -0.052904  
clarity_SI2      -0.084180    -0.090353     -0.067116     -0.070871  
clarity_VS1       1.000000    -0.056868     -0.042242     -0.044606  
clarity_VS2      -0.056868     1.000000     -0.045340     -0.047877  
clarity_VVS1     -0.042242    -0.045340      1.000000     -0.035564  
clarity_VVS2     -0.044606    -0.047877     -0.035564      1.000000  
depth_binned     -0.094953    -0.096364     -0.082692     -0.080568  
table_binned      0.006478     0.018598     -0.041154     -0.040427  
price_binned     -0.097141    -0.070453     -0.143954     -0.134063  
cut_Good          0.045321     0.030034      0.002891      0.018263  
cut_Ideal         0.096886     0.081414      0.158572      0.129101  
cut_Premium       0.058620     0.089749      0.029293      0.012928  
cut_Very Good     0.059445     0.073481      0.070278      0.087436  
color_E           0.028625     0.023490      0.045116      0.055769  
color_F           0.046575     0.032305      0.065653      0.059278  
color_G           0.064229     0.036578      0.072317      0.078458  
color_H          -0.065186    -0.071251     -0.052581     -0.068533  
color_I          -0.008767    -0.001448     -0.038316     -0.051227  
color_J          -0.024548     0.005467     -0.052719     -0.036439  
clarity_IF       -0.041445    -0.044484     -0.033043     -0.034892  
clarity_SI1      -0.062839    -0.067447     -0.050101     -0.052904  
clarity_SI2      -0.084180    -0.090353     -0.067116     -0.070871  
clarity_VS1       1.000000    -0.056868     -0.042242     -0.044606  
clarity_VS2      -0.056868     1.000000     -0.045340     -0.047877  
clarity_VVS1     -0.042242    -0.045340      1.000000     -0.035564  
clarity_VVS2     -0.044606    -0.047877     -0.035564      1.000000  

[45 rows x 45 columns]
Missing values:
 index            0
carat            0
color            0
depth            0
table            0
price            0
x                0
y                0
z                0
clarity          0
cut              0
carat_binned     0
cut_Good         0
cut_Ideal        0
cut_Premium      0
cut_Very Good    0
color_E          0
color_F          0
color_G          0
color_H          0
color_I          0
color_J          0
clarity_IF       0
clarity_SI1      0
clarity_SI2      0
clarity_VS1      0
clarity_VS2      0
clarity_VVS1     0
clarity_VVS2     0
depth_binned     0
table_binned     0
price_binned     0
dtype: int64
Data summary:
               index         carat         depth         table         price  \
count  70315.000000  70315.000000  70315.000000  70315.000000  70315.000000   
mean   35157.000000      2.257318     63.299607     58.124889   9377.150124   
std    20298.336426      1.249989      3.131217      2.946560   5572.610635   
min        0.000000      0.200000     44.000000     43.000000    327.000000   
25%    17578.500000      1.200000     61.400000     56.000000   4916.000000   
50%    35157.000000      2.040000     63.300000     58.000000   9664.000000   
75%    52735.500000      3.110000     65.800000     60.000000  13945.000000   
max    70314.000000      4.500000     79.000000     79.000000  18806.000000   

                  x             y             z  carat_binned      cut_Good  \
count  70315.000000  70315.000000  70315.000000  70315.000000  70315.000000   
mean       7.939428      7.885353      5.016576      2.000000      0.143753   
std        1.696162      1.661164      1.150922      1.414224      0.350842   
min        0.000000      0.000000      0.000000      0.000000      0.000000   
25%        6.760000      6.740000      4.190000      1.000000      0.000000   
50%        8.050000      8.010000      5.140000      2.000000      0.000000   
75%        9.420000      9.340000      5.970000      3.000000      0.000000   
max       10.230000     10.160000      6.720000      4.000000      1.000000   

       ...    clarity_IF   clarity_SI1   clarity_SI2   clarity_VS1  \
count  ...  70315.000000  70315.000000  70315.000000  70315.000000   
mean   ...      0.031402      0.069359      0.117969      0.050316   
std    ...      0.174402      0.254066      0.322574      0.218599   
min    ...      0.000000      0.000000      0.000000      0.000000   
25%    ...      0.000000      0.000000      0.000000      0.000000   
50%    ...      0.000000      0.000000      0.000000      0.000000   
75%    ...      0.000000      0.000000      0.000000      0.000000   
max    ...      1.000000      1.000000      1.000000      1.000000   

        clarity_VS2  clarity_VVS1  clarity_VVS2  depth_binned  table_binned  \
count  70315.000000  70315.000000  70315.000000  70315.000000  70315.000000   
mean       0.057527      0.032582      0.036194      2.244059      1.599346   
std        0.232848      0.177541      0.186775      0.581360      0.537429   
min        0.000000      0.000000      0.000000      0.000000      0.000000   
25%        0.000000      0.000000      0.000000      2.000000      1.000000   
50%        0.000000      0.000000      0.000000      2.000000      2.000000   
75%        0.000000      0.000000      0.000000      3.000000      2.000000   
max        1.000000      1.000000      1.000000      4.000000      4.000000   

       price_binned  
count  70315.000000  
mean       1.960620  
std        1.454469  
min        0.000000  
25%        1.000000  
50%        2.000000  
75%        3.000000  
max        4.000000  

[8 rows x 29 columns]

In the end the MSE came out to 769447.43, which can be considered relatively high. In future work I will devote more time to preparing the data splits in order to improve prediction accuracy.
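
One way to sanity-check this number in later work (a sketch of my own, not run here) is a plain baseline on the untouched In [8] split, using only the raw numeric columns so that the resampling and the price-derived engineered features do not distort the error:

# Sketch: baseline linear regression on the original train/test split, numeric columns only.
numeric_cols = ['carat', 'depth', 'table', 'x', 'y', 'z']
baseline = LinearRegression().fit(train_data[numeric_cols], train_data['price'])
test_pred = baseline.predict(test_data[numeric_cols])
print("Baseline test MSE:", mean_squared_error(test_data['price'], test_pred))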