  1. Define the business goals. Business goals: (a) predict the insurance price; (b) assess how the policyholder's data affects the insurance price.
  2. Define the technical project goals for each business goal: (a) build a model that predicts the insurance price from the policyholder's data; (b) analyze which factors have the strongest influence on the final insurance price.
  3. Data preparation
In [592]:
import pandas as pd

# Load the dataset and report the number of rows
df = pd.read_csv("../dataset.csv")
print(df.shape[0])
2772

There is enough data for the noise to average out.

In [593]:
print("было ", df.shape[0])
for column in df.select_dtypes(include=['int', 'float']).columns:
    mean = df[column].mean()
    std_dev = df[column].std()
    print(column, mean, std_dev)
    
    lower_bound = mean - 3 * std_dev
    upper_bound = mean + 3 * std_dev
    
    df = df[(df[column] <= upper_bound) & (df[column] >= lower_bound)]
    
print("стало ", df.shape[0])
df = df.reset_index(drop=True)
было  2772
age 39.10966810966811 14.081459420836477
bmi 30.70134920634921 6.1294486949652205
children 1.1026753434562546 1.2157555494600176
charges 13325.498588795157 12200.175109274192
стало  2710

Outliers selected by the three-sigma rule were removed.
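The same filter can also be written as a small reusable helper; this is only a sketch, and the function name is illustrative:

def drop_outliers_3sigma(frame, column):
    # A row is kept when mean - 3*std <= value <= mean + 3*std for the given column
    mean, std = frame[column].mean(), frame[column].std()
    return frame[frame[column].between(mean - 3 * std, mean + 3 * std)]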

In [594]:
print(df.isnull().sum())
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

There are no missing values.

  4. Splitting the data into samples
In [595]:
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(df, test_size=0.3, random_state=52)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=52)

print(train_df.shape[0], val_df.shape[0], test_df.shape[0])
print(df.shape[0], train_df.shape[0] + val_df.shape[0] + test_df.shape[0])

test_df = test_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
train_df = train_df.reset_index(drop=True)
1897 406 407
2710 2710

The data was split into training (70%), validation (15%), and test (15%) sets.
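A quick check (a sketch reusing the frames above) that the shares come out to roughly 70/15/15: the two 15% parts follow from first holding out 30% and then splitting it in half, since 0.3 * 0.5 = 0.15.

# Print the share of the full dataset that each split contains
for name, part in (("train", train_df), ("val", val_df), ("test", test_df)):
    print(name, round(part.shape[0] / df.shape[0], 3))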

  5. Assessing whether the samples are balanced
In [596]:
import matplotlib.pyplot as plt
import seaborn as sns

def draw(data, title):
    sns.histplot(data['charges'], kde=True)
    plt.title(title)
    plt.show()
    
draw(train_df, 'Charge distribution in the training set')
draw(val_df, 'Charge distribution in the validation set')
draw(test_df, 'Charge distribution in the test set')
[Histograms of the charge distribution in the training, validation, and test sets]

6, 7. Feature engineering

In [597]:
print(df['region'].unique())
print(df['children'].unique())

from sklearn.preprocessing import OneHotEncoder
import numpy as np

encoder = OneHotEncoder(sparse_output=False, drop="first")

# Fit the encoder on the training set only and reuse it for the test and validation sets,
# so that all three splits get exactly the same set of encoded columns
encoded_values = encoder.fit_transform(train_df[["smoker", "sex", "region", "children"]])
encoded_columns = encoder.get_feature_names_out(["smoker", "sex", "region", "children"])
encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)
train_df = pd.concat([train_df, encoded_values_df], axis=1)

encoded_values = encoder.transform(test_df[["smoker", "sex", "region", "children"]])
encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)
test_df = pd.concat([test_df, encoded_values_df], axis=1)

encoded_values = encoder.transform(val_df[["smoker", "sex", "region", "children"]])
encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)
val_df = pd.concat([val_df, encoded_values_df], axis=1)

print(test_df.columns)
print(val_df.columns)
print(train_df.columns)
['southwest' 'southeast' 'northwest' 'northeast']
[0 1 3 2 4]
Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges',
       'smoker_yes', 'sex_male', 'region_northwest', 'region_southeast',
       'region_southwest', 'children_1', 'children_2', 'children_3',
       'children_4'],
      dtype='object')
Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges',
       'smoker_yes', 'sex_male', 'region_northwest', 'region_southeast',
       'region_southwest', 'children_1', 'children_2', 'children_3',
       'children_4'],
      dtype='object')
Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges',
       'smoker_yes', 'sex_male', 'region_northwest', 'region_southeast',
       'region_southwest', 'children_1', 'children_2', 'children_3',
       'children_4'],
      dtype='object')

One-hot encoding was applied to the sex, smoker, region, and children features; the resulting columns were added to each of the samples.
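For illustration, a toy example (hypothetical values, not taken from the dataset) of what OneHotEncoder(drop="first") produces: the first category of each feature becomes the implicit baseline and does not get a column of its own.

from sklearn.preprocessing import OneHotEncoder
import pandas as pd

toy = pd.DataFrame({"smoker": ["no", "yes", "no"], "region": ["southwest", "southeast", "northwest"]})
toy_encoder = OneHotEncoder(sparse_output=False, drop="first")
encoded = toy_encoder.fit_transform(toy)
print(toy_encoder.get_feature_names_out(["smoker", "region"]))  # smoker_yes, region_southeast, region_southwest
print(encoded)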

In [598]:
print('age', min(df['age']), max(df['age']))
print('bmi', min(df['bmi']), max(df['bmi']))

# Bin boundaries and labels for discretizing age and bmi
labels_age = ['young', 'middle-aged', 'old']
labels_bmi = ['underweight', 'normal weight', 'overweight']
bins_age = [0, 27, 45, 100]
bins_bmi = [0, 18.5, 25, 100]

# Apply the same binning to every split
for part in (train_df, val_df, test_df):
    part['age_category'] = pd.cut(part['age'], bins=bins_age, labels=labels_age)
    part['bmi_category'] = pd.cut(part['bmi'], bins=bins_bmi, labels=labels_bmi)

print(val_df['bmi_category'].value_counts())
print('========================')
print(test_df['bmi_category'].value_counts())
print('========================')
print(train_df['bmi_category'].value_counts())
age 18 64
bmi 15.96 49.06
bmi_category
overweight       335
normal weight     66
underweight        5
Name: count, dtype: int64
========================
bmi_category
overweight       332
normal weight     70
underweight        5
Name: count, dtype: int64
========================
bmi_category
overweight       1543
normal weight     324
underweight        30
Name: count, dtype: int64

The numeric features bmi (body mass index) and age were discretized into categories.
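As a quick illustration of how pd.cut treats the bin boundaries (a sketch with made-up bmi values): the intervals are right-closed by default, so a bmi of exactly 18.5 still falls into 'underweight' and 25.0 into 'normal weight'.

import pandas as pd

sample_bmi = pd.Series([17.0, 18.5, 25.0, 30.0])
print(pd.cut(sample_bmi, bins=[0, 18.5, 25, 100], labels=['underweight', 'normal weight', 'overweight']))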

In [599]:
# New binary feature: whether the policyholder has any children
for part in (train_df, test_df, val_df):
    part['parent_yes'] = (part['children'] > 0).astype(float)

print(train_df['parent_yes'].head(20))
print('========================')
print(test_df['parent_yes'].head(20))
print('========================')
print(val_df['parent_yes'].head(20))
0     1.0
1     1.0
2     0.0
3     0.0
4     1.0
5     0.0
6     0.0
7     0.0
8     1.0
9     1.0
10    0.0
11    0.0
12    1.0
13    0.0
14    0.0
15    0.0
16    1.0
17    1.0
18    1.0
19    0.0
Name: parent_yes, dtype: float64
========================
0     1.0
1     1.0
2     0.0
3     1.0
4     0.0
5     1.0
6     0.0
7     0.0
8     1.0
9     0.0
10    1.0
11    1.0
12    0.0
13    0.0
14    0.0
15    1.0
16    1.0
17    0.0
18    1.0
19    1.0
Name: parent_yes, dtype: float64
========================
0     1.0
1     0.0
2     1.0
3     1.0
4     1.0
5     0.0
6     0.0
7     1.0
8     0.0
9     1.0
10    0.0
11    1.0
12    1.0
13    0.0
14    1.0
15    1.0
16    1.0
17    0.0
18    0.0
19    1.0
Name: parent_yes, dtype: float64

The parent_yes (Parent) feature was engineered manually, based on whether the policyholder has children.

In [600]:
from sklearn import preprocessing

# Min-max scaling of age; the scaler is fit on the training set and reused for the other splits
scaler = preprocessing.MinMaxScaler()
train_df['age_norm'] = scaler.fit_transform(train_df[['age']])
test_df['age_norm'] = scaler.transform(test_df[['age']])
val_df['age_norm'] = scaler.transform(val_df[['age']])

print(test_df['age_norm'].head(10))
print('========================')
print(train_df['age_norm'].head(10))
print('========================')
print(val_df['age_norm'].head(10))
0    0.695652
1    0.500000
2    0.239130
3    0.565217
4    0.978261
5    0.304348
6    0.847826
7    0.630435
8    0.804348
9    0.173913
Name: age_norm, dtype: float64
========================
0    0.586957
1    0.413043
2    0.847826
3    0.956522
4    0.521739
5    0.869565
6    0.021739
7    0.130435
8    0.565217
9    0.500000
Name: age_norm, dtype: float64
========================
0    0.217391
1    0.173913
2    0.760870
3    0.108696
4    0.326087
5    0.065217
6    0.282609
7    0.260870
8    0.152174
9    0.565217
Name: age_norm, dtype: float64

The age feature was scaled using min-max normalization.
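For reference, min-max normalization maps x to (x - x_min) / (x_max - x_min); a hand computation (a sketch using the age range 18..64 reported above) reproduces the scaler's output:

age = 32
age_norm = (age - 18) / (64 - 18)
print(age_norm)  # ~0.3043, the same value MinMaxScaler produces for age 32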

In [601]:
# Standardization of age; note that here each split is standardized with its own mean and
# standard deviation, whereas in practice the scaler fitted on the training set would be reused
scaler = preprocessing.StandardScaler()
train_df['age_stand'] = scaler.fit_transform(train_df[['age']])
print(train_df['age_stand'].head(10))
print('========================')
test_df['age_stand'] = scaler.fit_transform(test_df[['age']])
print(test_df['age_stand'].head(10))
print('========================')
val_df['age_stand'] = scaler.fit_transform(val_df[['age']])
print(val_df['age_stand'].head(10))
0    0.414950
1   -0.153961
2    1.268317
3    1.623887
4    0.201608
5    1.339431
6   -1.434012
7   -1.078443
8    0.343836
9    0.130494
Name: age_stand, dtype: float64
========================
0    0.727622
1    0.101529
2   -0.733262
3    0.310226
4    1.631978
5   -0.524564
6    1.214583
7    0.518924
8    1.075451
9   -0.941960
Name: age_stand, dtype: float64
========================
0   -0.766548
1   -0.907530
2    0.995731
3   -1.119003
4   -0.414092
5   -1.259986
6   -0.555074
7   -0.625565
8   -0.978021
9    0.361310
Name: age_stand, dtype: float64

The age feature was scaled using standardization.
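For reference, standardization maps x to (x - mean) / std; a hand computation on the training ages (a sketch; note that StandardScaler divides by the population standard deviation, ddof=0) reproduces the values above:

mean = train_df['age'].mean()
std = train_df['age'].std(ddof=0)  # population standard deviation, as StandardScaler uses
print(((train_df['age'] - mean) / std).head(3))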

  8. Using Featuretools
In [602]:
import featuretools as ft

es = ft.EntitySet(id='insurance')

es = es.add_dataframe(dataframe_name="insurance_data", dataframe=df, index='index')

agg_primitives = ["sum", "mean", "median", "std", "max", "min", "count"]
trans_primitives = ["add_numeric", "multiply_numeric", "divide_numeric", "subtract_numeric"]

feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name='insurance_data',
    agg_primitives=agg_primitives,
    trans_primitives=trans_primitives,
    max_depth=2
)
feature_matrix
c:\ulstu\mii\AIM-PIbd-31-Barsukov-P-O\aimenv\Lib\site-packages\featuretools\entityset\entityset.py:1733: UserWarning: index index not found in dataframe, creating new integer column
  warnings.warn(
c:\ulstu\mii\AIM-PIbd-31-Barsukov-P-O\aimenv\Lib\site-packages\woodwork\type_sys\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
c:\ulstu\mii\AIM-PIbd-31-Barsukov-P-O\aimenv\Lib\site-packages\woodwork\type_sys\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
c:\ulstu\mii\AIM-PIbd-31-Barsukov-P-O\aimenv\Lib\site-packages\woodwork\type_sys\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
c:\ulstu\mii\AIM-PIbd-31-Barsukov-P-O\aimenv\Lib\site-packages\woodwork\type_sys\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
c:\ulstu\mii\AIM-PIbd-31-Barsukov-P-O\aimenv\Lib\site-packages\woodwork\type_sys\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
c:\ulstu\mii\AIM-PIbd-31-Barsukov-P-O\aimenv\Lib\site-packages\woodwork\type_sys\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
c:\ulstu\mii\AIM-PIbd-31-Barsukov-P-O\aimenv\Lib\site-packages\featuretools\synthesis\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created
  warnings.warn(
c:\ulstu\mii\AIM-PIbd-31-Barsukov-P-O\aimenv\Lib\site-packages\featuretools\synthesis\dfs.py:321: UnusedPrimitiveWarning: Some specified primitives were not used during DFS:
  agg_primitives: ['count', 'max', 'mean', 'median', 'min', 'std', 'sum']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.
  warnings.warn(warning_msg, UnusedPrimitiveWarning)
Out[602]:
age sex bmi children smoker region charges age + bmi age + charges age + children ... age * children bmi * charges bmi * children charges * children age - bmi age - charges age - children bmi - charges bmi - children charges - children
index
0 19 female 27.900 0 True southwest 16884.92400 46.900 16903.92400 19.0 ... 0.0 471089.379600 0.00 0.0000 -8.900 -16865.92400 19.0 -16857.02400 27.900 16884.92400
1 18 male 33.770 1 False southeast 1725.55230 51.770 1743.55230 19.0 ... 18.0 58271.901171 33.77 1725.5523 -15.770 -1707.55230 17.0 -1691.78230 32.770 1724.55230
2 28 male 33.000 3 False southeast 4449.46200 61.000 4477.46200 31.0 ... 84.0 146832.246000 99.00 13348.3860 -5.000 -4421.46200 25.0 -4416.46200 30.000 4446.46200
3 33 male 22.705 0 False northwest 21984.47061 55.705 22017.47061 33.0 ... 0.0 499157.405200 0.00 0.0000 10.295 -21951.47061 33.0 -21961.76561 22.705 21984.47061
4 32 male 28.880 0 False northwest 3866.85520 60.880 3898.85520 32.0 ... 0.0 111674.778176 0.00 0.0000 3.120 -3834.85520 32.0 -3837.97520 28.880 3866.85520
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2705 47 female 45.320 1 False southeast 8569.86180 92.320 8616.86180 48.0 ... 47.0 388386.136776 45.32 8569.8618 1.680 -8522.86180 46.0 -8524.54180 44.320 8568.86180
2706 21 female 34.600 0 False southwest 2020.17700 55.600 2041.17700 21.0 ... 0.0 69898.124200 0.00 0.0000 -13.600 -1999.17700 21.0 -1985.57700 34.600 2020.17700
2707 19 male 26.030 1 True northwest 16450.89470 45.030 16469.89470 20.0 ... 19.0 428216.789041 26.03 16450.8947 -7.030 -16431.89470 18.0 -16424.86470 25.030 16449.89470
2708 23 male 18.715 0 False northwest 21595.38229 41.715 21618.38229 23.0 ... 0.0 404157.579557 0.00 0.0000 4.285 -21572.38229 23.0 -21576.66729 18.715 21595.38229
2709 54 male 31.600 0 False southwest 9850.43200 85.600 9904.43200 54.0 ... 0.0 311273.651200 0.00 0.0000 22.400 -9796.43200 54.0 -9818.83200 31.600 9850.43200

2710 rows × 37 columns

Features were engineered with Featuretools.
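A minimal way to inspect what was generated (a sketch; feature_defs comes from the cell above):

print(len(feature_defs))       # number of generated feature definitions
for f in feature_defs[:5]:
    print(f.get_name())        # names such as 'age + bmi', 'bmi * charges', ...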

In [603]:
train_X = train_df.drop("charges", axis=1)
train_Y = train_df['charges']

test_X = test_df.drop("charges", axis=1)
test_Y = test_df['charges']

val_X = val_df.drop("charges", axis=1)
val_Y = val_df['charges']

train_X = train_X.drop(['smoker', 'sex', 'region', 'age_category', 'bmi_category'], axis=1)
test_X = test_X.drop(['smoker', 'sex', 'region', 'age_category', 'bmi_category'], axis=1)
val_X = val_X.drop(['smoker', 'sex', 'region', 'age_category', 'bmi_category'], axis=1)

print(train_X.columns)

from sklearn.linear_model import LinearRegression
import time 
from sklearn.metrics import mean_squared_error, r2_score

# Train a linear regression on the training set and time the fit
model = LinearRegression()

start_time = time.time()
model.fit(train_X, train_Y)
train_time = time.time() - start_time

# Evaluate on the validation set
val_predictions = model.predict(val_X)
mse = mean_squared_error(val_Y, val_predictions)
r2 = r2_score(val_Y, val_predictions)

print(train_time, mse, r2)

plt.scatter(val_Y, val_predictions, alpha=0.5)
plt.plot([val_Y.min(), val_Y.max()], [val_Y.min(), val_Y.max()], 'k--', lw=1)
plt.xlabel('Actual charges')
plt.ylabel('Predicted charges')
Index(['age', 'bmi', 'children', 'smoker_yes', 'sex_male', 'region_northwest',
       'region_southeast', 'region_southwest', 'children_1', 'children_2',
       'children_3', 'children_4', 'parent_yes', 'age_norm', 'age_stand'],
      dtype='object')
0.02249455451965332 33039788.648656577 0.7496335938888106
Out[603]:
Text(0, 0.5, 'Predicted charges')
[Scatter plot of predicted vs. actual charges on the validation set]

The trained model predicts charges below about two thousand fairly accurately; for higher charges it tends to under- or over-estimate them.
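A natural follow-up (a sketch that reuses the fitted model, the metrics imported above, and the test split) is to confirm the validation estimate on the held-out test set:

test_predictions = model.predict(test_X)
print(mean_squared_error(test_Y, test_predictions), r2_score(test_Y, test_predictions))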