88 lines
3.4 KiB
Python
88 lines
3.4 KiB
Python
import pandas as pd
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
import seaborn as sns
|
|
import sklearn
|
|
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge
|
|
|
|
|
|
from sklearn.preprocessing import (LabelEncoder,
|
|
StandardScaler,
|
|
MinMaxScaler,
|
|
RobustScaler)
|
|
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, learning_curve, ShuffleSplit
|
|
|
|
|
|
def str_features_to_numeric(data):
    """Label-encode every non-numeric column of a DataFrame.

    A column is left untouched when its dtype is one of the NumPy signed
    integer, unsigned integer or float dtypes listed below.  Any other column
    is cast to ``str`` and replaced by integer codes ``0..k-1``, assigned in
    sorted order of the column's distinct string values.

    Args:
        data: ``pandas.DataFrame`` to encode.  It is not modified.

    Returns:
        A new ``pandas.DataFrame`` with the same columns and index as
        ``data`` in which every non-numeric column holds integer codes.
    """
    # Kept as a tuple on purpose: membership is tested with ``==`` so that a
    # dtype object matches its string name (a set would compare hashes and
    # never match).  Unsigned integers are listed too, otherwise they would be
    # stringified and ordered lexicographically ("10" < "9").
    numeric_dtypes = (
        'int8', 'int16', 'int32', 'int64',
        'uint8', 'uint16', 'uint32', 'uint64',
        'float16', 'float32', 'float64',
    )

    # Work on a copy so the caller's frame is never mutated.
    encoded = data.copy()

    for col in encoded.columns:
        if encoded[col].dtype in numeric_dtypes:
            continue
        # sort=True numbers the labels in sorted order of the unique strings.
        codes, _ = pd.factorize(encoded[col].astype(str), sort=True)
        encoded[col] = codes

    return encoded
|
|
|
|
|
|
if __name__ == "__main__":
    # Regression experiment: the `trestbps` column is used as the target and
    # three linear models are compared on a held-out validation split.
    target_name = 'target'
    test_train_split_part = 0.2
    random_state = 42

    data = pd.read_csv("..//heart_disease_uci.csv")
    data[target_name] = data['trestbps']
    data = data.drop(columns=['id', 'dataset', 'trestbps'])

    # Rows with any missing value are discarded.
    data_wo_null = data.dropna()
    print(len(data_wo_null))

    dataset = data_wo_null.copy()  # original (unencoded) data

    encoded_data_wo_null = str_features_to_numeric(data_wo_null)
    print(len(encoded_data_wo_null))

    # Separate the target from the features BEFORE any scaling.  Keeping the
    # target column inside the feature matrix lets every model read the answer
    # directly and makes the reported scores meaningless.
    features = encoded_data_wo_null.drop(columns=[target_name])
    target = encoded_data_wo_null[target_name]

    train, valid, train_target, valid_target = train_test_split(
        features,
        target,
        test_size=test_train_split_part,
        random_state=random_state,
    )

    # Standardization: z = (x - mean) / standard deviation.
    # The scaler is fitted on the training part only, so no statistics of the
    # validation rows leak into training.
    scaler = StandardScaler()
    train = pd.DataFrame(scaler.fit_transform(train),
                         columns=features.columns, index=train.index)
    valid = pd.DataFrame(scaler.transform(valid),
                         columns=features.columns, index=valid.index)

    models = (
        ("LinearRegression", LinearRegression()),
        # Seeded so that repeated runs give the same coefficients.
        ("SGDRegressor", SGDRegressor(max_iter=1000, tol=1e-3,
                                      random_state=random_state)),
        ("Ridge", Ridge(alpha=1.0)),
    )

    for title, model in models:
        model.fit(train, train_target)

        print("---" * 15, f" {title} ", "---" * 15)
        # NOTE: for regressors score() is the R^2 coefficient of
        # determination, not a classification accuracy.
        print(f"Accuracy: {model.score(valid, valid_target)}")
        print(f"коэффициенты: {model.coef_}")
        print(f"Смещение относительно начала координат (bias): {model.intercept_}")
|
|
|