"""Train a cubic polynomial regression on ``sberbank_data.csv`` and plot it.

Running (or importing) this module loads the CSV, fits the model, prints the
R^2 score on the held-out rows and shows a scatter plot comparing the actual
and the predicted ``price_doc`` values.
"""

from matplotlib import pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

_DATA_PATH = 'sberbank_data.csv'
_FEATURE_COLUMNS = [
    'timestamp',
    'full_sq',
    'floor',
    'max_floor',
    'build_year',
    'num_room',
    'material',
    'kremlin_km',
]
_TARGET_COLUMN = 'price_doc'
# Predictions below this value are replaced by the mean training target.
_MIN_PREDICTION = 10000


def start():
    """Fit the model, print its R^2 score and show the comparison plot.

    Steps:
      1. Read ``sberbank_data.csv`` (indexed by ``id``) and select the
         feature columns and the ``price_doc`` target.
      2. Replace ``'NA'`` strings and missing values with 0.
      3. Reduce ``timestamp`` to its leading, dash-separated component
         (presumably the year -- confirm against the CSV format) and store
         it as a number.
      4. Hold out 1% of the rows, fit ``PolynomialFeatures(degree=3)``
         followed by ``LinearRegression`` on the rest.
      5. Replace predictions below ``_MIN_PREDICTION`` with the mean of the
         *training* targets, print the R^2 score and plot actual (green)
         versus predicted (red) values.

    Returns:
        None. Results are printed and displayed in a matplotlib window.
    """
    data = pd.read_csv(_DATA_PATH, index_col='id')
    x = data[_FEATURE_COLUMNS]
    y = data[[_TARGET_COLUMN]]

    # Both replace() and fillna() return new frames, so the selection taken
    # from ``data`` is never modified in place.
    x = x.replace('NA', 0).fillna(0)

    # Cast to str first: fillna(0) turns a missing timestamp into the integer
    # 0, which has no .split(). Convert the extracted component to a number
    # explicitly instead of handing string values to the estimator.
    leading_part = x['timestamp'].astype(str).str.split('-', n=1).str[0]
    # Drop and re-add so the column ends up last, as in the original layout.
    x = x.drop(columns='timestamp').assign(timestamp=pd.to_numeric(leading_part))

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.01, random_state=42,
    )

    poly = Pipeline([
        ('poly', PolynomialFeatures(degree=3)),
        ('linear', LinearRegression()),
    ])
    poly.fit(x_train, y_train)

    # Use only the training targets for the fallback value; taking the mean
    # over the whole dataset would leak held-out labels into the predictions
    # that are scored below.
    fallback = y_train[_TARGET_COLUMN].mean()
    y_predicted = poly.predict(x_test)
    y_predicted[y_predicted < _MIN_PREDICTION] = fallback

    print('Оценка обучения:')
    print(metrics.r2_score(y_test, y_predicted))

    positions = range(len(y_test))
    plt.figure(1, figsize=(16, 9))
    plt.title('Сравнение результатов обучения')
    plt.scatter(x=positions, y=y_test, c='g', s=5)
    plt.scatter(x=positions, y=y_predicted, c='r', s=5)
    plt.show()


# Kept unguarded on purpose: the script has always run the analysis at
# module level, and that side effect is preserved here.
start()