49 lines
1.5 KiB
Python
49 lines
1.5 KiB
Python
|
from matplotlib import pyplot as plt
|
||
|
from sklearn import metrics
|
||
|
from sklearn.linear_model import LinearRegression
|
||
|
from sklearn.model_selection import train_test_split
|
||
|
from sklearn.preprocessing import PolynomialFeatures
|
||
|
from sklearn.pipeline import Pipeline
|
||
|
import pandas as pd
|
||
|
|
||
|
|
||
|
def start(csv_path='sberbank_data.csv', degree=3, test_size=0.01, min_price=10000):
    """Fit a polynomial regression on the housing CSV, print its R^2 and plot it.

    The CSV is read with ``id`` as the index. The feature matrix is built from
    a fixed list of columns plus the leading component of ``timestamp``
    (the text before the first ``-``; presumably the sale year of a
    ``YYYY-MM-DD`` date -- confirm against the data file). The target is
    ``price_doc``. A ``PolynomialFeatures`` + ``LinearRegression`` pipeline
    is fitted on the training split, the R^2 score on the held-out split is
    printed, and a scatter plot of actual (green) versus predicted (red)
    target values is shown.

    Args:
        csv_path: Path of the CSV file to load.
        degree: Degree passed to ``PolynomialFeatures``.
        test_size: Share of rows held out by ``train_test_split``.
        min_price: Predictions below this value are replaced by the mean
            ``price_doc`` of the training split.

    Returns:
        None. Results are printed and plotted.
    """
    data = pd.read_csv(csv_path, index_col='id')

    numeric_columns = ['full_sq', 'floor', 'max_floor', 'build_year',
                       'num_room', 'material', 'kremlin_km']
    # replace()/fillna() return new frames, so `data` itself is never mutated
    # and no chained-assignment warning is triggered.
    x = data[numeric_columns].replace('NA', 0).fillna(0)

    # Keep only the part of the timestamp before the first '-' and store it as
    # an integer, so the model receives a numeric column instead of relying on
    # an implicit string-to-float conversion. The column is appended last
    # under its original name.
    x['timestamp'] = data['timestamp'].str.split('-', n=1).str[0].astype(int)

    y = data[['price_doc']]

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=42)

    model = Pipeline([
        ('poly', PolynomialFeatures(degree=degree)),
        ('linear', LinearRegression()),
    ])
    model.fit(x_train, y_train)

    y_predicted = model.predict(x_test)

    # Implausibly low predictions fall back to the mean price. The mean is
    # taken from the training targets only, so no information about the
    # held-out rows leaks into the predictions being scored.
    fallback_price = y_train['price_doc'].mean()
    y_predicted[y_predicted < min_price] = fallback_price

    print('Оценка обучения:')
    print(metrics.r2_score(y_test, y_predicted))

    # Both series share the same x positions: the row number in the test split.
    positions = list(range(len(y_test)))
    plt.figure(1, figsize=(16, 9))
    plt.title('Сравнение результатов обучения')
    plt.scatter(x=positions, y=y_test, c='g', s=5)
    plt.scatter(x=positions, y=y_predicted, c='r', s=5)
    plt.show()
|
||
|
|
||
|
|
||
|
# Run the whole pipeline at module level: there is no
# `if __name__ == '__main__'` guard, so importing this file runs it as well.
start()
|