Merge pull request 'malkova_anastasia_lab_5 ready' (#162) from malkova_anastasia_lab_5 into main
Reviewed-on: http://student.git.athene.tech/Alexey/IIS_2023_1/pulls/162
This commit is contained in:
commit
34651ab250
48
malkova_anastasia_lab_5/README.md
Normal file
48
malkova_anastasia_lab_5/README.md
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
# Лабораторная работа №5
|
||||||
|
|
||||||
|
> Регрессия
|
||||||
|
|
||||||
|
### Как запустить лабораторную работу
|
||||||
|
|
||||||
|
1. Установить python, numpy, pandas, scikit-learn (sklearn)
|
||||||
|
2. Запустить команду `python main1.py` в корне проекта для запуска 1 части
|
||||||
|
3. Запустить команду `python main2.py` в корне проекта для запуска 2 части
|
||||||
|
|
||||||
|
### Использованные технологии
|
||||||
|
|
||||||
|
* Язык программирования `python`
|
||||||
|
* Библиотеки `numpy, pandas, sklearn`
|
||||||
|
* Среда разработки `PyCharm`
|
||||||
|
|
||||||
|
### Что делает программа?
|
||||||
|
|
||||||
|
Цель программы: на основе данных об автомобилях на вторичном рынке обучить модель регрессии на предсказание цены.
|
||||||
|
|
||||||
|
Используемая модель: Лассо-регрессия
|
||||||
|
|
||||||
|
#### Определим признаки
|
||||||
|
|
||||||
|
LINEAR [('Year', 1.0), ('Mileage', 0.4868), ('State', 0.0729), ('Vin', 0.015), ('City', 0.0037), ('Model', 0.0002), ('Make', 0.0)]
|
||||||
|
|
||||||
|
RIDGE [('Year', 1.0), ('Mileage', 0.4868), ('State', 0.0729), ('Vin', 0.015), ('City', 0.0037), ('Model', 0.0002), ('Make', 0.0)]
|
||||||
|
|
||||||
|
LASSO [('Year', 1.0), ('Mileage', 0.4868), ('State', 0.0729), ('Vin', 0.015), ('City', 0.0037), ('Model', 0.0002), ('Make', 0.0)]
|
||||||
|
|
||||||
|
RFE [('Year', 1.0), ('State', 1.0), ('Mileage', 1.0), ('City', 0.5), ('Vin', 0.5), ('Model', 0.0), ('Make', 0.0)]
|
||||||
|
|
||||||
|
f_regression [('Year', 1.0), ('State', 0.1438), ('Vin', 0.0878), ('City', 0.0845), ('Mileage', 0.0711), ('Model', 0.0335), ('Make', 0.0)]
|
||||||
|
|
||||||
|
MEAN [('Year', 0.2), ('Mileage', 0.0974), ('State', 0.0146), ('Vin', 0.003), ('City', 0.0007), ('Model', 0.0), ('Make', 0.0)]
|
||||||
|
|
||||||
|
Отсекаем признаки, у которых MEAN ниже 0.001; оставшиеся признаки используются для дальнейшего обучения модели Lasso.
|
||||||
|
|
||||||
|
##### Запуск модели Lasso с параметрами
|
||||||
|
|
||||||
|
![main2.py](main2.png)
|
||||||
|
|
||||||
|
#### Итоговые выводы
|
||||||
|
|
||||||
|
Были выбраны основные признаки для обучения модели, которые имеют большее влияние на предсказание.
|
||||||
|
Проведены несколько тестов по обучению модели Lasso с разными alpha (силой влияния регуляризации),
|
||||||
|
но это не оказало большого влияния. Точность в 94.33% приемлемая.
|
||||||
|
|
8
malkova_anastasia_lab_5/config.py
Normal file
8
malkova_anastasia_lab_5/config.py
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
# Titles of the feature-ranking methods; ranks.py uses them as keys of the
# dictionary that maps a method to its per-feature scores.
LINEAR_TITLE = 'LINEAR'
RIDGE_TITLE = 'RIDGE'
LASSO_TITLE = 'LASSO'
RFE_TITLE = 'RFE'
F_REGRESSION_TITLE = 'f_regression'
# Number of leading CSV rows that dataset.load_dataset keeps.
DATA_SIZE = 1000

# NOTE(review): not referenced by any module visible in this commit, and the
# dataset loader selects 7 feature columns — confirm this value is still needed.
FEATURES_AMOUNT = 14
|
23
malkova_anastasia_lab_5/dataset.py
Normal file
23
malkova_anastasia_lab_5/dataset.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from config import DATA_SIZE
|
||||||
|
|
||||||
|
|
||||||
|
def load_dataset():
    """Load the first ``DATA_SIZE`` car listings and encode text columns.

    Reads ``true_car_listings.csv`` from the current working directory,
    replaces every categorical column with integer codes via
    ``convert_to_num`` and splits the frame into features and target.

    Returns:
        tuple: ``(X, Y, names)`` where ``X`` is the DataFrame of feature
        columns, ``Y`` is the ``Price`` column and ``names`` is the list of
        feature column names in the order used for ``X``.
    """
    # Let the parser stop after DATA_SIZE rows instead of parsing the whole
    # file and slicing afterwards; this also yields an independent frame, so
    # the in-place column assignments below do not write into a slice.
    data = pd.read_csv('true_car_listings.csv', nrows=DATA_SIZE)

    names = ['Year', 'Mileage', 'City', 'State', 'Vin', 'Make', 'Model']
    categorical_columns = ('City', 'State', 'Vin', 'Make', 'Model')
    for column in categorical_columns:
        convert_to_num(data, column)

    Y = data['Price']
    X = data[names]

    return X, Y, names
|
||||||
|
|
||||||
|
|
||||||
|
def convert_to_num(data, col):
    """Replace the values of ``data[col]`` with integer category codes in place.

    Codes are assigned in sorted order of the distinct values, so the same
    input always produces the same encoding. (Building the code table from
    ``list(set(...))`` made the codes depend on string hash order, which
    changes between interpreter runs, and looking each value up with
    ``list.index`` was quadratic in the number of rows.)

    Args:
        data: pandas DataFrame that is modified in place.
        col: name of the column to encode.

    Returns:
        None
    """
    codes, _ = pd.factorize(data[col], sort=True)
    data[col] = codes
|
20
malkova_anastasia_lab_5/fit.py
Normal file
20
malkova_anastasia_lab_5/fit.py
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
from sklearn.linear_model import Lasso, Ridge, LinearRegression
|
||||||
|
from sklearn.feature_selection import RFE, f_regression
|
||||||
|
|
||||||
|
|
||||||
|
def fit_models(x, y):
    """Fit every model whose output is later used to rank the features.

    Args:
        x: feature matrix.
        y: target values.

    Returns:
        tuple: ``(lm, ridge, lasso, rfe, f)`` — the fitted linear, ridge and
        lasso regressors, the fitted RFE selector built around the lasso
        estimator, and the F-statistics returned by ``f_regression``.
    """
    # ``fit`` returns the estimator itself, so construction and fitting chain.
    lm = LinearRegression().fit(x, y)
    ridge = Ridge(alpha=0.001).fit(x, y)
    lasso = Lasso(alpha=0.001).fit(x, y)

    # Recursive feature elimination, dropping two features per iteration.
    rfe = RFE(lasso, step=2).fit(x, y)

    # Only the F-statistics are returned; the p-values are discarded.
    f, _ = f_regression(x, y, center=False)

    return lm, ridge, lasso, rfe, f
|
17
malkova_anastasia_lab_5/lasso_test.py
Normal file
17
malkova_anastasia_lab_5/lasso_test.py
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
from scores import MAPE
|
||||||
|
from sklearn.linear_model import Lasso
|
||||||
|
|
||||||
|
|
||||||
|
def lasso_test(X_train, X_test, y_train, y_test, alpha):
    """Train a Lasso model and print its MAPE and the derived accuracy.

    Args:
        X_train: training features.
        X_test: test features.
        y_train: training targets.
        y_test: test targets.
        alpha: regularization strength passed to ``Lasso``.

    Returns:
        None. The scores are only printed.
    """
    model = Lasso(alpha=alpha)
    model.fit(X_train, y_train)

    # Score the model on the held-out data.
    predictions = model.predict(X_test)
    mape_value = MAPE(y_test, predictions)
    print("MAPE value: ", mape_value)

    # Accuracy is reported as the complement of the percentage error.
    message = 'Accuracy of Lasso Regression(alpha={:}): {:0.2f}%.'
    print(message.format(alpha, 100 - mape_value))
|
13
malkova_anastasia_lab_5/main1.py
Normal file
13
malkova_anastasia_lab_5/main1.py
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
from dataset import load_dataset
|
||||||
|
from fit import fit_models
|
||||||
|
from ranks import calc_mean, get_ranks
|
||||||
|
|
||||||
|
def main():
    """Rank the dataset features with every method and print the mean rank."""
    x, y, names = load_dataset()

    lm, ridge, lasso, rfe, f = fit_models(x, y)

    # get_ranks also prints the per-method rankings.
    ranks = get_ranks(lm, ridge, lasso, rfe, f, names)

    mean = calc_mean(ranks)

    print("MEAN", mean)


# Run the pipeline only when executed as a script, so that importing this
# module does not load the dataset and fit the models as a side effect.
if __name__ == "__main__":
    main()
|
BIN
malkova_anastasia_lab_5/main2.png
Normal file
BIN
malkova_anastasia_lab_5/main2.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 60 KiB |
15
malkova_anastasia_lab_5/main2.py
Normal file
15
malkova_anastasia_lab_5/main2.py
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
|
from dataset import load_dataset
|
||||||
|
from lasso_test import lasso_test
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Train and score Lasso models for a series of decreasing alpha values."""
    # Load dataset (the list of feature names is not needed here)
    X, Y, _ = load_dataset()

    # Split dataset; the fixed seed keeps the split reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.05, random_state=42)

    # alpha = i / 10**i gives 0.1, 0.02, 0.003, 0.0004, 0.00005.
    # The series starts at i = 1: i = 0 would give alpha = 0, which turns
    # Lasso into plain least squares and is discouraged by scikit-learn for
    # numerical reasons.
    for i in range(1, 6):
        lasso_test(X_train, X_test, y_train, y_test, i / pow(10, i))


# Run only when executed as a script, not when the module is imported.
if __name__ == "__main__":
    main()
|
48
malkova_anastasia_lab_5/ranks.py
Normal file
48
malkova_anastasia_lab_5/ranks.py
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
import config
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.preprocessing import MinMaxScaler
|
||||||
|
from operator import itemgetter
|
||||||
|
from sklearn.linear_model import Lasso, Ridge, LinearRegression
|
||||||
|
|
||||||
|
|
||||||
|
def rank_to_dict(ranks, names):
    """Scale raw feature scores and pair them with feature names.

    The absolute values of ``ranks`` are min-max scaled as a single column,
    rounded to 4 decimal places and attached to ``names``.

    Args:
        ranks: one raw score per feature (e.g. model coefficients).
        names: feature names, in the same order as ``ranks``.

    Returns:
        list: ``(name, score)`` tuples sorted by score, highest first.
    """
    column = np.array(np.abs(ranks)).reshape(len(names), 1)
    scaled = MinMaxScaler().fit_transform(column).ravel()
    rounded = [round(value, 4) for value in scaled]
    by_name = dict(zip(names, rounded))
    return sorted(by_name.items(), key=itemgetter(1), reverse=True)
|
||||||
|
|
||||||
|
|
||||||
|
def flip_array(arr):
    """Mirror the values of ``arr`` so that the largest becomes 0.

    Each element ``v`` is mapped to ``max(arr) - v``; this turns a ranking in
    which a smaller number is better into a score in which a larger number is
    better.

    Args:
        arr: array-like of numbers (NumPy array, list or tuple).

    Returns:
        numpy.ndarray: the flipped values, same shape as the input.

    Raises:
        ValueError: if ``arr`` is empty (raised by ``numpy.max``).
    """
    # Coerce first: for a plain list ``-1 * arr`` is list repetition and
    # yields ``[]``, which silently produced an empty result.
    arr = np.asarray(arr)
    return -1 * arr + np.max(arr)
|
||||||
|
|
||||||
|
|
||||||
|
def get_ranks(lm: LinearRegression, ridge: Ridge, lasso: Lasso, rfe, f, names):
    """Build, print and return the feature ranking of every method.

    Args:
        lm: fitted linear regression; its ``coef_`` is ranked.
        ridge: fitted ridge regression; its ``coef_`` is ranked.
        lasso: fitted lasso regression; its ``coef_`` is ranked.
        rfe: fitted RFE selector; its ``ranking_`` is flipped, then ranked.
        f: per-feature scores from ``f_regression``.
        names: feature names, in the order of the scores.

    Returns:
        dict: method title -> list of ``(name, score)`` tuples as produced
        by ``rank_to_dict``.
    """
    raw_scores = (
        (config.LINEAR_TITLE, lm.coef_),
        (config.RIDGE_TITLE, ridge.coef_),
        (config.LASSO_TITLE, lasso.coef_),
        # RFE ranks the best feature as 1, so the ranking is flipped before
        # normalization to make a larger value mean a more important feature.
        (config.RFE_TITLE, flip_array(rfe.ranking_)),
        (config.F_REGRESSION_TITLE, f),
    )
    ranks = {title: rank_to_dict(scores, names) for title, scores in raw_scores}

    for title, ranking in ranks.items():
        print(title, ranking, '\n')

    return ranks
|
||||||
|
|
||||||
|
|
||||||
|
def calc_mean(ranks):
    """Average each feature's score over all ranking methods.

    Args:
        ranks: dict mapping a method title to a list of ``(name, score)``
            tuples.

    Returns:
        list: ``(name, mean_score)`` tuples sorted by mean score, highest
        first. Each sum is divided by the number of methods and rounded to
        4 decimal places.
    """
    totals = {}
    for ranking in ranks.values():
        for name, score in ranking:
            totals[name] = totals.get(name, 0) + score

    method_count = len(ranks)
    averages = {
        name: round(total / method_count, 4) for name, total in totals.items()
    }
    return sorted(averages.items(), key=itemgetter(1), reverse=True)
|
6
malkova_anastasia_lab_5/scores.py
Normal file
6
malkova_anastasia_lab_5/scores.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def MAPE(Y_actual, Y_Predicted):
    """Return the mean absolute percentage error, in percent.

    Both inputs are converted to float arrays and compared element by
    element in positional order. This lets plain lists be passed and keeps
    two pandas Series with different indexes (for example a shuffled test
    split and freshly indexed predictions) from being aligned by label.

    Args:
        Y_actual: array-like of true values.
        Y_Predicted: array-like of predicted values, same length.

    Returns:
        float: ``mean(|actual - predicted| / |actual|) * 100``. A zero in
        ``Y_actual`` makes the result ``inf`` or ``nan`` (NumPy division).
    """
    actual = np.asarray(Y_actual, dtype=float)
    predicted = np.asarray(Y_Predicted, dtype=float)
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100
    return mape
|
852123
malkova_anastasia_lab_5/true_car_listings.csv
Normal file
852123
malkova_anastasia_lab_5/true_car_listings.csv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user