Merge pull request 'malkova_anastasia_lab_5 ready' (#162) from malkova_anastasia_lab_5 into main

Reviewed-on: http://student.git.athene.tech/Alexey/IIS_2023_1/pulls/162
This commit is contained in:
Alexey 2023-12-07 15:45:06 +04:00
commit 34651ab250
11 changed files with 852321 additions and 0 deletions

View File

@ -0,0 +1,48 @@
# Лабораторная работа №5
> Регрессия
### Как запустить лабораторную работу
1. Установить python, numpy, pandas, sklearn
2. Запустить команду `python main1.py` в корне проекта для запуска 1 части
3. Запустить команду `python main2.py` в корне проекта для запуска 2 части
### Использованные технологии
* Язык программирования `python`
* Библиотеки `numpy, pandas, sklearn`
* Среда разработки `PyCharm`
### Что делает программа?
Цель программы: на основе данных об автомобилях на вторичном рынке обучить модель регрессии на предсказание цены.
Используемая модель: Лассо-регрессия
#### Определим признаки
LINEAR [('Year', 1.0), ('Mileage', 0.4868), ('State', 0.0729), ('Vin', 0.015), ('City', 0.0037), ('Model', 0.0002), ('Make', 0.0)]
RIDGE [('Year', 1.0), ('Mileage', 0.4868), ('State', 0.0729), ('Vin', 0.015), ('City', 0.0037), ('Model', 0.0002), ('Make', 0.0)]
LASSO [('Year', 1.0), ('Mileage', 0.4868), ('State', 0.0729), ('Vin', 0.015), ('City', 0.0037), ('Model', 0.0002), ('Make', 0.0)]
RFE [('Year', 1.0), ('State', 1.0), ('Mileage', 1.0), ('City', 0.5), ('Vin', 0.5), ('Model', 0.0), ('Make', 0.0)]
f_regression [('Year', 1.0), ('State', 0.1438), ('Vin', 0.0878), ('City', 0.0845), ('Mileage', 0.0711), ('Model', 0.0335), ('Make', 0.0)]
MEAN [('Year', 0.2), ('Mileage', 0.0974), ('State', 0.0146), ('Vin', 0.003), ('City', 0.0007), ('Model', 0.0), ('Make', 0.0)]
Отсечение признаков, у которых MEAN ниже 0.001. Выделенные признаки для дальнейшего обучения модели Lasso.
##### Запуск модели Lasso с параметрами
![main2.py](main2.png)
#### Итоговые выводы
Были выбраны основные признаки для обучения модели, которые имеют большее влияние на предсказание.
Проведено несколько тестов по обучению модели Lasso с разными alpha (силой влияния регуляризации),
но это не оказало большого влияния. Точность в 94.33% приемлемая.

View File

@ -0,0 +1,8 @@
# Labels under which the result of each feature-ranking method is stored
# and printed.
LINEAR_TITLE = "LINEAR"
RIDGE_TITLE = "RIDGE"
LASSO_TITLE = "LASSO"
RFE_TITLE = "RFE"
F_REGRESSION_TITLE = "f_regression"

# Number of leading rows of the listings CSV used by the dataset loader.
DATA_SIZE = 1000

# NOTE(review): no visible module reads this value, and the dataset loader
# selects 7 feature columns -- confirm whether 14 is still meaningful.
FEATURES_AMOUNT = 14

View File

@ -0,0 +1,23 @@
import pandas as pd
from config import DATA_SIZE
def load_dataset():
    """Load the car listings and return features, target and feature names.

    Only the first ``DATA_SIZE`` rows of ``true_car_listings.csv`` are read.
    The City, State, Vin, Make and Model columns are rewritten in place as
    numbers by ``convert_to_num`` before the feature frame is selected.

    Returns:
        A tuple ``(X, Y, names)``: the frame restricted to the ``names``
        columns, the ``Price`` column, and the list of feature column names.
    """
    names = ['Year', 'Mileage', 'City', 'State', 'Vin', 'Make', 'Model']
    # nrows stops parsing after DATA_SIZE rows instead of loading the whole
    # file and slicing it afterwards. It also yields a standalone frame, so
    # the column assignments below do not write into a slice of a larger one.
    data = pd.read_csv('true_car_listings.csv', nrows=DATA_SIZE)
    for column in ('City', 'State', 'Vin', 'Make', 'Model'):
        convert_to_num(data, column)
    Y = data['Price']
    X = data[names]
    return X, Y, names
def convert_to_num(data, col):
    """Replace the values of column ``col`` with integer codes, in place.

    Each distinct value receives the position of its first appearance in the
    column (0 for the first distinct value seen, 1 for the next, and so on),
    so the same input always produces the same encoding.

    Args:
        data: Table whose column is rewritten; accessed as ``data[col]``.
        col: Name of the column to encode.
    """
    # dict.fromkeys keeps first-seen order. A plain set would order strings
    # by their per-process randomized hashes, changing the codes run to run.
    codes = {value: code
             for code, value in enumerate(dict.fromkeys(data[col]))}
    # One dictionary lookup per row instead of a linear list.index scan.
    data[col] = data[col].map(codes)

View File

@ -0,0 +1,20 @@
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.feature_selection import RFE, f_regression
def fit_models(x, y):
    """Fit the estimators used for feature ranking on the same data.

    Args:
        x: Feature matrix passed to every estimator.
        y: Target values.

    Returns:
        A tuple ``(lm, ridge, lasso, rfe, f)``: the fitted linear, ridge and
        lasso regressions, the fitted RFE selector built around the lasso
        model, and the F-statistics returned by ``f_regression``.
    """
    linear_model = LinearRegression().fit(x, y)
    ridge_model = Ridge(alpha=0.001).fit(x, y)
    lasso_model = Lasso(alpha=0.001).fit(x, y)
    selector = RFE(lasso_model, step=2).fit(x, y)
    # Only the F-statistics are returned; the p-values are discarded.
    f_scores, _p_values = f_regression(x, y, center=False)
    return linear_model, ridge_model, lasso_model, selector, f_scores

View File

@ -0,0 +1,17 @@
from scores import MAPE
from sklearn.linear_model import Lasso
def lasso_test(X_train, X_test, y_train, y_test, alpha):
    """Fit a Lasso model and print its MAPE and derived accuracy.

    Args:
        X_train: Training features.
        X_test: Features the fitted model predicts on.
        y_train: Training targets.
        y_test: Targets the predictions are compared against.
        alpha: Value passed to ``Lasso(alpha=...)``.
    """
    model = Lasso(alpha=alpha)
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    error_percent = MAPE(y_test, predictions)
    print("MAPE value: ", error_percent)

    # The reported "accuracy" is the complement of the percentage error.
    accuracy = 100 - error_percent
    report = 'Accuracy of Lasso Regression(alpha={:}): {:0.2f}%.'
    print(report.format(alpha, accuracy))

View File

@ -0,0 +1,13 @@
from dataset import load_dataset
from fit import fit_models
from ranks import calc_mean, get_ranks
# Part 1: rank the dataset features with several methods and print the
# per-feature mean of the normalized scores.
features, target, feature_names = load_dataset()
fitted = fit_models(features, target)
ranks = get_ranks(*fitted, feature_names)
print("MEAN", calc_mean(ranks))

Binary file not shown.

After

Width:  |  Height:  |  Size: 60 KiB

View File

@ -0,0 +1,15 @@
from sklearn.model_selection import train_test_split
from dataset import load_dataset
from lasso_test import lasso_test
# Part 2: evaluate Lasso regression on a held-out split for several alphas.
features, target, _feature_names = load_dataset()

# Hold out 5% of the rows for testing; the seed is fixed.
split = train_test_split(features, target, test_size=0.05, random_state=42)

# step / 10**step gives 0.0, 0.1, 0.02, 0.003, 0.0004, 0.00005.
# NOTE(review): the first alpha is 0.0, which scikit-learn advises against
# for Lasso -- confirm that starting from zero is intended.
for step in range(6):
    alpha = step / 10 ** step
    lasso_test(*split, alpha)

View File

@ -0,0 +1,48 @@
import config
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from operator import itemgetter
from sklearn.linear_model import Lasso, Ridge, LinearRegression
def rank_to_dict(ranks, names):
    """Normalize raw feature scores and pair them with their feature names.

    Absolute values of ``ranks`` are min-max scaled, rounded to 4 decimal
    places and matched with ``names`` by position.

    Args:
        ranks: One raw score per feature.
        names: Feature names, in the same order as ``ranks``.

    Returns:
        A list of ``(name, score)`` tuples sorted by score, highest first.
    """
    magnitudes = np.abs(ranks)
    # MinMaxScaler works on 2-D input, so scale the scores as one column.
    column = np.array(magnitudes).reshape(len(names), 1)
    scaled = MinMaxScaler().fit_transform(column).ravel()
    scored = {name: round(score, 4) for name, score in zip(names, scaled)}
    return sorted(scored.items(), key=itemgetter(1), reverse=True)
def flip_array(arr):
    """Mirror the values of ``arr`` so its maximum becomes 0.

    Each element ``v`` is mapped to ``max(arr) - v``, which reverses the
    ordering of the values.
    """
    highest = np.max(arr)
    return -1 * arr + highest
def get_ranks(lm: LinearRegression, ridge: Ridge, lasso: Lasso, rfe, f, names):
    """Build, print and return the normalized feature ranking of each method.

    Args:
        lm: Fitted linear regression; its ``coef_`` is ranked.
        ridge: Fitted ridge regression; its ``coef_`` is ranked.
        lasso: Fitted lasso regression; its ``coef_`` is ranked.
        rfe: Fitted selector; its ``ranking_`` is flipped before ranking.
        f: One raw score per feature (ranked as-is).
        names: Feature names, in the order of the score arrays.

    Returns:
        A dict mapping each method title from ``config`` to the sorted
        ``(name, score)`` list produced by ``rank_to_dict``.
    """
    ranks = {
        config.LINEAR_TITLE: rank_to_dict(lm.coef_, names),
        config.RIDGE_TITLE: rank_to_dict(ridge.coef_, names),
        config.LASSO_TITLE: rank_to_dict(lasso.coef_, names),
        # ranking_ is flipped so that its smallest value scores highest.
        config.RFE_TITLE: rank_to_dict(flip_array(rfe.ranking_), names),
        config.F_REGRESSION_TITLE: rank_to_dict(f, names),
    }
    for title, scored in ranks.items():
        print(title, scored, '\n')
    return ranks
def calc_mean(ranks):
    """Average each feature's score over all ranking methods.

    Args:
        ranks: Dict mapping a method title to a list of ``(name, score)``
            pairs.

    Returns:
        A list of ``(name, mean_score)`` tuples, rounded to 4 decimal places
        and sorted by mean score, highest first. The sum for each feature is
        divided by the total number of methods in ``ranks``.
    """
    totals = {}
    for scored_features in ranks.values():
        for feature, score in scored_features:
            totals[feature] = totals.get(feature, 0) + score

    method_count = len(ranks)
    averaged = {
        feature: round(total / method_count, 4)
        for feature, total in totals.items()
    }
    return sorted(averaged.items(), key=itemgetter(1), reverse=True)

View File

@ -0,0 +1,6 @@
import numpy as np
def MAPE(Y_actual, Y_Predicted):
    """Return the mean absolute percentage error, in percent.

    Computes ``mean(|(actual - predicted) / actual|) * 100`` element by
    element.

    Args:
        Y_actual: Array-like of true values.
        Y_Predicted: Array-like of predicted values, in the same order.

    Returns:
        The error as a float percentage. A zero in ``Y_actual`` makes the
        corresponding division non-finite, so the result is then inf or nan.
    """
    # Coerce to plain float arrays so that lists are accepted and values are
    # always paired by position rather than by any index labels they carry.
    actual = np.asarray(Y_actual, dtype=float)
    predicted = np.asarray(Y_Predicted, dtype=float)
    return np.mean(np.abs((actual - predicted) / actual)) * 100

File diff suppressed because it is too large Load Diff