Added work with scraped data for TVs.
This commit is contained in:
parent
0d41dfa2f2
commit
d5184e4419
@ -54,9 +54,16 @@ def predict_price(data: TVCreate):
|
|||||||
raise HTTPException(status_code=400, detail=str(e))
|
raise HTTPException(status_code=400, detail=str(e))
|
||||||
|
|
||||||
|
|
||||||
@router.get('/get_unique_data_laptops', summary="Get unique data for laptops species")
|
@router.get('/get_unique_data_laptop', summary="Get unique data for laptops species")
|
||||||
def get_unique_laptops():
|
def get_unique_laptops():
|
||||||
try:
|
try:
|
||||||
return laptop_service.get_unique_data()
|
return laptop_service.get_unique_data()
|
||||||
|
except Exception as e:
|
||||||
|
raise HTTPException(status_code=400, detail=str(e))
|
||||||
|
|
||||||
|
@router.get('/get_unique_data_tv', summary="Get unique data for tvs species")
|
||||||
|
def get_unique_laptops():
|
||||||
|
try:
|
||||||
|
return tv_service.get_unique_data()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(status_code=400, detail=str(e))
|
raise HTTPException(status_code=400, detail=str(e))
|
@ -1,9 +0,0 @@
|
|||||||
from sqlalchemy import create_engine
|
|
||||||
from sqlalchemy.ext.declarative import declarative_base
|
|
||||||
from sqlalchemy.orm import sessionmaker
|
|
||||||
|
|
||||||
DATABASE_URL = "postgresql://postgres:postgres@localhost/price-builder"
|
|
||||||
|
|
||||||
engine = create_engine(DATABASE_URL)
|
|
||||||
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
|
||||||
Base = declarative_base()
|
|
@ -1,31 +0,0 @@
|
|||||||
from sqlalchemy import Column, Integer, Float, String
|
|
||||||
from database import Base
|
|
||||||
|
|
||||||
|
|
||||||
class Laptop(Base):
|
|
||||||
__tablename__ = "laptops"
|
|
||||||
|
|
||||||
id = Column(Integer, primary_key=True, index=True)
|
|
||||||
brand = Column(String, index=True)
|
|
||||||
processor = Column(String, index=True)
|
|
||||||
ram = Column(Integer)
|
|
||||||
os = Column(String, index=True)
|
|
||||||
ssd = Column(Integer)
|
|
||||||
display = Column(Float)
|
|
||||||
gpu = Column(String, index=True)
|
|
||||||
weight = Column(Float)
|
|
||||||
battery_size = Column(Integer)
|
|
||||||
release_year = Column(Integer)
|
|
||||||
display_type = Column(String, index=True)
|
|
||||||
|
|
||||||
class TV(Base):
|
|
||||||
__tablename__ = "tvs"
|
|
||||||
|
|
||||||
id = Column(Integer, primary_key=True, index=True)
|
|
||||||
display = Column(String, index=True)
|
|
||||||
tuners = Column(String)
|
|
||||||
features = Column(String)
|
|
||||||
os = Column(String)
|
|
||||||
power_of_volume = Column(String)
|
|
||||||
screen_sizes: int
|
|
||||||
color = Column(String)
|
|
@ -34,7 +34,6 @@ df['ssd'] = df['ssd'].str.extract(r'(\d+)').astype(float)
|
|||||||
# Преобразование цен в числовой формат
|
# Преобразование цен в числовой формат
|
||||||
df['price'] = df['price'].astype(str).str.replace(' ', '').astype(int)
|
df['price'] = df['price'].astype(str).str.replace(' ', '').astype(int)
|
||||||
|
|
||||||
|
|
||||||
# Шаг 5: Очистка и преобразование колонок
|
# Шаг 5: Очистка и преобразование колонок
|
||||||
def clean_numeric_column(column, remove_chars=['₹', ',', ' ', 'ГБ', 'МГц', '']):
|
def clean_numeric_column(column, remove_chars=['₹', ',', ' ', 'ГБ', 'МГц', '']):
|
||||||
if column.dtype == object:
|
if column.dtype == object:
|
||||||
|
@ -6,9 +6,11 @@ from sklearn.preprocessing import PolynomialFeatures, StandardScaler
|
|||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import joblib
|
import joblib
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
# Загрузка данных
|
# Загрузка данных
|
||||||
df = pd.read_csv('../../../../datasets/synthetic_tvs.csv')
|
df = pd.read_csv('../../../../datasets/tv.csv')
|
||||||
|
|
||||||
# Проверка и очистка данных
|
# Проверка и очистка данных
|
||||||
required_columns = ['display', 'tuners', 'features', 'os', 'power_of_volume', 'color', 'screen_size', 'price']
|
required_columns = ['display', 'tuners', 'features', 'os', 'power_of_volume', 'color', 'screen_size', 'price']
|
||||||
@ -18,6 +20,27 @@ if missing_columns:
|
|||||||
|
|
||||||
df = df.dropna(subset=required_columns)
|
df = df.dropna(subset=required_columns)
|
||||||
|
|
||||||
|
# Преобразование цен в числовой формат
|
||||||
|
df['price'] = df['price'].astype(str).str.replace(' ', '').astype(int)
|
||||||
|
|
||||||
|
# Создаем список уникальных значений для категориальных признаков
|
||||||
|
unique_values = {
|
||||||
|
'display': df['display'].dropna().unique().tolist(),
|
||||||
|
'tuners': df['tuners'].dropna().unique().tolist(),
|
||||||
|
'features': df['features'].dropna().unique().tolist(),
|
||||||
|
'os': df['os'].dropna().unique().tolist(),
|
||||||
|
'power_of_volume': df['power_of_volume'].dropna().unique().tolist(),
|
||||||
|
'color': df['color'].dropna().unique().tolist(),
|
||||||
|
'screen_size': df['screen_size'].dropna().unique().tolist()
|
||||||
|
}
|
||||||
|
|
||||||
|
# Создание директории, если она не существует
|
||||||
|
output_dir = 'columns'
|
||||||
|
os.makedirs(output_dir, exist_ok=True)
|
||||||
|
|
||||||
|
with open(os.path.join(output_dir, 'unique_values_tv.json'), 'w', encoding='utf-8') as f:
|
||||||
|
json.dump(unique_values, f, ensure_ascii=False, indent=4)
|
||||||
|
|
||||||
# Преобразование категориальных переменных
|
# Преобразование категориальных переменных
|
||||||
categorical_features = ['display', 'tuners', 'features', 'os', 'power_of_volume','color']
|
categorical_features = ['display', 'tuners', 'features', 'os', 'power_of_volume','color']
|
||||||
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)
|
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)
|
||||||
@ -50,20 +73,6 @@ grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, c
|
|||||||
grid_search.fit(X_train, y_train)
|
grid_search.fit(X_train, y_train)
|
||||||
best_model = grid_search.best_estimator_
|
best_model = grid_search.best_estimator_
|
||||||
|
|
||||||
# Вывод важности признаков
|
|
||||||
feature_importances = best_model.feature_importances_
|
|
||||||
feature_names = poly.get_feature_names_out(X.columns)
|
|
||||||
|
|
||||||
# Построение графика важности признаков
|
|
||||||
sorted_indices = np.argsort(feature_importances)[::-1]
|
|
||||||
plt.figure(figsize=(10, 8))
|
|
||||||
plt.barh([feature_names[i] for i in sorted_indices[:20]], feature_importances[sorted_indices[:20]])
|
|
||||||
plt.xlabel('Importance')
|
|
||||||
plt.ylabel('Feature')
|
|
||||||
plt.title('Top 20 Feature Importances')
|
|
||||||
plt.gca().invert_yaxis()
|
|
||||||
plt.show()
|
|
||||||
|
|
||||||
# Сохранение модели
|
# Сохранение модели
|
||||||
feature_columns = X.columns.tolist()
|
feature_columns = X.columns.tolist()
|
||||||
joblib.dump(feature_columns, '../../tvML/feature_columns.pkl')
|
joblib.dump(feature_columns, '../../tvML/feature_columns.pkl')
|
||||||
@ -71,3 +80,21 @@ joblib.dump(best_model, '../../tvML/tv_price_model.pkl')
|
|||||||
joblib.dump(poly, '../../tvML/poly_transformer.pkl')
|
joblib.dump(poly, '../../tvML/poly_transformer.pkl')
|
||||||
joblib.dump(scaler, '../../tvML/scaler.pkl')
|
joblib.dump(scaler, '../../tvML/scaler.pkl')
|
||||||
print("Модель для телевизоров сохранена.")
|
print("Модель для телевизоров сохранена.")
|
||||||
|
|
||||||
|
# Вывод важности признаков
|
||||||
|
feature_importances = best_model.feature_importances_
|
||||||
|
feature_names = poly.get_feature_names_out(X.columns)
|
||||||
|
|
||||||
|
# Построение графика важности признаков (снизу вверх)
|
||||||
|
sorted_indices = np.argsort(feature_importances)[::-1] # Сортировка индексов по важности
|
||||||
|
top_features = [feature_names[i] for i in sorted_indices[:15]]
|
||||||
|
top_importances = feature_importances[sorted_indices[:15]]
|
||||||
|
|
||||||
|
plt.figure(figsize=(12, 10)) # Увеличиваем размер графика
|
||||||
|
plt.bar(range(len(top_features)), top_importances, tick_label=top_features)
|
||||||
|
plt.xticks(rotation=45, ha='right', fontsize=10) # Наклон подписей и выравнивание
|
||||||
|
plt.ylabel('Важность')
|
||||||
|
plt.xlabel('Признаки')
|
||||||
|
plt.title('Топ 20 Важнейших параметров')
|
||||||
|
plt.tight_layout() # Добавляем автоматическое выравнивание элементов графика
|
||||||
|
plt.show()
|
73
services/ml/scripts/modelBuilders/modelBuilderTVSynthetic.py
Normal file
73
services/ml/scripts/modelBuilders/modelBuilderTVSynthetic.py
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from sklearn.model_selection import train_test_split, GridSearchCV
|
||||||
|
from sklearn.ensemble import RandomForestRegressor
|
||||||
|
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
|
||||||
|
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import joblib
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
# Загрузка данных
|
||||||
|
df = pd.read_csv('../../../../datasets/synthetic_tvs.csv')
|
||||||
|
|
||||||
|
# Проверка и очистка данных
|
||||||
|
required_columns = ['display', 'tuners', 'features', 'os', 'power_of_volume', 'color', 'screen_size', 'price']
|
||||||
|
missing_columns = [col for col in required_columns if col not in df.columns]
|
||||||
|
if missing_columns:
|
||||||
|
raise Exception(f"Отсутствуют столбцы: {missing_columns}")
|
||||||
|
|
||||||
|
df = df.dropna(subset=required_columns)
|
||||||
|
|
||||||
|
# Преобразование категориальных переменных
|
||||||
|
categorical_features = ['display', 'tuners', 'features', 'os', 'power_of_volume','color']
|
||||||
|
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)
|
||||||
|
|
||||||
|
# Разделение на X и y
|
||||||
|
X = df.drop('price', axis=1)
|
||||||
|
y = df['price']
|
||||||
|
|
||||||
|
# Полиномиальные признаки
|
||||||
|
poly = PolynomialFeatures(degree=1, interaction_only=True, include_bias=False)
|
||||||
|
X_poly = poly.fit_transform(X)
|
||||||
|
|
||||||
|
# Масштабирование
|
||||||
|
scaler = StandardScaler()
|
||||||
|
X_poly_scaled = scaler.fit_transform(X_poly)
|
||||||
|
|
||||||
|
# Разделение на обучающую и тестовую выборки
|
||||||
|
X_train, X_test, y_train, y_test = train_test_split(X_poly_scaled, y, test_size=0.5, random_state=42)
|
||||||
|
|
||||||
|
# Настройка Random Forest
|
||||||
|
param_grid = {
|
||||||
|
'n_estimators': [100, 200],
|
||||||
|
'max_depth': [10, 20],
|
||||||
|
'max_features': ['sqrt', 'log2', 0.5],
|
||||||
|
'min_samples_split': [5, 10],
|
||||||
|
'min_samples_leaf': [2, 4]
|
||||||
|
}
|
||||||
|
|
||||||
|
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, scoring='neg_mean_absolute_error')
|
||||||
|
grid_search.fit(X_train, y_train)
|
||||||
|
best_model = grid_search.best_estimator_
|
||||||
|
|
||||||
|
# Вывод важности признаков
|
||||||
|
feature_importances = best_model.feature_importances_
|
||||||
|
feature_names = poly.get_feature_names_out(X.columns)
|
||||||
|
|
||||||
|
# Построение графика важности признаков
|
||||||
|
sorted_indices = np.argsort(feature_importances)[::-1]
|
||||||
|
plt.figure(figsize=(10, 8))
|
||||||
|
plt.barh([feature_names[i] for i in sorted_indices[:20]], feature_importances[sorted_indices[:20]])
|
||||||
|
plt.xlabel('Importance')
|
||||||
|
plt.ylabel('Feature')
|
||||||
|
plt.title('Top 20 Feature Importances')
|
||||||
|
plt.gca().invert_yaxis()
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
# Сохранение модели
|
||||||
|
feature_columns = X.columns.tolist()
|
||||||
|
joblib.dump(feature_columns, '../../tvML/feature_columns.pkl')
|
||||||
|
joblib.dump(best_model, '../../tvML/tv_price_model.pkl')
|
||||||
|
joblib.dump(poly, '../../tvML/poly_transformer.pkl')
|
||||||
|
joblib.dump(scaler, '../../tvML/scaler.pkl')
|
||||||
|
print("Модель для телевизоров сохранена.")
|
@ -133,4 +133,13 @@ class TVService:
|
|||||||
predicted_price = self.model.predict(input_scaled)[0]
|
predicted_price = self.model.predict(input_scaled)[0]
|
||||||
return PredictPriceResponse(predicted_price=round(predicted_price, 2))
|
return PredictPriceResponse(predicted_price=round(predicted_price, 2))
|
||||||
|
|
||||||
|
def get_unique_data(self):
|
||||||
|
# Указываем путь к файлу
|
||||||
|
file_path = 'services/ml/scripts/modelBuilders/columns/unique_values_tv.json'
|
||||||
|
|
||||||
|
# Открываем и читаем данные из файла
|
||||||
|
with open(file_path, 'r', encoding='utf-8') as file:
|
||||||
|
data = json.load(file) # Загружаем данные из JSON
|
||||||
|
|
||||||
|
# Возвращаем данные, которые будут переданы в ответ
|
||||||
|
return data
|
||||||
|
Loading…
x
Reference in New Issue
Block a user