# Service initialisation.
#
# Each service reads its own set of environment variables. Sharing one set
# (as before) meant that overriding e.g. MODEL_PATH pointed BOTH services at
# the same artifact, so the TV service would load the laptop model.
# The laptop variables keep their original names for backward compatibility;
# the TV variables are prefixed with TV_.
MODEL_PATH = os.getenv("MODEL_PATH", "services/ml/laptopML/laptop_price_model.pkl")
FEATURE_COLUMNS_PATH = os.getenv("FEATURE_COLUMNS_PATH", "services/ml/laptopML/feature_columns.pkl")
POLY_PATH = os.getenv("POLY_PATH", "services/ml/laptopML/poly_transformer.pkl")
SCALER_PATH = os.getenv("SCALER_PATH", "services/ml/laptopML/scaler.pkl")
laptop_service = LaptopService(
    model_path=MODEL_PATH,
    feature_columns_path=FEATURE_COLUMNS_PATH,
    poly_path=POLY_PATH,
    scaler_path=SCALER_PATH,
)

TV_MODEL_PATH = os.getenv("TV_MODEL_PATH", "services/ml/tvML/tv_price_model.pkl")
# The TV model builder script saves this artifact as 'feature_columns.pkl'
# (not 'feature_columns_tv.pkl'), so the default must use that file name or
# the service fails to start with the default configuration.
TV_FEATURE_COLUMNS_PATH = os.getenv("TV_FEATURE_COLUMNS_PATH", "services/ml/tvML/feature_columns.pkl")
TV_POLY_PATH = os.getenv("TV_POLY_PATH", "services/ml/tvML/poly_transformer.pkl")
TV_SCALER_PATH = os.getenv("TV_SCALER_PATH", "services/ml/tvML/scaler.pkl")
tv_service = TVService(
    model_path=TV_MODEL_PATH,
    feature_columns_path=TV_FEATURE_COLUMNS_PATH,
    poly_path=TV_POLY_PATH,
    scaler_path=TV_SCALER_PATH,
)
class TV(Base):
    """ORM model for a row of the ``tvs`` table.

    All descriptive attributes are stored as strings except ``screen_sizes``,
    which is stored as an integer.
    """

    __tablename__ = "tvs"

    id = Column(Integer, primary_key=True, index=True)
    display = Column(String, index=True)
    tuners = Column(String)
    features = Column(String)
    os = Column(String)
    power_of_volume = Column(String)
    # Previously declared as a bare annotation (``screen_sizes: int``), which
    # does not create a mapped column; it must be a Column like its siblings.
    screen_sizes = Column(Integer)
    color = Column(String)
deleted file mode 100644 index accc13c..0000000 --- a/services/ml/feature_importances.py +++ /dev/null @@ -1,26 +0,0 @@ -import matplotlib.pyplot as plt -import joblib -import numpy as np - -from services.ml.modelBuilder import X_train - -# Загрузка модели и признаков -model_rf = joblib.load('laptop_price_model.pkl') -feature_columns = joblib.load('feature_columns.pkl') - -# Получение важности признаков -importances = model_rf.feature_importances_ -indices = np.argsort(importances)[::-1] - -# Вывод наиболее важных признаков -print("Важность признаков:") -for f in range(X_train.shape[1]): - print(f"{f + 1}. {feature_columns[indices[f]]} ({importances[indices[f]]})") - -# Визуализация важности признаков -plt.figure(figsize=(12, 8)) -plt.title("Важность признаков (Random Forest)") -plt.bar(range(X_train.shape[1]), importances[indices], align='center') -plt.xticks(range(X_train.shape[1]), [feature_columns[i] for i in indices], rotation=90) -plt.tight_layout() -plt.show() diff --git a/services/ml/generate_synthetic_data.py b/services/ml/scripts/dataGenerators/generate_synthetic_data_laptop.py similarity index 98% rename from services/ml/generate_synthetic_data.py rename to services/ml/scripts/dataGenerators/generate_synthetic_data_laptop.py index 92dd818..9b7e2c6 100644 --- a/services/ml/generate_synthetic_data.py +++ b/services/ml/scripts/dataGenerators/generate_synthetic_data_laptop.py @@ -174,5 +174,5 @@ print("\nСтатистика по ценам:") print(synthetic_df['price'].describe()) # Сохранение в CSV -synthetic_df.to_csv('synthetic_laptops.csv', index=False) +synthetic_df.to_csv('../../../../datasets/synthetic_laptops.csv', index=False) print("\nСинтетические данные сохранены в 'synthetic_laptops.csv'.") diff --git a/services/ml/scripts/dataGenerators/generate_synthetic_data_tv.py b/services/ml/scripts/dataGenerators/generate_synthetic_data_tv.py new file mode 100644 index 0000000..8028972 --- /dev/null +++ b/services/ml/scripts/dataGenerators/generate_synthetic_data_tv.py 
"""Generate a synthetic TV price dataset and save it as a CSV file.

Run as a script to produce ``synthetic_tvs.csv``; importing the module only
defines the helpers and has no side effects.
"""
import random
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

# Possible values for the categorical attributes.
displays = ['LED', 'OLED', 'QLED', 'LCD', 'Plasma']
screen_sizes = [32, 40, 43, 50, 55, 65, 75, 85]  # inches
tuners = ['DVB-T2', 'DVB-C', 'DVB-S2', 'ATSC', 'ISDB-T']
features = ['Smart TV', 'HDR', '3D', 'Voice Control', 'Bluetooth', 'WiFi', 'Ambient Mode']
oss = ['WebOS', 'Android TV', 'Tizen', 'Roku', 'Fire TV']
power_of_volume = ['10W', '20W', '30W', '40W', '50W']  # speaker power
colors = ['Black', 'Silver', 'White', 'Gray', 'Metallic']

# Price surcharges used by calculate_price().
_DISPLAY_PREMIUM = {'LED': 0, 'OLED': 40000, 'QLED': 30000, 'LCD': 10000, 'Plasma': 15000}
_OS_PREMIUM = {'WebOS': 10000, 'Android TV': 15000, 'Tizen': 12000, 'Roku': 8000, 'Fire TV': 7000}


def generate_display():
    """Return a random display type."""
    return random.choice(displays)


def generate_screen_size():
    """Return a random screen size in inches."""
    return random.choice(screen_sizes)


def generate_tuners():
    """Return a random tuner standard."""
    return random.choice(tuners)


def generate_features():
    """Return 1 to 4 distinct random features joined with ``', '``."""
    return ', '.join(random.sample(features, random.randint(1, 4)))


def generate_os():
    """Return a random operating system name."""
    return random.choice(oss)


def generate_power_of_volume():
    """Return a random speaker power label such as ``'20W'``."""
    return random.choice(power_of_volume)


def generate_color():
    """Return a random body colour."""
    return random.choice(colors)


def calculate_price(display, screen_size, tuners, features, os, power_of_volume, color):
    """Compute a noisy synthetic price for one TV.

    Args:
        display: Display type; unknown types add no premium.
        screen_size: Screen size in inches; each inch above 32 adds 1000.
        tuners: Tuner standard (accepted for a uniform signature; it does
            not influence the price).
        features: Comma-separated feature names; each one adds 5000.
        os: Operating system name; unknown systems add 5000.
        power_of_volume: Speaker power such as ``'20W'``; each watt adds 500.
        color: Body colour (does not influence the price).

    Returns:
        The price as a float rounded to 2 decimals, never below 5000.
    """
    # Count only non-empty entries: ''.split(', ') yields [''], which used to
    # be billed as one feature for a TV that has none.
    feature_count = sum(1 for name in features.split(',') if name.strip())
    power_value = int(power_of_volume.rstrip('W'))

    base_price = (
        20000
        + _DISPLAY_PREMIUM.get(display, 0)
        + (screen_size - 32) * 1000
        + feature_count * 5000
        + _OS_PREMIUM.get(os, 5000)
        + power_value * 500
    )

    # Gaussian noise drawn from NumPy's global RNG (seeded in main()).
    noise = np.random.normal(0, 3000)
    final_price = base_price + noise

    # float() keeps the return type uniform whether or not the floor applies.
    return float(max(round(final_price, 2), 5000))


def generate_synthetic_data(num_samples=100000):
    """Build a DataFrame of ``num_samples`` random TVs with their prices."""
    rows = []
    for _ in range(num_samples):
        # The draw order is kept fixed so a given seed reproduces the same data.
        display = generate_display()
        screen_size = generate_screen_size()
        tuner = generate_tuners()
        feature_set = generate_features()
        operating_system = generate_os()
        speaker_power = generate_power_of_volume()
        color = generate_color()

        price = calculate_price(
            display, screen_size, tuner, feature_set, operating_system, speaker_power, color
        )

        rows.append({
            'display': display,
            'screen_size': screen_size,
            'tuners': tuner,
            'features': feature_set,
            'os': operating_system,
            'power_of_volume': speaker_power,
            'color': color,
            'price': price
        })
    return pd.DataFrame(rows)


def main():
    """Seed the RNGs, generate the dataset, preview it and write the CSV."""
    # Fixed seeds for reproducibility.
    np.random.seed(42)
    random.seed(42)

    print("Генерация синтетических данных для телевизоров...")
    synthetic_df = generate_synthetic_data(num_samples=100000)

    # Preview the first rows.
    print("\nПример данных после генерации:")
    print(synthetic_df.head())

    # Anchor the relative path at this script's directory so the output
    # location does not depend on the current working directory.
    output_path = Path(__file__).resolve().parent / '../../../../datasets/synthetic_tvs.csv'
    synthetic_df.to_csv(output_path, index=False)
    print("\nСинтетические данные сохранены в 'synthetic_tvs.csv'.")


if __name__ == "__main__":
    main()
'feature_columns.pkl') -joblib.dump(best_model, 'laptop_price_model.pkl') -joblib.dump(poly, 'poly_transformer.pkl') -joblib.dump(scaler, 'scaler.pkl') +joblib.dump(feature_columns, '../../laptopML/feature_columns.pkl') +joblib.dump(best_model, '../../laptopML/laptop_price_model.pkl') +joblib.dump(poly, '../../laptopML/poly_transformer.pkl') +joblib.dump(scaler, '../../laptopML/scaler.pkl') print("Модель, трансформер и скейлер сохранены.") # Шаг 14: Важность признаков diff --git a/services/ml/scripts/modelBuilders/modelBuilderTV.py b/services/ml/scripts/modelBuilders/modelBuilderTV.py new file mode 100644 index 0000000..2c69983 --- /dev/null +++ b/services/ml/scripts/modelBuilders/modelBuilderTV.py @@ -0,0 +1,73 @@ +import pandas as pd +from sklearn.model_selection import train_test_split, GridSearchCV +from sklearn.ensemble import RandomForestRegressor +from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score +from sklearn.preprocessing import PolynomialFeatures, StandardScaler +import matplotlib.pyplot as plt +import joblib +import numpy as np + +# Загрузка данных +df = pd.read_csv('../../../../datasets/synthetic_tvs.csv') + +# Проверка и очистка данных +required_columns = ['display', 'tuners', 'features', 'os', 'power_of_volume', 'color', 'screen_size', 'price'] +missing_columns = [col for col in required_columns if col not in df.columns] +if missing_columns: + raise Exception(f"Отсутствуют столбцы: {missing_columns}") + +df = df.dropna(subset=required_columns) + +# Преобразование категориальных переменных +categorical_features = ['display', 'tuners', 'features', 'os', 'power_of_volume','color'] +df = pd.get_dummies(df, columns=categorical_features, drop_first=True) + +# Разделение на X и y +X = df.drop('price', axis=1) +y = df['price'] + +# Полиномиальные признаки +poly = PolynomialFeatures(degree=1, interaction_only=True, include_bias=False) +X_poly = poly.fit_transform(X) + +# Масштабирование +scaler = StandardScaler() +X_poly_scaled = 
class TVService:
    """Predict TV prices with a fitted model and its preprocessing artifacts.

    The service loads four joblib artifacts: the regression model, the list
    of training feature columns, a polynomial transformer and a scaler.
    """

    # Request fields that the training script one-hot encodes with
    # pd.get_dummies (producing columns named '<field>_<value>').
    _CATEGORICAL_FIELDS = ('display', 'tuners', 'features', 'os', 'power_of_volume', 'color')
    # Request field names that differ from the training column they feed:
    # the API schema sends 'screen_sizes', the model was trained on 'screen_size'.
    _FIELD_ALIASES = {'screen_sizes': 'screen_size'}

    def __init__(self, model_path: str, feature_columns_path: str, poly_path: str, scaler_path: str):
        """Load all artifacts.

        Args:
            model_path: Path to the fitted model.
            feature_columns_path: Path to the list of training column names.
            poly_path: Path to the fitted polynomial transformer.
            scaler_path: Path to the fitted scaler.

        Raises:
            Exception: If an artifact is missing or cannot be loaded; the
                underlying error is chained as ``__cause__``.
        """
        # NOTE: joblib.load unpickles arbitrary objects; these paths must
        # only ever point at trusted files.
        try:
            self.model = joblib.load(model_path)
        except FileNotFoundError as e:
            raise Exception(f"Model file not found at {model_path}") from e
        except Exception as e:
            raise Exception(f"Error loading model: {str(e)}") from e

        try:
            self.feature_columns = joblib.load(feature_columns_path)
        except FileNotFoundError as e:
            raise Exception(f"Feature columns file not found at {feature_columns_path}") from e
        except Exception as e:
            raise Exception(f"Error loading feature columns: {str(e)}") from e

        try:
            self.poly_transformer = joblib.load(poly_path)
            self.scaler = joblib.load(scaler_path)
        except FileNotFoundError as e:
            raise Exception("Polynomial transformer or scaler file not found.") from e
        except Exception as e:
            raise Exception(f"Error loading polynomial transformer or scaler: {str(e)}") from e

    def _build_feature_frame(self, data: Dict[str, object]) -> pd.DataFrame:
        """Return a one-row frame laid out exactly like the training columns.

        Calling ``pd.get_dummies(..., drop_first=True)`` on a single row drops
        every dummy column (each field has only one category), so the row is
        encoded by hand instead: the column ``'<field>_<value>'`` is set to 1
        when it exists among the training columns. A value with no matching
        column (the category dropped during training, or an unseen one)
        leaves all of that field's columns at 0.
        """
        row = dict.fromkeys(self.feature_columns, 0)
        for field, value in data.items():
            field = self._FIELD_ALIASES.get(field, field)
            if field in self._CATEGORICAL_FIELDS:
                dummy_column = f"{field}_{value}"
                if dummy_column in row:
                    row[dummy_column] = 1
            elif field in row:
                # Numeric feature passed through unchanged.
                row[field] = value
        return pd.DataFrame([row], columns=self.feature_columns)

    def predict_price(self, data: Dict[str, object]) -> "PredictPriceResponse":
        """Predict the price of a TV described by ``data``.

        Args:
            data: Mapping of request field names to values.

        Returns:
            A PredictPriceResponse with the price rounded to 2 decimals.
        """
        input_df = self._build_feature_frame(data)

        # Apply the same polynomial expansion and scaling used in training.
        input_poly = self.poly_transformer.transform(input_df)
        input_scaled = self.scaler.transform(input_poly)

        predicted_price = float(self.model.predict(input_scaled)[0])
        return PredictPriceResponse(predicted_price=round(predicted_price, 2))