diff --git a/controllers/controller.py b/controllers/controller.py
index 525aa63..3c70c33 100644
--- a/controllers/controller.py
+++ b/controllers/controller.py
@@ -54,9 +54,16 @@ def predict_price(data: TVCreate):
         raise HTTPException(status_code=400, detail=str(e))


-@router.get('/get_unique_data_laptops', summary="Get unique data for laptops species")
+@router.get('/get_unique_data_laptop', summary="Get unique data for laptops")
 def get_unique_laptops():
     try:
         return laptop_service.get_unique_data()
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=str(e))
+
+@router.get('/get_unique_data_tv', summary="Get unique data for TVs")
+def get_unique_tvs():
+    try:
+        return tv_service.get_unique_data()
     except Exception as e:
         raise HTTPException(status_code=400, detail=str(e))
\ No newline at end of file
diff --git a/database.py b/database.py
deleted file mode 100644
index d88797d..0000000
--- a/database.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from sqlalchemy import create_engine
-from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy.orm import sessionmaker
-
-DATABASE_URL = "postgresql://postgres:postgres@localhost/price-builder"
-
-engine = create_engine(DATABASE_URL)
-SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
-Base = declarative_base()
diff --git a/models/models.py b/models/models.py
deleted file mode 100644
index 7f84ae6..0000000
--- a/models/models.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from sqlalchemy import Column, Integer, Float, String
-from database import Base
-
-
-class Laptop(Base):
-    __tablename__ = "laptops"
-
-    id = Column(Integer, primary_key=True, index=True)
-    brand = Column(String, index=True)
-    processor = Column(String, index=True)
-    ram = Column(Integer)
-    os = Column(String, index=True)
-    ssd = Column(Integer)
-    display = Column(Float)
-    gpu = Column(String, index=True)
-    weight = Column(Float)
-    battery_size = Column(Integer)
-    release_year = Column(Integer)
-    display_type = Column(String, index=True)
-
-class TV(Base):
-    __tablename__ = "tvs"
-
-    id = Column(Integer, primary_key=True, index=True)
-    display = Column(String, index=True)
-    tuners = Column(String)
-    features = Column(String)
-    os = Column(String)
-    power_of_volume = Column(String)
-    screen_sizes: int
-    color = Column(String)
\ No newline at end of file
diff --git a/services/ml/scripts/modelBuilders/modelBuilderLaptop.py b/services/ml/scripts/modelBuilders/modelBuilderLaptop.py
index 763cbf0..562a03e 100644
--- a/services/ml/scripts/modelBuilders/modelBuilderLaptop.py
+++ b/services/ml/scripts/modelBuilders/modelBuilderLaptop.py
@@ -34,7 +34,6 @@
 df['ssd'] = df['ssd'].str.extract(r'(\d+)').astype(float)
 # Convert prices to a numeric format
 df['price'] = df['price'].astype(str).str.replace(' ', '').astype(int)
-
 # Step 5: clean and convert the columns
 def clean_numeric_column(column, remove_chars=['₹', ',', ' ', 'ГБ', 'МГц', '']):
     if column.dtype == object:
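
The controller hunk above renames the laptop route and adds a TV counterpart. A minimal sketch of how the two GET endpoints could be exercised with FastAPI's TestClient; the `main:app` module name and the absence of a router prefix are assumptions, not something this diff shows.

    # Sketch only: assumes the router is included in a FastAPI app exposed as
    # `app` in main.py and mounted without a prefix -- both are assumptions.
    from fastapi.testclient import TestClient

    from main import app  # hypothetical application module

    client = TestClient(app)

    laptop_options = client.get('/get_unique_data_laptop')
    tv_options = client.get('/get_unique_data_tv')

    assert laptop_options.status_code == 200
    assert tv_options.status_code == 200
    print(list(laptop_options.json().keys()))
    print(list(tv_options.json().keys()))
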
diff --git a/services/ml/scripts/modelBuilders/modelBuilderTV.py b/services/ml/scripts/modelBuilders/modelBuilderTV.py
index 2c69983..078b69f 100644
--- a/services/ml/scripts/modelBuilders/modelBuilderTV.py
+++ b/services/ml/scripts/modelBuilders/modelBuilderTV.py
@@ -6,9 +6,11 @@ from sklearn.preprocessing import PolynomialFeatures, StandardScaler
 import matplotlib.pyplot as plt
 import joblib
 import numpy as np
+import json
+import os

 # Load the data
-df = pd.read_csv('../../../../datasets/synthetic_tvs.csv')
+df = pd.read_csv('../../../../datasets/tv.csv')

 # Validate and clean the data
 required_columns = ['display', 'tuners', 'features', 'os', 'power_of_volume', 'color', 'screen_size', 'price']
@@ -18,6 +20,27 @@ if missing_columns:
     raise Exception(f"Missing columns: {missing_columns}")

 df = df.dropna(subset=required_columns)
+# Convert prices to a numeric format
+df['price'] = df['price'].astype(str).str.replace(' ', '').astype(int)
+
+# Build a list of unique values for each categorical feature
+unique_values = {
+    'display': df['display'].dropna().unique().tolist(),
+    'tuners': df['tuners'].dropna().unique().tolist(),
+    'features': df['features'].dropna().unique().tolist(),
+    'os': df['os'].dropna().unique().tolist(),
+    'power_of_volume': df['power_of_volume'].dropna().unique().tolist(),
+    'color': df['color'].dropna().unique().tolist(),
+    'screen_size': df['screen_size'].dropna().unique().tolist()
+}
+
+# Create the output directory if it does not exist
+output_dir = 'columns'
+os.makedirs(output_dir, exist_ok=True)
+
+with open(os.path.join(output_dir, 'unique_values_tv.json'), 'w', encoding='utf-8') as f:
+    json.dump(unique_values, f, ensure_ascii=False, indent=4)
+
 # Encode the categorical variables
 categorical_features = ['display', 'tuners', 'features', 'os', 'power_of_volume', 'color']
 df = pd.get_dummies(df, columns=categorical_features, drop_first=True)
@@ -50,20 +73,6 @@ grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, scoring='neg_mean_absolute_error')
 grid_search.fit(X_train, y_train)
 best_model = grid_search.best_estimator_

-# Report feature importances
-feature_importances = best_model.feature_importances_
-feature_names = poly.get_feature_names_out(X.columns)
-
-# Plot the feature importances
-sorted_indices = np.argsort(feature_importances)[::-1]
-plt.figure(figsize=(10, 8))
-plt.barh([feature_names[i] for i in sorted_indices[:20]], feature_importances[sorted_indices[:20]])
-plt.xlabel('Importance')
-plt.ylabel('Feature')
-plt.title('Top 20 Feature Importances')
-plt.gca().invert_yaxis()
-plt.show()
-
 # Save the model
 feature_columns = X.columns.tolist()
 joblib.dump(feature_columns, '../../tvML/feature_columns.pkl')
@@ -71,3 +80,21 @@ joblib.dump(best_model, '../../tvML/tv_price_model.pkl')
 joblib.dump(poly, '../../tvML/poly_transformer.pkl')
 joblib.dump(scaler, '../../tvML/scaler.pkl')
 print("TV model saved.")
+
+# Report feature importances
+feature_importances = best_model.feature_importances_
+feature_names = poly.get_feature_names_out(X.columns)
+
+# Plot the top feature importances
+sorted_indices = np.argsort(feature_importances)[::-1]  # indices sorted by importance, descending
+top_features = [feature_names[i] for i in sorted_indices[:15]]
+top_importances = feature_importances[sorted_indices[:15]]
+
+plt.figure(figsize=(12, 10))  # enlarge the figure
+plt.bar(range(len(top_features)), top_importances, tick_label=top_features)
+plt.xticks(rotation=45, ha='right', fontsize=10)  # rotate and align the tick labels
+plt.ylabel('Importance')
+plt.xlabel('Feature')
+plt.title('Top 15 Feature Importances')
+plt.tight_layout()  # fit all plot elements into the figure area
+plt.show()
\ No newline at end of file
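
The modelBuilderTV.py script above persists four artifacts into services/ml/tvML/. A minimal sketch of how they could be loaded back for a single prediction; the repo-root-relative paths and the sample values are assumptions, not data from tv.csv.

    # Sketch only: paths assume the script is run from the repository root,
    # and the sample values are placeholders.
    import joblib
    import pandas as pd

    feature_columns = joblib.load('services/ml/tvML/feature_columns.pkl')
    model = joblib.load('services/ml/tvML/tv_price_model.pkl')
    poly = joblib.load('services/ml/tvML/poly_transformer.pkl')
    scaler = joblib.load('services/ml/tvML/scaler.pkl')

    sample = pd.DataFrame([{
        'display': 'LED',          # placeholder value
        'tuners': 'DVB-T2',        # placeholder value
        'features': 'Smart TV',    # placeholder value
        'os': 'Android',           # placeholder value
        'power_of_volume': '20',   # placeholder value
        'color': 'black',          # placeholder value
        'screen_size': 55,
    }])

    # Re-create the one-hot layout used at training time; dummies that are
    # missing for this single row are filled with 0.
    encoded = pd.get_dummies(sample).reindex(columns=feature_columns, fill_value=0)

    # Apply the same polynomial expansion and scaling as the training script.
    scaled = scaler.transform(poly.transform(encoded))
    print(round(model.predict(scaled)[0], 2))
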
diff --git a/services/ml/scripts/modelBuilders/modelBuilderTVSynthetic.py b/services/ml/scripts/modelBuilders/modelBuilderTVSynthetic.py
new file mode 100644
index 0000000..2c69983
--- /dev/null
+++ b/services/ml/scripts/modelBuilders/modelBuilderTVSynthetic.py
@@ -0,0 +1,73 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
+from sklearn.preprocessing import PolynomialFeatures, StandardScaler
+import matplotlib.pyplot as plt
+import joblib
+import numpy as np
+
+# Load the data
+df = pd.read_csv('../../../../datasets/synthetic_tvs.csv')
+
+# Validate and clean the data
+required_columns = ['display', 'tuners', 'features', 'os', 'power_of_volume', 'color', 'screen_size', 'price']
+missing_columns = [col for col in required_columns if col not in df.columns]
+if missing_columns:
+    raise Exception(f"Missing columns: {missing_columns}")
+
+df = df.dropna(subset=required_columns)
+
+# Encode the categorical variables
+categorical_features = ['display', 'tuners', 'features', 'os', 'power_of_volume', 'color']
+df = pd.get_dummies(df, columns=categorical_features, drop_first=True)
+
+# Split into X and y
+X = df.drop('price', axis=1)
+y = df['price']
+
+# Polynomial features
+poly = PolynomialFeatures(degree=1, interaction_only=True, include_bias=False)
+X_poly = poly.fit_transform(X)
+
+# Scaling
+scaler = StandardScaler()
+X_poly_scaled = scaler.fit_transform(X_poly)
+
+# Split into training and test sets
+X_train, X_test, y_train, y_test = train_test_split(X_poly_scaled, y, test_size=0.5, random_state=42)
+
+# Random Forest hyperparameter tuning
+param_grid = {
+    'n_estimators': [100, 200],
+    'max_depth': [10, 20],
+    'max_features': ['sqrt', 'log2', 0.5],
+    'min_samples_split': [5, 10],
+    'min_samples_leaf': [2, 4]
+}
+
+grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, scoring='neg_mean_absolute_error')
+grid_search.fit(X_train, y_train)
+best_model = grid_search.best_estimator_
+
+# Report feature importances
+feature_importances = best_model.feature_importances_
+feature_names = poly.get_feature_names_out(X.columns)
+
+# Plot the feature importances
+sorted_indices = np.argsort(feature_importances)[::-1]
+plt.figure(figsize=(10, 8))
+plt.barh([feature_names[i] for i in sorted_indices[:20]], feature_importances[sorted_indices[:20]])
+plt.xlabel('Importance')
+plt.ylabel('Feature')
+plt.title('Top 20 Feature Importances')
+plt.gca().invert_yaxis()
+plt.show()
+
+# Save the model
+feature_columns = X.columns.tolist()
+joblib.dump(feature_columns, '../../tvML/feature_columns.pkl')
+joblib.dump(best_model, '../../tvML/tv_price_model.pkl')
+joblib.dump(poly, '../../tvML/poly_transformer.pkl')
+joblib.dump(scaler, '../../tvML/scaler.pkl')
+print("TV model saved.")
diff --git a/services/service.py b/services/service.py
index e671788..2c37d83 100644
--- a/services/service.py
+++ b/services/service.py
@@ -133,4 +133,13 @@ class TVService:
         predicted_price = self.model.predict(input_scaled)[0]
         return PredictPriceResponse(predicted_price=round(predicted_price, 2))

+    def get_unique_data(self):
+        # Path to the file with the unique TV values
+        file_path = 'services/ml/scripts/modelBuilders/columns/unique_values_tv.json'
+        # Open the file and read its contents
+        with open(file_path, 'r', encoding='utf-8') as file:
+            data = json.load(file)  # load the JSON payload
+
+        # Return the data that will be sent in the response
+        return data
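
TVService.get_unique_data above opens the JSON with a path relative to the process working directory, so it only resolves when the application is started from the repository root. A minimal sketch of a working-directory-independent alternative, anchored to services/service.py itself; the directory layout is taken from the paths in this diff and is otherwise an assumption.

    # Sketch only: resolves the JSON relative to services/service.py instead of
    # the current working directory; the layout is inferred from this diff.
    import json
    from pathlib import Path

    COLUMNS_DIR = Path(__file__).resolve().parent / 'ml' / 'scripts' / 'modelBuilders' / 'columns'

    def load_unique_values_tv() -> dict:
        with open(COLUMNS_DIR / 'unique_values_tv.json', 'r', encoding='utf-8') as f:
            return json.load(f)
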