price-builder-backend/services/ml/generate_synthetic_data.py

179 lines
5.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
import numpy as np
import random
import re
from datetime import datetime
# Seed both random number generators for reproducibility.
random.seed(42)
np.random.seed(42)

# Candidate values for the categorical / discrete laptop features.
brands = [
    'Dell', 'HP', 'Lenovo', 'Apple', 'Asus',
    'Acer', 'MSI', 'Microsoft', 'Samsung', 'Toshiba',
]
processors = [
    'Intel Core i3 10th Gen',
    'Intel Core i5 10th Gen',
    'Intel Core i7 10th Gen',
    'AMD Ryzen 3 4000 Series',
    'AMD Ryzen 5 4000 Series',
    'AMD Ryzen 7 4000 Series',
]
oss = ['Windows 10', 'Windows 11', 'macOS', 'Linux']
gpus = [
    'Integrated',
    'NVIDIA GeForce GTX 1650',
    'NVIDIA GeForce RTX 3060',
    'AMD Radeon RX 5600M',
]
display_sizes = [13.3, 14.0, 15.6, 17.3]
display_types = ['HD', 'Full HD', '4K', 'OLED']
ram_options = [4, 8, 16, 32]  # in GB
ssd_options = [0, 256, 512, 1024]  # in GB
weights = [1.2, 1.5, 2.0, 2.5, 3.0]  # in kg
battery_sizes = [45, 60, 70, 90, 100]  # in Wh
# Every year from 2015 through the current calendar year, inclusive.
release_years = [*range(2015, datetime.now().year + 1)]
# Feature generator functions
def generate_brand() -> str:
    """Return one entry of the module-level ``brands`` list, chosen at random."""
    picked = random.choice(brands)
    return picked
def generate_processor() -> str:
    """Return one entry of the module-level ``processors`` list, chosen at random."""
    picked = random.choice(processors)
    return picked
def generate_os() -> str:
    """Return one entry of the module-level ``oss`` list, chosen at random."""
    picked = random.choice(oss)
    return picked
def generate_gpu() -> str:
    """Return one entry of the module-level ``gpus`` list, chosen at random."""
    picked = random.choice(gpus)
    return picked
def generate_display() -> float:
    """Return one entry of the module-level ``display_sizes`` list, chosen at random."""
    picked = random.choice(display_sizes)
    return picked
def generate_display_type() -> str:
    """Return one entry of the module-level ``display_types`` list, chosen at random."""
    picked = random.choice(display_types)
    return picked
def generate_ram() -> int:
    """Return one entry of the module-level ``ram_options`` list, chosen at random."""
    picked = random.choice(ram_options)
    return picked
def generate_ssd() -> int:
    """Return one entry of the module-level ``ssd_options`` list, chosen at random."""
    picked = random.choice(ssd_options)
    return picked
def generate_weight() -> float:
    """Return one entry of the module-level ``weights`` list, chosen at random."""
    picked = random.choice(weights)
    return picked
def generate_battery_size() -> int:
    """Return one entry of the module-level ``battery_sizes`` list, chosen at random."""
    picked = random.choice(battery_sizes)
    return picked
def generate_release_year() -> int:
    """Return one entry of the module-level ``release_years`` list, chosen at random."""
    picked = random.choice(release_years)
    return picked
# Price calculation function
def calculate_price(brand, processor, ram, os, ssd, display, gpu, weight,
                    battery_size, release_year, display_type, *,
                    current_year=None):
    """Compute a noisy synthetic laptop price from its features.

    The price starts from a fixed base (in arbitrary currency units) and is
    adjusted by one additive term per feature, then perturbed with Gaussian
    noise drawn from ``np.random.normal(0, 5000)``.

    Args:
        brand: Brand name; unknown brands get a default premium of 10000.
        processor: Processor description; the premium is picked by the first
            known family name (e.g. ``'Intel Core i5'``) contained in it.
        ram: RAM size; contributes ``ram * 1000``.
        os: Operating system. Accepted for interface compatibility only; it
            does not influence the price.
        ssd: SSD size; contributes ``ssd * 50``.
        display: Display size; contributes ``(display - 13) * 5000``.
        gpu: GPU name; unknown GPUs get no premium.
        weight: Weight; lighter is more expensive, ``(3.0 - weight) * 8000``.
        battery_size: Battery capacity; contributes ``battery_size * 250``.
        release_year: Release year; each year of age lowers the price by 5000.
        display_type: Panel type; unknown types get no premium.
        current_year: Reference year used to compute the laptop's age.
            Defaults to the current calendar year when ``None``.

    Returns:
        The price rounded to two decimals, never lower than 5000.
    """
    if current_year is None:
        current_year = datetime.now().year

    price = 30000  # base price in arbitrary units

    # Brand
    brand_premium = {
        'Apple': 40000, 'MSI': 35000, 'Dell': 15000, 'HP': 12000,
        'Lenovo': 10000, 'Microsoft': 18000, 'Asus': 8000, 'Acer': 7000,
        'Samsung': 9000, 'Toshiba': 8500,
    }
    price += brand_premium.get(brand, 10000)

    # Processor: first family whose name occurs in the description wins.
    processor_premium = {
        'Intel Core i3': 5000, 'Intel Core i5': 10000, 'Intel Core i7': 15000,
        'AMD Ryzen 3': 5000, 'AMD Ryzen 5': 10000, 'AMD Ryzen 7': 15000,
    }
    price += next(
        (bonus for family, bonus in processor_premium.items()
         if family in processor),
        0,
    )

    # RAM and SSD
    price += ram * 1000
    price += ssd * 50

    # Display size
    price += (display - 13) * 5000

    # Display type
    display_type_premium = {
        'HD': 0, 'Full HD': 12000, '4K': 30000, 'OLED': 35000,
    }
    price += display_type_premium.get(display_type, 0)

    # GPU
    gpu_premium = {
        'Integrated': 0,
        'NVIDIA GeForce GTX 1650': 25000,
        'NVIDIA GeForce RTX 3060': 40000,
        'AMD Radeon RX 5600M': 35000,
    }
    price += gpu_premium.get(gpu, 0)

    # Weight: the lighter, the more expensive.
    price += (3.0 - weight) * 8000

    # Battery
    price += battery_size * 250

    # Release year: older machines are cheaper. The age term used to be
    # added, which made old laptops the most expensive ones and left the
    # price floor below unreachable.
    price -= (current_year - release_year) * 5000

    # Random noise to widen the spread.
    price += np.random.normal(0, 5000)

    return max(round(price, 2), 5000)
# Synthetic data generation function
def generate_synthetic_data(num_samples=100000):
    """Build a DataFrame of randomly generated laptops with computed prices.

    Each row is produced by drawing every feature with the module's
    ``generate_*`` helpers and passing the result to ``calculate_price``.

    Args:
        num_samples: Number of rows to generate. Zero (or a negative value)
            yields an empty frame that still carries all expected columns.

    Returns:
        A ``pandas.DataFrame`` with one row per sample and the columns
        ``brand, processor, ram, os, ssd, display, gpu, weight,
        battery_size, release_year, display_type, price``.
    """
    columns = [
        'brand', 'processor', 'ram', 'os', 'ssd', 'display', 'gpu',
        'weight', 'battery_size', 'release_year', 'display_type', 'price',
    ]

    rows = []
    for _ in range(num_samples):
        # The draw order below is kept fixed so that a given seed keeps
        # producing the same sequence of feature values.
        brand = generate_brand()
        processor = generate_processor()
        os_name = generate_os()
        gpu = generate_gpu()
        display = generate_display()
        display_type = generate_display_type()
        ram = generate_ram()
        ssd = generate_ssd()
        weight = generate_weight()
        battery_size = generate_battery_size()
        release_year = generate_release_year()

        price = calculate_price(
            brand, processor, ram, os_name, ssd, display, gpu, weight,
            battery_size, release_year, display_type,
        )

        rows.append({
            'brand': brand,
            'processor': processor,
            'ram': ram,
            'os': os_name,
            'ssd': ssd,
            'display': display,
            'gpu': gpu,
            'weight': weight,
            'battery_size': battery_size,
            'release_year': release_year,
            'display_type': display_type,
            'price': price,
        })

    # Passing the column list explicitly keeps the schema stable even when
    # no rows were generated.
    return pd.DataFrame(rows, columns=columns)
# Generate the dataset, print a short preview and price statistics, then
# write everything to a CSV file in the current working directory.
# NOTE(review): these statements run whenever the module is imported;
# consider an `if __name__ == "__main__":` guard if import-time execution
# is not relied upon elsewhere.
print("Генерация синтетических данных...")
synthetic_df = generate_synthetic_data(100000)

# Show the first few rows.
print("\nПример данных после генерации:")
print(synthetic_df.head(5))

# Show summary statistics for the price column.
print("\nСтатистика по ценам:")
print(synthetic_df['price'].describe())

# Save to CSV without the index column.
synthetic_df.to_csv(path_or_buf='synthetic_laptops.csv', index=False)
print("\nСинтетические данные сохранены в 'synthetic_laptops.csv'.")