2024-10-14 18:19:44 +04:00
|
|
|
|
import pandas as pd
|
|
|
|
|
import numpy as np
|
|
|
|
|
import random
|
|
|
|
|
import re
|
|
|
|
|
from datetime import datetime
|
|
|
|
|
|
|
|
|
|
# Seed both random number generators so repeated runs draw the same values.
# `random` drives the categorical feature choices; NumPy's global generator
# supplies the price noise.
np.random.seed(42)
random.seed(42)
|
|
|
|
|
|
|
|
|
|
# Candidate values for every generated feature.
brands = ['Dell', 'HP', 'Lenovo', 'Apple', 'Asus', 'Acer', 'MSI', 'Microsoft', 'Samsung', 'Toshiba']
processors = [
    'Intel Core i3 10th Gen', 'Intel Core i5 10th Gen', 'Intel Core i7 10th Gen',
    'AMD Ryzen 3 4000 Series', 'AMD Ryzen 5 4000 Series', 'AMD Ryzen 7 4000 Series'
]
oss = ['Windows 10', 'Windows 11', 'macOS', 'Linux']
gpus = ['Integrated', 'NVIDIA GeForce GTX 1650', 'NVIDIA GeForce RTX 3060', 'AMD Radeon RX 5600M']
display_sizes = [13.3, 14.0, 15.6, 17.3]
display_types = ['HD', 'Full HD', '4K', 'OLED']
ram_options = [4, 8, 16, 32]  # in GB
ssd_options = [0, 256, 512, 1024]  # in GB
weights = [1.2, 1.5, 2.0, 2.5, 3.0]  # in kg
battery_sizes = [45, 60, 70, 90, 100]  # in Wh
# From 2015 through the current calendar year (inclusive); evaluated once,
# when this statement runs.
release_years = list(range(2015, datetime.now().year + 1))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Функции для генерации признаков
|
|
|
|
|
def generate_brand():
    """Return one brand name picked at random from the module-level ``brands`` list."""
    return random.choice(brands)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_processor():
    """Return one processor description picked at random from ``processors``."""
    return random.choice(processors)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_os():
    """Return one operating-system name picked at random from ``oss``."""
    return random.choice(oss)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_gpu():
    """Return one GPU name picked at random from ``gpus``."""
    return random.choice(gpus)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_display():
    """Return one display size picked at random from ``display_sizes``."""
    return random.choice(display_sizes)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_display_type():
    """Return one display type picked at random from ``display_types``."""
    return random.choice(display_types)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_ram():
    """Return one RAM capacity picked at random from ``ram_options``."""
    return random.choice(ram_options)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_ssd():
    """Return one SSD capacity picked at random from ``ssd_options``."""
    return random.choice(ssd_options)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_weight():
    """Return one laptop weight picked at random from ``weights``."""
    return random.choice(weights)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_battery_size():
    """Return one battery capacity picked at random from ``battery_sizes``."""
    return random.choice(battery_sizes)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_release_year():
    """Return one release year picked at random from ``release_years``."""
    return random.choice(release_years)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Функция для расчёта цены
|
|
|
|
|
# Price model constants, built once instead of on every call.
_BASE_PRICE = 30000  # base price in arbitrary currency units
_MIN_PRICE = 5000  # floor applied to the final price
_DEFAULT_BRAND_PREMIUM = 10000  # used for brands missing from the table
_BRAND_PREMIUM = {
    'Apple': 40000, 'MSI': 35000, 'Dell': 15000, 'HP': 12000, 'Lenovo': 10000,
    'Microsoft': 18000, 'Asus': 8000, 'Acer': 7000, 'Samsung': 9000, 'Toshiba': 8500,
}
# Keyed by processor family; matched as a substring of the full description.
_PROCESSOR_PREMIUM = {
    'Intel Core i3': 5000, 'Intel Core i5': 10000, 'Intel Core i7': 15000,
    'AMD Ryzen 3': 5000, 'AMD Ryzen 5': 10000, 'AMD Ryzen 7': 15000,
}
_DISPLAY_TYPE_PREMIUM = {'HD': 0, 'Full HD': 12000, '4K': 30000, 'OLED': 35000}
_GPU_PREMIUM = {
    'Integrated': 0,
    'NVIDIA GeForce GTX 1650': 25000,
    'NVIDIA GeForce RTX 3060': 40000,
    'AMD Radeon RX 5600M': 35000,
}


def calculate_price(brand, processor, ram, os, ssd, display, gpu, weight,
                    battery_size, release_year, display_type, current_year=None):
    """Compute a synthetic laptop price from its specification.

    The price starts at a base of 30000 and accumulates one additive term per
    feature. Gaussian noise (mean 0, standard deviation 5000) drawn from
    NumPy's global generator is then added, and the result is rounded to two
    decimals and floored at 5000.

    Args:
        brand: Brand name. Brands missing from the premium table add 10000.
        processor: Processor description. The premium of the first family
            name (e.g. ``'Intel Core i5'``) contained in the string is added;
            if none matches, nothing is added.
        ram: RAM capacity; adds 1000 per unit.
        os: Operating system name. Accepted for interface compatibility; it
            does not influence the price.
        ssd: SSD capacity; adds 50 per unit.
        display: Display size; adds 5000 per unit above 13.
        gpu: GPU name. GPUs missing from the premium table add 0.
        weight: Laptop weight; adds 8000 per unit below 3.0, so lighter
            machines cost more.
        battery_size: Battery capacity; adds 250 per unit.
        release_year: Release year; adds 5000 per year of age relative to
            ``current_year``.
        display_type: Display type. Types missing from the premium table add 0.
        current_year: Reference year for the age term. Defaults to the current
            calendar year; pass an explicit value to get results that do not
            depend on the wall clock.

    Returns:
        The price rounded to two decimals, or 5000 if the computed value
        falls below that floor.
    """
    if current_year is None:
        current_year = datetime.now().year

    price = _BASE_PRICE
    price += _BRAND_PREMIUM.get(brand, _DEFAULT_BRAND_PREMIUM)

    # Only the first matching processor family contributes.
    price += next(
        (premium for family, premium in _PROCESSOR_PREMIUM.items() if family in processor),
        0,
    )

    price += ram * 1000
    price += ssd * 50
    price += (display - 13) * 5000
    price += _DISPLAY_TYPE_PREMIUM.get(display_type, 0)
    price += _GPU_PREMIUM.get(gpu, 0)
    price += (3.0 - weight) * 8000  # the lighter, the more expensive
    price += battery_size * 250

    # NOTE(review): this term grows with age, so older models come out more
    # expensive than newer ones — confirm that this direction is intended.
    price += (current_year - release_year) * 5000

    # Random noise to widen the spread of prices.
    noise = np.random.normal(0, 5000)
    final_price = price + noise

    return max(round(final_price, 2), _MIN_PRICE)
|
2024-10-14 18:19:44 +04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Функция для генерации синтетических данных
|
|
|
|
|
def generate_synthetic_data(num_samples=100000):
    """Generate a table of random laptop specifications with synthetic prices.

    Each row is built by drawing every feature from its ``generate_*``
    function and pricing the result with ``calculate_price``.

    Args:
        num_samples: Number of rows to generate.

    Returns:
        A ``pandas.DataFrame`` with the columns ``brand``, ``processor``,
        ``ram``, ``os``, ``ssd``, ``display``, ``gpu``, ``weight``,
        ``battery_size``, ``release_year``, ``display_type`` and ``price``,
        in that order.
    """
    # Column order of the resulting table (price is appended last).
    feature_columns = (
        'brand', 'processor', 'ram', 'os', 'ssd', 'display', 'gpu',
        'weight', 'battery_size', 'release_year', 'display_type',
    )

    rows = []
    for _ in range(num_samples):
        # Keep this draw order: changing it changes which values each feature
        # receives from the shared `random` generator.
        specs = {
            'brand': generate_brand(),
            'processor': generate_processor(),
            'os': generate_os(),
            'gpu': generate_gpu(),
            'display': generate_display(),
            'display_type': generate_display_type(),
            'ram': generate_ram(),
            'ssd': generate_ssd(),
            'weight': generate_weight(),
            'battery_size': generate_battery_size(),
            'release_year': generate_release_year(),
        }

        row = {column: specs[column] for column in feature_columns}
        row['price'] = calculate_price(**specs)
        rows.append(row)

    return pd.DataFrame(rows)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Build the dataset. NOTE: this runs whenever the module is executed or imported.
print("Генерация синтетических данных...")
synthetic_df = generate_synthetic_data(num_samples=100000)

# Show the first few rows.
print("\nПример данных после генерации:")
print(synthetic_df.head())

# Inspect the price distribution.
print("\nСтатистика по ценам:")
print(synthetic_df['price'].describe())

# Save to CSV (written to the current working directory, without the index column).
synthetic_df.to_csv('synthetic_laptops.csv', index=False)
print("\nСинтетические данные сохранены в 'synthetic_laptops.csv'.")
|