price-builder-backend/scraping/scrappingLaptop.py
2024-12-11 00:20:39 +04:00

123 lines
5.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
def init_driver():
    """Create a Chrome WebDriver whose window starts maximized.

    The chromedriver executable path is obtained from
    ``ChromeDriverManager().install()`` and handed to a ``Service``.

    Returns:
        The ``webdriver.Chrome`` instance built from those options.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--start-maximized")
    driver_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=driver_service, options=chrome_options)
def _collect_specs(laptop_blocks):
    """Build one ``{label: value}`` dict per product card element."""
    specs_list = []
    for laptop in laptop_blocks:
        specs = {}
        spec_items = laptop.find_elements(By.CSS_SELECTOR, 'li.app-catalog-12y5psc.e4qu3682')
        for item in spec_items:
            # Best-effort: a failure on one spec line is logged and skipped
            # so the rest of the card is still collected.
            try:
                label = item.find_element(By.CSS_SELECTOR, 'span').text.strip()
                # Remove only the first occurrence of the label so a value
                # that happens to repeat the label text is left intact.
                specs[label] = item.text.replace(label, '', 1).strip()
            except Exception as e:
                print(f"Ошибка в характеристике: {e}")
        specs_list.append(specs)
    return specs_list


def _attach_prices(driver, specs_list):
    """Store the page's prices under the "Цена" key of each specs dict."""
    price_blocks = driver.find_elements(By.CSS_SELECTOR, 'div.app-catalog-817h00.ean5xps0')
    # NOTE(review): prices are paired with cards purely by position on the
    # page; this assumes both selectors yield items in the same order and
    # count - confirm against the live markup.
    # zip() stops at the shorter list, so surplus price blocks are ignored
    # and cards without a price block simply get no "Цена" key.
    for specs, price_block in zip(specs_list, price_blocks):
        try:
            price_element = price_block.find_element(By.CSS_SELECTOR, 'span.e1j9birj0')
            specs["Цена"] = price_element.text.strip()
        except Exception as e:
            print(f"Ошибка при скрапинге цены: {e}")
            specs["Цена"] = "Не указана"


def scrape_all_pages(base_url, max_pages=5):
    """Scrape laptop specs and prices from up to ``max_pages`` catalog pages.

    Opens ``base_url`` in a fresh Chrome driver, collects the spec list and
    price of every product card on the page, then clicks the "next page"
    control and repeats. Scraping stops when ``max_pages`` pages have been
    read or when moving to the next page fails.

    Args:
        base_url: URL of the first catalog page.
        max_pages: Maximum number of pages to read. Values below 1 load the
            first page but collect nothing.

    Returns:
        A list with one dict per product card, mapping the scraped spec
        labels to their values, plus a "Цена" entry when a price block was
        found at the same position.

    Raises:
        Whatever ``WebDriverWait.until`` raises when no product card appears
        within 15 seconds. The browser is still closed, but rows gathered
        from earlier pages are not returned in that case.
    """
    product_css = 'div.app-catalog-1o4umte.ec53oil0'
    next_button_xpath = '//*[@id="__next"]/div/main/section/div[2]/div/div/section/div[2]/div[3]/div/div[2]/div[3]/a/div'

    driver = init_driver()
    all_laptops_specs = []
    try:
        driver.get(base_url)
        current_page = 1
        while current_page <= max_pages:
            # Wait until the product cards are present in the DOM.
            WebDriverWait(driver, 15).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, product_css))
            )
            laptop_blocks = driver.find_elements(By.CSS_SELECTOR, product_css)

            specs_list = _collect_specs(laptop_blocks)
            _attach_prices(driver, specs_list)
            all_laptops_specs.extend(specs_list)

            # The last requested page has been read: do not click "next"
            # and wait for a page whose data would never be used.
            if current_page >= max_pages:
                break

            # Any failure while paginating is treated as the end of the
            # catalog: it is reported and what was collected is returned.
            try:
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, next_button_xpath))
                )
                # Scroll the button into the middle of the viewport before clicking.
                driver.execute_script("arguments[0].scrollIntoView({block: 'center', inline: 'center'});", next_button)
                next_button.click()
                # The first card of the old page going stale signals that
                # the listing has been replaced by the next page.
                WebDriverWait(driver, 10).until(EC.staleness_of(laptop_blocks[0]))
                current_page += 1
            except Exception as e:
                print("Кнопка 'Следующая' не найдена или конец каталога:", e)
                break
    finally:
        # Always close the browser, even when scraping fails midway.
        driver.quit()
    return all_laptops_specs
def save_to_csv(data, filename, ignore_fields=None):
    """Write scraped laptop rows to a UTF-8 CSV file with English columns.

    Each output column is filled from one scraped label; a label missing
    from a row yields an empty cell, and labels without a column are not
    written.

    Args:
        data: Iterable of dicts keyed by the scraped labels
            (e.g. "Процессор", "Цена").
        filename: Path of the CSV file to create or overwrite.
        ignore_fields: Optional iterable of fields to leave out of the file.
            An entry may be an output column name (e.g. "price") or the
            scraped label that feeds it (e.g. "Цена"). Entries matching
            neither are ignored. Ignored columns are removed from the
            header as well as from every row.
    """
    ignored = set(ignore_fields) if ignore_fields is not None else set()

    # Output column -> scraped label it is read from, in header order.
    column_sources = (
        ("processor", "Процессор"),
        ("ram", "Оперативная память"),
        ("os", "Операционная система"),
        ("ssd", "Диск"),
        ("display", "Экран"),
        ("gpu", "Графический процессор"),
        ("price", "Цена"),
    )
    active_columns = [
        (column, label)
        for column, label in column_sources
        if column not in ignored and label not in ignored
    ]
    fieldnames = [column for column, _ in active_columns]

    # newline='' lets the csv module control line endings itself.
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(
            {column: row.get(label, "") for column, label in active_columns}
            for row in data
        )
if __name__ == "__main__":
    # Script entry point: scrape the laptop catalog and dump it to a CSV file.
    catalog_url = 'https://www.citilink.ru/catalog/noutbuki/?ref=mainpage'
    # Single source of truth for the output path, used for both writing
    # and the confirmation message.
    output_file = 'laptops.csv'

    laptops = scrape_all_pages(catalog_url, max_pages=20)

    # NOTE(review): none of these labels is among the columns that
    # save_to_csv writes, so this list has no effect on the output.
    ignore_fields = ["Технология Intel", "Комплектация", "Клавиатура"]
    save_to_csv(laptops, output_file, ignore_fields)
    print(f"Данные сохранены в файл '{output_file}'.")