price-builder-backend/scraping/scrapingMain.py

68 lines
3.0 KiB
Python

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
def init_driver():
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
def scrape_all_pages(base_url, max_pages=5):
driver = init_driver()
all_laptops_specs = []
try:
driver.get(base_url)
current_page = 1
while current_page <= max_pages:
# Ожидание загрузки товаров
WebDriverWait(driver, 15).until(
EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.app-catalog-1o4umte.ec53oil0'))
)
# Скрапинг товаров
laptop_blocks = driver.find_elements(By.CSS_SELECTOR, 'div.app-catalog-1o4umte.ec53oil0')
for laptop in laptop_blocks:
specs = {}
spec_items = laptop.find_elements(By.CSS_SELECTOR, 'li.app-catalog-12y5psc.e4qu3682')
for item in spec_items:
try:
label_elem = item.find_element(By.CSS_SELECTOR, 'span')
label = label_elem.text.strip()
value = item.text.replace(label, '').strip()
specs[label] = value
except Exception as e:
print(f"Ошибка в характеристике: {e}")
if specs:
all_laptops_specs.append(specs)
# Переход на следующую страницу
try:
next_button = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.XPATH,
'//*[@id="__next"]/div/main/section/div[2]/div/div/section/div[2]/div[3]/div/div[2]/div[3]/a/div'))
)
# Прокрутка к кнопке
driver.execute_script("arguments[0].scrollIntoView({block: 'center', inline: 'center'});", next_button)
next_button.click()
WebDriverWait(driver, 10).until(EC.staleness_of(laptop_blocks[0]))
current_page += 1 # Переход на следующую страницу
except Exception as e:
print("Кнопка 'Следующая' не найдена или конец каталога:", e)
break
finally:
driver.quit()
return all_laptops_specs
if __name__ == "__main__":
url = 'https://www.citilink.ru/catalog/noutbuki/?ref=mainpage'
laptops = scrape_all_pages(url, max_pages=5) # Установите количество страниц
for i, specs in enumerate(laptops, 1):
print(f'Ноутбук {i}: {specs}')