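# Scrape laptop spec listings from the Citilink catalogue with Selenium and save them to a CSV file.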
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


def init_driver():
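    """Create a Chrome WebDriver; webdriver-manager downloads a matching chromedriver binary."""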
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
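    # A headless run could be configured here as well, e.g. options.add_argument("--headless=new")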
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)


def scrape_all_pages(base_url, max_pages=5):
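    """Walk the catalogue page by page, up to max_pages, collecting a dict of specs per laptop."""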
    driver = init_driver()
    all_laptops_specs = []

    try:
        driver.get(base_url)
        current_page = 1

        while current_page <= max_pages:
            # Wait for the product cards to load (these CSS class names are build-generated and may change)
            WebDriverWait(driver, 15).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.app-catalog-1o4umte.ec53oil0'))
            )

            # Scrape the product cards on the current page
            laptop_blocks = driver.find_elements(By.CSS_SELECTOR, 'div.app-catalog-1o4umte.ec53oil0')
            for laptop in laptop_blocks:
                specs = {}
                spec_items = laptop.find_elements(By.CSS_SELECTOR, 'li.app-catalog-12y5psc.e4qu3682')
                for item in spec_items:
                    try:
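                        # Each spec <li> is assumed to start with a label <span>; removing that text leaves the value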
                        label_elem = item.find_element(By.CSS_SELECTOR, 'span')
                        label = label_elem.text.strip()
                        value = item.text.replace(label, '').strip()
                        specs[label] = value
                    except Exception as e:
                        print(f"Error reading a spec item: {e}")
                if specs:
                    all_laptops_specs.append(specs)

            # Move to the next page
            try:
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH,
                        '//*[@id="__next"]/div/main/section/div[2]/div/div/section/div[2]/div[3]/div/div[2]/div[3]/a/div'))
                )
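                # NB: this absolute XPath mirrors the page layout at the time of writing and is fragile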
                # Scroll the button into view before clicking
                driver.execute_script("arguments[0].scrollIntoView({block: 'center', inline: 'center'});", next_button)
                next_button.click()
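                # The old first card going stale signals that the next page has started rendering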
                WebDriverWait(driver, 10).until(EC.staleness_of(laptop_blocks[0]))
                current_page += 1
            except Exception as e:
                print("Next-page button not found or end of catalogue reached:", e)
                break
    finally:
        driver.quit()

    return all_laptops_specs


def save_to_csv(data, filename):
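    """Write a list of spec dicts to CSV, using the union of all keys as the header row."""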
    # Collect the union of all column headers across rows
    fieldnames = set()
    for row in data:
        fieldnames.update(row.keys())

    # Write the data to CSV; sorting the headers keeps the column order stable between runs
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=sorted(fieldnames))
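        # Fields missing from a row are written as empty strings (DictWriter's restval default)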
        writer.writeheader()  # write the header row
        for row in data:
            writer.writerow(row)


if __name__ == "__main__":
    url = 'https://www.citilink.ru/catalog/noutbuki/?ref=mainpage'
    laptops = scrape_all_pages(url, max_pages=2)  # set how many pages to scrape
    save_to_csv(laptops, 'laptops_specs.csv')  # save the results to a CSV file
    print("Data saved to 'laptops_specs.csv'.")