Feature/parsing-service: Add Ozon parsing; needs testing on another system

This commit is contained in:
danil.markov 2024-10-13 17:44:52 +04:00
parent f58b0a4a02
commit ef2240e8ab
17 changed files with 538 additions and 75 deletions

View File

@ -0,0 +1,18 @@
<component name="ProjectRunConfigurationManager">
  <!-- Local Spring Boot launch configuration for the parsing service. -->
  <configuration default="false" name="ParsingService [local]" type="SpringBootApplicationConfigurationType" factoryName="Spring Boot">
    <!--
      "headless" is activated together with "dev": the WebDriver beans in WebDriverConfig are
      declared only under the "visible" or "headless" profile, so with "dev" alone no WebDriver
      bean is available for WebDriverPool. Switch to "dev,visible" to watch the browser.
    -->
    <option name="ACTIVE_PROFILES" value="dev,headless" />
    <option name="SCHEDULED_DEBUGGER" value="true" />
    <envs>
      <env name="JDBC_PASSWORD" value="postgres" />
      <env name="JDBC_URL" value="localhost:5432/parsed_data" />
      <env name="JDBC_USERNAME" value="postgres" />
      <env name="SERVER_PORT" value="8080" />
      <env name="WEBDRIVER_CHROME_PATH" value="$PROJECT_DIR$/parsing-service/web-driver/chromedriver" />
    </envs>
    <module name="parsing-service.main" />
    <option name="SPRING_BOOT_MAIN_CLASS" value="ru.pricepulse.parsingservice.ParsingServiceApplication" />
    <method v="2">
      <option name="Make" enabled="true" />
    </method>
  </configuration>
</component>

View File

@ -1,6 +1,6 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="ParsingService [local]" type="SpringBootApplicationConfigurationType" factoryName="Spring Boot">
<option name="ACTIVE_PROFILES" value="dev" />
<option name="ACTIVE_PROFILES" value="dev,headless" />
<option name="SCHEDULED_DEBUGGER" value="true" />
<envs>
<env name="JDBC_PASSWORD" value="postgres" />

View File

@ -25,6 +25,7 @@ repositories {
ext {
jsoupVesion = '1.18.1'
seleniumVersion = '4.25.0'
}
dependencies {
@ -33,6 +34,9 @@ dependencies {
implementation 'org.liquibase:liquibase-core'
implementation 'org.springframework.kafka:spring-kafka'
implementation "org.jsoup:jsoup:${jsoupVesion}"
implementation "org.seleniumhq.selenium:selenium-java:${seleniumVersion}"
implementation 'io.github.bonigarcia:webdrivermanager:5.5.0'
implementation 'org.apache.commons:commons-pool2:2.12.0'
compileOnly 'org.projectlombok:lombok'

View File

@ -0,0 +1,8 @@
package ru.pricepulse.parsingservice.config;
import org.springframework.context.annotation.Configuration;
import org.springframework.retry.annotation.EnableRetry;
/**
 * Configuration class whose only purpose is to carry {@code @EnableRetry},
 * switching on Spring Retry annotation support for the application context.
 */
@Configuration
@EnableRetry
public class RetryConfig {}

View File

@ -0,0 +1,10 @@
package ru.pricepulse.parsingservice.config;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.context.annotation.Configuration;
import ru.pricepulse.parsingservice.config.properties.SeleniumConfigProperties;
/**
 * Configuration class that enables {@link SeleniumConfigProperties} as a
 * {@code @ConfigurationProperties} bean. It declares no beans of its own.
 */
@Configuration
@EnableConfigurationProperties(SeleniumConfigProperties.class)
public class SeleniumConfig {
}

View File

@ -0,0 +1,59 @@
package ru.pricepulse.parsingservice.config;
import java.util.HashMap;
import java.util.Map;
import io.github.bonigarcia.wdm.WebDriverManager;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.springframework.beans.factory.config.ConfigurableBeanFactory;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Profile;
import org.springframework.context.annotation.Scope;
/**
 * Declares the Selenium beans used for browser-driven parsing.
 *
 * <p>Exactly one {@link WebDriver} bean definition is active depending on the Spring profile:
 * {@code visible} or {@code headless}. Both driver beans are prototype-scoped, so every lookup
 * starts a new {@link ChromeDriver}.
 */
@Configuration
public class WebDriverConfig {

    /** Value written for every content setting that this configuration switches off. */
    private static final int CONTENT_SETTING_OFF = 2; // presumably Chrome's "block" value; confirm

    /**
     * Creates a Chrome driver for the {@code visible} profile with the image and geolocation
     * content settings set to {@code 2}.
     *
     * @return a new {@link ChromeDriver} instance
     */
    @Bean
    @Profile("visible")
    @Scope(ConfigurableBeanFactory.SCOPE_PROTOTYPE)
    public WebDriver webDriverVisible() {
        ChromeOptions visibleOptions = new ChromeOptions();
        visibleOptions.setExperimentalOption("prefs", contentSettingsOff(
                "profile.managed_default_content_settings.images",
                "profile.managed_default_content_settings.geolocation"));
        WebDriverManager.chromedriver().setup();
        return new ChromeDriver(visibleOptions);
    }

    /**
     * Creates a Chrome driver for the {@code headless} profile from the shared options bean.
     *
     * @param options the {@link #chromeOptions()} bean
     * @return a new {@link ChromeDriver} instance
     */
    @Bean
    @Profile("headless")
    @Scope(ConfigurableBeanFactory.SCOPE_PROTOTYPE)
    public WebDriver webDriverHeadless(ChromeOptions options) {
        WebDriverManager.chromedriver().setup();
        return new ChromeDriver(options);
    }

    /**
     * Builds the Chrome options used by the headless driver: image and stylesheet content
     * settings set to {@code 2}, a fixed user agent and the headless/container launch flags.
     *
     * @return the configured options
     */
    @Bean
    @Profile("headless")
    public ChromeOptions chromeOptions() {
        ChromeOptions headlessOptions = new ChromeOptions();
        headlessOptions.setExperimentalOption("prefs", contentSettingsOff(
                "profile.managed_default_content_settings.images",
                "profile.managed_default_content_settings.stylesheets"));
        // A fixed window size ("--window-size=1920,2000") was tried here and is currently disabled.
        headlessOptions.addArguments(
                "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
                "--headless",
                "--disable-gpu",
                "--no-sandbox",
                "--disable-dev-shm-usage");
        return headlessOptions;
    }

    /** Builds a Chrome "prefs" map that assigns {@link #CONTENT_SETTING_OFF} to each given key. */
    private static Map<String, Object> contentSettingsOff(String... settingKeys) {
        Map<String, Object> chromePrefs = new HashMap<>();
        for (String settingKey : settingKeys) {
            chromePrefs.put(settingKey, CONTENT_SETTING_OFF);
        }
        return chromePrefs;
    }
}

View File

@ -0,0 +1,7 @@
package ru.pricepulse.parsingservice.config.properties;
import org.springframework.boot.context.properties.ConfigurationProperties;
/**
 * Binding target for configuration keys under the {@code selenium} prefix.
 * No properties are declared yet.
 */
@ConfigurationProperties("selenium")
public class SeleniumConfigProperties {
}

View File

@ -22,6 +22,7 @@ import ru.pricepulse.parsingservice.persistence.enums.MarketplaceEnum;
@Entity
@Table(name = "product")
public class ProductEntity {
@Id
@GeneratedValue(strategy = GenerationType.IDENTITY)
@Column(name = "id", nullable = false)
@ -69,4 +70,5 @@ public class ProductEntity {
protected void onCreate() {
createdAt = LocalDateTime.now();
}
}

View File

@ -0,0 +1,67 @@
package ru.pricepulse.parsingservice.pool;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;
import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.springframework.beans.factory.ObjectFactory;
import org.springframework.stereotype.Component;
/**
 * Fixed-size pool of Selenium {@link WebDriver} instances.
 *
 * <p>All drivers are created eagerly in the constructor. Callers take a driver with
 * {@link #borrowDriver()} and must hand it back with {@link #returnDriver(WebDriver)}.
 *
 * <p>NOTE(review): nothing in this class calls {@link #shutdownPool()} automatically; consider
 * wiring it to the application shutdown so browser processes are not left running.
 */
@Slf4j
@Component
public class WebDriverPool {

    /** Number of drivers created when the pool is constructed. */
    private static final int POOL_SIZE = 12;

    private final Queue<WebDriver> availableDrivers = new ConcurrentLinkedQueue<>(); // drivers free to borrow
    private final Queue<WebDriver> busyDrivers = new ConcurrentLinkedQueue<>(); // drivers currently handed out

    private final ObjectFactory<WebDriver> webDriverFactory;

    /**
     * Creates the pool and fills it with {@value #POOL_SIZE} drivers.
     *
     * @param webDriverFactory factory that produces a new driver on every {@code getObject()} call
     * @throws RuntimeException if a driver cannot be created; drivers created so far are quit first
     */
    public WebDriverPool(ObjectFactory<WebDriver> webDriverFactory) {
        this.webDriverFactory = webDriverFactory;
        try {
            for (int i = 0; i < POOL_SIZE; i++) {
                availableDrivers.add(createNewDriver());
            }
        } catch (RuntimeException e) {
            // Do not leave already-started browsers behind when the pool cannot be fully built.
            quitAll(availableDrivers);
            throw e;
        }
    }

    /** Creates a new WebDriver instance through the factory. */
    private WebDriver createNewDriver() {
        return webDriverFactory.getObject();
    }

    /**
     * Takes a free driver out of the pool.
     *
     * @return a driver, or {@code null} when every driver is currently borrowed
     */
    public WebDriver borrowDriver() {
        WebDriver driver = availableDrivers.poll();
        if (driver == null) {
            log.warn("Нет свободных драйверов в пуле");
            return null;
        }
        busyDrivers.add(driver);
        log.info("Занимаем драйвер {}", driver);
        return driver;
    }

    /**
     * Hands a borrowed driver back to the pool.
     *
     * <p>The call is ignored for {@code null} and for drivers that are not currently marked as
     * borrowed. This makes a repeated return harmless: the same driver can never end up in the
     * available queue twice and be handed to two callers at once.
     *
     * @param driver the driver previously obtained from {@link #borrowDriver()}
     */
    public void returnDriver(WebDriver driver) {
        if (driver == null || !busyDrivers.remove(driver)) {
            log.debug("Драйвер не числится занятым, возврат пропущен: {}", driver);
            return;
        }
        availableDrivers.add(driver);
        log.info("Вернули драйвер {}", driver);
    }

    /**
     * Quits every driver known to the pool, both free and borrowed, and empties the pool.
     * A driver that fails to quit is logged and does not prevent the others from being closed.
     */
    public void shutdownPool() {
        quitAll(availableDrivers);
        quitAll(busyDrivers);
    }

    /** Drains the queue, quitting each driver and logging (not propagating) quit failures. */
    private void quitAll(Queue<WebDriver> drivers) {
        WebDriver driver;
        while ((driver = drivers.poll()) != null) {
            try {
                driver.quit();
            } catch (RuntimeException e) {
                log.warn("Не удалось закрыть драйвер {}", driver, e);
            }
        }
    }
}

View File

@ -1,67 +0,0 @@
package ru.pricepulse.parsingservice.service.impl.parsing;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicBoolean;
import lombok.extern.slf4j.Slf4j;
import org.springframework.retry.annotation.Recover;
import org.springframework.retry.annotation.Retryable;
import org.springframework.stereotype.Service;
import ru.pricepulse.parsingservice.service.DataParser;
import ru.pricepulse.parsingservice.service.MarketplaceParsingService;
import ru.pricepulse.parsingservice.service.request.PageFetcher;
@Slf4j
@Service
public class OzonParsingService implements MarketplaceParsingService {
private final PageFetcher pageFetcher;
private final DataParser dataParser;
private final ExecutorService executorService;
public OzonParsingService(PageFetcher pageFetcher, DataParser dataParser) {
this.pageFetcher = pageFetcher;
this.dataParser = dataParser;
this.executorService = Executors.newFixedThreadPool(1);
}
@Override
public void processCategory(String categoryUrl) {
int pageNumber = 1;
AtomicBoolean hasMoreData = new AtomicBoolean(true);
while (hasMoreData.get()) {
int finalPageNumber = pageNumber;
executorService.submit(() -> processTask(categoryUrl, finalPageNumber, hasMoreData));
pageNumber++;
}
}
@Retryable
private void processTask(String categoryUrl, int pageNumber, AtomicBoolean hasMoreData) {
String pageUrl = categoryUrl + "?page=" + pageNumber;
String pageContent;
try {
log.info("Получение страницы {}", pageUrl);
pageContent = pageFetcher.fetchPage(pageUrl);
} catch (Exception e) {
log.error("Ошибка получения страницы - {} \n {}", pageUrl, e.getMessage(), e);
throw new RuntimeException(e);
}
if (!dataParser.pageHasData(pageContent)) {
log.warn("Данные не найдены - {}", pageUrl);
hasMoreData.set(false);
return;
}
dataParser.parseAndQueueData(pageContent);
}
@Recover
private void recover(Exception e, String categoryUrl, int pageNumber, AtomicBoolean hasMoreData) {
log.error(e.getMessage(), e);
}
}

View File

@ -0,0 +1,7 @@
package ru.pricepulse.parsingservice.service.marketplace.ozon;
/**
 * Page object of a marketplace site that can tell whether it is the page
 * currently shown by the browser.
 */
public interface MarketplacePage {
/**
 * @return {@code true} if this page is currently loaded, {@code false} otherwise
 */
boolean isLoaded();
}

View File

@ -0,0 +1,67 @@
package ru.pricepulse.parsingservice.service.marketplace.ozon.page;
import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
import ru.pricepulse.parsingservice.service.marketplace.ozon.MarketplacePage;
/**
 * Page object for the Ozon "access restricted" page, recognised by its warning image together
 * with its "Доступ ограничен" heading.
 */
@Slf4j
public class AccessDeniedPage implements MarketplacePage {

    private static final String RELOAD_BUTTON_ID = "reload-button";
    private static final String RELOAD_BUTTON_XPATH = "//button[contains(text(),'Обновить')]";
    private static final String WARNING_IMAGE_CSS = "img[alt='warning']";
    private static final String ACCESS_DENIED_TEXT_XPATH = "//h1[text()='Доступ ограничен']";

    private final By reloadButtonById = By.id(RELOAD_BUTTON_ID);
    private final By reloadButtonByXpath = By.xpath(RELOAD_BUTTON_XPATH);
    private final By warningImage = By.cssSelector(WARNING_IMAGE_CSS);
    private final By accessDeniedText = By.xpath(ACCESS_DENIED_TEXT_XPATH);

    private final WebDriver driver;
    private final WebDriverWait wait; // not used by this page yet; kept for constructor compatibility

    public AccessDeniedPage(WebDriver driver,
                            WebDriverWait wait) {
        this.driver = driver;
        this.wait = wait;
    }

    /**
     * Clicks the reload button, looking it up by id first and by xpath as a fallback.
     * If neither lookup-and-click succeeds, a warning is logged and the method returns normally.
     */
    public void clickReloadButton() {
        if (tryClick(reloadButtonById, "id")) {
            return;
        }
        if (tryClick(reloadButtonByXpath, "xpath")) {
            log.debug("Успешно нашли кнопку по xpath");
            return;
        }
        log.warn("Кнопка обновления страницы не найдена ни по id, ни по xpath");
    }

    /**
     * Finds the element and clicks it.
     *
     * @param locator how to find the button
     * @param locatorKind locator kind used in log messages
     * @return {@code true} if the element was found and clicked
     */
    private boolean tryClick(By locator, String locatorKind) {
        try {
            log.debug("Пытаемся найти кнопку по {} и нажать", locatorKind);
            driver.findElement(locator).click();
            return true;
        } catch (Exception e) {
            // Best effort: the caller falls back to the next locator.
            log.debug("Кнопка обновления страницы не найдена по {}", locatorKind, e);
            return false;
        }
    }

    /** Checks presence without relying on an exception: {@code findElement} never returns null. */
    private boolean isPresent(By locator) {
        return !driver.findElements(locator).isEmpty();
    }

    /**
     * @return {@code true} if both the warning image and the access-denied heading are present;
     *         {@code false} if either is missing or the lookup fails
     */
    @Override
    public boolean isLoaded() {
        try {
            return isPresent(warningImage) && isPresent(accessDeniedText);
        } catch (Exception e) {
            log.debug("Ошибка проверки страницы 'Доступ ограничен'", e);
            return false;
        }
    }
}

View File

@ -0,0 +1,79 @@
package ru.pricepulse.parsingservice.service.marketplace.ozon.page;
import static org.openqa.selenium.support.ui.ExpectedConditions.visibilityOfAllElements;
import static org.openqa.selenium.support.ui.ExpectedConditions.visibilityOfElementLocated;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.By;
import org.openqa.selenium.StaleElementReferenceException;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.springframework.retry.annotation.Recover;
import org.springframework.retry.annotation.Retryable;
import ru.pricepulse.parsingservice.service.marketplace.ozon.MarketplacePage;
/**
 * Page object for an Ozon category listing, recognised by its
 * {@code div[data-widget='searchResultsV2']} container.
 */
@Slf4j
public class CategoryPage implements MarketplacePage {

    private static final int PAGE_SIZE = 12;

    private static final String SEARCH_RESULTS = "div[data-widget='searchResultsV2']";
    private static final String DIRECT_CHILD_DIV = ":scope > div";

    private final By searchResults = By.cssSelector(SEARCH_RESULTS);
    private final By directChildDiv = By.cssSelector(DIRECT_CHILD_DIV);

    private final WebDriver driver;
    private final WebDriverWait wait;

    public CategoryPage(WebDriver driver, WebDriverWait wait) {
        this.driver = driver;
        this.wait = wait;
    }

    /**
     * Collects the product links shown on the category page.
     *
     * <p>Every step really waits for its condition. A lambda that merely returns an
     * {@code ExpectedCondition} object makes {@code wait.until} return at once, because that
     * object is neither {@code null} nor {@code false}. Conditions are therefore either passed
     * to {@code until} directly or evaluated inside the lambda.
     *
     * @return the distinct, non-empty {@code href} values of the first link of every product card
     * @throws org.openqa.selenium.TimeoutException if the listing or a card's link does not appear
     *         within the wait's timeout
     */
    public Set<String> getProductsLinks() {
        WebElement searchResultsElement = wait.until(visibilityOfElementLocated(searchResults));

        // Outer block that holds the list of products.
        WebElement outerDiv = wait.until(d -> {
            List<WebElement> children = searchResultsElement.findElements(directChildDiv);
            return !children.isEmpty() && children.getFirst().isDisplayed() ? children.getFirst() : null;
        });

        // One block per product card.
        List<WebElement> innerDivs = wait.until(d -> {
            List<WebElement> cards = outerDiv.findElements(directChildDiv);
            return cards.isEmpty() ? null : cards;
        });

        return searchProductsLinks(innerDivs);
    }

    /** Maps every product card to the href of its first link, dropping cards without one. */
    private Set<String> searchProductsLinks(List<WebElement> innerDivs) {
        return innerDivs.stream()
                .map(this::firstLinkHref)
                .filter(href -> href != null && !href.isEmpty())
                .collect(Collectors.toSet());
    }

    /**
     * Waits until the card contains a link and returns the first link's {@code href}.
     *
     * @return the href, or {@code null} if the links could not be read from the card
     */
    private String firstLinkHref(WebElement card) {
        waitVisibility(card);
        List<WebElement> linkTags = null;
        try {
            linkTags = card.findElements(By.tagName("a"));
        } catch (RuntimeException e) {
            // Best effort: a card that cannot be read is skipped instead of failing the page.
            log.debug("Не удалось получить ссылки из карточки товара", e);
        }
        return linkTags != null && !linkTags.isEmpty()
                ? linkTags.getFirst().getAttribute("href")
                : null;
    }

    /** Waits until the element contains at least one {@code <a>} tag. */
    private void waitVisibility(WebElement outerElement) {
        wait.until(d -> !outerElement.findElements(By.tagName("a")).isEmpty());
    }

    /**
     * @return {@code true} if the search-results container is present; {@code false} if it is
     *         missing or the lookup fails
     */
    @Override
    public boolean isLoaded() {
        try {
            return !driver.findElements(searchResults).isEmpty();
        } catch (Exception e) {
            return false;
        }
    }
}

View File

@ -0,0 +1,141 @@
package ru.pricepulse.parsingservice.service.marketplace.ozon.parsing;
import java.time.Duration;
import java.time.temporal.ChronoUnit;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.StaleElementReferenceException;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.slf4j.MDC;
import org.springframework.beans.factory.ObjectFactory;
import org.springframework.retry.annotation.Recover;
import org.springframework.retry.annotation.Retryable;
import org.springframework.stereotype.Service;
import ru.pricepulse.parsingservice.pool.WebDriverPool;
import ru.pricepulse.parsingservice.service.marketplace.ozon.page.AccessDeniedPage;
import ru.pricepulse.parsingservice.service.marketplace.ozon.page.CategoryPage;
@Slf4j
@Service
public class CategoryPageParsingService {
private final ExecutorService productPageExecutor = Executors.newFixedThreadPool(3);
private final WebDriverPool webDriverPool;
public CategoryPageParsingService(WebDriverPool webDriverPool) {
this.webDriverPool = webDriverPool;
}
@Retryable(maxAttempts = 10, recover = "recover")
public void parseCategoryPage(int finalPageIndex, String url, ArrayList<String> errors) throws InterruptedException {
MDC.put("pageIndex", String.valueOf(finalPageIndex));
String pageUrl = url + "/?page=" + finalPageIndex;
var driver = webDriverPool.borrowDriver();
try {
driver.manage().timeouts().pageLoadTimeout(Duration.of(10, ChronoUnit.SECONDS));
driver.get(pageUrl);
WebDriverWait wait = new WebDriverWait(driver, Duration.of(10, ChronoUnit.SECONDS));
var accessDeniedPage = new AccessDeniedPage(driver, wait); // TODO подумать как не создавать кучу PageObject
var categoryPage = new CategoryPage(driver, wait);
wait.until(d -> checkForWaitingPageLoading(accessDeniedPage, categoryPage));
if (checkAccessDeniedPage(accessDeniedPage)) {
log.info("Доступ ограничен, пробуем решить проблему: {}", pageUrl);
resolveAccessDeniedPage(accessDeniedPage);
log.info("Проблема успешно решена: {}", pageUrl);
}
log.info("Получаем список ссылок на товары на текущей странице: {}", pageUrl);
Set<String> hrefs = Set.of();
try {
hrefs = categoryPage.getProductsLinks();
} catch (Exception e) {
throw new Exception(e);
}
webDriverPool.returnDriver(driver);
log.info("Страница {} Получены ссылки на товары: {}", finalPageIndex, hrefs.size());
hrefs.forEach(href -> {
MDC.put("pageIndex", String.valueOf(finalPageIndex));
try {
processPage(href);
errors.add(href);
log.error(String.valueOf(errors.size()));
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
});
/*hrefs.forEach(href -> productPageExecutor.submit(() -> {
MDC.put("pageIndex", String.valueOf(finalPageIndex));
try {
processPage(href);
errors.add(href);
log.error(String.valueOf(errors.size()));
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}));*/
productPageExecutor.awaitTermination(10, TimeUnit.SECONDS);
} catch (Exception ignored) {
throw new RuntimeException(ignored);
} finally {
webDriverPool.returnDriver(driver); // Завершаем работу драйвера
}
}
private String processPage(String href) throws InterruptedException {
var driver = webDriverPool.borrowDriver();
try {
driver.get(href);
log.info("Страница обработана");
} catch (Throwable ignored) {
} finally {
webDriverPool.returnDriver(driver); // Завершаем работу драйвера
}
return href;
}
private boolean checkForWaitingPageLoading(AccessDeniedPage accessDeniedPage,
CategoryPage categoryPage) {
log.debug("Проверка что страница 'Доступ ограничен'");
try {
if (checkAccessDeniedPage(accessDeniedPage)) {
return true;
}
} catch (Exception e) {
log.debug("Ошибка проверки", e);
}
log.debug("Проверка что страница 'Страница категории'");
if (checkCategoryPage(categoryPage)) {
return true;
}
log.debug("Проверка загрузки страницы неудачна");
return false;
}
private boolean checkAccessDeniedPage(AccessDeniedPage accessDeniedPage) {
return accessDeniedPage.isLoaded();
}
private boolean checkCategoryPage(CategoryPage categoryPage) {
return categoryPage.isLoaded();
}
private void resolveAccessDeniedPage(AccessDeniedPage accessDeniedPage) {
accessDeniedPage.clickReloadButton();
}
@Recover
private void recover(Exception e) {
log.error("Все ретраи провалились");
}
}

View File

@ -0,0 +1,60 @@
package ru.pricepulse.parsingservice.service.marketplace.ozon.parsing;
import java.time.Duration;
import java.time.temporal.ChronoUnit;
import java.util.ArrayList;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicBoolean;
import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.StaleElementReferenceException;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.slf4j.MDC;
import org.springframework.beans.factory.ObjectFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.retry.annotation.Retryable;
import org.springframework.stereotype.Service;
import ru.pricepulse.parsingservice.service.MarketplaceParsingService;
import ru.pricepulse.parsingservice.service.marketplace.ozon.page.AccessDeniedPage;
import ru.pricepulse.parsingservice.service.marketplace.ozon.page.CategoryPage;
@Slf4j
@Service
public class ParsingService implements MarketplaceParsingService {
private final AtomicBoolean stopFlag = new AtomicBoolean(false);
private final ExecutorService categoryExecutor = Executors.newFixedThreadPool(1);
private final CategoryPageParsingService categoryPageParsingService;
public ParsingService(CategoryPageParsingService categoryPageParsingService) {
this.categoryPageParsingService = categoryPageParsingService;
}
public void processCategory(String url) {
var startTime = System.currentTimeMillis();
log.info("Начало обработки категории: {}", url);
int pageIndex = 1;
var errors = new ArrayList<String>();
while (!stopFlag.get()) {
int finalPageIndex = pageIndex;
try {
categoryPageParsingService.parseCategoryPage(finalPageIndex, url, errors);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
++pageIndex;
if (pageIndex > 5) {
stopFlag.set(true);
}
}
log.info("Время выполнения {} ", (System.currentTimeMillis() - startTime) / 1000);
}
}

View File

@ -4,20 +4,19 @@ import lombok.RequiredArgsConstructor;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Service;
import ru.pricepulse.parsingservice.config.properties.OzonConfigProperties;
import ru.pricepulse.parsingservice.service.impl.parsing.OzonParsingService;
import ru.pricepulse.parsingservice.service.marketplace.ozon.parsing.ParsingService;
@Service
@RequiredArgsConstructor
public class OzonProductUpdater {
private final OzonConfigProperties properties;
private final OzonParsingService parsingService;
private final ParsingService ozonParsingService;
@Scheduled(fixedRate = 3600000)
public void updateOzonProducts() {
properties.getCategoriesUrls().forEach(
parsingService::processCategory);
properties.getCategoriesUrls()
.forEach(ozonParsingService::processCategory);
}
}

View File

@ -15,10 +15,12 @@ spring:
liquibase:
change-log: classpath:/db/changelog/master.yml
kafka:
selenium:
marketplace:
ozon:
categories-urls:
- https://www.ozon.ru/category/noutbuki-15692
logging:
pattern:
console: "%d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{36} - %msg %X%n"