Feature/parsing-service: Add Ozon parsing; needs testing on another system
This commit is contained in:
parent
f58b0a4a02
commit
ef2240e8ab
18
.run/ParsingService [local].run.xml
Normal file
18
.run/ParsingService [local].run.xml
Normal file
@ -0,0 +1,18 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="ParsingService [local]" type="SpringBootApplicationConfigurationType" factoryName="Spring Boot">
|
||||
<option name="ACTIVE_PROFILES" value="dev" />
|
||||
<option name="SCHEDULED_DEBUGGER" value="true" />
|
||||
<envs>
|
||||
<env name="JDBC_PASSWORD" value="postgres" />
|
||||
<env name="JDBC_URL" value="localhost:5432/parsed_data" />
|
||||
<env name="JDBC_USERNAME" value="postgres" />
|
||||
<env name="SERVER_PORT" value="8080" />
|
||||
<env name="WEBDRIVER_CHROME_PATH" value="$PROJECT_DIR$/parsing-service/web-driver/chromedriver" />
|
||||
</envs>
|
||||
<module name="parsing-service.main" />
|
||||
<option name="SPRING_BOOT_MAIN_CLASS" value="ru.pricepulse.parsingservice.ParsingServiceApplication" />
|
||||
<method v="2">
|
||||
<option name="Make" enabled="true" />
|
||||
</method>
|
||||
</configuration>
|
||||
</component>
|
@ -1,6 +1,6 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="ParsingService [local]" type="SpringBootApplicationConfigurationType" factoryName="Spring Boot">
|
||||
<option name="ACTIVE_PROFILES" value="dev" />
|
||||
<option name="ACTIVE_PROFILES" value="dev,headless" />
|
||||
<option name="SCHEDULED_DEBUGGER" value="true" />
|
||||
<envs>
|
||||
<env name="JDBC_PASSWORD" value="postgres" />
|
||||
|
@ -25,6 +25,7 @@ repositories {
|
||||
|
||||
ext {
|
||||
jsoupVesion = '1.18.1'
|
||||
seleniumVersion = '4.25.0'
|
||||
}
|
||||
|
||||
dependencies {
|
||||
@ -33,6 +34,9 @@ dependencies {
|
||||
implementation 'org.liquibase:liquibase-core'
|
||||
implementation 'org.springframework.kafka:spring-kafka'
|
||||
implementation "org.jsoup:jsoup:${jsoupVesion}"
|
||||
implementation "org.seleniumhq.selenium:selenium-java:${seleniumVersion}"
|
||||
implementation 'io.github.bonigarcia:webdrivermanager:5.5.0'
|
||||
implementation 'org.apache.commons:commons-pool2:2.12.0'
|
||||
|
||||
compileOnly 'org.projectlombok:lombok'
|
||||
|
||||
|
@ -0,0 +1,8 @@
|
||||
package ru.pricepulse.parsingservice.config;
|
||||
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
import org.springframework.retry.annotation.EnableRetry;
|
||||
|
||||
@Configuration
|
||||
@EnableRetry
|
||||
public class RetryConfig {}
|
@ -0,0 +1,10 @@
|
||||
package ru.pricepulse.parsingservice.config;
|
||||
|
||||
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
import ru.pricepulse.parsingservice.config.properties.SeleniumConfigProperties;
|
||||
|
||||
@Configuration
|
||||
@EnableConfigurationProperties(SeleniumConfigProperties.class)
|
||||
public class SeleniumConfig {
|
||||
}
|
@ -0,0 +1,59 @@
|
||||
package ru.pricepulse.parsingservice.config;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import io.github.bonigarcia.wdm.WebDriverManager;
|
||||
import org.openqa.selenium.WebDriver;
|
||||
import org.openqa.selenium.chrome.ChromeDriver;
|
||||
import org.openqa.selenium.chrome.ChromeOptions;
|
||||
import org.springframework.beans.factory.config.ConfigurableBeanFactory;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
import org.springframework.context.annotation.Profile;
|
||||
import org.springframework.context.annotation.Scope;
|
||||
|
||||
@Configuration
|
||||
public class WebDriverConfig {
|
||||
|
||||
@Bean
|
||||
@Profile("visible")
|
||||
@Scope(ConfigurableBeanFactory.SCOPE_PROTOTYPE)
|
||||
public WebDriver webDriverVisible() {
|
||||
Map<String, Object> prefs = new HashMap<>();
|
||||
prefs.put("profile.managed_default_content_settings.images", 2);
|
||||
prefs.put("profile.managed_default_content_settings.geolocation", 2);
|
||||
|
||||
var options = new ChromeOptions();
|
||||
options.setExperimentalOption("prefs", prefs);
|
||||
WebDriverManager.chromedriver().setup();
|
||||
return new ChromeDriver(options);
|
||||
}
|
||||
|
||||
@Bean
|
||||
@Profile("headless")
|
||||
@Scope(ConfigurableBeanFactory.SCOPE_PROTOTYPE)
|
||||
public WebDriver webDriverHeadless(ChromeOptions options) {
|
||||
WebDriverManager.chromedriver().setup();
|
||||
return new ChromeDriver(options);
|
||||
}
|
||||
|
||||
@Bean
|
||||
@Profile("headless")
|
||||
public ChromeOptions chromeOptions() {
|
||||
Map<String, Object> prefs = new HashMap<>();
|
||||
prefs.put("profile.managed_default_content_settings.images", 2);
|
||||
prefs.put("profile.managed_default_content_settings.stylesheets", 2);
|
||||
|
||||
var options = new ChromeOptions();
|
||||
options.setExperimentalOption("prefs", prefs);
|
||||
options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36");
|
||||
//options.addArguments("--window-size=1920,2000");
|
||||
options.addArguments("--headless");
|
||||
options.addArguments("--disable-gpu");
|
||||
options.addArguments("--no-sandbox");
|
||||
options.addArguments("--disable-dev-shm-usage");
|
||||
return options;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,7 @@
|
||||
package ru.pricepulse.parsingservice.config.properties;
|
||||
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
|
||||
@ConfigurationProperties("selenium")
|
||||
public class SeleniumConfigProperties {
|
||||
}
|
@ -22,6 +22,7 @@ import ru.pricepulse.parsingservice.persistence.enums.MarketplaceEnum;
|
||||
@Entity
|
||||
@Table(name = "product")
|
||||
public class ProductEntity {
|
||||
|
||||
@Id
|
||||
@GeneratedValue(strategy = GenerationType.IDENTITY)
|
||||
@Column(name = "id", nullable = false)
|
||||
@ -69,4 +70,5 @@ public class ProductEntity {
|
||||
protected void onCreate() {
|
||||
createdAt = LocalDateTime.now();
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,67 @@
|
||||
package ru.pricepulse.parsingservice.pool;
|
||||
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.ConcurrentLinkedQueue;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.openqa.selenium.WebDriver;
|
||||
import org.openqa.selenium.chrome.ChromeDriver;
|
||||
import org.openqa.selenium.chrome.ChromeOptions;
|
||||
import org.springframework.beans.factory.ObjectFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Slf4j
|
||||
@Component
|
||||
public class WebDriverPool {
|
||||
|
||||
private final Queue<WebDriver> availableDrivers = new ConcurrentLinkedQueue<>(); // Список доступных драйверов
|
||||
private final Queue<WebDriver> busyDrivers = new ConcurrentLinkedQueue<>(); // Список занятых драйверов
|
||||
private final ObjectFactory<WebDriver> webDriverFactory;
|
||||
|
||||
public WebDriverPool(ObjectFactory<WebDriver> webDriverFactory) {
|
||||
this.webDriverFactory = webDriverFactory;
|
||||
int poolSize = 12;
|
||||
|
||||
// Инициализация пула с указанным количеством драйверов
|
||||
for (int i = 0; i < poolSize; i++) {
|
||||
availableDrivers.add(createNewDriver());
|
||||
}
|
||||
}
|
||||
|
||||
// Метод для создания нового экземпляра WebDriver
|
||||
private WebDriver createNewDriver() {
|
||||
return webDriverFactory.getObject();
|
||||
}
|
||||
|
||||
// Метод для заимствования драйвера
|
||||
public WebDriver borrowDriver() {
|
||||
WebDriver driver = availableDrivers.poll(); // Получаем драйвер из доступных
|
||||
if (driver != null) {
|
||||
busyDrivers.add(driver); // Добавляем драйвер в занятые
|
||||
}
|
||||
log.info("Занимаем драйвер {}", driver);
|
||||
return driver; // Возвращаем драйвер
|
||||
}
|
||||
|
||||
// Метод для возврата драйвера в пул
|
||||
public void returnDriver(WebDriver driver) {
|
||||
busyDrivers.remove(driver); // Убираем драйвер из занятых
|
||||
availableDrivers.add(driver); // Возвращаем драйвер в доступные
|
||||
log.info("Вернули драйвер {}", driver);
|
||||
}
|
||||
|
||||
// Метод для закрытия всех драйверов в пуле
|
||||
public void shutdownPool() {
|
||||
// Закрываем доступные драйверы
|
||||
for (WebDriver driver : availableDrivers) {
|
||||
driver.quit();
|
||||
}
|
||||
// Закрываем занятые драйверы
|
||||
for (WebDriver driver : busyDrivers) {
|
||||
driver.quit();
|
||||
}
|
||||
availableDrivers.clear();
|
||||
busyDrivers.clear();
|
||||
}
|
||||
|
||||
}
|
@ -1,67 +0,0 @@
|
||||
package ru.pricepulse.parsingservice.service.impl.parsing;
|
||||
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.retry.annotation.Recover;
|
||||
import org.springframework.retry.annotation.Retryable;
|
||||
import org.springframework.stereotype.Service;
|
||||
import ru.pricepulse.parsingservice.service.DataParser;
|
||||
import ru.pricepulse.parsingservice.service.MarketplaceParsingService;
|
||||
import ru.pricepulse.parsingservice.service.request.PageFetcher;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
public class OzonParsingService implements MarketplaceParsingService {
|
||||
|
||||
private final PageFetcher pageFetcher;
|
||||
private final DataParser dataParser;
|
||||
private final ExecutorService executorService;
|
||||
|
||||
public OzonParsingService(PageFetcher pageFetcher, DataParser dataParser) {
|
||||
this.pageFetcher = pageFetcher;
|
||||
this.dataParser = dataParser;
|
||||
this.executorService = Executors.newFixedThreadPool(1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void processCategory(String categoryUrl) {
|
||||
int pageNumber = 1;
|
||||
AtomicBoolean hasMoreData = new AtomicBoolean(true);
|
||||
|
||||
while (hasMoreData.get()) {
|
||||
int finalPageNumber = pageNumber;
|
||||
executorService.submit(() -> processTask(categoryUrl, finalPageNumber, hasMoreData));
|
||||
pageNumber++;
|
||||
}
|
||||
}
|
||||
|
||||
@Retryable
|
||||
private void processTask(String categoryUrl, int pageNumber, AtomicBoolean hasMoreData) {
|
||||
String pageUrl = categoryUrl + "?page=" + pageNumber;
|
||||
String pageContent;
|
||||
try {
|
||||
log.info("Получение страницы {}", pageUrl);
|
||||
pageContent = pageFetcher.fetchPage(pageUrl);
|
||||
} catch (Exception e) {
|
||||
log.error("Ошибка получения страницы - {} \n {}", pageUrl, e.getMessage(), e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
if (!dataParser.pageHasData(pageContent)) {
|
||||
log.warn("Данные не найдены - {}", pageUrl);
|
||||
hasMoreData.set(false);
|
||||
return;
|
||||
}
|
||||
|
||||
dataParser.parseAndQueueData(pageContent);
|
||||
}
|
||||
|
||||
@Recover
|
||||
private void recover(Exception e, String categoryUrl, int pageNumber, AtomicBoolean hasMoreData) {
|
||||
log.error(e.getMessage(), e);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,7 @@
|
||||
package ru.pricepulse.parsingservice.service.marketplace.ozon;
|
||||
|
||||
/**
 * Page object contract for a marketplace page that can tell whether it is the page
 * currently shown.
 */
public interface MarketplacePage {

    /**
     * Reports whether this page is the one currently loaded.
     *
     * @return {@code true} when the page is recognised as loaded, {@code false} otherwise
     */
    boolean isLoaded();

}
|
@ -0,0 +1,67 @@
|
||||
package ru.pricepulse.parsingservice.service.marketplace.ozon.page;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.openqa.selenium.By;
|
||||
import org.openqa.selenium.WebDriver;
|
||||
import org.openqa.selenium.WebElement;
|
||||
import org.openqa.selenium.support.ui.ExpectedConditions;
|
||||
import org.openqa.selenium.support.ui.WebDriverWait;
|
||||
import ru.pricepulse.parsingservice.service.marketplace.ozon.MarketplacePage;
|
||||
|
||||
@Slf4j
|
||||
public class AccessDeniedPage implements MarketplacePage {
|
||||
|
||||
private static final String RELOAD_BUTTON_ID = "reload-button";
|
||||
private static final String RELOAD_BUTTON_XPATH = "//button[contains(text(),'Обновить')]";
|
||||
private static final String WARNING_IMAGE_CSS = "img[alt='warning']";
|
||||
private static final String ACCESS_DENIED_TEXT_XPATH = "//h1[text()='Доступ ограничен']";
|
||||
|
||||
private final By reloadButtonById = By.id(RELOAD_BUTTON_ID);
|
||||
private final By reloadButtonByXpath = By.xpath(RELOAD_BUTTON_XPATH);
|
||||
private final By warningImage = By.cssSelector(WARNING_IMAGE_CSS);
|
||||
private final By accessDeniedText = By.xpath(ACCESS_DENIED_TEXT_XPATH);
|
||||
|
||||
private WebDriver driver;
|
||||
private WebDriverWait wait;
|
||||
|
||||
public AccessDeniedPage(WebDriver driver,
|
||||
WebDriverWait wait) {
|
||||
this.driver = driver;
|
||||
this.wait = wait;
|
||||
}
|
||||
|
||||
public void clickReloadButton() {
|
||||
try {
|
||||
log.debug("Пытаемся найти кнопку по id и нажать");
|
||||
driver.findElement(reloadButtonById).click();
|
||||
return;
|
||||
} catch (Exception e) {
|
||||
log.debug("Кнопка обновления страницы не найдена по id");
|
||||
}
|
||||
try {
|
||||
log.debug("Пытаемся найти кнопку по xpath и нажать");
|
||||
driver.findElement(reloadButtonByXpath).click();
|
||||
log.debug("Успешно нашли кнопку по xpath");
|
||||
return;
|
||||
} catch (Exception e) {
|
||||
log.debug("Кнопка обновления страницы не найдена по xpath");
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isWarningImage() {
|
||||
return driver.findElement(warningImage) != null;
|
||||
}
|
||||
|
||||
private boolean isAccessDeniedText() {
|
||||
return driver.findElement(accessDeniedText) != null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isLoaded() {
|
||||
try {
|
||||
return isWarningImage() && isAccessDeniedText();
|
||||
} catch (Exception e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,79 @@
|
||||
package ru.pricepulse.parsingservice.service.marketplace.ozon.page;
|
||||
|
||||
import static org.openqa.selenium.support.ui.ExpectedConditions.visibilityOfAllElements;
|
||||
import static org.openqa.selenium.support.ui.ExpectedConditions.visibilityOfElementLocated;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.openqa.selenium.By;
|
||||
import org.openqa.selenium.StaleElementReferenceException;
|
||||
import org.openqa.selenium.WebDriver;
|
||||
import org.openqa.selenium.WebElement;
|
||||
import org.openqa.selenium.support.ui.ExpectedConditions;
|
||||
import org.openqa.selenium.support.ui.WebDriverWait;
|
||||
import org.springframework.retry.annotation.Recover;
|
||||
import org.springframework.retry.annotation.Retryable;
|
||||
import ru.pricepulse.parsingservice.service.marketplace.ozon.MarketplacePage;
|
||||
|
||||
@Slf4j
|
||||
public class CategoryPage implements MarketplacePage {
|
||||
|
||||
private static final int PAGE_SIZE = 12;
|
||||
private static final String SEARCH_RESULTS = "div[data-widget='searchResultsV2']";
|
||||
|
||||
private final By searchResults = By.cssSelector(SEARCH_RESULTS);
|
||||
|
||||
private WebDriver driver;
|
||||
|
||||
private WebDriverWait wait;
|
||||
|
||||
public CategoryPage(WebDriver driver, WebDriverWait wait) {
|
||||
this.driver = driver;
|
||||
this.wait = wait;
|
||||
}
|
||||
|
||||
public Set<String> getProductsLinks() {
|
||||
wait.until(visibilityOfElementLocated(searchResults));
|
||||
var searchResultsElement = driver.findElement(searchResults);
|
||||
wait.until(driver -> visibilityOfElementLocated(By.cssSelector(":scope > div")));
|
||||
var outerDiv = searchResultsElement.findElement(By.cssSelector(":scope > div")); // Внешний блок со списком товаров
|
||||
wait.until(driver -> visibilityOfAllElements(outerDiv.findElements(By.cssSelector(":scope > div"))));
|
||||
var innerDivs = outerDiv.findElements(By.cssSelector(":scope > div")); // Блок карточки товара
|
||||
return searchProductsLinks(innerDivs, driver);
|
||||
}
|
||||
|
||||
private Set<String> searchProductsLinks(List<WebElement> innerDivs, WebDriver driver) {
|
||||
return innerDivs.stream()
|
||||
.map(div -> {
|
||||
waitVisibility(div);
|
||||
List<WebElement> linkTags = null;
|
||||
try {
|
||||
linkTags = div.findElements(By.tagName("a"));
|
||||
} catch (Exception ignored) {}
|
||||
return linkTags != null && !linkTags.isEmpty()
|
||||
? linkTags.getFirst().getAttribute("href")
|
||||
: null;
|
||||
})
|
||||
.filter(href -> href != null && !href.isEmpty())
|
||||
.collect(Collectors.toSet());
|
||||
}
|
||||
|
||||
private void waitVisibility(WebElement outerElement) {
|
||||
wait.until(driver -> !outerElement.findElements(By.tagName("a")).isEmpty());
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isLoaded() {
|
||||
try {
|
||||
return driver.findElement(searchResults) != null;
|
||||
} catch (Exception e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,141 @@
|
||||
package ru.pricepulse.parsingservice.service.marketplace.ozon.parsing;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.time.temporal.ChronoUnit;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.openqa.selenium.JavascriptExecutor;
|
||||
import org.openqa.selenium.StaleElementReferenceException;
|
||||
import org.openqa.selenium.WebDriver;
|
||||
import org.openqa.selenium.support.ui.WebDriverWait;
|
||||
import org.slf4j.MDC;
|
||||
import org.springframework.beans.factory.ObjectFactory;
|
||||
import org.springframework.retry.annotation.Recover;
|
||||
import org.springframework.retry.annotation.Retryable;
|
||||
import org.springframework.stereotype.Service;
|
||||
import ru.pricepulse.parsingservice.pool.WebDriverPool;
|
||||
import ru.pricepulse.parsingservice.service.marketplace.ozon.page.AccessDeniedPage;
|
||||
import ru.pricepulse.parsingservice.service.marketplace.ozon.page.CategoryPage;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
public class CategoryPageParsingService {
|
||||
|
||||
private final ExecutorService productPageExecutor = Executors.newFixedThreadPool(3);
|
||||
|
||||
private final WebDriverPool webDriverPool;
|
||||
|
||||
public CategoryPageParsingService(WebDriverPool webDriverPool) {
|
||||
this.webDriverPool = webDriverPool;
|
||||
}
|
||||
|
||||
@Retryable(maxAttempts = 10, recover = "recover")
|
||||
public void parseCategoryPage(int finalPageIndex, String url, ArrayList<String> errors) throws InterruptedException {
|
||||
MDC.put("pageIndex", String.valueOf(finalPageIndex));
|
||||
String pageUrl = url + "/?page=" + finalPageIndex;
|
||||
var driver = webDriverPool.borrowDriver();
|
||||
|
||||
try {
|
||||
driver.manage().timeouts().pageLoadTimeout(Duration.of(10, ChronoUnit.SECONDS));
|
||||
driver.get(pageUrl);
|
||||
WebDriverWait wait = new WebDriverWait(driver, Duration.of(10, ChronoUnit.SECONDS));
|
||||
var accessDeniedPage = new AccessDeniedPage(driver, wait); // TODO подумать как не создавать кучу PageObject
|
||||
var categoryPage = new CategoryPage(driver, wait);
|
||||
wait.until(d -> checkForWaitingPageLoading(accessDeniedPage, categoryPage));
|
||||
if (checkAccessDeniedPage(accessDeniedPage)) {
|
||||
log.info("Доступ ограничен, пробуем решить проблему: {}", pageUrl);
|
||||
resolveAccessDeniedPage(accessDeniedPage);
|
||||
log.info("Проблема успешно решена: {}", pageUrl);
|
||||
}
|
||||
log.info("Получаем список ссылок на товары на текущей странице: {}", pageUrl);
|
||||
Set<String> hrefs = Set.of();
|
||||
try {
|
||||
hrefs = categoryPage.getProductsLinks();
|
||||
} catch (Exception e) {
|
||||
throw new Exception(e);
|
||||
}
|
||||
webDriverPool.returnDriver(driver);
|
||||
log.info("Страница {} Получены ссылки на товары: {}", finalPageIndex, hrefs.size());
|
||||
hrefs.forEach(href -> {
|
||||
MDC.put("pageIndex", String.valueOf(finalPageIndex));
|
||||
try {
|
||||
processPage(href);
|
||||
errors.add(href);
|
||||
log.error(String.valueOf(errors.size()));
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
/*hrefs.forEach(href -> productPageExecutor.submit(() -> {
|
||||
MDC.put("pageIndex", String.valueOf(finalPageIndex));
|
||||
try {
|
||||
processPage(href);
|
||||
errors.add(href);
|
||||
log.error(String.valueOf(errors.size()));
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}));*/
|
||||
productPageExecutor.awaitTermination(10, TimeUnit.SECONDS);
|
||||
} catch (Exception ignored) {
|
||||
throw new RuntimeException(ignored);
|
||||
} finally {
|
||||
webDriverPool.returnDriver(driver); // Завершаем работу драйвера
|
||||
}
|
||||
}
|
||||
|
||||
private String processPage(String href) throws InterruptedException {
|
||||
var driver = webDriverPool.borrowDriver();
|
||||
try {
|
||||
driver.get(href);
|
||||
log.info("Страница обработана");
|
||||
} catch (Throwable ignored) {
|
||||
|
||||
} finally {
|
||||
webDriverPool.returnDriver(driver); // Завершаем работу драйвера
|
||||
}
|
||||
return href;
|
||||
}
|
||||
|
||||
private boolean checkForWaitingPageLoading(AccessDeniedPage accessDeniedPage,
|
||||
CategoryPage categoryPage) {
|
||||
log.debug("Проверка что страница 'Доступ ограничен'");
|
||||
try {
|
||||
if (checkAccessDeniedPage(accessDeniedPage)) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.debug("Ошибка проверки", e);
|
||||
}
|
||||
log.debug("Проверка что страница 'Страница категории'");
|
||||
if (checkCategoryPage(categoryPage)) {
|
||||
return true;
|
||||
}
|
||||
log.debug("Проверка загрузки страницы неудачна");
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean checkAccessDeniedPage(AccessDeniedPage accessDeniedPage) {
|
||||
return accessDeniedPage.isLoaded();
|
||||
}
|
||||
|
||||
private boolean checkCategoryPage(CategoryPage categoryPage) {
|
||||
return categoryPage.isLoaded();
|
||||
}
|
||||
|
||||
private void resolveAccessDeniedPage(AccessDeniedPage accessDeniedPage) {
|
||||
accessDeniedPage.clickReloadButton();
|
||||
}
|
||||
|
||||
@Recover
|
||||
private void recover(Exception e) {
|
||||
log.error("Все ретраи провалились");
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,60 @@
|
||||
package ru.pricepulse.parsingservice.service.marketplace.ozon.parsing;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.time.temporal.ChronoUnit;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.openqa.selenium.JavascriptExecutor;
|
||||
import org.openqa.selenium.StaleElementReferenceException;
|
||||
import org.openqa.selenium.WebDriver;
|
||||
import org.openqa.selenium.support.ui.WebDriverWait;
|
||||
import org.slf4j.MDC;
|
||||
import org.springframework.beans.factory.ObjectFactory;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.retry.annotation.Retryable;
|
||||
import org.springframework.stereotype.Service;
|
||||
import ru.pricepulse.parsingservice.service.MarketplaceParsingService;
|
||||
import ru.pricepulse.parsingservice.service.marketplace.ozon.page.AccessDeniedPage;
|
||||
import ru.pricepulse.parsingservice.service.marketplace.ozon.page.CategoryPage;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
public class ParsingService implements MarketplaceParsingService {
|
||||
|
||||
private final AtomicBoolean stopFlag = new AtomicBoolean(false);
|
||||
|
||||
private final ExecutorService categoryExecutor = Executors.newFixedThreadPool(1);
|
||||
|
||||
private final CategoryPageParsingService categoryPageParsingService;
|
||||
|
||||
public ParsingService(CategoryPageParsingService categoryPageParsingService) {
|
||||
this.categoryPageParsingService = categoryPageParsingService;
|
||||
}
|
||||
|
||||
public void processCategory(String url) {
|
||||
var startTime = System.currentTimeMillis();
|
||||
log.info("Начало обработки категории: {}", url);
|
||||
int pageIndex = 1;
|
||||
var errors = new ArrayList<String>();
|
||||
|
||||
while (!stopFlag.get()) {
|
||||
int finalPageIndex = pageIndex;
|
||||
try {
|
||||
categoryPageParsingService.parseCategoryPage(finalPageIndex, url, errors);
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
++pageIndex;
|
||||
if (pageIndex > 5) {
|
||||
stopFlag.set(true);
|
||||
}
|
||||
}
|
||||
log.info("Время выполнения {} ", (System.currentTimeMillis() - startTime) / 1000);
|
||||
}
|
||||
|
||||
}
|
@ -4,20 +4,19 @@ import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.scheduling.annotation.Scheduled;
|
||||
import org.springframework.stereotype.Service;
|
||||
import ru.pricepulse.parsingservice.config.properties.OzonConfigProperties;
|
||||
import ru.pricepulse.parsingservice.service.impl.parsing.OzonParsingService;
|
||||
import ru.pricepulse.parsingservice.service.marketplace.ozon.parsing.ParsingService;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class OzonProductUpdater {
|
||||
|
||||
private final OzonConfigProperties properties;
|
||||
|
||||
private final OzonParsingService parsingService;
|
||||
private final ParsingService ozonParsingService;
|
||||
|
||||
@Scheduled(fixedRate = 3600000)
|
||||
public void updateOzonProducts() {
|
||||
properties.getCategoriesUrls().forEach(
|
||||
parsingService::processCategory);
|
||||
properties.getCategoriesUrls()
|
||||
.forEach(ozonParsingService::processCategory);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -15,10 +15,12 @@ spring:
|
||||
liquibase:
|
||||
change-log: classpath:/db/changelog/master.yml
|
||||
kafka:
|
||||
|
||||
selenium:
|
||||
|
||||
marketplace:
|
||||
ozon:
|
||||
categories-urls:
|
||||
- https://www.ozon.ru/category/noutbuki-15692
|
||||
|
||||
logging:
|
||||
pattern:
|
||||
console: "%d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{36} - %msg %X%n"
|
Loading…
Reference in New Issue
Block a user