Feature/parsing-service: Add Ozon parsing; needs testing on another system
This commit is contained in:
parent
f58b0a4a02
commit
ef2240e8ab
18
.run/ParsingService [local].run.xml
Normal file
18
.run/ParsingService [local].run.xml
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
<component name="ProjectRunConfigurationManager">
|
||||||
|
<configuration default="false" name="ParsingService [local]" type="SpringBootApplicationConfigurationType" factoryName="Spring Boot">
|
||||||
|
<option name="ACTIVE_PROFILES" value="dev" />
|
||||||
|
<option name="SCHEDULED_DEBUGGER" value="true" />
|
||||||
|
<envs>
|
||||||
|
<env name="JDBC_PASSWORD" value="postgres" />
|
||||||
|
<env name="JDBC_URL" value="localhost:5432/parsed_data" />
|
||||||
|
<env name="JDBC_USERNAME" value="postgres" />
|
||||||
|
<env name="SERVER_PORT" value="8080" />
|
||||||
|
<env name="WEBDRIVER_CHROME_PATH" value="$PROJECT_DIR$/parsing-service/web-driver/chromedriver" />
|
||||||
|
</envs>
|
||||||
|
<module name="parsing-service.main" />
|
||||||
|
<option name="SPRING_BOOT_MAIN_CLASS" value="ru.pricepulse.parsingservice.ParsingServiceApplication" />
|
||||||
|
<method v="2">
|
||||||
|
<option name="Make" enabled="true" />
|
||||||
|
</method>
|
||||||
|
</configuration>
|
||||||
|
</component>
|
@ -1,6 +1,6 @@
|
|||||||
<component name="ProjectRunConfigurationManager">
|
<component name="ProjectRunConfigurationManager">
|
||||||
<configuration default="false" name="ParsingService [local]" type="SpringBootApplicationConfigurationType" factoryName="Spring Boot">
|
<configuration default="false" name="ParsingService [local]" type="SpringBootApplicationConfigurationType" factoryName="Spring Boot">
|
||||||
<option name="ACTIVE_PROFILES" value="dev" />
|
<option name="ACTIVE_PROFILES" value="dev,headless" />
|
||||||
<option name="SCHEDULED_DEBUGGER" value="true" />
|
<option name="SCHEDULED_DEBUGGER" value="true" />
|
||||||
<envs>
|
<envs>
|
||||||
<env name="JDBC_PASSWORD" value="postgres" />
|
<env name="JDBC_PASSWORD" value="postgres" />
|
||||||
|
@ -25,6 +25,7 @@ repositories {
|
|||||||
|
|
||||||
ext {
|
ext {
|
||||||
jsoupVesion = '1.18.1'
|
jsoupVesion = '1.18.1'
|
||||||
|
seleniumVersion = '4.25.0'
|
||||||
}
|
}
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
@ -33,6 +34,9 @@ dependencies {
|
|||||||
implementation 'org.liquibase:liquibase-core'
|
implementation 'org.liquibase:liquibase-core'
|
||||||
implementation 'org.springframework.kafka:spring-kafka'
|
implementation 'org.springframework.kafka:spring-kafka'
|
||||||
implementation "org.jsoup:jsoup:${jsoupVesion}"
|
implementation "org.jsoup:jsoup:${jsoupVesion}"
|
||||||
|
implementation "org.seleniumhq.selenium:selenium-java:${seleniumVersion}"
|
||||||
|
implementation 'io.github.bonigarcia:webdrivermanager:5.5.0'
|
||||||
|
implementation 'org.apache.commons:commons-pool2:2.12.0'
|
||||||
|
|
||||||
compileOnly 'org.projectlombok:lombok'
|
compileOnly 'org.projectlombok:lombok'
|
||||||
|
|
||||||
|
@ -0,0 +1,8 @@
|
|||||||
|
package ru.pricepulse.parsingservice.config;
|
||||||
|
|
||||||
|
import org.springframework.context.annotation.Configuration;
|
||||||
|
import org.springframework.retry.annotation.EnableRetry;
|
||||||
|
|
||||||
|
@Configuration
|
||||||
|
@EnableRetry
|
||||||
|
public class RetryConfig {}
|
@ -0,0 +1,10 @@
|
|||||||
|
package ru.pricepulse.parsingservice.config;
|
||||||
|
|
||||||
|
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||||
|
import org.springframework.context.annotation.Configuration;
|
||||||
|
import ru.pricepulse.parsingservice.config.properties.SeleniumConfigProperties;
|
||||||
|
|
||||||
|
@Configuration
|
||||||
|
@EnableConfigurationProperties(SeleniumConfigProperties.class)
|
||||||
|
public class SeleniumConfig {
|
||||||
|
}
|
@ -0,0 +1,59 @@
|
|||||||
|
package ru.pricepulse.parsingservice.config;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import io.github.bonigarcia.wdm.WebDriverManager;
|
||||||
|
import org.openqa.selenium.WebDriver;
|
||||||
|
import org.openqa.selenium.chrome.ChromeDriver;
|
||||||
|
import org.openqa.selenium.chrome.ChromeOptions;
|
||||||
|
import org.springframework.beans.factory.config.ConfigurableBeanFactory;
|
||||||
|
import org.springframework.context.annotation.Bean;
|
||||||
|
import org.springframework.context.annotation.Configuration;
|
||||||
|
import org.springframework.context.annotation.Profile;
|
||||||
|
import org.springframework.context.annotation.Scope;
|
||||||
|
|
||||||
|
@Configuration
|
||||||
|
public class WebDriverConfig {
|
||||||
|
|
||||||
|
@Bean
|
||||||
|
@Profile("visible")
|
||||||
|
@Scope(ConfigurableBeanFactory.SCOPE_PROTOTYPE)
|
||||||
|
public WebDriver webDriverVisible() {
|
||||||
|
Map<String, Object> prefs = new HashMap<>();
|
||||||
|
prefs.put("profile.managed_default_content_settings.images", 2);
|
||||||
|
prefs.put("profile.managed_default_content_settings.geolocation", 2);
|
||||||
|
|
||||||
|
var options = new ChromeOptions();
|
||||||
|
options.setExperimentalOption("prefs", prefs);
|
||||||
|
WebDriverManager.chromedriver().setup();
|
||||||
|
return new ChromeDriver(options);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Bean
|
||||||
|
@Profile("headless")
|
||||||
|
@Scope(ConfigurableBeanFactory.SCOPE_PROTOTYPE)
|
||||||
|
public WebDriver webDriverHeadless(ChromeOptions options) {
|
||||||
|
WebDriverManager.chromedriver().setup();
|
||||||
|
return new ChromeDriver(options);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Bean
|
||||||
|
@Profile("headless")
|
||||||
|
public ChromeOptions chromeOptions() {
|
||||||
|
Map<String, Object> prefs = new HashMap<>();
|
||||||
|
prefs.put("profile.managed_default_content_settings.images", 2);
|
||||||
|
prefs.put("profile.managed_default_content_settings.stylesheets", 2);
|
||||||
|
|
||||||
|
var options = new ChromeOptions();
|
||||||
|
options.setExperimentalOption("prefs", prefs);
|
||||||
|
options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36");
|
||||||
|
//options.addArguments("--window-size=1920,2000");
|
||||||
|
options.addArguments("--headless");
|
||||||
|
options.addArguments("--disable-gpu");
|
||||||
|
options.addArguments("--no-sandbox");
|
||||||
|
options.addArguments("--disable-dev-shm-usage");
|
||||||
|
return options;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,7 @@
|
|||||||
|
package ru.pricepulse.parsingservice.config.properties;
|
||||||
|
|
||||||
|
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||||
|
|
||||||
|
@ConfigurationProperties("selenium")
|
||||||
|
public class SeleniumConfigProperties {
|
||||||
|
}
|
@ -22,6 +22,7 @@ import ru.pricepulse.parsingservice.persistence.enums.MarketplaceEnum;
|
|||||||
@Entity
|
@Entity
|
||||||
@Table(name = "product")
|
@Table(name = "product")
|
||||||
public class ProductEntity {
|
public class ProductEntity {
|
||||||
|
|
||||||
@Id
|
@Id
|
||||||
@GeneratedValue(strategy = GenerationType.IDENTITY)
|
@GeneratedValue(strategy = GenerationType.IDENTITY)
|
||||||
@Column(name = "id", nullable = false)
|
@Column(name = "id", nullable = false)
|
||||||
@ -69,4 +70,5 @@ public class ProductEntity {
|
|||||||
protected void onCreate() {
|
protected void onCreate() {
|
||||||
createdAt = LocalDateTime.now();
|
createdAt = LocalDateTime.now();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
@ -0,0 +1,67 @@
|
|||||||
|
package ru.pricepulse.parsingservice.pool;
|
||||||
|
|
||||||
|
import java.util.Queue;
|
||||||
|
import java.util.concurrent.ConcurrentLinkedQueue;
|
||||||
|
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.openqa.selenium.WebDriver;
|
||||||
|
import org.openqa.selenium.chrome.ChromeDriver;
|
||||||
|
import org.openqa.selenium.chrome.ChromeOptions;
|
||||||
|
import org.springframework.beans.factory.ObjectFactory;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
|
@Slf4j
|
||||||
|
@Component
|
||||||
|
public class WebDriverPool {
|
||||||
|
|
||||||
|
private final Queue<WebDriver> availableDrivers = new ConcurrentLinkedQueue<>(); // Список доступных драйверов
|
||||||
|
private final Queue<WebDriver> busyDrivers = new ConcurrentLinkedQueue<>(); // Список занятых драйверов
|
||||||
|
private final ObjectFactory<WebDriver> webDriverFactory;
|
||||||
|
|
||||||
|
public WebDriverPool(ObjectFactory<WebDriver> webDriverFactory) {
|
||||||
|
this.webDriverFactory = webDriverFactory;
|
||||||
|
int poolSize = 12;
|
||||||
|
|
||||||
|
// Инициализация пула с указанным количеством драйверов
|
||||||
|
for (int i = 0; i < poolSize; i++) {
|
||||||
|
availableDrivers.add(createNewDriver());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Метод для создания нового экземпляра WebDriver
|
||||||
|
private WebDriver createNewDriver() {
|
||||||
|
return webDriverFactory.getObject();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Метод для заимствования драйвера
|
||||||
|
public WebDriver borrowDriver() {
|
||||||
|
WebDriver driver = availableDrivers.poll(); // Получаем драйвер из доступных
|
||||||
|
if (driver != null) {
|
||||||
|
busyDrivers.add(driver); // Добавляем драйвер в занятые
|
||||||
|
}
|
||||||
|
log.info("Занимаем драйвер {}", driver);
|
||||||
|
return driver; // Возвращаем драйвер
|
||||||
|
}
|
||||||
|
|
||||||
|
// Метод для возврата драйвера в пул
|
||||||
|
public void returnDriver(WebDriver driver) {
|
||||||
|
busyDrivers.remove(driver); // Убираем драйвер из занятых
|
||||||
|
availableDrivers.add(driver); // Возвращаем драйвер в доступные
|
||||||
|
log.info("Вернули драйвер {}", driver);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Метод для закрытия всех драйверов в пуле
|
||||||
|
public void shutdownPool() {
|
||||||
|
// Закрываем доступные драйверы
|
||||||
|
for (WebDriver driver : availableDrivers) {
|
||||||
|
driver.quit();
|
||||||
|
}
|
||||||
|
// Закрываем занятые драйверы
|
||||||
|
for (WebDriver driver : busyDrivers) {
|
||||||
|
driver.quit();
|
||||||
|
}
|
||||||
|
availableDrivers.clear();
|
||||||
|
busyDrivers.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -1,67 +0,0 @@
|
|||||||
package ru.pricepulse.parsingservice.service.impl.parsing;
|
|
||||||
|
|
||||||
import java.util.concurrent.ExecutorService;
|
|
||||||
import java.util.concurrent.Executors;
|
|
||||||
import java.util.concurrent.atomic.AtomicBoolean;
|
|
||||||
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
import org.springframework.retry.annotation.Recover;
|
|
||||||
import org.springframework.retry.annotation.Retryable;
|
|
||||||
import org.springframework.stereotype.Service;
|
|
||||||
import ru.pricepulse.parsingservice.service.DataParser;
|
|
||||||
import ru.pricepulse.parsingservice.service.MarketplaceParsingService;
|
|
||||||
import ru.pricepulse.parsingservice.service.request.PageFetcher;
|
|
||||||
|
|
||||||
@Slf4j
|
|
||||||
@Service
|
|
||||||
public class OzonParsingService implements MarketplaceParsingService {
|
|
||||||
|
|
||||||
private final PageFetcher pageFetcher;
|
|
||||||
private final DataParser dataParser;
|
|
||||||
private final ExecutorService executorService;
|
|
||||||
|
|
||||||
public OzonParsingService(PageFetcher pageFetcher, DataParser dataParser) {
|
|
||||||
this.pageFetcher = pageFetcher;
|
|
||||||
this.dataParser = dataParser;
|
|
||||||
this.executorService = Executors.newFixedThreadPool(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void processCategory(String categoryUrl) {
|
|
||||||
int pageNumber = 1;
|
|
||||||
AtomicBoolean hasMoreData = new AtomicBoolean(true);
|
|
||||||
|
|
||||||
while (hasMoreData.get()) {
|
|
||||||
int finalPageNumber = pageNumber;
|
|
||||||
executorService.submit(() -> processTask(categoryUrl, finalPageNumber, hasMoreData));
|
|
||||||
pageNumber++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Retryable
|
|
||||||
private void processTask(String categoryUrl, int pageNumber, AtomicBoolean hasMoreData) {
|
|
||||||
String pageUrl = categoryUrl + "?page=" + pageNumber;
|
|
||||||
String pageContent;
|
|
||||||
try {
|
|
||||||
log.info("Получение страницы {}", pageUrl);
|
|
||||||
pageContent = pageFetcher.fetchPage(pageUrl);
|
|
||||||
} catch (Exception e) {
|
|
||||||
log.error("Ошибка получения страницы - {} \n {}", pageUrl, e.getMessage(), e);
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!dataParser.pageHasData(pageContent)) {
|
|
||||||
log.warn("Данные не найдены - {}", pageUrl);
|
|
||||||
hasMoreData.set(false);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
dataParser.parseAndQueueData(pageContent);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Recover
|
|
||||||
private void recover(Exception e, String categoryUrl, int pageNumber, AtomicBoolean hasMoreData) {
|
|
||||||
log.error(e.getMessage(), e);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -0,0 +1,7 @@
|
|||||||
|
package ru.pricepulse.parsingservice.service.marketplace.ozon;
|
||||||
|
|
||||||
|
/**
 * Common contract of the marketplace page objects: each one can report whether
 * the page it represents is currently loaded.
 */
public interface MarketplacePage {

    /**
     * Tells whether this page is loaded.
     *
     * @return {@code true} if the page is loaded, {@code false} otherwise
     */
    boolean isLoaded();

}
|
@ -0,0 +1,67 @@
|
|||||||
|
package ru.pricepulse.parsingservice.service.marketplace.ozon.page;
|
||||||
|
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.openqa.selenium.By;
|
||||||
|
import org.openqa.selenium.WebDriver;
|
||||||
|
import org.openqa.selenium.WebElement;
|
||||||
|
import org.openqa.selenium.support.ui.ExpectedConditions;
|
||||||
|
import org.openqa.selenium.support.ui.WebDriverWait;
|
||||||
|
import ru.pricepulse.parsingservice.service.marketplace.ozon.MarketplacePage;
|
||||||
|
|
||||||
|
@Slf4j
|
||||||
|
public class AccessDeniedPage implements MarketplacePage {
|
||||||
|
|
||||||
|
private static final String RELOAD_BUTTON_ID = "reload-button";
|
||||||
|
private static final String RELOAD_BUTTON_XPATH = "//button[contains(text(),'Обновить')]";
|
||||||
|
private static final String WARNING_IMAGE_CSS = "img[alt='warning']";
|
||||||
|
private static final String ACCESS_DENIED_TEXT_XPATH = "//h1[text()='Доступ ограничен']";
|
||||||
|
|
||||||
|
private final By reloadButtonById = By.id(RELOAD_BUTTON_ID);
|
||||||
|
private final By reloadButtonByXpath = By.xpath(RELOAD_BUTTON_XPATH);
|
||||||
|
private final By warningImage = By.cssSelector(WARNING_IMAGE_CSS);
|
||||||
|
private final By accessDeniedText = By.xpath(ACCESS_DENIED_TEXT_XPATH);
|
||||||
|
|
||||||
|
private WebDriver driver;
|
||||||
|
private WebDriverWait wait;
|
||||||
|
|
||||||
|
public AccessDeniedPage(WebDriver driver,
|
||||||
|
WebDriverWait wait) {
|
||||||
|
this.driver = driver;
|
||||||
|
this.wait = wait;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void clickReloadButton() {
|
||||||
|
try {
|
||||||
|
log.debug("Пытаемся найти кнопку по id и нажать");
|
||||||
|
driver.findElement(reloadButtonById).click();
|
||||||
|
return;
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.debug("Кнопка обновления страницы не найдена по id");
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
log.debug("Пытаемся найти кнопку по xpath и нажать");
|
||||||
|
driver.findElement(reloadButtonByXpath).click();
|
||||||
|
log.debug("Успешно нашли кнопку по xpath");
|
||||||
|
return;
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.debug("Кнопка обновления страницы не найдена по xpath");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isWarningImage() {
|
||||||
|
return driver.findElement(warningImage) != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isAccessDeniedText() {
|
||||||
|
return driver.findElement(accessDeniedText) != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isLoaded() {
|
||||||
|
try {
|
||||||
|
return isWarningImage() && isAccessDeniedText();
|
||||||
|
} catch (Exception e) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,79 @@
|
|||||||
|
package ru.pricepulse.parsingservice.service.marketplace.ozon.page;
|
||||||
|
|
||||||
|
import static org.openqa.selenium.support.ui.ExpectedConditions.visibilityOfAllElements;
|
||||||
|
import static org.openqa.selenium.support.ui.ExpectedConditions.visibilityOfElementLocated;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.openqa.selenium.By;
|
||||||
|
import org.openqa.selenium.StaleElementReferenceException;
|
||||||
|
import org.openqa.selenium.WebDriver;
|
||||||
|
import org.openqa.selenium.WebElement;
|
||||||
|
import org.openqa.selenium.support.ui.ExpectedConditions;
|
||||||
|
import org.openqa.selenium.support.ui.WebDriverWait;
|
||||||
|
import org.springframework.retry.annotation.Recover;
|
||||||
|
import org.springframework.retry.annotation.Retryable;
|
||||||
|
import ru.pricepulse.parsingservice.service.marketplace.ozon.MarketplacePage;
|
||||||
|
|
||||||
|
@Slf4j
|
||||||
|
public class CategoryPage implements MarketplacePage {
|
||||||
|
|
||||||
|
private static final int PAGE_SIZE = 12;
|
||||||
|
private static final String SEARCH_RESULTS = "div[data-widget='searchResultsV2']";
|
||||||
|
|
||||||
|
private final By searchResults = By.cssSelector(SEARCH_RESULTS);
|
||||||
|
|
||||||
|
private WebDriver driver;
|
||||||
|
|
||||||
|
private WebDriverWait wait;
|
||||||
|
|
||||||
|
public CategoryPage(WebDriver driver, WebDriverWait wait) {
|
||||||
|
this.driver = driver;
|
||||||
|
this.wait = wait;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Set<String> getProductsLinks() {
|
||||||
|
wait.until(visibilityOfElementLocated(searchResults));
|
||||||
|
var searchResultsElement = driver.findElement(searchResults);
|
||||||
|
wait.until(driver -> visibilityOfElementLocated(By.cssSelector(":scope > div")));
|
||||||
|
var outerDiv = searchResultsElement.findElement(By.cssSelector(":scope > div")); // Внешний блок со списком товаров
|
||||||
|
wait.until(driver -> visibilityOfAllElements(outerDiv.findElements(By.cssSelector(":scope > div"))));
|
||||||
|
var innerDivs = outerDiv.findElements(By.cssSelector(":scope > div")); // Блок карточки товара
|
||||||
|
return searchProductsLinks(innerDivs, driver);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Set<String> searchProductsLinks(List<WebElement> innerDivs, WebDriver driver) {
|
||||||
|
return innerDivs.stream()
|
||||||
|
.map(div -> {
|
||||||
|
waitVisibility(div);
|
||||||
|
List<WebElement> linkTags = null;
|
||||||
|
try {
|
||||||
|
linkTags = div.findElements(By.tagName("a"));
|
||||||
|
} catch (Exception ignored) {}
|
||||||
|
return linkTags != null && !linkTags.isEmpty()
|
||||||
|
? linkTags.getFirst().getAttribute("href")
|
||||||
|
: null;
|
||||||
|
})
|
||||||
|
.filter(href -> href != null && !href.isEmpty())
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void waitVisibility(WebElement outerElement) {
|
||||||
|
wait.until(driver -> !outerElement.findElements(By.tagName("a")).isEmpty());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isLoaded() {
|
||||||
|
try {
|
||||||
|
return driver.findElement(searchResults) != null;
|
||||||
|
} catch (Exception e) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,141 @@
|
|||||||
|
package ru.pricepulse.parsingservice.service.marketplace.ozon.parsing;
|
||||||
|
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.time.temporal.ChronoUnit;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.openqa.selenium.JavascriptExecutor;
|
||||||
|
import org.openqa.selenium.StaleElementReferenceException;
|
||||||
|
import org.openqa.selenium.WebDriver;
|
||||||
|
import org.openqa.selenium.support.ui.WebDriverWait;
|
||||||
|
import org.slf4j.MDC;
|
||||||
|
import org.springframework.beans.factory.ObjectFactory;
|
||||||
|
import org.springframework.retry.annotation.Recover;
|
||||||
|
import org.springframework.retry.annotation.Retryable;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
import ru.pricepulse.parsingservice.pool.WebDriverPool;
|
||||||
|
import ru.pricepulse.parsingservice.service.marketplace.ozon.page.AccessDeniedPage;
|
||||||
|
import ru.pricepulse.parsingservice.service.marketplace.ozon.page.CategoryPage;
|
||||||
|
|
||||||
|
@Slf4j
|
||||||
|
@Service
|
||||||
|
public class CategoryPageParsingService {
|
||||||
|
|
||||||
|
private final ExecutorService productPageExecutor = Executors.newFixedThreadPool(3);
|
||||||
|
|
||||||
|
private final WebDriverPool webDriverPool;
|
||||||
|
|
||||||
|
public CategoryPageParsingService(WebDriverPool webDriverPool) {
|
||||||
|
this.webDriverPool = webDriverPool;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Retryable(maxAttempts = 10, recover = "recover")
|
||||||
|
public void parseCategoryPage(int finalPageIndex, String url, ArrayList<String> errors) throws InterruptedException {
|
||||||
|
MDC.put("pageIndex", String.valueOf(finalPageIndex));
|
||||||
|
String pageUrl = url + "/?page=" + finalPageIndex;
|
||||||
|
var driver = webDriverPool.borrowDriver();
|
||||||
|
|
||||||
|
try {
|
||||||
|
driver.manage().timeouts().pageLoadTimeout(Duration.of(10, ChronoUnit.SECONDS));
|
||||||
|
driver.get(pageUrl);
|
||||||
|
WebDriverWait wait = new WebDriverWait(driver, Duration.of(10, ChronoUnit.SECONDS));
|
||||||
|
var accessDeniedPage = new AccessDeniedPage(driver, wait); // TODO подумать как не создавать кучу PageObject
|
||||||
|
var categoryPage = new CategoryPage(driver, wait);
|
||||||
|
wait.until(d -> checkForWaitingPageLoading(accessDeniedPage, categoryPage));
|
||||||
|
if (checkAccessDeniedPage(accessDeniedPage)) {
|
||||||
|
log.info("Доступ ограничен, пробуем решить проблему: {}", pageUrl);
|
||||||
|
resolveAccessDeniedPage(accessDeniedPage);
|
||||||
|
log.info("Проблема успешно решена: {}", pageUrl);
|
||||||
|
}
|
||||||
|
log.info("Получаем список ссылок на товары на текущей странице: {}", pageUrl);
|
||||||
|
Set<String> hrefs = Set.of();
|
||||||
|
try {
|
||||||
|
hrefs = categoryPage.getProductsLinks();
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new Exception(e);
|
||||||
|
}
|
||||||
|
webDriverPool.returnDriver(driver);
|
||||||
|
log.info("Страница {} Получены ссылки на товары: {}", finalPageIndex, hrefs.size());
|
||||||
|
hrefs.forEach(href -> {
|
||||||
|
MDC.put("pageIndex", String.valueOf(finalPageIndex));
|
||||||
|
try {
|
||||||
|
processPage(href);
|
||||||
|
errors.add(href);
|
||||||
|
log.error(String.valueOf(errors.size()));
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
/*hrefs.forEach(href -> productPageExecutor.submit(() -> {
|
||||||
|
MDC.put("pageIndex", String.valueOf(finalPageIndex));
|
||||||
|
try {
|
||||||
|
processPage(href);
|
||||||
|
errors.add(href);
|
||||||
|
log.error(String.valueOf(errors.size()));
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}));*/
|
||||||
|
productPageExecutor.awaitTermination(10, TimeUnit.SECONDS);
|
||||||
|
} catch (Exception ignored) {
|
||||||
|
throw new RuntimeException(ignored);
|
||||||
|
} finally {
|
||||||
|
webDriverPool.returnDriver(driver); // Завершаем работу драйвера
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private String processPage(String href) throws InterruptedException {
|
||||||
|
var driver = webDriverPool.borrowDriver();
|
||||||
|
try {
|
||||||
|
driver.get(href);
|
||||||
|
log.info("Страница обработана");
|
||||||
|
} catch (Throwable ignored) {
|
||||||
|
|
||||||
|
} finally {
|
||||||
|
webDriverPool.returnDriver(driver); // Завершаем работу драйвера
|
||||||
|
}
|
||||||
|
return href;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean checkForWaitingPageLoading(AccessDeniedPage accessDeniedPage,
|
||||||
|
CategoryPage categoryPage) {
|
||||||
|
log.debug("Проверка что страница 'Доступ ограничен'");
|
||||||
|
try {
|
||||||
|
if (checkAccessDeniedPage(accessDeniedPage)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.debug("Ошибка проверки", e);
|
||||||
|
}
|
||||||
|
log.debug("Проверка что страница 'Страница категории'");
|
||||||
|
if (checkCategoryPage(categoryPage)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
log.debug("Проверка загрузки страницы неудачна");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean checkAccessDeniedPage(AccessDeniedPage accessDeniedPage) {
|
||||||
|
return accessDeniedPage.isLoaded();
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean checkCategoryPage(CategoryPage categoryPage) {
|
||||||
|
return categoryPage.isLoaded();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void resolveAccessDeniedPage(AccessDeniedPage accessDeniedPage) {
|
||||||
|
accessDeniedPage.clickReloadButton();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Recover
|
||||||
|
private void recover(Exception e) {
|
||||||
|
log.error("Все ретраи провалились");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,60 @@
|
|||||||
|
package ru.pricepulse.parsingservice.service.marketplace.ozon.parsing;
|
||||||
|
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.time.temporal.ChronoUnit;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
|
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.openqa.selenium.JavascriptExecutor;
|
||||||
|
import org.openqa.selenium.StaleElementReferenceException;
|
||||||
|
import org.openqa.selenium.WebDriver;
|
||||||
|
import org.openqa.selenium.support.ui.WebDriverWait;
|
||||||
|
import org.slf4j.MDC;
|
||||||
|
import org.springframework.beans.factory.ObjectFactory;
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
import org.springframework.retry.annotation.Retryable;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
import ru.pricepulse.parsingservice.service.MarketplaceParsingService;
|
||||||
|
import ru.pricepulse.parsingservice.service.marketplace.ozon.page.AccessDeniedPage;
|
||||||
|
import ru.pricepulse.parsingservice.service.marketplace.ozon.page.CategoryPage;
|
||||||
|
|
||||||
|
@Slf4j
|
||||||
|
@Service
|
||||||
|
public class ParsingService implements MarketplaceParsingService {
|
||||||
|
|
||||||
|
private final AtomicBoolean stopFlag = new AtomicBoolean(false);
|
||||||
|
|
||||||
|
private final ExecutorService categoryExecutor = Executors.newFixedThreadPool(1);
|
||||||
|
|
||||||
|
private final CategoryPageParsingService categoryPageParsingService;
|
||||||
|
|
||||||
|
public ParsingService(CategoryPageParsingService categoryPageParsingService) {
|
||||||
|
this.categoryPageParsingService = categoryPageParsingService;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void processCategory(String url) {
|
||||||
|
var startTime = System.currentTimeMillis();
|
||||||
|
log.info("Начало обработки категории: {}", url);
|
||||||
|
int pageIndex = 1;
|
||||||
|
var errors = new ArrayList<String>();
|
||||||
|
|
||||||
|
while (!stopFlag.get()) {
|
||||||
|
int finalPageIndex = pageIndex;
|
||||||
|
try {
|
||||||
|
categoryPageParsingService.parseCategoryPage(finalPageIndex, url, errors);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
++pageIndex;
|
||||||
|
if (pageIndex > 5) {
|
||||||
|
stopFlag.set(true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
log.info("Время выполнения {} ", (System.currentTimeMillis() - startTime) / 1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -4,20 +4,19 @@ import lombok.RequiredArgsConstructor;
|
|||||||
import org.springframework.scheduling.annotation.Scheduled;
|
import org.springframework.scheduling.annotation.Scheduled;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
import ru.pricepulse.parsingservice.config.properties.OzonConfigProperties;
|
import ru.pricepulse.parsingservice.config.properties.OzonConfigProperties;
|
||||||
import ru.pricepulse.parsingservice.service.impl.parsing.OzonParsingService;
|
import ru.pricepulse.parsingservice.service.marketplace.ozon.parsing.ParsingService;
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class OzonProductUpdater {
|
public class OzonProductUpdater {
|
||||||
|
|
||||||
private final OzonConfigProperties properties;
|
private final OzonConfigProperties properties;
|
||||||
|
private final ParsingService ozonParsingService;
|
||||||
private final OzonParsingService parsingService;
|
|
||||||
|
|
||||||
@Scheduled(fixedRate = 3600000)
|
@Scheduled(fixedRate = 3600000)
|
||||||
public void updateOzonProducts() {
|
public void updateOzonProducts() {
|
||||||
properties.getCategoriesUrls().forEach(
|
properties.getCategoriesUrls()
|
||||||
parsingService::processCategory);
|
.forEach(ozonParsingService::processCategory);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -15,10 +15,12 @@ spring:
|
|||||||
liquibase:
|
liquibase:
|
||||||
change-log: classpath:/db/changelog/master.yml
|
change-log: classpath:/db/changelog/master.yml
|
||||||
kafka:
|
kafka:
|
||||||
|
selenium:
|
||||||
|
|
||||||
marketplace:
|
marketplace:
|
||||||
ozon:
|
ozon:
|
||||||
categories-urls:
|
categories-urls:
|
||||||
- https://www.ozon.ru/category/noutbuki-15692
|
- https://www.ozon.ru/category/noutbuki-15692
|
||||||
|
logging:
|
||||||
|
pattern:
|
||||||
|
console: "%d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{36} - %msg %X%n"
|
Loading…
Reference in New Issue
Block a user