From 9895aaff334e334d1bbae905d64d9d6314bb7095 Mon Sep 17 00:00:00 2001 From: "danil.markov" Date: Mon, 14 Oct 2024 21:43:57 +0400 Subject: [PATCH] Feature/parsing-service: save commit --- parsing-service/build.gradle | 1 + .../ozon_parser/pool/WebDriverPool.java | 2 + .../ozon_parser/service/DataParser.java | 37 -------- .../ozon_parser/service/PartitionService.java | 8 +- .../ozon_parser/service/dto/ParsedData.java | 7 +- .../marketplace/ozon/MarketplacePage.java | 7 -- .../marketplace/ozon/page/CategoryPage.java | 73 -------------- .../ozon/parsing/ParsingService.java | 47 ---------- .../service/messaging/ParsedDataProducer.java | 18 ---- .../ozon => }/page/AccessDeniedPage.java | 3 +- .../service/page/CategoryPage.java | 94 +++++++++++++++++++ .../service/page/MarketplacePage.java | 7 ++ .../service/page/NoContentPage.java | 32 +++++++ .../OzonCategoryPageParsingService.java} | 91 +++++++----------- .../service/parsing/OzonParsingService.java | 43 +++++++++ .../service/request/PageFetcher.java | 20 ---- .../service/scheduler/OzonProductUpdater.java | 6 +- .../service/scheduler/PartitionScheduler.java | 2 + .../entity/PriceHistoryEntity.java | 7 ++ .../persistence/entity/PriceHistoryId.java | 11 +-- .../repository/ProductPriceRepository.java | 3 +- .../repository/ProductRepository.java | 3 + .../service/ProductService.java | 69 ++++++++++++++ .../wildberries_parser/DebugRunner.java | 8 +- ...ce.java => WildberriesParsingService.java} | 6 +- .../src/main/resources/application.yml | 15 +-- .../20241006_001_add_columns_in_tables.xml | 17 +--- 27 files changed, 336 insertions(+), 301 deletions(-) delete mode 100644 parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/DataParser.java delete mode 100644 parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/marketplace/ozon/MarketplacePage.java delete mode 100644 
parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/marketplace/ozon/page/CategoryPage.java delete mode 100644 parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/marketplace/ozon/parsing/ParsingService.java delete mode 100644 parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/messaging/ParsedDataProducer.java rename parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/{marketplace/ozon => }/page/AccessDeniedPage.java (93%) create mode 100644 parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/page/CategoryPage.java create mode 100644 parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/page/MarketplacePage.java create mode 100644 parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/page/NoContentPage.java rename parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/{marketplace/ozon/parsing/CategoryPageParsingService.java => parsing/OzonCategoryPageParsingService.java} (51%) create mode 100644 parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/parsing/OzonParsingService.java delete mode 100644 parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/request/PageFetcher.java create mode 100644 parsing-service/src/main/java/ru/pricepulse/parsingservice/service/ProductService.java rename parsing-service/src/main/java/ru/pricepulse/parsingservice/wildberries_parser/service/{ParsingService.java => WildberriesParsingService.java} (95%) diff --git a/parsing-service/build.gradle b/parsing-service/build.gradle index d6697fa..e056776 100644 --- a/parsing-service/build.gradle +++ b/parsing-service/build.gradle @@ -37,6 +37,7 @@ dependencies { implementation "org.seleniumhq.selenium:selenium-java:${seleniumVersion}" implementation 'io.github.bonigarcia:webdrivermanager:5.5.0' implementation 
'org.apache.commons:commons-pool2:2.12.0' + implementation 'com.clickhouse:clickhouse-jdbc:0.6.5' compileOnly 'org.projectlombok:lombok' diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/pool/WebDriverPool.java b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/pool/WebDriverPool.java index b5d4705..b2bc6b1 100644 --- a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/pool/WebDriverPool.java +++ b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/pool/WebDriverPool.java @@ -8,10 +8,12 @@ import org.openqa.selenium.WebDriver; import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.chrome.ChromeOptions; import org.springframework.beans.factory.ObjectFactory; +import org.springframework.context.annotation.Profile; import org.springframework.stereotype.Component; @Slf4j @Component +@Profile("ozon") public class WebDriverPool { private final Queue availableDrivers = new ConcurrentLinkedQueue<>(); // Список доступных драйверов diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/DataParser.java b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/DataParser.java deleted file mode 100644 index 4eb5606..0000000 --- a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/DataParser.java +++ /dev/null @@ -1,37 +0,0 @@ -package ru.pricepulse.parsingservice.ozon_parser.service; - -import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.springframework.stereotype.Service; -import ru.pricepulse.parsingservice.ozon_parser.service.dto.ParsedData; -import ru.pricepulse.parsingservice.ozon_parser.service.messaging.ParsedDataProducer; - -@Slf4j -@Service -@RequiredArgsConstructor -public class DataParser { - - private final ParsedDataProducer queueProducer; 
- - public boolean pageHasData(String html) { - Document doc = Jsoup.parse(html); - return doc.select("div[data-widget=searchResultsError]").isEmpty(); - } - - public void parseAndQueueData(String html) { - Document doc = Jsoup.parse(html); - for (Element item : doc.select(".item-class")) { - String title = item.select(".item-title").text(); - String price = item.select(".item-price").text(); - - ParsedData parsedData = new ParsedData(); - log.info("Попытка отправить данные в очередь"); - queueProducer.sendToQueue(parsedData); - log.info("Данные успешно отправлены в очередь"); - } - } - -} diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/PartitionService.java b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/PartitionService.java index 2371095..f629883 100644 --- a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/PartitionService.java +++ b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/PartitionService.java @@ -1,25 +1,27 @@ package ru.pricepulse.parsingservice.ozon_parser.service; import lombok.RequiredArgsConstructor; +import org.springframework.context.annotation.Profile; import org.springframework.jdbc.core.JdbcTemplate; import org.springframework.stereotype.Service; @Service @RequiredArgsConstructor +@Profile("postgres_stat") public class PartitionService { - private final JdbcTemplate jdbcTemplate; + private final JdbcTemplate postgresDataSource; public boolean checkPartitionExists(String partitionName) { String query = "SELECT to_regclass('public." 
+ partitionName + "')"; - String result = jdbcTemplate.queryForObject(query, String.class); + String result = postgresDataSource.queryForObject(query, String.class); return result != null; } public void createPartition(String partitionName, String startDate, String endDate) { String createPartitionSQL = "CREATE TABLE IF NOT EXISTS " + partitionName + " PARTITION OF price_history FOR VALUES FROM ('" + startDate + "') TO ('" + endDate + "')"; - jdbcTemplate.execute(createPartitionSQL); + postgresDataSource.execute(createPartitionSQL); } } diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/dto/ParsedData.java b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/dto/ParsedData.java index 1bfba10..82c723f 100644 --- a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/dto/ParsedData.java +++ b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/dto/ParsedData.java @@ -1,7 +1,10 @@ package ru.pricepulse.parsingservice.ozon_parser.service.dto; +import java.math.BigDecimal; + import lombok.Getter; import lombok.Setter; +import ru.pricepulse.parsingservice.enumeration.Category; import ru.pricepulse.parsingservice.enumeration.Marketplace; @Getter @@ -10,7 +13,7 @@ public class ParsedData { private Marketplace marketplace; - private String category; + private Category category; private String brand; @@ -20,4 +23,6 @@ public class ParsedData { private String imageUrl; + private BigDecimal price; + } diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/marketplace/ozon/MarketplacePage.java b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/marketplace/ozon/MarketplacePage.java deleted file mode 100644 index 20462c5..0000000 --- a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/marketplace/ozon/MarketplacePage.java +++ /dev/null @@ -1,7 +0,0 @@ -package 
ru.pricepulse.parsingservice.ozon_parser.service.marketplace.ozon; - -public interface MarketplacePage { - - boolean isLoaded(); - -} diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/marketplace/ozon/page/CategoryPage.java b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/marketplace/ozon/page/CategoryPage.java deleted file mode 100644 index 712957b..0000000 --- a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/marketplace/ozon/page/CategoryPage.java +++ /dev/null @@ -1,73 +0,0 @@ -package ru.pricepulse.parsingservice.ozon_parser.service.marketplace.ozon.page; - -import static org.openqa.selenium.support.ui.ExpectedConditions.visibilityOfAllElements; -import static org.openqa.selenium.support.ui.ExpectedConditions.visibilityOfElementLocated; - -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; - -import lombok.extern.slf4j.Slf4j; -import org.openqa.selenium.By; -import org.openqa.selenium.WebDriver; -import org.openqa.selenium.WebElement; -import org.openqa.selenium.support.ui.WebDriverWait; -import ru.pricepulse.parsingservice.ozon_parser.service.marketplace.ozon.MarketplacePage; - -@Slf4j -public class CategoryPage implements MarketplacePage { - - private static final int PAGE_SIZE = 12; - private static final String SEARCH_RESULTS = "div[data-widget='searchResultsV2']"; - - private final By searchResults = By.cssSelector(SEARCH_RESULTS); - - private WebDriver driver; - - private WebDriverWait wait; - - public CategoryPage(WebDriver driver, WebDriverWait wait) { - this.driver = driver; - this.wait = wait; - } - - public Set getProductsLinks() { - wait.until(visibilityOfElementLocated(searchResults)); - var searchResultsElement = driver.findElement(searchResults); - wait.until(driver -> visibilityOfElementLocated(By.cssSelector(":scope > div"))); - var outerDiv = searchResultsElement.findElement(By.cssSelector(":scope > div")); 
// Внешний блок со списком товаров - wait.until(driver -> visibilityOfAllElements(outerDiv.findElements(By.cssSelector(":scope > div")))); - var innerDivs = outerDiv.findElements(By.cssSelector(":scope > div")); // Блок карточки товара - return searchProductsLinks(innerDivs, driver); - } - - private Set searchProductsLinks(List innerDivs, WebDriver driver) { - return innerDivs.stream() - .map(div -> { - waitVisibility(div); - List linkTags = null; - try { - linkTags = div.findElements(By.tagName("a")); - } catch (Exception ignored) {} - return linkTags != null && !linkTags.isEmpty() - ? linkTags.getFirst().getAttribute("href") - : null; - }) - .filter(href -> href != null && !href.isEmpty()) - .collect(Collectors.toSet()); - } - - private void waitVisibility(WebElement outerElement) { - wait.until(driver -> !outerElement.findElements(By.tagName("a")).isEmpty()); - } - - @Override - public boolean isLoaded() { - try { - return driver.findElement(searchResults) != null; - } catch (Exception e) { - return false; - } - } - -} diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/marketplace/ozon/parsing/ParsingService.java b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/marketplace/ozon/parsing/ParsingService.java deleted file mode 100644 index d2a56c4..0000000 --- a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/marketplace/ozon/parsing/ParsingService.java +++ /dev/null @@ -1,47 +0,0 @@ -package ru.pricepulse.parsingservice.ozon_parser.service.marketplace.ozon.parsing; - -import java.util.ArrayList; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.atomic.AtomicBoolean; - -import lombok.extern.slf4j.Slf4j; -import org.springframework.stereotype.Service; -import ru.pricepulse.parsingservice.ozon_parser.service.MarketplaceParsingService; - -@Slf4j -@Service -public class ParsingService implements 
MarketplaceParsingService { - - private final AtomicBoolean stopFlag = new AtomicBoolean(false); - - private final ExecutorService categoryExecutor = Executors.newFixedThreadPool(1); - - private final CategoryPageParsingService categoryPageParsingService; - - public ParsingService(CategoryPageParsingService categoryPageParsingService) { - this.categoryPageParsingService = categoryPageParsingService; - } - - public void processCategory(String url) { - var startTime = System.currentTimeMillis(); - log.info("Начало обработки категории: {}", url); - int pageIndex = 1; - var errors = new ArrayList(); - - while (!stopFlag.get()) { - int finalPageIndex = pageIndex; - try { - categoryPageParsingService.parseCategoryPage(finalPageIndex, url, errors); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } - ++pageIndex; - if (pageIndex > 5) { - stopFlag.set(true); - } - } - log.info("Время выполнения {} ", (System.currentTimeMillis() - startTime) / 1000); - } - -} \ No newline at end of file diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/messaging/ParsedDataProducer.java b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/messaging/ParsedDataProducer.java deleted file mode 100644 index ec9fea9..0000000 --- a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/messaging/ParsedDataProducer.java +++ /dev/null @@ -1,18 +0,0 @@ -package ru.pricepulse.parsingservice.ozon_parser.service.messaging; - -import lombok.RequiredArgsConstructor; -import org.springframework.kafka.core.KafkaTemplate; -import org.springframework.stereotype.Service; -import ru.pricepulse.parsingservice.ozon_parser.service.dto.ParsedData; - -@Service -@RequiredArgsConstructor -public class ParsedDataProducer { - - private final KafkaTemplate kafkaTemplate; - - public void sendToQueue(ParsedData data) { - kafkaTemplate.send("parsed-data-queue", data); - } - -} diff --git 
a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/marketplace/ozon/page/AccessDeniedPage.java b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/page/AccessDeniedPage.java
similarity index 93%
rename from parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/marketplace/ozon/page/AccessDeniedPage.java
rename to parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/page/AccessDeniedPage.java
index e5d2d61..eb88b73 100644
--- a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/marketplace/ozon/page/AccessDeniedPage.java
+++ b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/page/AccessDeniedPage.java
@@ -1,10 +1,9 @@
-package ru.pricepulse.parsingservice.ozon_parser.service.marketplace.ozon.page;
+package ru.pricepulse.parsingservice.ozon_parser.service.page;
 
 import lombok.extern.slf4j.Slf4j;
 import org.openqa.selenium.By;
 import org.openqa.selenium.WebDriver;
 import org.openqa.selenium.support.ui.WebDriverWait;
-import ru.pricepulse.parsingservice.ozon_parser.service.marketplace.ozon.MarketplacePage;
 
 @Slf4j
 public class AccessDeniedPage implements MarketplacePage {
diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/page/CategoryPage.java b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/page/CategoryPage.java
new file mode 100644
index 0000000..7d6b615
--- /dev/null
+++ b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/page/CategoryPage.java
@@ -0,0 +1,118 @@
+package ru.pricepulse.parsingservice.ozon_parser.service.page;
+
+import static org.openqa.selenium.support.ui.ExpectedConditions.visibilityOfAllElements;
+import static org.openqa.selenium.support.ui.ExpectedConditions.visibilityOfElementLocated;
+
+import java.math.BigDecimal;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import lombok.extern.slf4j.Slf4j;
+import org.openqa.selenium.By;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.WebElement;
+import org.openqa.selenium.support.ui.WebDriverWait;
+import ru.pricepulse.parsingservice.ozon_parser.service.dto.ParsedData;
+
+/**
+ * Page object for an Ozon category (search results) page: waits for the product grid
+ * and reads brand, name, URL, image and price straight from the product cards.
+ */
+@Slf4j
+public class CategoryPage implements MarketplacePage {
+
+    private static final String SEARCH_RESULTS = "div[data-widget='searchResultsV2']";
+
+    private final By searchResults = By.cssSelector(SEARCH_RESULTS);
+
+    /** Direct {@code div} children of the element the lookup starts from. */
+    private final By childDivs = By.cssSelector(":scope > div");
+
+    private final WebDriver driver;
+
+    private final WebDriverWait wait;
+
+    public CategoryPage(WebDriver driver, WebDriverWait wait) {
+        this.driver = driver;
+        this.wait = wait;
+    }
+
+    /**
+     * Parses every product card of the currently loaded category page.
+     *
+     * @return one {@link ParsedData} per card; marketplace and category are left unset
+     * @throws org.openqa.selenium.TimeoutException if the product grid does not appear in time
+     */
+    public ArrayList<ParsedData> getParsedProducts() {
+        wait.until(visibilityOfElementLocated(searchResults));
+        log.info("Нашли SearchResultsV2");
+        var searchResultsElement = driver.findElement(searchResults);
+        // The lambda has to evaluate the condition itself: returning an ExpectedCondition
+        // object from it is always truthy and would end the wait without checking the DOM.
+        wait.until(d -> !searchResultsElement.findElements(childDivs).isEmpty());
+        log.info("Нашли внешний блок списка");
+        var outerDiv = searchResultsElement.findElement(childDivs); // outer block holding the product list
+        wait.until(d -> !outerDiv.findElements(childDivs).isEmpty());
+        log.info("Нашли элементы списка");
+        var innerDivs = outerDiv.findElements(childDivs); // one block per product card
+
+        var products = new ArrayList<ParsedData>();
+        for (WebElement innerDiv : innerDivs) {
+            products.add(parseCard(innerDiv));
+        }
+        return products;
+    }
+
+    /** Reads a single product card; relies on the card's positional layout (image, title, price). */
+    private ParsedData parseCard(WebElement card) {
+        var productDataDivs = card.findElements(childDivs);
+        var productImageUrl = productDataDivs.get(0)
+                .findElement(By.cssSelector(":scope > a > div"))
+                .findElements(childDivs).getFirst()
+                .findElement(By.tagName("img")).getAttribute("src");
+
+        var productBrand = productDataDivs.get(1).findElement(childDivs)
+                .findElements(childDivs).getFirst()
+                .findElement(By.tagName("b")).getText();
+
+        var productNameLink = productDataDivs.get(1).findElement(By.cssSelector(":scope > div > a"));
+        var productUrl = productNameLink.getAttribute("href");
+        var productName = productNameLink.findElement(By.tagName("span")).getText();
+
+        var productPrice = parseCurrency(productDataDivs.get(2).findElement(By.cssSelector(":scope > div > div"))
+                .findElements(By.tagName("span")).getFirst().getText());
+
+        var parsedData = new ParsedData();
+        parsedData.setUrl(productUrl);
+        parsedData.setBrand(productBrand);
+        parsedData.setProductName(productName);
+        parsedData.setImageUrl(productImageUrl);
+        parsedData.setPrice(productPrice);
+        return parsedData;
+    }
+
+    /**
+     * Extracts the integer amount from a price label by dropping every non-digit character.
+     *
+     * @throws NumberFormatException if the label contains no digits
+     */
+    private BigDecimal parseCurrency(String currencyStr) {
+        String cleanedString = currencyStr.replaceAll("[^\\d]", "");
+        if (cleanedString.isEmpty()) {
+            throw new NumberFormatException("No digits in price text: '" + currencyStr + "'");
+        }
+        return new BigDecimal(cleanedString);
+    }
+
+    @Override
+    public boolean isLoaded() {
+        try {
+            return driver.findElement(searchResults) != null;
+        } catch (Exception e) {
+            return false;
+        }
+    }
+
+}
diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/page/MarketplacePage.java b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/page/MarketplacePage.java
new file mode 100644
index 0000000..e2e2f0b
--- /dev/null
+++ b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/page/MarketplacePage.java
@@ -0,0 +1,7 @@
+package ru.pricepulse.parsingservice.ozon_parser.service.page;
+
+public interface MarketplacePage {
+
+    boolean isLoaded();
+
+}
diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/page/NoContentPage.java b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/page/NoContentPage.java
new file mode 100644
index 0000000..bbcfb1e
--- /dev/null
+++ b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/page/NoContentPage.java
@@ -0,0 +1,32 @@
+package
ru.pricepulse.parsingservice.ozon_parser.service.page;
+
+import lombok.extern.slf4j.Slf4j;
+import org.openqa.selenium.By;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.support.ui.WebDriverWait;
+/** Page object for the Ozon error stub ("Sorry, an error occurred...") that is matched by its message text. */
+@Slf4j
+public class NoContentPage {
+    // Must be a bare XPath: wrapped in extra quotes it is a string literal, which can never select an element.
+    private static final String ERROR_TEXT_XPATH = "//*[contains(text(), 'Простите, произошла ошибка. Попробуйте обновить страницу или вернуться на шаг назад.')]";
+
+    private final By errorText = By.xpath(ERROR_TEXT_XPATH);
+
+    private WebDriver driver;
+
+    private WebDriverWait wait;
+
+    public NoContentPage(WebDriver driver, WebDriverWait wait) {
+        this.driver = driver;
+        this.wait = wait;
+    }
+    /** Returns true when the error text is found on the current page; any lookup failure is reported as false. */
+    public boolean isLoaded() {
+        try {
+            return driver.findElement(errorText) != null;
+        } catch (Exception e) {
+            return false;
+        }
+    }
+
+}
diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/marketplace/ozon/parsing/CategoryPageParsingService.java b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/parsing/OzonCategoryPageParsingService.java
similarity index 51%
rename from parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/marketplace/ozon/parsing/CategoryPageParsingService.java
rename to parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/parsing/OzonCategoryPageParsingService.java
index 7257435..4e47f14 100644
--- a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/marketplace/ozon/parsing/CategoryPageParsingService.java
+++ b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/parsing/OzonCategoryPageParsingService.java
@@ -1,103 +1,82 @@
-package ru.pricepulse.parsingservice.ozon_parser.service.marketplace.ozon.parsing;
+package ru.pricepulse.parsingservice.ozon_parser.service.parsing;
 
 import java.time.Duration;
 import java.time.temporal.ChronoUnit;
-import java.util.ArrayList;
-import java.util.Set;
-import
java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.TimeUnit; +import java.util.List; +import java.util.concurrent.atomic.AtomicBoolean; import lombok.extern.slf4j.Slf4j; import org.openqa.selenium.support.ui.WebDriverWait; -import org.slf4j.MDC; +import org.springframework.context.annotation.Profile; import org.springframework.retry.annotation.Recover; import org.springframework.retry.annotation.Retryable; import org.springframework.stereotype.Service; +import ru.pricepulse.parsingservice.enumeration.Category; +import ru.pricepulse.parsingservice.enumeration.Marketplace; import ru.pricepulse.parsingservice.ozon_parser.pool.WebDriverPool; -import ru.pricepulse.parsingservice.ozon_parser.service.marketplace.ozon.page.AccessDeniedPage; -import ru.pricepulse.parsingservice.ozon_parser.service.marketplace.ozon.page.CategoryPage; +import ru.pricepulse.parsingservice.ozon_parser.service.dto.ParsedData; +import ru.pricepulse.parsingservice.ozon_parser.service.page.AccessDeniedPage; +import ru.pricepulse.parsingservice.ozon_parser.service.page.CategoryPage; +import ru.pricepulse.parsingservice.ozon_parser.service.page.NoContentPage; +import ru.pricepulse.parsingservice.service.ProductService; @Slf4j @Service -public class CategoryPageParsingService { - - private final ExecutorService productPageExecutor = Executors.newFixedThreadPool(3); +@Profile("ozon") +public class OzonCategoryPageParsingService { private final WebDriverPool webDriverPool; - public CategoryPageParsingService(WebDriverPool webDriverPool) { + private final ProductService productService; + + public OzonCategoryPageParsingService(WebDriverPool webDriverPool, + ProductService productService) { this.webDriverPool = webDriverPool; + this.productService = productService; } @Retryable(maxAttempts = 10, recover = "recover") - public void parseCategoryPage(int finalPageIndex, String url, ArrayList errors) throws InterruptedException { - MDC.put("pageIndex", 
String.valueOf(finalPageIndex)); - String pageUrl = url + "/?page=" + finalPageIndex; + public void parseCategoryPage(String pageUrl, Category category, AtomicBoolean stopFlag) { var driver = webDriverPool.borrowDriver(); try { driver.manage().timeouts().pageLoadTimeout(Duration.of(10, ChronoUnit.SECONDS)); driver.get(pageUrl); WebDriverWait wait = new WebDriverWait(driver, Duration.of(10, ChronoUnit.SECONDS)); - var accessDeniedPage = new AccessDeniedPage(driver, wait); // TODO подумать как не создавать кучу PageObject + var accessDeniedPage = new AccessDeniedPage(driver, wait); var categoryPage = new CategoryPage(driver, wait); + var noContentPage = new NoContentPage(driver, wait); wait.until(d -> checkForWaitingPageLoading(accessDeniedPage, categoryPage)); if (checkAccessDeniedPage(accessDeniedPage)) { log.info("Доступ ограничен, пробуем решить проблему: {}", pageUrl); resolveAccessDeniedPage(accessDeniedPage); log.info("Проблема успешно решена: {}", pageUrl); } - log.info("Получаем список ссылок на товары на текущей странице: {}", pageUrl); - Set hrefs = Set.of(); + if (noContentPage.isLoaded()) { + log.info("Страница не найдена"); + stopFlag.set(true); + return; + } + log.info("Получаем список товаров на текущей странице: {}", pageUrl); + List parsedData; try { - hrefs = categoryPage.getProductsLinks(); + parsedData = categoryPage.getParsedProducts(); + for (ParsedData data : parsedData) { + data.setMarketplace(Marketplace.OZON); + data.setCategory(category); + } + productService.saveBatch(parsedData); } catch (Exception e) { throw new Exception(e); } webDriverPool.returnDriver(driver); - log.info("Страница {} Получены ссылки на товары: {}", finalPageIndex, hrefs.size()); - hrefs.forEach(href -> { - MDC.put("pageIndex", String.valueOf(finalPageIndex)); - try { - processPage(href); - errors.add(href); - log.error(String.valueOf(errors.size())); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } - }); - /*hrefs.forEach(href -> 
productPageExecutor.submit(() -> { - MDC.put("pageIndex", String.valueOf(finalPageIndex)); - try { - processPage(href); - errors.add(href); - log.error(String.valueOf(errors.size())); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } - }));*/ - productPageExecutor.awaitTermination(10, TimeUnit.SECONDS); } catch (Exception ignored) { throw new RuntimeException(ignored); } finally { - webDriverPool.returnDriver(driver); // Завершаем работу драйвера + webDriverPool.returnDriver(driver); } } - private String processPage(String href) throws InterruptedException { - var driver = webDriverPool.borrowDriver(); - try { - driver.get(href); - log.info("Страница обработана"); - } catch (Throwable ignored) { - - } finally { - webDriverPool.returnDriver(driver); // Завершаем работу драйвера - } - return href; - } - private boolean checkForWaitingPageLoading(AccessDeniedPage accessDeniedPage, CategoryPage categoryPage) { log.debug("Проверка что страница 'Доступ ограничен'"); diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/parsing/OzonParsingService.java b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/parsing/OzonParsingService.java new file mode 100644 index 0000000..cb5fa3d --- /dev/null +++ b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/parsing/OzonParsingService.java @@ -0,0 +1,43 @@ +package ru.pricepulse.parsingservice.ozon_parser.service.parsing; + +import java.util.ArrayList; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicBoolean; + +import lombok.extern.slf4j.Slf4j; +import org.springframework.context.annotation.Profile; +import org.springframework.stereotype.Service; +import ru.pricepulse.parsingservice.enumeration.Category; +import ru.pricepulse.parsingservice.ozon_parser.service.MarketplaceParsingService; + +@Slf4j +@Service +@Profile("ozon") +public class 
OzonParsingService implements MarketplaceParsingService {
+
+    private static final int PAGE_WORKERS = 12; // pages fetched in parallel per batch
+
+    private final OzonCategoryPageParsingService categoryPageParsingService;
+
+    public OzonParsingService(OzonCategoryPageParsingService categoryPageParsingService) {
+        this.categoryPageParsingService = categoryPageParsingService;
+    }
+
+    /** Parses the category in batches of PAGE_WORKERS pages until a page sets the stop flag (no more content). */
+    public void processCategory(String url) {
+        var stopFlag = new AtomicBoolean(false); // per run: a shared flag would stay set and disable later runs
+        try (ExecutorService executor = Executors.newFixedThreadPool(PAGE_WORKERS)) {
+            for (int firstPage = 1; !stopFlag.get(); firstPage += PAGE_WORKERS) {
+                var batch = new ArrayList<java.util.concurrent.Callable<Object>>();
+                for (int page = firstPage; page < firstPage + PAGE_WORKERS; page++) {
+                    String pageUrl = url + "&page=" + page;
+                    batch.add(Executors.callable(() -> categoryPageParsingService.parseCategoryPage(pageUrl, Category.LAPTOP, stopFlag)));
+                }
+                executor.invokeAll(batch); // blocks until the whole batch is done, so the task queue stays bounded
+            }
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+        }
+    }
+}
\ No newline at end of file
diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/request/PageFetcher.java b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/request/PageFetcher.java
deleted file mode 100644
index 1378ec6..0000000
--- a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/request/PageFetcher.java
+++ /dev/null
@@ -1,20 +0,0 @@
-package ru.pricepulse.parsingservice.ozon_parser.service.request;
-
-import lombok.RequiredArgsConstructor;
-import lombok.extern.slf4j.Slf4j;
-import org.springframework.stereotype.Service;
-import org.springframework.web.client.RestTemplate;
-
-@Slf4j
-@Service
-@RequiredArgsConstructor
-public class PageFetcher {
-
-    private final RestTemplate restTemplate;
-
-    public String fetchPage(String url) {
-        log.info("Поолучение страницы {}", url);
-        return restTemplate.getForObject(url, String.class);
-    }
-
-}
diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/scheduler/OzonProductUpdater.java
b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/scheduler/OzonProductUpdater.java index 6efbfe6..91b63b4 100644 --- a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/scheduler/OzonProductUpdater.java +++ b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/scheduler/OzonProductUpdater.java @@ -1,17 +1,19 @@ package ru.pricepulse.parsingservice.ozon_parser.service.scheduler; import lombok.RequiredArgsConstructor; +import org.springframework.context.annotation.Profile; import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Service; import ru.pricepulse.parsingservice.config.properties.OzonConfigProperties; -import ru.pricepulse.parsingservice.ozon_parser.service.marketplace.ozon.parsing.ParsingService; +import ru.pricepulse.parsingservice.ozon_parser.service.parsing.OzonParsingService; @Service @RequiredArgsConstructor +@Profile("ozon") public class OzonProductUpdater { private final OzonConfigProperties properties; - private final ParsingService ozonParsingService; + private final OzonParsingService ozonParsingService; @Scheduled(fixedRate = 3600000) public void updateOzonProducts() { diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/scheduler/PartitionScheduler.java b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/scheduler/PartitionScheduler.java index 878441f..038fcde 100644 --- a/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/scheduler/PartitionScheduler.java +++ b/parsing-service/src/main/java/ru/pricepulse/parsingservice/ozon_parser/service/scheduler/PartitionScheduler.java @@ -6,6 +6,7 @@ import java.time.format.DateTimeFormatter; import jakarta.annotation.PostConstruct; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import org.springframework.context.annotation.Profile; import 
org.springframework.scheduling.annotation.Scheduled;
 import org.springframework.stereotype.Service;
 import ru.pricepulse.parsingservice.ozon_parser.service.PartitionService;
@@ -13,6 +14,7 @@ import ru.pricepulse.parsingservice.ozon_parser.service.PartitionService;
 @Slf4j
 @Service
 @RequiredArgsConstructor
+@Profile("postgres_stat")
 public class PartitionScheduler {
 
     private final PartitionService partitionService;
diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/persistence/entity/PriceHistoryEntity.java b/parsing-service/src/main/java/ru/pricepulse/parsingservice/persistence/entity/PriceHistoryEntity.java
index 652ed4c..eac22ac 100644
--- a/parsing-service/src/main/java/ru/pricepulse/parsingservice/persistence/entity/PriceHistoryEntity.java
+++ b/parsing-service/src/main/java/ru/pricepulse/parsingservice/persistence/entity/PriceHistoryEntity.java
@@ -3,11 +3,13 @@ package ru.pricepulse.parsingservice.persistence.entity;
 import jakarta.persistence.Column;
 import jakarta.persistence.EmbeddedId;
 import jakarta.persistence.Entity;
+import jakarta.persistence.PrePersist;
 import jakarta.persistence.Table;
 import lombok.*;
 import org.hibernate.proxy.HibernateProxy;
 
 import java.math.BigDecimal;
+import java.time.LocalDateTime;
 import java.util.Objects;
 
 @Getter
@@ -48,4 +50,16 @@ public class PriceHistoryEntity {
         return Objects.hash(id);
     }
 
+    /**
+     * Stamps the record with the current time before it is persisted, unless the caller has
+     * already put a date into the key. A missing key is left alone so that the persistence
+     * provider reports it instead of this callback failing with a bare NullPointerException.
+     */
+    @PrePersist
+    protected void onCreate() {
+        if (id != null && id.getDate() == null) {
+            id.setDate(LocalDateTime.now());
+        }
+    }
+
 }
\ No newline at end of file
diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/persistence/entity/PriceHistoryId.java b/parsing-service/src/main/java/ru/pricepulse/parsingservice/persistence/entity/PriceHistoryId.java
index 1515fbb..33f7a50 100644
--- a/parsing-service/src/main/java/ru/pricepulse/parsingservice/persistence/entity/PriceHistoryId.java
+++ b/parsing-service/src/main/java/ru/pricepulse/parsingservice/persistence/entity/PriceHistoryId.java
@@ -5,11 +5,10 @@ import lombok.AllArgsConstructor;
 import
lombok.Getter;
 import lombok.NoArgsConstructor;
 import lombok.Setter;
-import org.hibernate.annotations.OnDelete;
-import org.hibernate.annotations.OnDeleteAction;
 import org.hibernate.proxy.HibernateProxy;
 
 import java.io.Serializable;
+import java.time.LocalDateTime;
 import java.time.OffsetDateTime;
 import java.util.Objects;
 
@@ -20,13 +19,13 @@ import java.util.Objects;
 @Embeddable
 public class PriceHistoryId implements Serializable {
 
-    @ManyToOne(fetch = FetchType.LAZY, optional = false)
-    @OnDelete(action = OnDeleteAction.CASCADE)
-    @JoinColumn(name = "product_id", nullable = false)
-    private ProductEntity product;
+    // Not unique on its own: the key is (product_url, date), one row per product per timestamp.
+    @Column(name = "product_url", nullable = false)
+    private String productUrl;
 
+    // NOTE(review): the changelog declares this column as timestamptz while the field is zone-less; confirm the intended zone.
     @Column(name = "date", nullable = false)
-    private OffsetDateTime date;
+    private LocalDateTime date;
 
     @Override
     public final boolean equals(Object o) {
diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/persistence/repository/ProductPriceRepository.java b/parsing-service/src/main/java/ru/pricepulse/parsingservice/persistence/repository/ProductPriceRepository.java
index b456fe0..8195db3 100644
--- a/parsing-service/src/main/java/ru/pricepulse/parsingservice/persistence/repository/ProductPriceRepository.java
+++ b/parsing-service/src/main/java/ru/pricepulse/parsingservice/persistence/repository/ProductPriceRepository.java
@@ -2,6 +2,7 @@ package ru.pricepulse.parsingservice.persistence.repository;
 
 import org.springframework.data.jpa.repository.JpaRepository;
 import ru.pricepulse.parsingservice.persistence.entity.PriceHistoryEntity;
+import ru.pricepulse.parsingservice.persistence.entity.PriceHistoryId;
 
-public interface ProductPriceRepository extends JpaRepository {
+public interface ProductPriceRepository extends JpaRepository<PriceHistoryEntity, PriceHistoryId> {
 }
diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/persistence/repository/ProductRepository.java
b/parsing-service/src/main/java/ru/pricepulse/parsingservice/persistence/repository/ProductRepository.java
index b58f16f..3532dea 100644
--- a/parsing-service/src/main/java/ru/pricepulse/parsingservice/persistence/repository/ProductRepository.java
+++ b/parsing-service/src/main/java/ru/pricepulse/parsingservice/persistence/repository/ProductRepository.java
@@ -4,4 +4,7 @@ import org.springframework.data.jpa.repository.JpaRepository;
 import ru.pricepulse.parsingservice.persistence.entity.ProductEntity;
 
 public interface ProductRepository extends JpaRepository {
+
+    boolean existsByUrl(String url);
+
 }
diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/service/ProductService.java b/parsing-service/src/main/java/ru/pricepulse/parsingservice/service/ProductService.java
new file mode 100644
index 0000000..1269def
--- /dev/null
+++ b/parsing-service/src/main/java/ru/pricepulse/parsingservice/service/ProductService.java
@@ -0,0 +1,92 @@
+package ru.pricepulse.parsingservice.service;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Transactional;
+import ru.pricepulse.parsingservice.ozon_parser.service.dto.ParsedData;
+import ru.pricepulse.parsingservice.persistence.entity.PriceHistoryEntity;
+import ru.pricepulse.parsingservice.persistence.entity.PriceHistoryId;
+import ru.pricepulse.parsingservice.persistence.entity.ProductEntity;
+import ru.pricepulse.parsingservice.persistence.repository.ProductPriceRepository;
+import ru.pricepulse.parsingservice.persistence.repository.ProductRepository;
+
+/**
+ * Persists parsed marketplace items: new products and one price-history row per item.
+ */
+@Slf4j
+@Service
+@RequiredArgsConstructor
+public class ProductService {
+
+    private final ProductRepository productRepository;
+
+    private final ProductPriceRepository productPriceRepository;
+
+    /**
+     * Stores one batch of parsed items in a single transaction.
+     *
+     * <p>Every item yields a price-history row; a product row is created only for URLs that are
+     * neither stored already nor queued earlier in the same batch.
+     *
+     * @param parsedData parsed items; a {@code null} or empty batch is a no-op
+     */
+    @Transactional
+    public void saveBatch(List<ParsedData> parsedData) {
+        if (parsedData == null || parsedData.isEmpty()) {
+            return;
+        }
+        List<ProductEntity> products = new ArrayList<>();
+        List<PriceHistoryEntity> prices = new ArrayList<>();
+        // The database check cannot see products queued but not yet saved, so track them here.
+        Set<String> queuedUrls = new HashSet<>();
+        for (ParsedData product : parsedData) {
+            processParsedProduct(product, prices, products, queuedUrls);
+        }
+        productRepository.saveAll(products);
+        log.info("Сохранили пачку товаров {}", products.size());
+        productPriceRepository.saveAll(prices);
+        log.info("Сохранили историю цен {}", prices.size());
+    }
+
+    /** Queues the price row for {@code product} and, if its URL is new, the product row. */
+    private void processParsedProduct(ParsedData product,
+                                      List<PriceHistoryEntity> prices,
+                                      List<ProductEntity> products,
+                                      Set<String> queuedUrls) {
+        prices.add(getPriceHistory(product));
+        String url = product.getUrl();
+        if (!queuedUrls.add(url) || productRepository.existsByUrl(url)) {
+            log.debug("Запись {} уже есть", url);
+            return;
+        }
+        products.add(getProduct(product));
+    }
+
+    /** Builds the price row; the date part of the key is left unset here. */
+    private PriceHistoryEntity getPriceHistory(ParsedData product) {
+        var priceHistoryId = new PriceHistoryId();
+        priceHistoryId.setProductUrl(product.getUrl());
+        var priceHistory = new PriceHistoryEntity();
+        priceHistory.setId(priceHistoryId);
+        priceHistory.setPrice(product.getPrice());
+        return priceHistory;
+    }
+
+    /** Copies the descriptive fields of a parsed item into a new product entity. */
+    private ProductEntity getProduct(ParsedData product) {
+        var productEntity = new ProductEntity();
+        productEntity.setCategory(product.getCategory());
+        productEntity.setBrand(product.getBrand());
+        productEntity.setProductName(product.getProductName());
+        productEntity.setUrl(product.getUrl());
+        productEntity.setMarketplace(product.getMarketplace());
+        productEntity.setImageUrl(product.getImageUrl());
+        return productEntity;
+    }
+}
diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/wildberries_parser/DebugRunner.java b/parsing-service/src/main/java/ru/pricepulse/parsingservice/wildberries_parser/DebugRunner.java
index 976288f..50e7fcc 100644
--- a/parsing-service/src/main/java/ru/pricepulse/parsingservice/wildberries_parser/DebugRunner.java
+++ b/parsing-service/src/main/java/ru/pricepulse/parsingservice/wildberries_parser/DebugRunner.java
@@ -3,17 +3,17 @@ package ru.pricepulse.parsingservice.wildberries_parser;
 import
lombok.AllArgsConstructor; import org.springframework.boot.CommandLineRunner; import org.springframework.stereotype.Component; -import ru.pricepulse.parsingservice.wildberries_parser.service.ParsingService; +import ru.pricepulse.parsingservice.wildberries_parser.service.WildberriesParsingService; @Component @AllArgsConstructor public class DebugRunner implements CommandLineRunner { - private final ParsingService parsingService; + private final WildberriesParsingService parsingService; @Override public void run(String... args){ - System.out.println("Начинаем отладку..."); + /*System.out.println("Начинаем отладку..."); parsingService.parse(); - System.out.println("Заканчиваем отладку..."); + System.out.println("Заканчиваем отладку...");*/ } } diff --git a/parsing-service/src/main/java/ru/pricepulse/parsingservice/wildberries_parser/service/ParsingService.java b/parsing-service/src/main/java/ru/pricepulse/parsingservice/wildberries_parser/service/WildberriesParsingService.java similarity index 95% rename from parsing-service/src/main/java/ru/pricepulse/parsingservice/wildberries_parser/service/ParsingService.java rename to parsing-service/src/main/java/ru/pricepulse/parsingservice/wildberries_parser/service/WildberriesParsingService.java index 816d97f..1e17eac 100644 --- a/parsing-service/src/main/java/ru/pricepulse/parsingservice/wildberries_parser/service/ParsingService.java +++ b/parsing-service/src/main/java/ru/pricepulse/parsingservice/wildberries_parser/service/WildberriesParsingService.java @@ -16,14 +16,14 @@ import ru.pricepulse.parsingservice.wildberries_parser.service.client.Client; import ru.pricepulse.parsingservice.wildberries_parser.service.dto.ProductInfoDto; import java.math.BigDecimal; -import java.time.OffsetDateTime; +import java.time.LocalDateTime; import java.util.ArrayList; import java.util.List; import java.util.Map; @Service @AllArgsConstructor -public class ParsingService { +public class WildberriesParsingService { private final Client 
client; private final ObjectMapper objectMapper; private final ConversionService conversionService; @@ -56,7 +56,7 @@ public class ParsingService { ProductEntity productEntity = conversionService.convert(dto, ProductEntity.class); PriceHistoryEntity priceHistory = PriceHistoryEntity.builder() - .id(new PriceHistoryId(productEntity, OffsetDateTime.now())) + .id(new PriceHistoryId(productEntity.getUrl(), LocalDateTime.now())) .price(BigDecimal.valueOf(dto.getSalePriceU() / 100.0)) .build(); diff --git a/parsing-service/src/main/resources/application.yml b/parsing-service/src/main/resources/application.yml index f5d8e78..3b8b123 100644 --- a/parsing-service/src/main/resources/application.yml +++ b/parsing-service/src/main/resources/application.yml @@ -9,18 +9,21 @@ spring: database: postgresql datasource: driver-class-name: org.postgresql.Driver - url: jdbc:postgresql://${JDBC_URL} - username: ${JDBC_USERNAME} - password: ${JDBC_PASSWORD} + url: jdbc:postgresql://${POSTGRES_JDBC_URL} + username: ${POSTGRES_JDBC_USERNAME} + password: ${POSTGRES_JDBC_PASSWORD} + clickhouse: + driver-class-name: com.clickhouse.jdbc.ClickHouseDriver + url: jdbc:clickhouse://${CLICKHOUSE_JDBC_URL} + username: ${CLICKHOUSE_JDBC_USERNAME} + password: ${CLICKHOUSE_JDBC_PASSWORD} liquibase: change-log: classpath:/db/changelog/master.yml - kafka: -selenium: marketplace: ozon: categories-urls: - - https://www.ozon.ru/category/noutbuki-15692 + - https://www.ozon.ru/category/noutbuki-15692/?brandcertified=t&is_high_rating=t wildberries: base-url: "https://static-basket-01.wbbasket.ru" catalog-url: "/vol0/data/main-menu-ru-ru-v3.json" diff --git a/parsing-service/src/main/resources/db/changelog/20241006/20241006_001_add_columns_in_tables.xml b/parsing-service/src/main/resources/db/changelog/20241006/20241006_001_add_columns_in_tables.xml index cdfba04..4300523 100644 --- a/parsing-service/src/main/resources/db/changelog/20241006/20241006_001_add_columns_in_tables.xml +++ 
b/parsing-service/src/main/resources/db/changelog/20241006/20241006_001_add_columns_in_tables.xml @@ -7,7 +7,7 @@ - + @@ -15,25 +15,14 @@ - - - - - CREATE TABLE if not exists price_history( - product_id bigint NOT NULL, + product_url varchar NOT NULL, price numeric(10, 2) NOT NULL, date timestamptz NOT NULL, - PRIMARY KEY (product_id, date) + PRIMARY KEY (product_url, date) ) PARTITION BY RANGE (date); - \ No newline at end of file