class BloomFilter(object):
    """Fixed-size Bloom filter backed by a ``bitarray`` and seeded ``mmh3`` hashes.

    A Bloom filter answers "possibly present" or "definitely absent":
    ``contains`` may return ``True`` for an item that was never added (false
    positive) but never returns ``False`` for an item that was added.
    """

    def __init__(self, size, hash_count):
        """Create an empty filter.

        Args:
            size: Number of bits in the filter; must be positive.
            hash_count: Number of seeded hashes computed per item; must be
                positive.

        Raises:
            ValueError: If ``size`` or ``hash_count`` is not positive.
        """
        # Validate before allocating anything: size == 0 would make every
        # ``% self.size`` raise ZeroDivisionError, and hash_count == 0 would
        # make ``contains`` report every item as present.
        if size <= 0:
            raise ValueError(f"size must be a positive integer, got {size!r}")
        if hash_count <= 0:
            raise ValueError(
                f"hash_count must be a positive integer, got {hash_count!r}"
            )
        self.size = size
        self.hash_count = hash_count
        self.bit_array = bitarray(size)
        self.bit_array.setall(0)

    def _indexes(self, item):
        """Yield the bit position selected by each seed for ``item``."""
        for seed in range(self.hash_count):
            # Python's ``%`` with a positive modulus is always non-negative,
            # so a negative hash value still maps into [0, size).
            yield mmh3.hash(item, seed) % self.size

    def add(self, item):
        """Mark ``item`` as present by setting each of its bit positions."""
        for index in self._indexes(item):
            self.bit_array[index] = 1

    def contains(self, item):
        """Return ``False`` if ``item`` was never added, ``True`` if it may have been."""
        # ``all`` over a generator stops at the first unset bit.
        return all(self.bit_array[index] for index in self._indexes(item))

    # Allow ``item in bloom_filter`` as an alias for ``contains``.
    __contains__ = contains
class SiteSearch:
    """Case-insensitive keyword-to-URL index fronted by a Bloom filter.

    The Bloom filter gives a fast "definitely not indexed" answer; the
    dictionary holds the actual URL lists.
    """

    def __init__(self):
        """Create an empty index with a 100000-bit, 5-hash Bloom filter."""
        self.filter: BloomFilter = BloomFilter(100000, 5)
        # Lower-cased keyword -> URLs registered for it, in insertion order.
        self.keyword_urls: dict[str, list[str]] = {}

    def add(self, url: str, keywords: list[str]) -> None:
        """Register ``url`` under each of ``keywords`` (compared lower-cased)."""
        for keyword in keywords:
            key = keyword.lower()
            self.filter.add(key)
            self.keyword_urls.setdefault(key, []).append(url)

    def find_url(self, keyword: str) -> list[str]:
        """Return the URLs registered for ``keyword``; empty list if none.

        The result is a copy, so callers cannot mutate the index through it.
        """
        key = keyword.lower()
        if not self.filter.contains(key):
            return []
        # The filter can report a false positive for a keyword that was never
        # added; default to an empty list instead of returning None.
        return list(self.keyword_urls.get(key, []))

    def contains(self, keyword: str) -> bool:
        """Return whether the Bloom filter reports ``keyword`` as possibly indexed.

        This is the filter's answer only, so a ``True`` result may be a false
        positive; ``find_url`` returns an empty list in that case.
        """
        return bool(self.filter.contains(keyword.lower()))
@app.route('/findURL', methods=['GET'])
def get_page_findURL():
    """Render the keyword-search page with no results."""
    # The template file is added as templates/findUrl.html; the name must
    # match its case exactly to resolve on case-sensitive filesystems.
    return render_template('findUrl.html')


@app.route('/findURL', methods=['POST'])
def findURL():
    """Look up the submitted keyword and render the matching links.

    Reads the ``word`` form field. When the search engine reports the keyword
    as indexed, the template receives ``word_links``: a list of
    ``(url, word)`` tuples. Otherwise the page is rendered without results.
    """
    word = request.form["word"]
    if not search_engine.contains(word):
        return render_template('findUrl.html')

    # Tuples keep the (url, word) order stable; a set literal would not, and
    # would collapse to one element if both strings were equal.
    # ``or []`` tolerates a lookup that yields None for a Bloom-filter
    # false positive.
    word_links = [(link, word) for link in search_engine.find_url(word) or []]
    return render_template('findUrl.html', word_links=word_links)
+ + + + + +