From 87246af738958f2d1fafd01e72e654ecc654923f Mon Sep 17 00:00:00 2001
From: maksim
Date: Tue, 28 May 2024 00:50:44 +0400
Subject: [PATCH] Pretty little folders. Too bad I don't have a dad :(
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../data_ga.json                                   |   0
 genetic_algorithm/genetic_algorithm.py             | 127 +++++++++++++++
 graph_show.py                                      |  29 ----
 model.py                                           |   6 +-
 neural_network/create_lstm/class/.gitkeep          |   0
 .../create_lstm/create_model_lstm_negative.py      | 149 ++++++++++++++++++
 .../create_lstm/create_model_lstm_positive.py      | 148 +++++++++++++++++
 neural_network/create_lstm/model/.gitkeep          |   0
 neural_network/create_lstm/tokenizer/.gitkeep      |   0
 neural_network/dataset/class/.gitkeep              |   0
 neural_network/dataset/conversion.py               |  54 +++++++
 neural_network/dataset/download_dataset.py         |  18 +++
 neural_network/dataset/filtered/.gitkeep           |   0
 13 files changed, 499 insertions(+), 32 deletions(-)
 rename molirabotai.json => genetic_algorithm/data_ga.json (100%)
 create mode 100644 genetic_algorithm/genetic_algorithm.py
 delete mode 100644 graph_show.py
 create mode 100644 neural_network/create_lstm/class/.gitkeep
 create mode 100644 neural_network/create_lstm/create_model_lstm_negative.py
 create mode 100644 neural_network/create_lstm/create_model_lstm_positive.py
 create mode 100644 neural_network/create_lstm/model/.gitkeep
 create mode 100644 neural_network/create_lstm/tokenizer/.gitkeep
 create mode 100644 neural_network/dataset/class/.gitkeep
 create mode 100644 neural_network/dataset/conversion.py
 create mode 100644 neural_network/dataset/download_dataset.py
 create mode 100644 neural_network/dataset/filtered/.gitkeep

diff --git a/molirabotai.json b/genetic_algorithm/data_ga.json
similarity index 100%
rename from molirabotai.json
rename to genetic_algorithm/data_ga.json
diff --git a/genetic_algorithm/genetic_algorithm.py b/genetic_algorithm/genetic_algorithm.py
new file mode 100644
index 0000000..0e20f6a
--- /dev/null
+++ b/genetic_algorithm/genetic_algorithm.py
@@ -0,0 +1,127 @@
+import json
+import random
+from datetime import datetime, timedelta
+
+
+# Load flight data from a JSON file and turn it into a graph:
+# graph[departure][destination] = (distance, departure_time, arrival_time)
+def load_graph_from_json(file_path):
+    with open(file_path, 'r', encoding='utf-8') as file:
+        data = json.load(file)
+
+    graph = {}
+    start_point = data['from']
+    end_point = data['to']
+
+    for flight in data['flights']:
+        departure_point = flight['departurePoint']
+        destination_point = flight['destinationPoint']
+        distance = flight['distance']
+        departure_time = datetime.fromisoformat(flight['departureTime'])
+        destination_time = datetime.fromisoformat(flight['destinationTime'])
+
+        if departure_point not in graph:
+            graph[departure_point] = {}
+
+        graph[departure_point][destination_point] = (distance, departure_time, destination_time)
+
+    return graph, start_point, end_point
+
+
+# Compute the length and the start/end times of a path, enforcing a
+# minimum transfer interval of 5 minutes between consecutive flights.
+def path_length_and_time(path, graph):
+    # Crossover can produce a path whose first edge does not exist,
+    # so validate it before reading its departure time.
+    if len(path) < 2 or path[0] not in graph or path[1] not in graph[path[0]]:
+        return float('inf'), None, None
+    length = 0
+    start_time = graph[path[0]][path[1]][1]
+    end_time = start_time
+    for i in range(len(path) - 1):
+        if path[i] not in graph or path[i + 1] not in graph[path[i]]:
+            return float('inf'), start_time, end_time
+        length += graph[path[i]][path[i + 1]][0]
+        if i > 0:  # the first flight needs no transfer before it
+            if end_time + timedelta(minutes=5) > graph[path[i]][path[i + 1]][1]:
+                return float('inf'), start_time, end_time  # infeasible connection
+        end_time = graph[path[i]][path[i + 1]][2]
+    return length, start_time, end_time
+
+
+# Generate the initial population of random loop-free paths.
+def generate_population(size, start, end, graph):
+    population = []
+    for _ in range(size):
+        path = [start]
+        while path[-1] != end:
+            # Choosing only among unvisited neighbours also prevents the
+            # endless loop a bare random.choice over all neighbours can enter.
+            candidates = [node for node in graph.get(path[-1], {}) if node not in path]
+            if not candidates:  # dead end
+                break
+            path.append(random.choice(candidates))
+        if path[-1] == end:
+            population.append(path)
+    return population
+
+
+# Select the better (shorter) half of the population as parents.
+def select_parents(population, graph):
+    sorted_population = sorted(population, key=lambda p: path_length_and_time(p, graph)[0])
+    return sorted_population[:len(sorted_population) // 2]
+
+
+# Crossover: keep the first half of one parent, then append the
+# remaining vertices in the order they appear in the other parent.
+def crossover(parent1, parent2):
+    crossover_point = len(parent1) // 2
+    child = parent1[:crossover_point]
+    for gene in parent2:
+        if gene not in child:
+            child.append(gene)
+    return child
+
+
+# Mutation: swap two random intermediate vertices of the path.
+def mutate(child):
+    if len(child) <= 2:
+        return child  # no intermediate vertices to swap
+    index1 = random.randint(1, len(child) - 2)
+    index2 = random.randint(1, len(child) - 2)
+    child[index1], child[index2] = child[index2], child[index1]
+    return child
+
+
+# Main loop of the genetic algorithm.
+def genetic_algorithm(start, end, graph, population_size=100, generations=100):
+    population = generate_population(population_size, start, end, graph)
+    if not population:
+        raise ValueError('no path from start to end was found')
+    for _ in range(generations):
+        parents = select_parents(population, graph) or population  # tiny populations have no "better half"
+        new_population = parents[:]
+        while len(new_population) < population_size:
+            parent1 = random.choice(parents)
+            parent2 = random.choice(parents)
+            child = crossover(parent1, parent2)
+            if random.random() < 0.1:  # mutation probability
+                child = mutate(child)
+            new_population.append(child)
+        population = new_population
+    # Only paths that actually terminate at `end` are acceptable answers.
+    valid_paths = [p for p in population if p[-1] == end]
+    best_path = min(valid_paths or population, key=lambda p: path_length_and_time(p, graph)[0])
+    best_length, start_time, end_time = path_length_and_time(best_path, graph)
+    return best_path, best_length, start_time, end_time
+
+
+# Example usage
+file_path = './data_ga.json'
+graph, start_point, end_point = load_graph_from_json(file_path)
+
+best_path, length, start_time, end_time = genetic_algorithm(start_point, end_point, graph)
+print("Shortest path from", start_point, "to", end_point, ":", best_path)
+print("Path length:", length)
+print("Departure time:", start_time.strftime("%d %B %Y %H:%M"))
+print("Arrival time:", end_time.strftime("%d %B %Y %H:%M"))
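The structure of data_ga.json is implied by the field accesses in load_graph_from_json above. The following smoke test uses that inferred schema with invented flights (field names come from the code; the cities, times and distances are made up) and assumes the functions above are in scope, e.g. run in the same session:

    import json

    sample = {
        "from": "A",
        "to": "C",
        "flights": [
            {"departurePoint": "A", "destinationPoint": "B", "distance": 100,
             "departureTime": "2024-05-28T08:00:00", "destinationTime": "2024-05-28T09:00:00"},
            {"departurePoint": "B", "destinationPoint": "C", "distance": 150,
             "departureTime": "2024-05-28T09:30:00", "destinationTime": "2024-05-28T11:00:00"},
        ],
    }
    with open('toy_data_ga.json', 'w', encoding='utf-8') as f:
        json.dump(sample, f)

    graph, start, end = load_graph_from_json('toy_data_ga.json')
    best_path, length, t0, t1 = genetic_algorithm(start, end, graph)
    print(best_path, length)
    # expected: ['A', 'B', 'C'] 250 — the 09:00 arrival plus the 5-minute
    # transfer window still makes the 09:30 departure from B feasible.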
diff --git a/graph_show.py b/graph_show.py
deleted file mode 100644
index 68723d3..0000000
--- a/graph_show.py
+++ /dev/null
@@ -1,29 +0,0 @@
-from matplotlib.lines import Line2D
-
-vertex = ((0, 1), (1, 1), (0.5, 0.8), (0.1, 0.5), (0.8, 0.2), (0.4, 0))
-
-vx = [v[0] for v in vertex]
-vy = [v[1] for v in vertex]
-
-def show_graph(ax, best):
-    ax.add_line(Line2D((vertex[0][0], vertex[1][0]), (vertex[0][1], vertex[1][1]), color='#aaa'))
-    ax.add_line(Line2D((vertex[0][0], vertex[2][0]), (vertex[0][1], vertex[2][1]), color='#aaa'))
-    ax.add_line(Line2D((vertex[0][0], vertex[3][0]), (vertex[0][1], vertex[3][1]), color='#aaa'))
-    ax.add_line(Line2D((vertex[1][0], vertex[2][0]), (vertex[1][1], vertex[2][1]), color='#aaa'))
-    ax.add_line(Line2D((vertex[2][0], vertex[5][0]), (vertex[2][1], vertex[5][1]), color='#aaa'))
-    ax.add_line(Line2D((vertex[2][0], vertex[4][0]), (vertex[2][1], vertex[4][1]), color='#aaa'))
-    ax.add_line(Line2D((vertex[3][0], vertex[5][0]), (vertex[3][1], vertex[5][1]), color='#aaa'))
-    ax.add_line(Line2D((vertex[4][0], vertex[5][0]), (vertex[4][1], vertex[5][1]), color='#aaa'))
-
-    startV = 0
-    for i, v in enumerate(best):
-        if i == 0:
-            continue
-
-        prev = startV
-        v = v[:v.index(i)+1]
-        for j in v:
-            ax.add_line(Line2D((vertex[prev][0], vertex[j][0]), (vertex[prev][1], vertex[j][1]), color='r'))
-            prev = j
-
-    ax.plot(vx, vy, ' ob', markersize=15)
diff --git a/model.py b/model.py
index 38a470f..1f3cb2f 100644
--- a/model.py
+++ b/model.py
@@ -5,14 +5,14 @@ from keras.src.legacy.preprocessing.text import Tokenizer
 from keras.src.utils import pad_sequences
 
 # Load the model
-model = tf.keras.models.load_model('best_model_lstm_negative.keras')
+model = tf.keras.models.load_model('./neural_network/create_lstm/model/best_model_lstm_negative.keras')
 
 # Load the tokenizer
-with open('tokenizer_lstm_lstm_negative.pickle', 'rb') as handle:
+with open('./neural_network/create_lstm/tokenizer/tokenizer_lstm_lstm_negative.pickle', 'rb') as handle:
     tokenizer = pickle.load(handle)
 
 # Load the class names
-with open('class_names_lstm_negative.txt', 'r', encoding='utf-8') as file:
+with open('./neural_network/create_lstm/class/class_names_lstm_negative.txt', 'r', encoding='utf-8') as file:
     class_names = [line.strip() for line in file.readlines()]
 
 def preprocess_text(text: str):
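The new paths in model.py are resolved against the process working directory, so the script only works when launched from the repository root. A sketch of a more robust variant, anchoring the paths to the location of model.py itself (assuming model.py stays at the repository root):

    from pathlib import Path

    BASE_DIR = Path(__file__).resolve().parent
    model = tf.keras.models.load_model(
        str(BASE_DIR / 'neural_network/create_lstm/model/best_model_lstm_negative.keras'))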
diff --git a/neural_network/create_lstm/class/.gitkeep b/neural_network/create_lstm/class/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/neural_network/create_lstm/create_model_lstm_negative.py b/neural_network/create_lstm/create_model_lstm_negative.py
new file mode 100644
index 0000000..73fada1
--- /dev/null
+++ b/neural_network/create_lstm/create_model_lstm_negative.py
@@ -0,0 +1,149 @@
+import pickle
+
+import pandas as pd
+from keras import Sequential
+from keras.src.callbacks import ModelCheckpoint
+from keras.src.legacy.preprocessing.text import Tokenizer
+from keras.src.saving import load_model
+from keras.src.utils import pad_sequences
+from matplotlib import pyplot as plt
+from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM, SpatialDropout1D
+from sklearn.preprocessing import LabelEncoder
+from tensorflow.keras import utils
+from nltk import word_tokenize, SnowballStemmer
+from nltk.corpus import stopwords
+import string
+
+from pymystem3 import Mystem
+
+# Maximum vocabulary size
+num_words = 10000
+# Maximum review length (in tokens)
+max_reviews_len = 90
+# Number of review classes
+nb_classes = 10
+
+def remove_stopwords_and_punctuation(reviews):
+    stop_words = set(stopwords.words('russian'))
+    punctuation = set(string.punctuation)
+    filtered_tokens = []
+
+    # Remove stop words, punctuation and markup leftovers
+    for review in reviews:
+        words = word_tokenize(review)
+        filtered_words = [word for word in words if
+                          word.lower() not in stop_words and word not in ("\\n1", "''", '«', '»') and word not in punctuation]
+        filtered_tokens.extend(filtered_words)
+
+    return filtered_tokens
+
+def lemmatize_token(filtered_tokens):
+    mystem = Mystem()
+    lemmatized_tokens = []
+    for token in filtered_tokens:
+        lemmatized = mystem.lemmatize(token)[0]
+        lemmatized_tokens.append(lemmatized)
+        print(lemmatized)
+    return lemmatized_tokens
+
+def stemmer_token(filtered_tokens):
+    stemmer = SnowballStemmer("russian")
+    stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
+    return stemmer_tokens
+
+# Data
+train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
+train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
+reviews = train['text']
+print("Dataset ready")
+
+filtered_tokens = remove_stopwords_and_punctuation(reviews)
+print(filtered_tokens[:10])
+
+# lemmatized_tokens = lemmatize_token(filtered_tokens)
+# print(lemmatized_tokens[:10])
+
+# stemmer_tokens = stemmer_token(filtered_tokens)
+# print(stemmer_tokens[:10])
+
+label_encoder = LabelEncoder()
+train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
+
+# Class names from the label encoder (written to a file below)
+class_names = label_encoder.classes_
+
+y_train = utils.to_categorical(train['rubrics'], nb_classes)
+
+tokenizer = Tokenizer(num_words=num_words)
+tokenizer.fit_on_texts(filtered_tokens)
+
+sequences = tokenizer.texts_to_sequences(reviews)
+
+x_train = pad_sequences(sequences, maxlen=max_reviews_len)
+
+print('building the model')
+
+# Model definition
+model_lstm = Sequential()
+model_lstm.add(Embedding(num_words, 128, input_length=max_reviews_len))
+model_lstm.add(SpatialDropout1D(0.2))
+model_lstm.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
+model_lstm.add(Dense(nb_classes, activation='softmax'))
+
+model_lstm.compile(optimizer='adam',
+                   loss='categorical_crossentropy',
+                   metrics=['accuracy'])
+
+model_lstm.summary()
+
+# ModelCheckpoint callback: keep the best model by validation accuracy
+model_lstm_save_path = './model/best_model_lstm_negative.keras'
+checkpoint_callback_lstm = ModelCheckpoint(model_lstm_save_path,
+                                           monitor='val_accuracy',
+                                           save_best_only=True,
+                                           verbose=1)
+
+# Training
+history_lstm = model_lstm.fit(x_train,
+                              y_train,
+                              epochs=10,
+                              batch_size=128,
+                              validation_split=0.1,
+                              callbacks=[checkpoint_callback_lstm])
+
+# Plots
+plt.plot(history_lstm.history['accuracy'],
+         label='Accuracy on the training set')
+plt.plot(history_lstm.history['val_accuracy'],
+         label='Accuracy on the validation set')
+plt.xlabel('Training epoch')
+plt.ylabel('Accuracy')
+plt.legend()
+plt.show()
+
+# Load the best saved model
+model_lstm = load_model('./model/best_model_lstm_negative.keras')
+
+# Sample user review (Russian, as the model is trained on Russian text)
+user_review = "Не люблю пьяных людей, когда они рядом, всегда будет что то плохое"
+
+# Prepare the review: filter it, then feed it as a single text
+filtered_tokens = remove_stopwords_and_punctuation([user_review])
+sequences = tokenizer.texts_to_sequences([' '.join(filtered_tokens)])
+x_user = pad_sequences(sequences, maxlen=max_reviews_len)
+
+# Class-membership probabilities for the user's review
+predicted_probabilities = model_lstm.predict(x_user)
+
+# Print the probabilities together with the class names
+for class_name, prob in zip(class_names, predicted_probabilities[0]):
+    print(f"Probability that the review belongs to class '{class_name}': {prob}")
+
+# Save the class names to a text file
+with open('./class/class_names_lstm_negative.txt', 'w', encoding='utf-8') as file:
+    for class_name in class_names:
+        file.write(f"{class_name}\n")
+
+# save the tokenizer
+with open('./tokenizer/tokenizer_lstm_lstm_negative.pickle', 'wb') as handle:
+    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
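word_tokenize and stopwords.words('russian') rely on NLTK corpora that are not installed with the package itself, so on a fresh machine the script fails until they are fetched. A one-time setup sketch:

    import nltk

    nltk.download('punkt')      # tokenizer models used by word_tokenize
    nltk.download('stopwords')  # includes the Russian stop-word list
    # recent NLTK releases may additionally need: nltk.download('punkt_tab')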
diff --git a/neural_network/create_lstm/create_model_lstm_positive.py b/neural_network/create_lstm/create_model_lstm_positive.py
new file mode 100644
index 0000000..27049fc
--- /dev/null
+++ b/neural_network/create_lstm/create_model_lstm_positive.py
@@ -0,0 +1,148 @@
+import pandas as pd
+from keras import Sequential
+from keras.src.callbacks import ModelCheckpoint
+from keras.src.legacy.preprocessing.text import Tokenizer
+from keras.src.saving import load_model
+from keras.src.utils import pad_sequences
+from matplotlib import pyplot as plt
+from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM, SpatialDropout1D
+from sklearn.preprocessing import LabelEncoder
+from tensorflow.keras import utils
+from nltk import word_tokenize, SnowballStemmer
+from nltk.corpus import stopwords
+import string
+import pickle
+
+from pymystem3 import Mystem
+
+# Maximum vocabulary size
+num_words = 10000
+# Maximum review length (in tokens)
+max_reviews_len = 90
+# Number of review classes
+nb_classes = 10
+
+def remove_stopwords_and_punctuation(reviews):
+    stop_words = set(stopwords.words('russian'))
+    punctuation = set(string.punctuation)
+    filtered_tokens = []
+
+    # Remove stop words, punctuation and markup leftovers
+    for review in reviews:
+        words = word_tokenize(review)
+        filtered_words = [word for word in words if
+                          word.lower() not in stop_words and word not in ("\\n1", "''", '«', '»') and word not in punctuation]
+        filtered_tokens.extend(filtered_words)
+
+    return filtered_tokens
+
+def lemmatize_token(filtered_tokens):
+    mystem = Mystem()
+    lemmatized_tokens = []
+    for token in filtered_tokens:
+        lemmatized = mystem.lemmatize(token)[0]
+        lemmatized_tokens.append(lemmatized)
+        print(lemmatized)
+    return lemmatized_tokens
+
+def stemmer_token(filtered_tokens):
+    stemmer = SnowballStemmer("russian")
+    stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
+    return stemmer_tokens
+
+# Data
+train = pd.read_csv('../dataset/filtered/filtered_dataset_positive.csv')
+train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
+reviews = train['text']
+print("Dataset ready")
+
+filtered_tokens = remove_stopwords_and_punctuation(reviews)
+print(filtered_tokens[:10])
+
+# lemmatized_tokens = lemmatize_token(filtered_tokens)
+# print(lemmatized_tokens[:10])
+
+# stemmer_tokens = stemmer_token(filtered_tokens)
+# print(stemmer_tokens[:10])
+
+label_encoder = LabelEncoder()
+train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
+
+# Class names from the label encoder (written to a file below)
+class_names = label_encoder.classes_
+
+y_train = utils.to_categorical(train['rubrics'], nb_classes)
+
+tokenizer = Tokenizer(num_words=num_words)
+tokenizer.fit_on_texts(filtered_tokens)
+
+sequences = tokenizer.texts_to_sequences(reviews)
+
+x_train = pad_sequences(sequences, maxlen=max_reviews_len)
+
+print('building the model')
+
+# Model definition
+model_lstm = Sequential()
+model_lstm.add(Embedding(num_words, 256, input_length=max_reviews_len))
+model_lstm.add(SpatialDropout1D(0.2))
+model_lstm.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
+model_lstm.add(Dense(nb_classes, activation='softmax'))
+
+model_lstm.compile(optimizer='adam',
+                   loss='categorical_crossentropy',
+                   metrics=['accuracy'])
+
+model_lstm.summary()
+
+# ModelCheckpoint callback: keep the best model by validation accuracy
+model_lstm_save_path = './model/best_model_lstm_positive.keras'
+checkpoint_callback_lstm = ModelCheckpoint(model_lstm_save_path,
+                                           monitor='val_accuracy',
+                                           save_best_only=True,
+                                           verbose=1)
+
+# Training
+history_lstm = model_lstm.fit(x_train,
+                              y_train,
+                              epochs=5,
+                              batch_size=128,
+                              validation_split=0.1,
+                              callbacks=[checkpoint_callback_lstm])
+
+# Plots
+plt.plot(history_lstm.history['accuracy'],
+         label='Accuracy on the training set')
+plt.plot(history_lstm.history['val_accuracy'],
+         label='Accuracy on the validation set')
+plt.xlabel('Training epoch')
+plt.ylabel('Accuracy')
+plt.legend()
+plt.show()
+
+# Load the best saved model
+model_lstm = load_model('./model/best_model_lstm_positive.keras')
+
+# Sample user review (Russian, as the model is trained on Russian text)
+user_review = "Ой я так люблю вкусно кушать! Я бы хотел сьесть все самое вкусное в этом мире! Пончики, пиццу, вообще все все все!"
+
+# Prepare the review: filter it, then feed it as a single text
+filtered_tokens = remove_stopwords_and_punctuation([user_review])
+sequences = tokenizer.texts_to_sequences([' '.join(filtered_tokens)])
+x_user = pad_sequences(sequences, maxlen=max_reviews_len)
+
+# Class-membership probabilities for the user's review
+predicted_probabilities = model_lstm.predict(x_user)
+
+# Print the probabilities together with the class names
+for class_name, prob in zip(class_names, predicted_probabilities[0]):
+    print(f"Probability that the review belongs to class '{class_name}': {prob}")
+
+# Save the class names to a text file
+with open('./class/class_names_lstm_positive.txt', 'w', encoding='utf-8') as file:
+    for class_name in class_names:
+        file.write(f"{class_name}\n")
+
+# save the tokenizer
+with open('./tokenizer/tokenizer_lstm_positive.pickle', 'wb') as handle:
+    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
\ No newline at end of file
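The positive and negative training scripts are near-duplicates: they differ only in the input CSV, the layer sizes (256/128 units here versus 128/64 in the negative script), the epoch count (5 versus 10) and the output file names. A sketch of a shared model builder both scripts could call — build_lstm is a hypothetical helper, not part of the patch:

    from keras import Sequential
    from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

    def build_lstm(num_words, max_reviews_len, nb_classes, embed_dim, lstm_units):
        # Same topology as both scripts above, parameterised by size.
        model = Sequential()
        model.add(Embedding(num_words, embed_dim, input_length=max_reviews_len))
        model.add(SpatialDropout1D(0.2))
        model.add(LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2))
        model.add(Dense(nb_classes, activation='softmax'))
        model.compile(optimizer='adam', loss='categorical_crossentropy',
                      metrics=['accuracy'])
        return model

    model_negative = build_lstm(10000, 90, 10, 128, 64)   # trained for 10 epochs
    model_positive = build_lstm(10000, 90, 10, 256, 128)  # trained for 5 epochs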
diff --git a/neural_network/create_lstm/model/.gitkeep b/neural_network/create_lstm/model/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/neural_network/create_lstm/tokenizer/.gitkeep b/neural_network/create_lstm/tokenizer/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/neural_network/dataset/class/.gitkeep b/neural_network/dataset/class/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/neural_network/dataset/conversion.py b/neural_network/dataset/conversion.py
new file mode 100644
index 0000000..d9408d0
--- /dev/null
+++ b/neural_network/dataset/conversion.py
@@ -0,0 +1,54 @@
+import pandas as pd
+
+# Load the dataset
+dataset = pd.read_csv('../dataset/geo-reviews-dataset-2023.csv')
+
+# Split into new DataFrames by rating (copies, to avoid chained-assignment warnings)
+filtered_dataset_positive = dataset[dataset['rating'] > 3].copy()
+filtered_dataset_negative = dataset[dataset['rating'] < 3].copy()
+
+# Keep only the first rubric (everything before the first ';') in 'rubrics'
+filtered_dataset_positive.loc[:, 'rubrics'] = filtered_dataset_positive['rubrics'].apply(lambda x: x.split(';')[0] if pd.notnull(x) else x)
+filtered_dataset_negative.loc[:, 'rubrics'] = filtered_dataset_negative['rubrics'].apply(lambda x: x.split(';')[0] if pd.notnull(x) else x)
+
+# Merge related rubrics under a single name
+rename_mapping = {'Пиццерия': 'Быстрое питание', 'Ресторан': 'Быстрое питание', 'Кафе': 'Быстрое питание', 'Магазин продуктов': 'Супермаркет', 'Аптека': 'Медцентр, клиника', 'Стоматологическая клиника': 'Медцентр, клиника'}
+filtered_dataset_positive.loc[:, 'rubrics'] = filtered_dataset_positive['rubrics'].replace(rename_mapping)
+filtered_dataset_negative.loc[:, 'rubrics'] = filtered_dataset_negative['rubrics'].replace(rename_mapping)
+
+# Take the 10 most frequent rubrics from each subset
+unique_rubrics_positive = set(filtered_dataset_positive['rubrics'].value_counts().head(10).index.tolist())
+unique_rubrics_negative = set(filtered_dataset_negative['rubrics'].value_counts().head(10).index.tolist())
+
+# Keep only the rows whose rubric is among the selected ones
+filtered_dataset_positive = filtered_dataset_positive[filtered_dataset_positive['rubrics'].isin(unique_rubrics_positive)]
+filtered_dataset_negative = filtered_dataset_negative[filtered_dataset_negative['rubrics'].isin(unique_rubrics_negative)]
+
+# Cap the number of rows per rubric
+filtered_dataset_negative = (filtered_dataset_negative.groupby('rubrics').head(1500).reset_index(drop=True))
+filtered_dataset_positive = (filtered_dataset_positive.groupby('rubrics').head(15000).reset_index(drop=True))
+
+
+# Print the row count for every rubric in unique_rubrics_positive
+for rubric in unique_rubrics_positive:
+    count = filtered_dataset_positive[filtered_dataset_positive['rubrics'] == rubric].shape[0]
+    print(f"Rows with rubric '{rubric}' in filtered_dataset_positive: {count}")
+
+# Print the row count for every rubric in unique_rubrics_negative
+for rubric in unique_rubrics_negative:
+    count = filtered_dataset_negative[filtered_dataset_negative['rubrics'] == rubric].shape[0]
+    print(f"Rows with rubric '{rubric}' in filtered_dataset_negative: {count}")
+
+# Save the positive class names to class_positive.txt (UTF-8)
+with open('class/class_positive.txt', 'w', encoding='utf-8') as file:
+    for rubric in unique_rubrics_positive:
+        file.write(f"{rubric}\n")
+
+# Save the negative class names to class_negative.txt (UTF-8)
+with open('class/class_negative.txt', 'w', encoding='utf-8') as file:
+    for rubric in unique_rubrics_negative:
+        file.write(f"{rubric}\n")
+
+# Save the filtered DataFrames to new CSV files
+filtered_dataset_positive.to_csv('../dataset/filtered/filtered_dataset_positive.csv', index=False)
+filtered_dataset_negative.to_csv('../dataset/filtered/filtered_dataset_negative.csv', index=False)
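Both training scripts hard-code nb_classes = 10, which is only correct if the head(10) selection above really leaves ten distinct rubrics in each CSV after capping. A sanity-check sketch (run from neural_network/create_lstm, following the scripts' relative-path convention):

    import pandas as pd

    for name in ('positive', 'negative'):
        df = pd.read_csv(f'../dataset/filtered/filtered_dataset_{name}.csv')
        n = df['rubrics'].nunique()
        assert n == 10, f'{name}: expected 10 rubric classes, found {n}'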
diff --git a/neural_network/dataset/download_dataset.py b/neural_network/dataset/download_dataset.py
new file mode 100644
index 0000000..ef22a4b
--- /dev/null
+++ b/neural_network/dataset/download_dataset.py
@@ -0,0 +1,18 @@
+import kaggle
+import zipfile
+import os
+
+# Authentication (reads credentials from ~/.kaggle/kaggle.json)
+kaggle.api.authenticate()
+
+# Download one specific file from the dataset
+dataset = 'kyakovlev/yandex-geo-reviews-dataset-2023'
+file_name = 'geo-reviews-dataset-2023.csv'
+kaggle.api.dataset_download_file(dataset, file_name, path='../dataset')
+
+# Unpack the archive if the file arrived compressed (look in the same directory it was downloaded to)
+zip_path = f'../dataset/{file_name}.zip'
+if os.path.exists(zip_path):
+    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+        zip_ref.extractall('../dataset')
+    os.remove(zip_path)  # remove the zip file after unpacking
\ No newline at end of file
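kaggle.api.authenticate() expects credentials in ~/.kaggle/kaggle.json or in the KAGGLE_USERNAME/KAGGLE_KEY environment variables, and some versions of the kaggle package already raise at import time when they are missing. A pre-flight check along these lines, placed before `import kaggle`, fails with a clearer message:

    import os

    has_file = os.path.exists(os.path.expanduser('~/.kaggle/kaggle.json'))
    has_env = 'KAGGLE_USERNAME' in os.environ and 'KAGGLE_KEY' in os.environ
    if not (has_file or has_env):
        raise SystemExit('Kaggle API credentials not found; '
                         'see https://www.kaggle.com/docs/api')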
diff --git a/neural_network/dataset/filtered/.gitkeep b/neural_network/dataset/filtered/.gitkeep
new file mode 100644
index 0000000..e69de29