diff --git a/model.py b/model.py
index d415d6b..551c00b 100644
--- a/model.py
+++ b/model.py
@@ -5,14 +5,14 @@ from keras.src.legacy.preprocessing.text import Tokenizer
 from keras.src.utils import pad_sequences
 
 # Load the model
-model = tf.keras.models.load_model('.//neural_network/create_lstm/model/best_model_lstm_negative.keras')
+model = tf.keras.models.load_model('.//neural_network/models/model/best_model_lstm_negative.keras')
 
 # Load the tokenizer
-with open('.//neural_network/create_lstm/tokenization/tokenizer_lstm_lstm_negative.pickle', 'rb') as handle:
+with open('neural_network/models/tokenization/tokenizer_lstm_lstm_negative.pickle', 'rb') as handle:
     tokenizer = pickle.load(handle)
 
 # Load the class names
-with open('.//neural_network/create_lstm/class/class_names_lstm_negative.txt', 'r', encoding='utf-8') as file:
+with open('neural_network/models/class/class_names_lstm_negative.txt', 'r', encoding='utf-8') as file:
     class_names = [line.strip() for line in file.readlines()]
 
 def preprocess_text(text: str):
diff --git a/neural_network/class/class_negative.py b/neural_network/class/class_negative.py
deleted file mode 100644
index 73c060a..0000000
--- a/neural_network/class/class_negative.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import pandas as pd
-from sklearn.preprocessing import LabelEncoder
-
-# Data
-train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
-
-label_encoder = LabelEncoder()
-train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
-
-# The class names
-class_names = label_encoder.classes_
-
-# Save the class names to a text file
-with open('.//class_names_negative.txt', 'w', encoding='utf-8') as file:
-    for class_name in class_names:
-        file.write(f"{class_name}\n")
diff --git a/neural_network/class/class_positive.py b/neural_network/class/class_positive.py
deleted file mode 100644
index 5332ae6..0000000
--- a/neural_network/class/class_positive.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import pandas as pd
-from sklearn.preprocessing import LabelEncoder
-
-# Data
-train = pd.read_csv('../dataset/filtered/filtered_dataset_positive.csv')
-
-label_encoder = LabelEncoder()
-train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
-
-# The class names
-class_names = label_encoder.classes_
-
-# Save the class names to a text file
-with open('.//class_names_positive.txt', 'w', encoding='utf-8') as file:
-    for class_name in class_names:
-        file.write(f"{class_name}\n")
diff --git a/neural_network/class/class_save.py b/neural_network/class/class_save.py
new file mode 100644
index 0000000..7888583
--- /dev/null
+++ b/neural_network/class/class_save.py
@@ -0,0 +1,61 @@
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder
+
+class DataLoader:
+    def __init__(self, dataset_path):
+        self.dataset_path = dataset_path
+
+    def load_data(self):
+        return pd.read_csv(self.dataset_path)
+
+
+class LabelProcessor:
+    def __init__(self, dataset, label_column):
+        self.dataset = dataset
+        self.label_column = label_column
+        self.label_encoder = LabelEncoder()
+
+    def encode_labels(self):
+        self.dataset[self.label_column] = self.label_encoder.fit_transform(self.dataset[self.label_column])
+        return self.dataset
+
+    def get_class_names(self):
+        return self.label_encoder.classes_
+
+    def save_class_names(self, class_names, output_path):
+        with open(output_path, 'w', encoding='utf-8') as file:
+            for class_name in class_names:
+                file.write(f"{class_name}\n")
+
+
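+# Usage sketch (hypothetical data): LabelEncoder assigns integer codes in
+# sorted order and is reversible, so the encoded labels can be mapped back
+# to rubric names:
+#
+#   frame = pd.DataFrame({'rubrics': ['Кафе', 'Аптека', 'Кафе']})
+#   processor = LabelProcessor(frame, 'rubrics')
+#   processor.encode_labels()                           # rubrics -> [1, 0, 1]
+#   processor.label_encoder.inverse_transform([0, 1])   # ['Аптека', 'Кафе']
+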
+def process_dataset(dataset_path, label_column, output_path):
+    # Load data
+    data_loader = DataLoader(dataset_path)
+    dataset = data_loader.load_data()
+
+    # Process labels
+    label_processor = LabelProcessor(dataset, label_column)
+    dataset = label_processor.encode_labels()
+    class_names = label_processor.get_class_names()
+
+    # Save class names
+    label_processor.save_class_names(class_names, output_path)
+
+    return dataset
+
+
+def main():
+    positive_dataset_path = '../dataset/filtered/filtered_dataset_positive.csv'
+    negative_dataset_path = '../dataset/filtered/filtered_dataset_negative.csv'
+    positive_output_path = './class_names_positive.txt'
+    negative_output_path = './class_names_negative.txt'
+
+    # Process the positive dataset
+    process_dataset(positive_dataset_path, 'rubrics', positive_output_path)
+
+    # Process the negative dataset
+    process_dataset(negative_dataset_path, 'rubrics', negative_output_path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/neural_network/create_cnn/create_model_cnn_negative.py b/neural_network/create_cnn/create_model_cnn_negative.py
deleted file mode 100644
index 1cfea74..0000000
--- a/neural_network/create_cnn/create_model_cnn_negative.py
+++ /dev/null
@@ -1,80 +0,0 @@
-import pickle
-import pandas as pd
-from keras import Sequential
-from keras.src.callbacks import ModelCheckpoint
-from keras.src.utils import pad_sequences
-from matplotlib import pyplot as plt
-from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D
-from tensorflow.keras import utils
-from sklearn.preprocessing import LabelEncoder
-
-# Maximum vocabulary size
-num_words = 10000
-# Maximum review length
-max_reviews_len = 90
-# Number of review classes
-nb_classes = 10
-
-# Load the tokenizer
-with open('..//tokenization/tokenizer_negative.pickle', 'rb') as handle:
-    tokenizer = pickle.load(handle)
-
-# Load the class names
-with open('..//class/class_names_negative.txt', 'r', encoding='utf-8') as file:
-    class_names = [line.strip() for line in file.readlines()]
-
-# Data
-train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
-train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
-reviews = train['text']
-print("Набор данных готов")
-
-# Encode the categories
-label_encoder = LabelEncoder()
-label_encoder.fit(class_names)
-encoded_labels = label_encoder.transform(train['rubrics'])
-y_train = utils.to_categorical(encoded_labels, nb_classes)
-
-sequences = tokenizer.texts_to_sequences(reviews)
-x_train = pad_sequences(sequences, maxlen=max_reviews_len)
-
-print('начинается создание модели')
-
-# Build the model
-model_cnn = Sequential()
-model_cnn.add(Embedding(num_words, 128, input_length=max_reviews_len))
-model_cnn.add(Conv1D(128, 5, activation='relu'))
-model_cnn.add(GlobalMaxPooling1D())
-model_cnn.add(Dense(64, activation='relu'))
-model_cnn.add(Dense(nb_classes, activation='softmax'))
-
-model_cnn.compile(optimizer='adam',
-                  loss='categorical_crossentropy',
-                  metrics=['accuracy'])
-
-model_cnn.summary()
-
-# Define the ModelCheckpoint callback
-model_cnn_save_path = './/model/best_model_cnn_negative.keras'
-checkpoint_callback_cnn = ModelCheckpoint(model_cnn_save_path,
-                                          monitor='val_accuracy',
-                                          save_best_only=True,
-                                          verbose=1)
-
-# Train the model
-history_cnn = model_cnn.fit(x_train,
-                            y_train,
-                            epochs=10,
-                            batch_size=128,
-                            validation_split=0.1,
-                            callbacks=[checkpoint_callback_cnn])
-
-# Plots
-plt.plot(history_cnn.history['accuracy'],
-         label='Доля верных ответов на обучающем наборе')
-plt.plot(history_cnn.history['val_accuracy'],
-         label='Доля верных ответов на проверочном наборе')
-plt.xlabel('Эпоха обучения')
-plt.ylabel('Доля верных ответов')
-plt.legend()
-plt.savefig('.//graphics/history_cnn_negative.png')
diff --git a/neural_network/create_cnn/create_model_cnn_positive.py b/neural_network/create_cnn/create_model_cnn_positive.py
deleted file mode 100644
index feaaf1c..0000000
--- a/neural_network/create_cnn/create_model_cnn_positive.py
+++ /dev/null
@@ -1,81 +0,0 @@
-import pickle
-import pandas as pd
-from keras import Sequential
-from keras.src.callbacks import ModelCheckpoint
-from keras.src.utils import pad_sequences
-from matplotlib import pyplot as plt
-from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D
-from tensorflow.keras import utils
-from sklearn.preprocessing import LabelEncoder
-
-# Maximum vocabulary size
-num_words = 10000
-# Maximum review length
-max_reviews_len = 90
-# Number of review classes
-nb_classes = 10
-
-# Load the tokenizer
-with open('..//tokenization/tokenizer_positive.pickle', 'rb') as handle:
-    tokenizer = pickle.load(handle)
-
-# Load the class names
-with open('..//class/class_names_positive.txt', 'r', encoding='utf-8') as file:
-    class_names = [line.strip() for line in file.readlines()]
-
-# Data
-train = pd.read_csv('../dataset/filtered/filtered_dataset_positive.csv')
-train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
-reviews = train['text']
-print("Набор данных готов")
-
-# Encode the categories
-label_encoder = LabelEncoder()
-label_encoder.fit(class_names)
-encoded_labels = label_encoder.transform(train['rubrics'])
-y_train = utils.to_categorical(encoded_labels, nb_classes)
-
-sequences = tokenizer.texts_to_sequences(reviews)
-x_train = pad_sequences(sequences, maxlen=max_reviews_len)
-
-print('начинается создание модели')
-
-# Build the model
-model_cnn = Sequential()
-model_cnn.add(Embedding(num_words, 256, input_length=max_reviews_len))
-model_cnn.add(Conv1D(256, 5, activation='relu'))
-model_cnn.add(GlobalMaxPooling1D())
-model_cnn.add(Dense(128, activation='relu'))
-model_cnn.add(Dense(nb_classes, activation='softmax'))
-
-
-model_cnn.compile(optimizer='adam',
-                  loss='categorical_crossentropy',
-                  metrics=['accuracy'])
-
-model_cnn.summary()
-
-# Define the ModelCheckpoint callback
-model_cnn_save_path = './/model/best_model_cnn_positive.keras'
-checkpoint_callback_cnn = ModelCheckpoint(model_cnn_save_path,
-                                          monitor='val_accuracy',
-                                          save_best_only=True,
-                                          verbose=1)
-
-# Train the model
-history_cnn = model_cnn.fit(x_train,
-                            y_train,
-                            epochs=5,
-                            batch_size=128,
-                            validation_split=0.1,
-                            callbacks=[checkpoint_callback_cnn])
-
-# Plots
-plt.plot(history_cnn.history['accuracy'],
-         label='Доля верных ответов на обучающем наборе')
-plt.plot(history_cnn.history['val_accuracy'],
-         label='Доля верных ответов на проверочном наборе')
-plt.xlabel('Эпоха обучения')
-plt.ylabel('Доля верных ответов')
-plt.legend()
-plt.savefig('.//graphics/history_cnn_positive.png')
diff --git a/neural_network/create_gru/create_model_gru_negative.py b/neural_network/create_gru/create_model_gru_negative.py
deleted file mode 100644
index 38e1c46..0000000
--- a/neural_network/create_gru/create_model_gru_negative.py
+++ /dev/null
@@ -1,80 +0,0 @@
-import pickle
-import pandas as pd
-from keras import Sequential
-from keras.src.callbacks import ModelCheckpoint
-from keras.src.utils import pad_sequences
-from matplotlib import pyplot as plt
-from tensorflow.keras.layers import Dense, Embedding, Dropout, GRU, Bidirectional
-from tensorflow.keras import utils
-from sklearn.preprocessing import LabelEncoder
-
-# Maximum vocabulary size
-num_words = 10000
-# Maximum review length
-max_reviews_len = 90
-# Number of review classes
-nb_classes = 10
-
-# Load the tokenizer
-with open('..//tokenization/tokenizer_negative.pickle', 'rb') as handle:
-    tokenizer = pickle.load(handle)
-
-# Load the class names
-with open('..//class/class_names_negative.txt', 'r', encoding='utf-8') as file:
-    class_names = [line.strip() for line in file.readlines()]
-
-# Data
-train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
-train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
-reviews = train['text']
-print("Набор данных готов")
-
-# Encode the categories
-label_encoder = LabelEncoder()
-label_encoder.fit(class_names)
-encoded_labels = label_encoder.transform(train['rubrics'])
-y_train = utils.to_categorical(encoded_labels, nb_classes)
-
-sequences = tokenizer.texts_to_sequences(reviews)
-x_train = pad_sequences(sequences, maxlen=max_reviews_len)
-
-print('начинается создание модели')
-
-# Build the model
-model_gru = Sequential()
-model_gru.add(Embedding(num_words, 128, input_length=max_reviews_len))
-model_gru.add(GRU(128, return_sequences=True))
-model_gru.add(Dropout(0.25))
-model_gru.add(GRU(64))
-model_gru.add(Dense(nb_classes, activation='softmax'))
-
-model_gru.compile(optimizer='adam',
-                  loss='categorical_crossentropy',
-                  metrics=['accuracy'])
-
-model_gru.summary()
-
-# Define the ModelCheckpoint callback
-model_lstm_save_path = './/model/best_model_gru_negative.keras'
-checkpoint_callback_gru = ModelCheckpoint(model_lstm_save_path,
-                                          monitor='val_accuracy',
-                                          save_best_only=True,
-                                          verbose=1)
-
-# Train the model
-history_gru = model_gru.fit(x_train,
-                            y_train,
-                            epochs=10,
-                            batch_size=128,
-                            validation_split=0.1,
-                            callbacks=[checkpoint_callback_gru])
-
-# Plots
-plt.plot(history_gru.history['accuracy'],
-         label='Доля верных ответов на обучающем наборе')
-plt.plot(history_gru.history['val_accuracy'],
-         label='Доля верных ответов на проверочном наборе')
-plt.xlabel('Эпоха обучения')
-plt.ylabel('Доля верных ответов')
-plt.legend()
-plt.savefig('.//graphics/history_gru_negative.png')
diff --git a/neural_network/create_gru/create_model_gru_positive.py b/neural_network/create_gru/create_model_gru_positive.py
deleted file mode 100644
index b424eac..0000000
--- a/neural_network/create_gru/create_model_gru_positive.py
+++ /dev/null
@@ -1,80 +0,0 @@
-import pickle
-import pandas as pd
-from keras import Sequential
-from keras.src.callbacks import ModelCheckpoint
-from keras.src.utils import pad_sequences
-from matplotlib import pyplot as plt
-from tensorflow.keras.layers import Dense, Embedding, GRU, Dropout
-from tensorflow.keras import utils
-from sklearn.preprocessing import LabelEncoder
-
-# Maximum vocabulary size
-num_words = 10000
-# Maximum review length
-max_reviews_len = 90
-# Number of review classes
-nb_classes = 10
-
-# Load the tokenizer
-with open('..//tokenization/tokenizer_positive.pickle', 'rb') as handle:
-    tokenizer = pickle.load(handle)
-
-# Load the class names
-with open('..//class/class_names_positive.txt', 'r', encoding='utf-8') as file:
-    class_names = [line.strip() for line in file.readlines()]
-
-# Data
-train = pd.read_csv('../dataset/filtered/filtered_dataset_positive.csv')
-train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
-reviews = train['text']
-print("Набор данных готов")
-
-# Encode the categories
-label_encoder = LabelEncoder()
-label_encoder.fit(class_names)
-encoded_labels = label_encoder.transform(train['rubrics'])
-y_train = utils.to_categorical(encoded_labels, nb_classes)
-
-sequences = tokenizer.texts_to_sequences(reviews)
-x_train = pad_sequences(sequences, maxlen=max_reviews_len)
-
-print('начинается создание модели')
-
-# Build the model
-model_gru = Sequential()
-model_gru.add(Embedding(num_words, 256, input_length=max_reviews_len))
-model_gru.add(GRU(256, return_sequences=True))
-model_gru.add(Dropout(0.25))
-model_gru.add(GRU(128))
-model_gru.add(Dense(nb_classes, activation='softmax'))
-
-model_gru.compile(optimizer='adam',
-                  loss='categorical_crossentropy',
-                  metrics=['accuracy'])
-
-model_gru.summary()
-
-# Define the ModelCheckpoint callback
-model_gru_save_path = './/model/best_model_gru_positive.keras'
-checkpoint_callback_gru = ModelCheckpoint(model_gru_save_path,
-                                          monitor='val_accuracy',
-                                          save_best_only=True,
-                                          verbose=1)
-
-# Train the model
-history_gru = model_gru.fit(x_train,
-                            y_train,
-                            epochs=5,
-                            batch_size=128,
-                            validation_split=0.1,
-                            callbacks=[checkpoint_callback_gru])
-
-# Plots
-plt.plot(history_gru.history['accuracy'],
-         label='Доля верных ответов на обучающем наборе')
-plt.plot(history_gru.history['val_accuracy'],
-         label='Доля верных ответов на проверочном наборе')
-plt.xlabel('Эпоха обучения')
-plt.ylabel('Доля верных ответов')
-plt.legend()
-plt.savefig('.//graphics/history_gru_positive.png')
\ No newline at end of file
diff --git a/neural_network/create_gru/graphics/.gitkeep b/neural_network/create_gru/graphics/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/neural_network/create_gru/model/.gitkeep b/neural_network/create_gru/model/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/neural_network/create_lstm/create_model_lstm_negative.py b/neural_network/create_lstm/create_model_lstm_negative.py
deleted file mode 100644
index 5de3dbe..0000000
--- a/neural_network/create_lstm/create_model_lstm_negative.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import pickle
-import pandas as pd
-from keras import Sequential
-from keras.src.callbacks import ModelCheckpoint
-from keras.src.utils import pad_sequences
-from matplotlib import pyplot as plt
-from tensorflow.keras.layers import Dense, Embedding, SpatialDropout1D, LSTM
-from tensorflow.keras import utils
-from sklearn.preprocessing import LabelEncoder
-
-# Maximum vocabulary size
-num_words = 10000
-# Maximum review length
-max_reviews_len = 90
-# Number of review classes
-nb_classes = 10
-
-# Load the tokenizer
-with open('..//tokenization/tokenizer_negative.pickle', 'rb') as handle:
-    tokenizer = pickle.load(handle)
-
-# Load the class names
-with open('..//class/class_names_negative.txt', 'r', encoding='utf-8') as file:
-    class_names = [line.strip() for line in file.readlines()]
-
-# Data
-train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
-train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
-reviews = train['text']
-print("Набор данных готов")
-
-# Encode the categories
-label_encoder = LabelEncoder()
-label_encoder.fit(class_names)
-encoded_labels = label_encoder.transform(train['rubrics'])
-y_train = utils.to_categorical(encoded_labels, nb_classes)
-
-sequences = tokenizer.texts_to_sequences(reviews)
-x_train = pad_sequences(sequences, maxlen=max_reviews_len)
-
-print('начинается создание модели')
-
-# Build the model
-model_lstm = Sequential()
-model_lstm.add(Embedding(num_words, 128, input_length=max_reviews_len))
-model_lstm.add(SpatialDropout1D(0.2))
-model_lstm.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
-model_lstm.add(Dense(nb_classes, activation='softmax'))
-
-model_lstm.compile(optimizer='adam',
-                   loss='categorical_crossentropy',
-                   metrics=['accuracy'])
-
-model_lstm.summary()
-
-# Define the ModelCheckpoint callback
-model_lstm_save_path = './/model/best_model_lstm_negative.keras'
-checkpoint_callback_lstm = ModelCheckpoint(model_lstm_save_path,
-                                           monitor='val_accuracy',
-                                           save_best_only=True,
-                                           verbose=1)
-
-# Train the model
-history_lstm = model_lstm.fit(x_train,
-                              y_train,
-                              epochs=10,
-                              batch_size=128,
-                              validation_split=0.1,
-                              callbacks=[checkpoint_callback_lstm])
-
-# Plots
-plt.plot(history_lstm.history['accuracy'],
-         label='Доля верных ответов на обучающем наборе')
-plt.plot(history_lstm.history['val_accuracy'],
-         label='Доля верных ответов на проверочном наборе')
-plt.xlabel('Эпоха обучения')
-plt.ylabel('Доля верных ответов')
-plt.legend()
-plt.savefig('.//graphics/history_lstm_negative.png')
diff --git a/neural_network/create_lstm/create_model_lstm_positive.py b/neural_network/create_lstm/create_model_lstm_positive.py
deleted file mode 100644
index 5580065..0000000
--- a/neural_network/create_lstm/create_model_lstm_positive.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import pickle
-import pandas as pd
-from keras import Sequential
-from keras.src.callbacks import ModelCheckpoint
-from keras.src.utils import pad_sequences
-from matplotlib import pyplot as plt
-from tensorflow.keras.layers import Dense, Embedding, SpatialDropout1D, LSTM
-from tensorflow.keras import utils
-from sklearn.preprocessing import LabelEncoder
-
-# Maximum vocabulary size
-num_words = 10000
-# Maximum review length
-max_reviews_len = 90
-# Number of review classes
-nb_classes = 10
-
-# Load the tokenizer
-with open('..//tokenization/tokenizer_positive.pickle', 'rb') as handle:
-    tokenizer = pickle.load(handle)
-
-# Load the class names
-with open('..//class/class_names_positive.txt', 'r', encoding='utf-8') as file:
-    class_names = [line.strip() for line in file.readlines()]
-
-# Data
-train = pd.read_csv('../dataset/filtered/filtered_dataset_positive.csv')
-train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
-reviews = train['text']
-print("Набор данных готов")
-
-# Encode the categories
-label_encoder = LabelEncoder()
-label_encoder.fit(class_names)
-encoded_labels = label_encoder.transform(train['rubrics'])
-y_train = utils.to_categorical(encoded_labels, nb_classes)
-
-sequences = tokenizer.texts_to_sequences(reviews)
-x_train = pad_sequences(sequences, maxlen=max_reviews_len)
-
-print('начинается создание модели')
-
-# Build the model
-model_lstm = Sequential()
-model_lstm.add(Embedding(num_words, 256, input_length=max_reviews_len))
-model_lstm.add(SpatialDropout1D(0.2))
-model_lstm.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
-model_lstm.add(Dense(nb_classes, activation='softmax'))
-
-model_lstm.compile(optimizer='adam',
-                   loss='categorical_crossentropy',
-                   metrics=['accuracy'])
-
-model_lstm.summary()
-
-# Define the ModelCheckpoint callback
-model_lstm_save_path = './/model/best_model_lstm_positive.keras'
-checkpoint_callback_lstm = ModelCheckpoint(model_lstm_save_path,
-                                           monitor='val_accuracy',
-                                           save_best_only=True,
-                                           verbose=1)
-
-# Train the model
-history_lstm = model_lstm.fit(x_train,
-                              y_train,
-                              epochs=5,
-                              batch_size=128,
-                              validation_split=0.1,
-                              callbacks=[checkpoint_callback_lstm])
-
-# Plots
-plt.plot(history_lstm.history['accuracy'],
-         label='Доля верных ответов на обучающем наборе')
-plt.plot(history_lstm.history['val_accuracy'],
-         label='Доля верных ответов на проверочном наборе')
-plt.xlabel('Эпоха обучения')
-plt.ylabel('Доля верных ответов')
-plt.legend()
-plt.savefig('.//graphics/history_lstm_positive.png')
\ No newline at end of file
diff --git a/neural_network/create_lstm/graphics/.gitkeep b/neural_network/create_lstm/graphics/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/neural_network/create_lstm/model/.gitkeep b/neural_network/create_lstm/model/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/neural_network/dataset/conversion.py b/neural_network/dataset/conversion.py
index d9408d0..fd03d57 100644
--- a/neural_network/dataset/conversion.py
+++ b/neural_network/dataset/conversion.py
@@ -1,54 +1,124 @@
+import kaggle
+import zipfile
+import os
 import pandas as pd
 
-# Load the dataset
-dataset = pd.read_csv('../dataset/geo-reviews-dataset-2023.csv')
 
-# Create new DataFrames filtered by rating
-filtered_dataset_positive = dataset[dataset['rating'] > 3]
-filtered_dataset_negative = dataset[dataset['rating'] < 3]
+class KaggleDatasetDownloader:
+    def __init__(self, dataset, file_name, download_path):
+        self.dataset = dataset
+        self.file_name = file_name
+        self.download_path = download_path
 
-# Keep only the first rubric (before the ';') in the 'rubrics' column
-filtered_dataset_positive.loc[:, 'rubrics'] = filtered_dataset_positive['rubrics'].apply(lambda x: x.split(';')[0] if pd.notnull(x) else x)
-filtered_dataset_negative.loc[:, 'rubrics'] = filtered_dataset_negative['rubrics'].apply(lambda x: x.split(';')[0] if pd.notnull(x) else x)
+    def authenticate_and_download(self):
+        kaggle.api.authenticate()
+        kaggle.api.dataset_download_file(self.dataset, self.file_name, path=self.download_path)
+        self.extract_if_needed()
 
-# Rename rubrics
-rename_mapping = {'Пиццерия': 'Быстрое питание', 'Ресторан': 'Быстрое питание', 'Кафе': 'Быстрое питание', 'Магазин продуктов': 'Супермаркет', 'Аптека': 'Медцентр, клиника', 'Стоматологическая клиника': 'Медцентр, клиника'}
-filtered_dataset_positive.loc[:, 'rubrics'] = filtered_dataset_positive['rubrics'].replace(rename_mapping)
-filtered_dataset_negative.loc[:, 'rubrics'] = filtered_dataset_negative['rubrics'].replace(rename_mapping)
-
-# Get the ten most frequent rubrics in the 'rubrics' column
-unique_rubrics_positive = set(filtered_dataset_positive['rubrics'].value_counts().head(10).index.tolist())
-unique_rubrics_negative = set(filtered_dataset_negative['rubrics'].value_counts().head(10).index.tolist())
-
-# Keep only the rows whose rubric is in unique_rubrics_positive / unique_rubrics_negative
-filtered_dataset_positive = filtered_dataset_positive[filtered_dataset_positive['rubrics'].isin(unique_rubrics_positive)]
-filtered_dataset_negative = filtered_dataset_negative[filtered_dataset_negative['rubrics'].isin(unique_rubrics_negative)]
-
-# Cap the number of rows per rubric
-filtered_dataset_negative = (filtered_dataset_negative.groupby('rubrics').head(1500).reset_index(drop=True))
-filtered_dataset_positive = (filtered_dataset_positive.groupby('rubrics').head(15000).reset_index(drop=True))
+    def extract_if_needed(self):
+        zip_path = os.path.join(self.download_path, f'{self.file_name}.zip')
+        if os.path.exists(zip_path):
+            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+                zip_ref.extractall(self.download_path)
+            os.remove(zip_path)
 
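+# Note: kaggle.api.authenticate() reads credentials from ~/.kaggle/kaggle.json
+# (or the KAGGLE_USERNAME / KAGGLE_KEY environment variables); the download
+# step above fails without them.
+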
-# Print the row count for each rubric in unique_rubrics_positive
-for rubric in unique_rubrics_positive:
-    count = filtered_dataset_positive[filtered_dataset_positive['rubrics'] == rubric].shape[0]
-    print(f"Количество строк с rubrics '{rubric}' в filtered_dataset_positive: {count}")
+class DatasetProcessor:
+    def __init__(self, dataset_path):
+        self.dataset = pd.read_csv(dataset_path)
 
-# Print the row count for each rubric in unique_rubrics_negative
-for rubric in unique_rubrics_negative:
-    count = filtered_dataset_negative[filtered_dataset_negative['rubrics'] == rubric].shape[0]
-    print(f"Количество строк с rubrics '{rubric}' в filtered_dataset_negative: {count}")
+    def filter_and_process(self):
+        filtered_positive = self.dataset[self.dataset['rating'] > 3].copy()
+        filtered_negative = self.dataset[self.dataset['rating'] < 3].copy()
 
-# Save the unique rubrics to class_positive.txt in UTF-8
-with open('class/class_positive.txt', 'w', encoding='utf-8') as file:
-    for rubric in unique_rubrics_positive:
-        file.write(f"{rubric}\n")
+        filtered_positive = self.clean_rubrics(filtered_positive)
+        filtered_negative = self.clean_rubrics(filtered_negative)
 
-# Save the unique rubrics to class_negative.txt in UTF-8
-with open('class/class_negative.txt', 'w', encoding='utf-8') as file:
-    for rubric in unique_rubrics_negative:
-        file.write(f"{rubric}\n")
+        rename_mapping = {
+            'Пиццерия': 'Быстрое питание', 'Ресторан': 'Быстрое питание',
+            'Кафе': 'Быстрое питание', 'Магазин продуктов': 'Супермаркет',
+            'Аптека': 'Медцентр, клиника', 'Стоматологическая клиника': 'Медцентр, клиника'
+        }
 
-# Save the filtered DataFrames to new CSV files
-filtered_dataset_positive.to_csv('../dataset/filtered/filtered_dataset_positive.csv', index=False)
-filtered_dataset_negative.to_csv('../dataset/filtered/filtered_dataset_negative.csv', index=False)
+        filtered_positive = self.rename_rubrics(filtered_positive, rename_mapping)
+        filtered_negative = self.rename_rubrics(filtered_negative, rename_mapping)
+
+        unique_rubrics_positive = self.get_top_rubrics(filtered_positive)
+        unique_rubrics_negative = self.get_top_rubrics(filtered_negative)
+
+        filtered_positive = self.filter_by_rubrics(filtered_positive, unique_rubrics_positive)
+        filtered_negative = self.filter_by_rubrics(filtered_negative, unique_rubrics_negative)
+
+        filtered_positive = self.limit_rows_per_rubric(filtered_positive, 15000)
+        filtered_negative = self.limit_rows_per_rubric(filtered_negative, 1500)
+
+        self.print_rubric_counts(filtered_positive, unique_rubrics_positive)
+        self.print_rubric_counts(filtered_negative, unique_rubrics_negative)
+
+        return filtered_positive, filtered_negative, unique_rubrics_positive, unique_rubrics_negative
+
+    @staticmethod
+    def clean_rubrics(dataset):
+        dataset.loc[:, 'rubrics'] = dataset['rubrics'].apply(lambda x: x.split(';')[0] if pd.notnull(x) else x)
+        return dataset
+
+    @staticmethod
+    def rename_rubrics(dataset, rename_mapping):
+        dataset.loc[:, 'rubrics'] = dataset['rubrics'].replace(rename_mapping)
+        return dataset
+
+    @staticmethod
+    def get_top_rubrics(dataset, top_n=10):
+        return set(dataset['rubrics'].value_counts().head(top_n).index.tolist())
+
+    @staticmethod
+    def filter_by_rubrics(dataset, rubrics):
+        return dataset[dataset['rubrics'].isin(rubrics)]
+
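+    # Note: .head(limit) below keeps the first `limit` rows of each rubric in
+    # file order, not a random sample; groupby(...).sample(n=limit) would be
+    # the random alternative.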
+    @staticmethod
+    def limit_rows_per_rubric(dataset, limit):
+        return dataset.groupby('rubrics').head(limit).reset_index(drop=True)
+
+    @staticmethod
+    def print_rubric_counts(dataset, rubrics):
+        for rubric in rubrics:
+            count = dataset[dataset['rubrics'] == rubric].shape[0]
+            print(f"Количество строк с rubrics '{rubric}': {count}")
+
+
+class FileSaver:
+    @staticmethod
+    def save_rubrics_to_file(rubrics, file_path):
+        with open(file_path, 'w', encoding='utf-8') as file:
+            for rubric in rubrics:
+                file.write(f"{rubric}\n")
+
+    @staticmethod
+    def save_dataset_to_csv(dataset, file_path):
+        dataset.to_csv(file_path, index=False)
+
+
+def main():
+    dataset = 'kyakovlev/yandex-geo-reviews-dataset-2023'
+    file_name = 'geo-reviews-dataset-2023.csv'
+    download_path = '../dataset'
+    dataset_path = os.path.join(download_path, file_name)
+
+    # Download and unpack the dataset
+    downloader = KaggleDatasetDownloader(dataset, file_name, download_path)
+    downloader.authenticate_and_download()
+
+    # Process the dataset
+    processor = DatasetProcessor(dataset_path)
+    filtered_positive, filtered_negative, unique_rubrics_positive, unique_rubrics_negative = processor.filter_and_process()
+
+    # Save the results
+    FileSaver.save_rubrics_to_file(unique_rubrics_positive, 'class/class_positive.txt')
+    FileSaver.save_rubrics_to_file(unique_rubrics_negative, 'class/class_negative.txt')
+    FileSaver.save_dataset_to_csv(filtered_positive, '../dataset/filtered/filtered_dataset_positive.csv')
+    FileSaver.save_dataset_to_csv(filtered_negative, '../dataset/filtered/filtered_dataset_negative.csv')
+
+
+if __name__ == "__main__":
+    main()
diff --git a/neural_network/dataset/download_dataset.py b/neural_network/dataset/download_dataset.py
deleted file mode 100644
index ef22a4b..0000000
--- a/neural_network/dataset/download_dataset.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import kaggle
-import zipfile
-import os
-
-# Authentication
-kaggle.api.authenticate()
-
-# Download a specific file from the dataset
-dataset = 'kyakovlev/yandex-geo-reviews-dataset-2023'
-file_name = 'geo-reviews-dataset-2023.csv'
-kaggle.api.dataset_download_file(dataset, file_name, path='../dataset')
-
-# Unpack the archive if the file is compressed
-zip_path = f'./{file_name}.zip'
-if os.path.exists(zip_path):
-    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-        zip_ref.extractall('./')
-    os.remove(zip_path)  # Delete the zip file after unpacking
\ No newline at end of file
diff --git a/neural_network/models/create_model.py b/neural_network/models/create_model.py
new file mode 100644
index 0000000..d075619
--- /dev/null
+++ b/neural_network/models/create_model.py
@@ -0,0 +1,271 @@
+import pickle
+import pandas as pd
+from keras import Sequential
+from keras.src.callbacks import ModelCheckpoint
+from keras.src.utils import pad_sequences
+from matplotlib import pyplot as plt
+from tensorflow.keras.layers import Dense, Embedding, SpatialDropout1D, LSTM, GRU, Dropout, Conv1D, GlobalMaxPooling1D
+from tensorflow.keras import utils
+from sklearn.preprocessing import LabelEncoder
+
+# Constants
+NUM_WORDS = 10000
+MAX_REVIEWS_LEN = 90
+NB_CLASSES = 10
+
+class DataProcessor:
+    def __init__(self, tokenizer_path, class_names_path, dataset_path):
+        self.tokenizer = self.load_tokenizer(tokenizer_path)
+        self.class_names = self.load_class_names(class_names_path)
+        self.dataset = self.load_dataset(dataset_path)
+        self.label_encoder = LabelEncoder()
+
+    @staticmethod
+    def load_tokenizer(tokenizer_path):
+        with open(tokenizer_path, 'rb') as handle:
+            return pickle.load(handle)
+
+    @staticmethod
+    def load_class_names(class_names_path):
+        with open(class_names_path, 'r', encoding='utf-8') as file:
+            return [line.strip() for line in file.readlines()]
+
+    @staticmethod
+    def load_dataset(dataset_path):
+        train = pd.read_csv(dataset_path)
+        train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
+        return train
+
+    def preprocess_data(self):
+        reviews = self.dataset['text']
+        encoded_labels = self.encode_labels(self.dataset['rubrics'])
+        y_train = utils.to_categorical(encoded_labels, NB_CLASSES)
+        sequences = self.tokenizer.texts_to_sequences(reviews)
+        x_train = pad_sequences(sequences, maxlen=MAX_REVIEWS_LEN)
+        return x_train, y_train
+
+    def encode_labels(self, labels):
+        self.label_encoder.fit(self.class_names)
+        return self.label_encoder.transform(labels)
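+
+# Shapes after preprocessing, given the constants above: x_train is
+# (n_reviews, MAX_REVIEWS_LEN) of integer token ids and y_train is
+# (n_reviews, NB_CLASSES) of one-hot rows.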
+
+class CNNModelNegative:
+    def __init__(self, num_words, max_reviews_len, nb_classes):
+        self.num_words = num_words
+        self.max_reviews_len = max_reviews_len
+        self.nb_classes = nb_classes
+        self.model = self.build_model()
+
+    def build_model(self):
+        model = Sequential()
+        model.add(Embedding(self.num_words, 128, input_length=self.max_reviews_len))
+        model.add(Conv1D(128, 5, activation='relu'))
+        model.add(GlobalMaxPooling1D())
+        model.add(Dense(64, activation='relu'))
+        model.add(Dense(self.nb_classes, activation='softmax'))
+
+        model.compile(optimizer='adam',
+                      loss='categorical_crossentropy',
+                      metrics=['accuracy'])
+        return model
+
+    def train(self, x_train, y_train, epochs, batch_size, validation_split, model_save_path):
+        checkpoint_callback = ModelCheckpoint(model_save_path, monitor='val_accuracy', save_best_only=True, verbose=1)
+        history = self.model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split, callbacks=[checkpoint_callback])
+        return history
+
+class GRUModelNegative:
+    def __init__(self, num_words, max_reviews_len, nb_classes):
+        self.num_words = num_words
+        self.max_reviews_len = max_reviews_len
+        self.nb_classes = nb_classes
+        self.model = self.build_model()
+
+    def build_model(self):
+        model = Sequential()
+        model.add(Embedding(self.num_words, 128, input_length=self.max_reviews_len))
+        model.add(GRU(128, return_sequences=True))
+        model.add(Dropout(0.25))
+        model.add(GRU(64))
+        model.add(Dense(self.nb_classes, activation='softmax'))
+        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
+        return model
+
+    def train(self, x_train, y_train, epochs, batch_size, validation_split, model_save_path):
+        checkpoint_callback = ModelCheckpoint(model_save_path, monitor='val_accuracy', save_best_only=True, verbose=1)
+        history = self.model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split, callbacks=[checkpoint_callback])
+        return history
+
+class LSTMModelNegative:
+    def __init__(self, num_words, max_reviews_len, nb_classes):
+        self.num_words = num_words
+        self.max_reviews_len = max_reviews_len
+        self.nb_classes = nb_classes
+        self.model = self.build_model()
+
+    def build_model(self):
+        model = Sequential()
+        model.add(Embedding(self.num_words, 128, input_length=self.max_reviews_len))
+        model.add(SpatialDropout1D(0.2))
+        model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
+        model.add(Dense(self.nb_classes, activation='softmax'))
+        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
+        return model
+
+    def train(self, x_train, y_train, epochs, batch_size, validation_split, model_save_path):
+        checkpoint_callback = ModelCheckpoint(model_save_path, monitor='val_accuracy', save_best_only=True, verbose=1)
+        history = self.model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split, callbacks=[checkpoint_callback])
+        return history
+
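+# The three *Positive classes below mirror the *Negative classes above and
+# differ only in layer widths. A parameterized variant would collapse each
+# pair into one class (sketch only; the embedding_dim/units arguments are
+# hypothetical and not in this file):
+#
+#   class LSTMModel:
+#       def __init__(self, num_words, max_reviews_len, nb_classes,
+#                    embedding_dim=128, units=64):
+#           ...
+#           model.add(Embedding(num_words, embedding_dim, input_length=max_reviews_len))
+#           model.add(SpatialDropout1D(0.2))
+#           model.add(LSTM(units, dropout=0.2, recurrent_dropout=0.2))
+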
+class LSTMModelPositive:
+    def __init__(self, num_words, max_reviews_len, nb_classes):
+        self.num_words = num_words
+        self.max_reviews_len = max_reviews_len
+        self.nb_classes = nb_classes
+        self.model = self.build_model()
+
+    def build_model(self):
+        model = Sequential()
+        model.add(Embedding(self.num_words, 256, input_length=self.max_reviews_len))
+        model.add(SpatialDropout1D(0.2))
+        model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
+        model.add(Dense(self.nb_classes, activation='softmax'))
+        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
+        return model
+
+    def train(self, x_train, y_train, epochs, batch_size, validation_split, model_save_path):
+        checkpoint_callback = ModelCheckpoint(model_save_path, monitor='val_accuracy', save_best_only=True, verbose=1)
+        history = self.model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split, callbacks=[checkpoint_callback])
+        return history
+
+class GRUModelPositive:
+    def __init__(self, num_words, max_reviews_len, nb_classes):
+        self.num_words = num_words
+        self.max_reviews_len = max_reviews_len
+        self.nb_classes = nb_classes
+        self.model = self.build_model()
+
+    def build_model(self):
+        model = Sequential()
+        model.add(Embedding(self.num_words, 256, input_length=self.max_reviews_len))
+        model.add(GRU(256, return_sequences=True))
+        model.add(Dropout(0.25))
+        model.add(GRU(128))
+        model.add(Dense(self.nb_classes, activation='softmax'))
+        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
+        return model
+
+    def train(self, x_train, y_train, epochs, batch_size, validation_split, model_save_path):
+        checkpoint_callback = ModelCheckpoint(model_save_path, monitor='val_accuracy', save_best_only=True, verbose=1)
+        history = self.model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split, callbacks=[checkpoint_callback])
+        return history
+
+class CNNModelPositive:
+    def __init__(self, num_words, max_reviews_len, nb_classes):
+        self.num_words = num_words
+        self.max_reviews_len = max_reviews_len
+        self.nb_classes = nb_classes
+        self.model = self.build_model()
+
+    def build_model(self):
+        model = Sequential()
+        model.add(Embedding(self.num_words, 256, input_length=self.max_reviews_len))
+        model.add(Conv1D(256, 5, activation='relu'))
+        model.add(GlobalMaxPooling1D())
+        model.add(Dense(128, activation='relu'))
+        model.add(Dense(self.nb_classes, activation='softmax'))
+        model.compile(optimizer='adam',
+                      loss='categorical_crossentropy',
+                      metrics=['accuracy'])
+        return model
+
+    def train(self, x_train, y_train, epochs, batch_size, validation_split, model_save_path):
+        checkpoint_callback = ModelCheckpoint(model_save_path,
+                                              monitor='val_accuracy',
+                                              save_best_only=True,
+                                              verbose=1)
+        history = self.model.fit(x_train,
+                                 y_train,
+                                 epochs=epochs,
+                                 batch_size=batch_size,
+                                 validation_split=validation_split,
+                                 callbacks=[checkpoint_callback])
+        return history
+
+class Plotter:
+    @staticmethod
+    def plot_history(history, save_path):
+        plt.figure()  # start a fresh figure so successive histories do not pile up on one plot
+        plt.plot(history.history['accuracy'],
+                 label='Доля верных ответов на обучающем наборе')
+        plt.plot(history.history['val_accuracy'],
+                 label='Доля верных ответов на проверочном наборе')
+        plt.xlabel('Эпоха обучения')
+        plt.ylabel('Доля верных ответов')
+        plt.legend()
+        plt.savefig(save_path)
+        plt.show()
+
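+# Note: the relative save paths in main() below assume ./model and ./graphics
+# exist; the repo keeps them via the .gitkeep files renamed into
+# neural_network/models/ later in this patch.
+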
+def main():
+
+    tokenizer_path_positive = '../tokenization/tokenizer_positive.pickle'
+    class_names_path_positive = '../class/class_names_positive.txt'
+    dataset_path_positive = '../dataset/filtered/filtered_dataset_positive.csv'
+
+    model_save_path_lstm_positive = './model/best_model_lstm_positive.keras'
+    plot_save_path_lstm_positive = './graphics/history_lstm_positive.png'
+
+    model_save_path_cnn_positive = './model/best_model_cnn_positive.keras'
+    plot_save_path_cnn_positive = './graphics/history_cnn_positive.png'
+
+    model_save_path_gru_positive = './model/best_model_gru_positive.keras'
+    plot_save_path_gru_positive = './graphics/history_gru_positive.png'
+
+    tokenizer_path_negative = '../tokenization/tokenizer_negative.pickle'
+    class_names_path_negative = '../class/class_names_negative.txt'
+    dataset_path_negative = '../dataset/filtered/filtered_dataset_negative.csv'
+
+    model_save_path_lstm_negative = './model/best_model_lstm_negative.keras'
+    plot_save_path_lstm_negative = './graphics/history_lstm_negative.png'
+
+    model_save_path_cnn_negative = './model/best_model_cnn_negative.keras'
+    plot_save_path_cnn_negative = './graphics/history_cnn_negative.png'
+
+    model_save_path_gru_negative = './model/best_model_gru_negative.keras'
+    plot_save_path_gru_negative = './graphics/history_gru_negative.png'
+
+    data_processor_negative = DataProcessor(tokenizer_path_negative, class_names_path_negative, dataset_path_negative)
+    x_train, y_train = data_processor_negative.preprocess_data()
+
+    cnn_model_negative = CNNModelNegative(NUM_WORDS, MAX_REVIEWS_LEN, NB_CLASSES)
+    history = cnn_model_negative.train(x_train, y_train, epochs=10, batch_size=128, validation_split=0.1, model_save_path=model_save_path_cnn_negative)
+    Plotter.plot_history(history, plot_save_path_cnn_negative)
+    print("Training and plotting completed successfully: CNNModelNegative.")
+
+    lstm_model_negative = LSTMModelNegative(NUM_WORDS, MAX_REVIEWS_LEN, NB_CLASSES)
+    history = lstm_model_negative.train(x_train, y_train, epochs=10, batch_size=128, validation_split=0.1, model_save_path=model_save_path_lstm_negative)
+    Plotter.plot_history(history, plot_save_path_lstm_negative)
+    print("Training and plotting completed successfully: LSTMModelNegative.")
+
+    gru_model_negative = GRUModelNegative(NUM_WORDS, MAX_REVIEWS_LEN, NB_CLASSES)
+    history = gru_model_negative.train(x_train, y_train, epochs=10, batch_size=128, validation_split=0.1, model_save_path=model_save_path_gru_negative)
+    Plotter.plot_history(history, plot_save_path_gru_negative)
+    print("Training and plotting completed successfully: GRUModelNegative.")
+
+    data_processor_positive = DataProcessor(tokenizer_path_positive, class_names_path_positive, dataset_path_positive)
+    x_train, y_train = data_processor_positive.preprocess_data()
+
+    cnn_model_positive = CNNModelPositive(NUM_WORDS, MAX_REVIEWS_LEN, NB_CLASSES)
+    history = cnn_model_positive.train(x_train, y_train, epochs=5, batch_size=128, validation_split=0.1, model_save_path=model_save_path_cnn_positive)
+    Plotter.plot_history(history, plot_save_path_cnn_positive)
+    print("Training and plotting completed successfully: CNNModelPositive.")
+
+    lstm_model_positive = LSTMModelPositive(NUM_WORDS, MAX_REVIEWS_LEN, NB_CLASSES)
+    history = lstm_model_positive.train(x_train, y_train, epochs=5, batch_size=128, validation_split=0.1, model_save_path=model_save_path_lstm_positive)
+    Plotter.plot_history(history, plot_save_path_lstm_positive)
+    print("Training and plotting completed successfully: LSTMModelPositive.")
+
+    gru_model_positive = GRUModelPositive(NUM_WORDS, MAX_REVIEWS_LEN, NB_CLASSES)
+    history = gru_model_positive.train(x_train, y_train, epochs=5, batch_size=128, validation_split=0.1, model_save_path=model_save_path_gru_positive)
+    Plotter.plot_history(history, plot_save_path_gru_positive)
+    print("Training and plotting completed successfully: GRUModelPositive.")
+
+if __name__ == "__main__":
+    main()
diff --git a/neural_network/create_cnn/graphics/.gitkeep b/neural_network/models/graphics/.gitkeep
similarity index 100%
rename from neural_network/create_cnn/graphics/.gitkeep
rename to neural_network/models/graphics/.gitkeep
diff --git a/neural_network/create_cnn/model/.gitkeep b/neural_network/models/model/.gitkeep
similarity index 100%
rename from neural_network/create_cnn/model/.gitkeep
rename to neural_network/models/model/.gitkeep
diff --git a/neural_network/tokenization/tokenizer.py b/neural_network/tokenization/tokenizer.py
new file mode 100644
index 0000000..18c6c29
--- /dev/null
+++ b/neural_network/tokenization/tokenizer.py
@@ -0,0 +1,81 @@
+import pandas as pd
+from keras.src.legacy.preprocessing.text import Tokenizer
+from nltk import word_tokenize, SnowballStemmer
+from nltk.corpus import stopwords
+import string
+import pickle
+from pymystem3 import Mystem
+
+# Constants
+NUM_WORDS = 10000
+
+class DataLoader:
+    def __init__(self, dataset_path):
+        self.dataset_path = dataset_path
+
+    def load_data(self):
+        return pd.read_csv(self.dataset_path)
+
+class TextProcessor:
+    def __init__(self):
+        self.stop_words = set(stopwords.words('russian'))
+        self.punctuation = set(string.punctuation)
+        self.mystem = Mystem()
+        self.stemmer = SnowballStemmer("russian")
+
+    def remove_stopwords_and_punctuation(self, reviews):
+        filtered_tokens = []
+        for review in reviews:
+            words = word_tokenize(review)
+            # Drop stop words, the literal '\n1' artifact, stray quote tokens
+            # and punctuation
+            filtered_words = [word for word in words if
+                              word.lower() not in self.stop_words and word != '\\n1' and word != "''" and word != '«' and word != '»' and word not in self.punctuation]
+            filtered_tokens.extend(filtered_words)
+        return filtered_tokens
+
+    def lemmatize_tokens(self, tokens):
+        lemmatized_tokens = [self.mystem.lemmatize(token)[0] for token in tokens]
+        return lemmatized_tokens
+
+    def stem_tokens(self, tokens):
+        stemmed_tokens = [self.stemmer.stem(word) for word in tokens]
+        return stemmed_tokens
+
+class TokenizerSaver:
+    def __init__(self, num_words):
+        self.num_words = num_words
+        self.tokenizer = Tokenizer(num_words=self.num_words)
+
+    def fit_on_texts(self, texts):
+        self.tokenizer.fit_on_texts(texts)
+
+    def save_tokenizer(self, save_path):
+        with open(save_path, 'wb') as handle:
+            pickle.dump(self.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
+
+def process_and_save_tokenizer(dataset_path, save_path):
+    # Load data
+    data_loader = DataLoader(dataset_path)
+    dataset = data_loader.load_data()
+
+    # Process text
+    text_processor = TextProcessor()
+    reviews = dataset['text']
+    filtered_tokens = text_processor.remove_stopwords_and_punctuation(reviews)
+
+    # Tokenize and save
+    tokenizer_saver = TokenizerSaver(NUM_WORDS)
+    tokenizer_saver.fit_on_texts(filtered_tokens)
+    tokenizer_saver.save_tokenizer(save_path)
+
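+# word_tokenize and stopwords above rely on NLTK data packages being present;
+# if they are missing, a one-time download along these lines is needed first:
+#
+#   import nltk
+#   nltk.download('punkt')
+#   nltk.download('stopwords')
+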
+def main():
+    positive_dataset_path = '../dataset/filtered/filtered_dataset_positive.csv'
+    negative_dataset_path = '../dataset/filtered/filtered_dataset_negative.csv'
+    positive_tokenizer_path = './tokenizer_positive.pickle'
+    negative_tokenizer_path = './tokenizer_negative.pickle'
+
+    # Process and save tokenizers
+    process_and_save_tokenizer(positive_dataset_path, positive_tokenizer_path)
+    process_and_save_tokenizer(negative_dataset_path, negative_tokenizer_path)
+
+if __name__ == "__main__":
+    main()
diff --git a/neural_network/tokenization/tokenizer_negative.py b/neural_network/tokenization/tokenizer_negative.py
deleted file mode 100644
index 92fd2c3..0000000
--- a/neural_network/tokenization/tokenizer_negative.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import pandas as pd
-from keras.src.legacy.preprocessing.text import Tokenizer
-from nltk import word_tokenize, SnowballStemmer
-from nltk.corpus import stopwords
-import string
-import pickle
-
-from pymystem3 import Mystem
-
-# Maximum vocabulary size
-num_words = 10000
-# Number of review classes
-nb_classes = 10
-
-def remove_stopwords_and_punctuation(reviews):
-    stop_words = set(stopwords.words('russian'))
-    punctuation = set(string.punctuation)
-    filtered_tokens = []
-
-    # Remove stop words and punctuation
-    for review in reviews:
-        words = word_tokenize(review)
-        filtered_words = [word for word in words if
-                          word.lower() not in stop_words and '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation]
-        filtered_tokens.extend(filtered_words)
-
-    return filtered_tokens
-
-def lemmatize_token(filtered_tokens):
-    mystem = Mystem()
-    lemmatized_tokens = []
-    for token in filtered_tokens:
-        lemmatized = mystem.lemmatize(token)[0]
-        lemmatized_tokens.append(lemmatized)
-        print(lemmatized)
-    return lemmatized_tokens
-
-def stemmer_token(filtered_tokens):
-    stemmer = SnowballStemmer("russian")
-    stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
-    return stemmer_tokens
-
-# Data
-train = pd.read_csv('..//dataset/filtered/filtered_dataset_negative.csv')
-reviews = train['text']
-
-filtered_tokens = remove_stopwords_and_punctuation(reviews)
-
-# lemmatized_tokens = lemmatize_token(filtered_tokens)
-# print(lemmatized_tokens[:10])
-
-# stemmer_tokens = stemmer_token(filtered_tokens)
-# print(stemmer_tokens[:10])
-
-tokenizer = Tokenizer(num_words=num_words)
-tokenizer.fit_on_texts(filtered_tokens)
-
-# saving
-with open('.//tokenizer_negative.pickle', 'wb') as handle:
-    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
\ No newline at end of file
diff --git a/neural_network/tokenization/tokenizer_positive.py b/neural_network/tokenization/tokenizer_positive.py
deleted file mode 100644
index ae646ec..0000000
--- a/neural_network/tokenization/tokenizer_positive.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import pandas as pd
-from keras.src.legacy.preprocessing.text import Tokenizer
-from nltk import word_tokenize, SnowballStemmer
-from nltk.corpus import stopwords
-import string
-import pickle
-
-from pymystem3 import Mystem
-
-# Maximum vocabulary size
-num_words = 10000
-# Number of review classes
-nb_classes = 10
-
-def remove_stopwords_and_punctuation(reviews):
-    stop_words = set(stopwords.words('russian'))
-    punctuation = set(string.punctuation)
-    filtered_tokens = []
-
-    # Remove stop words and punctuation
-    for review in reviews:
-        words = word_tokenize(review)
-        filtered_words = [word for word in words if
-                          word.lower() not in stop_words and '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation]
-        filtered_tokens.extend(filtered_words)
-
-    return filtered_tokens
-
-def lemmatize_token(filtered_tokens):
-    mystem = Mystem()
-    lemmatized_tokens = []
-    for token in filtered_tokens:
-        lemmatized = mystem.lemmatize(token)[0]
-        lemmatized_tokens.append(lemmatized)
-        print(lemmatized)
-    return lemmatized_tokens
-
-def stemmer_token(filtered_tokens):
-    stemmer = SnowballStemmer("russian")
-    stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
-    return stemmer_tokens
-
-# Data
-train = pd.read_csv('..//dataset/filtered/filtered_dataset_positive.csv')
-reviews = train['text']
-
-filtered_tokens = remove_stopwords_and_punctuation(reviews)
-
-# lemmatized_tokens = lemmatize_token(filtered_tokens)
-# print(lemmatized_tokens[:10])
-
-# stemmer_tokens = stemmer_token(filtered_tokens)
-# print(stemmer_tokens[:10])
-
-tokenizer = Tokenizer(num_words=num_words)
-tokenizer.fit_on_texts(filtered_tokens)
-
-# saving
-with open('.//tokenizer_positive.pickle', 'wb') as handle:
-    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
\ No newline at end of file