From c56263c386f4ca76b04316cc5c8f9c1bccc658d4 Mon Sep 17 00:00:00 2001
From: maksim
Date: Thu, 30 May 2024 19:27:03 +0400
Subject: [PATCH] Phew, removed redundant code + fixed GRU
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The training scripts now load the shared tokenizer and class-name list
instead of rebuilding them on every run, save the accuracy plots to
graphics/ instead of showing them, and drop the copy-pasted inline
inference demos along with the now-unused NLTK/pymystem3 preprocessing
helpers. The GRU scripts now actually build GRU models (the negative
one previously built an LSTM).
---
 neural_network/class/class_positive.py        |   2 +-
 .../create_cnn/create_model_cnn_negative.py   | 101 ++-----------
 .../create_cnn/create_model_cnn_positive.py   | 105 +++-----------
 neural_network/create_cnn/graphics/.gitkeep   |   0
 .../create_gru/create_model_gru_negative.py   | 133 +++++-------------
 .../create_gru/create_model_gru_positive.py   | 104 +++-----------
 neural_network/create_gru/graphics/.gitkeep   |   0
 .../create_lstm/create_model_lstm_negative.py |  96 ++-----------
 .../create_lstm/create_model_lstm_positive.py |  99 ++-----------
 neural_network/create_lstm/graphics/.gitkeep  |   0
 10 files changed, 112 insertions(+), 528 deletions(-)
 create mode 100644 neural_network/create_cnn/graphics/.gitkeep
 create mode 100644 neural_network/create_gru/graphics/.gitkeep
 create mode 100644 neural_network/create_lstm/graphics/.gitkeep

diff --git a/neural_network/class/class_positive.py b/neural_network/class/class_positive.py
index 86521d3..5332ae6 100644
--- a/neural_network/class/class_positive.py
+++ b/neural_network/class/class_positive.py
@@ -2,7 +2,7 @@ import pandas as pd
 from sklearn.preprocessing import LabelEncoder
 
 # Данные
-train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
+train = pd.read_csv('../dataset/filtered/filtered_dataset_positive.csv')
 
 label_encoder = LabelEncoder()
 train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
diff --git a/neural_network/create_cnn/create_model_cnn_negative.py b/neural_network/create_cnn/create_model_cnn_negative.py
index 3e94bb1..1cfea74 100644
--- a/neural_network/create_cnn/create_model_cnn_negative.py
+++ b/neural_network/create_cnn/create_model_cnn_negative.py
@@ -1,20 +1,12 @@
 import pickle
-
 import pandas as pd
 from keras import Sequential
 from keras.src.callbacks import ModelCheckpoint
-from keras.src.legacy.preprocessing.text import Tokenizer
-from keras.src.saving import load_model
 from keras.src.utils import pad_sequences
 from matplotlib import pyplot as plt
 from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D
-from sklearn.preprocessing import LabelEncoder
 from tensorflow.keras import utils
-from nltk import word_tokenize, SnowballStemmer
-from nltk.corpus import stopwords
-import string
-
-from pymystem3 import Mystem
+from sklearn.preprocessing import LabelEncoder
 
 # Максимальное количество слов
 num_words = 10000
@@ -23,33 +15,13 @@ max_reviews_len = 90
 # Количество классов отзыва
 nb_classes = 10
 
-def remove_stopwords_and_punctuation(reviews):
-    stop_words = set(stopwords.words('russian'))
-    punctuation = set(string.punctuation)
-    filtered_tokens = []
+# Загрузка токенизатора
+with open('..//tokenization/tokenizer_negative.pickle', 'rb') as handle:
+    tokenizer = pickle.load(handle)
 
-    # Удаление стоп слов и пунктуаций
-    for review in reviews:
-        words = word_tokenize(review)
-        filtered_words = [word for word in words if
-                          word.lower() not in stop_words and '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation]
-        filtered_tokens.extend(filtered_words)
-
-    return filtered_tokens
-
-def 
lemmatize_token(filtered_tokens): - mystem = Mystem() - lemmatized_tokens = [] - for token in filtered_tokens: - lemmatized = mystem.lemmatize(token)[0] - lemmatized_tokens.append(lemmatized) - print(lemmatized) - return lemmatized_tokens - -def stemmer_token(filtered_tokens): - stemmer = SnowballStemmer("russian") - stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens] - return stemmer_tokens +# Загрузка названий классов +with open('..//class/class_names_negative.txt', 'r', encoding='utf-8') as file: + class_names = [line.strip() for line in file.readlines()] # Данные train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv') @@ -57,28 +29,13 @@ train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True) reviews = train['text'] print("Набор данных готов") -filtered_tokens = remove_stopwords_and_punctuation(reviews) -print(filtered_tokens[:10]) - -# lemmatized_tokens = lemmatize_token(filtered_tokens) -# print(lemmatized_tokens[:10]) - -# stemmer_tokens = stemmer_token(filtered_tokens) -# print(stemmer_tokens[:10]) - +# Кодирование категорий label_encoder = LabelEncoder() -train['rubrics'] = label_encoder.fit_transform(train['rubrics']) - -# Сохраняем названия классов -class_names = label_encoder.classes_ - -y_train = utils.to_categorical(train['rubrics'], nb_classes) - -tokenizer = Tokenizer(num_words=num_words) -tokenizer.fit_on_texts(filtered_tokens) +label_encoder.fit(class_names) +encoded_labels = label_encoder.transform(train['rubrics']) +y_train = utils.to_categorical(encoded_labels, nb_classes) sequences = tokenizer.texts_to_sequences(reviews) - x_train = pad_sequences(sequences, maxlen=max_reviews_len) print('начинается создание модели') @@ -91,7 +48,6 @@ model_cnn.add(GlobalMaxPooling1D()) model_cnn.add(Dense(64, activation='relu')) model_cnn.add(Dense(nb_classes, activation='softmax')) - model_cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) @@ -101,9 +57,9 @@ model_cnn.summary() # Определим обратный вызов ModelCheckpoint model_cnn_save_path = './/model/best_model_cnn_negative.keras' checkpoint_callback_cnn = ModelCheckpoint(model_cnn_save_path, - monitor='val_accuracy', - save_best_only=True, - verbose=1) + monitor='val_accuracy', + save_best_only=True, + verbose=1) # Обучение модели history_cnn = model_cnn.fit(x_train, @@ -121,31 +77,4 @@ plt.plot(history_cnn.history['val_accuracy'], plt.xlabel('Эпоха обучения') plt.ylabel('Доля верных ответов') plt.legend() -plt.show() - -# Загрузка модели -model_cnn = load_model('.//model/best_model_cnn_negative.keras') - -# Пример текста отзыва пользователя -user_review = "Не люблю пьяных людей, когда они рядом, всегда будет что то плохое" - -# Подготовка отзыва пользователя -filtered_tokens = remove_stopwords_and_punctuation([user_review]) -sequences = tokenizer.texts_to_sequences(filtered_tokens) -x_user = pad_sequences(sequences, maxlen=max_reviews_len) - -# Получение вероятности принадлежности отзыва пользователя к разным классам -predicted_probabilities = model_cnn.predict(x_user) - -# Вывод вероятностей с названиями классов -for class_name, prob in zip(class_names, predicted_probabilities[0]): - print(f"Вероятность отзыва относится к классу '{class_name}': {prob}") - -# Сохраняем названия классов в текстовый файл -with open('.//class/class_names_cnn_negative.txt', 'w', encoding='utf-8') as file: - for class_name in class_names: - file.write(f"{class_name}\n") - -# saving -with open('../tokenization/tokenizer_cnn_negative.pickle', 'wb') as handle: - 
pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL) +plt.savefig('.//graphics/history_cnn_negative.png') diff --git a/neural_network/create_cnn/create_model_cnn_positive.py b/neural_network/create_cnn/create_model_cnn_positive.py index 2e82ed6..feaaf1c 100644 --- a/neural_network/create_cnn/create_model_cnn_positive.py +++ b/neural_network/create_cnn/create_model_cnn_positive.py @@ -1,19 +1,12 @@ +import pickle import pandas as pd from keras import Sequential from keras.src.callbacks import ModelCheckpoint -from keras.src.legacy.preprocessing.text import Tokenizer -from keras.src.saving import load_model from keras.src.utils import pad_sequences from matplotlib import pyplot as plt -from tensorflow.keras.layers import Dense, Embedding, GlobalMaxPooling1D, Conv1D -from sklearn.preprocessing import LabelEncoder +from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D from tensorflow.keras import utils -from nltk import word_tokenize, SnowballStemmer -from nltk.corpus import stopwords -import string -import pickle - -from pymystem3 import Mystem +from sklearn.preprocessing import LabelEncoder # Максимальное количество слов num_words = 10000 @@ -22,72 +15,37 @@ max_reviews_len = 90 # Количество классов отзыва nb_classes = 10 -def remove_stopwords_and_punctuation(reviews): - stop_words = set(stopwords.words('russian')) - punctuation = set(string.punctuation) - filtered_tokens = [] +# Загрузка токенизатора +with open('..//tokenization/tokenizer_positive.pickle', 'rb') as handle: + tokenizer = pickle.load(handle) - # Удаление стоп слов и пунктуаций - for review in reviews: - words = word_tokenize(review) - filtered_words = [word for word in words if - word.lower() not in stop_words and '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation] - filtered_tokens.extend(filtered_words) - - return filtered_tokens - -def lemmatize_token(filtered_tokens): - mystem = Mystem() - lemmatized_tokens = [] - for token in filtered_tokens: - lemmatized = mystem.lemmatize(token)[0] - lemmatized_tokens.append(lemmatized) - print(lemmatized) - return lemmatized_tokens - -def stemmer_token(filtered_tokens): - stemmer = SnowballStemmer("russian") - stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens] - return stemmer_tokens +# Загрузка названий классов +with open('..//class/class_names_positive.txt', 'r', encoding='utf-8') as file: + class_names = [line.strip() for line in file.readlines()] # Данные -train = pd.read_csv('..//dataset/filtered/filtered_dataset_positive.csv') +train = pd.read_csv('../dataset/filtered/filtered_dataset_positive.csv') train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True) reviews = train['text'] print("Набор данных готов") -filtered_tokens = remove_stopwords_and_punctuation(reviews) -print(filtered_tokens[:10]) - -# lemmatized_tokens = lemmatize_token(filtered_tokens) -# print(lemmatized_tokens[:10]) - -# stemmer_tokens = stemmer_token(filtered_tokens) -# print(stemmer_tokens[:10]) - +# Кодирование категорий label_encoder = LabelEncoder() -train['rubrics'] = label_encoder.fit_transform(train['rubrics']) - -# Сохраняем названия классов -class_names = label_encoder.classes_ - -y_train = utils.to_categorical(train['rubrics'], nb_classes) - -tokenizer = Tokenizer(num_words=num_words) -tokenizer.fit_on_texts(filtered_tokens) +label_encoder.fit(class_names) +encoded_labels = label_encoder.transform(train['rubrics']) +y_train = utils.to_categorical(encoded_labels, nb_classes) sequences = 
tokenizer.texts_to_sequences(reviews) - x_train = pad_sequences(sequences, maxlen=max_reviews_len) print('начинается создание модели') # Построение модели model_cnn = Sequential() -model_cnn.add(Embedding(num_words, 128, input_length=max_reviews_len)) -model_cnn.add(Conv1D(128, 5, activation='relu')) +model_cnn.add(Embedding(num_words, 256, input_length=max_reviews_len)) +model_cnn.add(Conv1D(256, 5, activation='relu')) model_cnn.add(GlobalMaxPooling1D()) -model_cnn.add(Dense(64, activation='relu')) +model_cnn.add(Dense(128, activation='relu')) model_cnn.add(Dense(nb_classes, activation='softmax')) @@ -120,31 +78,4 @@ plt.plot(history_cnn.history['val_accuracy'], plt.xlabel('Эпоха обучения') plt.ylabel('Доля верных ответов') plt.legend() -plt.show() - -# Загрузка модели -model_cnn = load_model('.//model/best_model_cnn_positive.keras') - -# Пример текста отзыва пользователя -user_review = "Ой я так люблю вкусно кушать! Я бы хотел сьесть все самое вкусное в этом мире! Пончики, пиццу, вообще все все все!" - -# Подготовка отзыва пользователя -filtered_tokens = remove_stopwords_and_punctuation([user_review]) -sequences = tokenizer.texts_to_sequences(filtered_tokens) -x_user = pad_sequences(sequences, maxlen=max_reviews_len) - -# Получение вероятности принадлежности отзыва пользователя к разным классам -predicted_probabilities = model_cnn.predict(x_user) - -# Вывод вероятностей с названиями классов -for class_name, prob in zip(class_names, predicted_probabilities[0]): - print(f"Вероятность отзыва относится к классу '{class_name}': {prob}") - -# Сохраняем названия классов в текстовый файл -with open('.//class/class_names_cnn_positive.txt', 'w', encoding='utf-8') as file: - for class_name in class_names: - file.write(f"{class_name}\n") - -# saving -with open('../tokenization/tokenizer_cnn_positive.pickle', 'wb') as handle: - pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL) \ No newline at end of file +plt.savefig('.//graphics/history_cnn_positive.png') diff --git a/neural_network/create_cnn/graphics/.gitkeep b/neural_network/create_cnn/graphics/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/neural_network/create_gru/create_model_gru_negative.py b/neural_network/create_gru/create_model_gru_negative.py index 6219e63..38e1c46 100644 --- a/neural_network/create_gru/create_model_gru_negative.py +++ b/neural_network/create_gru/create_model_gru_negative.py @@ -1,20 +1,12 @@ import pickle - import pandas as pd from keras import Sequential from keras.src.callbacks import ModelCheckpoint -from keras.src.legacy.preprocessing.text import Tokenizer -from keras.src.saving import load_model from keras.src.utils import pad_sequences from matplotlib import pyplot as plt -from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM, SpatialDropout1D -from sklearn.preprocessing import LabelEncoder +from tensorflow.keras.layers import Dense, Embedding, Dropout, GRU, Bidirectional from tensorflow.keras import utils -from nltk import word_tokenize, SnowballStemmer -from nltk.corpus import stopwords -import string - -from pymystem3 import Mystem +from sklearn.preprocessing import LabelEncoder # Максимальное количество слов num_words = 10000 @@ -23,33 +15,13 @@ max_reviews_len = 90 # Количество классов отзыва nb_classes = 10 -def remove_stopwords_and_punctuation(reviews): - stop_words = set(stopwords.words('russian')) - punctuation = set(string.punctuation) - filtered_tokens = [] +# Загрузка токенизатора +with open('..//tokenization/tokenizer_negative.pickle', 'rb') as handle: + 
tokenizer = pickle.load(handle) - # Удаление стоп слов и пунктуаций - for review in reviews: - words = word_tokenize(review) - filtered_words = [word for word in words if - word.lower() not in stop_words and '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation] - filtered_tokens.extend(filtered_words) - - return filtered_tokens - -def lemmatize_token(filtered_tokens): - mystem = Mystem() - lemmatized_tokens = [] - for token in filtered_tokens: - lemmatized = mystem.lemmatize(token)[0] - lemmatized_tokens.append(lemmatized) - print(lemmatized) - return lemmatized_tokens - -def stemmer_token(filtered_tokens): - stemmer = SnowballStemmer("russian") - stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens] - return stemmer_tokens +# Загрузка названий классов +with open('..//class/class_names_negative.txt', 'r', encoding='utf-8') as file: + class_names = [line.strip() for line in file.readlines()] # Данные train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv') @@ -57,93 +29,52 @@ train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True) reviews = train['text'] print("Набор данных готов") -filtered_tokens = remove_stopwords_and_punctuation(reviews) -print(filtered_tokens[:10]) - -# lemmatized_tokens = lemmatize_token(filtered_tokens) -# print(lemmatized_tokens[:10]) - -# stemmer_tokens = stemmer_token(filtered_tokens) -# print(stemmer_tokens[:10]) - +# Кодирование категорий label_encoder = LabelEncoder() -train['rubrics'] = label_encoder.fit_transform(train['rubrics']) - -# Сохраняем названия классов -class_names = label_encoder.classes_ - -y_train = utils.to_categorical(train['rubrics'], nb_classes) - -tokenizer = Tokenizer(num_words=num_words) -tokenizer.fit_on_texts(filtered_tokens) +label_encoder.fit(class_names) +encoded_labels = label_encoder.transform(train['rubrics']) +y_train = utils.to_categorical(encoded_labels, nb_classes) sequences = tokenizer.texts_to_sequences(reviews) - x_train = pad_sequences(sequences, maxlen=max_reviews_len) print('начинается создание модели') # Построение модели -model_lstm = Sequential() -model_lstm.add(Embedding(num_words, 128, input_length=max_reviews_len)) -model_lstm.add(SpatialDropout1D(0.2)) -model_lstm.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2)) -model_lstm.add(Dense(nb_classes, activation='softmax')) +model_gru = Sequential() +model_gru.add(Embedding(num_words, 128, input_length=max_reviews_len)) +model_gru.add(GRU(128, return_sequences=True)) +model_gru.add(Dropout(0.25)) +model_gru.add(GRU(64)) +model_gru.add(Dense(nb_classes, activation='softmax')) -model_lstm.compile(optimizer='adam', - loss='categorical_crossentropy', - metrics=['accuracy']) +model_gru.compile(optimizer='adam', + loss='categorical_crossentropy', + metrics=['accuracy']) -model_lstm.summary() +model_gru.summary() # Определим обратный вызов ModelCheckpoint -model_lstm_save_path = './/model/best_model_lstm_negative.keras' +model_lstm_save_path = './/model/best_model_gru_negative.keras' checkpoint_callback_gru = ModelCheckpoint(model_lstm_save_path, monitor='val_accuracy', save_best_only=True, verbose=1) # Обучение модели -history_lstm = model_lstm.fit(x_train, - y_train, - epochs=10, - batch_size=128, - validation_split=0.1, - callbacks=[checkpoint_callback_gru]) +history_gru = model_gru.fit(x_train, + y_train, + epochs=10, + batch_size=128, + validation_split=0.1, + callbacks=[checkpoint_callback_gru]) # Графики -plt.plot(history_lstm.history['accuracy'], +plt.plot(history_gru.history['accuracy'], label='Доля 
верных ответов на обучающем наборе') -plt.plot(history_lstm.history['val_accuracy'], +plt.plot(history_gru.history['val_accuracy'], label='Доля верных ответов на проверочном наборе') plt.xlabel('Эпоха обучения') plt.ylabel('Доля верных ответов') plt.legend() -plt.show() - -# Загрузка модели -model_lstm = load_model('.//model/best_model_lstm_negative.keras') - -# Пример текста отзыва пользователя -user_review = "Не люблю пьяных людей, когда они рядом, всегда будет что то плохое" - -# Подготовка отзыва пользователя -filtered_tokens = remove_stopwords_and_punctuation([user_review]) -sequences = tokenizer.texts_to_sequences(filtered_tokens) -x_user = pad_sequences(sequences, maxlen=max_reviews_len) - -# Получение вероятности принадлежности отзыва пользователя к разным классам -predicted_probabilities = model_lstm.predict(x_user) - -# Вывод вероятностей с названиями классов -for class_name, prob in zip(class_names, predicted_probabilities[0]): - print(f"Вероятность отзыва относится к классу '{class_name}': {prob}") - -# Сохраняем названия классов в текстовый файл -with open('.//class/class_names_lstm_negative.txt', 'w', encoding='utf-8') as file: - for class_name in class_names: - file.write(f"{class_name}\n") - -# saving -with open('.//tokenization/tokenizer_lstm_lstm_negative.pickle', 'wb') as handle: - pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL) +plt.savefig('.//graphics/history_gru_negative.png') diff --git a/neural_network/create_gru/create_model_gru_positive.py b/neural_network/create_gru/create_model_gru_positive.py index edb008a..b424eac 100644 --- a/neural_network/create_gru/create_model_gru_positive.py +++ b/neural_network/create_gru/create_model_gru_positive.py @@ -1,19 +1,12 @@ +import pickle import pandas as pd from keras import Sequential from keras.src.callbacks import ModelCheckpoint -from keras.src.legacy.preprocessing.text import Tokenizer -from keras.src.saving import load_model from keras.src.utils import pad_sequences from matplotlib import pyplot as plt -from tensorflow.keras.layers import Dense, Embedding, Dropout, GRU, SpatialDropout1D -from sklearn.preprocessing import LabelEncoder +from tensorflow.keras.layers import Dense, Embedding, GRU, Dropout from tensorflow.keras import utils -from nltk import word_tokenize, SnowballStemmer -from nltk.corpus import stopwords -import string -import pickle - -from pymystem3 import Mystem +from sklearn.preprocessing import LabelEncoder # Максимальное количество слов num_words = 10000 @@ -22,62 +15,27 @@ max_reviews_len = 90 # Количество классов отзыва nb_classes = 10 -def remove_stopwords_and_punctuation(reviews): - stop_words = set(stopwords.words('russian')) - punctuation = set(string.punctuation) - filtered_tokens = [] +# Загрузка токенизатора +with open('..//tokenization/tokenizer_positive.pickle', 'rb') as handle: + tokenizer = pickle.load(handle) - # Удаление стоп слов и пунктуаций - for review in reviews: - words = word_tokenize(review) - filtered_words = [word for word in words if - word.lower() not in stop_words and '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation] - filtered_tokens.extend(filtered_words) - - return filtered_tokens - -def lemmatize_token(filtered_tokens): - mystem = Mystem() - lemmatized_tokens = [] - for token in filtered_tokens: - lemmatized = mystem.lemmatize(token)[0] - lemmatized_tokens.append(lemmatized) - print(lemmatized) - return lemmatized_tokens - -def stemmer_token(filtered_tokens): - stemmer = SnowballStemmer("russian") - stemmer_tokens = 
[stemmer.stem(word) for word in filtered_tokens] - return stemmer_tokens +# Загрузка названий классов +with open('..//class/class_names_positive.txt', 'r', encoding='utf-8') as file: + class_names = [line.strip() for line in file.readlines()] # Данные -train = pd.read_csv('..//dataset/filtered/filtered_dataset_positive.csv') +train = pd.read_csv('../dataset/filtered/filtered_dataset_positive.csv') train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True) reviews = train['text'] print("Набор данных готов") -filtered_tokens = remove_stopwords_and_punctuation(reviews) -print(filtered_tokens[:10]) - -# lemmatized_tokens = lemmatize_token(filtered_tokens) -# print(lemmatized_tokens[:10]) - -# stemmer_tokens = stemmer_token(filtered_tokens) -# print(stemmer_tokens[:10]) - +# Кодирование категорий label_encoder = LabelEncoder() -train['rubrics'] = label_encoder.fit_transform(train['rubrics']) - -# Сохраняем названия классов -class_names = label_encoder.classes_ - -y_train = utils.to_categorical(train['rubrics'], nb_classes) - -tokenizer = Tokenizer(num_words=num_words) -tokenizer.fit_on_texts(filtered_tokens) +label_encoder.fit(class_names) +encoded_labels = label_encoder.transform(train['rubrics']) +y_train = utils.to_categorical(encoded_labels, nb_classes) sequences = tokenizer.texts_to_sequences(reviews) - x_train = pad_sequences(sequences, maxlen=max_reviews_len) print('начинается создание модели') @@ -85,8 +43,9 @@ print('начинается создание модели') # Построение модели model_gru = Sequential() model_gru.add(Embedding(num_words, 256, input_length=max_reviews_len)) -model_gru.add(SpatialDropout1D(0.2)) -model_gru.add(GRU(128, dropout=0.2, recurrent_dropout=0.2)) +model_gru.add(GRU(256, return_sequences=True)) +model_gru.add(Dropout(0.25)) +model_gru.add(GRU(128)) model_gru.add(Dense(nb_classes, activation='softmax')) model_gru.compile(optimizer='adam', @@ -118,31 +77,4 @@ plt.plot(history_gru.history['val_accuracy'], plt.xlabel('Эпоха обучения') plt.ylabel('Доля верных ответов') plt.legend() -plt.show() - -# Загрузка модели -model_gru = load_model('.//model/best_model_gru_positive.keras') - -# Пример текста отзыва пользователя -user_review = "Ой я так люблю вкусно кушать! Я бы хотел сьесть все самое вкусное в этом мире! Пончики, пиццу, вообще все все все!" 
- -# Подготовка отзыва пользователя -filtered_tokens = remove_stopwords_and_punctuation([user_review]) -sequences = tokenizer.texts_to_sequences(filtered_tokens) -x_user = pad_sequences(sequences, maxlen=max_reviews_len) - -# Получение вероятности принадлежности отзыва пользователя к разным классам -predicted_probabilities = model_gru.predict(x_user) - -# Вывод вероятностей с названиями классов -for class_name, prob in zip(class_names, predicted_probabilities[0]): - print(f"Вероятность отзыва относится к классу '{class_name}': {prob}") - -# Сохраняем названия классов в текстовый файл -with open('.//class/class_names_gru_positive.txt', 'w', encoding='utf-8') as file: - for class_name in class_names: - file.write(f"{class_name}\n") - -# saving -with open('.//tokenization/tokenizer_gru_positive.pickle', 'wb') as handle: - pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL) \ No newline at end of file +plt.savefig('.//graphics/history_gru_positive.png') \ No newline at end of file diff --git a/neural_network/create_gru/graphics/.gitkeep b/neural_network/create_gru/graphics/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/neural_network/create_lstm/create_model_lstm_negative.py b/neural_network/create_lstm/create_model_lstm_negative.py index d3e7a49..5de3dbe 100644 --- a/neural_network/create_lstm/create_model_lstm_negative.py +++ b/neural_network/create_lstm/create_model_lstm_negative.py @@ -1,20 +1,12 @@ import pickle - import pandas as pd from keras import Sequential from keras.src.callbacks import ModelCheckpoint -from keras.src.legacy.preprocessing.text import Tokenizer -from keras.src.saving import load_model from keras.src.utils import pad_sequences from matplotlib import pyplot as plt -from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM, SpatialDropout1D -from sklearn.preprocessing import LabelEncoder +from tensorflow.keras.layers import Dense, Embedding, SpatialDropout1D, LSTM from tensorflow.keras import utils -from nltk import word_tokenize, SnowballStemmer -from nltk.corpus import stopwords -import string - -from pymystem3 import Mystem +from sklearn.preprocessing import LabelEncoder # Максимальное количество слов num_words = 10000 @@ -23,33 +15,13 @@ max_reviews_len = 90 # Количество классов отзыва nb_classes = 10 -def remove_stopwords_and_punctuation(reviews): - stop_words = set(stopwords.words('russian')) - punctuation = set(string.punctuation) - filtered_tokens = [] +# Загрузка токенизатора +with open('..//tokenization/tokenizer_negative.pickle', 'rb') as handle: + tokenizer = pickle.load(handle) - # Удаление стоп слов и пунктуаций - for review in reviews: - words = word_tokenize(review) - filtered_words = [word for word in words if - word.lower() not in stop_words and '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation] - filtered_tokens.extend(filtered_words) - - return filtered_tokens - -def lemmatize_token(filtered_tokens): - mystem = Mystem() - lemmatized_tokens = [] - for token in filtered_tokens: - lemmatized = mystem.lemmatize(token)[0] - lemmatized_tokens.append(lemmatized) - print(lemmatized) - return lemmatized_tokens - -def stemmer_token(filtered_tokens): - stemmer = SnowballStemmer("russian") - stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens] - return stemmer_tokens +# Загрузка названий классов +with open('..//class/class_names_negative.txt', 'r', encoding='utf-8') as file: + class_names = [line.strip() for line in file.readlines()] # Данные train = 
pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv') @@ -57,28 +29,13 @@ train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True) reviews = train['text'] print("Набор данных готов") -filtered_tokens = remove_stopwords_and_punctuation(reviews) -print(filtered_tokens[:10]) - -# lemmatized_tokens = lemmatize_token(filtered_tokens) -# print(lemmatized_tokens[:10]) - -# stemmer_tokens = stemmer_token(filtered_tokens) -# print(stemmer_tokens[:10]) - +# Кодирование категорий label_encoder = LabelEncoder() -train['rubrics'] = label_encoder.fit_transform(train['rubrics']) - -# Сохраняем названия классов -class_names = label_encoder.classes_ - -y_train = utils.to_categorical(train['rubrics'], nb_classes) - -tokenizer = Tokenizer(num_words=num_words) -tokenizer.fit_on_texts(filtered_tokens) +label_encoder.fit(class_names) +encoded_labels = label_encoder.transform(train['rubrics']) +y_train = utils.to_categorical(encoded_labels, nb_classes) sequences = tokenizer.texts_to_sequences(reviews) - x_train = pad_sequences(sequences, maxlen=max_reviews_len) print('начинается создание модели') @@ -119,31 +76,4 @@ plt.plot(history_lstm.history['val_accuracy'], plt.xlabel('Эпоха обучения') plt.ylabel('Доля верных ответов') plt.legend() -plt.show() - -# Загрузка модели -model_lstm = load_model('.//model/best_model_lstm_negative.keras') - -# Пример текста отзыва пользователя -user_review = "Не люблю пьяных людей, когда они рядом, всегда будет что то плохое" - -# Подготовка отзыва пользователя -filtered_tokens = remove_stopwords_and_punctuation([user_review]) -sequences = tokenizer.texts_to_sequences(filtered_tokens) -x_user = pad_sequences(sequences, maxlen=max_reviews_len) - -# Получение вероятности принадлежности отзыва пользователя к разным классам -predicted_probabilities = model_lstm.predict(x_user) - -# Вывод вероятностей с названиями классов -for class_name, prob in zip(class_names, predicted_probabilities[0]): - print(f"Вероятность отзыва относится к классу '{class_name}': {prob}") - -# Сохраняем названия классов в текстовый файл -with open('.//class/class_names_lstm_negative.txt', 'w', encoding='utf-8') as file: - for class_name in class_names: - file.write(f"{class_name}\n") - -# saving -with open('.//tokenization/tokenizer_lstm_negative.pickle', 'wb') as handle: - pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL) +plt.savefig('.//graphics/history_lstm_negative.png') diff --git a/neural_network/create_lstm/create_model_lstm_positive.py b/neural_network/create_lstm/create_model_lstm_positive.py index 3297c2f..5580065 100644 --- a/neural_network/create_lstm/create_model_lstm_positive.py +++ b/neural_network/create_lstm/create_model_lstm_positive.py @@ -1,19 +1,12 @@ +import pickle import pandas as pd from keras import Sequential from keras.src.callbacks import ModelCheckpoint -from keras.src.legacy.preprocessing.text import Tokenizer -from keras.src.saving import load_model from keras.src.utils import pad_sequences from matplotlib import pyplot as plt -from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM, SpatialDropout1D -from sklearn.preprocessing import LabelEncoder +from tensorflow.keras.layers import Dense, Embedding, SpatialDropout1D, LSTM from tensorflow.keras import utils -from nltk import word_tokenize, SnowballStemmer -from nltk.corpus import stopwords -import string -import pickle - -from pymystem3 import Mystem +from sklearn.preprocessing import LabelEncoder # Максимальное количество слов num_words = 10000 @@ -22,62 +15,27 @@ max_reviews_len = 
90 # Количество классов отзыва nb_classes = 10 -def remove_stopwords_and_punctuation(reviews): - stop_words = set(stopwords.words('russian')) - punctuation = set(string.punctuation) - filtered_tokens = [] +# Загрузка токенизатора +with open('..//tokenization/tokenizer_positive.pickle', 'rb') as handle: + tokenizer = pickle.load(handle) - # Удаление стоп слов и пунктуаций - for review in reviews: - words = word_tokenize(review) - filtered_words = [word for word in words if - word.lower() not in stop_words and '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation] - filtered_tokens.extend(filtered_words) - - return filtered_tokens - -def lemmatize_token(filtered_tokens): - mystem = Mystem() - lemmatized_tokens = [] - for token in filtered_tokens: - lemmatized = mystem.lemmatize(token)[0] - lemmatized_tokens.append(lemmatized) - print(lemmatized) - return lemmatized_tokens - -def stemmer_token(filtered_tokens): - stemmer = SnowballStemmer("russian") - stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens] - return stemmer_tokens +# Загрузка названий классов +with open('..//class/class_names_positive.txt', 'r', encoding='utf-8') as file: + class_names = [line.strip() for line in file.readlines()] # Данные -train = pd.read_csv('..//dataset/filtered/filtered_dataset_positive.csv') +train = pd.read_csv('../dataset/filtered/filtered_dataset_positive.csv') train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True) reviews = train['text'] print("Набор данных готов") -filtered_tokens = remove_stopwords_and_punctuation(reviews) -print(filtered_tokens[:10]) - -# lemmatized_tokens = lemmatize_token(filtered_tokens) -# print(lemmatized_tokens[:10]) - -# stemmer_tokens = stemmer_token(filtered_tokens) -# print(stemmer_tokens[:10]) - +# Кодирование категорий label_encoder = LabelEncoder() -train['rubrics'] = label_encoder.fit_transform(train['rubrics']) - -# Сохраняем названия классов -class_names = label_encoder.classes_ - -y_train = utils.to_categorical(train['rubrics'], nb_classes) - -tokenizer = Tokenizer(num_words=num_words) -tokenizer.fit_on_texts(filtered_tokens) +label_encoder.fit(class_names) +encoded_labels = label_encoder.transform(train['rubrics']) +y_train = utils.to_categorical(encoded_labels, nb_classes) sequences = tokenizer.texts_to_sequences(reviews) - x_train = pad_sequences(sequences, maxlen=max_reviews_len) print('начинается создание модели') @@ -118,31 +76,4 @@ plt.plot(history_lstm.history['val_accuracy'], plt.xlabel('Эпоха обучения') plt.ylabel('Доля верных ответов') plt.legend() -plt.show() - -# Загрузка модели -model_lstm = load_model('.//model/best_model_lstm_positive.keras') - -# Пример текста отзыва пользователя -user_review = "Ой я так люблю вкусно кушать! Я бы хотел сьесть все самое вкусное в этом мире! Пончики, пиццу, вообще все все все!" 
- -# Подготовка отзыва пользователя -filtered_tokens = remove_stopwords_and_punctuation([user_review]) -sequences = tokenizer.texts_to_sequences(filtered_tokens) -x_user = pad_sequences(sequences, maxlen=max_reviews_len) - -# Получение вероятности принадлежности отзыва пользователя к разным классам -predicted_probabilities = model_lstm.predict(x_user) - -# Вывод вероятностей с названиями классов -for class_name, prob in zip(class_names, predicted_probabilities[0]): - print(f"Вероятность отзыва относится к классу '{class_name}': {prob}") - -# Сохраняем названия классов в текстовый файл -with open('.//class/class_names_lstm_positive.txt', 'w', encoding='utf-8') as file: - for class_name in class_names: - file.write(f"{class_name}\n") - -# saving -with open('.//tokenization/tokenizer_lstm_positive.pickle', 'wb') as handle: - pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL) \ No newline at end of file +plt.savefig('.//graphics/history_lstm_positive.png') \ No newline at end of file diff --git a/neural_network/create_lstm/graphics/.gitkeep b/neural_network/create_lstm/graphics/.gitkeep new file mode 100644 index 0000000..e69de29
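Note, not part of the commit: every file above loses the same copy-pasted
inline inference demo. If that smoke test is still wanted after the cleanup,
it can be kept as one standalone script assembled from the deleted lines,
since the training scripts still produce everything it needs (the shared
tokenizer pickles, the class-name lists, and the best-model checkpoints).
A minimal sketch under those assumptions follows; the script name, the
repo-root-relative paths, and the choice of the CNN checkpoint are
illustrative, not taken from the commit.

    # predict_review.py (hypothetical) — run a saved model on one review.
    # Paths assume the repository root as the working directory.
    import pickle

    from keras.src.saving import load_model
    from keras.src.utils import pad_sequences

    # Must match max_reviews_len in the training scripts
    max_reviews_len = 90

    # Shared tokenizer saved by the tokenization step
    with open('neural_network/tokenization/tokenizer_negative.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)

    # Class names written by the class/ scripts
    with open('neural_network/class/class_names_negative.txt', 'r', encoding='utf-8') as file:
        class_names = [line.strip() for line in file]

    # Best checkpoint written by ModelCheckpoint during training
    model = load_model('neural_network/create_cnn/model/best_model_cnn_negative.keras')

    user_review = "Не люблю пьяных людей, когда они рядом, всегда будет что то плохое"

    # Same preprocessing as training: integer sequences padded to a fixed length
    sequences = tokenizer.texts_to_sequences([user_review])
    x_user = pad_sequences(sequences, maxlen=max_reviews_len)

    # Probability of the review belonging to each class
    predicted_probabilities = model.predict(x_user)
    for class_name, prob in zip(class_names, predicted_probabilities[0]):
        print(f"Вероятность отзыва относится к классу '{class_name}': {prob}")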