Phew, removed the redundant code + fixed the GRU

maksim 2024-05-30 19:27:03 +04:00
parent de1c25bafa
commit c56263c386
10 changed files with 112 additions and 528 deletions
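
For orientation, the headline change is in the GRU scripts: the leftover LSTM stack is swapped for a stacked-GRU classifier. Below is a minimal sketch of that architecture, with the vocabulary size, sequence length and class count taken from the diff (num_words = 10000, max_reviews_len = 90, nb_classes = 10); it is an illustration of the new layer stack, not a verbatim copy of any one script.

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GRU

num_words = 10000       # vocabulary size used by the saved tokenizer
max_reviews_len = 90    # reviews are padded/truncated to this length
nb_classes = 10         # number of rubric classes

# First GRU returns the full sequence so the second GRU has timesteps to consume;
# the final softmax yields one probability per rubric class.
# (The scripts below additionally pass input_length=max_reviews_len to Embedding.)
model_gru = Sequential()
model_gru.add(Embedding(num_words, 128))
model_gru.add(GRU(128, return_sequences=True))
model_gru.add(Dropout(0.25))
model_gru.add(GRU(64))
model_gru.add(Dense(nb_classes, activation='softmax'))
model_gru.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

The first GRU must return sequences so the second GRU receives a timestep dimension; dropping return_sequences=True there is the usual cause of a shape error in a stack like this.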

View File

@@ -2,7 +2,7 @@ import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Данные
-train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
+train = pd.read_csv('../dataset/filtered/filtered_dataset_positive.csv')
label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])

View File

@@ -1,20 +1,12 @@
import pickle
import pandas as pd
from keras import Sequential
from keras.src.callbacks import ModelCheckpoint
from keras.src.legacy.preprocessing.text import Tokenizer
from keras.src.saving import load_model
from keras.src.utils import pad_sequences
from matplotlib import pyplot as plt
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import utils
from nltk import word_tokenize, SnowballStemmer
from nltk.corpus import stopwords
import string
from pymystem3 import Mystem
from sklearn.preprocessing import LabelEncoder
# Максимальное количество слов
num_words = 10000
@@ -23,33 +15,13 @@ max_reviews_len = 90
# Количество классов отзыва
nb_classes = 10
def remove_stopwords_and_punctuation(reviews):
stop_words = set(stopwords.words('russian'))
punctuation = set(string.punctuation)
filtered_tokens = []
# Загрузка токенизатора
with open('..//tokenization/tokenizer_negative.pickle', 'rb') as handle:
tokenizer = pickle.load(handle)
# Удаление стоп слов и пунктуаций
for review in reviews:
words = word_tokenize(review)
filtered_words = [word for word in words if
word.lower() not in stop_words and '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation]
filtered_tokens.extend(filtered_words)
return filtered_tokens
def lemmatize_token(filtered_tokens):
mystem = Mystem()
lemmatized_tokens = []
for token in filtered_tokens:
lemmatized = mystem.lemmatize(token)[0]
lemmatized_tokens.append(lemmatized)
print(lemmatized)
return lemmatized_tokens
def stemmer_token(filtered_tokens):
stemmer = SnowballStemmer("russian")
stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
return stemmer_tokens
# Загрузка названий классов
with open('..//class/class_names_negative.txt', 'r', encoding='utf-8') as file:
class_names = [line.strip() for line in file.readlines()]
# Данные
train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
@@ -57,28 +29,13 @@ train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
reviews = train['text']
print("Набор данных готов")
filtered_tokens = remove_stopwords_and_punctuation(reviews)
print(filtered_tokens[:10])
# lemmatized_tokens = lemmatize_token(filtered_tokens)
# print(lemmatized_tokens[:10])
# stemmer_tokens = stemmer_token(filtered_tokens)
# print(stemmer_tokens[:10])
# Кодирование категорий
label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
# Сохраняем названия классов
class_names = label_encoder.classes_
y_train = utils.to_categorical(train['rubrics'], nb_classes)
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(filtered_tokens)
label_encoder.fit(class_names)
encoded_labels = label_encoder.transform(train['rubrics'])
y_train = utils.to_categorical(encoded_labels, nb_classes)
sequences = tokenizer.texts_to_sequences(reviews)
x_train = pad_sequences(sequences, maxlen=max_reviews_len)
print('начинается создание модели')
@@ -91,7 +48,6 @@ model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(64, activation='relu'))
model_cnn.add(Dense(nb_classes, activation='softmax'))
model_cnn.compile(optimizer='adam',
loss='categorical_crossentropy',
metrics=['accuracy'])
@@ -101,9 +57,9 @@ model_cnn.summary()
# Определим обратный вызов ModelCheckpoint
model_cnn_save_path = './/model/best_model_cnn_negative.keras'
checkpoint_callback_cnn = ModelCheckpoint(model_cnn_save_path,
                                          monitor='val_accuracy',
                                          save_best_only=True,
                                          verbose=1)
# Обучение модели
history_cnn = model_cnn.fit(x_train,
@@ -121,31 +77,4 @@ plt.plot(history_cnn.history['val_accuracy'],
plt.xlabel('Эпоха обучения')
plt.ylabel('Доля верных ответов')
plt.legend()
plt.show()
# Загрузка модели
model_cnn = load_model('.//model/best_model_cnn_negative.keras')
# Пример текста отзыва пользователя
user_review = "Не люблю пьяных людей, когда они рядом, всегда будет что то плохое"
# Подготовка отзыва пользователя
filtered_tokens = remove_stopwords_and_punctuation([user_review])
sequences = tokenizer.texts_to_sequences(filtered_tokens)
x_user = pad_sequences(sequences, maxlen=max_reviews_len)
# Получение вероятности принадлежности отзыва пользователя к разным классам
predicted_probabilities = model_cnn.predict(x_user)
# Вывод вероятностей с названиями классов
for class_name, prob in zip(class_names, predicted_probabilities[0]):
print(f"Вероятность отзыва относится к классу '{class_name}': {prob}")
# Сохраняем названия классов в текстовый файл
with open('.//class/class_names_cnn_negative.txt', 'w', encoding='utf-8') as file:
for class_name in class_names:
file.write(f"{class_name}\n")
# saving
with open('../tokenization/tokenizer_cnn_negative.pickle', 'wb') as handle:
pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
plt.savefig('.//graphics/history_cnn_negative.png')
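
The training-side simplification repeated across the scripts below is the same: instead of re-fitting a Tokenizer and a LabelEncoder on every run, each script now loads a previously saved tokenizer and class-name list and only applies them. A condensed sketch of that flow, using the negative-review file paths exactly as they appear in this diff and assuming those artefacts already exist on disk:

import pickle

import pandas as pd
from keras.src.utils import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import utils

max_reviews_len = 90
nb_classes = 10

# Reuse the tokenizer fitted in an earlier run instead of re-fitting it here.
with open('..//tokenization/tokenizer_negative.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Saved class names, one per line; they fix the label ordering.
with open('..//class/class_names_negative.txt', 'r', encoding='utf-8') as file:
    class_names = [line.strip() for line in file]

train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)

# Assumes every rubric value in the CSV appears in class_names.
label_encoder = LabelEncoder()
label_encoder.fit(class_names)
encoded_labels = label_encoder.transform(train['rubrics'])
y_train = utils.to_categorical(encoded_labels, nb_classes)

sequences = tokenizer.texts_to_sequences(train['text'])
x_train = pad_sequences(sequences, maxlen=max_reviews_len)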

View File

@@ -1,19 +1,12 @@
import pickle
import pandas as pd
from keras import Sequential
from keras.src.callbacks import ModelCheckpoint
from keras.src.legacy.preprocessing.text import Tokenizer
from keras.src.saving import load_model
from keras.src.utils import pad_sequences
from matplotlib import pyplot as plt
from tensorflow.keras.layers import Dense, Embedding, GlobalMaxPooling1D, Conv1D
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D
from tensorflow.keras import utils
from nltk import word_tokenize, SnowballStemmer
from nltk.corpus import stopwords
import string
import pickle
from pymystem3 import Mystem
from sklearn.preprocessing import LabelEncoder
# Максимальное количество слов
num_words = 10000
@@ -22,72 +15,37 @@ max_reviews_len = 90
# Количество классов отзыва
nb_classes = 10
def remove_stopwords_and_punctuation(reviews):
stop_words = set(stopwords.words('russian'))
punctuation = set(string.punctuation)
filtered_tokens = []
# Загрузка токенизатора
with open('..//tokenization/tokenizer_positive.pickle', 'rb') as handle:
tokenizer = pickle.load(handle)
# Удаление стоп слов и пунктуаций
for review in reviews:
words = word_tokenize(review)
filtered_words = [word for word in words if
word.lower() not in stop_words and '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation]
filtered_tokens.extend(filtered_words)
return filtered_tokens
def lemmatize_token(filtered_tokens):
mystem = Mystem()
lemmatized_tokens = []
for token in filtered_tokens:
lemmatized = mystem.lemmatize(token)[0]
lemmatized_tokens.append(lemmatized)
print(lemmatized)
return lemmatized_tokens
def stemmer_token(filtered_tokens):
stemmer = SnowballStemmer("russian")
stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
return stemmer_tokens
# Загрузка названий классов
with open('..//class/class_names_positive.txt', 'r', encoding='utf-8') as file:
class_names = [line.strip() for line in file.readlines()]
# Данные
-train = pd.read_csv('..//dataset/filtered/filtered_dataset_positive.csv')
+train = pd.read_csv('../dataset/filtered/filtered_dataset_positive.csv')
train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
reviews = train['text']
print("Набор данных готов")
filtered_tokens = remove_stopwords_and_punctuation(reviews)
print(filtered_tokens[:10])
# lemmatized_tokens = lemmatize_token(filtered_tokens)
# print(lemmatized_tokens[:10])
# stemmer_tokens = stemmer_token(filtered_tokens)
# print(stemmer_tokens[:10])
# Кодирование категорий
label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
# Сохраняем названия классов
class_names = label_encoder.classes_
y_train = utils.to_categorical(train['rubrics'], nb_classes)
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(filtered_tokens)
label_encoder.fit(class_names)
encoded_labels = label_encoder.transform(train['rubrics'])
y_train = utils.to_categorical(encoded_labels, nb_classes)
sequences = tokenizer.texts_to_sequences(reviews)
x_train = pad_sequences(sequences, maxlen=max_reviews_len)
print('начинается создание модели')
# Построение модели
model_cnn = Sequential()
-model_cnn.add(Embedding(num_words, 128, input_length=max_reviews_len))
-model_cnn.add(Conv1D(128, 5, activation='relu'))
+model_cnn.add(Embedding(num_words, 256, input_length=max_reviews_len))
+model_cnn.add(Conv1D(256, 5, activation='relu'))
model_cnn.add(GlobalMaxPooling1D())
-model_cnn.add(Dense(64, activation='relu'))
+model_cnn.add(Dense(128, activation='relu'))
model_cnn.add(Dense(nb_classes, activation='softmax'))
@@ -120,31 +78,4 @@ plt.plot(history_cnn.history['val_accuracy'],
plt.xlabel('Эпоха обучения')
plt.ylabel('Доля верных ответов')
plt.legend()
plt.show()
# Загрузка модели
model_cnn = load_model('.//model/best_model_cnn_positive.keras')
# Пример текста отзыва пользователя
user_review = "Ой я так люблю вкусно кушать! Я бы хотел сьесть все самое вкусное в этом мире! Пончики, пиццу, вообще все все все!"
# Подготовка отзыва пользователя
filtered_tokens = remove_stopwords_and_punctuation([user_review])
sequences = tokenizer.texts_to_sequences(filtered_tokens)
x_user = pad_sequences(sequences, maxlen=max_reviews_len)
# Получение вероятности принадлежности отзыва пользователя к разным классам
predicted_probabilities = model_cnn.predict(x_user)
# Вывод вероятностей с названиями классов
for class_name, prob in zip(class_names, predicted_probabilities[0]):
print(f"Вероятность отзыва относится к классу '{class_name}': {prob}")
# Сохраняем названия классов в текстовый файл
with open('.//class/class_names_cnn_positive.txt', 'w', encoding='utf-8') as file:
for class_name in class_names:
file.write(f"{class_name}\n")
# saving
with open('../tokenization/tokenizer_cnn_positive.pickle', 'wb') as handle:
pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
plt.savefig('.//graphics/history_cnn_positive.png')

View File

@@ -1,20 +1,12 @@
import pickle
import pandas as pd
from keras import Sequential
from keras.src.callbacks import ModelCheckpoint
from keras.src.legacy.preprocessing.text import Tokenizer
from keras.src.saving import load_model
from keras.src.utils import pad_sequences
from matplotlib import pyplot as plt
from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM, SpatialDropout1D
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Embedding, Dropout, GRU, Bidirectional
from tensorflow.keras import utils
from nltk import word_tokenize, SnowballStemmer
from nltk.corpus import stopwords
import string
from pymystem3 import Mystem
from sklearn.preprocessing import LabelEncoder
# Максимальное количество слов
num_words = 10000
@@ -23,33 +15,13 @@ max_reviews_len = 90
# Количество классов отзыва
nb_classes = 10
def remove_stopwords_and_punctuation(reviews):
stop_words = set(stopwords.words('russian'))
punctuation = set(string.punctuation)
filtered_tokens = []
# Загрузка токенизатора
with open('..//tokenization/tokenizer_negative.pickle', 'rb') as handle:
tokenizer = pickle.load(handle)
# Удаление стоп слов и пунктуаций
for review in reviews:
words = word_tokenize(review)
filtered_words = [word for word in words if
word.lower() not in stop_words and '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation]
filtered_tokens.extend(filtered_words)
return filtered_tokens
def lemmatize_token(filtered_tokens):
mystem = Mystem()
lemmatized_tokens = []
for token in filtered_tokens:
lemmatized = mystem.lemmatize(token)[0]
lemmatized_tokens.append(lemmatized)
print(lemmatized)
return lemmatized_tokens
def stemmer_token(filtered_tokens):
stemmer = SnowballStemmer("russian")
stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
return stemmer_tokens
# Загрузка названий классов
with open('..//class/class_names_negative.txt', 'r', encoding='utf-8') as file:
class_names = [line.strip() for line in file.readlines()]
# Данные
train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
@@ -57,93 +29,52 @@ train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
reviews = train['text']
print("Набор данных готов")
filtered_tokens = remove_stopwords_and_punctuation(reviews)
print(filtered_tokens[:10])
# lemmatized_tokens = lemmatize_token(filtered_tokens)
# print(lemmatized_tokens[:10])
# stemmer_tokens = stemmer_token(filtered_tokens)
# print(stemmer_tokens[:10])
# Кодирование категорий
label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
# Сохраняем названия классов
class_names = label_encoder.classes_
y_train = utils.to_categorical(train['rubrics'], nb_classes)
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(filtered_tokens)
label_encoder.fit(class_names)
encoded_labels = label_encoder.transform(train['rubrics'])
y_train = utils.to_categorical(encoded_labels, nb_classes)
sequences = tokenizer.texts_to_sequences(reviews)
x_train = pad_sequences(sequences, maxlen=max_reviews_len)
print('начинается создание модели')
# Построение модели
-model_lstm = Sequential()
-model_lstm.add(Embedding(num_words, 128, input_length=max_reviews_len))
-model_lstm.add(SpatialDropout1D(0.2))
-model_lstm.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
-model_lstm.add(Dense(nb_classes, activation='softmax'))
+model_gru = Sequential()
+model_gru.add(Embedding(num_words, 128, input_length=max_reviews_len))
+model_gru.add(GRU(128, return_sequences=True))
+model_gru.add(Dropout(0.25))
+model_gru.add(GRU(64))
+model_gru.add(Dense(nb_classes, activation='softmax'))
-model_lstm.compile(optimizer='adam',
-                   loss='categorical_crossentropy',
-                   metrics=['accuracy'])
+model_gru.compile(optimizer='adam',
+                  loss='categorical_crossentropy',
+                  metrics=['accuracy'])
-model_lstm.summary()
+model_gru.summary()
# Определим обратный вызов ModelCheckpoint
-model_lstm_save_path = './/model/best_model_lstm_negative.keras'
+model_lstm_save_path = './/model/best_model_gru_negative.keras'
checkpoint_callback_gru = ModelCheckpoint(model_lstm_save_path,
monitor='val_accuracy',
save_best_only=True,
verbose=1)
# Обучение модели
-history_lstm = model_lstm.fit(x_train,
-                              y_train,
-                              epochs=10,
-                              batch_size=128,
-                              validation_split=0.1,
-                              callbacks=[checkpoint_callback_gru])
+history_gru = model_gru.fit(x_train,
+                            y_train,
+                            epochs=10,
+                            batch_size=128,
+                            validation_split=0.1,
+                            callbacks=[checkpoint_callback_gru])
# Графики
-plt.plot(history_lstm.history['accuracy'],
+plt.plot(history_gru.history['accuracy'],
label='Доля верных ответов на обучающем наборе')
-plt.plot(history_lstm.history['val_accuracy'],
+plt.plot(history_gru.history['val_accuracy'],
label='Доля верных ответов на проверочном наборе')
plt.xlabel('Эпоха обучения')
plt.ylabel('Доля верных ответов')
plt.legend()
plt.show()
# Загрузка модели
model_lstm = load_model('.//model/best_model_lstm_negative.keras')
# Пример текста отзыва пользователя
user_review = "Не люблю пьяных людей, когда они рядом, всегда будет что то плохое"
# Подготовка отзыва пользователя
filtered_tokens = remove_stopwords_and_punctuation([user_review])
sequences = tokenizer.texts_to_sequences(filtered_tokens)
x_user = pad_sequences(sequences, maxlen=max_reviews_len)
# Получение вероятности принадлежности отзыва пользователя к разным классам
predicted_probabilities = model_lstm.predict(x_user)
# Вывод вероятностей с названиями классов
for class_name, prob in zip(class_names, predicted_probabilities[0]):
print(f"Вероятность отзыва относится к классу '{class_name}': {prob}")
# Сохраняем названия классов в текстовый файл
with open('.//class/class_names_lstm_negative.txt', 'w', encoding='utf-8') as file:
for class_name in class_names:
file.write(f"{class_name}\n")
# saving
with open('.//tokenization/tokenizer_lstm_lstm_negative.pickle', 'wb') as handle:
pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
plt.savefig('.//graphics/history_gru_negative.png')

View File

@@ -1,19 +1,12 @@
import pickle
import pandas as pd
from keras import Sequential
from keras.src.callbacks import ModelCheckpoint
from keras.src.legacy.preprocessing.text import Tokenizer
from keras.src.saving import load_model
from keras.src.utils import pad_sequences
from matplotlib import pyplot as plt
from tensorflow.keras.layers import Dense, Embedding, Dropout, GRU, SpatialDropout1D
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Embedding, GRU, Dropout
from tensorflow.keras import utils
from nltk import word_tokenize, SnowballStemmer
from nltk.corpus import stopwords
import string
import pickle
from pymystem3 import Mystem
from sklearn.preprocessing import LabelEncoder
# Максимальное количество слов
num_words = 10000
@@ -22,62 +15,27 @@ max_reviews_len = 90
# Количество классов отзыва
nb_classes = 10
def remove_stopwords_and_punctuation(reviews):
stop_words = set(stopwords.words('russian'))
punctuation = set(string.punctuation)
filtered_tokens = []
# Загрузка токенизатора
with open('..//tokenization/tokenizer_positive.pickle', 'rb') as handle:
tokenizer = pickle.load(handle)
# Удаление стоп слов и пунктуаций
for review in reviews:
words = word_tokenize(review)
filtered_words = [word for word in words if
word.lower() not in stop_words and '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation]
filtered_tokens.extend(filtered_words)
return filtered_tokens
def lemmatize_token(filtered_tokens):
mystem = Mystem()
lemmatized_tokens = []
for token in filtered_tokens:
lemmatized = mystem.lemmatize(token)[0]
lemmatized_tokens.append(lemmatized)
print(lemmatized)
return lemmatized_tokens
def stemmer_token(filtered_tokens):
stemmer = SnowballStemmer("russian")
stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
return stemmer_tokens
# Загрузка названий классов
with open('..//class/class_names_positive.txt', 'r', encoding='utf-8') as file:
class_names = [line.strip() for line in file.readlines()]
# Данные
-train = pd.read_csv('..//dataset/filtered/filtered_dataset_positive.csv')
+train = pd.read_csv('../dataset/filtered/filtered_dataset_positive.csv')
train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
reviews = train['text']
print("Набор данных готов")
filtered_tokens = remove_stopwords_and_punctuation(reviews)
print(filtered_tokens[:10])
# lemmatized_tokens = lemmatize_token(filtered_tokens)
# print(lemmatized_tokens[:10])
# stemmer_tokens = stemmer_token(filtered_tokens)
# print(stemmer_tokens[:10])
# Кодирование категорий
label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
# Сохраняем названия классов
class_names = label_encoder.classes_
y_train = utils.to_categorical(train['rubrics'], nb_classes)
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(filtered_tokens)
label_encoder.fit(class_names)
encoded_labels = label_encoder.transform(train['rubrics'])
y_train = utils.to_categorical(encoded_labels, nb_classes)
sequences = tokenizer.texts_to_sequences(reviews)
x_train = pad_sequences(sequences, maxlen=max_reviews_len)
print('начинается создание модели')
@@ -85,8 +43,9 @@ print('начинается создание модели')
# Построение модели
model_gru = Sequential()
model_gru.add(Embedding(num_words, 256, input_length=max_reviews_len))
-model_gru.add(SpatialDropout1D(0.2))
-model_gru.add(GRU(128, dropout=0.2, recurrent_dropout=0.2))
+model_gru.add(GRU(256, return_sequences=True))
+model_gru.add(Dropout(0.25))
+model_gru.add(GRU(128))
model_gru.add(Dense(nb_classes, activation='softmax'))
model_gru.compile(optimizer='adam',
@@ -118,31 +77,4 @@ plt.plot(history_gru.history['val_accuracy'],
plt.xlabel('Эпоха обучения')
plt.ylabel('Доля верных ответов')
plt.legend()
plt.show()
# Загрузка модели
model_gru = load_model('.//model/best_model_gru_positive.keras')
# Пример текста отзыва пользователя
user_review = "Ой я так люблю вкусно кушать! Я бы хотел сьесть все самое вкусное в этом мире! Пончики, пиццу, вообще все все все!"
# Подготовка отзыва пользователя
filtered_tokens = remove_stopwords_and_punctuation([user_review])
sequences = tokenizer.texts_to_sequences(filtered_tokens)
x_user = pad_sequences(sequences, maxlen=max_reviews_len)
# Получение вероятности принадлежности отзыва пользователя к разным классам
predicted_probabilities = model_gru.predict(x_user)
# Вывод вероятностей с названиями классов
for class_name, prob in zip(class_names, predicted_probabilities[0]):
print(f"Вероятность отзыва относится к классу '{class_name}': {prob}")
# Сохраняем названия классов в текстовый файл
with open('.//class/class_names_gru_positive.txt', 'w', encoding='utf-8') as file:
for class_name in class_names:
file.write(f"{class_name}\n")
# saving
with open('.//tokenization/tokenizer_gru_positive.pickle', 'wb') as handle:
pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
plt.savefig('.//graphics/history_gru_positive.png')

View File

@@ -1,20 +1,12 @@
import pickle
import pandas as pd
from keras import Sequential
from keras.src.callbacks import ModelCheckpoint
from keras.src.legacy.preprocessing.text import Tokenizer
from keras.src.saving import load_model
from keras.src.utils import pad_sequences
from matplotlib import pyplot as plt
from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM, SpatialDropout1D
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Embedding, SpatialDropout1D, LSTM
from tensorflow.keras import utils
from nltk import word_tokenize, SnowballStemmer
from nltk.corpus import stopwords
import string
from pymystem3 import Mystem
from sklearn.preprocessing import LabelEncoder
# Максимальное количество слов
num_words = 10000
@@ -23,33 +15,13 @@ max_reviews_len = 90
# Количество классов отзыва
nb_classes = 10
def remove_stopwords_and_punctuation(reviews):
stop_words = set(stopwords.words('russian'))
punctuation = set(string.punctuation)
filtered_tokens = []
# Загрузка токенизатора
with open('..//tokenization/tokenizer_negative.pickle', 'rb') as handle:
tokenizer = pickle.load(handle)
# Удаление стоп слов и пунктуаций
for review in reviews:
words = word_tokenize(review)
filtered_words = [word for word in words if
word.lower() not in stop_words and '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation]
filtered_tokens.extend(filtered_words)
return filtered_tokens
def lemmatize_token(filtered_tokens):
mystem = Mystem()
lemmatized_tokens = []
for token in filtered_tokens:
lemmatized = mystem.lemmatize(token)[0]
lemmatized_tokens.append(lemmatized)
print(lemmatized)
return lemmatized_tokens
def stemmer_token(filtered_tokens):
stemmer = SnowballStemmer("russian")
stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
return stemmer_tokens
# Загрузка названий классов
with open('..//class/class_names_negative.txt', 'r', encoding='utf-8') as file:
class_names = [line.strip() for line in file.readlines()]
# Данные
train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
@@ -57,28 +29,13 @@ train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
reviews = train['text']
print("Набор данных готов")
filtered_tokens = remove_stopwords_and_punctuation(reviews)
print(filtered_tokens[:10])
# lemmatized_tokens = lemmatize_token(filtered_tokens)
# print(lemmatized_tokens[:10])
# stemmer_tokens = stemmer_token(filtered_tokens)
# print(stemmer_tokens[:10])
# Кодирование категорий
label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
# Сохраняем названия классов
class_names = label_encoder.classes_
y_train = utils.to_categorical(train['rubrics'], nb_classes)
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(filtered_tokens)
label_encoder.fit(class_names)
encoded_labels = label_encoder.transform(train['rubrics'])
y_train = utils.to_categorical(encoded_labels, nb_classes)
sequences = tokenizer.texts_to_sequences(reviews)
x_train = pad_sequences(sequences, maxlen=max_reviews_len)
print('начинается создание модели')
@@ -119,31 +76,4 @@ plt.plot(history_lstm.history['val_accuracy'],
plt.xlabel('Эпоха обучения')
plt.ylabel('Доля верных ответов')
plt.legend()
plt.show()
# Загрузка модели
model_lstm = load_model('.//model/best_model_lstm_negative.keras')
# Пример текста отзыва пользователя
user_review = "Не люблю пьяных людей, когда они рядом, всегда будет что то плохое"
# Подготовка отзыва пользователя
filtered_tokens = remove_stopwords_and_punctuation([user_review])
sequences = tokenizer.texts_to_sequences(filtered_tokens)
x_user = pad_sequences(sequences, maxlen=max_reviews_len)
# Получение вероятности принадлежности отзыва пользователя к разным классам
predicted_probabilities = model_lstm.predict(x_user)
# Вывод вероятностей с названиями классов
for class_name, prob in zip(class_names, predicted_probabilities[0]):
print(f"Вероятность отзыва относится к классу '{class_name}': {prob}")
# Сохраняем названия классов в текстовый файл
with open('.//class/class_names_lstm_negative.txt', 'w', encoding='utf-8') as file:
for class_name in class_names:
file.write(f"{class_name}\n")
# saving
with open('.//tokenization/tokenizer_lstm_negative.pickle', 'wb') as handle:
pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
plt.savefig('.//graphics/history_lstm_negative.png')

View File

@@ -1,19 +1,12 @@
import pickle
import pandas as pd
from keras import Sequential
from keras.src.callbacks import ModelCheckpoint
from keras.src.legacy.preprocessing.text import Tokenizer
from keras.src.saving import load_model
from keras.src.utils import pad_sequences
from matplotlib import pyplot as plt
from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM, SpatialDropout1D
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Embedding, SpatialDropout1D, LSTM
from tensorflow.keras import utils
from nltk import word_tokenize, SnowballStemmer
from nltk.corpus import stopwords
import string
import pickle
from pymystem3 import Mystem
from sklearn.preprocessing import LabelEncoder
# Максимальное количество слов
num_words = 10000
@@ -22,62 +15,27 @@ max_reviews_len = 90
# Количество классов отзыва
nb_classes = 10
def remove_stopwords_and_punctuation(reviews):
stop_words = set(stopwords.words('russian'))
punctuation = set(string.punctuation)
filtered_tokens = []
# Загрузка токенизатора
with open('..//tokenization/tokenizer_positive.pickle', 'rb') as handle:
tokenizer = pickle.load(handle)
# Удаление стоп слов и пунктуаций
for review in reviews:
words = word_tokenize(review)
filtered_words = [word for word in words if
word.lower() not in stop_words and '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation]
filtered_tokens.extend(filtered_words)
return filtered_tokens
def lemmatize_token(filtered_tokens):
mystem = Mystem()
lemmatized_tokens = []
for token in filtered_tokens:
lemmatized = mystem.lemmatize(token)[0]
lemmatized_tokens.append(lemmatized)
print(lemmatized)
return lemmatized_tokens
def stemmer_token(filtered_tokens):
stemmer = SnowballStemmer("russian")
stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
return stemmer_tokens
# Загрузка названий классов
with open('..//class/class_names_positive.txt', 'r', encoding='utf-8') as file:
class_names = [line.strip() for line in file.readlines()]
# Данные
-train = pd.read_csv('..//dataset/filtered/filtered_dataset_positive.csv')
+train = pd.read_csv('../dataset/filtered/filtered_dataset_positive.csv')
train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
reviews = train['text']
print("Набор данных готов")
filtered_tokens = remove_stopwords_and_punctuation(reviews)
print(filtered_tokens[:10])
# lemmatized_tokens = lemmatize_token(filtered_tokens)
# print(lemmatized_tokens[:10])
# stemmer_tokens = stemmer_token(filtered_tokens)
# print(stemmer_tokens[:10])
# Кодирование категорий
label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
# Сохраняем названия классов
class_names = label_encoder.classes_
y_train = utils.to_categorical(train['rubrics'], nb_classes)
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(filtered_tokens)
label_encoder.fit(class_names)
encoded_labels = label_encoder.transform(train['rubrics'])
y_train = utils.to_categorical(encoded_labels, nb_classes)
sequences = tokenizer.texts_to_sequences(reviews)
x_train = pad_sequences(sequences, maxlen=max_reviews_len)
print('начинается создание модели')
@@ -118,31 +76,4 @@ plt.plot(history_lstm.history['val_accuracy'],
plt.xlabel('Эпоха обучения')
plt.ylabel('Доля верных ответов')
plt.legend()
plt.show()
# Загрузка модели
model_lstm = load_model('.//model/best_model_lstm_positive.keras')
# Пример текста отзыва пользователя
user_review = "Ой я так люблю вкусно кушать! Я бы хотел сьесть все самое вкусное в этом мире! Пончики, пиццу, вообще все все все!"
# Подготовка отзыва пользователя
filtered_tokens = remove_stopwords_and_punctuation([user_review])
sequences = tokenizer.texts_to_sequences(filtered_tokens)
x_user = pad_sequences(sequences, maxlen=max_reviews_len)
# Получение вероятности принадлежности отзыва пользователя к разным классам
predicted_probabilities = model_lstm.predict(x_user)
# Вывод вероятностей с названиями классов
for class_name, prob in zip(class_names, predicted_probabilities[0]):
print(f"Вероятность отзыва относится к классу '{class_name}': {prob}")
# Сохраняем названия классов в текстовый файл
with open('.//class/class_names_lstm_positive.txt', 'w', encoding='utf-8') as file:
for class_name in class_names:
file.write(f"{class_name}\n")
# saving
with open('.//tokenization/tokenizer_lstm_positive.pickle', 'wb') as handle:
pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
plt.savefig('.//graphics/history_lstm_positive.png')
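
Because the inline prediction demos are stripped out of the training scripts, the equivalent standalone inference flow would look roughly like the sketch below; the model, tokenizer and class-name paths are the negative-GRU ones from this diff, and the sample review is one of the strings used in the removed demos.

import pickle

from keras.src.saving import load_model
from keras.src.utils import pad_sequences

max_reviews_len = 90

# Artefacts produced by the training scripts above.
model = load_model('.//model/best_model_gru_negative.keras')
with open('..//tokenization/tokenizer_negative.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
with open('..//class/class_names_negative.txt', 'r', encoding='utf-8') as file:
    class_names = [line.strip() for line in file]

user_review = "Не люблю пьяных людей, когда они рядом, всегда будет что то плохое"

# Encode and pad the raw review text, then score it against every rubric class.
sequences = tokenizer.texts_to_sequences([user_review])
x_user = pad_sequences(sequences, maxlen=max_reviews_len)
predicted_probabilities = model.predict(x_user)

for class_name, prob in zip(class_names, predicted_probabilities[0]):
    print(f"P('{class_name}') = {prob:.3f}")

One deliberate difference: the removed demos passed the list of filtered tokens to texts_to_sequences, which produces one padded sequence per token; passing the whole review string keeps the review as a single sequence, which matches how the training data was encoded.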