Делаю нормальное раскидывание, осталось в модельках убрать лишний код, ну и какое-нибудь форматирование.
This commit is contained in:
parent
87246af738
commit
de1c25bafa
2
model.py
2
model.py
@ -8,7 +8,7 @@ from keras.src.utils import pad_sequences
|
||||
model = tf.keras.models.load_model('.//neural_network/create_lstm/model/best_model_lstm_negative.keras')
|
||||
|
||||
# Загрузка токенизатора
|
||||
with open('.//neural_network/create_lstm/tokenizer/tokenizer_lstm_lstm_negative.pickle', 'rb') as handle:
|
||||
with open('.//neural_network/create_lstm/tokenization/tokenizer_lstm_lstm_negative.pickle', 'rb') as handle:
|
||||
tokenizer = pickle.load(handle)
|
||||
|
||||
# Загрузка названий классов
|
||||
|
16
neural_network/class/class_negative.py
Normal file
16
neural_network/class/class_negative.py
Normal file
@ -0,0 +1,16 @@
|
||||
import pandas as pd
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
# Load the filtered negative-review data set
train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')

# Replace the rubric labels with their integer codes
label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])

# Class names as learned by the encoder (position == integer code)
class_names = label_encoder.classes_

# Write the class names to a text file, one name per line
with open('.//class_names_negative.txt', 'w', encoding='utf-8') as file:
    file.writelines(f"{class_name}\n" for class_name in class_names)
|
16
neural_network/class/class_positive.py
Normal file
16
neural_network/class/class_positive.py
Normal file
@ -0,0 +1,16 @@
|
||||
import pandas as pd
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
# Load the filtered positive-review data set.
# This script produces class_names_positive.txt, so it must read the
# positive data set (not the negative one).
train = pd.read_csv('../dataset/filtered/filtered_dataset_positive.csv')

# Replace the rubric labels with their integer codes
label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])

# Class names as learned by the encoder (position == integer code)
class_names = label_encoder.classes_

# Write the class names to a text file, one name per line
with open('.//class_names_positive.txt', 'w', encoding='utf-8') as file:
    for class_name in class_names:
        file.write(f"{class_name}\n")
|
151
neural_network/create_cnn/create_model_cnn_negative.py
Normal file
151
neural_network/create_cnn/create_model_cnn_negative.py
Normal file
@ -0,0 +1,151 @@
|
||||
import pickle
|
||||
|
||||
import pandas as pd
|
||||
from keras import Sequential
|
||||
from keras.src.callbacks import ModelCheckpoint
|
||||
from keras.src.legacy.preprocessing.text import Tokenizer
|
||||
from keras.src.saving import load_model
|
||||
from keras.src.utils import pad_sequences
|
||||
from matplotlib import pyplot as plt
|
||||
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from tensorflow.keras import utils
|
||||
from nltk import word_tokenize, SnowballStemmer
|
||||
from nltk.corpus import stopwords
|
||||
import string
|
||||
|
||||
from pymystem3 import Mystem
|
||||
|
||||
# Maximum number of words (passed to the Tokenizer and used as the Embedding input size)
num_words = 10000
# Maximum review length (used as maxlen for pad_sequences and as the Embedding input_length)
max_reviews_len = 90
# Number of review classes (size of the one-hot labels and of the output layer)
nb_classes = 10
|
||||
|
||||
def remove_stopwords_and_punctuation(reviews):
    """Tokenize every review and drop Russian stop words and punctuation.

    Args:
        reviews: iterable of review strings.

    Returns:
        One flat list with the surviving tokens of all reviews, in order.
        Review boundaries are not preserved.
    """
    stop_words = set(stopwords.words('russian'))
    # ASCII punctuation characters plus the extra junk tokens to discard.
    # '\\n1' is excluded through an explicit membership test: a bare string
    # literal inside the condition is always truthy and filters nothing.
    excluded = set(string.punctuation) | {"''", '«', '»', '\\n1'}

    filtered_tokens = []
    for review in reviews:
        filtered_tokens.extend(
            word for word in word_tokenize(review)
            if word.lower() not in stop_words and word not in excluded
        )

    return filtered_tokens
|
||||
|
||||
def lemmatize_token(filtered_tokens):
    """Lemmatize each token with Mystem.

    Args:
        filtered_tokens: iterable of word strings.

    Returns:
        List with, for every input token, the first element returned by
        ``Mystem.lemmatize``. Each lemma is also printed to stdout.
    """
    analyzer = Mystem()
    lemmas = []
    for word in filtered_tokens:
        lemma = analyzer.lemmatize(word)[0]
        lemmas.append(lemma)
        print(lemma)
    return lemmas
|
||||
|
||||
def stemmer_token(filtered_tokens):
    """Stem each token with NLTK's Russian Snowball stemmer.

    Args:
        filtered_tokens: iterable of word strings.

    Returns:
        List of stems, one per input token, in the same order.
    """
    stem = SnowballStemmer("russian").stem
    return list(map(stem, filtered_tokens))
|
||||
|
||||
# Data
train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
reviews = train['text']
print("Набор данных готов")

# Flat list of all corpus tokens; used only to build the tokenizer vocabulary
filtered_tokens = remove_stopwords_and_punctuation(reviews)
print(filtered_tokens[:10])

# lemmatized_tokens = lemmatize_token(filtered_tokens)
# print(lemmatized_tokens[:10])

# stemmer_tokens = stemmer_token(filtered_tokens)
# print(stemmer_tokens[:10])

label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])

# Keep the class names (position == integer label)
class_names = label_encoder.classes_

# NOTE(review): assumes the data set has at most nb_classes distinct rubrics - confirm
y_train = utils.to_categorical(train['rubrics'], nb_classes)

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(filtered_tokens)

sequences = tokenizer.texts_to_sequences(reviews)

x_train = pad_sequences(sequences, maxlen=max_reviews_len)

print('начинается создание модели')

# Build the model
model_cnn = Sequential()
model_cnn.add(Embedding(num_words, 128, input_length=max_reviews_len))
model_cnn.add(Conv1D(128, 5, activation='relu'))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(64, activation='relu'))
model_cnn.add(Dense(nb_classes, activation='softmax'))

model_cnn.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

model_cnn.summary()

# ModelCheckpoint callback: keep only the model with the best val_accuracy
model_cnn_save_path = './/model/best_model_cnn_negative.keras'
checkpoint_callback_cnn = ModelCheckpoint(model_cnn_save_path,
                                          monitor='val_accuracy',
                                          save_best_only=True,
                                          verbose=1)

# Train the model
history_cnn = model_cnn.fit(x_train,
                            y_train,
                            epochs=10,
                            batch_size=128,
                            validation_split=0.1,
                            callbacks=[checkpoint_callback_cnn])

# Plots
plt.plot(history_cnn.history['accuracy'],
         label='Доля верных ответов на обучающем наборе')
plt.plot(history_cnn.history['val_accuracy'],
         label='Доля верных ответов на проверочном наборе')
plt.xlabel('Эпоха обучения')
plt.ylabel('Доля верных ответов')
plt.legend()
plt.show()

# Load the best checkpoint written during training
model_cnn = load_model(model_cnn_save_path)

# Sample user review
user_review = "Не люблю пьяных людей, когда они рядом, всегда будет что то плохое"

# Prepare the user review. The filtered tokens are joined back into ONE text
# so the model receives a single sequence for the whole review; passing the
# token list itself would yield one single-word sequence per token, and only
# the first word's prediction would be reported below.
filtered_tokens = remove_stopwords_and_punctuation([user_review])
sequences = tokenizer.texts_to_sequences([' '.join(filtered_tokens)])
x_user = pad_sequences(sequences, maxlen=max_reviews_len)

# Probabilities of the user review belonging to each class
predicted_probabilities = model_cnn.predict(x_user)

# Print the probabilities together with the class names
for class_name, prob in zip(class_names, predicted_probabilities[0]):
    print(f"Вероятность отзыва относится к классу '{class_name}': {prob}")

# Save the class names to a text file
with open('.//class/class_names_cnn_negative.txt', 'w', encoding='utf-8') as file:
    for class_name in class_names:
        file.write(f"{class_name}\n")

# Save the fitted tokenizer
with open('../tokenization/tokenizer_cnn_negative.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
|
150
neural_network/create_cnn/create_model_cnn_positive.py
Normal file
150
neural_network/create_cnn/create_model_cnn_positive.py
Normal file
@ -0,0 +1,150 @@
|
||||
import pandas as pd
|
||||
from keras import Sequential
|
||||
from keras.src.callbacks import ModelCheckpoint
|
||||
from keras.src.legacy.preprocessing.text import Tokenizer
|
||||
from keras.src.saving import load_model
|
||||
from keras.src.utils import pad_sequences
|
||||
from matplotlib import pyplot as plt
|
||||
from tensorflow.keras.layers import Dense, Embedding, GlobalMaxPooling1D, Conv1D
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from tensorflow.keras import utils
|
||||
from nltk import word_tokenize, SnowballStemmer
|
||||
from nltk.corpus import stopwords
|
||||
import string
|
||||
import pickle
|
||||
|
||||
from pymystem3 import Mystem
|
||||
|
||||
# Maximum number of words (passed to the Tokenizer and used as the Embedding input size)
num_words = 10000
# Maximum review length (used as maxlen for pad_sequences and as the Embedding input_length)
max_reviews_len = 90
# Number of review classes (size of the one-hot labels and of the output layer)
nb_classes = 10
|
||||
|
||||
def remove_stopwords_and_punctuation(reviews):
    """Tokenize every review and drop Russian stop words and punctuation.

    Args:
        reviews: iterable of review strings.

    Returns:
        One flat list with the surviving tokens of all reviews, in order.
        Review boundaries are not preserved.
    """
    stop_words = set(stopwords.words('russian'))
    # ASCII punctuation characters plus the extra junk tokens to discard.
    # '\\n1' is excluded through an explicit membership test: a bare string
    # literal inside the condition is always truthy and filters nothing.
    excluded = set(string.punctuation) | {"''", '«', '»', '\\n1'}

    filtered_tokens = []
    for review in reviews:
        filtered_tokens.extend(
            word for word in word_tokenize(review)
            if word.lower() not in stop_words and word not in excluded
        )

    return filtered_tokens
|
||||
|
||||
def lemmatize_token(filtered_tokens):
    """Lemmatize each token with Mystem.

    Args:
        filtered_tokens: iterable of word strings.

    Returns:
        List with, for every input token, the first element returned by
        ``Mystem.lemmatize``. Each lemma is also printed to stdout.
    """
    analyzer = Mystem()
    lemmas = []
    for word in filtered_tokens:
        lemma = analyzer.lemmatize(word)[0]
        lemmas.append(lemma)
        print(lemma)
    return lemmas
|
||||
|
||||
def stemmer_token(filtered_tokens):
    """Stem each token with NLTK's Russian Snowball stemmer.

    Args:
        filtered_tokens: iterable of word strings.

    Returns:
        List of stems, one per input token, in the same order.
    """
    stem = SnowballStemmer("russian").stem
    return list(map(stem, filtered_tokens))
|
||||
|
||||
# Data
train = pd.read_csv('..//dataset/filtered/filtered_dataset_positive.csv')
train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
reviews = train['text']
print("Набор данных готов")

# Flat list of all corpus tokens; used only to build the tokenizer vocabulary
filtered_tokens = remove_stopwords_and_punctuation(reviews)
print(filtered_tokens[:10])

# lemmatized_tokens = lemmatize_token(filtered_tokens)
# print(lemmatized_tokens[:10])

# stemmer_tokens = stemmer_token(filtered_tokens)
# print(stemmer_tokens[:10])

label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])

# Keep the class names (position == integer label)
class_names = label_encoder.classes_

# NOTE(review): assumes the data set has at most nb_classes distinct rubrics - confirm
y_train = utils.to_categorical(train['rubrics'], nb_classes)

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(filtered_tokens)

sequences = tokenizer.texts_to_sequences(reviews)

x_train = pad_sequences(sequences, maxlen=max_reviews_len)

print('начинается создание модели')

# Build the model
model_cnn = Sequential()
model_cnn.add(Embedding(num_words, 128, input_length=max_reviews_len))
model_cnn.add(Conv1D(128, 5, activation='relu'))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(64, activation='relu'))
model_cnn.add(Dense(nb_classes, activation='softmax'))

model_cnn.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

model_cnn.summary()

# ModelCheckpoint callback: keep only the model with the best val_accuracy
model_cnn_save_path = './/model/best_model_cnn_positive.keras'
checkpoint_callback_cnn = ModelCheckpoint(model_cnn_save_path,
                                          monitor='val_accuracy',
                                          save_best_only=True,
                                          verbose=1)

# Train the model
history_cnn = model_cnn.fit(x_train,
                            y_train,
                            epochs=5,
                            batch_size=128,
                            validation_split=0.1,
                            callbacks=[checkpoint_callback_cnn])

# Plots
plt.plot(history_cnn.history['accuracy'],
         label='Доля верных ответов на обучающем наборе')
plt.plot(history_cnn.history['val_accuracy'],
         label='Доля верных ответов на проверочном наборе')
plt.xlabel('Эпоха обучения')
plt.ylabel('Доля верных ответов')
plt.legend()
plt.show()

# Load the best checkpoint written during training
model_cnn = load_model(model_cnn_save_path)

# Sample user review
user_review = "Ой я так люблю вкусно кушать! Я бы хотел сьесть все самое вкусное в этом мире! Пончики, пиццу, вообще все все все!"

# Prepare the user review. The filtered tokens are joined back into ONE text
# so the model receives a single sequence for the whole review; passing the
# token list itself would yield one single-word sequence per token, and only
# the first word's prediction would be reported below.
filtered_tokens = remove_stopwords_and_punctuation([user_review])
sequences = tokenizer.texts_to_sequences([' '.join(filtered_tokens)])
x_user = pad_sequences(sequences, maxlen=max_reviews_len)

# Probabilities of the user review belonging to each class
predicted_probabilities = model_cnn.predict(x_user)

# Print the probabilities together with the class names
for class_name, prob in zip(class_names, predicted_probabilities[0]):
    print(f"Вероятность отзыва относится к классу '{class_name}': {prob}")

# Save the class names to a text file
with open('.//class/class_names_cnn_positive.txt', 'w', encoding='utf-8') as file:
    for class_name in class_names:
        file.write(f"{class_name}\n")

# Save the fitted tokenizer
with open('../tokenization/tokenizer_cnn_positive.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
|
149
neural_network/create_gru/create_model_gru_negative.py
Normal file
149
neural_network/create_gru/create_model_gru_negative.py
Normal file
@ -0,0 +1,149 @@
|
||||
import pickle
|
||||
|
||||
import pandas as pd
|
||||
from keras import Sequential
|
||||
from keras.src.callbacks import ModelCheckpoint
|
||||
from keras.src.legacy.preprocessing.text import Tokenizer
|
||||
from keras.src.saving import load_model
|
||||
from keras.src.utils import pad_sequences
|
||||
from matplotlib import pyplot as plt
|
||||
from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM, SpatialDropout1D
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from tensorflow.keras import utils
|
||||
from nltk import word_tokenize, SnowballStemmer
|
||||
from nltk.corpus import stopwords
|
||||
import string
|
||||
|
||||
from pymystem3 import Mystem
|
||||
|
||||
# Maximum number of words (passed to the Tokenizer and used as the Embedding input size)
num_words = 10000
# Maximum review length (used as maxlen for pad_sequences and as the Embedding input_length)
max_reviews_len = 90
# Number of review classes (size of the one-hot labels and of the output layer)
nb_classes = 10
|
||||
|
||||
def remove_stopwords_and_punctuation(reviews):
    """Tokenize every review and drop Russian stop words and punctuation.

    Args:
        reviews: iterable of review strings.

    Returns:
        One flat list with the surviving tokens of all reviews, in order.
        Review boundaries are not preserved.
    """
    stop_words = set(stopwords.words('russian'))
    # ASCII punctuation characters plus the extra junk tokens to discard.
    # '\\n1' is excluded through an explicit membership test: a bare string
    # literal inside the condition is always truthy and filters nothing.
    excluded = set(string.punctuation) | {"''", '«', '»', '\\n1'}

    filtered_tokens = []
    for review in reviews:
        filtered_tokens.extend(
            word for word in word_tokenize(review)
            if word.lower() not in stop_words and word not in excluded
        )

    return filtered_tokens
|
||||
|
||||
def lemmatize_token(filtered_tokens):
    """Lemmatize each token with Mystem.

    Args:
        filtered_tokens: iterable of word strings.

    Returns:
        List with, for every input token, the first element returned by
        ``Mystem.lemmatize``. Each lemma is also printed to stdout.
    """
    analyzer = Mystem()
    lemmas = []
    for word in filtered_tokens:
        lemma = analyzer.lemmatize(word)[0]
        lemmas.append(lemma)
        print(lemma)
    return lemmas
|
||||
|
||||
def stemmer_token(filtered_tokens):
    """Stem each token with NLTK's Russian Snowball stemmer.

    Args:
        filtered_tokens: iterable of word strings.

    Returns:
        List of stems, one per input token, in the same order.
    """
    stem = SnowballStemmer("russian").stem
    return list(map(stem, filtered_tokens))
|
||||
|
||||
# Data
train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
reviews = train['text']
print("Набор данных готов")

# Flat list of all corpus tokens; used only to build the tokenizer vocabulary
filtered_tokens = remove_stopwords_and_punctuation(reviews)
print(filtered_tokens[:10])

# lemmatized_tokens = lemmatize_token(filtered_tokens)
# print(lemmatized_tokens[:10])

# stemmer_tokens = stemmer_token(filtered_tokens)
# print(stemmer_tokens[:10])

label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])

# Keep the class names (position == integer label)
class_names = label_encoder.classes_

# NOTE(review): assumes the data set has at most nb_classes distinct rubrics - confirm
y_train = utils.to_categorical(train['rubrics'], nb_classes)

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(filtered_tokens)

sequences = tokenizer.texts_to_sequences(reviews)

x_train = pad_sequences(sequences, maxlen=max_reviews_len)

print('начинается создание модели')

# Build the model
# NOTE(review): this file is create_model_gru_negative.py, yet it builds an
# LSTM and uses "lstm" names for the model, class and tokenizer artifacts -
# looks like an unconverted copy of the LSTM script; confirm the intent.
model_lstm = Sequential()
model_lstm.add(Embedding(num_words, 128, input_length=max_reviews_len))
model_lstm.add(SpatialDropout1D(0.2))
model_lstm.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(nb_classes, activation='softmax'))

model_lstm.compile(optimizer='adam',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])

model_lstm.summary()

# ModelCheckpoint callback: keep only the model with the best val_accuracy
model_lstm_save_path = './/model/best_model_lstm_negative.keras'
checkpoint_callback_gru = ModelCheckpoint(model_lstm_save_path,
                                          monitor='val_accuracy',
                                          save_best_only=True,
                                          verbose=1)

# Train the model
history_lstm = model_lstm.fit(x_train,
                              y_train,
                              epochs=10,
                              batch_size=128,
                              validation_split=0.1,
                              callbacks=[checkpoint_callback_gru])

# Plots
plt.plot(history_lstm.history['accuracy'],
         label='Доля верных ответов на обучающем наборе')
plt.plot(history_lstm.history['val_accuracy'],
         label='Доля верных ответов на проверочном наборе')
plt.xlabel('Эпоха обучения')
plt.ylabel('Доля верных ответов')
plt.legend()
plt.show()

# Load the best checkpoint written during training
model_lstm = load_model(model_lstm_save_path)

# Sample user review
user_review = "Не люблю пьяных людей, когда они рядом, всегда будет что то плохое"

# Prepare the user review. The filtered tokens are joined back into ONE text
# so the model receives a single sequence for the whole review; passing the
# token list itself would yield one single-word sequence per token, and only
# the first word's prediction would be reported below.
filtered_tokens = remove_stopwords_and_punctuation([user_review])
sequences = tokenizer.texts_to_sequences([' '.join(filtered_tokens)])
x_user = pad_sequences(sequences, maxlen=max_reviews_len)

# Probabilities of the user review belonging to each class
predicted_probabilities = model_lstm.predict(x_user)

# Print the probabilities together with the class names
for class_name, prob in zip(class_names, predicted_probabilities[0]):
    print(f"Вероятность отзыва относится к классу '{class_name}': {prob}")

# Save the class names to a text file
with open('.//class/class_names_lstm_negative.txt', 'w', encoding='utf-8') as file:
    for class_name in class_names:
        file.write(f"{class_name}\n")

# Save the fitted tokenizer
with open('.//tokenization/tokenizer_lstm_lstm_negative.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
|
148
neural_network/create_gru/create_model_gru_positive.py
Normal file
148
neural_network/create_gru/create_model_gru_positive.py
Normal file
@ -0,0 +1,148 @@
|
||||
import pandas as pd
|
||||
from keras import Sequential
|
||||
from keras.src.callbacks import ModelCheckpoint
|
||||
from keras.src.legacy.preprocessing.text import Tokenizer
|
||||
from keras.src.saving import load_model
|
||||
from keras.src.utils import pad_sequences
|
||||
from matplotlib import pyplot as plt
|
||||
from tensorflow.keras.layers import Dense, Embedding, Dropout, GRU, SpatialDropout1D
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from tensorflow.keras import utils
|
||||
from nltk import word_tokenize, SnowballStemmer
|
||||
from nltk.corpus import stopwords
|
||||
import string
|
||||
import pickle
|
||||
|
||||
from pymystem3 import Mystem
|
||||
|
||||
# Maximum number of words (passed to the Tokenizer and used as the Embedding input size)
num_words = 10000
# Maximum review length (used as maxlen for pad_sequences and as the Embedding input_length)
max_reviews_len = 90
# Number of review classes (size of the one-hot labels and of the output layer)
nb_classes = 10
|
||||
|
||||
def remove_stopwords_and_punctuation(reviews):
    """Tokenize every review and drop Russian stop words and punctuation.

    Args:
        reviews: iterable of review strings.

    Returns:
        One flat list with the surviving tokens of all reviews, in order.
        Review boundaries are not preserved.
    """
    stop_words = set(stopwords.words('russian'))
    # ASCII punctuation characters plus the extra junk tokens to discard.
    # '\\n1' is excluded through an explicit membership test: a bare string
    # literal inside the condition is always truthy and filters nothing.
    excluded = set(string.punctuation) | {"''", '«', '»', '\\n1'}

    filtered_tokens = []
    for review in reviews:
        filtered_tokens.extend(
            word for word in word_tokenize(review)
            if word.lower() not in stop_words and word not in excluded
        )

    return filtered_tokens
|
||||
|
||||
def lemmatize_token(filtered_tokens):
    """Lemmatize each token with Mystem.

    Args:
        filtered_tokens: iterable of word strings.

    Returns:
        List with, for every input token, the first element returned by
        ``Mystem.lemmatize``. Each lemma is also printed to stdout.
    """
    analyzer = Mystem()
    lemmas = []
    for word in filtered_tokens:
        lemma = analyzer.lemmatize(word)[0]
        lemmas.append(lemma)
        print(lemma)
    return lemmas
|
||||
|
||||
def stemmer_token(filtered_tokens):
    """Stem each token with NLTK's Russian Snowball stemmer.

    Args:
        filtered_tokens: iterable of word strings.

    Returns:
        List of stems, one per input token, in the same order.
    """
    stem = SnowballStemmer("russian").stem
    return list(map(stem, filtered_tokens))
|
||||
|
||||
# Data
train = pd.read_csv('..//dataset/filtered/filtered_dataset_positive.csv')
train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
reviews = train['text']
print("Набор данных готов")

# Flat list of all corpus tokens; used only to build the tokenizer vocabulary
filtered_tokens = remove_stopwords_and_punctuation(reviews)
print(filtered_tokens[:10])

# lemmatized_tokens = lemmatize_token(filtered_tokens)
# print(lemmatized_tokens[:10])

# stemmer_tokens = stemmer_token(filtered_tokens)
# print(stemmer_tokens[:10])

label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])

# Keep the class names (position == integer label)
class_names = label_encoder.classes_

# NOTE(review): assumes the data set has at most nb_classes distinct rubrics - confirm
y_train = utils.to_categorical(train['rubrics'], nb_classes)

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(filtered_tokens)

sequences = tokenizer.texts_to_sequences(reviews)

x_train = pad_sequences(sequences, maxlen=max_reviews_len)

print('начинается создание модели')

# Build the model
model_gru = Sequential()
model_gru.add(Embedding(num_words, 256, input_length=max_reviews_len))
model_gru.add(SpatialDropout1D(0.2))
model_gru.add(GRU(128, dropout=0.2, recurrent_dropout=0.2))
model_gru.add(Dense(nb_classes, activation='softmax'))

model_gru.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

model_gru.summary()

# ModelCheckpoint callback: keep only the model with the best val_accuracy
model_gru_save_path = './/model/best_model_gru_positive.keras'
checkpoint_callback_gru = ModelCheckpoint(model_gru_save_path,
                                          monitor='val_accuracy',
                                          save_best_only=True,
                                          verbose=1)

# Train the model
history_gru = model_gru.fit(x_train,
                            y_train,
                            epochs=5,
                            batch_size=128,
                            validation_split=0.1,
                            callbacks=[checkpoint_callback_gru])

# Plots
plt.plot(history_gru.history['accuracy'],
         label='Доля верных ответов на обучающем наборе')
plt.plot(history_gru.history['val_accuracy'],
         label='Доля верных ответов на проверочном наборе')
plt.xlabel('Эпоха обучения')
plt.ylabel('Доля верных ответов')
plt.legend()
plt.show()

# Load the best checkpoint written during training
model_gru = load_model(model_gru_save_path)

# Sample user review
user_review = "Ой я так люблю вкусно кушать! Я бы хотел сьесть все самое вкусное в этом мире! Пончики, пиццу, вообще все все все!"

# Prepare the user review. The filtered tokens are joined back into ONE text
# so the model receives a single sequence for the whole review; passing the
# token list itself would yield one single-word sequence per token, and only
# the first word's prediction would be reported below.
filtered_tokens = remove_stopwords_and_punctuation([user_review])
sequences = tokenizer.texts_to_sequences([' '.join(filtered_tokens)])
x_user = pad_sequences(sequences, maxlen=max_reviews_len)

# Probabilities of the user review belonging to each class
predicted_probabilities = model_gru.predict(x_user)

# Print the probabilities together with the class names
for class_name, prob in zip(class_names, predicted_probabilities[0]):
    print(f"Вероятность отзыва относится к классу '{class_name}': {prob}")

# Save the class names to a text file
with open('.//class/class_names_gru_positive.txt', 'w', encoding='utf-8') as file:
    for class_name in class_names:
        file.write(f"{class_name}\n")

# Save the fitted tokenizer
with open('.//tokenization/tokenizer_gru_positive.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
|
@ -145,5 +145,5 @@ with open('.//class/class_names_lstm_negative.txt', 'w', encoding='utf-8') as fi
|
||||
file.write(f"{class_name}\n")
|
||||
|
||||
# saving
|
||||
with open('.//tokenizer/tokenizer_lstm_lstm_negative.pickle', 'wb') as handle:
|
||||
with open('.//tokenization/tokenizer_lstm_negative.pickle', 'wb') as handle:
|
||||
pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
|
||||
|
@ -144,5 +144,5 @@ with open('.//class/class_names_lstm_positive.txt', 'w', encoding='utf-8') as fi
|
||||
file.write(f"{class_name}\n")
|
||||
|
||||
# saving
|
||||
with open('.//tokenizer/tokenizer_lstm_positive.pickle', 'wb') as handle:
|
||||
with open('.//tokenization/tokenizer_lstm_positive.pickle', 'wb') as handle:
|
||||
pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
|
60
neural_network/tokenization/tokenizer_negative.py
Normal file
60
neural_network/tokenization/tokenizer_negative.py
Normal file
@ -0,0 +1,60 @@
|
||||
import pandas as pd
|
||||
from keras.src.legacy.preprocessing.text import Tokenizer
|
||||
from nltk import word_tokenize, SnowballStemmer
|
||||
from nltk.corpus import stopwords
|
||||
import string
|
||||
import pickle
|
||||
|
||||
from pymystem3 import Mystem
|
||||
|
||||
# Maximum number of words (passed to the Tokenizer as num_words)
num_words = 10000
# Number of review classes (not referenced by the rest of this script as shown)
nb_classes = 10
|
||||
|
||||
def remove_stopwords_and_punctuation(reviews):
    """Tokenize every review and drop Russian stop words and punctuation.

    Args:
        reviews: iterable of review strings.

    Returns:
        One flat list with the surviving tokens of all reviews, in order.
        Review boundaries are not preserved.
    """
    stop_words = set(stopwords.words('russian'))
    # ASCII punctuation characters plus the extra junk tokens to discard.
    # '\\n1' is excluded through an explicit membership test: a bare string
    # literal inside the condition is always truthy and filters nothing.
    excluded = set(string.punctuation) | {"''", '«', '»', '\\n1'}

    filtered_tokens = []
    for review in reviews:
        filtered_tokens.extend(
            word for word in word_tokenize(review)
            if word.lower() not in stop_words and word not in excluded
        )

    return filtered_tokens
|
||||
|
||||
def lemmatize_token(filtered_tokens):
    """Lemmatize each token with Mystem.

    Args:
        filtered_tokens: iterable of word strings.

    Returns:
        List with, for every input token, the first element returned by
        ``Mystem.lemmatize``. Each lemma is also printed to stdout.
    """
    analyzer = Mystem()
    lemmas = []
    for word in filtered_tokens:
        lemma = analyzer.lemmatize(word)[0]
        lemmas.append(lemma)
        print(lemma)
    return lemmas
|
||||
|
||||
def stemmer_token(filtered_tokens):
    """Stem each token with NLTK's Russian Snowball stemmer.

    Args:
        filtered_tokens: iterable of word strings.

    Returns:
        List of stems, one per input token, in the same order.
    """
    stem = SnowballStemmer("russian").stem
    return list(map(stem, filtered_tokens))
|
||||
|
||||
# Load the data set and take the review texts
train = pd.read_csv('..//dataset/filtered/filtered_dataset_negative.csv')
reviews = train['text']

# Tokens of all reviews with stop words and punctuation removed
filtered_tokens = remove_stopwords_and_punctuation(reviews)

# lemmatized_tokens = lemmatize_token(filtered_tokens)
# print(lemmatized_tokens[:10])

# stemmer_tokens = stemmer_token(filtered_tokens)
# print(stemmer_tokens[:10])

# Fit the tokenizer vocabulary on the filtered tokens
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(filtered_tokens)

# Persist the fitted tokenizer next to this script
with open('.//tokenizer_negative.pickle', mode='wb') as handle:
    handle.write(pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL))
|
60
neural_network/tokenization/tokenizer_positive.py
Normal file
60
neural_network/tokenization/tokenizer_positive.py
Normal file
@ -0,0 +1,60 @@
|
||||
import pandas as pd
|
||||
from keras.src.legacy.preprocessing.text import Tokenizer
|
||||
from nltk import word_tokenize, SnowballStemmer
|
||||
from nltk.corpus import stopwords
|
||||
import string
|
||||
import pickle
|
||||
|
||||
from pymystem3 import Mystem
|
||||
|
||||
# Maximum number of words (passed to the Tokenizer as num_words)
num_words = 10000
# Number of review classes (not referenced by the rest of this script as shown)
nb_classes = 10
|
||||
|
||||
def remove_stopwords_and_punctuation(reviews):
    """Tokenize every review and drop Russian stop words and punctuation.

    Args:
        reviews: iterable of review strings.

    Returns:
        One flat list with the surviving tokens of all reviews, in order.
        Review boundaries are not preserved.
    """
    stop_words = set(stopwords.words('russian'))
    # ASCII punctuation characters plus the extra junk tokens to discard.
    # '\\n1' is excluded through an explicit membership test: a bare string
    # literal inside the condition is always truthy and filters nothing.
    excluded = set(string.punctuation) | {"''", '«', '»', '\\n1'}

    filtered_tokens = []
    for review in reviews:
        filtered_tokens.extend(
            word for word in word_tokenize(review)
            if word.lower() not in stop_words and word not in excluded
        )

    return filtered_tokens
|
||||
|
||||
def lemmatize_token(filtered_tokens):
    """Lemmatize each token with Mystem.

    Args:
        filtered_tokens: iterable of word strings.

    Returns:
        List with, for every input token, the first element returned by
        ``Mystem.lemmatize``. Each lemma is also printed to stdout.
    """
    analyzer = Mystem()
    lemmas = []
    for word in filtered_tokens:
        lemma = analyzer.lemmatize(word)[0]
        lemmas.append(lemma)
        print(lemma)
    return lemmas
|
||||
|
||||
def stemmer_token(filtered_tokens):
    """Stem each token with NLTK's Russian Snowball stemmer.

    Args:
        filtered_tokens: iterable of word strings.

    Returns:
        List of stems, one per input token, in the same order.
    """
    stem = SnowballStemmer("russian").stem
    return list(map(stem, filtered_tokens))
|
||||
|
||||
# Load the data set and take the review texts
train = pd.read_csv('..//dataset/filtered/filtered_dataset_positive.csv')
reviews = train['text']

# Tokens of all reviews with stop words and punctuation removed
filtered_tokens = remove_stopwords_and_punctuation(reviews)

# lemmatized_tokens = lemmatize_token(filtered_tokens)
# print(lemmatized_tokens[:10])

# stemmer_tokens = stemmer_token(filtered_tokens)
# print(stemmer_tokens[:10])

# Fit the tokenizer vocabulary on the filtered tokens
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(filtered_tokens)

# Persist the fitted tokenizer next to this script
with open('.//tokenizer_positive.pickle', mode='wb') as handle:
    handle.write(pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL))
|
Loading…
Reference in New Issue
Block a user