Setting up a proper file layout; still need to remove the redundant code from the model scripts and do some formatting.

maksim 2024-05-30 00:13:05 +04:00
parent 87246af738
commit de1c25bafa
13 changed files with 753 additions and 3 deletions

View File

@@ -8,7 +8,7 @@ from keras.src.utils import pad_sequences
model = tf.keras.models.load_model('.//neural_network/create_lstm/model/best_model_lstm_negative.keras')
# Load the tokenizer
-with open('.//neural_network/create_lstm/tokenizer/tokenizer_lstm_lstm_negative.pickle', 'rb') as handle:
+with open('.//neural_network/create_lstm/tokenization/tokenizer_lstm_lstm_negative.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
# Load the class names

View File

@@ -0,0 +1,16 @@
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Data
train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
# Keep the class names
class_names = label_encoder.classes_
# Save the class names to a text file
with open('.//class_names_negative.txt', 'w', encoding='utf-8') as file:
    for class_name in class_names:
        file.write(f"{class_name}\n")

View File

@@ -0,0 +1,16 @@
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Data
train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
# Keep the class names
class_names = label_encoder.classes_
# Save the class names to a text file
with open('.//class_names_positive.txt', 'w', encoding='utf-8') as file:
    for class_name in class_names:
        file.write(f"{class_name}\n")

View File

@@ -0,0 +1,151 @@
import pickle
import pandas as pd
from keras import Sequential
from keras.src.callbacks import ModelCheckpoint
from keras.src.legacy.preprocessing.text import Tokenizer
from keras.src.saving import load_model
from keras.src.utils import pad_sequences
from matplotlib import pyplot as plt
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import utils
from nltk import word_tokenize, SnowballStemmer
from nltk.corpus import stopwords
import string
from pymystem3 import Mystem
# Maximum vocabulary size
num_words = 10000
# Maximum review length
max_reviews_len = 90
# Number of review classes
nb_classes = 10
def remove_stopwords_and_punctuation(reviews):
    stop_words = set(stopwords.words('russian'))
    punctuation = set(string.punctuation)
    filtered_tokens = []
    # Remove stop words and punctuation
    for review in reviews:
        words = word_tokenize(review)
        filtered_words = [word for word in words if
                          word.lower() not in stop_words and word != '\\n1' and word != "''"
                          and word != '«' and word != '»' and word not in punctuation]
        filtered_tokens.extend(filtered_words)
    return filtered_tokens
def lemmatize_token(filtered_tokens):
    mystem = Mystem()
    lemmatized_tokens = []
    for token in filtered_tokens:
        lemmatized = mystem.lemmatize(token)[0]
        lemmatized_tokens.append(lemmatized)
        print(lemmatized)
    return lemmatized_tokens
def stemmer_token(filtered_tokens):
    stemmer = SnowballStemmer("russian")
    stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
    return stemmer_tokens
# Data
train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
reviews = train['text']
print("Набор данных готов")
filtered_tokens = remove_stopwords_and_punctuation(reviews)
print(filtered_tokens[:10])
# lemmatized_tokens = lemmatize_token(filtered_tokens)
# print(lemmatized_tokens[:10])
# stemmer_tokens = stemmer_token(filtered_tokens)
# print(stemmer_tokens[:10])
label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
# Keep the class names
class_names = label_encoder.classes_
y_train = utils.to_categorical(train['rubrics'], nb_classes)
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(filtered_tokens)
sequences = tokenizer.texts_to_sequences(reviews)
x_train = pad_sequences(sequences, maxlen=max_reviews_len)
print('начинается создание модели')
# Build the model
model_cnn = Sequential()
model_cnn.add(Embedding(num_words, 128, input_length=max_reviews_len))
model_cnn.add(Conv1D(128, 5, activation='relu'))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(64, activation='relu'))
model_cnn.add(Dense(nb_classes, activation='softmax'))
model_cnn.compile(optimizer='adam',
loss='categorical_crossentropy',
metrics=['accuracy'])
model_cnn.summary()
# Define the ModelCheckpoint callback
model_cnn_save_path = './/model/best_model_cnn_negative.keras'
checkpoint_callback_cnn = ModelCheckpoint(model_cnn_save_path,
monitor='val_accuracy',
save_best_only=True,
verbose=1)
# Train the model
history_cnn = model_cnn.fit(x_train,
y_train,
epochs=10,
batch_size=128,
validation_split=0.1,
callbacks=[checkpoint_callback_cnn])
# Plots
plt.plot(history_cnn.history['accuracy'],
label='Доля верных ответов на обучающем наборе')
plt.plot(history_cnn.history['val_accuracy'],
label='Доля верных ответов на проверочном наборе')
plt.xlabel('Эпоха обучения')
plt.ylabel('Доля верных ответов')
plt.legend()
plt.show()
# Load the best saved model
model_cnn = load_model('.//model/best_model_cnn_negative.keras')
# Example user review
user_review = "Не люблю пьяных людей, когда они рядом, всегда будет что то плохое"
# Prepare the user review
filtered_tokens = remove_stopwords_and_punctuation([user_review])
sequences = tokenizer.texts_to_sequences(filtered_tokens)
x_user = pad_sequences(sequences, maxlen=max_reviews_len)
# Predict class-membership probabilities for the user review
predicted_probabilities = model_cnn.predict(x_user)
# Print the probabilities together with the class names
for class_name, prob in zip(class_names, predicted_probabilities[0]):
    print(f"Вероятность отзыва относится к классу '{class_name}': {prob}")
# Save the class names to a text file
with open('.//class/class_names_cnn_negative.txt', 'w', encoding='utf-8') as file:
    for class_name in class_names:
        file.write(f"{class_name}\n")
# saving
with open('../tokenization/tokenizer_cnn_negative.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
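For context, a minimal inference sketch showing how the artifacts saved by this script might be loaded elsewhere, in the same spirit as the LSTM loading code changed at the top of this commit; the create_cnn directory layout and the sample review text are assumptions, not part of the commit:

import pickle
import tensorflow as tf
from keras.src.utils import pad_sequences

# Assumed locations mirroring the paths written by the training script above
model = tf.keras.models.load_model('.//neural_network/create_cnn/model/best_model_cnn_negative.keras')
with open('.//neural_network/create_cnn/tokenization/tokenizer_cnn_negative.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
with open('.//neural_network/create_cnn/class/class_names_cnn_negative.txt', 'r', encoding='utf-8') as file:
    class_names = [line.strip() for line in file if line.strip()]

# Convert a new review with the fitted tokenizer and print class probabilities
x_user = pad_sequences(tokenizer.texts_to_sequences(["Очень шумно и грязно"]), maxlen=90)
for class_name, prob in zip(class_names, model.predict(x_user)[0]):
    print(f"{class_name}: {prob:.3f}")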

View File

@@ -0,0 +1,150 @@
import pandas as pd
from keras import Sequential
from keras.src.callbacks import ModelCheckpoint
from keras.src.legacy.preprocessing.text import Tokenizer
from keras.src.saving import load_model
from keras.src.utils import pad_sequences
from matplotlib import pyplot as plt
from tensorflow.keras.layers import Dense, Embedding, GlobalMaxPooling1D, Conv1D
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import utils
from nltk import word_tokenize, SnowballStemmer
from nltk.corpus import stopwords
import string
import pickle
from pymystem3 import Mystem
# Maximum vocabulary size
num_words = 10000
# Maximum review length
max_reviews_len = 90
# Number of review classes
nb_classes = 10
def remove_stopwords_and_punctuation(reviews):
    stop_words = set(stopwords.words('russian'))
    punctuation = set(string.punctuation)
    filtered_tokens = []
    # Remove stop words and punctuation
    for review in reviews:
        words = word_tokenize(review)
        filtered_words = [word for word in words if
                          word.lower() not in stop_words and word != '\\n1' and word != "''"
                          and word != '«' and word != '»' and word not in punctuation]
        filtered_tokens.extend(filtered_words)
    return filtered_tokens
def lemmatize_token(filtered_tokens):
    mystem = Mystem()
    lemmatized_tokens = []
    for token in filtered_tokens:
        lemmatized = mystem.lemmatize(token)[0]
        lemmatized_tokens.append(lemmatized)
        print(lemmatized)
    return lemmatized_tokens
def stemmer_token(filtered_tokens):
    stemmer = SnowballStemmer("russian")
    stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
    return stemmer_tokens
# Data
train = pd.read_csv('..//dataset/filtered/filtered_dataset_positive.csv')
train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
reviews = train['text']
print("Набор данных готов")
filtered_tokens = remove_stopwords_and_punctuation(reviews)
print(filtered_tokens[:10])
# lemmatized_tokens = lemmatize_token(filtered_tokens)
# print(lemmatized_tokens[:10])
# stemmer_tokens = stemmer_token(filtered_tokens)
# print(stemmer_tokens[:10])
label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
# Keep the class names
class_names = label_encoder.classes_
y_train = utils.to_categorical(train['rubrics'], nb_classes)
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(filtered_tokens)
sequences = tokenizer.texts_to_sequences(reviews)
x_train = pad_sequences(sequences, maxlen=max_reviews_len)
print('начинается создание модели')
# Build the model
model_cnn = Sequential()
model_cnn.add(Embedding(num_words, 128, input_length=max_reviews_len))
model_cnn.add(Conv1D(128, 5, activation='relu'))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(64, activation='relu'))
model_cnn.add(Dense(nb_classes, activation='softmax'))
model_cnn.compile(optimizer='adam',
loss='categorical_crossentropy',
metrics=['accuracy'])
model_cnn.summary()
# Define the ModelCheckpoint callback
model_cnn_save_path = './/model/best_model_cnn_positive.keras'
checkpoint_callback_cnn = ModelCheckpoint(model_cnn_save_path,
monitor='val_accuracy',
save_best_only=True,
verbose=1)
# Train the model
history_cnn = model_cnn.fit(x_train,
y_train,
epochs=5,
batch_size=128,
validation_split=0.1,
callbacks=[checkpoint_callback_cnn])
# Plots
plt.plot(history_cnn.history['accuracy'],
label='Доля верных ответов на обучающем наборе')
plt.plot(history_cnn.history['val_accuracy'],
label='Доля верных ответов на проверочном наборе')
plt.xlabel('Эпоха обучения')
plt.ylabel('Доля верных ответов')
plt.legend()
plt.show()
# Load the best saved model
model_cnn = load_model('.//model/best_model_cnn_positive.keras')
# Example user review
user_review = "Ой я так люблю вкусно кушать! Я бы хотел сьесть все самое вкусное в этом мире! Пончики, пиццу, вообще все все все!"
# Prepare the user review
filtered_tokens = remove_stopwords_and_punctuation([user_review])
sequences = tokenizer.texts_to_sequences(filtered_tokens)
x_user = pad_sequences(sequences, maxlen=max_reviews_len)
# Predict class-membership probabilities for the user review
predicted_probabilities = model_cnn.predict(x_user)
# Print the probabilities together with the class names
for class_name, prob in zip(class_names, predicted_probabilities[0]):
    print(f"Вероятность отзыва относится к классу '{class_name}': {prob}")
# Save the class names to a text file
with open('.//class/class_names_cnn_positive.txt', 'w', encoding='utf-8') as file:
    for class_name in class_names:
        file.write(f"{class_name}\n")
# saving
with open('../tokenization/tokenizer_cnn_positive.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

View File

@@ -0,0 +1,149 @@
import pickle
import pandas as pd
from keras import Sequential
from keras.src.callbacks import ModelCheckpoint
from keras.src.legacy.preprocessing.text import Tokenizer
from keras.src.saving import load_model
from keras.src.utils import pad_sequences
from matplotlib import pyplot as plt
from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM, SpatialDropout1D
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import utils
from nltk import word_tokenize, SnowballStemmer
from nltk.corpus import stopwords
import string
from pymystem3 import Mystem
# Maximum vocabulary size
num_words = 10000
# Maximum review length
max_reviews_len = 90
# Number of review classes
nb_classes = 10
def remove_stopwords_and_punctuation(reviews):
    stop_words = set(stopwords.words('russian'))
    punctuation = set(string.punctuation)
    filtered_tokens = []
    # Remove stop words and punctuation
    for review in reviews:
        words = word_tokenize(review)
        filtered_words = [word for word in words if
                          word.lower() not in stop_words and word != '\\n1' and word != "''"
                          and word != '«' and word != '»' and word not in punctuation]
        filtered_tokens.extend(filtered_words)
    return filtered_tokens
def lemmatize_token(filtered_tokens):
    mystem = Mystem()
    lemmatized_tokens = []
    for token in filtered_tokens:
        lemmatized = mystem.lemmatize(token)[0]
        lemmatized_tokens.append(lemmatized)
        print(lemmatized)
    return lemmatized_tokens
def stemmer_token(filtered_tokens):
    stemmer = SnowballStemmer("russian")
    stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
    return stemmer_tokens
# Data
train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
reviews = train['text']
print("Набор данных готов")
filtered_tokens = remove_stopwords_and_punctuation(reviews)
print(filtered_tokens[:10])
# lemmatized_tokens = lemmatize_token(filtered_tokens)
# print(lemmatized_tokens[:10])
# stemmer_tokens = stemmer_token(filtered_tokens)
# print(stemmer_tokens[:10])
label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
# Keep the class names
class_names = label_encoder.classes_
y_train = utils.to_categorical(train['rubrics'], nb_classes)
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(filtered_tokens)
sequences = tokenizer.texts_to_sequences(reviews)
x_train = pad_sequences(sequences, maxlen=max_reviews_len)
print('начинается создание модели')
# Build the model
model_lstm = Sequential()
model_lstm.add(Embedding(num_words, 128, input_length=max_reviews_len))
model_lstm.add(SpatialDropout1D(0.2))
model_lstm.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(nb_classes, activation='softmax'))
model_lstm.compile(optimizer='adam',
loss='categorical_crossentropy',
metrics=['accuracy'])
model_lstm.summary()
# Define the ModelCheckpoint callback
model_lstm_save_path = './/model/best_model_lstm_negative.keras'
checkpoint_callback_lstm = ModelCheckpoint(model_lstm_save_path,
monitor='val_accuracy',
save_best_only=True,
verbose=1)
# Train the model
history_lstm = model_lstm.fit(x_train,
y_train,
epochs=10,
batch_size=128,
validation_split=0.1,
callbacks=[checkpoint_callback_lstm])
# Plots
plt.plot(history_lstm.history['accuracy'],
label='Доля верных ответов на обучающем наборе')
plt.plot(history_lstm.history['val_accuracy'],
label='Доля верных ответов на проверочном наборе')
plt.xlabel('Эпоха обучения')
plt.ylabel('Доля верных ответов')
plt.legend()
plt.show()
# Load the best saved model
model_lstm = load_model('.//model/best_model_lstm_negative.keras')
# Example user review
user_review = "Не люблю пьяных людей, когда они рядом, всегда будет что то плохое"
# Prepare the user review
filtered_tokens = remove_stopwords_and_punctuation([user_review])
sequences = tokenizer.texts_to_sequences(filtered_tokens)
x_user = pad_sequences(sequences, maxlen=max_reviews_len)
# Predict class-membership probabilities for the user review
predicted_probabilities = model_lstm.predict(x_user)
# Print the probabilities together with the class names
for class_name, prob in zip(class_names, predicted_probabilities[0]):
    print(f"Вероятность отзыва относится к классу '{class_name}': {prob}")
# Save the class names to a text file
with open('.//class/class_names_lstm_negative.txt', 'w', encoding='utf-8') as file:
    for class_name in class_names:
        file.write(f"{class_name}\n")
# saving
with open('.//tokenization/tokenizer_lstm_lstm_negative.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

View File

@@ -0,0 +1,148 @@
import pandas as pd
from keras import Sequential
from keras.src.callbacks import ModelCheckpoint
from keras.src.legacy.preprocessing.text import Tokenizer
from keras.src.saving import load_model
from keras.src.utils import pad_sequences
from matplotlib import pyplot as plt
from tensorflow.keras.layers import Dense, Embedding, Dropout, GRU, SpatialDropout1D
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import utils
from nltk import word_tokenize, SnowballStemmer
from nltk.corpus import stopwords
import string
import pickle
from pymystem3 import Mystem
# Maximum vocabulary size
num_words = 10000
# Maximum review length
max_reviews_len = 90
# Number of review classes
nb_classes = 10
def remove_stopwords_and_punctuation(reviews):
    stop_words = set(stopwords.words('russian'))
    punctuation = set(string.punctuation)
    filtered_tokens = []
    # Remove stop words and punctuation
    for review in reviews:
        words = word_tokenize(review)
        filtered_words = [word for word in words if
                          word.lower() not in stop_words and word != '\\n1' and word != "''"
                          and word != '«' and word != '»' and word not in punctuation]
        filtered_tokens.extend(filtered_words)
    return filtered_tokens
def lemmatize_token(filtered_tokens):
    mystem = Mystem()
    lemmatized_tokens = []
    for token in filtered_tokens:
        lemmatized = mystem.lemmatize(token)[0]
        lemmatized_tokens.append(lemmatized)
        print(lemmatized)
    return lemmatized_tokens
def stemmer_token(filtered_tokens):
    stemmer = SnowballStemmer("russian")
    stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
    return stemmer_tokens
# Data
train = pd.read_csv('..//dataset/filtered/filtered_dataset_positive.csv')
train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
reviews = train['text']
print("Набор данных готов")
filtered_tokens = remove_stopwords_and_punctuation(reviews)
print(filtered_tokens[:10])
# lemmatized_tokens = lemmatize_token(filtered_tokens)
# print(lemmatized_tokens[:10])
# stemmer_tokens = stemmer_token(filtered_tokens)
# print(stemmer_tokens[:10])
label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
# Keep the class names
class_names = label_encoder.classes_
y_train = utils.to_categorical(train['rubrics'], nb_classes)
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(filtered_tokens)
sequences = tokenizer.texts_to_sequences(reviews)
x_train = pad_sequences(sequences, maxlen=max_reviews_len)
print('начинается создание модели')
# Build the model
model_gru = Sequential()
model_gru.add(Embedding(num_words, 256, input_length=max_reviews_len))
model_gru.add(SpatialDropout1D(0.2))
model_gru.add(GRU(128, dropout=0.2, recurrent_dropout=0.2))
model_gru.add(Dense(nb_classes, activation='softmax'))
model_gru.compile(optimizer='adam',
loss='categorical_crossentropy',
metrics=['accuracy'])
model_gru.summary()
# Define the ModelCheckpoint callback
model_gru_save_path = './/model/best_model_gru_positive.keras'
checkpoint_callback_gru = ModelCheckpoint(model_gru_save_path,
monitor='val_accuracy',
save_best_only=True,
verbose=1)
# Train the model
history_gru = model_gru.fit(x_train,
y_train,
epochs=5,
batch_size=128,
validation_split=0.1,
callbacks=[checkpoint_callback_gru])
# Plots
plt.plot(history_gru.history['accuracy'],
label='Доля верных ответов на обучающем наборе')
plt.plot(history_gru.history['val_accuracy'],
label='Доля верных ответов на проверочном наборе')
plt.xlabel('Эпоха обучения')
plt.ylabel('Доля верных ответов')
plt.legend()
plt.show()
# Load the best saved model
model_gru = load_model('.//model/best_model_gru_positive.keras')
# Example user review
user_review = "Ой я так люблю вкусно кушать! Я бы хотел сьесть все самое вкусное в этом мире! Пончики, пиццу, вообще все все все!"
# Prepare the user review
filtered_tokens = remove_stopwords_and_punctuation([user_review])
sequences = tokenizer.texts_to_sequences(filtered_tokens)
x_user = pad_sequences(sequences, maxlen=max_reviews_len)
# Predict class-membership probabilities for the user review
predicted_probabilities = model_gru.predict(x_user)
# Print the probabilities together with the class names
for class_name, prob in zip(class_names, predicted_probabilities[0]):
    print(f"Вероятность отзыва относится к классу '{class_name}': {prob}")
# Save the class names to a text file
with open('.//class/class_names_gru_positive.txt', 'w', encoding='utf-8') as file:
    for class_name in class_names:
        file.write(f"{class_name}\n")
# saving
with open('.//tokenization/tokenizer_gru_positive.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

View File

@@ -145,5 +145,5 @@ with open('.//class/class_names_lstm_negative.txt', 'w', encoding='utf-8') as fi
        file.write(f"{class_name}\n")
# saving
-with open('.//tokenizer/tokenizer_lstm_lstm_negative.pickle', 'wb') as handle:
+with open('.//tokenization/tokenizer_lstm_negative.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

View File

@@ -144,5 +144,5 @@ with open('.//class/class_names_lstm_positive.txt', 'w', encoding='utf-8') as fi
        file.write(f"{class_name}\n")
# saving
-with open('.//tokenizer/tokenizer_lstm_positive.pickle', 'wb') as handle:
+with open('.//tokenization/tokenizer_lstm_positive.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

View File

@@ -0,0 +1,60 @@
import pandas as pd
from keras.src.legacy.preprocessing.text import Tokenizer
from nltk import word_tokenize, SnowballStemmer
from nltk.corpus import stopwords
import string
import pickle
from pymystem3 import Mystem
# Maximum vocabulary size
num_words = 10000
# Number of review classes
nb_classes = 10
def remove_stopwords_and_punctuation(reviews):
    stop_words = set(stopwords.words('russian'))
    punctuation = set(string.punctuation)
    filtered_tokens = []
    # Remove stop words and punctuation
    for review in reviews:
        words = word_tokenize(review)
        filtered_words = [word for word in words if
                          word.lower() not in stop_words and word != '\\n1' and word != "''"
                          and word != '«' and word != '»' and word not in punctuation]
        filtered_tokens.extend(filtered_words)
    return filtered_tokens
def lemmatize_token(filtered_tokens):
    mystem = Mystem()
    lemmatized_tokens = []
    for token in filtered_tokens:
        lemmatized = mystem.lemmatize(token)[0]
        lemmatized_tokens.append(lemmatized)
        print(lemmatized)
    return lemmatized_tokens
def stemmer_token(filtered_tokens):
    stemmer = SnowballStemmer("russian")
    stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
    return stemmer_tokens
# Data
train = pd.read_csv('..//dataset/filtered/filtered_dataset_negative.csv')
reviews = train['text']
filtered_tokens = remove_stopwords_and_punctuation(reviews)
# lemmatized_tokens = lemmatize_token(filtered_tokens)
# print(lemmatized_tokens[:10])
# stemmer_tokens = stemmer_token(filtered_tokens)
# print(stemmer_tokens[:10])
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(filtered_tokens)
# saving
with open('.//tokenizer_negative.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
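A short round-trip sketch, purely illustrative and not part of the commit, showing how the pickle written above could be reloaded and applied; the sample review string is an assumption:

# Reload the fitted tokenizer saved by the script above
with open('.//tokenizer_negative.pickle', 'rb') as handle:
    loaded_tokenizer = pickle.load(handle)
# Convert a sample review to index sequences using the reloaded vocabulary
print(loaded_tokenizer.texts_to_sequences(["Очень долго ждали заказ"]))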

View File

@@ -0,0 +1,60 @@
import pandas as pd
from keras.src.legacy.preprocessing.text import Tokenizer
from nltk import word_tokenize, SnowballStemmer
from nltk.corpus import stopwords
import string
import pickle
from pymystem3 import Mystem
# Maximum vocabulary size
num_words = 10000
# Number of review classes
nb_classes = 10
def remove_stopwords_and_punctuation(reviews):
    stop_words = set(stopwords.words('russian'))
    punctuation = set(string.punctuation)
    filtered_tokens = []
    # Remove stop words and punctuation
    for review in reviews:
        words = word_tokenize(review)
        filtered_words = [word for word in words if
                          word.lower() not in stop_words and word != '\\n1' and word != "''"
                          and word != '«' and word != '»' and word not in punctuation]
        filtered_tokens.extend(filtered_words)
    return filtered_tokens
def lemmatize_token(filtered_tokens):
    mystem = Mystem()
    lemmatized_tokens = []
    for token in filtered_tokens:
        lemmatized = mystem.lemmatize(token)[0]
        lemmatized_tokens.append(lemmatized)
        print(lemmatized)
    return lemmatized_tokens
def stemmer_token(filtered_tokens):
    stemmer = SnowballStemmer("russian")
    stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
    return stemmer_tokens
# Data
train = pd.read_csv('..//dataset/filtered/filtered_dataset_positive.csv')
reviews = train['text']
filtered_tokens = remove_stopwords_and_punctuation(reviews)
# lemmatized_tokens = lemmatize_token(filtered_tokens)
# print(lemmatized_tokens[:10])
# stemmer_tokens = stemmer_token(filtered_tokens)
# print(stemmer_tokens[:10])
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(filtered_tokens)
# saving
with open('.//tokenizer_positive.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)