Well, this is definitely the last refactoring now; I'll also add one more file so that everything can be run in one go, and that's it.
This commit is contained in:
parent c56263c386
commit b4dc220fe5

Changed files: model.py (6 changed lines)
@@ -5,14 +5,14 @@ from keras.src.legacy.preprocessing.text import Tokenizer
 from keras.src.utils import pad_sequences

 # Загрузка модели
-model = tf.keras.models.load_model('.//neural_network/create_lstm/model/best_model_lstm_negative.keras')
+model = tf.keras.models.load_model('.//neural_network/models/model/best_model_lstm_negative.keras')

 # Загрузка токенизатора
-with open('.//neural_network/create_lstm/tokenization/tokenizer_lstm_lstm_negative.pickle', 'rb') as handle:
+with open('neural_network/models/tokenization/tokenizer_lstm_lstm_negative.pickle', 'rb') as handle:
     tokenizer = pickle.load(handle)

 # Загрузка названий классов
-with open('.//neural_network/create_lstm/class/class_names_lstm_negative.txt', 'r', encoding='utf-8') as file:
+with open('neural_network/models/class/class_names_lstm_negative.txt', 'r', encoding='utf-8') as file:
     class_names = [line.strip() for line in file.readlines()]

 def preprocess_text(text: str):
@@ -1,16 +0,0 @@
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Данные
train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')

label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])

# Сохраняем названия классов
class_names = label_encoder.classes_

# Сохраняем названия классов в текстовый файл
with open('.//class_names_negative.txt', 'w', encoding='utf-8') as file:
    for class_name in class_names:
        file.write(f"{class_name}\n")
@@ -1,16 +0,0 @@
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Данные
train = pd.read_csv('../dataset/filtered/filtered_dataset_positive.csv')

label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])

# Сохраняем названия классов
class_names = label_encoder.classes_

# Сохраняем названия классов в текстовый файл
with open('.//class_names_positive.txt', 'w', encoding='utf-8') as file:
    for class_name in class_names:
        file.write(f"{class_name}\n")
neural_network/class/class_save.py (new file, 61 lines)
@@ -0,0 +1,61 @@
import pandas as pd
from sklearn.preprocessing import LabelEncoder

class DataLoader:
    def __init__(self, dataset_path):
        self.dataset_path = dataset_path

    def load_data(self):
        return pd.read_csv(self.dataset_path)


class LabelProcessor:
    def __init__(self, dataset, label_column):
        self.dataset = dataset
        self.label_column = label_column
        self.label_encoder = LabelEncoder()

    def encode_labels(self):
        self.dataset[self.label_column] = self.label_encoder.fit_transform(self.dataset[self.label_column])
        return self.dataset

    def get_class_names(self):
        return self.label_encoder.classes_

    def save_class_names(self, class_names, output_path):
        with open(output_path, 'w', encoding='utf-8') as file:
            for class_name in class_names:
                file.write(f"{class_name}\n")


def process_dataset(dataset_path, label_column, output_path):
    # Load data
    data_loader = DataLoader(dataset_path)
    dataset = data_loader.load_data()

    # Process labels
    label_processor = LabelProcessor(dataset, label_column)
    dataset = label_processor.encode_labels()
    class_names = label_processor.get_class_names()

    # Save class names
    label_processor.save_class_names(class_names, output_path)

    return dataset


def main():
    positive_dataset_path = '../dataset/filtered/filtered_dataset_positive.csv'
    negative_dataset_path = '../dataset/filtered/filtered_dataset_negative.csv'
    positive_output_path = './class_names_positive.txt'
    negative_output_path = './class_names_negative.txt'

    # Process positive dataset
    process_dataset(positive_dataset_path, 'rubrics', positive_output_path)

    # Process negative dataset
    process_dataset(negative_dataset_path, 'rubrics', negative_output_path)


if __name__ == "__main__":
    main()
@@ -1,80 +0,0 @@
import pickle
import pandas as pd
from keras import Sequential
from keras.src.callbacks import ModelCheckpoint
from keras.src.utils import pad_sequences
from matplotlib import pyplot as plt
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D
from tensorflow.keras import utils
from sklearn.preprocessing import LabelEncoder

# Максимальное количество слов
num_words = 10000
# Максимальная длина отзыва
max_reviews_len = 90
# Количество классов отзыва
nb_classes = 10

# Загрузка токенизатора
with open('..//tokenization/tokenizer_negative.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Загрузка названий классов
with open('..//class/class_names_negative.txt', 'r', encoding='utf-8') as file:
    class_names = [line.strip() for line in file.readlines()]

# Данные
train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
reviews = train['text']
print("Набор данных готов")

# Кодирование категорий
label_encoder = LabelEncoder()
label_encoder.fit(class_names)
encoded_labels = label_encoder.transform(train['rubrics'])
y_train = utils.to_categorical(encoded_labels, nb_classes)

sequences = tokenizer.texts_to_sequences(reviews)
x_train = pad_sequences(sequences, maxlen=max_reviews_len)

print('начинается создание модели')

# Построение модели
model_cnn = Sequential()
model_cnn.add(Embedding(num_words, 128, input_length=max_reviews_len))
model_cnn.add(Conv1D(128, 5, activation='relu'))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(64, activation='relu'))
model_cnn.add(Dense(nb_classes, activation='softmax'))

model_cnn.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

model_cnn.summary()

# Определим обратный вызов ModelCheckpoint
model_cnn_save_path = './/model/best_model_cnn_negative.keras'
checkpoint_callback_cnn = ModelCheckpoint(model_cnn_save_path,
                                          monitor='val_accuracy',
                                          save_best_only=True,
                                          verbose=1)

# Обучение модели
history_cnn = model_cnn.fit(x_train,
                            y_train,
                            epochs=10,
                            batch_size=128,
                            validation_split=0.1,
                            callbacks=[checkpoint_callback_cnn])

# Графики
plt.plot(history_cnn.history['accuracy'],
         label='Доля верных ответов на обучающем наборе')
plt.plot(history_cnn.history['val_accuracy'],
         label='Доля верных ответов на проверочном наборе')
plt.xlabel('Эпоха обучения')
plt.ylabel('Доля верных ответов')
plt.legend()
plt.savefig('.//graphics/history_cnn_negative.png')
@@ -1,81 +0,0 @@
import pickle
import pandas as pd
from keras import Sequential
from keras.src.callbacks import ModelCheckpoint
from keras.src.utils import pad_sequences
from matplotlib import pyplot as plt
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D
from tensorflow.keras import utils
from sklearn.preprocessing import LabelEncoder

# Максимальное количество слов
num_words = 10000
# Максимальная длина отзыва
max_reviews_len = 90
# Количество классов отзыва
nb_classes = 10

# Загрузка токенизатора
with open('..//tokenization/tokenizer_positive.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Загрузка названий классов
with open('..//class/class_names_positive.txt', 'r', encoding='utf-8') as file:
    class_names = [line.strip() for line in file.readlines()]

# Данные
train = pd.read_csv('../dataset/filtered/filtered_dataset_positive.csv')
train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
reviews = train['text']
print("Набор данных готов")

# Кодирование категорий
label_encoder = LabelEncoder()
label_encoder.fit(class_names)
encoded_labels = label_encoder.transform(train['rubrics'])
y_train = utils.to_categorical(encoded_labels, nb_classes)

sequences = tokenizer.texts_to_sequences(reviews)
x_train = pad_sequences(sequences, maxlen=max_reviews_len)

print('начинается создание модели')

# Построение модели
model_cnn = Sequential()
model_cnn.add(Embedding(num_words, 256, input_length=max_reviews_len))
model_cnn.add(Conv1D(256, 5, activation='relu'))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(128, activation='relu'))
model_cnn.add(Dense(nb_classes, activation='softmax'))


model_cnn.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

model_cnn.summary()

# Определим обратный вызов ModelCheckpoint
model_cnn_save_path = './/model/best_model_cnn_positive.keras'
checkpoint_callback_cnn = ModelCheckpoint(model_cnn_save_path,
                                          monitor='val_accuracy',
                                          save_best_only=True,
                                          verbose=1)

# Обучение модели
history_cnn = model_cnn.fit(x_train,
                            y_train,
                            epochs=5,
                            batch_size=128,
                            validation_split=0.1,
                            callbacks=[checkpoint_callback_cnn])

# Графики
plt.plot(history_cnn.history['accuracy'],
         label='Доля верных ответов на обучающем наборе')
plt.plot(history_cnn.history['val_accuracy'],
         label='Доля верных ответов на проверочном наборе')
plt.xlabel('Эпоха обучения')
plt.ylabel('Доля верных ответов')
plt.legend()
plt.savefig('.//graphics/history_cnn_positive.png')
@@ -1,80 +0,0 @@
import pickle
import pandas as pd
from keras import Sequential
from keras.src.callbacks import ModelCheckpoint
from keras.src.utils import pad_sequences
from matplotlib import pyplot as plt
from tensorflow.keras.layers import Dense, Embedding, Dropout, GRU, Bidirectional
from tensorflow.keras import utils
from sklearn.preprocessing import LabelEncoder

# Максимальное количество слов
num_words = 10000
# Максимальная длина отзыва
max_reviews_len = 90
# Количество классов отзыва
nb_classes = 10

# Загрузка токенизатора
with open('..//tokenization/tokenizer_negative.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Загрузка названий классов
with open('..//class/class_names_negative.txt', 'r', encoding='utf-8') as file:
    class_names = [line.strip() for line in file.readlines()]

# Данные
train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
reviews = train['text']
print("Набор данных готов")

# Кодирование категорий
label_encoder = LabelEncoder()
label_encoder.fit(class_names)
encoded_labels = label_encoder.transform(train['rubrics'])
y_train = utils.to_categorical(encoded_labels, nb_classes)

sequences = tokenizer.texts_to_sequences(reviews)
x_train = pad_sequences(sequences, maxlen=max_reviews_len)

print('начинается создание модели')

# Построение модели
model_gru = Sequential()
model_gru.add(Embedding(num_words, 128, input_length=max_reviews_len))
model_gru.add(GRU(128, return_sequences=True))
model_gru.add(Dropout(0.25))
model_gru.add(GRU(64))
model_gru.add(Dense(nb_classes, activation='softmax'))

model_gru.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

model_gru.summary()

# Определим обратный вызов ModelCheckpoint
model_lstm_save_path = './/model/best_model_gru_negative.keras'
checkpoint_callback_gru = ModelCheckpoint(model_lstm_save_path,
                                          monitor='val_accuracy',
                                          save_best_only=True,
                                          verbose=1)

# Обучение модели
history_gru = model_gru.fit(x_train,
                            y_train,
                            epochs=10,
                            batch_size=128,
                            validation_split=0.1,
                            callbacks=[checkpoint_callback_gru])

# Графики
plt.plot(history_gru.history['accuracy'],
         label='Доля верных ответов на обучающем наборе')
plt.plot(history_gru.history['val_accuracy'],
         label='Доля верных ответов на проверочном наборе')
plt.xlabel('Эпоха обучения')
plt.ylabel('Доля верных ответов')
plt.legend()
plt.savefig('.//graphics/history_gru_negative.png')
@@ -1,80 +0,0 @@
import pickle
import pandas as pd
from keras import Sequential
from keras.src.callbacks import ModelCheckpoint
from keras.src.utils import pad_sequences
from matplotlib import pyplot as plt
from tensorflow.keras.layers import Dense, Embedding, GRU, Dropout
from tensorflow.keras import utils
from sklearn.preprocessing import LabelEncoder

# Максимальное количество слов
num_words = 10000
# Максимальная длина отзыва
max_reviews_len = 90
# Количество классов отзыва
nb_classes = 10

# Загрузка токенизатора
with open('..//tokenization/tokenizer_positive.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Загрузка названий классов
with open('..//class/class_names_positive.txt', 'r', encoding='utf-8') as file:
    class_names = [line.strip() for line in file.readlines()]

# Данные
train = pd.read_csv('../dataset/filtered/filtered_dataset_positive.csv')
train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
reviews = train['text']
print("Набор данных готов")

# Кодирование категорий
label_encoder = LabelEncoder()
label_encoder.fit(class_names)
encoded_labels = label_encoder.transform(train['rubrics'])
y_train = utils.to_categorical(encoded_labels, nb_classes)

sequences = tokenizer.texts_to_sequences(reviews)
x_train = pad_sequences(sequences, maxlen=max_reviews_len)

print('начинается создание модели')

# Построение модели
model_gru = Sequential()
model_gru.add(Embedding(num_words, 256, input_length=max_reviews_len))
model_gru.add(GRU(256, return_sequences=True))
model_gru.add(Dropout(0.25))
model_gru.add(GRU(128))
model_gru.add(Dense(nb_classes, activation='softmax'))

model_gru.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

model_gru.summary()

# Определим обратный вызов ModelCheckpoint
model_gru_save_path = './/model/best_model_gru_positive.keras'
checkpoint_callback_gru = ModelCheckpoint(model_gru_save_path,
                                          monitor='val_accuracy',
                                          save_best_only=True,
                                          verbose=1)

# Обучение модели
history_gru = model_gru.fit(x_train,
                            y_train,
                            epochs=5,
                            batch_size=128,
                            validation_split=0.1,
                            callbacks=[checkpoint_callback_gru])

# Графики
plt.plot(history_gru.history['accuracy'],
         label='Доля верных ответов на обучающем наборе')
plt.plot(history_gru.history['val_accuracy'],
         label='Доля верных ответов на проверочном наборе')
plt.xlabel('Эпоха обучения')
plt.ylabel('Доля верных ответов')
plt.legend()
plt.savefig('.//graphics/history_gru_positive.png')
@@ -1,79 +0,0 @@
import pickle
import pandas as pd
from keras import Sequential
from keras.src.callbacks import ModelCheckpoint
from keras.src.utils import pad_sequences
from matplotlib import pyplot as plt
from tensorflow.keras.layers import Dense, Embedding, SpatialDropout1D, LSTM
from tensorflow.keras import utils
from sklearn.preprocessing import LabelEncoder

# Максимальное количество слов
num_words = 10000
# Максимальная длина отзыва
max_reviews_len = 90
# Количество классов отзыва
nb_classes = 10

# Загрузка токенизатора
with open('..//tokenization/tokenizer_negative.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Загрузка названий классов
with open('..//class/class_names_negative.txt', 'r', encoding='utf-8') as file:
    class_names = [line.strip() for line in file.readlines()]

# Данные
train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
reviews = train['text']
print("Набор данных готов")

# Кодирование категорий
label_encoder = LabelEncoder()
label_encoder.fit(class_names)
encoded_labels = label_encoder.transform(train['rubrics'])
y_train = utils.to_categorical(encoded_labels, nb_classes)

sequences = tokenizer.texts_to_sequences(reviews)
x_train = pad_sequences(sequences, maxlen=max_reviews_len)

print('начинается создание модели')

# Построение модели
model_lstm = Sequential()
model_lstm.add(Embedding(num_words, 128, input_length=max_reviews_len))
model_lstm.add(SpatialDropout1D(0.2))
model_lstm.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(nb_classes, activation='softmax'))

model_lstm.compile(optimizer='adam',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])

model_lstm.summary()

# Определим обратный вызов ModelCheckpoint
model_lstm_save_path = './/model/best_model_lstm_negative.keras'
checkpoint_callback_lstm = ModelCheckpoint(model_lstm_save_path,
                                           monitor='val_accuracy',
                                           save_best_only=True,
                                           verbose=1)

# Обучение модели
history_lstm = model_lstm.fit(x_train,
                              y_train,
                              epochs=10,
                              batch_size=128,
                              validation_split=0.1,
                              callbacks=[checkpoint_callback_lstm])

# Графики
plt.plot(history_lstm.history['accuracy'],
         label='Доля верных ответов на обучающем наборе')
plt.plot(history_lstm.history['val_accuracy'],
         label='Доля верных ответов на проверочном наборе')
plt.xlabel('Эпоха обучения')
plt.ylabel('Доля верных ответов')
plt.legend()
plt.savefig('.//graphics/history_lstm_negative.png')
@@ -1,79 +0,0 @@
import pickle
import pandas as pd
from keras import Sequential
from keras.src.callbacks import ModelCheckpoint
from keras.src.utils import pad_sequences
from matplotlib import pyplot as plt
from tensorflow.keras.layers import Dense, Embedding, SpatialDropout1D, LSTM
from tensorflow.keras import utils
from sklearn.preprocessing import LabelEncoder

# Максимальное количество слов
num_words = 10000
# Максимальная длина отзыва
max_reviews_len = 90
# Количество классов отзыва
nb_classes = 10

# Загрузка токенизатора
with open('..//tokenization/tokenizer_positive.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Загрузка названий классов
with open('..//class/class_names_positive.txt', 'r', encoding='utf-8') as file:
    class_names = [line.strip() for line in file.readlines()]

# Данные
train = pd.read_csv('../dataset/filtered/filtered_dataset_positive.csv')
train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
reviews = train['text']
print("Набор данных готов")

# Кодирование категорий
label_encoder = LabelEncoder()
label_encoder.fit(class_names)
encoded_labels = label_encoder.transform(train['rubrics'])
y_train = utils.to_categorical(encoded_labels, nb_classes)

sequences = tokenizer.texts_to_sequences(reviews)
x_train = pad_sequences(sequences, maxlen=max_reviews_len)

print('начинается создание модели')

# Построение модели
model_lstm = Sequential()
model_lstm.add(Embedding(num_words, 256, input_length=max_reviews_len))
model_lstm.add(SpatialDropout1D(0.2))
model_lstm.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(nb_classes, activation='softmax'))

model_lstm.compile(optimizer='adam',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])

model_lstm.summary()

# Определим обратный вызов ModelCheckpoint
model_lstm_save_path = './/model/best_model_lstm_positive.keras'
checkpoint_callback_lstm = ModelCheckpoint(model_lstm_save_path,
                                           monitor='val_accuracy',
                                           save_best_only=True,
                                           verbose=1)

# Обучение модели
history_lstm = model_lstm.fit(x_train,
                              y_train,
                              epochs=5,
                              batch_size=128,
                              validation_split=0.1,
                              callbacks=[checkpoint_callback_lstm])

# Графики
plt.plot(history_lstm.history['accuracy'],
         label='Доля верных ответов на обучающем наборе')
plt.plot(history_lstm.history['val_accuracy'],
         label='Доля верных ответов на проверочном наборе')
plt.xlabel('Эпоха обучения')
plt.ylabel('Доля верных ответов')
plt.legend()
plt.savefig('.//graphics/history_lstm_positive.png')
@@ -1,54 +1,124 @@
 import kaggle
 import zipfile
 import os
 import pandas as pd

-# Загрузка датасета
-dataset = pd.read_csv('../dataset/geo-reviews-dataset-2023.csv')

-# Создание нового DataFrame с фильтрацией по условию rating
-filtered_dataset_positive = dataset[dataset['rating'] > 3]
-filtered_dataset_negative = dataset[dataset['rating'] < 3]
+class KaggleDatasetDownloader:
+    def __init__(self, dataset, file_name, download_path):
+        self.dataset = dataset
+        self.file_name = file_name
+        self.download_path = download_path

-# Удаление слов после запятой в столбце 'rubrics'
-filtered_dataset_positive.loc[:, 'rubrics'] = filtered_dataset_positive['rubrics'].apply(lambda x: x.split(';')[0] if pd.notnull(x) else x)
-filtered_dataset_negative.loc[:, 'rubrics'] = filtered_dataset_negative['rubrics'].apply(lambda x: x.split(';')[0] if pd.notnull(x) else x)
+    def authenticate_and_download(self):
+        kaggle.api.authenticate()
+        kaggle.api.dataset_download_file(self.dataset, self.file_name, path=self.download_path)
+        self.extract_if_needed()

-# Переименование рубрик
-rename_mapping = {'Пиццерия': 'Быстрое питание', 'Ресторан': 'Быстрое питание', 'Кафе': 'Быстрое питание', 'Магазин продуктов': 'Супермаркет', 'Аптека': 'Медцентр, клиника', 'Стоматологическая клиника': 'Медцентр, клиника'}
-filtered_dataset_positive.loc[:, 'rubrics'] = filtered_dataset_positive['rubrics'].replace(rename_mapping)
-filtered_dataset_negative.loc[:, 'rubrics'] = filtered_dataset_negative['rubrics'].replace(rename_mapping)

-# Получение уникальных слов из столбца 'rubrics'
-unique_rubrics_positive = set(filtered_dataset_positive['rubrics'].value_counts().head(10).index.tolist())
-unique_rubrics_negative = set(filtered_dataset_negative['rubrics'].value_counts().head(10).index.tolist())

-# Сохранение строк, содержащих рубрики из unique_rubrics_positive и unique_rubrics_negative
-filtered_dataset_positive = filtered_dataset_positive[filtered_dataset_positive['rubrics'].isin(unique_rubrics_positive)]
-filtered_dataset_negative = filtered_dataset_negative[filtered_dataset_negative['rubrics'].isin(unique_rubrics_negative)]

-# Ограничение количества строк одного типа rubrics
-filtered_dataset_negative = (filtered_dataset_negative.groupby('rubrics').head(1500).reset_index(drop=True))
-filtered_dataset_positive = (filtered_dataset_positive.groupby('rubrics').head(15000).reset_index(drop=True))
+    def extract_if_needed(self):
+        zip_path = os.path.join(self.download_path, f'{self.file_name}.zip')
+        if os.path.exists(zip_path):
+            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+                zip_ref.extractall(self.download_path)
+            os.remove(zip_path)


-# Вывод количества строк для каждой rubrics в unique_rubrics_positive
-for rubric in unique_rubrics_positive:
-    count = filtered_dataset_positive[filtered_dataset_positive['rubrics'] == rubric].shape[0]
-    print(f"Количество строк с rubrics '{rubric}' в filtered_dataset_positive: {count}")
+class DatasetProcessor:
+    def __init__(self, dataset_path):
+        self.dataset = pd.read_csv(dataset_path)

-# Вывод количества строк для каждой rubrics в unique_rubrics_negative
-for rubric in unique_rubrics_negative:
-    count = filtered_dataset_negative[filtered_dataset_negative['rubrics'] == rubric].shape[0]
-    print(f"Количество строк с rubrics '{rubric}' в filtered_dataset_negative: {count}")
+    def filter_and_process(self):
+        filtered_positive = self.dataset[self.dataset['rating'] > 3].copy()
+        filtered_negative = self.dataset[self.dataset['rating'] < 3].copy()

-# Сохранение уникальных слов в файл class_positive.txt с кодировкой UTF-8
-with open('class/class_positive.txt', 'w', encoding='utf-8') as file:
-    for rubric in unique_rubrics_positive:
-        file.write(f"{rubric}\n")
+        filtered_positive = self.clean_rubrics(filtered_positive)
+        filtered_negative = self.clean_rubrics(filtered_negative)

-# Сохранение уникальных слов в файл class_negative.txt с кодировкой UTF-8
-with open('class/class_negative.txt', 'w', encoding='utf-8') as file:
-    for rubric in unique_rubrics_negative:
-        file.write(f"{rubric}\n")
+        rename_mapping = {
+            'Пиццерия': 'Быстрое питание', 'Ресторан': 'Быстрое питание',
+            'Кафе': 'Быстрое питание', 'Магазин продуктов': 'Супермаркет',
+            'Аптека': 'Медцентр, клиника', 'Стоматологическая клиника': 'Медцентр, клиника'
+        }

-# Сохранение отфильтрованного DataFrame в новый CSV файл
-filtered_dataset_positive.to_csv('../dataset/filtered/filtered_dataset_positive.csv', index=False)
-filtered_dataset_negative.to_csv('../dataset/filtered/filtered_dataset_negative.csv', index=False)
+        filtered_positive = self.rename_rubrics(filtered_positive, rename_mapping)
+        filtered_negative = self.rename_rubrics(filtered_negative, rename_mapping)

+        unique_rubrics_positive = self.get_top_rubrics(filtered_positive)
+        unique_rubrics_negative = self.get_top_rubrics(filtered_negative)

+        filtered_positive = self.filter_by_rubrics(filtered_positive, unique_rubrics_positive)
+        filtered_negative = self.filter_by_rubrics(filtered_negative, unique_rubrics_negative)

+        filtered_positive = self.limit_rows_per_rubric(filtered_positive, 15000)
+        filtered_negative = self.limit_rows_per_rubric(filtered_negative, 1500)

+        self.print_rubric_counts(filtered_positive, unique_rubrics_positive)
+        self.print_rubric_counts(filtered_negative, unique_rubrics_negative)

+        return filtered_positive, filtered_negative, unique_rubrics_positive, unique_rubrics_negative

+    @staticmethod
+    def clean_rubrics(dataset):
+        dataset.loc[:, 'rubrics'] = dataset['rubrics'].apply(lambda x: x.split(';')[0] if pd.notnull(x) else x)
+        return dataset

+    @staticmethod
+    def rename_rubrics(dataset, rename_mapping):
+        dataset.loc[:, 'rubrics'] = dataset['rubrics'].replace(rename_mapping)
+        return dataset

+    @staticmethod
+    def get_top_rubrics(dataset, top_n=10):
+        return set(dataset['rubrics'].value_counts().head(top_n).index.tolist())

+    @staticmethod
+    def filter_by_rubrics(dataset, rubrics):
+        return dataset[dataset['rubrics'].isin(rubrics)]

+    @staticmethod
+    def limit_rows_per_rubric(dataset, limit):
+        return dataset.groupby('rubrics').head(limit).reset_index(drop=True)

+    @staticmethod
+    def print_rubric_counts(dataset, rubrics):
+        for rubric in rubrics:
+            count = dataset[dataset['rubrics'] == rubric].shape[0]
+            print(f"Количество строк с rubrics '{rubric}': {count}")


+class FileSaver:
+    @staticmethod
+    def save_rubrics_to_file(rubrics, file_path):
+        with open(file_path, 'w', encoding='utf-8') as file:
+            for rubric in rubrics:
+                file.write(f"{rubric}\n")

+    @staticmethod
+    def save_dataset_to_csv(dataset, file_path):
+        dataset.to_csv(file_path, index=False)


+def main():
+    dataset = 'kyakovlev/yandex-geo-reviews-dataset-2023'
+    file_name = 'geo-reviews-dataset-2023.csv'
+    download_path = '../dataset'
+    dataset_path = os.path.join(download_path, file_name)

+    # Скачивание и распаковка датасета
+    downloader = KaggleDatasetDownloader(dataset, file_name, download_path)
+    downloader.authenticate_and_download()

+    # Обработка датасета
+    processor = DatasetProcessor(dataset_path)
+    filtered_positive, filtered_negative, unique_rubrics_positive, unique_rubrics_negative = processor.filter_and_process()

+    # Сохранение результатов
+    FileSaver.save_rubrics_to_file(unique_rubrics_positive, 'class/class_positive.txt')
+    FileSaver.save_rubrics_to_file(unique_rubrics_negative, 'class/class_negative.txt')
+    FileSaver.save_dataset_to_csv(filtered_positive, '../dataset/filtered/filtered_dataset_positive.csv')
+    FileSaver.save_dataset_to_csv(filtered_negative, '../dataset/filtered/filtered_dataset_negative.csv')


+if __name__ == "__main__":
+    main()
@@ -1,18 +0,0 @@
import kaggle
import zipfile
import os

# Аутентификация
kaggle.api.authenticate()

# Скачивание конкретного файла из набора данных
dataset = 'kyakovlev/yandex-geo-reviews-dataset-2023'
file_name = 'geo-reviews-dataset-2023.csv'
kaggle.api.dataset_download_file(dataset, file_name, path='../dataset')

# Распаковка архива, если файл сжат
zip_path = f'./{file_name}.zip'
if os.path.exists(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall('./')
    os.remove(zip_path)  # Удаление zip файла после распаковки
neural_network/models/create_model.py (new file, 271 lines)
@@ -0,0 +1,271 @@
import pickle
import pandas as pd
from keras import Sequential
from keras.src.callbacks import ModelCheckpoint
from keras.src.utils import pad_sequences
from matplotlib import pyplot as plt
from tensorflow.keras.layers import Dense, Embedding, SpatialDropout1D, LSTM, GRU, Dropout, Conv1D, GlobalMaxPooling1D
from tensorflow.keras import utils
from sklearn.preprocessing import LabelEncoder

# Определение констант
NUM_WORDS = 10000
MAX_REVIEWS_LEN = 90
NB_CLASSES = 10

class DataProcessor:
    def __init__(self, tokenizer_path, class_names_path, dataset_path):
        self.tokenizer = self.load_tokenizer(tokenizer_path)
        self.class_names = self.load_class_names(class_names_path)
        self.dataset = self.load_dataset(dataset_path)
        self.label_encoder = LabelEncoder()

    @staticmethod
    def load_tokenizer(tokenizer_path):
        with open(tokenizer_path, 'rb') as handle:
            return pickle.load(handle)

    @staticmethod
    def load_class_names(class_names_path):
        with open(class_names_path, 'r', encoding='utf-8') as file:
            return [line.strip() for line in file.readlines()]

    @staticmethod
    def load_dataset(dataset_path):
        train = pd.read_csv(dataset_path)
        train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
        return train

    def preprocess_data(self):
        reviews = self.dataset['text']
        encoded_labels = self.encode_labels(self.dataset['rubrics'])
        y_train = utils.to_categorical(encoded_labels, NB_CLASSES)
        sequences = self.tokenizer.texts_to_sequences(reviews)
        x_train = pad_sequences(sequences, maxlen=MAX_REVIEWS_LEN)
        return x_train, y_train

    def encode_labels(self, labels):
        self.label_encoder.fit(self.class_names)
        return self.label_encoder.transform(labels)


class CNNModelNegative:
    def __init__(self, num_words, max_reviews_len, nb_classes):
        self.num_words = num_words
        self.max_reviews_len = max_reviews_len
        self.nb_classes = nb_classes
        self.model = self.build_model()

    def build_model(self):
        model = Sequential()
        model.add(Embedding(self.num_words, 128, input_length=self.max_reviews_len))
        model.add(Conv1D(128, 5, activation='relu'))
        model.add(GlobalMaxPooling1D())
        model.add(Dense(64, activation='relu'))
        model.add(Dense(self.nb_classes, activation='softmax'))

        model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
        return model

    def train(self, x_train, y_train, epochs, batch_size, validation_split, model_save_path):
        checkpoint_callback = ModelCheckpoint(model_save_path, monitor='val_accuracy', save_best_only=True, verbose=1)
        history = self.model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split, callbacks=[checkpoint_callback])
        return history


class GRUModelNegative:
    def __init__(self, num_words, max_reviews_len, nb_classes):
        self.num_words = num_words
        self.max_reviews_len = max_reviews_len
        self.nb_classes = nb_classes
        self.model = self.build_model()

    def build_model(self):
        model = Sequential()
        model.add(Embedding(self.num_words, 128, input_length=self.max_reviews_len))
        model.add(GRU(128, return_sequences=True))
        model.add(Dropout(0.25))
        model.add(GRU(64))
        model.add(Dense(self.nb_classes, activation='softmax'))
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    def train(self, x_train, y_train, epochs, batch_size, validation_split, model_save_path):
        checkpoint_callback = ModelCheckpoint(model_save_path, monitor='val_accuracy', save_best_only=True, verbose=1)
        history = self.model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split, callbacks=[checkpoint_callback])
        return history

class LSTMModelNegative:
    def __init__(self, num_words, max_reviews_len, nb_classes):
        self.num_words = num_words
        self.max_reviews_len = max_reviews_len
        self.nb_classes = nb_classes
        self.model = self.build_model()

    def build_model(self):
        model = Sequential()
        model.add(Embedding(self.num_words, 128, input_length=self.max_reviews_len))
        model.add(SpatialDropout1D(0.2))
        model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
        model.add(Dense(self.nb_classes, activation='softmax'))
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    def train(self, x_train, y_train, epochs, batch_size, validation_split, model_save_path):
        checkpoint_callback = ModelCheckpoint(model_save_path, monitor='val_accuracy', save_best_only=True, verbose=1)
        history = self.model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split, callbacks=[checkpoint_callback])
        return history


class LSTMModelPositive:
    def __init__(self, num_words, max_reviews_len, nb_classes):
        self.num_words = num_words
        self.max_reviews_len = max_reviews_len
        self.nb_classes = nb_classes
        self.model = self.build_model()

    def build_model(self):
        model = Sequential()
        model.add(Embedding(self.num_words, 256, input_length=self.max_reviews_len))
        model.add(SpatialDropout1D(0.2))
        model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
        model.add(Dense(self.nb_classes, activation='softmax'))
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    def train(self, x_train, y_train, epochs, batch_size, validation_split, model_save_path):
        checkpoint_callback = ModelCheckpoint(model_save_path, monitor='val_accuracy', save_best_only=True, verbose=1)
        history = self.model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split, callbacks=[checkpoint_callback])
        return history


class GRUModelPositive:
    def __init__(self, num_words, max_reviews_len, nb_classes):
        self.num_words = num_words
        self.max_reviews_len = max_reviews_len
        self.nb_classes = nb_classes
        self.model = self.build_model()

    def build_model(self):
        model = Sequential()
        model.add(Embedding(self.num_words, 256, input_length=self.max_reviews_len))
        model.add(GRU(256, return_sequences=True))
        model.add(Dropout(0.25))
        model.add(GRU(128))
        model.add(Dense(self.nb_classes, activation='softmax'))
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    def train(self, x_train, y_train, epochs, batch_size, validation_split, model_save_path):
        checkpoint_callback = ModelCheckpoint(model_save_path, monitor='val_accuracy', save_best_only=True, verbose=1)
        history = self.model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split, callbacks=[checkpoint_callback])
        return history


class CNNModelPositive:
    def __init__(self, num_words, max_reviews_len, nb_classes):
        self.num_words = num_words
        self.max_reviews_len = max_reviews_len
        self.nb_classes = nb_classes
        self.model = self.build_model()

    def build_model(self):
        model = Sequential()
        model.add(Embedding(self.num_words, 256, input_length=self.max_reviews_len))
        model.add(Conv1D(256, 5, activation='relu'))
        model.add(GlobalMaxPooling1D())
        model.add(Dense(128, activation='relu'))
        model.add(Dense(self.nb_classes, activation='softmax'))
        model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
        return model

    def train(self, x_train, y_train, epochs, batch_size, validation_split, model_save_path):
        checkpoint_callback = ModelCheckpoint(model_save_path,
                                              monitor='val_accuracy',
                                              save_best_only=True,
                                              verbose=1)
        history = self.model.fit(x_train,
                                 y_train,
                                 epochs=epochs,
                                 batch_size=batch_size,
                                 validation_split=validation_split,
                                 callbacks=[checkpoint_callback])
        return history


class Plotter:
    @staticmethod
    def plot_history(history, save_path):
        plt.plot(history.history['accuracy'],
                 label='Доля верных ответов на обучающем наборе')
        plt.plot(history.history['val_accuracy'],
                 label='Доля верных ответов на проверочном наборе')
        plt.xlabel('Эпоха обучения')
        plt.ylabel('Доля верных ответов')
        plt.legend()
        plt.savefig(save_path)
        plt.show()


def main():

    tokenizer_path_positive = '../tokenization/tokenizer_positive.pickle'
    class_names_path_positive = '../class/class_names_positive.txt'
    dataset_path_positive = '../dataset/filtered/filtered_dataset_positive.csv'

    model_save_path_lstm_positive = './model/best_model_lstm_positive.keras'
    plot_save_path_lstm_positive = './graphics/history_lstm_positive.png'

    model_save_path_cnn_positive = './model/best_model_cnn_positive.keras'
    plot_save_path_cnn_positive = './graphics/history_cnn_positive.png'

    model_save_path_gru_positive = './model/best_model_gru_positive.keras'
    plot_save_path_gru_positive = './graphics/history_gru_positive.png'

    tokenizer_path_negative = '../tokenization/tokenizer_negative.pickle'
    class_names_path_negative = '../class/class_names_negative.txt'
    dataset_path_negative = '../dataset/filtered/filtered_dataset_negative.csv'

    model_save_path_lstm_negative = './model/best_model_lstm_negative.keras'
    plot_save_path_lstm_negative = './graphics/history_lstm_negative.png'

    model_save_path_cnn_negative = './model/best_model_cnn_negative.keras'
    plot_save_path_cnn_negative = './graphics/history_cnn_negative.png'

    model_save_path_gru_negative = './model/best_model_gru_negative.keras'
    plot_save_path_gru_negative = './graphics/history_gru_negative.png'

    data_processor_negative = DataProcessor(tokenizer_path_negative, class_names_path_negative, dataset_path_negative)
    x_train, y_train = data_processor_negative.preprocess_data()

    cnn_model_negative = CNNModelNegative(NUM_WORDS, MAX_REVIEWS_LEN, NB_CLASSES)
    history = cnn_model_negative.train(x_train, y_train, epochs=10, batch_size=128, validation_split=0.1, model_save_path=model_save_path_cnn_negative)
    Plotter.plot_history(history, plot_save_path_cnn_negative)
    print("Training and plotting completed successfully: CNNModelNegative.")

    lstm_model_negative = LSTMModelNegative(NUM_WORDS, MAX_REVIEWS_LEN, NB_CLASSES)
    history = lstm_model_negative.train(x_train, y_train, epochs=10, batch_size=128, validation_split=0.1, model_save_path=model_save_path_lstm_negative)
    Plotter.plot_history(history, plot_save_path_lstm_negative)
    print("Training and plotting completed successfully: LSTMModelNegative.")

    gru_model_negative = GRUModelNegative(NUM_WORDS, MAX_REVIEWS_LEN, NB_CLASSES)
    history = gru_model_negative.train(x_train, y_train, epochs=10, batch_size=128, validation_split=0.1, model_save_path=model_save_path_gru_negative)
    Plotter.plot_history(history, plot_save_path_gru_negative)
    print("Training and plotting completed successfully: GRUModelNegative.")

    data_processor_positive = DataProcessor(tokenizer_path_positive, class_names_path_positive, dataset_path_positive)
    x_train, y_train = data_processor_positive.preprocess_data()

    cnn_model_positive = CNNModelPositive(NUM_WORDS, MAX_REVIEWS_LEN, NB_CLASSES)
    history = cnn_model_positive.train(x_train, y_train, epochs=5, batch_size=128, validation_split=0.1, model_save_path=model_save_path_cnn_positive)
    Plotter.plot_history(history, plot_save_path_cnn_positive)
    print("Training and plotting completed successfully: CNNModelPositive.")

    lstm_model_positive = LSTMModelPositive(NUM_WORDS, MAX_REVIEWS_LEN, NB_CLASSES)
    history = lstm_model_positive.train(x_train, y_train, epochs=5, batch_size=128, validation_split=0.1, model_save_path=model_save_path_lstm_positive)
    Plotter.plot_history(history, plot_save_path_lstm_positive)
    print("Training and plotting completed successfully: LSTMModelPositive.")

    gru_model_positive = GRUModelPositive(NUM_WORDS, MAX_REVIEWS_LEN, NB_CLASSES)
    history = gru_model_positive.train(x_train, y_train, epochs=5, batch_size=128, validation_split=0.1, model_save_path=model_save_path_gru_positive)
    Plotter.plot_history(history, plot_save_path_gru_positive)
    print("Training and plotting completed successfully: GRUModelPositive.")


if __name__ == "__main__":
    main()
neural_network/tokenization/tokenizer.py (new file, 81 lines)
@@ -0,0 +1,81 @@
import pandas as pd
from keras.src.legacy.preprocessing.text import Tokenizer
from nltk import word_tokenize, SnowballStemmer
from nltk.corpus import stopwords
import string
import pickle
from pymystem3 import Mystem

# Константы
NUM_WORDS = 10000

class DataLoader:
    def __init__(self, dataset_path):
        self.dataset_path = dataset_path

    def load_data(self):
        return pd.read_csv(self.dataset_path)


class TextProcessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('russian'))
        self.punctuation = set(string.punctuation)
        self.mystem = Mystem()
        self.stemmer = SnowballStemmer("russian")

    def remove_stopwords_and_punctuation(self, reviews):
        filtered_tokens = []
        for review in reviews:
            words = word_tokenize(review)
            filtered_words = [word for word in words if
                              word.lower() not in self.stop_words and '\\n1' and word != "''" and word != '«' and word != '»' and word not in self.punctuation]
            filtered_tokens.extend(filtered_words)
        return filtered_tokens

    def lemmatize_tokens(self, tokens):
        lemmatized_tokens = [self.mystem.lemmatize(token)[0] for token in tokens]
        return lemmatized_tokens

    def stem_tokens(self, tokens):
        stemmed_tokens = [self.stemmer.stem(word) for word in tokens]
        return stemmed_tokens


class TokenizerSaver:
    def __init__(self, num_words):
        self.num_words = num_words
        self.tokenizer = Tokenizer(num_words=self.num_words)

    def fit_on_texts(self, texts):
        self.tokenizer.fit_on_texts(texts)

    def save_tokenizer(self, save_path):
        with open(save_path, 'wb') as handle:
            pickle.dump(self.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


def process_and_save_tokenizer(dataset_path, save_path):
    # Load data
    data_loader = DataLoader(dataset_path)
    dataset = data_loader.load_data()

    # Process text
    text_processor = TextProcessor()
    reviews = dataset['text']
    filtered_tokens = text_processor.remove_stopwords_and_punctuation(reviews)

    # Tokenize and save
    tokenizer_saver = TokenizerSaver(NUM_WORDS)
    tokenizer_saver.fit_on_texts(filtered_tokens)
    tokenizer_saver.save_tokenizer(save_path)


def main():
    positive_dataset_path = '../dataset/filtered/filtered_dataset_positive.csv'
    negative_dataset_path = '../dataset/filtered/filtered_dataset_negative.csv'
    positive_tokenizer_path = './tokenizer_positive.pickle'
    negative_tokenizer_path = './tokenizer_negative.pickle'

    # Process and save tokenizers
    process_and_save_tokenizer(positive_dataset_path, positive_tokenizer_path)
    process_and_save_tokenizer(negative_dataset_path, negative_tokenizer_path)


if __name__ == "__main__":
    main()
@@ -1,60 +0,0 @@
import pandas as pd
from keras.src.legacy.preprocessing.text import Tokenizer
from nltk import word_tokenize, SnowballStemmer
from nltk.corpus import stopwords
import string
import pickle

from pymystem3 import Mystem

# Максимальное количество слов
num_words = 10000
# Количество классов отзыва
nb_classes = 10

def remove_stopwords_and_punctuation(reviews):
    stop_words = set(stopwords.words('russian'))
    punctuation = set(string.punctuation)
    filtered_tokens = []

    # Удаление стоп слов и пунктуаций
    for review in reviews:
        words = word_tokenize(review)
        filtered_words = [word for word in words if
                          word.lower() not in stop_words and '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation]
        filtered_tokens.extend(filtered_words)

    return filtered_tokens

def lemmatize_token(filtered_tokens):
    mystem = Mystem()
    lemmatized_tokens = []
    for token in filtered_tokens:
        lemmatized = mystem.lemmatize(token)[0]
        lemmatized_tokens.append(lemmatized)
        print(lemmatized)
    return lemmatized_tokens

def stemmer_token(filtered_tokens):
    stemmer = SnowballStemmer("russian")
    stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
    return stemmer_tokens

# Данные
train = pd.read_csv('..//dataset/filtered/filtered_dataset_negative.csv')
reviews = train['text']

filtered_tokens = remove_stopwords_and_punctuation(reviews)

# lemmatized_tokens = lemmatize_token(filtered_tokens)
# print(lemmatized_tokens[:10])

# stemmer_tokens = stemmer_token(filtered_tokens)
# print(stemmer_tokens[:10])

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(filtered_tokens)

# saving
with open('.//tokenizer_negative.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
@@ -1,60 +0,0 @@
import pandas as pd
from keras.src.legacy.preprocessing.text import Tokenizer
from nltk import word_tokenize, SnowballStemmer
from nltk.corpus import stopwords
import string
import pickle

from pymystem3 import Mystem

# Максимальное количество слов
num_words = 10000
# Количество классов отзыва
nb_classes = 10

def remove_stopwords_and_punctuation(reviews):
    stop_words = set(stopwords.words('russian'))
    punctuation = set(string.punctuation)
    filtered_tokens = []

    # Удаление стоп слов и пунктуаций
    for review in reviews:
        words = word_tokenize(review)
        filtered_words = [word for word in words if
                          word.lower() not in stop_words and '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation]
        filtered_tokens.extend(filtered_words)

    return filtered_tokens

def lemmatize_token(filtered_tokens):
    mystem = Mystem()
    lemmatized_tokens = []
    for token in filtered_tokens:
        lemmatized = mystem.lemmatize(token)[0]
        lemmatized_tokens.append(lemmatized)
        print(lemmatized)
    return lemmatized_tokens

def stemmer_token(filtered_tokens):
    stemmer = SnowballStemmer("russian")
    stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
    return stemmer_tokens

# Данные
train = pd.read_csv('..//dataset/filtered/filtered_dataset_positive.csv')
reviews = train['text']

filtered_tokens = remove_stopwords_and_punctuation(reviews)

# lemmatized_tokens = lemmatize_token(filtered_tokens)
# print(lemmatized_tokens[:10])

# stemmer_tokens = stemmer_token(filtered_tokens)
# print(stemmer_tokens[:10])

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(filtered_tokens)

# saving
with open('.//tokenizer_positive.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)