diff --git a/model.py b/model.py
index 1f3cb2f..d415d6b 100644
--- a/model.py
+++ b/model.py
@@ -8,7 +8,7 @@ from keras.src.utils import pad_sequences
 model = tf.keras.models.load_model('.//neural_network/create_lstm/model/best_model_lstm_negative.keras')
 
 # Загрузка токенизатора
-with open('.//neural_network/create_lstm/tokenizer/tokenizer_lstm_lstm_negative.pickle', 'rb') as handle:
+with open('.//neural_network/create_lstm/tokenization/tokenizer_lstm_lstm_negative.pickle', 'rb') as handle:
     tokenizer = pickle.load(handle)
 
 # Загрузка названий классов
diff --git a/neural_network/class/class_negative.py b/neural_network/class/class_negative.py
new file mode 100644
index 0000000..73c060a
--- /dev/null
+++ b/neural_network/class/class_negative.py
@@ -0,0 +1,16 @@
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder
+
+# Данные
+train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
+
+label_encoder = LabelEncoder()
+train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
+
+# Сохраняем названия классов
+class_names = label_encoder.classes_
+
+# Сохраняем названия классов в текстовый файл
+with open('.//class_names_negative.txt', 'w', encoding='utf-8') as file:
+    for class_name in class_names:
+        file.write(f"{class_name}\n")
diff --git a/neural_network/class/class_positive.py b/neural_network/class/class_positive.py
new file mode 100644
index 0000000..86521d3
--- /dev/null
+++ b/neural_network/class/class_positive.py
@@ -0,0 +1,16 @@
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder
+
+# Данные
+train = pd.read_csv('../dataset/filtered/filtered_dataset_positive.csv')
+
+label_encoder = LabelEncoder()
+train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
+
+# Сохраняем названия классов
+class_names = label_encoder.classes_
+
+# Сохраняем названия классов в текстовый файл
+with open('.//class_names_positive.txt', 'w', encoding='utf-8') as file:
+    for class_name in class_names:
+        file.write(f"{class_name}\n")
diff --git a/neural_network/create_cnn/create_model_cnn_negative.py b/neural_network/create_cnn/create_model_cnn_negative.py
new file mode 100644
index 0000000..3e94bb1
--- /dev/null
+++ b/neural_network/create_cnn/create_model_cnn_negative.py
@@ -0,0 +1,151 @@
+import pickle
+
+import pandas as pd
+from keras import Sequential
+from keras.src.callbacks import ModelCheckpoint
+from keras.src.legacy.preprocessing.text import Tokenizer
+from keras.src.saving import load_model
+from keras.src.utils import pad_sequences
+from matplotlib import pyplot as plt
+from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D
+from sklearn.preprocessing import LabelEncoder
+from tensorflow.keras import utils
+from nltk import word_tokenize, SnowballStemmer
+from nltk.corpus import stopwords
+import string
+
+from pymystem3 import Mystem
+
+# Максимальное количество слов
+num_words = 10000
+# Максимальная длина отзыва
+max_reviews_len = 90
+# Количество классов отзыва
+nb_classes = 10
+
+def remove_stopwords_and_punctuation(reviews):
+    stop_words = set(stopwords.words('russian'))
+    punctuation = set(string.punctuation)
+    filtered_tokens = []
+
+    # Удаление стоп слов и пунктуаций
+    for review in reviews:
+        words = word_tokenize(review)
+        filtered_words = [word for word in words if
+                          word.lower() not in stop_words and word != '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation]
+        filtered_tokens.extend(filtered_words)
+
+    return filtered_tokens
+
+def lemmatize_token(filtered_tokens):
+    mystem = Mystem()
+    lemmatized_tokens = []
+    for token in filtered_tokens:
+        lemmatized = mystem.lemmatize(token)[0]
+        lemmatized_tokens.append(lemmatized)
+        print(lemmatized)
+    return lemmatized_tokens
+
+def stemmer_token(filtered_tokens):
+    stemmer = SnowballStemmer("russian")
+    stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
+    return stemmer_tokens
+
+# Данные
+train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
+train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
+reviews = train['text']
+print("Набор данных готов")
+
+filtered_tokens = remove_stopwords_and_punctuation(reviews)
+print(filtered_tokens[:10])
+
+# lemmatized_tokens = lemmatize_token(filtered_tokens)
+# print(lemmatized_tokens[:10])
+
+# stemmer_tokens = stemmer_token(filtered_tokens)
+# print(stemmer_tokens[:10])
+
+label_encoder = LabelEncoder()
+train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
+
+# Сохраняем названия классов
+class_names = label_encoder.classes_
+
+y_train = utils.to_categorical(train['rubrics'], nb_classes)
+
+tokenizer = Tokenizer(num_words=num_words)
+tokenizer.fit_on_texts(filtered_tokens)
+
+sequences = tokenizer.texts_to_sequences(reviews)
+
+x_train = pad_sequences(sequences, maxlen=max_reviews_len)
+
+print('начинается создание модели')
+
+# Построение модели
+model_cnn = Sequential()
+model_cnn.add(Embedding(num_words, 128, input_length=max_reviews_len))
+model_cnn.add(Conv1D(128, 5, activation='relu'))
+model_cnn.add(GlobalMaxPooling1D())
+model_cnn.add(Dense(64, activation='relu'))
+model_cnn.add(Dense(nb_classes, activation='softmax'))
+
+
+model_cnn.compile(optimizer='adam',
+                  loss='categorical_crossentropy',
+                  metrics=['accuracy'])
+
+model_cnn.summary()
+
+# Определим обратный вызов ModelCheckpoint
+model_cnn_save_path = './/model/best_model_cnn_negative.keras'
+checkpoint_callback_cnn = ModelCheckpoint(model_cnn_save_path,
+                                          monitor='val_accuracy',
+                                          save_best_only=True,
+                                          verbose=1)
+
+# Обучение модели
+history_cnn = model_cnn.fit(x_train,
+                            y_train,
+                            epochs=10,
+                            batch_size=128,
+                            validation_split=0.1,
+                            callbacks=[checkpoint_callback_cnn])
+
+# Графики
+plt.plot(history_cnn.history['accuracy'],
+         label='Доля верных ответов на обучающем наборе')
+plt.plot(history_cnn.history['val_accuracy'],
+         label='Доля верных ответов на проверочном наборе')
+plt.xlabel('Эпоха обучения')
+plt.ylabel('Доля верных ответов')
+plt.legend()
+plt.show()
+
+# Загрузка модели
+model_cnn = load_model('.//model/best_model_cnn_negative.keras')
+
+# Пример текста отзыва пользователя
+user_review = "Не люблю пьяных людей, когда они рядом, всегда будет что то плохое"
+
+# Подготовка отзыва пользователя
+filtered_tokens = remove_stopwords_and_punctuation([user_review])
+sequences = tokenizer.texts_to_sequences(filtered_tokens)
+x_user = pad_sequences(sequences, maxlen=max_reviews_len)
+
+# Получение вероятности принадлежности отзыва пользователя к разным классам
+predicted_probabilities = model_cnn.predict(x_user)
+
+# Вывод вероятностей с названиями классов
+for class_name, prob in zip(class_names, predicted_probabilities[0]):
+    print(f"Вероятность отзыва относится к классу '{class_name}': {prob}")
+
+# Сохраняем названия классов в текстовый файл
+with open('.//class/class_names_cnn_negative.txt', 'w', encoding='utf-8') as file:
+    for class_name in class_names:
+        file.write(f"{class_name}\n")
+
+# saving
+with open('../tokenization/tokenizer_cnn_negative.pickle', 'wb') as handle:
+    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
diff --git a/neural_network/create_cnn/create_model_cnn_positive.py b/neural_network/create_cnn/create_model_cnn_positive.py
new file mode 100644
index 0000000..2e82ed6
--- /dev/null
+++ b/neural_network/create_cnn/create_model_cnn_positive.py
@@ -0,0 +1,150 @@
+import pandas as pd
+from keras import Sequential
+from keras.src.callbacks import ModelCheckpoint
+from keras.src.legacy.preprocessing.text import Tokenizer
+from keras.src.saving import load_model
+from keras.src.utils import pad_sequences
+from matplotlib import pyplot as plt
+from tensorflow.keras.layers import Dense, Embedding, GlobalMaxPooling1D, Conv1D
+from sklearn.preprocessing import LabelEncoder
+from tensorflow.keras import utils
+from nltk import word_tokenize, SnowballStemmer
+from nltk.corpus import stopwords
+import string
+import pickle
+
+from pymystem3 import Mystem
+
+# Максимальное количество слов
+num_words = 10000
+# Максимальная длина отзыва
+max_reviews_len = 90
+# Количество классов отзыва
+nb_classes = 10
+
+def remove_stopwords_and_punctuation(reviews):
+    stop_words = set(stopwords.words('russian'))
+    punctuation = set(string.punctuation)
+    filtered_tokens = []
+
+    # Удаление стоп слов и пунктуаций
+    for review in reviews:
+        words = word_tokenize(review)
+        filtered_words = [word for word in words if
+                          word.lower() not in stop_words and word != '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation]
+        filtered_tokens.extend(filtered_words)
+
+    return filtered_tokens
+
+def lemmatize_token(filtered_tokens):
+    mystem = Mystem()
+    lemmatized_tokens = []
+    for token in filtered_tokens:
+        lemmatized = mystem.lemmatize(token)[0]
+        lemmatized_tokens.append(lemmatized)
+        print(lemmatized)
+    return lemmatized_tokens
+
+def stemmer_token(filtered_tokens):
+    stemmer = SnowballStemmer("russian")
+    stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
+    return stemmer_tokens
+
+# Данные
+train = pd.read_csv('..//dataset/filtered/filtered_dataset_positive.csv')
+train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
+reviews = train['text']
+print("Набор данных готов")
+
+filtered_tokens = remove_stopwords_and_punctuation(reviews)
+print(filtered_tokens[:10])
+
+# lemmatized_tokens = lemmatize_token(filtered_tokens)
+# print(lemmatized_tokens[:10])
+
+# stemmer_tokens = stemmer_token(filtered_tokens)
+# print(stemmer_tokens[:10])
+
+label_encoder = LabelEncoder()
+train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
+
+# Сохраняем названия классов
+class_names = label_encoder.classes_
+
+y_train = utils.to_categorical(train['rubrics'], nb_classes)
+
+tokenizer = Tokenizer(num_words=num_words)
+tokenizer.fit_on_texts(filtered_tokens)
+
+sequences = tokenizer.texts_to_sequences(reviews)
+
+x_train = pad_sequences(sequences, maxlen=max_reviews_len)
+
+print('начинается создание модели')
+
+# Построение модели
+model_cnn = Sequential()
+model_cnn.add(Embedding(num_words, 128, input_length=max_reviews_len))
+model_cnn.add(Conv1D(128, 5, activation='relu'))
+model_cnn.add(GlobalMaxPooling1D())
+model_cnn.add(Dense(64, activation='relu'))
+model_cnn.add(Dense(nb_classes, activation='softmax'))
+
+
+model_cnn.compile(optimizer='adam',
+                  loss='categorical_crossentropy',
+                  metrics=['accuracy'])
+
+model_cnn.summary()
+
+# Определим обратный вызов ModelCheckpoint
+model_cnn_save_path = './/model/best_model_cnn_positive.keras'
+checkpoint_callback_cnn = ModelCheckpoint(model_cnn_save_path,
+                                          monitor='val_accuracy',
+                                          save_best_only=True,
+                                          verbose=1)
+
+# Обучение модели
+history_cnn = model_cnn.fit(x_train,
+                            y_train,
+                            epochs=5,
+                            batch_size=128,
+                            validation_split=0.1,
+                            callbacks=[checkpoint_callback_cnn])
+
+# Графики
+plt.plot(history_cnn.history['accuracy'],
+         label='Доля верных ответов на обучающем наборе')
+plt.plot(history_cnn.history['val_accuracy'],
+         label='Доля верных ответов на проверочном наборе')
+plt.xlabel('Эпоха обучения')
+plt.ylabel('Доля верных ответов')
+plt.legend()
+plt.show()
+
+# Загрузка модели
+model_cnn = load_model('.//model/best_model_cnn_positive.keras')
+
+# Пример текста отзыва пользователя
+user_review = "Ой я так люблю вкусно кушать! Я бы хотел сьесть все самое вкусное в этом мире! Пончики, пиццу, вообще все все все!"
+
+# Подготовка отзыва пользователя
+filtered_tokens = remove_stopwords_and_punctuation([user_review])
+sequences = tokenizer.texts_to_sequences(filtered_tokens)
+x_user = pad_sequences(sequences, maxlen=max_reviews_len)
+
+# Получение вероятности принадлежности отзыва пользователя к разным классам
+predicted_probabilities = model_cnn.predict(x_user)
+
+# Вывод вероятностей с названиями классов
+for class_name, prob in zip(class_names, predicted_probabilities[0]):
+    print(f"Вероятность отзыва относится к классу '{class_name}': {prob}")
+
+# Сохраняем названия классов в текстовый файл
+with open('.//class/class_names_cnn_positive.txt', 'w', encoding='utf-8') as file:
+    for class_name in class_names:
+        file.write(f"{class_name}\n")
+
+# saving
+with open('../tokenization/tokenizer_cnn_positive.pickle', 'wb') as handle:
+    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
\ No newline at end of file
diff --git a/neural_network/create_lstm/class/.gitkeep b/neural_network/create_cnn/model/.gitkeep
similarity index 100%
rename from neural_network/create_lstm/class/.gitkeep
rename to neural_network/create_cnn/model/.gitkeep
diff --git a/neural_network/create_gru/create_model_gru_negative.py b/neural_network/create_gru/create_model_gru_negative.py
new file mode 100644
index 0000000..6219e63
--- /dev/null
+++ b/neural_network/create_gru/create_model_gru_negative.py
@@ -0,0 +1,149 @@
+import pickle
+
+import pandas as pd
+from keras import Sequential
+from keras.src.callbacks import ModelCheckpoint
+from keras.src.legacy.preprocessing.text import Tokenizer
+from keras.src.saving import load_model
+from keras.src.utils import pad_sequences
+from matplotlib import pyplot as plt
+from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM, SpatialDropout1D
+from sklearn.preprocessing import LabelEncoder
+from tensorflow.keras import utils
+from nltk import word_tokenize, SnowballStemmer
+from nltk.corpus import stopwords
+import string
+
+from pymystem3 import Mystem
+
+# Максимальное количество слов
+num_words = 10000
+# Максимальная длина отзыва
+max_reviews_len = 90
+# Количество классов отзыва
+nb_classes = 10
+
+def remove_stopwords_and_punctuation(reviews):
+    stop_words = set(stopwords.words('russian'))
+    punctuation = set(string.punctuation)
+    filtered_tokens = []
+
+    # Удаление стоп слов и пунктуаций
+    for review in reviews:
+        words = word_tokenize(review)
+        filtered_words = [word for word in words if
+                          word.lower() not in stop_words and word != '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation]
+        filtered_tokens.extend(filtered_words)
+
+    return filtered_tokens
+
+def lemmatize_token(filtered_tokens):
+    mystem = Mystem()
+    lemmatized_tokens = []
+    for token in filtered_tokens:
+        lemmatized = mystem.lemmatize(token)[0]
+        lemmatized_tokens.append(lemmatized)
+        print(lemmatized)
+    return lemmatized_tokens
+
+def stemmer_token(filtered_tokens):
+    stemmer = SnowballStemmer("russian")
+    stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
+    return stemmer_tokens
+
+# Данные
+train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
+train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
+reviews = train['text']
+print("Набор данных готов")
+
+filtered_tokens = remove_stopwords_and_punctuation(reviews)
+print(filtered_tokens[:10])
+
+# lemmatized_tokens = lemmatize_token(filtered_tokens)
+# print(lemmatized_tokens[:10])
+
+# stemmer_tokens = stemmer_token(filtered_tokens)
+# print(stemmer_tokens[:10])
+
+label_encoder = LabelEncoder()
+train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
+
+# Сохраняем названия классов
+class_names = label_encoder.classes_
+
+y_train = utils.to_categorical(train['rubrics'], nb_classes)
+
+tokenizer = Tokenizer(num_words=num_words)
+tokenizer.fit_on_texts(filtered_tokens)
+
+sequences = tokenizer.texts_to_sequences(reviews)
+
+x_train = pad_sequences(sequences, maxlen=max_reviews_len)
+
+print('начинается создание модели')
+
+# Построение модели
+model_lstm = Sequential()
+model_lstm.add(Embedding(num_words, 128, input_length=max_reviews_len))
+model_lstm.add(SpatialDropout1D(0.2))
+model_lstm.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
+model_lstm.add(Dense(nb_classes, activation='softmax'))
+
+model_lstm.compile(optimizer='adam',
+                   loss='categorical_crossentropy',
+                   metrics=['accuracy'])
+
+model_lstm.summary()
+
+# Определим обратный вызов ModelCheckpoint
+model_lstm_save_path = './/model/best_model_lstm_negative.keras'
+checkpoint_callback_gru = ModelCheckpoint(model_lstm_save_path,
+                                          monitor='val_accuracy',
+                                          save_best_only=True,
+                                          verbose=1)
+
+# Обучение модели
+history_lstm = model_lstm.fit(x_train,
+                              y_train,
+                              epochs=10,
+                              batch_size=128,
+                              validation_split=0.1,
+                              callbacks=[checkpoint_callback_gru])
+
+# Графики
+plt.plot(history_lstm.history['accuracy'],
+         label='Доля верных ответов на обучающем наборе')
+plt.plot(history_lstm.history['val_accuracy'],
+         label='Доля верных ответов на проверочном наборе')
+plt.xlabel('Эпоха обучения')
+plt.ylabel('Доля верных ответов')
+plt.legend()
+plt.show()
+
+# Загрузка модели
+model_lstm = load_model('.//model/best_model_lstm_negative.keras')
+
+# Пример текста отзыва пользователя
+user_review = "Не люблю пьяных людей, когда они рядом, всегда будет что то плохое"
+
+# Подготовка отзыва пользователя
+filtered_tokens = remove_stopwords_and_punctuation([user_review])
+sequences = tokenizer.texts_to_sequences(filtered_tokens)
+x_user = pad_sequences(sequences, maxlen=max_reviews_len)
+
+# Получение вероятности принадлежности отзыва пользователя к разным классам
+predicted_probabilities = model_lstm.predict(x_user)
+
+# Вывод вероятностей с названиями классов
+for class_name, prob in zip(class_names, predicted_probabilities[0]):
+    print(f"Вероятность отзыва относится к классу '{class_name}': {prob}")
+
+# Сохраняем названия классов в текстовый файл
+with open('.//class/class_names_lstm_negative.txt', 'w', encoding='utf-8') as file:
+    for class_name in class_names:
+        file.write(f"{class_name}\n")
+
+# saving
+with open('.//tokenization/tokenizer_lstm_lstm_negative.pickle', 'wb') as handle:
+    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
diff --git a/neural_network/create_gru/create_model_gru_positive.py b/neural_network/create_gru/create_model_gru_positive.py
new file mode 100644
index 0000000..edb008a
--- /dev/null
+++ b/neural_network/create_gru/create_model_gru_positive.py
@@ -0,0 +1,148 @@
+import pandas as pd
+from keras import Sequential
+from keras.src.callbacks import ModelCheckpoint
+from keras.src.legacy.preprocessing.text import Tokenizer
+from keras.src.saving import load_model
+from keras.src.utils import pad_sequences
+from matplotlib import pyplot as plt
+from tensorflow.keras.layers import Dense, Embedding, Dropout, GRU, SpatialDropout1D
+from sklearn.preprocessing import LabelEncoder
+from tensorflow.keras import utils
+from nltk import word_tokenize, SnowballStemmer
+from nltk.corpus import stopwords
+import string
+import pickle
+
+from pymystem3 import Mystem
+
+# Максимальное количество слов
+num_words = 10000
+# Максимальная длина отзыва
+max_reviews_len = 90
+# Количество классов отзыва
+nb_classes = 10
+
+def remove_stopwords_and_punctuation(reviews):
+    stop_words = set(stopwords.words('russian'))
+    punctuation = set(string.punctuation)
+    filtered_tokens = []
+
+    # Удаление стоп слов и пунктуаций
+    for review in reviews:
+        words = word_tokenize(review)
+        filtered_words = [word for word in words if
+                          word.lower() not in stop_words and word != '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation]
+        filtered_tokens.extend(filtered_words)
+
+    return filtered_tokens
+
+def lemmatize_token(filtered_tokens):
+    mystem = Mystem()
+    lemmatized_tokens = []
+    for token in filtered_tokens:
+        lemmatized = mystem.lemmatize(token)[0]
+        lemmatized_tokens.append(lemmatized)
+        print(lemmatized)
+    return lemmatized_tokens
+
+def stemmer_token(filtered_tokens):
+    stemmer = SnowballStemmer("russian")
+    stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
+    return stemmer_tokens
+
+# Данные
+train = pd.read_csv('..//dataset/filtered/filtered_dataset_positive.csv')
+train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
+reviews = train['text']
+print("Набор данных готов")
+
+filtered_tokens = remove_stopwords_and_punctuation(reviews)
+print(filtered_tokens[:10])
+
+# lemmatized_tokens = lemmatize_token(filtered_tokens)
+# print(lemmatized_tokens[:10])
+
+# stemmer_tokens = stemmer_token(filtered_tokens)
+# print(stemmer_tokens[:10])
+
+label_encoder = LabelEncoder()
+train['rubrics'] = label_encoder.fit_transform(train['rubrics'])
+
+# Сохраняем названия классов
+class_names = label_encoder.classes_
+
+y_train = utils.to_categorical(train['rubrics'], nb_classes)
+
+tokenizer = Tokenizer(num_words=num_words)
+tokenizer.fit_on_texts(filtered_tokens)
+
+sequences = tokenizer.texts_to_sequences(reviews)
+
+x_train = pad_sequences(sequences, maxlen=max_reviews_len)
+
+print('начинается создание модели')
+
+# Построение модели
+model_gru = Sequential()
+model_gru.add(Embedding(num_words, 256, input_length=max_reviews_len))
+model_gru.add(SpatialDropout1D(0.2))
+model_gru.add(GRU(128, dropout=0.2, recurrent_dropout=0.2))
+model_gru.add(Dense(nb_classes, activation='softmax'))
+
+model_gru.compile(optimizer='adam',
+                  loss='categorical_crossentropy',
+                  metrics=['accuracy'])
+
+model_gru.summary()
+
+# Определим обратный вызов ModelCheckpoint
+model_gru_save_path = './/model/best_model_gru_positive.keras'
+checkpoint_callback_gru = ModelCheckpoint(model_gru_save_path,
+                                          monitor='val_accuracy',
+                                          save_best_only=True,
+                                          verbose=1)
+
+# Обучение модели
+history_gru = model_gru.fit(x_train,
+                            y_train,
+                            epochs=5,
+                            batch_size=128,
+                            validation_split=0.1,
+                            callbacks=[checkpoint_callback_gru])
+
+# Графики
+plt.plot(history_gru.history['accuracy'],
+         label='Доля верных ответов на обучающем наборе')
+plt.plot(history_gru.history['val_accuracy'],
+         label='Доля верных ответов на проверочном наборе')
+plt.xlabel('Эпоха обучения')
+plt.ylabel('Доля верных ответов')
+plt.legend()
+plt.show()
+
+# Загрузка модели
+model_gru = load_model('.//model/best_model_gru_positive.keras')
+
+# Пример текста отзыва пользователя
+user_review = "Ой я так люблю вкусно кушать! Я бы хотел сьесть все самое вкусное в этом мире! Пончики, пиццу, вообще все все все!"
+
+# Подготовка отзыва пользователя
+filtered_tokens = remove_stopwords_and_punctuation([user_review])
+sequences = tokenizer.texts_to_sequences(filtered_tokens)
+x_user = pad_sequences(sequences, maxlen=max_reviews_len)
+
+# Получение вероятности принадлежности отзыва пользователя к разным классам
+predicted_probabilities = model_gru.predict(x_user)
+
+# Вывод вероятностей с названиями классов
+for class_name, prob in zip(class_names, predicted_probabilities[0]):
+    print(f"Вероятность отзыва относится к классу '{class_name}': {prob}")
+
+# Сохраняем названия классов в текстовый файл
+with open('.//class/class_names_gru_positive.txt', 'w', encoding='utf-8') as file:
+    for class_name in class_names:
+        file.write(f"{class_name}\n")
+
+# saving
+with open('.//tokenization/tokenizer_gru_positive.pickle', 'wb') as handle:
+    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
\ No newline at end of file
diff --git a/neural_network/create_lstm/tokenizer/.gitkeep b/neural_network/create_gru/model/.gitkeep
similarity index 100%
rename from neural_network/create_lstm/tokenizer/.gitkeep
rename to neural_network/create_gru/model/.gitkeep
diff --git a/neural_network/create_lstm/create_model_lstm_negative.py b/neural_network/create_lstm/create_model_lstm_negative.py
index 73fada1..d3e7a49 100644
--- a/neural_network/create_lstm/create_model_lstm_negative.py
+++ b/neural_network/create_lstm/create_model_lstm_negative.py
@@ -145,5 +145,5 @@ with open('.//class/class_names_lstm_negative.txt', 'w', encoding='utf-8') as fi
         file.write(f"{class_name}\n")
 
 # saving
-with open('.//tokenizer/tokenizer_lstm_lstm_negative.pickle', 'wb') as handle:
+with open('.//tokenization/tokenizer_lstm_negative.pickle', 'wb') as handle:
     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
diff --git a/neural_network/create_lstm/create_model_lstm_positive.py b/neural_network/create_lstm/create_model_lstm_positive.py
index 27049fc..3297c2f 100644
--- a/neural_network/create_lstm/create_model_lstm_positive.py
+++ b/neural_network/create_lstm/create_model_lstm_positive.py
@@ -144,5 +144,5 @@ with open('.//class/class_names_lstm_positive.txt', 'w', encoding='utf-8') as fi
         file.write(f"{class_name}\n")
 
 # saving
-with open('.//tokenizer/tokenizer_lstm_positive.pickle', 'wb') as handle:
+with open('.//tokenization/tokenizer_lstm_positive.pickle', 'wb') as handle:
     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
\ No newline at end of file
diff --git a/neural_network/tokenization/tokenizer_negative.py b/neural_network/tokenization/tokenizer_negative.py
new file mode 100644
index 0000000..92fd2c3
--- /dev/null
+++ b/neural_network/tokenization/tokenizer_negative.py
@@ -0,0 +1,60 @@
+import pandas as pd
+from keras.src.legacy.preprocessing.text import Tokenizer
+from nltk import word_tokenize, SnowballStemmer
+from nltk.corpus import stopwords
+import string
+import pickle
+
+from pymystem3 import Mystem
+
+# Максимальное количество слов
+num_words = 10000
+# Количество классов отзыва
+nb_classes = 10
+
+def remove_stopwords_and_punctuation(reviews):
+    stop_words = set(stopwords.words('russian'))
+    punctuation = set(string.punctuation)
+    filtered_tokens = []
+
+    # Удаление стоп слов и пунктуаций
+    for review in reviews:
+        words = word_tokenize(review)
+        filtered_words = [word for word in words if
+                          word.lower() not in stop_words and word != '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation]
+        filtered_tokens.extend(filtered_words)
+
+    return filtered_tokens
+
+def lemmatize_token(filtered_tokens):
+    mystem = Mystem()
+    lemmatized_tokens = []
+    for token in filtered_tokens:
+        lemmatized = mystem.lemmatize(token)[0]
+        lemmatized_tokens.append(lemmatized)
+        print(lemmatized)
+    return lemmatized_tokens
+
+def stemmer_token(filtered_tokens):
+    stemmer = SnowballStemmer("russian")
+    stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
+    return stemmer_tokens
+
+# Данные
+train = pd.read_csv('..//dataset/filtered/filtered_dataset_negative.csv')
+reviews = train['text']
+
+filtered_tokens = remove_stopwords_and_punctuation(reviews)
+
+# lemmatized_tokens = lemmatize_token(filtered_tokens)
+# print(lemmatized_tokens[:10])
+
+# stemmer_tokens = stemmer_token(filtered_tokens)
+# print(stemmer_tokens[:10])
+
+tokenizer = Tokenizer(num_words=num_words)
+tokenizer.fit_on_texts(filtered_tokens)
+
+# saving
+with open('.//tokenizer_negative.pickle', 'wb') as handle:
+    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
\ No newline at end of file
diff --git a/neural_network/tokenization/tokenizer_positive.py b/neural_network/tokenization/tokenizer_positive.py
new file mode 100644
index 0000000..ae646ec
--- /dev/null
+++ b/neural_network/tokenization/tokenizer_positive.py
@@ -0,0 +1,60 @@
+import pandas as pd
+from keras.src.legacy.preprocessing.text import Tokenizer
+from nltk import word_tokenize, SnowballStemmer
+from nltk.corpus import stopwords
+import string
+import pickle
+
+from pymystem3 import Mystem
+
+# Максимальное количество слов
+num_words = 10000
+# Количество классов отзыва
+nb_classes = 10
+
+def remove_stopwords_and_punctuation(reviews):
+    stop_words = set(stopwords.words('russian'))
+    punctuation = set(string.punctuation)
+    filtered_tokens = []
+
+    # Удаление стоп слов и пунктуаций
+    for review in reviews:
+        words = word_tokenize(review)
+        filtered_words = [word for word in words if
+                          word.lower() not in stop_words and word != '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation]
+        filtered_tokens.extend(filtered_words)
+
+    return filtered_tokens
+
+def lemmatize_token(filtered_tokens):
+    mystem = Mystem()
+    lemmatized_tokens = []
+    for token in filtered_tokens:
+        lemmatized = mystem.lemmatize(token)[0]
+        lemmatized_tokens.append(lemmatized)
+        print(lemmatized)
+    return lemmatized_tokens
+
+def stemmer_token(filtered_tokens):
+    stemmer = SnowballStemmer("russian")
+    stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
+    return stemmer_tokens
+
+# Данные
+train = pd.read_csv('..//dataset/filtered/filtered_dataset_positive.csv')
+reviews = train['text']
+
+filtered_tokens = remove_stopwords_and_punctuation(reviews)
+
+# lemmatized_tokens = lemmatize_token(filtered_tokens)
+# print(lemmatized_tokens[:10])
+
+# stemmer_tokens = stemmer_token(filtered_tokens)
+# print(stemmer_tokens[:10])
+
+tokenizer = Tokenizer(num_words=num_words)
+tokenizer.fit_on_texts(filtered_tokens)
+
+# saving
+with open('.//tokenizer_positive.pickle', 'wb') as handle:
+    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
\ No newline at end of file
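
Usage note (illustrative, not part of the patch): each training script above saves its best checkpoint under a model/ directory and pickles the fitted Tokenizer under a tokenization/ directory. The sketch below shows one way the saved CNN artifacts might be loaded for inference. It assumes the training script was run from inside neural_network/create_cnn (so the checkpoint ends up in neural_network/create_cnn/model/ and the pickle in neural_network/tokenization/), that this snippet runs from the repository root with the NLTK stopword and punkt data installed, and it uses a simplified token filter rather than the exact remove_stopwords_and_punctuation helper.

import pickle
import string

import tensorflow as tf
from keras.src.utils import pad_sequences
from nltk import word_tokenize
from nltk.corpus import stopwords

max_reviews_len = 90  # must match the value used during training

# Assumed locations of the artifacts produced by create_model_cnn_negative.py
model = tf.keras.models.load_model('neural_network/create_cnn/model/best_model_cnn_negative.keras')
with open('neural_network/tokenization/tokenizer_cnn_negative.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

def prepare(text):
    # Simplified stand-in for the training scripts' stop-word/punctuation filter
    stop_words = set(stopwords.words('russian'))
    punctuation = set(string.punctuation)
    tokens = [w for w in word_tokenize(text)
              if w.lower() not in stop_words and w not in punctuation]
    # Convert the filtered review to a padded index sequence of fixed length
    return pad_sequences(tokenizer.texts_to_sequences([' '.join(tokens)]),
                         maxlen=max_reviews_len)

probabilities = model.predict(prepare("Не люблю пьяных людей, когда они рядом"))[0]
print(probabilities)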