IIS_2023_1/arutunyan_dmitry_lab_7/main.py

import sys
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
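
# NOTE: the English stop-word list used below is part of the NLTK data
# package; if it is missing, download it once with:
#   import nltk; nltk.download('stopwords')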

# read the training text (the path is local to the author's machine)
with open("P:\\ULSTU\\ИИС\\Лабораторные\\Lab7\\texts\\text-en.txt", encoding='utf-8') as f:
    text = f.read()


def tokenize_words(raw_text):
    # convert the whole text to lowercase
    raw_text = raw_text.lower()
    # initialise the tokenizer: r'\w+' keeps runs of word characters only
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(raw_text)
    # drop every token that appears in the English stop-word list
    # (built once as a set so each lookup is O(1))
    stop_words = set(stopwords.words('english'))
    filtered = filter(lambda token: token not in stop_words, tokens)
    return " ".join(filtered)

if __name__ == '__main__':
    # preprocess the text: lowercase, tokenize, strip stop words
    processed_inputs = tokenize_words(text)
    # character-level vocabulary: map each distinct character to an integer id
    chars = sorted(list(set(processed_inputs)))
    char_to_num = dict((c, i) for i, c in enumerate(chars))
    input_len = len(processed_inputs)
    vocab_len = len(chars)
    print("Total number of characters:", input_len)
    print("Vocabulary size:", vocab_len)
    seq_length = 100
    x_data = []
    y_data = []
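    # Sliding-window samples: each seq_length-character slice is one input
    # and the character that follows it is the label; e.g. with the text
    # "abcde" and seq_length=3 the pairs are ("abc" -> "d") and ("bcd" -> "e").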
    for i in range(0, input_len - seq_length, 1):
        in_seq = processed_inputs[i:i + seq_length]
        out_seq = processed_inputs[i + seq_length]
        x_data.append([char_to_num[char] for char in in_seq])
        y_data.append(char_to_num[out_seq])
    n_patterns = len(x_data)
    print("Number of patterns:", n_patterns)
    # reshape to (samples, timesteps, features) and scale inputs into [0, 1]
    X = np.reshape(x_data, (n_patterns, seq_length, 1))
    X = X / float(vocab_len)
    # one-hot encode the labels; y gets shape (n_patterns, vocab_len)
    y = to_categorical(y_data)
    # create the distribution strategy; the model must be built and compiled
    # inside its scope, otherwise the strategy has no effect on the variables
    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    with strategy.scope():
        model = Sequential()
        model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
        model.add(Dropout(0.2))
        model.add(LSTM(256, return_sequences=True))
        model.add(Dropout(0.2))
        model.add(LSTM(256))
        model.add(Dropout(0.2))
        model.add(Dense(y.shape[1], activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam')
    # save the weights of the best (lowest-loss) epoch
    filepath = "model_weights_saved.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
    desired_callbacks = [checkpoint]
    # train the model on the devices managed by the strategy
    model.fit(X, y, epochs=100, batch_size=256, callbacks=desired_callbacks)
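    # NOTE (assumption about the intended setup): MultiWorkerMirroredStrategy
    # expects a TF_CONFIG environment variable describing the worker cluster;
    # on a single machine with one or more GPUs, tf.distribute.MirroredStrategy()
    # is the usual choice and needs no extra configuration.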
    # reload the best checkpoint and prepare the reverse id -> char mapping
    model.load_weights(filepath)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    num_to_char = dict((i, c) for i, c in enumerate(chars))
    # seed the generator with a random training pattern
    start = np.random.randint(0, len(x_data) - 1)
    pattern = x_data[start]
    print("Random sample:")
    print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")
    # generate 1000 characters, greedily emitting the most likely next one
    for i in range(1000):
        x = np.reshape(pattern, (1, len(pattern), 1))
        x = x / float(vocab_len)
        prediction = model.predict(x, verbose=0)
        index = np.argmax(prediction)
        result = num_to_char[index]
        sys.stdout.write(result)
        # slide the window: append the prediction, drop the oldest character
        pattern.append(index)
        pattern = pattern[1:]
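
    # A possible refinement (a sketch, not part of the original lab): greedy
    # argmax decoding tends to fall into repeating loops; sampling the next
    # character from the softmax output with a temperature gives more varied
    # text. The helper below is hypothetical and shown for illustration only.
    # def sample_index(preds, temperature=0.8):
    #     preds = np.log(np.asarray(preds, dtype='float64') + 1e-8) / temperature
    #     probs = np.exp(preds) / np.sum(np.exp(preds))
    #     return int(np.random.choice(len(probs), p=probs))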