import sys
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

file = open("P:\\ULSTU\\ИИС\\Лабораторные\\Lab7\\texts\\text-en.txt", encoding='utf-8').read()


def tokenize_words(input):
    # convert the whole text to lowercase
    input = input.lower()

    # initialize the tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)

    # filter out all stop words found in the English stop-word list
    filtered = filter(lambda token: token not in stopwords.words('english'), tokens)
    return " ".join(filtered)


if __name__ == '__main__':
    # preprocess the text and create tokens
    processed_inputs = tokenize_words(file)

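    # build the character vocabulary and map each character to an integer index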
    chars = sorted(list(set(processed_inputs)))
    char_to_num = dict((c, i) for i, c in enumerate(chars))

    input_len = len(processed_inputs)
    vocab_len = len(chars)
    print("Total number of characters:", input_len)
    print("Vocabulary size:", vocab_len)

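    # slide a window over the text: each sample is seq_length characters,
    # and the label is the character that immediately follows the window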
    seq_length = 100
    x_data = []
    y_data = []
    for i in range(0, input_len - seq_length, 1):
        in_seq = processed_inputs[i:i + seq_length]
        out_seq = processed_inputs[i + seq_length]
        x_data.append([char_to_num[char] for char in in_seq])
        y_data.append(char_to_num[out_seq])

    n_patterns = len(x_data)
    print("Number of patterns:", n_patterns)

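    # reshape to [samples, time steps, features], scale inputs to [0, 1],
    # and one-hot encode the output characters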
    X = np.reshape(x_data, (n_patterns, seq_length, 1))
    X = X / float(vocab_len)
    y = np_utils.to_categorical(y_data)

    # create the distributed strategy (runs as a single worker unless TF_CONFIG is set)
    strategy = tf.distribute.MultiWorkerMirroredStrategy()

    # the model must be built and compiled inside the strategy scope,
    # otherwise its variables are not distributed across the devices
    with strategy.scope():
        model = Sequential()
        model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
        model.add(Dropout(0.2))
        model.add(LSTM(256, return_sequences=True))
        model.add(Dropout(0.2))
        model.add(LSTM(256))
        model.add(Dropout(0.2))
        model.add(Dense(y.shape[1], activation='softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='adam')

    filepath = "model_weights_saved.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
    desired_callbacks = [checkpoint]

    # train the model on the GPU and CPU
    model.fit(X, y, epochs=100, batch_size=256, callbacks=desired_callbacks)

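    # reload the best weights saved by the checkpoint callback and recompile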
    model.load_weights(filepath)
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    num_to_char = dict((i, c) for i, c in enumerate(chars))

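    # pick a random seed sequence from the training data to start generation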
    start = np.random.randint(0, len(x_data) - 1)
    pattern = x_data[start]
    print("Random sample:")
    print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

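    # generate 1000 characters: predict the next character, emit it,
    # then slide the window forward by one position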
    for i in range(1000):
        x = np.reshape(pattern, (1, len(pattern), 1))
        x = x / float(vocab_len)
        prediction = model.predict(x, verbose=0)
        index = np.argmax(prediction)
        result = num_to_char[index]
        sys.stdout.write(result)
        pattern.append(index)
        pattern = pattern[1:len(pattern)]