import numpy as np
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


def prepare_and_train_model(file_path, epochs):
    # Read the training text from the file
    with open(file_path, encoding='utf-8') as f:
        data = f.read()

    # Create the tokenizer and build the vocabulary
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([data])

    # Convert the text into a sequence of word indices
    sequences = tokenizer.texts_to_sequences([data])

    # Build the training data: every n-gram prefix of the text
    input_sequences = []
    for sequence in sequences:
        for i in range(1, len(sequence)):
            n_gram_sequence = sequence[:i + 1]
            input_sequences.append(n_gram_sequence)
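    # For example, the index sequence [3, 7, 12, 5] yields the prefixes
    # [3, 7], [3, 7, 12] and [3, 7, 12, 5]; the last word of each prefix
    # later becomes the prediction target.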

    # Pad all sequences to the same length (zeros are prepended)
    max_sequence_len = max(len(sequence) for sequence in input_sequences)
    input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

    # Split into inputs (all words but the last) and targets (the last word)
    x, y = input_sequences[:, :-1], input_sequences[:, -1]
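    # e.g. a padded row [0, 0, 3, 7, 12] becomes x = [0, 0, 3, 7] and y = 12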

    # Define the recurrent neural network
    model = keras.Sequential([
        keras.layers.Embedding(len(tokenizer.word_index) + 1, 100, input_length=max_sequence_len - 1),
        keras.layers.Dropout(0.2),
        keras.layers.LSTM(150),
        keras.layers.Dense(len(tokenizer.word_index) + 1, activation='softmax')
    ])
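    # word_index is 1-based, so the embedding and the softmax layer need
    # len(word_index) + 1 entries to cover every word id plus the padding 0.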

    # Compile and train the model
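    # (sparse_categorical_crossentropy is used because y holds integer word
    # indices rather than one-hot vectors)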
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(x, y, epochs=epochs, verbose=1)

    return model, tokenizer, max_sequence_len


def generate_text_from_model(model, tokenizer, max_sequence_len, seed_text, next_words):
    # Generate the text word by word
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted = model.predict(token_list)
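        # Greedy decoding: pick the single most probable next word each step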
        predict_index = np.argmax(predicted, axis=-1)
        word = tokenizer.index_word.get(predict_index[0], '')
        seed_text += " " + word

    return seed_text


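# Train one model per corpus and generate sample text; russian.txt and
# english.txt are assumed to be plain-text training files in the working
# directory.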
model_rus, tokenizer_rus, max_sequence_len_rus = prepare_and_train_model('russian.txt', 150)
rus_text_generated = generate_text_from_model(model_rus, tokenizer_rus, max_sequence_len_rus, "В", 55)

model_eng, tokenizer_eng, max_sequence_len_eng = prepare_and_train_model('english.txt', 150)
eng_text_generated = generate_text_from_model(model_eng, tokenizer_eng, max_sequence_len_eng, "In the", 69)
with open('russian_generated.txt', 'w', encoding='utf-8') as f_rus:
    f_rus.write(rus_text_generated)

with open('english_generated.txt', 'w', encoding='utf-8') as f_eng:
    f_eng.write(eng_text_generated)