59 lines
2.4 KiB
Python
59 lines
2.4 KiB
Python
|
import tensorflow as tf
|
||
|
import numpy as np
|
||
|
from keras.models import Sequential
|
||
|
from keras.layers import LSTM, Dense, Embedding
|
||
|
from keras.preprocessing.text import Tokenizer
|
||
|
from keras.preprocessing.sequence import pad_sequences
|
||
|
|
||
|
# Load and preprocess the Russian-language training corpus.
with open("rus.txt", "r", encoding="utf-8") as f:
    rus_text = f.read()

tokenizer_rus = Tokenizer()
tokenizer_rus.fit_on_texts([rus_text])

# Keras Tokenizer word indices start at 1 (0 is left for padding), so the
# number of output classes is one more than the number of known words.
rus_vocab_size = len(tokenizer_rus.word_index) + 1
rus_sequences = tokenizer_rus.texts_to_sequences([rus_text])[0]

# At least two tokens are needed to form a single (context, next word) pair.
if len(rus_sequences) < 2:
    raise ValueError("rus.txt must contain at least two words to build training pairs")

# Upper bound on how many preceding words are fed to the model as context.
# Using every full prefix of the corpus would make the padded input matrix
# N x N for a corpus of N words, i.e. memory quadratic in the text length.
# For corpora shorter than this cap the result equals using full prefixes.
RUS_MAX_CONTEXT_LEN = 100
rus_max_sequence_len = min(len(rus_sequences) - 1, RUS_MAX_CONTEXT_LEN)

# Sample i: the (at most rus_max_sequence_len) words preceding position i
# are the input, and the word at position i is the target.
rus_input_sequences = [
    rus_sequences[max(0, i - rus_max_sequence_len):i]
    for i in range(1, len(rus_sequences))
]
rus_output_sequences = rus_sequences[1:]

# Left-pad shorter contexts with zeros (pad_sequences default) to a fixed width.
rus_input_sequences = pad_sequences(rus_input_sequences, maxlen=rus_max_sequence_len)

x_rus_train = rus_input_sequences
# One-hot targets, as required by the categorical cross-entropy loss.
y_rus_train = tf.keras.utils.to_categorical(rus_output_sequences, num_classes=rus_vocab_size)
|
||
|
|
||
|
# Build the Russian-language model: word embedding -> LSTM -> softmax over
# the vocabulary (one class per word index, including the padding index 0).
rus_model = Sequential([
    Embedding(rus_vocab_size, 256, input_length=rus_max_sequence_len),
    LSTM(512),
    Dense(rus_vocab_size, activation='softmax'),
])
rus_model.compile(optimizer='adam', loss='categorical_crossentropy')

# Train the Russian-language model; the returned history keeps per-epoch loss.
rus_history = rus_model.fit(
    x_rus_train,
    y_rus_train,
    epochs=200,
    batch_size=128,
)
|
||
|
|
||
|
def generate_text(model, tokenizer, max_sequence_len, seed_text, num_words=100):
    """Greedily extend ``seed_text`` one predicted word at a time.

    Args:
        model: Trained model whose ``predict`` returns, for a batch of padded
            index sequences, a probability distribution over word indices.
        tokenizer: Fitted Keras ``Tokenizer`` providing ``texts_to_sequences``
            and the ``index_word`` mapping.
        max_sequence_len: Input width the model was trained with; the text is
            left-padded / left-truncated to this many tokens.
        seed_text: Initial text to continue.
        num_words: Maximum number of words to append (default 100).

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces.
        Fewer than ``num_words`` words are appended if the model predicts an
        index that has no word (e.g. the padding index 0).
    """
    output_text = seed_text
    for _ in range(num_words):
        encoded_text = tokenizer.texts_to_sequences([output_text])[0]
        # Keep only the most recent max_sequence_len tokens as context.
        pad_encoded = pad_sequences([encoded_text], maxlen=max_sequence_len, truncating='pre')
        # verbose=0: avoid printing a progress bar for every generated word.
        probabilities = model.predict(pad_encoded, verbose=0)
        pred_word_index = int(np.argmax(probabilities, axis=-1)[0])
        # Index 0 (padding) is not present in index_word; a plain lookup
        # would raise KeyError, so stop generating instead.
        pred_word = tokenizer.index_word.get(pred_word_index)
        if pred_word is None:
            break
        output_text += " " + pred_word
    return output_text
|
||
|
|
||
|
# Generate a continuation of the seed phrase with the Russian-language model.
rus_output_text = generate_text(
    model=rus_model,
    tokenizer=tokenizer_rus,
    max_sequence_len=rus_max_sequence_len,
    seed_text="Помню просторный",
)

# Report the results: loss of the last training epoch and the generated text.
print("Русская модель:")
print(
    "Потери на тренировочных данных:",
    rus_history.history['loss'][-1],
)
print("Сгенерированный текст:")
print(rus_output_text)
|