import numpy as np
import tensorflow as tf
from keras.layers import LSTM, Dense, Embedding
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer


def _load_next_word_corpus(path):
    """Read a UTF-8 text file and turn it into next-word training data.

    The whole file is tokenized as a single document. Every proper prefix of
    the resulting token sequence becomes one training input, and the token
    that follows the prefix becomes its target.

    NOTE: all prefixes are padded to the length of the longest one, so a
    corpus of n tokens yields an (n - 1) x (n - 1) input matrix. Memory use
    therefore grows quadratically with the corpus size.

    Args:
        path: Path of the text file to read.

    Returns:
        A tuple ``(text, tokenizer, sequence, padded_inputs, targets)``:
        the raw text, the ``Tokenizer`` fitted on it, the text as a list of
        word indices, the padded prefix matrix produced by ``pad_sequences``,
        and the list of target word indices (one per prefix).

    Raises:
        ValueError: If the corpus has fewer than two tokens, in which case
            no (prefix, next word) pair can be formed.
    """
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([text])
    sequence = tokenizer.texts_to_sequences([text])[0]

    if len(sequence) < 2:
        raise ValueError(
            f"{path!r} must contain at least two tokens to build training "
            f"pairs, got {len(sequence)}"
        )

    # Prefix sequence[:i] is used to predict token sequence[i].
    prefixes = [sequence[:i] for i in range(1, len(sequence))]
    targets = sequence[1:]

    # The longest prefix is everything except the last token.
    padded_inputs = pad_sequences(prefixes, maxlen=len(sequence) - 1)
    return text, tokenizer, sequence, padded_inputs, targets


# Load and preprocess the Russian-language data.
(
    rus_text,
    tokenizer_rus,
    rus_sequences,
    rus_input_sequences,
    rus_output_sequences,
) = _load_next_word_corpus("rus.txt")
# One extra slot on top of the number of known words, for index 0.
rus_vocab_size = len(tokenizer_rus.word_index) + 1
rus_max_sequence_len = rus_input_sequences.shape[1]
x_rus_train = rus_input_sequences
y_rus_train = tf.keras.utils.to_categorical(rus_output_sequences, num_classes=rus_vocab_size)

# Load and preprocess the English-language data.
(
    eng_text,
    tokenizer_eng,
    eng_sequences,
    eng_input_sequences,
    eng_output_sequences,
) = _load_next_word_corpus("eng.txt")
# One extra slot on top of the number of known words, for index 0.
eng_vocab_size = len(tokenizer_eng.word_index) + 1
eng_max_sequence_len = eng_input_sequences.shape[1]
x_eng_train = eng_input_sequences
y_eng_train = tf.keras.utils.to_categorical(eng_output_sequences, num_classes=eng_vocab_size)

# Build the model for the Russian language:
# embedding -> LSTM -> softmax over the vocabulary.
rus_model = Sequential()
rus_model.add(Embedding(rus_vocab_size, 256, input_length=rus_max_sequence_len))
rus_model.add(LSTM(512))
rus_model.add(Dense(rus_vocab_size, activation='softmax'))
rus_model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train the model for the Russian language.
rus_history = rus_model.fit(x_rus_train, y_rus_train, batch_size=128, epochs=200)

# Build the model for the English language:
# embedding -> LSTM -> softmax over the vocabulary.
eng_model = Sequential()
eng_model.add(Embedding(eng_vocab_size, 256, input_length=eng_max_sequence_len))
eng_model.add(LSTM(512))
eng_model.add(Dense(eng_vocab_size, activation='softmax'))
eng_model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train the model for the English language.
eng_history = eng_model.fit(x_eng_train, y_eng_train, batch_size=128, epochs=200)


def generate_text(model, tokenizer, max_sequence_len, seed_text, num_words=100):
    """Greedily extend ``seed_text`` one predicted word at a time.

    On each step the text generated so far is tokenized, padded or truncated
    to ``max_sequence_len`` and fed to ``model``. The arg-max of the
    prediction is mapped back to a word and appended to the text.

    Args:
        model: Object whose ``predict`` returns per-word scores for a batch
            of padded index sequences.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        max_sequence_len: Input length the model expects.
        seed_text: Text the generation starts from.
        num_words: Maximum number of words to append (default 100).

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces.
    """
    output_text = seed_text
    for _ in range(num_words):
        encoded_text = tokenizer.texts_to_sequences([output_text])[0]
        # truncating='pre' keeps the most recent tokens when the text is
        # longer than the model input.
        pad_encoded = pad_sequences([encoded_text], maxlen=max_sequence_len, truncating='pre')
        pred_word_index = int(np.argmax(model.predict(pad_encoded), axis=-1)[0])
        pred_word = tokenizer.index_word.get(pred_word_index)
        if pred_word is None:
            # The predicted index (e.g. the padding index 0) has no word.
            # Nothing can be appended, so the next input would be the same;
            # stop instead of raising KeyError.
            break
        output_text += " " + pred_word
    return output_text


# Generate text with the Russian and English models.
rus_output_text = generate_text(rus_model, tokenizer_rus, rus_max_sequence_len, "Помню просторный")
eng_output_text = generate_text(eng_model, tokenizer_eng, eng_max_sequence_len, "The old man")

# Print the results.
print("Русская модель:")
print("Потери на тренировочных данных:", rus_history.history['loss'][-1])
print("Сгенерированный текст:")
print(rus_output_text)
print("Английская модель:")
print("Потери на тренировочных данных:", eng_history.history['loss'][-1])
print("Сгенерированный текст:")
print(eng_output_text)