import docx
import numpy as np
import streamlit as st
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding


def extract_text_from_docx(file_path):
    """Read all paragraph text from a .docx file and join it with newlines."""
    doc = docx.Document(file_path)
    full_text = [para.text for para in doc.paragraphs]
    return '\n'.join(full_text)


# Note: python-docx can only open .docx files, not the legacy .doc format.
file_path1 = 'C:/Users/79084/Desktop/textru.doc'
file_path2 = 'C:/Users/79084/Desktop/texten.doc'

# Extract the text from the files
textru = extract_text_from_docx(file_path1)
texten = extract_text_from_docx(file_path2)

# Text preprocessing: character-level tokenizers for each language
tokenizer_russian = tf.keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer_russian.fit_on_texts([textru])
tokenized_text_russian = tokenizer_russian.texts_to_sequences([textru])[0]

tokenizer_english = tf.keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer_english.fit_on_texts([texten])
tokenized_text_english = tokenizer_english.texts_to_sequences([texten])[0]

# Build overlapping training sequences: each sample is maxlen characters,
# and the target is the character that follows it.
maxlen = 40
step = 3
sentences_russian = []
next_chars_russian = []
sentences_english = []
next_chars_english = []

for i in range(0, len(tokenized_text_russian) - maxlen, step):
    sentences_russian.append(tokenized_text_russian[i: i + maxlen])
    next_chars_russian.append(tokenized_text_russian[i + maxlen])

for i in range(0, len(tokenized_text_english) - maxlen, step):
    sentences_english.append(tokenized_text_english[i: i + maxlen])
    next_chars_english.append(tokenized_text_english[i + maxlen])

# Convert the data to numpy arrays
x_russian = np.array(sentences_russian)
y_russian = np.array(next_chars_russian)
x_english = np.array(sentences_english)
y_english = np.array(next_chars_english)

# Build the model for the Russian text
model_russian = Sequential()
model_russian.add(Embedding(len(tokenizer_russian.word_index) + 1, 128))
model_russian.add(LSTM(128))
model_russian.add(Dense(len(tokenizer_russian.word_index) + 1, activation='softmax'))
model_russian.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# Train the model on the Russian text
model_russian.fit(x_russian, y_russian, batch_size=128, epochs=50)

# Build the model for the English text
model_english = Sequential()
model_english.add(Embedding(len(tokenizer_english.word_index) + 1, 128))
model_english.add(LSTM(128))
model_english.add(Dense(len(tokenizer_english.word_index) + 1, activation='softmax'))
model_english.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# Train the model on the English text
model_english.fit(x_english, y_english, batch_size=128, epochs=50)


# Generate text with a trained model
def generate_text(model, tokenizer, seed_text, maxlen, temperature=1.0, num_chars=400):
    generated_text = seed_text
    for _ in range(num_chars):
        # Encode the current context as a batch of one sequence
        encoded = tokenizer.texts_to_sequences([seed_text])[0]
        encoded = np.array([encoded])
        predicted_probs = model.predict(encoded, verbose=0)[0]
        # Apply temperature to make the predictions more or less varied
        predicted_probs = np.log(predicted_probs + 1e-8) / temperature
        exp_preds = np.exp(predicted_probs)
        predicted_probs = exp_preds / np.sum(exp_preds)
        predicted = np.random.choice(len(predicted_probs), p=predicted_probs)
        next_char = tokenizer.index_word.get(predicted, '')
        generated_text += next_char
        # Keep at most the last maxlen characters as context for the next step
        seed_text = (seed_text + next_char)[-maxlen:]
    return generated_text


generated_russian_text = generate_text(model_russian, tokenizer_russian, 'Ты к моему',
                                       maxlen, temperature=0.5, num_chars=400)
st.write(generated_russian_text)

generated_english_text = generate_text(model_english, tokenizer_english, 'In the',
                                       maxlen, temperature=0.5, num_chars=400)
st.write(generated_english_text)
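
# The sketch below is illustrative only and is not part of the app: it is a
# minimal, self-contained demo of the temperature scaling that generate_text()
# applies to the model's probabilities before sampling. The function name and
# the toy distribution are hypothetical. Low temperatures sharpen the
# distribution (more predictable text), high temperatures flatten it.
def temperature_demo(probs, temperature):
    # Re-weight a probability vector the same way generate_text() does.
    probs = np.asarray(probs, dtype=np.float64)
    scaled = np.exp(np.log(probs + 1e-8) / temperature)
    return scaled / np.sum(scaled)

# Example (uncomment to try):
# print(temperature_demo([0.7, 0.2, 0.1], 0.5))  # sharper, roughly [0.91, 0.07, 0.02]
# print(temperature_demo([0.7, 0.2, 0.1], 2.0))  # flatter, roughly [0.52, 0.28, 0.20]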