IIS_2023_1/gordeeva_anna_lab_7/laba7.py

import docx
import streamlit as st
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
def extract_text_from_docx(file_path):
    """Read a .docx file and return its paragraphs joined into a single string."""
    doc = docx.Document(file_path)
    full_text = []
    for para in doc.paragraphs:
        full_text.append(para.text)
    return '\n'.join(full_text)
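# Note: python-docx reads only .docx files; the .doc paths below may need to be
# converted or renamed to .docx for the extraction above to work.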
file_path1 = '"C:/Users/79084/Desktop/textru.doc"'
file_path2 = '"C:/Users/79084/Desktop/texten.doc"'
# Извлечение текста из файла
textru = extract_text_from_docx(file_path1)
texten = extract_text_from_docx(file_path2)
# Text preprocessing: character-level tokenization of both corpora
tokenizer_russian = tf.keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer_russian.fit_on_texts([textru])
tokenized_text_russian = tokenizer_russian.texts_to_sequences([textru])[0]
tokenizer_english = tf.keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer_english.fit_on_texts([texten])
tokenized_text_english = tokenizer_english.texts_to_sequences([texten])[0]
# Build sliding-window training sequences
maxlen = 40
step = 3
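# Each training sample is a window of `maxlen` consecutive character ids taken
# from the tokenized text; the target is the id of the character that follows
# the window. `step` is the stride between the starts of consecutive windows.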
sentences_russian = []
next_chars_russian = []
sentences_english = []
next_chars_english = []
for i in range(0, len(tokenized_text_russian) - maxlen, step):
    sentences_russian.append(tokenized_text_russian[i: i + maxlen])
    next_chars_russian.append(tokenized_text_russian[i + maxlen])
for i in range(0, len(tokenized_text_english) - maxlen, step):
    sentences_english.append(tokenized_text_english[i: i + maxlen])
    next_chars_english.append(tokenized_text_english[i + maxlen])
# Convert the training data to numpy arrays
x_russian = np.array(sentences_russian)
y_russian = np.array(next_chars_russian)
x_english = np.array(sentences_english)
y_english = np.array(next_chars_english)
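# Resulting shapes: x_* is (num_windows, maxlen) with integer character ids,
# y_* is (num_windows,) with the id of the next character for each window.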
# Model for the Russian text
model_russian = Sequential()
model_russian.add(Embedding(len(tokenizer_russian.word_index) + 1, 128))
model_russian.add(LSTM(128))
model_russian.add(Dense(len(tokenizer_russian.word_index) + 1, activation='softmax'))
model_russian.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
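# sparse_categorical_crossentropy expects integer class ids as targets,
# so the y_* arrays can be used directly without one-hot encoding.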
# Train the model on the Russian text
model_russian.fit(x_russian, y_russian, batch_size=128, epochs=50)
# Model for the English text
model_english = Sequential()
model_english.add(Embedding(len(tokenizer_english.word_index) + 1, 128))
model_english.add(LSTM(128))
model_english.add(Dense(len(tokenizer_english.word_index) + 1, activation='softmax'))
model_english.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
# Train the model on the English text
model_english.fit(x_english, y_english, batch_size=128, epochs=50)
# Generate text from a trained model
def generate_text(model, tokenizer, seed_text, maxlen, temperature=1.0, num_chars=400):
    generated_text = seed_text
    for _ in range(num_chars):
        # Encode the current seed as a batch of one sequence for model.predict
        encoded = tokenizer.texts_to_sequences([seed_text])[0]
        encoded = np.array([encoded])
        predicted_probs = model.predict(encoded, verbose=0)[0]
        # Apply temperature for more (or less) diverse predictions
        predicted_probs = np.log(predicted_probs + 1e-8) / temperature
        exp_preds = np.exp(predicted_probs)
        predicted_probs = exp_preds / np.sum(exp_preds)
        predicted = np.random.choice(len(predicted_probs), p=predicted_probs)
        next_char = tokenizer.index_word.get(predicted, '')
        generated_text += next_char
        # Keep at most the last `maxlen` characters as context for the next step
        seed_text = (seed_text + next_char)[-maxlen:]
    return generated_text
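# Lower temperature sharpens the sampling distribution (each probability is
# rescaled proportionally to p**(1/T)), making the output more conservative;
# temperature=1.0 keeps the model's raw probabilities.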
generated_russian_text = generate_text(model_russian, tokenizer_russian, 'Ты к моему', maxlen, temperature=0.5, num_chars=400)
st.write(generated_russian_text)
generated_english_text = generate_text(model_english, tokenizer_english, 'In the', maxlen, temperature=0.5, num_chars=400)
st.write(generated_english_text)