# app.py — Flask app serving word-level LSTM text generation
# for Russian and English corpora (both models are trained at startup).
from flask import Flask, render_template, request
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
# Flask application instance; the routes below are registered on it.
app = Flask(__name__)
def load_text_data(file_path):
    """Read a UTF-8 text file and return its contents lower-cased."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read().lower()
def create_sequences(text, sequence_length=100):
    """Return every run of sequence_length + 1 consecutive items from text.

    Each window holds sequence_length model inputs followed by the one
    item to predict.  Works on any sliceable sequence (string or list of
    token ids).  Returns [] when text is not longer than sequence_length.
    """
    window = sequence_length + 1
    return [text[start:start + window]
            for start in range(len(text) - sequence_length)]
def prepare_data(sequences):
    """Split token windows into model inputs and targets.

    For every sequence longer than one item, all but the last element
    become an input row and the last element becomes its target.

    Returns:
        (x, y): numpy arrays of input rows and target items.
    """
    usable = [seq for seq in sequences if len(seq) > 1]
    inputs = [seq[:-1] for seq in usable]
    targets = [seq[-1] for seq in usable]
    return np.array(inputs), np.array(targets)
def build_model(total_words):
    """Build and compile the next-word LSTM classifier.

    Args:
        total_words: vocabulary size (tokenizer word count + 1), used for
            both the embedding input dimension and the softmax output.

    Returns:
        A compiled keras Sequential model expecting 100-token-id inputs
        and producing a probability distribution over the vocabulary.
    """
    model = Sequential([
        Embedding(total_words, 50, input_length=100, trainable=True),
        LSTM(100, return_sequences=True),
        LSTM(100),
        Dense(100, activation='relu'),
        Dense(total_words, activation='softmax'),
    ])
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model
def generate_text(seed_text, model, tokenizer, max_sequence_len=100, temperature=1.0):
    """Generate up to max_sequence_len words continuing seed_text.

    Args:
        seed_text: starting prompt; generated words are appended to it.
        model: trained model mapping a (1, 100) array of token ids to a
            probability distribution over the vocabulary.
        tokenizer: fitted keras-style Tokenizer used both to encode the
            prompt (texts_to_sequences) and decode ids (index_word).
        max_sequence_len: number of words to generate; the model input
            width is fixed at 100 tokens regardless of this value.
        temperature: sampling temperature. 0 picks the most probable word
            deterministically; 1.0 samples from the raw distribution;
            other values rescale log-probabilities by 1/temperature.

    Returns:
        seed_text with the generated words appended.
    """
    for _ in range(max_sequence_len):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        # Keep the first 100 tokens and left-pad with zeros so the input
        # always has the fixed width the model was trained on (same as
        # pad_sequences(..., maxlen=100, padding='pre', truncating='post')).
        tokens = list(token_list)[:100]
        padded = np.array([[0] * (100 - len(tokens)) + tokens])
        predicted_probs = model.predict(padded, verbose=0)[0]
        predicted_probs = np.asarray(predicted_probs).astype('float64')
        if temperature != 0:
            # BUG FIX: the original accepted `temperature` but never used
            # it.  Rescale log-probabilities before sampling; at 1.0 this
            # reduces to the original (unscaled) distribution.
            with np.errstate(divide='ignore'):
                logits = np.log(predicted_probs) / temperature
            predicted_probs = np.exp(logits - np.max(logits))
            predicted_probs /= np.sum(predicted_probs)
            predicted_id = int(np.argmax(np.random.multinomial(1, predicted_probs, 1)))
        else:
            predicted_id = int(np.argmax(predicted_probs))
        # BUG FIX: id 0 is the padding id and is absent from index_word;
        # skip unknown ids instead of raising KeyError as the original did.
        output_word = tokenizer.index_word.get(predicted_id, '')
        if output_word:
            seed_text += " " + output_word
    return seed_text
def _build_language_model(file_path):
    """Load a corpus, fit a tokenizer, and train a next-word model on it.

    Args:
        file_path: path to a UTF-8 plain-text corpus file.

    Returns:
        (tokenizer, total_words, model): the fitted Tokenizer, the
        vocabulary size, and the trained keras model.
    """
    text = load_text_data(file_path)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([text])
    total_words = len(tokenizer.word_index) + 1
    # Sliding 101-token windows: first 100 tokens are the input, the
    # final token is the prediction target.
    sequences = create_sequences(tokenizer.texts_to_sequences([text])[0])
    x, y = prepare_data(sequences)
    model = build_model(total_words)
    model.fit(x, y, epochs=5, batch_size=32)
    return tokenizer, total_words, model


# Train one model per supported language at import time (blocking; the
# original duplicated this whole pipeline verbatim for each language).
tokenizer_russian, total_words_russian, model_russian = _build_language_model('russian_text.txt')
tokenizer_english, total_words_english, model_english = _build_language_model('english_text.txt')
@app.route('/')
def index():
    """Serve the text-generation input form."""
    return render_template('index.html')
@app.route('/generate', methods=['POST'])
def generate():
    """Generate text from the submitted prompt and re-render the form.

    Form fields:
        seed_text: prompt to continue (missing field → empty string).
        language: 'russian' selects the Russian model; any other value
            falls back to the English model.
        temperature: sampling temperature; missing or non-numeric values
            fall back to 1.0.
    """
    # ROBUSTNESS FIX: the original used request.form['...'] and a bare
    # float(...), so a missing field returned 400 and a malformed
    # temperature crashed with 500.  Fall back to sane defaults instead.
    seed_text = request.form.get('seed_text', '')
    language = request.form.get('language', 'english')
    try:
        temperature = float(request.form.get('temperature', 1.0))
    except ValueError:
        temperature = 1.0
    if language == 'russian':
        generated_text = generate_text(seed_text, model_russian, tokenizer_russian, temperature=temperature)
    else:
        generated_text = generate_text(seed_text, model_english, tokenizer_english, temperature=temperature)
    return render_template('index.html', seed_text=seed_text, generated_text=generated_text)
# Run the Flask development server only when executed directly
# (debug=True enables the reloader and interactive debugger).
if __name__ == '__main__':
    app.run(debug=True)