from keras_preprocessing.text import Tokenizer
import numpy as np
from flask import Flask
from keras.layers import Dense, LSTM, Embedding
from keras.models import load_model, Sequential
from keras_preprocessing.sequence import pad_sequences

app = Flask(__name__)


@app.route("/")
def home():
    return "<html>" \
           "<h1>Жукова Алина ПИбд-41</h1>" \
           "<h1>Lab work No. 7</h1>" \
           "<table>" \
           "<td>" \
           "<form action='http://127.0.0.1:5000/k4_1_task_7' method='get'>" \
           "<input type='submit' value='Generate text'>" \
           "</form>" \
           "</td>" \
           "</table>" \
           "</html>"


# Recurrent neural network, text generation
# Variant 10
@app.route("/k4_1_task_7", methods=['GET'])
def k4_1_task_7():
    # Load the text from a file
    # Russian text
    # with open('lab_4_1__7_text_rus.txt', 'r', encoding='utf-8') as file:
    #     text = file.read()
    # English text
    with open('lab_4_1__7_text_eng.txt', 'r', encoding='utf-8') as file:
        text = file.read()

    # Create a character-level Tokenizer and fit it on the text
    tokenizer = Tokenizer(char_level=True)
    tokenizer.fit_on_texts([text])

    # Convert the text into a flat sequence of character indices
    sequences = tokenizer.texts_to_sequences([text])[0]

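    # Note: with its default settings the Tokenizer lowercases the text, so the
    # vocabulary (and hence the generated output) is lowercase-only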
    # Prepare the training data: sliding windows of seq_length characters,
    # each labelled with the character that follows the window
    seq_length = 100
    dataX, dataY = [], []
    for i in range(0, len(sequences) - seq_length):
        seq_in = sequences[i:i + seq_length]
        seq_out = sequences[i + seq_length]
        dataX.append(seq_in)
        dataY.append(seq_out)

    dataX = np.array(dataX)
    dataY = np.array(dataY)
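    # dataX has shape (num_windows, seq_length); dataY has shape (num_windows,)
    # and holds the integer index of each window's next character, which is what
    # sparse_categorical_crossentropy below expects as targets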

    # Build the model
    vocab_size = len(tokenizer.word_index) + 1
    embedding_dim = 256
    rnn_units = 1024

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=seq_length))
    model.add(LSTM(units=rnn_units))
    model.add(Dense(units=vocab_size, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
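    # model.summary()  # optional: print layer output shapes for a quick check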

    # Load a pre-trained model (this replaces the freshly built one above)
    # Russian model
    # model = load_model('lab_4_1__7_model.keras')
    # English model
    model = load_model('lab_4_1__7_model_eng.keras')
    print("Loaded model from disk")

    # Train the model
    # batch_size = 64
    # model.fit(dataX, dataY, epochs=15, batch_size=batch_size)
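    # Note: dataX and dataY are only consumed by the commented-out fit() call;
    # with the pre-trained model loaded above, training is skipped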

    # Greedy character-by-character generation: repeatedly predict the most
    # likely next character and slide the seed window forward by one
    def generate_text(seed_text, gen_length):
        generated_text = seed_text

        for _ in range(gen_length):
            sequence = tokenizer.texts_to_sequences([seed_text])[0]
            sequence = pad_sequences([sequence], maxlen=seq_length)
            prediction = model.predict(sequence)[0]
            predicted_index = np.argmax(prediction)
            predicted_char = tokenizer.index_word[predicted_index]
            generated_text += predicted_char
            seed_text += predicted_char
            seed_text = seed_text[1:]

        return generated_text
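    # A possible variation (not part of the original lab): sample the next
    # character from the softmax output instead of always taking the argmax,
    # which tends to fall into repetitive loops:
    #     p = prediction / prediction.sum()
    #     predicted_index = np.random.choice(len(p), p=p)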

    # Usage example
    start_phrase = "Black cat"
    # Russian
    # generated_text = generate_text("Невероятный котик", 250)
    # English
    generated_text = generate_text(start_phrase, 250)

    # Save the model
    # Russian model
    # model.save('C:/Users/Alina/PycharmProjects/lab1/lab_4_1__7_model.keras')
    # English model
    # model.save('C:/Users/Alina/PycharmProjects/lab1/lab_4_1__7_model_eng.keras')
    # print("Saved model to disk")
return "<html>" \
|
||
|
"<h1></h1>" \
|
||
|
"<h2>Вариант 10. Задание 7 - Генерация текста</h2>" \
|
||
|
"<h2> Сгенерированный текст, начальная фраза " + start_phraze + ": " + str(generated_text) + " </h2>" \
|
||
|
"</html>"
|
||
|
|
||
|
if __name__ == "__main__":
    app.run(debug=True)