from collections import Counter

import numpy as np


def create_data(path='train_text.txt'):
    """Read a UTF-8 text file and return its contents as one string.

    The file is read line by line and the lines are joined with a single
    space. Each line keeps its own newline terminator, so every line after
    the first ends up preceded by one extra space character.

    Parameters
    ----------
    path : str, optional
        Location of the text file to read. Defaults to ``'train_text.txt'``
        in the current working directory.

    Returns
    -------
    str
        The joined file contents.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    with open(path, encoding="utf8") as text_file:
        lines = text_file.readlines()
    return ' '.join(lines)


def text_to_seq(text_sample=None):
    """Encode text as an array of character indices.

    Characters are ranked by how often they occur in ``text_sample``: the
    most frequent character gets index 0, the next one index 1, and so on.
    Characters with equal counts keep the order of their first appearance.

    Parameters
    ----------
    text_sample : str, optional
        Text to encode. When omitted (``None``), the text is loaded with
        ``create_data()`` at call time.

    Returns
    -------
    sequence : numpy.ndarray
        One-dimensional integer array holding the index of every character
        of ``text_sample``, in order. Empty (but still integer-typed) for
        empty text.
    char_to_idx : dict
        Mapping from character to its index.
    idx_to_char : dict
        Inverse mapping from index to character.
    """
    # Load lazily: a `create_data()` default in the signature would run the
    # file read once, at import time, and break importing this module
    # whenever the training file is absent.
    if text_sample is None:
        text_sample = create_data()

    # most_common() sorts by descending count and is stable for ties.
    ranked_chars = [char for char, _ in Counter(text_sample).most_common()]
    char_to_idx = {char: index for index, char in enumerate(ranked_chars)}
    idx_to_char = {index: char for char, index in char_to_idx.items()}

    # Explicit dtype keeps the result integer-typed even for empty input,
    # where NumPy would otherwise fall back to float64.
    sequence = np.array([char_to_idx[char] for char in text_sample], dtype=int)
    return sequence, char_to_idx, idx_to_char