24 lines
711 B
Python
24 lines
711 B
Python
|
import numpy as np
|
||
|
from collections import Counter
|
||
|
|
||
|
|
||
|
def create_data(path='train_text.txt'):
    """Read a UTF-8 text file and return its lines joined by single spaces.

    Each line keeps its own trailing newline, so consecutive lines end up
    separated by a newline followed by a space.

    Args:
        path: Location of the text file to read. Defaults to
            'train_text.txt' in the current working directory.

    Returns:
        The file content as one string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(path, encoding="utf8") as text_file:
        # Iterating the file yields the same lines readlines() would return.
        return ' '.join(text_file)
|
||
|
|
||
|
|
||
|
def text_to_seq(text_sample=None):
    """Encode text as an array of character indices ordered by frequency.

    The most frequent character gets index 0, the next most frequent gets
    index 1, and so on. Characters with equal counts keep the order in which
    they first appear in the text.

    Args:
        text_sample: The string to encode. When omitted (None), the text is
            loaded by calling create_data() at call time.

    Returns:
        A tuple (sequence, char_to_idx, idx_to_char) where
        sequence is a 1-D integer NumPy array with one index per character,
        char_to_idx maps each character to its index, and
        idx_to_char is the inverse mapping.
    """
    if text_sample is None:
        # Load lazily: a default of create_data() in the signature would be
        # evaluated once at definition time, reading the file on import.
        text_sample = create_data()

    # most_common() sorts by descending count and is stable for ties.
    sorted_chars = [char for char, _ in Counter(text_sample).most_common()]

    char_to_idx = {char: index for index, char in enumerate(sorted_chars)}
    idx_to_char = dict(enumerate(sorted_chars))

    # An explicit integer dtype keeps empty input from producing float64.
    sequence = np.array(
        [char_to_idx[char] for char in text_sample], dtype=int
    )

    return sequence, char_to_idx, idx_to_char
|