IIS_2023_1/malkova_anastasia_lab_7/data.py

24 lines
711 B
Python
Raw Normal View History

2023-11-17 01:50:49 +04:00
import numpy as np
from collections import Counter
def create_data(path='train_text.txt'):
    """Load the training corpus from a UTF-8 text file as a single string.

    The lines of the file are joined with a single space. Line terminators
    are kept by the read, so every line after the first is preceded by one
    extra space in the result (e.g. ``"ab\\ncd\\n"`` becomes ``"ab\\n cd\\n"``).

    Args:
        path: Location of the text file to read. Defaults to
            ``'train_text.txt'``, resolved against the current working
            directory.

    Returns:
        The whole file content as one ``str``.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(path, encoding="utf8") as text_file:
        # Iterating the file yields the same lines as readlines(), without
        # building an intermediate list first.
        return ' '.join(text_file)
def text_to_seq(text_sample=None):
    """Encode a text as a sequence of character indices.

    Characters are ranked by frequency: the most frequent character gets
    index 0, the next one index 1, and so on. Characters with equal counts
    keep the order of their first occurrence in the text.

    Args:
        text_sample: The text to encode. When omitted (``None``), the text
            is loaded by calling ``create_data()`` at call time.

    Returns:
        A tuple ``(sequence, char_to_idx, idx_to_char)`` where

        * ``sequence`` is a NumPy array holding the index of every
          character of ``text_sample``, in order;
        * ``char_to_idx`` maps each distinct character to its index;
        * ``idx_to_char`` is the inverse mapping.
    """
    # The default must be resolved here, not in the signature: a default of
    # ``create_data()`` is evaluated once when the function is defined, which
    # reads the file on import and freezes its content for all later calls.
    if text_sample is None:
        text_sample = create_data()

    # most_common() sorts by count, descending, with a stable sort, so ties
    # stay in first-seen order.
    sorted_chars = [char for char, _ in Counter(text_sample).most_common()]
    char_to_idx = {char: index for index, char in enumerate(sorted_chars)}
    idx_to_char = dict(enumerate(sorted_chars))
    sequence = np.array([char_to_idx[char] for char in text_sample])
    return sequence, char_to_idx, idx_to_char