1.4 KiB
1.4 KiB
Лабораторная работа 8
In [ ]:
import pandas as pd
from docx import Document
import os
def read_docx(file_path):
    """Return the text of a .docx file, one line per paragraph.

    The document at ``file_path`` is opened with ``docx.Document`` and the
    text of each entry in ``doc.paragraphs`` is joined with newlines.
    """
    document = Document(file_path)
    return "\n".join(para.text for para in document.paragraphs)
def load_docs(dataset_path: str) -> pd.DataFrame:
    """Read every document in a directory into a two-column DataFrame.

    Each directory entry is passed to ``read_docx``. Entries whose name
    starts with ``~$`` are skipped (presumably Word lock/temp files).

    Args:
        dataset_path: Directory holding the documents. A trailing path
            separator is optional.

    Returns:
        A DataFrame with columns ``doc`` (file name) and ``text`` (document
        text), one row per processed entry, in ``os.listdir`` order.
    """
    rows = []
    for file_name in os.listdir(dataset_path):
        if file_name.startswith("~$"):
            continue
        # os.path.join works with or without a trailing separator, unlike
        # plain string concatenation.
        full_path = os.path.join(dataset_path, file_name)
        rows.append([file_name, read_docx(full_path)])
    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(rows, columns=["doc", "text"])
# Load the data
df = load_docs("data/text/")
# Label each document: 0 when the file name starts with "tz_", 1 otherwise.
# A vectorised string test replaces the row-wise apply; it also yields a
# proper (empty) Series when no documents were loaded.
df["type"] = (~df["doc"].astype(str).str.startswith("tz_")).astype("int64")
df.info()
df.sort_values(by=["doc"], inplace=True)
display(df.head(), df.tail())