2025-02-22 12:46:29 +04:00

1.4 KiB

Лабораторная работа 8

In [ ]:
import pandas as pd
from docx import Document
import os

def read_docx(file_path):
    doc = Document(file_path)
    full_text = []
    for paragraph in doc.paragraphs:
        full_text.append(paragraph.text)
    return "\n".join(full_text)

def load_docs(dataset_path):
    df = pd.DataFrame(columns=["doc", "text"])
    for file_path in os.listdir(dataset_path):
        if file_path.startswith("~$"):
            continue
        text = read_docx(dataset_path + file_path)
        df.loc[len(df.index)] = [file_path, text]
    return df

# Загрузка данных
df = load_docs("data/text/")
df["type"] = df.apply(
    lambda row: 0 if str(row["doc"]).startswith("tz_") else 1, axis=1
)
df.info()
df.sort_values(by=["doc"], inplace=True)

display(df.head(), df.tail())