56 lines
1.4 KiB
Plaintext
Raw Normal View History

2025-02-22 12:46:29 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Лабораторная работа 8 ##"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from docx import Document\n",
"import os\n",
"\n",
"def read_docx(file_path):\n",
"    \"\"\"Return the text of a .docx file as a single string.\n",
"\n",
"    The document at ``file_path`` is opened with ``docx.Document`` and the\n",
"    ``text`` of every entry in its ``paragraphs`` list is joined with a\n",
"    newline character, in document order.\n",
"    \"\"\"\n",
"    paragraphs = Document(file_path).paragraphs\n",
"    return \"\\n\".join(item.text for item in paragraphs)\n",
"\n",
"def load_docs(dataset_path):\n",
"    \"\"\"Read every document in a directory into a DataFrame.\n",
"\n",
"    Args:\n",
"        dataset_path: Directory that holds the documents. A trailing path\n",
"            separator is optional.\n",
"\n",
"    Returns:\n",
"        DataFrame with the columns ``doc`` (file name) and ``text`` (the\n",
"        string returned by ``read_docx``), one row per file, ordered by\n",
"        file name. The frame is empty, but keeps both columns, when the\n",
"        directory contains nothing to read.\n",
"    \"\"\"\n",
"    records = []\n",
"    # os.listdir() returns entries in arbitrary order; sort for reproducibility.\n",
"    for file_name in sorted(os.listdir(dataset_path)):\n",
"        # Presumably the \"~$\" lock files an editor keeps beside an open document.\n",
"        if file_name.startswith(\"~$\"):\n",
"            continue\n",
"        file_path = os.path.join(dataset_path, file_name)\n",
"        # Sub-directories (e.g. .ipynb_checkpoints) cannot be opened as documents.\n",
"        if not os.path.isfile(file_path):\n",
"            continue\n",
"        records.append({\"doc\": file_name, \"text\": read_docx(file_path)})\n",
"    # Build the frame once instead of appending to it row by row.\n",
"    return pd.DataFrame(records, columns=[\"doc\", \"text\"])\n",
"\n",
"# Load the documents and label them: file names starting with \"tz_\" get\n",
"# type 0, every other file gets type 1.\n",
"df = load_docs(\"data/text/\")\n",
"# Vectorised prefix test; a row-wise apply would return a DataFrame instead\n",
"# of a Series when the frame is empty.\n",
"df[\"type\"] = (~df[\"doc\"].astype(str).str.startswith(\"tz_\")).astype(int)\n",
"df.info()\n",
"# Reassign instead of sorting in place so that re-running the cell is safe.\n",
"df = df.sort_values(by=[\"doc\"])\n",
"\n",
"display(df.head(), df.tail())"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}