{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Лабораторная работа 8 ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "from docx import Document\n",
    "\n",
    "\n",
    "def read_docx(file_path):\n",
    "    \"\"\"Return the text of a .docx file, one paragraph per line.\n",
    "\n",
    "    Args:\n",
    "        file_path: Path of the .docx file; passed straight to docx.Document.\n",
    "\n",
    "    Returns:\n",
    "        The text of every entry in doc.paragraphs, joined with newlines.\n",
    "    \"\"\"\n",
    "    doc = Document(file_path)\n",
    "    return \"\\n\".join(paragraph.text for paragraph in doc.paragraphs)\n",
    "\n",
    "\n",
    "def load_docs(dataset_path):\n",
    "    \"\"\"Read every .docx file of a directory into a DataFrame.\n",
    "\n",
    "    Args:\n",
    "        dataset_path: Directory that holds the documents. A trailing path\n",
    "            separator is optional.\n",
    "\n",
    "    Returns:\n",
    "        A DataFrame with the columns doc (file name) and text (document\n",
    "        text), one row per loaded file, in the order os.listdir yields them.\n",
    "    \"\"\"\n",
    "    records = []\n",
    "    for file_name in os.listdir(dataset_path):\n",
    "        # Skip names starting with ~$ (presumably the lock files Word keeps\n",
    "        # next to an open document - confirm against the dataset).\n",
    "        if file_name.startswith(\"~$\"):\n",
    "            continue\n",
    "        # Skip everything that is not a .docx file (hidden files, folders)\n",
    "        # instead of handing it to Document.\n",
    "        if not file_name.lower().endswith(\".docx\"):\n",
    "            continue\n",
    "        # os.path.join works with or without a trailing separator.\n",
    "        text = read_docx(os.path.join(dataset_path, file_name))\n",
    "        records.append({\"doc\": file_name, \"text\": text})\n",
    "    # Build the frame once; appending row by row through .loc is quadratic.\n",
    "    return pd.DataFrame(records, columns=[\"doc\", \"text\"])\n",
    "\n",
    "\n",
    "# Load the data\n",
    "df = load_docs(\"data/text/\")\n",
    "# 0 for files whose name starts with tz_, 1 for everything else.\n",
    "df[\"type\"] = 1 - df[\"doc\"].str.startswith(\"tz_\").astype(\"int64\")\n",
    "# os.listdir order is arbitrary: sort by file name and renumber the rows so\n",
    "# the displayed frame is reproducible.\n",
    "df = df.sort_values(by=[\"doc\"], ignore_index=True)\n",
    "df.info()\n",
    "\n",
    "display(df.head(), df.tail())"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}