8 лабабабаба #20

Merged
Arutunyan-Dmitry merged 2 commits from lab_8 into main 2025-02-22 12:59:30 +04:00
3 changed files with 55 additions and 2711 deletions
Showing only changes of commit 0ab7af9543 - Show all commits

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

55
lab_8/lab8.ipynb Normal file
View File

@ -0,0 +1,55 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Лабораторная работа 8 ##"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from docx import Document\n",
"import os\n",
"\n",
"def read_docx(file_path):\n",
" doc = Document(file_path)\n",
" full_text = []\n",
" for paragraph in doc.paragraphs:\n",
" full_text.append(paragraph.text)\n",
" return \"\\n\".join(full_text)\n",
"\n",
"def load_docs(dataset_path):\n",
" df = pd.DataFrame(columns=[\"doc\", \"text\"])\n",
" for file_path in os.listdir(dataset_path):\n",
" if file_path.startswith(\"~$\"):\n",
" continue\n",
" text = read_docx(dataset_path + file_path)\n",
" df.loc[len(df.index)] = [file_path, text]\n",
" return df\n",
"\n",
"# Загрузка данных\n",
"df = load_docs(\"data/text/\")\n",
"df[\"type\"] = df.apply(\n",
" lambda row: 0 if str(row[\"doc\"]).startswith(\"tz_\") else 1, axis=1\n",
")\n",
"df.info()\n",
"df.sort_values(by=[\"doc\"], inplace=True)\n",
"\n",
"display(df.head(), df.tail())"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}