8 лабабабаба

This commit is contained in:
a.puchkina 2025-02-22 12:46:29 +04:00
parent 4ecbf0a55b
commit 0ab7af9543
3 changed files with 55 additions and 2711 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

55
lab_8/lab8.ipynb Normal file
View File

@ -0,0 +1,55 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Лабораторная работа 8 ##"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from docx import Document\n",
"import os\n",
"\n",
"def read_docx(file_path):\n",
"    \"\"\"Return the text of a .docx file as one newline-joined string.\n",
"\n",
"    Args:\n",
"        file_path: Path handed to ``docx.Document``.\n",
"\n",
"    Returns:\n",
"        The ``text`` of each entry in ``doc.paragraphs``, joined with\n",
"        newline characters (an empty string when there are no paragraphs).\n",
"    \"\"\"\n",
"    doc = Document(file_path)\n",
"    return \"\\n\".join(paragraph.text for paragraph in doc.paragraphs)\n",
"\n",
"def load_docs(dataset_path):\n",
"    \"\"\"Read every document found in a directory into a DataFrame.\n",
"\n",
"    Entries whose name starts with ``~$`` are skipped (presumably Word's\n",
"    temporary lock files -- confirm against the dataset).\n",
"\n",
"    Args:\n",
"        dataset_path: Directory to scan; a trailing separator is optional.\n",
"\n",
"    Returns:\n",
"        DataFrame with columns ``doc`` (entry name) and ``text`` (result of\n",
"        ``read_docx`` for that entry), one row per entry in listing order.\n",
"    \"\"\"\n",
"    records = [\n",
"        # os.path.join keeps the path valid with or without a trailing slash.\n",
"        (file_name, read_docx(os.path.join(dataset_path, file_name)))\n",
"        for file_name in os.listdir(dataset_path)\n",
"        if not file_name.startswith(\"~$\")\n",
"    ]\n",
"    # Build the frame once instead of appending to it row by row.\n",
"    return pd.DataFrame(records, columns=[\"doc\", \"text\"])\n",
"\n",
"# Load the data\n",
"df = load_docs(\"data/text/\")\n",
"# Label each document: 0 when its file name starts with \"tz_\", otherwise 1.\n",
"# Vectorized string check instead of a row-wise apply.\n",
"df[\"type\"] = 1 - df[\"doc\"].astype(str).str.startswith(\"tz_\").astype(int)\n",
"df.info()\n",
"# Sort by file name; reassign instead of mutating the frame in place.\n",
"df = df.sort_values(by=[\"doc\"])\n",
"\n",
"display(df.head(), df.tail())"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}