8 лабабабаба
This commit is contained in:
parent
4ecbf0a55b
commit
0ab7af9543
1303
lab_2/lab2.ipynb
1303
lab_2/lab2.ipynb
File diff suppressed because one or more lines are too long
1408
lab_3/lab3.ipynb
1408
lab_3/lab3.ipynb
File diff suppressed because one or more lines are too long
55
lab_8/lab8.ipynb
Normal file
55
lab_8/lab8.ipynb
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Лабораторная работа 8 ##"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from docx import Document\n",
|
||||||
|
"import os\n",
|
||||||
|
"\n",
|
||||||
|
"def read_docx(file_path):\n",
|
||||||
|
" doc = Document(file_path)\n",
|
||||||
|
" full_text = []\n",
|
||||||
|
" for paragraph in doc.paragraphs:\n",
|
||||||
|
" full_text.append(paragraph.text)\n",
|
||||||
|
" return \"\\n\".join(full_text)\n",
|
||||||
|
"\n",
|
||||||
|
"def load_docs(dataset_path):\n",
|
||||||
|
" df = pd.DataFrame(columns=[\"doc\", \"text\"])\n",
|
||||||
|
" for file_path in os.listdir(dataset_path):\n",
|
||||||
|
" if file_path.startswith(\"~$\"):\n",
|
||||||
|
" continue\n",
|
||||||
|
" text = read_docx(dataset_path + file_path)\n",
|
||||||
|
" df.loc[len(df.index)] = [file_path, text]\n",
|
||||||
|
" return df\n",
|
||||||
|
"\n",
|
||||||
|
"# Загрузка данных\n",
|
||||||
|
"df = load_docs(\"data/text/\")\n",
|
||||||
|
"df[\"type\"] = df.apply(\n",
|
||||||
|
" lambda row: 0 if str(row[\"doc\"]).startswith(\"tz_\") else 1, axis=1\n",
|
||||||
|
")\n",
|
||||||
|
"df.info()\n",
|
||||||
|
"df.sort_values(by=[\"doc\"], inplace=True)\n",
|
||||||
|
"\n",
|
||||||
|
"display(df.head(), df.tail())"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"language_info": {
|
||||||
|
"name": "python"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user