2025-02-22 12:46:29 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Лабораторная работа 8 ##"
]
},
2025-02-22 12:57:09 +04:00
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Загрузка данных из .doc файлов:"
]
},
2025-02-22 12:46:29 +04:00
{
"cell_type": "code",
2025-02-22 12:57:09 +04:00
"execution_count": 96,
2025-02-22 12:46:29 +04:00
"metadata": {},
2025-02-22 12:57:09 +04:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Загружено 41 документов.\n"
]
}
],
2025-02-22 12:46:29 +04:00
"source": [
"import os\n",
2025-02-22 12:57:09 +04:00
"import win32com.client\n",
"\n",
"# Folder with the source .doc files, resolved against the current working directory\n",
"data_path = os.path.abspath(\"..//static//csv//tz_itdocs\")\n",
"\n",
"# Fail early with a clear message when the folder is missing\n",
"if not os.path.exists(data_path):\n",
"    raise FileNotFoundError(f\"Папка {data_path} не найдена.\")\n",
"\n",
"# Start Word through COM and keep its window hidden\n",
"word = win32com.client.Dispatch(\"Word.Application\")\n",
"word.Visible = False\n",
"\n",
"# Read every .doc file; sorted() makes the document order reproducible,\n",
"# because os.listdir() returns entries in arbitrary order\n",
"texts = []\n",
"try:\n",
"    for filename in sorted(os.listdir(data_path)):\n",
"        if not filename.endswith(\".doc\"):\n",
"            continue\n",
"        file_path = os.path.join(data_path, filename)\n",
"        doc = None\n",
"        try:\n",
"            doc = word.Documents.Open(file_path)\n",
"            texts.append(doc.Content.Text)\n",
"        except Exception as e:\n",
"            # Best effort: report the failing file and continue with the rest\n",
"            print(f\"Ошибка при чтении файла {filename}: {e}\")\n",
"        finally:\n",
"            # Close the document even when reading its text failed\n",
"            if doc is not None:\n",
"                try:\n",
"                    doc.Close(SaveChanges=False)  # discard any changes\n",
"                except Exception as e:\n",
"                    print(f\"Ошибка при закрытии файла {filename}: {e}\")\n",
"finally:\n",
"    # Always shut Word down so the hidden instance is not left running\n",
"    word.Quit()\n",
"\n",
"# Report how many documents were loaded\n",
"print(f\"Загружено {len(texts)} документов.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Предобработка текста:"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import WordNetLemmatizer\n",
"\n",
"# Russian stop-word list as a set for fast membership tests\n",
"stop_words = set(stopwords.words('russian'))\n",
"# NOTE(review): WordNet is an English lexical database, so this lemmatizer\n",
"# presumably leaves Russian tokens unchanged - confirm whether a Russian\n",
"# morphological analyzer is needed here\n",
"lemmatizer = WordNetLemmatizer()\n",
2025-02-22 12:46:29 +04:00
"\n",
2025-02-22 12:57:09 +04:00
"def preprocess_text(text):\n",
"    \"\"\"Normalize one document into a space-joined string of filtered tokens.\n",
"\n",
"    Non-word characters are replaced by spaces, the text is lower-cased,\n",
"    tokens present in ``stop_words`` are dropped and every remaining token\n",
"    is passed through ``lemmatizer.lemmatize``.\n",
"    \"\"\"\n",
"    normalized = re.sub(r'\\W', ' ', text).lower()\n",
"    kept = []\n",
"    for token in normalized.split():\n",
"        if token in stop_words:\n",
"            continue\n",
"        kept.append(lemmatizer.lemmatize(token))\n",
"    return ' '.join(kept)\n",
2025-02-22 12:46:29 +04:00
"\n",
2025-02-22 12:57:09 +04:00
"# Run every loaded document through preprocess_text\n",
"texts = list(map(preprocess_text, texts))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Векторизация текста"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"# Turn the preprocessed documents into a sparse TF-IDF matrix,\n",
"# keeping at most 1000 features\n",
"vectorizer = TfidfVectorizer(max_features=1000)\n",
"X = vectorizer.fit_transform(texts)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Кластеризация с использованием K-means"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1kAAAK9CAYAAADWo6YTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACXwklEQVR4nOzde1xUdf7H8fcBFAQFJLmooeKlVbykYpiWmSsqXbSLq13XS3YzrUxrw3bL7CK5aWlmZtaqrXZV1zU1xEv+rM0kRSpTWzXUUm5mgkCKzpzfH8Ss4wAyeBAGX8/Hg0fNme+c85kZGHnz/Z7PMUzTNAUAAAAAsIRXdRcAAAAAALUJIQsAAAAALETIAgAAAAALEbIAAAAAwEKELAAAAACwECELAAAAACxEyAIAAAAACxGyAAAAAMBChCwAAAAAsBAhCwAAAAAsRMjCRW3BggUyDMPpKywsTH369NGnn35a3eUBAIBK2r9/v9O/797e3mrWrJluueUWpaWluYw/ceKEXn31VXXv3l1BQUHy8/PTZZddprFjx+q///1vqcf4y1/+IsMwdNttt1Xxs4GnMUzTNKu7CKC6LFiwQCNHjtRzzz2nqKgomaaprKwsLViwQN9//70++eQT3XjjjdVdJgAAcNP+/fsVFRWlO+64Q9dff71sNpt27dqlOXPm6OTJk/rqq6/UuXNnSdKRI0cUHx+vbdu26cYbb1RcXJzq16+vH374QR988IEyMzNVVFTktH/TNNWsWTP5+PgoKytLWVlZatCgQTU8U9REPtVdAFATXHfdderWrZvj9qhRoxQeHq7333+fkAUAgAfr2rWr7r77bsftq666SoMGDdKcOXM0d+5cSdKIESO0fft2LVmyRIMHD3Z6/PPPP6+//vWvLvvduHGjfv75Z23YsEEDBgzQsmXLNHz48Kp9MvAYLBcEShEcHKx69erJx+d/f4coWXawYMECp7FjxoyRYRgaMWKEY9uyZcsUGxurkJAQ1atXT23bttXUqVNVMnH82WefyTAM/etf/3I59nvvvSfDMLR582ZJ0rfffqsRI0aoZcuW8vPzU0REhO655x798ssvpdbeokULlyWQhmFo48aNTmPOrFeSPv74YxmGoRYtWji2/fDDD/rjH/+oiIgI+fr6KjIyUg8++KCOHj3qGFNUVKRnnnlGMTExCgoKUkBAgHr16qXPPvvMaf8lr9+0adNcau7QoYOuvfZap23XXnuty7avv/7a8XzOlJ+frwkTJqhly5aqU6eO0/M+cuRIqa9Tecd58cUX5eXlpffee6/U51Da15mmTZumnj176pJLLlG9evUUExOjJUuWlHr8RYsWKTY2Vv7+/mrYsKGuueYaJScnSyr7vSz5OvO9stvtmjFjhtq3by8/Pz+Fh4frgQce0K+//up0vBYtWujGG29UcnKyOnfuLD8/P0VHR2vZsmVO40qW0u7fv9/pGJ06dXL5OXj22WcVHR2t+vXrKzAwUFdeeaWWL1/utL/PP/9cQ4YMUbNmzRzfS4899ph+++03p3EjRoxQ/fr1XV6nJUuWuHwfb9y40WWbJN1www0yDEPPPvusU42lfd9ERESUuo8zlTy2vK8zH79lyxbFx8crKChI/v7+6t27t/7zn/+Uus8zffbZZ/L19dWDDz7otP3QoUMaNWqUmjRpIl9fX0VFRWn06NEqKioqdcnz2V9nvlcbNmxQr169FBAQoODgYN10003atWtXuc+3QYMGio2NdXlPy1JevWVx5/NVko4dO6bHHntMLVq0kK+vry699FINGzbM8fNe8r1R1tfZ+/vxxx81ZMgQhYSEyN/fX1deeaVWrVpVaq0jRowodZ9nfr+V9X18tptuukktWrSQn5+fwsLCNGjQIH333XdOY06fPq3nn39erVq1kq+vr1q0aKGnnnpKJ0+edBp35ueFl5eXIiIidNttt+ngwYNO4yr6+WQYhsaOHeuy/cYbb3T67KnMez
du3DhFRkbK19dXrVu31tSpU2W32132aRiGy/fdiRMn1LBhwzL/PamIP/7xj5Kk9PR0ScU/s6tWrdKoUaNcApYk+fr6lnqsxYsXKzo6Wn369FFcXJwWL15cqXpQOzGTBUjKzc3VkSNHZJqmsrOzNWvWLOXn5zv95as0e/fu1bx581y25+XlqXv37ho+fLjq1KmjpKQkJSQkyMfHRxMmTNC1116ryMhILV68WLfccovTYxcvXqxWrVqpR48ekqS1a9fqxx9/1MiRIxUREaHvv/9eb731lr7//nt99dVXLr+oSVKvXr10//33S5J27dqlKVOmlPs8Tp8+Xepf6QoKCnTppZdq4MCBCgwM1I4dOzR79mwdOnRIn3zyieO5vv3227rjjjt033336fjx43rnnXc0YMAApaSkOJZiWOHJJ58sdfsTTzyhN998U6NGjdJVV12lOnXqaNmyZaWG2HOZP3++/va3v2n69Om68847Sx1z//33q1evXpJU6nFmzpypQYMG6a677lJRUZE++OADDRkyRCtXrtQNN9zgGDd58mQ9++yz6tmzp5577jnVrVtXW7Zs0YYNG9S/f3/NmDFD+fn5kv73Pj711FNq166dJDn9EvfAAw84lr8+8sgjSk9P1+uvv67t27frP//5j+rUqeMYu2fPHt1222168MEHNXz4cM2fP19DhgxRUlKS+vXrV+Zr889//tPlF0Cp+PvklltuUYsWLfTbb79pwYIFGjx4sDZv3qzY2FhJxSG+sLBQo0eP1iWXXKKUlBTNmjVLP//8sz7++ONy3xN3bNq0SatXr67Q2OnTpysrK+uc42699Va1bt3acfuxxx5Tu3btHD9jkhzvyYYNG3TdddcpJiZGkyZNkpeXl+bPn68//vGP+vzzzx2vx9m++eYb3Xzzzbr++us1e/Zsx/bDhw8rNjZWx44d0/3336+2bdvq0KFDWrJkiQoLC3XNNdfon//8p2P8iy++KElOP889e/aUJK1bt07XXXedWrZsqWeffVa//fabZs2apauuukqpqalOvzhLcuz3yJEjeuONNzRkyBDt2LFDf/jDH8p8rc5Vb926dct9rc9U1udrfn6+evXqpV27dumee+5R165ddeTIEa1YsUI///yzGjVq5Bj7yCOP6IorrnB6/L333ut0OysrSz179lRhYaEeeeQRXXLJJVq4cKEGDRqkJUuWuHxGS1KjRo306quvOm7/+c9/rvDzOtv999+viIgIHT58WK+//rri4uKUnp4uf39/R70LFy7Un/70J02YMEFbtmxRYmKidu3a5fLZU/LZb7fbtWPHDs2YMUOHDx/W559/7hhT0c+n81HWe1dYWKjevXvr0KFDeuCBB9SsWTN9+eWXmjhxojIyMjRjxgyn8X5+fpo/f75uvvlmx7Zly5bpxIkT51Xfvn37JEmXXHKJJGnFihWS3HsfT548qaVLl2rChAmSpDvuuEMjR45UZmamIiIizqs+1BImcBGbP3++Kcnly9fX11ywYIHT2PT0dFOSOX/+fMe2oUOHmh06dDAjIyPN4cOHl3us6Oho88Ybb3Tcnjhxounr62seO3bMsS07O9v08fExJ02a5NhWWFjosq/333/flGRu2rTJ5b6mTZuaI0eOdNz+7LPPTEnmZ5995tjWvHlzp3rfeOMN09fX1+zTp4/ZvHnzcp/HQw89ZNavX99x+/Tp0+bJkyedxvz6669meHi4ec899zi2lbx+L7/8sss+27dvb/bu3dtpW+/evZ22rV692pRkxsfHm2d/dDVu3NgcMGCA07ZJkyaZksycnJxyn8+Zx1m1apXp4+NjTpgwodSxe/bsMSWZCxcudDnOmc5+z4qKiswOHTqYf/zjH5325eXlZd5yyy2mzWZzGm+3212OXdr7WOLzzz83JZmLFy922p6UlOSyvXnz5qYkc+nSpY5tubm5ZuPGjc0uXbo4tpX8bKSnp5umaZonTpwwmzVrZl533XUuPwdny87ONi
WZ06ZNK/M1MU3TTExMNA3DMA8cOODYNnz4cDMgIMBl7Mcff+zy/Et7Tbp37+6o8cyfo7Pfp+zsbLNBgwaOsaW9rmU
"text/plain": [
"<Figure size 1000x800 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>cluster</th>\n",
" <th>pca_1</th>\n",
" <th>pca_2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2 2 техническое задание 2 2 1 общие сведения п...</td>\n",
" <td>2</td>\n",
" <td>0.379267</td>\n",
" <td>0.009187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2 2 техническое задание 2 2 1 общие сведения п...</td>\n",
" <td>0</td>\n",
" <td>0.453726</td>\n",
" <td>0.042687</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2 2 техническое задание общие сведения данной ...</td>\n",
" <td>2</td>\n",
" <td>0.490069</td>\n",
" <td>0.078381</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>техническое задание 2 2 1 общие сведения интер...</td>\n",
" <td>0</td>\n",
" <td>0.073403</td>\n",
" <td>0.132265</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2 2 техническое задание 2 2 1 общие сведения 1...</td>\n",
" <td>0</td>\n",
" <td>0.494253</td>\n",
" <td>-0.036965</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text cluster pca_1 \\\n",
"0 2 2 техническое задание 2 2 1 общие сведения п... 2 0.379267 \n",
"1 2 2 техническое задание 2 2 1 общие сведения п... 0 0.453726 \n",
"2 2 2 техническое задание общие сведения данной ... 2 0.490069 \n",
"3 техническое задание 2 2 1 общие сведения интер... 0 0.073403 \n",
"4 2 2 техническое задание 2 2 1 общие сведения 1... 0 0.494253 \n",
"\n",
" pca_2 \n",
"0 0.009187 \n",
"1 0.042687 \n",
"2 0.078381 \n",
"3 0.132265 \n",
"4 -0.036965 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>cluster</th>\n",
" <th>pca_1</th>\n",
" <th>pca_2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>этапы разработки проекта заключительные стадии...</td>\n",
" <td>3</td>\n",
" <td>-0.471378</td>\n",
" <td>-0.163534</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>этапы разработки проекта определение стратегии...</td>\n",
" <td>1</td>\n",
" <td>-0.350179</td>\n",
" <td>-0.501840</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>этапы разработки проекта реализация тестирован...</td>\n",
" <td>1</td>\n",
" <td>-0.230170</td>\n",
" <td>-0.509385</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>этапы разработки проекта стратегия анализ введ...</td>\n",
" <td>1</td>\n",
" <td>-0.277140</td>\n",
" <td>-0.409235</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>2 1 3 язык манипуляции данными ямд язык манипу...</td>\n",
" <td>3</td>\n",
" <td>-0.267309</td>\n",
" <td>0.168029</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text cluster pca_1 \\\n",
"36 этапы разработки проекта заключительные стадии... 3 -0.471378 \n",
"37 этапы разработки проекта определение стратегии... 1 -0.350179 \n",
"38 этапы разработки проекта реализация тестирован... 1 -0.230170 \n",
"39 этапы разработки проекта стратегия анализ введ... 1 -0.277140 \n",
"40 2 1 3 язык манипуляции данными ямд язык манипу... 3 -0.267309 \n",
"\n",
" pca_2 \n",
"36 -0.163534 \n",
"37 -0.501840 \n",
"38 -0.509385 \n",
"39 -0.409235 \n",
"40 0.168029 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.manifold import TSNE\n",
"\n",
"num_clusters = 5  # number of K-means clusters\n",
"kmeans = KMeans(n_clusters=num_clusters, random_state=42)\n",
"clusters = kmeans.fit_predict(X)\n",
"\n",
"# Convert the sparse TF-IDF matrix to a dense array once; both t-SNE and PCA use it\n",
"X_dense = X.toarray()\n",
"\n",
"# t-SNE embedding of the documents\n",
"# NOTE(review): X_embedded is not used by the plot below, which is drawn from the PCA projection\n",
"tsne = TSNE(n_components=2, random_state=42)\n",
"X_embedded = tsne.fit_transform(X_dense)\n",
"\n",
"# Project the documents onto two principal components for plotting\n",
"pca = PCA(n_components=2)\n",
"X_pca = pca.fit_transform(X_dense)\n",
"\n",
"# Collect texts, cluster labels and PCA coordinates in one DataFrame\n",
"df = pd.DataFrame({\n",
"    \"text\": texts,          # preprocessed document texts\n",
"    \"cluster\": clusters,    # K-means cluster label of each document\n",
"    \"pca_1\": X_pca[:, 0],   # first PCA component\n",
"    \"pca_2\": X_pca[:, 1]    # second PCA component\n",
"})\n",
"\n",
"# Scatter plot of the documents, one series per cluster\n",
"plt.figure(figsize=(10, 8))\n",
"for cluster in range(num_clusters):\n",
"    # Rows that belong to the current cluster\n",
"    cluster_points = df[df[\"cluster\"] == cluster]\n",
"    plt.scatter(cluster_points[\"pca_1\"], cluster_points[\"pca_2\"], label=f'Cluster {cluster}')\n",
"\n",
"plt.title(\"Визуализация кластеризации текстов с использованием PCA\")\n",
"plt.xlabel(\"Главная компонента 1\")\n",
"plt.ylabel(\"Главная компонента 2\")\n",
"plt.legend()\n",
"plt.show()\n",
"\n",
"# Вывод первых и последних строк DataFrame\n",
2025-02-22 12:46:29 +04:00
"display(df.head(n=5), df.tail(n=5))"
]
2025-02-22 12:57:09 +04:00
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Классификация текстов (пример с использованием SVM)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Отчет о классификации:\n",
" precision recall f1-score support\n",
"\n",
" 0 0.57 1.00 0.73 4\n",
" 1 1.00 1.00 1.00 3\n",
" 2 0.00 0.00 0.00 3\n",
" 3 1.00 1.00 1.00 3\n",
"\n",
" accuracy 0.77 13\n",
" macro avg 0.64 0.75 0.68 13\n",
"weighted avg 0.64 0.77 0.69 13\n",
"\n",
"Точность: 0.77\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
"d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
]
}
],
"source": [
"from sklearn.metrics import accuracy_score, classification_report\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.svm import SVC\n",
"\n",
"# Use the K-means cluster assignments as class labels (illustrative example)\n",
"y = kmeans.labels_\n",
"\n",
"# Hold out 30% of the documents for testing; the seed keeps the split reproducible\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n",
"\n",
"# Train a linear-kernel SVM on the training part\n",
"svm_model = SVC(kernel='linear')\n",
"svm_model.fit(X_train, y_train)\n",
"\n",
"# Predict labels for the held-out documents\n",
"y_pred = svm_model.predict(X_test)\n",
"\n",
"# Evaluate the classifier; zero_division=0 reports 0.0 for labels that were\n",
"# never predicted without emitting UndefinedMetricWarning\n",
"print(\"Отчет о классификации:\")\n",
"print(classification_report(y_test, y_pred, zero_division=0))\n",
"print(f\"Точность: {accuracy_score(y_test, y_pred):.2f}\")"
]
2025-02-22 12:46:29 +04:00
}
],
"metadata": {
2025-02-22 12:57:09 +04:00
"kernelspec": {
"display_name": "aimenv",
"language": "python",
"name": "python3"
},
2025-02-22 12:46:29 +04:00
"language_info": {
2025-02-22 12:57:09 +04:00
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
2025-02-22 12:46:29 +04:00
}
},
"nbformat": 4,
"nbformat_minor": 2
}