560 lines
292 KiB
Plaintext
Raw Permalink Normal View History

2025-02-28 20:43:35 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Лабораторная 8"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Записки по ПМУ (кластеризация, антиплагиат)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Файл pmu/31-МасенькинМС.pdf не подходит формату: Package not found at 'pmu/31-МасенькинМС.pdf'\n",
"Файл pmu/31-РазубаевСМ (1).pdf не подходит формату: Package not found at 'pmu/31-РазубаевСМ (1).pdf'\n",
"Файл pmu/31-ТерёхинАС.doc не подходит формату: Package not found at 'pmu/31-ТерёхинАС.doc'\n",
"Файл pmu/31-ШаныгинАВ (1).doc не подходит формату: Package not found at 'pmu/31-ШаныгинАВ (1).doc'\n",
"Файл pmu/32-КозловаАА (1).doc не подходит формату: Package not found at 'pmu/32-КозловаАА (1).doc'\n",
"Файл pmu/32-КозловаАА.doc не подходит формату: Package not found at 'pmu/32-КозловаАА.doc'\n",
"Файл pmu/32-ПучкинаАА (1).doc не подходит формату: Package not found at 'pmu/32-ПучкинаАА (1).doc'\n",
"Файл pmu/32-ПучкинаАА.doc не подходит формату: Package not found at 'pmu/32-ПучкинаАА.doc'\n",
"Файл pmu/32-СмирновАА.doc не подходит формату: Package not found at 'pmu/32-СмирновАА.doc'\n",
"Файл pmu/32-ЧубыкинаПП (1).doc не подходит формату: Package not found at 'pmu/32-ЧубыкинаПП (1).doc'\n",
"Файл pmu/32-ЧубыкинаПП.doc не подходит формату: Package not found at 'pmu/32-ЧубыкинаПП.doc'\n",
"Файл pmu/33-ПанинаАД (1).doc не подходит формату: Package not found at 'pmu/33-ПанинаАД (1).doc'\n",
"Файл pmu/33-СалинОА (1).doc не подходит формату: file 'pmu/33-СалинОА (1).doc' is not a Word file, content type is 'application/vnd.openxmlformats-officedocument.themeManager+xml'\n",
"Файл pmu/33-ТихоненковАЕ (1).doc не подходит формату: Package not found at 'pmu/33-ТихоненковАЕ (1).doc'\n",
"Файл pmu/PIbd33_Kislitsa_E_D.odt не подходит формату: \"There is no item named '[Content_Types].xml' in the archive\"\n",
"Файл pmu/Курсовая Данилов.pdf не подходит формату: Package not found at 'pmu/Курсовая Данилов.pdf'\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 74 entries, 0 to 73\n",
"Data columns (total 3 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 doc 74 non-null object\n",
" 1 text 74 non-null object\n",
" 2 type 74 non-null int64 \n",
"dtypes: int64(1), object(2)\n",
"memory usage: 2.3+ KB\n"
]
}
],
"source": [
"import spacy\n",
"import num2words\n",
"import re\n",
"import unicodedata\n",
"import pandas as pd\n",
"from docx import Document\n",
"import os\n",
"\n",
"sp = spacy.load(\"ru_core_news_lg\")\n",
"\n",
"def read_docx(file_path):\n",
"    \"\"\"Return the text of every paragraph in a .docx file, joined by newlines.\n",
"\n",
"    Any exception raised while opening or reading the file is reported on\n",
"    stdout and an empty string is returned instead.\n",
"    \"\"\"\n",
"    try:\n",
"        return \"\\n\".join(block.text for block in Document(file_path).paragraphs)\n",
"    except Exception as e:\n",
"        print(f\"Файл {file_path} не подходит формату: {e}\")\n",
"        return \"\"\n",
"def load_docs(dataset_path):\n",
"    \"\"\"Read every entry of ``dataset_path`` into a DataFrame.\n",
"\n",
"    Returns a DataFrame with two columns: ``doc`` (the file name) and ``text``\n",
"    (whatever ``read_docx`` returned for that file). Entries whose name starts\n",
"    with \"~$\" are skipped. Files that ``read_docx`` could not parse are kept\n",
"    with an empty ``text``.\n",
"    \"\"\"\n",
"    records = []\n",
"    # os.listdir returns names in arbitrary order; sort for a reproducible row order.\n",
"    for file_name in sorted(os.listdir(dataset_path)):\n",
"        if file_name.startswith(\"~$\"):\n",
"            continue\n",
"        # os.path.join works whether or not dataset_path ends with a separator.\n",
"        text = read_docx(os.path.join(dataset_path, file_name))\n",
"        records.append({\"doc\": file_name, \"text\": text})\n",
"    # Build the frame once instead of growing it row by row.\n",
"    return pd.DataFrame(records, columns=[\"doc\", \"text\"])\n",
"\n",
"\n",
"# Load the corpus from the local \"pmu/\" folder.\n",
"df = load_docs(\"pmu/\")\n",
"# type is 0 for files whose name starts with \"tz_\" and 1 for everything else.\n",
"df[\"type\"] = df.apply(lambda row: int(not str(row[\"doc\"]).startswith(\"tz_\")), axis=1)\n",
"df.info()\n",
"df = df.sort_values(by=[\"doc\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Предобработка"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"def preprocess_text(text):\n",
"    \"\"\"Lower-case, clean and stop-word-filter a raw document.\n",
"\n",
"    Steps: Unicode NFKC normalisation, lower-casing, spelling out standalone\n",
"    numbers in Russian, replacing every character that is not a Latin or\n",
"    Cyrillic letter with a space, then dropping the tokens spaCy marks as stop\n",
"    words or punctuation. Returns the remaining tokens joined by single spaces.\n",
"    \"\"\"\n",
"    # NFKC keeps characters composed. The previous NFKD + strip-combining step\n",
"    # decomposed \"й\" and removed its breve, turning it into \"и\".\n",
"    text = unicodedata.normalize(\"NFKC\", text).lower()\n",
"\n",
"    def spell_number(match):\n",
"        digits = match.group()\n",
"        # Long digit runs are dropped instead of being spelled out.\n",
"        if len(digits) > 12:\n",
"            return \" \"\n",
"        # `num2words` is imported as a module, so call the function inside it.\n",
"        return f\" {num2words.num2words(int(digits), lang='ru')} \"\n",
"\n",
"    # Numbers have to be converted before the letter filter removes the digits.\n",
"    text = re.sub(r\"\\b\\d+\\b\", spell_number, text)\n",
"    # Replace (not delete) non-letters so that words separated only by a newline\n",
"    # or punctuation are not glued together. \"ё\" lies outside the а-я range and\n",
"    # has to be listed explicitly.\n",
"    text = re.sub(\"[^a-zа-яё ]\", \" \", text)\n",
"    text = \" \".join(text.split())\n",
"\n",
"    doc = sp(text)\n",
"    tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]\n",
"    return \" \".join(tokens)\n",
"\n",
"df[\"preprocessed_text\"] = df[\"text\"].apply(preprocess_text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выделение частей речи"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"def morphological_analysis(text: str) -> list[dict]:\n",
"    \"\"\"Run the spaCy pipeline on ``text`` and describe every token.\n",
"\n",
"    Returns one dict per token holding its surface form (\"text\"), its\n",
"    part-of-speech tag (\"pos\") and its morphological analysis (\"morph\").\n",
"    \"\"\"\n",
"    tokens_info = []\n",
"    for token in sp(text):\n",
"        tokens_info.append({\"text\": token.text, \"pos\": token.pos_, \"morph\": token.morph})\n",
"    return tokens_info\n",
"\n",
"#df[\"pos_tagging_text\"] = df[\"preprocessed_text\"].apply(morphological_analysis)\n",
"#print(df.head)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Нормализация текста (лемматизация)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"from spacy.tokens.doc import Doc\n",
"\n",
"\n",
"def lemmatize_text(text: str) -> str:\n",
"    \"\"\"Replace every token of ``text`` with its spaCy lemma, joined by spaces.\"\"\"\n",
"    return \" \".join(token.lemma_ for token in sp(text))\n",
"\n",
"df[\"normalized_text\"] = df[\"preprocessed_text\"].apply(lemmatize_text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Фильтрация (длина слова от 5 до 15 символов)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"def filter_text(text: str, min_length: int = 5, max_length: int = 15) -> str:\n",
"    \"\"\"Keep the whitespace-separated words whose length lies in\n",
"    [min_length, max_length] (both inclusive) and join them with single spaces.\"\"\"\n",
"    kept = []\n",
"    for word in text.split():\n",
"        if len(word) < min_length or len(word) > max_length:\n",
"            continue\n",
"        kept.append(word)\n",
"    return \" \".join(kept)\n",
"\n",
"df[\"filtered_text\"] = df[\"normalized_text\"].apply(filter_text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Формирование N-грамм (триграммы)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"from nltk.util import ngrams\n",
"from nltk.tokenize import word_tokenize\n",
"\n",
"def generate_ngrams(text: str, n: int = 2) -> list[tuple]:\n",
"    \"\"\"Tokenize ``text`` with NLTK (language=\"russian\") and return its n-grams as tuples.\"\"\"\n",
"    return list(ngrams(word_tokenize(text, language=\"russian\"), n))\n",
"\n",
"df[\"trigrams\"] = df[\"filtered_text\"].apply(lambda filtered: generate_ngrams(filtered, n=3))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Векторизация. Мешок слов (Bag of Words)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from scipy import sparse\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"\n",
"# Bag of words: raw term counts per document.\n",
"counts_vectorizer = CountVectorizer()\n",
"counts_matrix = counts_vectorizer.fit_transform(df[\"filtered_text\"])\n",
"counts_matrix = sparse.csr_matrix(counts_matrix)\n",
"# Dense copy with one column per vocabulary term, for inspection.\n",
"counts_df = pd.DataFrame(\n",
"    data=counts_matrix.toarray(), columns=counts_vectorizer.get_feature_names_out()\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Векторизация. Частотный портрет (TF-IDF)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"# TF-IDF weights with sublinear term-frequency scaling enabled.\n",
"tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True)\n",
"tfidf_matrix = tfidf_vectorizer.fit_transform(df[\"filtered_text\"])\n",
"tfidf_matrix = sparse.csr_matrix(tfidf_matrix)\n",
"# Dense copy with one column per vocabulary term, for inspection.\n",
"tfidf_df = pd.DataFrame(\n",
"    data=tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out()\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Метод локтя"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjIAAAHHCAYAAACle7JuAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABTLklEQVR4nO3deVxU9foH8M8MuwKDgDIgCChuqIiQGm6ZYi65JaWZXs2sTHG3zayUrGtlV82bWZlpppbLLzUqMQXFNBQFUYkkF1ySLVEGQTZnzu8P7kyOzMAMzjDb5/168XrJOWcOj+fW5el8n+f7iARBEEBERERkgcSmDoCIiIiooZjIEBERkcViIkNEREQWi4kMERERWSwmMkRERGSxmMgQERGRxWIiQ0RERBaLiQwRERFZLCYyREREZLGYyBAREZHFYiJDREREFouJDJEN2LhxI0QiEUQiEY4cOVLrvCAICAgIgEgkwvDhw00QIRFRwzCRIbIhzs7O2Lp1a63jycnJ+Ouvv+Dk5GSCqIiIGo6JDJENGTZsGHbs2IG7d++qHd+6dSsiIyMhlUpNFBkRUcMwkSGyIePHj0dRURH279+vOlZVVYWdO3fimWee0fgZhUKBVatWoVOnTnB2doaPjw+mTZuGW7duqa4JCgpSLV1p+goKClJdW1ZWhgULFiAgIABOTk5o3749PvroIwiCUOtnHzp0SOs9dfXss89q/PySJUvUrktKSkLfvn3RtGlTeHh4YNSoUfjjjz/UrlmyZEmtn33w4EE4OTnhpZdeUrumrq9Dhw6pPr927Vp07twZTZo0Ubtm586dOv8diWyZvakDIKLGExQUhKioKHz77bcYOnQoAGDv3r2QyWR4+umnsXr16lqfmTZtGjZu3IgpU6Zg9uzZyMnJwSeffIJTp07h6NGjcHBwwKpVq1BaWgoA+OOPP/Dvf/8bb7zxBjp27AgAcHV1BVBTizNy5EgcPHgQU6dORXh4OPbt24dXXnkF169fx8qVKzXGPXv2bHTv3h0AsGnTJrVETBfe3t5q9/7Xv/6ldv7AgQMYOnQoWrdujSVLlqC8vBz//e9/0bt3b6Snp6slYvc6ffo0Ro8ejWHDhmHNmjUAgDFjxiAkJER1zbx589CxY0e8+OKLqmPK57Jt2zbMmDED/fv3x6xZs9C0aVPV8yMiHQlEZPU2bNggABBOnDghfPLJJ4Kbm5tw584dQRAE4amnnhIeffRRQRAEITAwUHj88cdVn/v1118FAMKWLVvU7peQkKDxuCAIwsGDBwUAwsGDB2ud2717twBAePfdd9WOP/nkk4JIJBIuXLigdvyXX34RAAg7d+5UHYuNjRX0+b+uCRMmCMHBwWrHAAiLFy9WfR8eHi60aNFCKCoqUh07ffq0IBaLhUmTJqmOLV68WPWzL1++LPj6+gp9+vQRysvLtf78wMBAYfLkyRrPjR8/XvDw8FD7vPL57dixQ+e/I5Et49ISkY0ZO3YsysvL8eOPP+L27dv48ccftS4r7dixAxKJBIMGDcKNGzdUX5GRkXB1dcXBgwf1+tk///wz7OzsMHv2bLXjCxYsgCAI2Lt3r9rxiooKADVFyg1VVVVVZxFzXl4eMjIy8Oyzz8LT01N1PCwsDIMGDcLPP/9c6zNFRUUYPHgw3Nzc8MMPPzQ4vtu3b6NJkyYP9PcjsnVMZIhsTPPmzREdHY2tW7fi+++/h1wux5NPPqnx2vPnz0Mmk6FFixZo3ry52ldpaSkKCwv1+tlXrlyBn58f3Nzc1I4rl1quXLmidvzGjRsAAIlEotfPuVdxcbFqaUtbTADQvn37Wuc6duyIGzduoKysTO348OHDkZ2djeLiYo21PbqKiopCbm4ulixZgqtXr+LGjRuQyWQNvh+RLWKNDJENeuaZZ/DCCy8gPz8fQ4cOhYeHh8brFAoFWrRogS1btmg837x5cyNGCVy+fBkAtNao6CI/Px+BgYGGCeh/zp07h7
1792Ls2LFYsGABNmzY0KD7zJs3D9nZ2Vi6dCni4uIMGiORreAbGSIb9MQTT0AsFuPYsWNal5UAoE2bNigqKkLv3r0RHR1d66tr1656/dzAwEDk5ubi9u3basfPnTunOn+vkydPQiqVwt/fX6+fo1RdXY0LFy6o3vhoiwkAsrOza507d+4cvL290bRpU7XjP/zwA4YMGYJly5Zh48aNSExMbFB8Li4uWLduHTp16oQ+ffpg//79+Oijjxp0LyJbxUSGyAa5urpi7dq1WLJkCUaMGKH1urFjx0Iul2Pp0qW1zt29exfFxcV6/dxhw4ZBLpfjk08+UTu+cuVKiEQiVScVUFOHcvDgQYwcOVKvn3GvPXv2oLy8HAMGDNB6ja+vL8LDw/H111+r/X0yMzPxyy+/YNiwYbU+07dvXwDAjBkz0KtXL0ybNg3l5eUNinHhwoW4evUqNm/ejOjoaERGRjboPkS2iktLRDZq8uTJ9V7zyCOPYNq0aVi2bBkyMjLw2GOPwcHBAefPn8eOHTvw8ccfa62v0WTEiBF49NFHsWjRIly+fBldu3bFL7/8gj179mDu3Llo06YNACAlJQWvv/46ysvL0bx5c2zevFl1jz///BMAsHnzZjzxxBO13pYAwJ07d7B48WJ8+umn6NWrFx577LE641q+fDmGDh2KqKgoTJ06VdV+LZFIau03cy+RSIQvv/wS4eHhWLx4MT788EOdnwVQ0/a9cuVKfPPNNwZf/iKyGaZumyIi47u3/bou97dfK33xxRdCZGSk4OLiIri5uQldunQRXn31VSE3N7fWtXW1XwuCINy+fVuYN2+e4OfnJzg4OAht27YVli9fLigUCtU1kydPFgDU+5WTk6PxZ/z1119CQECAMHfuXEEmk9U6j/varwVBEA4cOCD07t1bcHFxEdzd3YURI0YIWVlZatfc2359r7i4OMHe3l5IT0+vdU5b+/WNGzcEPz8/Yfz48WrH2X5NpB+RIDxAyT0RkRE8++yzAGqGXWojEomQk5PzQIXARGT5WCNDREREFos1MkRkdnr16lXvNRMmTKhzfxgisg1cWiIiIiKLxaUlIiIislhMZIiIiMhiWX2NjEKhQG5uLtzc3CASiUwdDhEREelAEATcvn0bfn5+EIu1v3ex+kQmNzcXAQEBpg6DiIiIGuDatWt1jimx+kRGOWX32rVrcHd3N3E0REREpIuSkhIEBASofo9rY/WJjHI5yd3dnYkMERGRhamvLITFvkRERGSxmMgQERGRxWIiQ0RERBaLiQwRERFZLCYyREREZLGYyBAREZHFYiJDREREFouJDBEREVksJjJERERksax+Z19jkSsEpObcROHtCrRwc0aPYE/YiTmUkoiIqDExkWmAhMw8xMVnIU9WoTrmK3HG4hGhGNLZ14SRERER2RYuLekpITMP0zenqyUxAJAvq8D0zelIyMwzUWRERES2h4mMHuQKAXHxWRA0nFMei4vPglyh6QoiIiIyNCYyekjNuVnrTcy9BAB5sgqk5txsvKCIiIhsGGtk9FB4W3sSo+k6FgQTEREZFxMZPbRwc9bxOicWBBMRETUCLi3poUewJ3wlzqjvncobu87iJRYEExERGR0TGT3YiUVYPCIUAGolM8rvHexEyLlxR+PnWRBMRERkWExk9DSksy/WToyAVKK+zCSVOOOziRFYNS68zs+zIJiIiMhwWCPTAEM6+2JQqFRjIe+ejOs63UPXwmEiIiLSjolMA9mJRYhq41XruO4FwbpdR0RERNpxacnA6isIFqGme6lHsGdjhkVERGSVmMgYWF0FwUBNjcziEaHcT4aIiMgAmMgYgbaCYAAI8mqCQaFSE0RFRERkfVgjYyT3FwTbiUR4/f/O4HLRHaz79RJeeqSNqUMkIiKyeExkjOj+guA71XK8uvMMVvzyJwZ0aIF2Pm4mjI6IiMjycWmpET0V6Y8BHVqgSq7Agu2nUS1XmDokIiIii8ZEphGJRCIsG9
MF7s72OHtdhs+TL5o6JCIiIovGRKaR+bg7I25UJwDAx4nncfYvGVIuFmFPxnWkXCzi6AIiIiI9sEbGBEaHt8Tes/n
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.cluster import KMeans\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# KMeans cannot fit more clusters than there are documents, so cap the sweep\n",
"# at the corpus size instead of always going up to 50.\n",
"max_clusters = min(50, tfidf_matrix.shape[0])\n",
"cluster_range = range(1, max_clusters + 1)\n",
"\n",
"wcss = []\n",
"for i in cluster_range:\n",
"    kmeans = KMeans(n_clusters=i, random_state=42)\n",
"    kmeans.fit(tfidf_matrix)\n",
"    # inertia_ is the within-cluster sum of squared distances for this k.\n",
"    wcss.append(kmeans.inertia_)\n",
"\n",
"plt.plot(cluster_range, wcss, marker='o')\n",
"plt.title('Метод локтя')\n",
"plt.xlabel('Число кластеров')\n",
"plt.ylabel('Сумма квадратов ошибок')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Коэффициент силуэта"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkAAAAHHCAYAAABXx+fLAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABYr0lEQVR4nO3deVyU1f4H8M/MyDAoMIgIMygKiktoiitRbgmKVmpppZbptW6Wa4re0vrlkt2LmhV29VLZLTNz7bZpRiqJmaG44YaaGOXGIqjDoizOnN8fNJMjMzADM8zAfN6v17xezvOc58yZR++db+f5nu+RCCEEiIiIiFyI1NEDICIiIqprDICIiIjI5TAAIiIiIpfDAIiIiIhcDgMgIiIicjkMgIiIiMjlMAAiIiIil8MAiIiIiFwOAyAiIiJyOQyAiIiIyOUwACJyQWvWrIFEIoFEIsHPP/9c6bwQAkFBQZBIJHjkkUccMEIiIvtiAETkwhQKBdavX1/p+J49e3Dp0iW4u7s7YFRERPbHAIjIhT300EPYsmULbt++bXR8/fr16NGjB1QqlYNGRkRkXwyAiFzY2LFjkZ+fj507dxqOlZWV4YsvvsBTTz1l8hqdTof4+Hh06tQJCoUCAQEBeOGFF3D9+nVDm+DgYMMjNlOv4OBgQ9vi4mLMnj0bQUFBcHd3R4cOHbB8+XIIISp9dnJystk+LVFSUoKFCxeiffv2UCgUUKvVGDlyJM6fPw8A+P333yGRSLBmzRqj66ZOnQqJRIK//e1vlfocMGCAyfHo+1iwYAHc3Nxw9erVStdOmjQJPj4+KCkpwd69exEdHQ0/Pz94eHigW7duSEhIMLoP5j7L1H345JNPMHDgQPj7+8Pd3R1hYWFISEiw6D4RuYJGjh4AETlOcHAwIiMjsWHDBgwdOhQA8P3330Oj0WDMmDF47733Kl3zwgsvYM2aNZg4cSJmzJiBzMxMrFy5EkePHsW+ffvg5uaG+Ph4FBUVAQBOnz6Nf/3rX3j11Vdxzz33AAA8PT0BVOQaDR8+HLt378Zzzz2H8PBw/PDDD/jHP/6By5cv49133zU57hkzZqBXr14AgLVr1xoFcOZotVo88sgjSEpKwpgxY/DSSy+hsLAQO3fuxMmTJ9G2bVuT12VkZGD16tVV9t2xY0e89tprAIC8vDzMmjXLcO6ZZ57BG2+8gU2bNmHatGmG4/pAc9SoUVAoFPjll1/g7++P//u//4NMJsOePXswZcoUHD9+3BC4vPbaa/j73/9u9DmTJk1C3759K40pISEBnTp1wvDhw9GoUSNs3boVU6ZMgU6nw9SpU6u9X0QNniAil/PJJ58IAOLgwYNi5cqVwsvLS9y8eVMIIcQTTzwhHnzwQSGEEK1btxYPP/yw4bq9e/cKAOLzzz836i8xMdHkcSGE2L17twAgdu/eXenc119/LQCIN9980+j4448/LiQSicjIyDA6vmPHDgFAfPHFF4ZjU6dOFZb8X9nHH38sAIh33nmn0jmdTieEECIzM1MAEJ988onh3JNPPik6d+4sgoKCxIQJEypd+8ADDxjul7k+IiMjRUREhNF1X375pdn7ovfaa68JAOKnn36qdM7U59xJ//d5p5iYGNGmTRuzn0fkSvgIjMjFPfnkk7h16xa2bduGwsJCbNu2zezjry1btkCpVGLQoEHIy8szvHr06AFPT0/s3r3bqs/evn07ZDIZZsyYYXR89uzZEELg+++/NzpeUlICoCJ521r/+9//4Ofnh+nTp1c6Z+4R2uHDh7FlyxbExcVBKjX9f5dlZWXVJouPHz8eBw4cMDxqA4DPP/8cQUFB6N+/v+HYzZs3je7r888/Dzc3N2zZssWSr2jEw8PD8GeNRoO8vDz0798fv/32GzQajdX9ETU0DICIXFzz5s0RHR2N9evX48svv4RWq8Xjjz9usu25c+eg0Wjg7++P5s2bG72KioqQm5tr1Wf/8ccfCAwMhJ
eXl9Fx/aOyP/74w+h4Xl4eAECpVFr1OQBw/vx5dOjQAY0aWf7kf+7cuejbt2+VpQBu3LhheKRnzujRo+Hu7o7PP/8cQEVAsm3bNjz99NNGwdeyZcuM7mlwcDDKy8uRkZFh8Zj19u3bh+joaDRp0gQ+Pj5o3rw5Xn31VcPnE7k65gAREZ566ik8//zzyM7OxtChQ+Hj42OynU6ng7+/v+GH/G7Nmze34ygrkpQBGCVR28uOHTuwa9cupKSkVNkuOzsbMTExVbZp2rQpHnnkEXz++eeYP38+vvjiC5SWlmLcuHFG7caPH48+ffoYHRszZozVYz9//jyioqLQsWNHvPPOOwgKCoJcLsf27dvx7rvvQqfTWd0nUUPDAIiI8Nhjj+GFF17A/v37sWnTJrPt2rZti127duGBBx4wesRSU61bt8auXbtQWFhoNAt05swZw/k7HTp0CCqVCi1btrT6s9q2bYsDBw6gvLwcbm5uVbYVQmDu3Ll47LHHcN9995ltd+nSJRQWFhpmrKoyfvx4jBgxAgcPHsTnn3+Obt26oVOnTkZt2rRpgzZt2hje5+Xl4dq1a2YTtM3ZunUrSktL8e2336JVq1aG49Y+oiRqyPgIjIjg6emJhIQELFy4EMOGDTPb7sknn4RWq8XixYsrnbt9+zZu3Lhh1ec+9NBD0Gq1WLlypdHxd999FxKJxLAyDQDy8/Oxe/duDB8+3KrP0Bs1ahTy8vIqfRaASkvuN27ciOPHjyMuLq7KPjdu3AgAGDhwYLWfP3ToUPj5+WHp0qXYs2dPpdkfrVZb6Zq4uDgIITBy5Mhq+7+TTCYDYPy9NBoNPvnkE6v6IWrIOANERACACRMmVNumf//+eOGFFxAXF4e0tDQMHjwYbm5uOHfuHLZs2YIVK1aYzR8yZdiwYXjwwQfx2muv4ffff0fXrl2xY8cOfPPNN5g5c6Zh5iMlJQVz587FrVu30Lx5c6xbt87Qx6+//goAWLduHR577DE0adLE5GeNHz8ea9euRWxsLFJTU9G3b18UFxdj165dmDJlCkaMGGFou2PHDjz//PPo0KGDyb5ycnKwYMECfPTRRxgzZgw6duxY7Xd1c3PDmDFjsHLlSshkMowdO9bofFxcHI4cOYK+ffsalq3v3LkTzzzzDB588MFq+7/T4MGDIZfLMWzYMLzwwgsoKirC6tWr4e/vj6ysLKv6ImqwHLoGjYgc4s5l8FW5exm83ocffih69OghPDw8hJeXl7j33nvFyy+/LK5cuVKpbVXL4IUQorCwUMyaNUsEBgYKNzc30a5dO/HWW28ZlqYLIcSECRMEgGpfmZmZVX6fmzdvitdee02EhIQINzc3oVKpxOOPPy7Onz8vhPhrabmHh4e4fPlypXuhXwa/b98+ERoaKhYuXChKS0uN2lW1PD01NVUAEIMHD6507ueffxbR0dGiWbNmQi6Xi44dO4ply5aJ27dvm/wu1S2D//bbb0WXLl2EQqEQwcHBYunSpYZSANXdJyJXIBHCRLlVIiInoq/AfHeF5jtJJBJkZmbWSYJ0TR07dgzh4eFYu3YtnnnmGUcPh8ilMQeIiKiOrF69Gp6enlbn9BCR7TEHiIic3v33319tm6effrraejyOsnXrVqSnp+PDDz/EtGnTzOYpEVHd4SMwIiI7Cw4ORk5ODmJiYvDZZ59VKvxIRHWPARARERG5HOYAERERkcthAEREREQuh0nQJuh0Oly5cgVeXl5md4kmIiIi5yKEQGFhIQIDAyGVVj3HwwDIhCtXriAoKMjRwyAiIqIauHjxYrV7BjIAMkG/QuPixYvw9vZ28GiIiIjIEgUFBQgKCrJopSUDIBP0j728vb0ZABEREdUzlqSvMAmaiIiIXA4DICIiInI5DICIiIjI5TAAIiIiIpfDAIiIiIhcDgMgIiIicjkMgIiIiMjlMAAiIiIil8MAiIiIiFwOK0ETERFRrWl1AqmZ15BbWAJ/LwV6h/hCJnXeDcUZAB
EREbkoWwUtiSezsGhrOrI0JYZjaqUCC4aFYUhntS2HbDMMgIiIiFyQNUFLVYFS4sksTF53BOKu/rM1JZi87ggSxnU
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.metrics import silhouette_score\n",
"# silhouette_score is only defined for 2 .. n_samples - 1 clusters, so cap the\n",
"# sweep by the corpus size instead of always going up to 50.\n",
"max_clusters = min(50, tfidf_matrix.shape[0] - 1)\n",
"cluster_range = range(2, max_clusters + 1)\n",
"\n",
"silhouette_scores = []\n",
"for i in cluster_range:\n",
"    kmeans = KMeans(n_clusters=i, random_state=42)\n",
"    kmeans.fit(tfidf_matrix)\n",
"    score = silhouette_score(tfidf_matrix, kmeans.labels_, random_state=42)\n",
"    silhouette_scores.append(score)\n",
"\n",
"plt.plot(cluster_range, silhouette_scores, marker='o')\n",
"plt.title('Метод силуэта')\n",
"plt.xlabel('Число кластеров')\n",
"plt.ylabel('Silhouette Score')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Кластеризация с использованием Kmeans"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>doc</th>\n",
" <th>cluster</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>31-АнисинРС.docx</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>31-АфанасьевСС.docx</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>31-БакальскаяЕД.docx</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>31-БарсуковПО.docx</td>\n",
" <td>20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>31-БелянинНН.docx</td>\n",
" <td>14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69</th>\n",
" <td>Курсовая ПМУ Волков Никита ПИбд-33.docx</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>70</th>\n",
" <td>ПИбд-31, Лёвушкина Анна, записка к кр.docx</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71</th>\n",
" <td>ПИбд-32 Шабунов Олег.docx</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72</th>\n",
" <td>Фирсов_Кирилл_Записка (1).docx</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73</th>\n",
" <td>Фирсов_Кирилл_Записка.docx</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>74 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" doc cluster\n",
"0 31-АнисинРС.docx 15\n",
"1 31-АфанасьевСС.docx 2\n",
"2 31-БакальскаяЕД.docx 12\n",
"3 31-БарсуковПО.docx 20\n",
"4 31-БелянинНН.docx 14\n",
".. ... ...\n",
"69 Курсовая ПМУ Волков Никита ПИбд-33.docx 7\n",
"70 ПИбд-31, Лёвушкина Анна, записка к кр.docx 10\n",
"71 ПИбд-32 Шабунов Олег.docx 1\n",
"72 Фирсов_Кирилл_Записка (1).docx 3\n",
"73 Фирсов_Кирилл_Записка.docx 3\n",
"\n",
"[74 rows x 2 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.cluster import KMeans\n",
"\n",
"\n",
"# Number of clusters used for the final model.\n",
"n_clusters = 40\n",
"kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(tfidf_matrix)\n",
"# One cluster label per document, in the row order of df.\n",
"df[\"cluster\"] = kmeans.labels_\n",
"display(df[[\"doc\", \"cluster\"]])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Визуализация с помощью PCA"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABAYAAANXCAYAAACmL10FAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzddXgUZ9fA4d9u3BMISQiSAMHdQqFQoNCiwTW40xZaCqXF3QPFpRSX4BqgaKFY8VDcnUBISIj7zvvHknlZIoSWQmjO/V179WXmmZkzsvn2OfOIRlEUBSGEEEIIIYQQQmRJ2g8dgBBCCCGEEEIIIT4cSQwIIYQQQgghhBBZmCQGhBBCCCGEEEKILEwSA0IIIYQQQgghRBYmiQEhhBBCCCGEECILk8SAEEIIIYQQQgiRhUliQAghhBBCCCGEyMIkMSCEEEIIIYQQQmRhkhgQQgghhBBCCCGyMEkMCCGEEEIIIYQQWZgkBoRIxbJly9BoNAYfJycnatasyW+//fahwxNCCCEyHXd39xT/f7NatWps2bIl1fJbtmyhXr16ODo6YmpqiqurK61ateL3339PtfyuXbvQaDS4urqi0+n+zVMRQogsx/hDByBEZjZmzBjy5cuHoigEBgaybNky6tevj5+fHw0bNvzQ4QkhhBCZSpkyZRgwYAAAAQEB/PLLLzRr1oz58+fTu3dvABRFoWvXrixbtoyyZcvSv39/XFxcePLkCVu2bKFWrVocO3aMKlWqGOx79erVuLu7c+/ePX7//Xdq16793s9PCCH+qzSKoigfOgghMptly5bRpUsXTp8+TYUKFdTloaGhODs707JlS1avXv0BIxRCCCEyF3d3d0qUKMGOHTvUZU+fPsXDw4NcuXJx/fp1AKZOncrAgQPp168fP//8MxqNxmA/K1eupHDhwnh6eqrLoqKicHZ2ZuLEiSxdupTSpUuzdOnS93NiQgiRBUhXAiHegr29PRYWFhgb/7+xzb1799BoNCxbtsyg7DfffINGo6Fz587qss2bN+Pp6Um2bNmwsLCgSJEiTJ48meT83MGDB9FoNKk2u/T19UWj0fDnn38CcOHCBTp37kz+/PkxNzfHxcWFrl278vz581Rjf72JZ/Ln0KFDBmVejRdgw4YNaDQa3N3d1WXXr1/n888/x8XFBTMzM/LkyUPv3r0JCQlRy8THxzNixAjKly+PnZ0dVlZWVKtWjYMHDxrsP/n6TZ06NUXMJUqUoEaNGgbLatSokWLZ6dOn1fN5VWRkJAMGDCB//vyYmJgYnHdwcHCq1ym944wfPx6tVouvr2+q55Da51VTp06lSpUqZM+eHQsLC8qXL8/GjRtTPf6qVavw9PTE0tISBwcHPvvsM/bu3QukfS+TP6/eK51Ox4wZMyhevDjm5uY4OzvTq1cvQkNDDY7n7u5Ow4YN2bt3L2XKlMHc3JxixYqxefPmFLHduXOHli1bki1bNiwtLfnkk0/YuXOnQZlDhw4ZxGRmZkahQoWYOHEi6eWjX98utc+oUaPU8o8fP6Zr1644OztjZmZG8eLFWbJkSar7fPVZDwgIwN3dnQoVKhAZGakuj42NZdSoURQqVAhzc3Ny5sxJs2bNuH37drr3Ofnz6vfn37xOGYk3PRn9roP+GZo5cyYlS5bE3NycHDlyULduXc6cOaOWyejzCPoK3oABA8iTJw9mZmYULlyYqVOnpnq+qXXr0mg0Bt/N5DKvxpOaUaNGUaxYMaytrbG1teWTTz5h69atKcpt2LCB8uXLY2FhgaOjI+3bt+fx48cGZTp37mwQj4ODAzVq1ODIkSMG5bZt20aDBg1wdXXFzMyMAgUKMHbsWJKSkgzK1ahRgxIlSqSIZerUqWg0Gu7du6cue9t7l9Hvv0ajoV+/filiqFOnDhqN5m+3knNxcaFo0aLcvXsXgJiYGCZOnEiRIkXU83tdhw4dDJICoO92EBMTQ8uWLWnTpg2bN28mNjb2b8
UkhBAiJelKIEQ6wsLCCA4ORlEUnj17xuzZs4mMjKR9+/bpbnfr1i1+/fXXFMvDw8OpVKkSnTp1wsTEhN27dzNo0CCMjY0ZMGAANWrUIE+ePKxevZqmTZsabLt69WoKFChA5cqVAdi3bx937tyhS5cuuLi4cPnyZRYuXMjly5c5ceJEqj+2qlWrRs+ePQG4evUqEyZMSPc8EhMTGTp0aIrlUVFR5M6dGy8vL2xtbbl06RJz587l8ePH+Pn5qee6aNEi2rZtS48ePYiIiGDx4sXUqVOHU6dOUaZMmXSP/TZ++umnVJcPHDiQBQsW0K1bNz799FNMTEzYvHlzmv1d07N06VKGDRvGtGnT8Pb2TrVMz549qVatGkCqx5k5cyaNGjWiXbt2xMfHs3btWlq2bMmOHTto0KCBWm706NGMGjWKKlWqMGbMGExNTTl58iS///47X375JTNmzFArs8n3cciQIRQtWhQAa2trdV+9evVSW8B8++233L17lzlz5uDv78+xY8cwMTFRy968eZPWrVvTu3dvOnXqxNKlS2nZsiW7d+/miy++ACAwMJAqVaoQHR3Nt99+S/bs2Vm+fDmNGjVi48aNKZ7b5LhiYmJYt24dQ4YMwcnJiW7duqV6DYsWLcrKlSvVfy9cuJCrV68yffp0dVmpUqXUWD755BM0Gg19+vQhR44c/Pbbb3Tr1o3w8PBUKzmg/17Xq1cPExMTdu3apV6vpKQkGjZsyIEDB2jTpg3fffcdERER7Nu3j0uXLlG7dm2D2JLv8avLChQo8F6uU0biTY4lI9L6rgN069aNZcuWUa9ePbp3705iYiJHjhzhxIkTBi2qvvjiCzp27Giw7bRp0wwqoYqi0KhRIw4ePEi3bt0oU6YMe/bsYeDAgTx+/NjgPr9q+vTpODo6AvoE3d8RFRVF06ZNcXd3JyYmhmXLltG8eXP+/PNPtRKa/F2pWLEiEydOJDAwkJkzZ3Ls2DH8/f2xt7dX9+fo6KjG++jRI2bOnEn9+vV5+PChWm7ZsmVYW1vTv39/rK2t+f333xkxYgTh4eH4+Pj8rfN4XXr37m2+/+bm5qxevRofHx91+aNHjzhw4ADm5uZ/O76EhAQePnxI9uzZATh69CghISH069cPIyOjDO9n9erV1KxZExcXF9q0acOgQYPw8/OjZcuWfzs2IYQQr1CEECksXbpUAVJ8zMzMlGXLlhmUvXv3rgIoS5cuVZe1atVKKVGihJInTx6lU6dO6R6rWLFiSsOGDdV/Dx48WDEzM1NevHihLnv27JlibGysjBw5Ul0WHR2dYl9r1qxRAOXw4cMp1uXKlUvp0qWL+u+DBw8qgHLw4EF1mZubm0G88+bNU8zMzJSaNWsqbm5u6Z7H119/rVhbW6v/TkxMVOLi4gzKhIaGKs7OzkrXrl3VZcnXz8fHJ8U+ixcvrlSvXt1gWfXq1Q2W7dq1SwGUunXrKq//ScuZM6dSp04dg2UjR45UACUoKCjd83n1ODt37lSMjY2VAQMGpFr25s2bCqAsX748xXFe9fo9i4+PV0qUKKF8/vnnBvvSarVK06ZNlaSkJIPyOp0uxbFTu4/Jjhw5ogDK6tWrDZbv3r07xXI3NzcFUDZt2qQuCwsLU3LmzKmULVtWXdavXz8FUI4cOaIui4iIUPLly6e4u7urMacWV2xsrKLVapWvv/46Raxp6dSpU5rPXrdu3ZScOXMqwcHBBsvbtGmj2NnZqdf71VhiY2OVGjVqKE5OTsqtW7cMtluyZIkCKD///HOKY6V27VO7x8nex3V623hfldHv+u+//64AyrfffpvuMQDlm2++SVGmQYMGBvvbunWrAijjxo0zKNeiRQtFo9GkuCe//vqrAij3799Xl73+NyD57/Xp06fTPefXPXv2TAGUqVOnKoqi/z46OTkpJUqUUGJiYtRyO3bsUABlxIgR6rLUnsuFCxcqgHLq1Cl1WWp/p3v16qVYWloqsbGxBudUvHjxFGV9fH
wUQLl79666LKP37m2//1988YXi6OiobNy4UV0+duxYpUqVKoqbm5vSoEGDFPG9zs3NTfnyyy+VoKAgJSgoSPnrr7+
"text/plain": [
"<Figure size 1200x1000 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.decomposition import PCA\n",
"import matplotlib.pyplot as plt\n",
"\n",
"n_clusters = 40\n",
"kmeans = KMeans(n_clusters=n_clusters, random_state=42)\n",
"clusters = kmeans.fit_predict(tfidf_matrix)\n",
"\n",
"# Project the dense TF-IDF vectors and the cluster centres onto the first two\n",
"# principal components.\n",
"pca = PCA(n_components=2)\n",
"X_pca = pca.fit_transform(tfidf_matrix.toarray())\n",
"centers = pca.transform(kmeans.cluster_centers_)\n",
"\n",
"plt.figure(figsize=(12, 10))\n",
"# Documents, coloured by cluster label. Keep the handle for the colour bar.\n",
"doc_scatter = plt.scatter(\n",
"    X_pca[:, 0], X_pca[:, 1], c=clusters, cmap=\"viridis\", s=50, alpha=0.6\n",
")\n",
"# Cluster centres.\n",
"plt.scatter(centers[:, 0], centers[:, 1], c=\"red\", s=200, alpha=0.75, marker=\"X\")\n",
"\n",
"# Label every point with its file name.\n",
"for (x_coord, y_coord), doc in zip(X_pca, df[\"doc\"]):\n",
"    plt.text(x_coord, y_coord, doc, fontsize=8, alpha=0.8)\n",
"\n",
"plt.title(\"Визуализация кластеров текстов с использованием PCA\")\n",
"plt.xlabel(\"Компонента PCA 1\")\n",
"plt.ylabel(\"Компонента PCA 2\")\n",
"# Pass the document scatter explicitly: without an argument plt.colorbar uses\n",
"# the most recent mappable, which is the red centre markers.\n",
"plt.colorbar(doc_scatter, label=\"Кластер\")\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}