From 7bbb30149a2e10bdcf15d9c5306764ad93a72934 Mon Sep 17 00:00:00 2001 From: russell Date: Fri, 4 Apr 2025 20:39:20 +0400 Subject: [PATCH] lab8 --- lab_8/lab8.ipynb | 1256 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1256 insertions(+) create mode 100644 lab_8/lab8.ipynb diff --git a/lab_8/lab8.ipynb b/lab_8/lab8.ipynb new file mode 100644 index 0000000..49f6c30 --- /dev/null +++ b/lab_8/lab8.ipynb @@ -0,0 +1,1256 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Лабораторная 8\n", + "Датасет: ТЗ и статьи по ИТ (кластеризация, классификация)." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "import spacy\n", + "\n", + "sp = spacy.load(\"ru_core_news_lg\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Загружаем тексты из файлов датафрейм:" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 41 entries, 0 to 40\n", + "Data columns (total 3 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 doc 41 non-null object\n", + " 1 text 41 non-null object\n", + " 2 type 41 non-null int64 \n", + "dtypes: int64(1), object(2)\n", + "memory usage: 1.3+ KB\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
doctexttype
0tz_01.docx2.2 Техническое задание\\n2.2.1 Общие сведения\\...0
1tz_02.docx2.2 Техническое задание\\n2.2.1 Общие сведения\\...0
2tz_03.docx2.2. Техническое задание\\nОбщие сведения:\\nВ д...0
3tz_04.docxТехническое задание\\n2.2.1 Общие сведения\\nИнт...0
4tz_05.docx2.2 Техническое задание\\n2.2.1 Общие сведения....0
\n", + "
" + ], + "text/plain": [ + " doc text type\n", + "0 tz_01.docx 2.2 Техническое задание\\n2.2.1 Общие сведения\\... 0\n", + "1 tz_02.docx 2.2 Техническое задание\\n2.2.1 Общие сведения\\... 0\n", + "2 tz_03.docx 2.2. Техническое задание\\nОбщие сведения:\\nВ д... 0\n", + "3 tz_04.docx Техническое задание\\n2.2.1 Общие сведения\\nИнт... 0\n", + "4 tz_05.docx 2.2 Техническое задание\\n2.2.1 Общие сведения.... 0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
doctexttype
36Этапы разработки проекта2.docxЭтапы разработки проекта: заключительные стади...1
37Этапы разработки проекта3.docxЭтапы разработки проекта: определение стратеги...1
38Этапы разработки проекта4.docxЭтапы разработки проекта: реализация, тестиров...1
39Этапы разработки проекта5.docxЭтапы разработки проекта: стратегия и анализ\\n...1
40Язык манипуляции данными.docx2.1.3. Язык манипуляции данными (ЯМД)\\nЯзык ма...1
\n", + "
" + ], + "text/plain": [ + " doc \\\n", + "36 Этапы разработки проекта2.docx \n", + "37 Этапы разработки проекта3.docx \n", + "38 Этапы разработки проекта4.docx \n", + "39 Этапы разработки проекта5.docx \n", + "40 Язык манипуляции данными.docx \n", + "\n", + " text type \n", + "36 Этапы разработки проекта: заключительные стади... 1 \n", + "37 Этапы разработки проекта: определение стратеги... 1 \n", + "38 Этапы разработки проекта: реализация, тестиров... 1 \n", + "39 Этапы разработки проекта: стратегия и анализ\\n... 1 \n", + "40 2.1.3. Язык манипуляции данными (ЯМД)\\nЯзык ма... 1 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "import pandas as pd\n", + "from docx import Document\n", + "import os\n", + "\n", + "\n", + "def read_docx(file_path):\n", + " doc = Document(file_path)\n", + " full_text = []\n", + " for paragraph in doc.paragraphs:\n", + " full_text.append(paragraph.text)\n", + " return \"\\n\".join(full_text)\n", + "\n", + "def load_docs(dataset_path):\n", + " df = pd.DataFrame(columns=[\"doc\", \"text\"])\n", + " for file_path in os.listdir(dataset_path):\n", + " if file_path.startswith(\"~$\"):\n", + " continue\n", + " text = read_docx(dataset_path + file_path)\n", + " df.loc[len(df.index)] = [file_path, text]\n", + " return df\n", + "\n", + "\n", + "df = load_docs(\"static/text/\")\n", + "df[\"type\"] = df.apply(\n", + " lambda row: 0 if str(row[\"doc\"]).startswith(\"tz_\") else 1, axis=1\n", + ")\n", + "df.info()\n", + "df.sort_values(by=[\"doc\"], inplace=True)\n", + "\n", + "display(df.head(), df.tail())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Предобработка текста" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "from num2words import num2words\n", + "\n", + "def transform_text(text):\n", + " text = re.sub(r'<[^<]+?>', '', text)\n", + " \n", + " text = 
re.sub(r'http\\S+', '', text)\n", + "\n", + " text = text.lower()\n", + "\n", + " text = re.sub(r'\\s+', ' ', text)\n", + "\n", + " text = re.sub(r'[^a-zA-Zа-яА-Я0-9\\s]', '', text)\n", + "\n", + " words: list[str] = text.split()\n", + " words = [num2words(word, lang=\"ru\") if word.isdigit() else word for word in words]\n", + " text = \" \".join(words)\n", + "\n", + " text = re.sub(r'[^\\w\\s]', '', text)\n", + "\n", + " return text\n", + "\n", + "df[\"preprocessed_text\"] = df[\"text\"].apply(transform_text)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Токенизация, выделения частей речи, лемматизация и фильтрация" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "двадцать_NUM_Case=Nom технический_ADJ_Case=Nom|Degree=Pos|Gender=Neut|Number=Sing задание_NOUN_Animacy=Inan|Case=Nom|Gender=Neut|Number=Sing двести_NUM_Case=Nom двадцать_NUM_Case=Nom общий_ADJ_Case=Nom|Degree=Pos|Number=Plur сведение_NOUN_Animacy=Inan|Case=Nom|Gender=Neut|Number=Plur полный_ADJ_Case=Nom|Degree=Pos|Gender=Neut|Number=Sing наименование_NOUN_Animacy=Inan|Case=Acc|Gender=Neut|Number=Sing система_NOUN_Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing\n" + ] + } + ], + "source": [ + "from nltk.corpus import stopwords\n", + "\n", + "stop_words = set(stopwords.words('russian'))\n", + "\n", + "def is_valid_token(token):\n", + " return token.text not in stop_words and len(token.text) <= 20\n", + "\n", + "def preprocess_text(text):\n", + " doc = sp(text)\n", + " \n", + " filtered_tokens = map(\n", + " lambda token: f\"{token.lemma_}_{token.pos_}_{token.morph}\", \n", + " filter(is_valid_token, doc)\n", + " )\n", + " \n", + " return \" \".join(filtered_tokens)\n", + "\n", + "df[\"preprocessed_text\"] = df[\"preprocessed_text\"].apply(preprocess_text)\n", + "\n", + "first_text_tokens = df[\"preprocessed_text\"].iloc[0].split()[:10]\n", + "print(\" 
\".join(first_text_tokens))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "N-граммы" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
doctexttypepreprocessed_textbigramstrigrams
0tz_01.docx2.2 Техническое задание\\n2.2.1 Общие сведения\\...0двадцать_NUM_Case=Nom технический_ADJ_Case=Nom...[(двадцать_NUM_Case=Nom, технический_ADJ_Case=...[(двадцать_NUM_Case=Nom, технический_ADJ_Case=...
1tz_02.docx2.2 Техническое задание\\n2.2.1 Общие сведения\\...0двадцать_NUM_Case=Nom технический_ADJ_Case=Nom...[(двадцать_NUM_Case=Nom, технический_ADJ_Case=...[(двадцать_NUM_Case=Nom, технический_ADJ_Case=...
2tz_03.docx2.2. Техническое задание\\nОбщие сведения:\\nВ д...0двадцать_NUM_Case=Nom технический_ADJ_Case=Nom...[(двадцать_NUM_Case=Nom, технический_ADJ_Case=...[(двадцать_NUM_Case=Nom, технический_ADJ_Case=...
3tz_04.docxТехническое задание\\n2.2.1 Общие сведения\\nИнт...0технический_ADJ_Case=Nom|Degree=Pos|Gender=Neu...[(технический_ADJ_Case=Nom|Degree=Pos|Gender=N...[(технический_ADJ_Case=Nom|Degree=Pos|Gender=N...
4tz_05.docx2.2 Техническое задание\\n2.2.1 Общие сведения....0двадцать_NUM_Case=Nom технический_ADJ_Case=Nom...[(двадцать_NUM_Case=Nom, технический_ADJ_Case=...[(двадцать_NUM_Case=Nom, технический_ADJ_Case=...
5tz_06.docx2.2 Техническое задание\\t\\n1.Общие сведения\\nП...0двадцать_NUM_Case=Nom технический_ADJ_Case=Nom...[(двадцать_NUM_Case=Nom, технический_ADJ_Case=...[(двадцать_NUM_Case=Nom, технический_ADJ_Case=...
6tz_07.docxТехническое задание\\nОбщие сведения\\nВ данном ...0технический_ADJ_Case=Nom|Degree=Pos|Gender=Neu...[(технический_ADJ_Case=Nom|Degree=Pos|Gender=N...[(технический_ADJ_Case=Nom|Degree=Pos|Gender=N...
7tz_08.docxТехническое задание\\n1 Общие сведения\\n1.1 Пол...0технический_ADJ_Case=Nom|Degree=Pos|Gender=Neu...[(технический_ADJ_Case=Nom|Degree=Pos|Gender=N...[(технический_ADJ_Case=Nom|Degree=Pos|Gender=N...
8tz_09.docx2.2. Техническое задание\\n2.2.1.\\n\\nОбщие свед...0двадцать_NUM_Case=Nom технический_ADJ_Case=Nom...[(двадцать_NUM_Case=Nom, технический_ADJ_Case=...[(двадцать_NUM_Case=Nom, технический_ADJ_Case=...
9tz_10.docx2.2. Техническое задание\\n2.2.1. Общие сведени...0двадцать_NUM_Case=Nom технический_ADJ_Case=Nom...[(двадцать_NUM_Case=Nom, технический_ADJ_Case=...[(двадцать_NUM_Case=Nom, технический_ADJ_Case=...
\n", + "
" + ], + "text/plain": [ + " doc text type \\\n", + "0 tz_01.docx 2.2 Техническое задание\\n2.2.1 Общие сведения\\... 0 \n", + "1 tz_02.docx 2.2 Техническое задание\\n2.2.1 Общие сведения\\... 0 \n", + "2 tz_03.docx 2.2. Техническое задание\\nОбщие сведения:\\nВ д... 0 \n", + "3 tz_04.docx Техническое задание\\n2.2.1 Общие сведения\\nИнт... 0 \n", + "4 tz_05.docx 2.2 Техническое задание\\n2.2.1 Общие сведения.... 0 \n", + "5 tz_06.docx 2.2 Техническое задание\\t\\n1.Общие сведения\\nП... 0 \n", + "6 tz_07.docx Техническое задание\\nОбщие сведения\\nВ данном ... 0 \n", + "7 tz_08.docx Техническое задание\\n1 Общие сведения\\n1.1 Пол... 0 \n", + "8 tz_09.docx 2.2. Техническое задание\\n2.2.1.\\n\\nОбщие свед... 0 \n", + "9 tz_10.docx 2.2. Техническое задание\\n2.2.1. Общие сведени... 0 \n", + "\n", + " preprocessed_text \\\n", + "0 двадцать_NUM_Case=Nom технический_ADJ_Case=Nom... \n", + "1 двадцать_NUM_Case=Nom технический_ADJ_Case=Nom... \n", + "2 двадцать_NUM_Case=Nom технический_ADJ_Case=Nom... \n", + "3 технический_ADJ_Case=Nom|Degree=Pos|Gender=Neu... \n", + "4 двадцать_NUM_Case=Nom технический_ADJ_Case=Nom... \n", + "5 двадцать_NUM_Case=Nom технический_ADJ_Case=Nom... \n", + "6 технический_ADJ_Case=Nom|Degree=Pos|Gender=Neu... \n", + "7 технический_ADJ_Case=Nom|Degree=Pos|Gender=Neu... \n", + "8 двадцать_NUM_Case=Nom технический_ADJ_Case=Nom... \n", + "9 двадцать_NUM_Case=Nom технический_ADJ_Case=Nom... \n", + "\n", + " bigrams \\\n", + "0 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n", + "1 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n", + "2 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n", + "3 [(технический_ADJ_Case=Nom|Degree=Pos|Gender=N... \n", + "4 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n", + "5 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n", + "6 [(технический_ADJ_Case=Nom|Degree=Pos|Gender=N... \n", + "7 [(технический_ADJ_Case=Nom|Degree=Pos|Gender=N... \n", + "8 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... 
\n", + "9 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n", + "\n", + " trigrams \n", + "0 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n", + "1 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n", + "2 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n", + "3 [(технический_ADJ_Case=Nom|Degree=Pos|Gender=N... \n", + "4 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n", + "5 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n", + "6 [(технический_ADJ_Case=Nom|Degree=Pos|Gender=N... \n", + "7 [(технический_ADJ_Case=Nom|Degree=Pos|Gender=N... \n", + "8 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n", + "9 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... " + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from nltk.util import ngrams\n", + "from nltk.tokenize import word_tokenize\n", + "\n", + "def generate_ngrams(text: str, n: int = 2) -> list[tuple]:\n", + " tokens: list[str] = word_tokenize(text, language=\"russian\")\n", + " return list(ngrams(tokens, n))\n", + "\n", + "df[\"bigrams\"] = df[\"preprocessed_text\"].apply(lambda x: generate_ngrams(x, n=2))\n", + "\n", + "df[\"trigrams\"] = df[\"preprocessed_text\"].apply(lambda x: generate_ngrams(x, n=3))\n", + "\n", + "df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Векторизация текста. Мешок слов:" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
000001log_x_foreign000001ver001_propn_foreign012n_propn_foreign100_num_1024х768_num_10мегабайт_noun_animacy13а_num_13гбайт_num_17а_num_1998_adj_...язвительный_adj_caseязык_noun_animacyямд_noun_animacyямд_propn_animacyяод_noun_animacyяода_noun_animacyясность_noun_animacyясный_adj_caseясный_adj_degreeящик_noun_animacy
00000000000...0200000000
10000000000...0100000000
20000000000...0100000000
30000000000...0000000000
40000000000...0200000000
\n", + "

5 rows × 7362 columns

\n", + "
" + ], + "text/plain": [ + " 000001log_x_foreign 000001ver001_propn_foreign 012n_propn_foreign \\\n", + "0 0 0 0 \n", + "1 0 0 0 \n", + "2 0 0 0 \n", + "3 0 0 0 \n", + "4 0 0 0 \n", + "\n", + " 100_num_ 1024х768_num_ 10мегабайт_noun_animacy 13а_num_ 13гбайт_num_ \\\n", + "0 0 0 0 0 0 \n", + "1 0 0 0 0 0 \n", + "2 0 0 0 0 0 \n", + "3 0 0 0 0 0 \n", + "4 0 0 0 0 0 \n", + "\n", + " 17а_num_ 1998_adj_ ... язвительный_adj_case язык_noun_animacy \\\n", + "0 0 0 ... 0 2 \n", + "1 0 0 ... 0 1 \n", + "2 0 0 ... 0 1 \n", + "3 0 0 ... 0 0 \n", + "4 0 0 ... 0 2 \n", + "\n", + " ямд_noun_animacy ямд_propn_animacy яод_noun_animacy яода_noun_animacy \\\n", + "0 0 0 0 0 \n", + "1 0 0 0 0 \n", + "2 0 0 0 0 \n", + "3 0 0 0 0 \n", + "4 0 0 0 0 \n", + "\n", + " ясность_noun_animacy ясный_adj_case ясный_adj_degree ящик_noun_animacy \n", + "0 0 0 0 0 \n", + "1 0 0 0 0 \n", + "2 0 0 0 0 \n", + "3 0 0 0 0 \n", + "4 0 0 0 0 \n", + "\n", + "[5 rows x 7362 columns]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from scipy import sparse\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "counts_vectorizer = CountVectorizer()\n", + "counts_matrix = sparse.csr_matrix(counts_vectorizer.fit_transform(df[\"preprocessed_text\"]))\n", + "counts_df = pd.DataFrame(\n", + " counts_matrix.toarray(),\n", + " columns=counts_vectorizer.get_feature_names_out(),\n", + ")\n", + "\n", + "counts_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Частотный портрет:" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
000001log_x_foreign000001ver001_propn_foreign012n_propn_foreign100_num_1024х768_num_10мегабайт_noun_animacy13а_num_13гбайт_num_17а_num_1998_adj_...язвительный_adj_caseязык_noun_animacyямд_noun_animacyямд_propn_animacyяод_noun_animacyяода_noun_animacyясность_noun_animacyясный_adj_caseясный_adj_degreeящик_noun_animacy
00.00.00.00.00.0000000.00.00.00.0000000.0...0.00.0241560.00.00.00.00.00.00.00.0
10.00.00.00.00.0000000.00.00.00.0000000.0...0.00.0200460.00.00.00.00.00.00.00.0
20.00.00.00.00.0000000.00.00.00.0000000.0...0.00.0157260.00.00.00.00.00.00.00.0
30.00.00.00.00.0000000.00.00.00.0000000.0...0.00.0000000.00.00.00.00.00.00.00.0
40.00.00.00.00.0000000.00.00.00.0000000.0...0.00.0264350.00.00.00.00.00.00.00.0
50.00.00.00.00.0000000.00.00.00.0485460.0...0.00.0145390.00.00.00.00.00.00.00.0
60.00.00.00.00.0000000.00.00.00.0000000.0...0.00.0193150.00.00.00.00.00.00.00.0
70.00.00.00.00.0392090.00.00.00.0000000.0...0.00.0000000.00.00.00.00.00.00.00.0
80.00.00.00.00.0000000.00.00.00.0000000.0...0.00.0404750.00.00.00.00.00.00.00.0
90.00.00.00.00.0000000.00.00.00.0000000.0...0.00.0499770.00.00.00.00.00.00.00.0
\n", + "

10 rows × 7362 columns

\n", + "
" + ], + "text/plain": [ + " 000001log_x_foreign 000001ver001_propn_foreign 012n_propn_foreign \\\n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 \n", + "5 0.0 0.0 0.0 \n", + "6 0.0 0.0 0.0 \n", + "7 0.0 0.0 0.0 \n", + "8 0.0 0.0 0.0 \n", + "9 0.0 0.0 0.0 \n", + "\n", + " 100_num_ 1024х768_num_ 10мегабайт_noun_animacy 13а_num_ 13гбайт_num_ \\\n", + "0 0.0 0.000000 0.0 0.0 0.0 \n", + "1 0.0 0.000000 0.0 0.0 0.0 \n", + "2 0.0 0.000000 0.0 0.0 0.0 \n", + "3 0.0 0.000000 0.0 0.0 0.0 \n", + "4 0.0 0.000000 0.0 0.0 0.0 \n", + "5 0.0 0.000000 0.0 0.0 0.0 \n", + "6 0.0 0.000000 0.0 0.0 0.0 \n", + "7 0.0 0.039209 0.0 0.0 0.0 \n", + "8 0.0 0.000000 0.0 0.0 0.0 \n", + "9 0.0 0.000000 0.0 0.0 0.0 \n", + "\n", + " 17а_num_ 1998_adj_ ... язвительный_adj_case язык_noun_animacy \\\n", + "0 0.000000 0.0 ... 0.0 0.024156 \n", + "1 0.000000 0.0 ... 0.0 0.020046 \n", + "2 0.000000 0.0 ... 0.0 0.015726 \n", + "3 0.000000 0.0 ... 0.0 0.000000 \n", + "4 0.000000 0.0 ... 0.0 0.026435 \n", + "5 0.048546 0.0 ... 0.0 0.014539 \n", + "6 0.000000 0.0 ... 0.0 0.019315 \n", + "7 0.000000 0.0 ... 0.0 0.000000 \n", + "8 0.000000 0.0 ... 0.0 0.040475 \n", + "9 0.000000 0.0 ... 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)

# TF-IDF weighting (sublinear tf) over the same preprocessed corpus.
# fit_transform already returns a CSR sparse matrix; no csr_matrix copy needed.
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True)
tfidf_matrix = tfidf_vectorizer.fit_transform(df["preprocessed_text"])
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

tfidf_df.head(10)


def train_and_evaluate(X, y, test_size=0.2, cv=5, optimize=False):
    """Train a RandomForest classifier and report hold-out and CV metrics.

    Parameters
    ----------
    X, y : feature matrix and binary labels.
    test_size : hold-out fraction for the train/test split.
    cv : number of folds for the cross-validated mean F1 score.
    optimize : kept for interface compatibility; hyper-parameter search is
        not implemented (the flag was never used in the original code).

    Returns the fitted model.
    """
    # stratify=y keeps the class ratio in both splits — important with only
    # 41 samples, where a random split can badly skew the test set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # Bug fix: ROC AUC must be computed from probability scores, not from the
    # hard 0/1 predictions (which collapse the ROC curve to a single point).
    y_score = model.predict_proba(X_test)[:, 1]

    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "ROC AUC": roc_auc_score(y_test, y_score),
    }

    f1_cv = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1').mean()

    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

    print(f"Средний F1 Score (кросс-валидация): {f1_cv:.4f}")

    return model


X_tfidf = tfidf_df
X_counts = counts_df
y = df["type"]

print("TF-IDF Model")
model_tfidf = train_and_evaluate(X_tfidf, y)

print("Count Vectorizer Model")
model_counts = train_and_evaluate(X_counts, y)
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "aimvenv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}