Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| fda87f53d5 | |||
| f6bdab7f5b | |||
| e35a826ccd | |||
| 8e9ddc5b7c | |||
| 9ee8efec42 | |||
| d59680bbe0 |
8
MAI_PIbd-33_Tikhonenkov_A_E.code-workspace
Normal file
8
MAI_PIbd-33_Tikhonenkov_A_E.code-workspace
Normal file
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"folders": [
|
||||
{
|
||||
"path": "."
|
||||
}
|
||||
],
|
||||
"settings": {}
|
||||
}
|
||||
53944
data/Diamonds-Prices.csv
Normal file
53944
data/Diamonds-Prices.csv
Normal file
File diff suppressed because it is too large
Load Diff
6
data/Forbes Billionaires copy.csv
Normal file
6
data/Forbes Billionaires copy.csv
Normal file
@@ -0,0 +1,6 @@
|
||||
Rank ,Name,Networth,Age,Country,Source,Industry
|
||||
1,Elon Musk ,219,50,United States,"Tesla, SpaceX",Automotive
|
||||
2,Jeff Bezos ,171,58,United States,Amazon,Technology
|
||||
3,Bernard Arnault & family ,158,73,France,LVMH,Fashion & Retail
|
||||
4,Bill Gates ,129,66,United States,Microsoft,Technology
|
||||
5,Warren Buffett ,118,91,United States,Berkshire Hathaway,Finance & Investments
|
||||
|
2601
data/Forbes Billionaires.csv
Normal file
2601
data/Forbes Billionaires.csv
Normal file
File diff suppressed because it is too large
Load Diff
1371
data/mobile-phone-price-prediction.csv
Normal file
1371
data/mobile-phone-price-prediction.csv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
data/text/tz_01.docx
Normal file
BIN
data/text/tz_01.docx
Normal file
Binary file not shown.
BIN
data/text/tz_02.docx
Normal file
BIN
data/text/tz_02.docx
Normal file
Binary file not shown.
BIN
data/text/tz_03.docx
Normal file
BIN
data/text/tz_03.docx
Normal file
Binary file not shown.
BIN
data/text/tz_04.docx
Normal file
BIN
data/text/tz_04.docx
Normal file
Binary file not shown.
BIN
data/text/tz_05.docx
Normal file
BIN
data/text/tz_05.docx
Normal file
Binary file not shown.
BIN
data/text/tz_06.docx
Normal file
BIN
data/text/tz_06.docx
Normal file
Binary file not shown.
BIN
data/text/tz_07.docx
Normal file
BIN
data/text/tz_07.docx
Normal file
Binary file not shown.
BIN
data/text/tz_08.docx
Normal file
BIN
data/text/tz_08.docx
Normal file
Binary file not shown.
BIN
data/text/tz_09.docx
Normal file
BIN
data/text/tz_09.docx
Normal file
Binary file not shown.
BIN
data/text/tz_10.docx
Normal file
BIN
data/text/tz_10.docx
Normal file
Binary file not shown.
BIN
data/text/tz_11.docx
Normal file
BIN
data/text/tz_11.docx
Normal file
Binary file not shown.
BIN
data/text/tz_12.docx
Normal file
BIN
data/text/tz_12.docx
Normal file
Binary file not shown.
BIN
data/text/tz_13.docx
Normal file
BIN
data/text/tz_13.docx
Normal file
Binary file not shown.
BIN
data/text/tz_14.docx
Normal file
BIN
data/text/tz_14.docx
Normal file
Binary file not shown.
BIN
data/text/tz_15.docx
Normal file
BIN
data/text/tz_15.docx
Normal file
Binary file not shown.
BIN
data/text/tz_16.docx
Normal file
BIN
data/text/tz_16.docx
Normal file
Binary file not shown.
BIN
data/text/tz_17.docx
Normal file
BIN
data/text/tz_17.docx
Normal file
Binary file not shown.
BIN
data/text/tz_18.docx
Normal file
BIN
data/text/tz_18.docx
Normal file
Binary file not shown.
BIN
data/text/tz_19.docx
Normal file
BIN
data/text/tz_19.docx
Normal file
Binary file not shown.
BIN
data/text/tz_20.docx
Normal file
BIN
data/text/tz_20.docx
Normal file
Binary file not shown.
BIN
data/text/Архитектура, управляемая модель.docx
Normal file
BIN
data/text/Архитектура, управляемая модель.docx
Normal file
Binary file not shown.
BIN
data/text/Введение в проектирование ИС.docx
Normal file
BIN
data/text/Введение в проектирование ИС.docx
Normal file
Binary file not shown.
BIN
data/text/Встроенные операторы SQL.docx
Normal file
BIN
data/text/Встроенные операторы SQL.docx
Normal file
Binary file not shown.
BIN
data/text/Методологии разработки программного обеспечения 2.docx
Normal file
BIN
data/text/Методологии разработки программного обеспечения 2.docx
Normal file
Binary file not shown.
BIN
data/text/Методологии разработки программного обеспечения.docx
Normal file
BIN
data/text/Методологии разработки программного обеспечения.docx
Normal file
Binary file not shown.
BIN
data/text/Методы композиции и декомпозиции.docx
Normal file
BIN
data/text/Методы композиции и декомпозиции.docx
Normal file
Binary file not shown.
BIN
data/text/Модели представления данных в СУБД.docx
Normal file
BIN
data/text/Модели представления данных в СУБД.docx
Normal file
Binary file not shown.
BIN
data/text/Некоторые особенности проектирования.docx
Normal file
BIN
data/text/Некоторые особенности проектирования.docx
Normal file
Binary file not shown.
BIN
data/text/Непроцедурный доступ к данным.docx
Normal file
BIN
data/text/Непроцедурный доступ к данным.docx
Normal file
Binary file not shown.
BIN
data/text/Процедурное расширение языка SQL.docx
Normal file
BIN
data/text/Процедурное расширение языка SQL.docx
Normal file
Binary file not shown.
BIN
data/text/Системные объекты базы данных.docx
Normal file
BIN
data/text/Системные объекты базы данных.docx
Normal file
Binary file not shown.
BIN
data/text/Технология создания распр ИС.docx
Normal file
BIN
data/text/Технология создания распр ИС.docx
Normal file
Binary file not shown.
BIN
data/text/Требования к проекту.docx
Normal file
BIN
data/text/Требования к проекту.docx
Normal file
Binary file not shown.
BIN
data/text/Условия целостности БД.docx
Normal file
BIN
data/text/Условия целостности БД.docx
Normal file
Binary file not shown.
BIN
data/text/Характеристики СУБД.docx
Normal file
BIN
data/text/Характеристики СУБД.docx
Normal file
Binary file not shown.
BIN
data/text/Этапы разработки проекта1.docx
Normal file
BIN
data/text/Этапы разработки проекта1.docx
Normal file
Binary file not shown.
BIN
data/text/Этапы разработки проекта2.docx
Normal file
BIN
data/text/Этапы разработки проекта2.docx
Normal file
Binary file not shown.
BIN
data/text/Этапы разработки проекта3.docx
Normal file
BIN
data/text/Этапы разработки проекта3.docx
Normal file
Binary file not shown.
BIN
data/text/Этапы разработки проекта4.docx
Normal file
BIN
data/text/Этапы разработки проекта4.docx
Normal file
Binary file not shown.
BIN
data/text/Этапы разработки проекта5.docx
Normal file
BIN
data/text/Этапы разработки проекта5.docx
Normal file
Binary file not shown.
BIN
data/text/Язык манипуляции данными.docx
Normal file
BIN
data/text/Язык манипуляции данными.docx
Normal file
Binary file not shown.
93
lab1.ipynb
93
lab1.ipynb
File diff suppressed because one or more lines are too long
2063
lab2.ipynb
Normal file
2063
lab2.ipynb
Normal file
File diff suppressed because one or more lines are too long
1112
lab3.ipynb
Normal file
1112
lab3.ipynb
Normal file
File diff suppressed because one or more lines are too long
3436
lab4.ipynb
Normal file
3436
lab4.ipynb
Normal file
File diff suppressed because one or more lines are too long
1477
lab5.ipynb
Normal file
1477
lab5.ipynb
Normal file
File diff suppressed because one or more lines are too long
1921
lab_7.ipynb
Normal file
1921
lab_7.ipynb
Normal file
File diff suppressed because one or more lines are too long
693
lab_8.ipynb
Normal file
693
lab_8.ipynb
Normal file
@@ -0,0 +1,693 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Лабораторная работа 8\n",
|
||||
"\n",
|
||||
"Выбранный датасет: Технические задания и статьи по ИТ (кластеризация, классификация).\n",
|
||||
"\n",
|
||||
"Выбранный метод машинного обучения: классификация.\n",
|
||||
"\n",
|
||||
"Задача анализа текстов: разработка модели, которая сможет автоматически определять категорию, к которой относится текст (в данном случае, ТЗ или статья)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Импорт библиотеки и инициализация модуля для анализа текста:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import spacy\n",
|
||||
"\n",
|
||||
"sp = spacy.load(\"ru_core_news_lg\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Загрузка текстов из файлов с расширением .docx в датафрейм:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" doc \\\n",
|
||||
"15 tz_16.docx \n",
|
||||
"16 tz_17.docx \n",
|
||||
"17 tz_18.docx \n",
|
||||
"18 tz_19.docx \n",
|
||||
"19 tz_20.docx \n",
|
||||
"20 Архитектура, управляемая модель.docx \n",
|
||||
"21 Введение в проектирование ИС.docx \n",
|
||||
"22 Встроенные операторы SQL.docx \n",
|
||||
"23 Методологии разработки программного обеспечени... \n",
|
||||
"24 Методологии разработки программного обеспечени... \n",
|
||||
"\n",
|
||||
" text type \n",
|
||||
"15 2.2\\tТехническое задание\\n2.2.1\\tОбщие сведени... 0 \n",
|
||||
"16 2.2 Техническое задание.\\n2.2.1 Общие сведения... 0 \n",
|
||||
"17 2.2. Техническое задание\\nОбщие сведения:\\nПол... 0 \n",
|
||||
"18 2.2. Техническое задание\\n2.2.1. Наименование ... 0 \n",
|
||||
"19 2.2. Техническое задание\\n2.2.1. Общие сведени... 0 \n",
|
||||
"20 Архитектура, управляемая модель\\nАббревиатура ... 1 \n",
|
||||
"21 1. ВВЕДЕНИЕ В ПРОЕКТИРОВАНИЕ ИНФОРМАЦИОННЫХ СИ... 1 \n",
|
||||
"22 Встроенные операторы SQL. \\nКак было отмечено ... 1 \n",
|
||||
"23 Методологии разработки программного обеспечени... 1 \n",
|
||||
"24 Методологии разработки программного обеспечени... 1 \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"from docx import Document\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"def read_docx(file_path):\n",
|
||||
" doc = Document(file_path)\n",
|
||||
" full_text = []\n",
|
||||
" for paragraph in doc.paragraphs:\n",
|
||||
" full_text.append(paragraph.text)\n",
|
||||
" return \"\\n\".join(full_text)\n",
|
||||
"\n",
|
||||
"def load_docs(dataset_path):\n",
|
||||
" df = pd.DataFrame(columns=[\"doc\", \"text\"])\n",
|
||||
" for file_path in os.listdir(dataset_path):\n",
|
||||
" if file_path.startswith(\"~$\"):\n",
|
||||
" continue\n",
|
||||
" text = read_docx(dataset_path + file_path)\n",
|
||||
" df.loc[len(df.index)] = [file_path, text]\n",
|
||||
" return df\n",
|
||||
"\n",
|
||||
"df = load_docs(\"./data/text/\")\n",
|
||||
"df[\"type\"] = df.apply(\n",
|
||||
" lambda row: 0 if str(row[\"doc\"]).startswith(\"tz_\") else 1, axis=1\n",
|
||||
")\n",
|
||||
"df.sort_values(by=[\"doc\"], inplace=True)\n",
|
||||
"\n",
|
||||
"print(df.iloc[15:25])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Предобработка текста.\n",
|
||||
"\n",
|
||||
"Трансформация:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"import emoji\n",
|
||||
"from num2words import num2words\n",
|
||||
"\n",
|
||||
"# Функция для преобразования эмоджи в слова\n",
|
||||
"def emojis_words(text):\n",
|
||||
" \n",
|
||||
" # Модуль emoji: преобразование эмоджи в их словесные описания\n",
|
||||
" text = emoji.demojize(text, delimiters=(\" \", \" \"))\n",
|
||||
" \n",
|
||||
" # Редактирование текста путём замены \":\" и \"_\", а также путём добавления пробела между отдельными словами\n",
|
||||
" text = text.replace(\":\", \"\").replace(\"_\", \" \")\n",
|
||||
" \n",
|
||||
" return text\n",
|
||||
"\n",
|
||||
"def transform_text(text):\n",
|
||||
" # Удаление из текста всех HTML-тегов\n",
|
||||
" text = re.sub(r'<[^<]+?>', '', text)\n",
|
||||
" \n",
|
||||
" # Удаление из текста всех URL и ссылок\n",
|
||||
" text = re.sub(r'http\\S+', '', text)\n",
|
||||
"\n",
|
||||
" # Преобразование эмоджи в текст\n",
|
||||
" text = emojis_words(text)\n",
|
||||
"\n",
|
||||
" # Приведение к нижнему регистру\n",
|
||||
" text = text.lower()\n",
|
||||
"\n",
|
||||
" # Удаление лишних пробелов\n",
|
||||
" text = re.sub(r'\\s+', ' ', text) \n",
|
||||
" \n",
|
||||
" # Преобразование \"ё\" в \"е\"\n",
|
||||
" text = text.replace(\"ё\", \"е\")\n",
|
||||
"\n",
|
||||
" # Удаление всех специальных символов\n",
|
||||
" text = re.sub(r'[^a-zA-Zа-яА-Я0-9\\s]', '', text)\n",
|
||||
"\n",
|
||||
" # Преобразование чисел в слова\n",
|
||||
" words: list[str] = text.split()\n",
|
||||
" words = [num2words(word, lang=\"ru\") if word.isdigit() else word for word in words]\n",
|
||||
" text = \" \".join(words)\n",
|
||||
"\n",
|
||||
" # Удаление из текста всех знаков препинания\n",
|
||||
" text = re.sub(r'[^\\w\\s]', '', text)\n",
|
||||
"\n",
|
||||
" return text\n",
|
||||
"\n",
|
||||
"df[\"preprocessed_text\"] = df[\"text\"].apply(transform_text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Для выполнения токенизации, определения частей речи (POS tagging), нормализации (в данном случае применяется лемматизация) и фильтрации данных будем использовать библиотеку spaCy. На этапе фильтрации с целью уменьшения размерности пространства признаков задействуем словарь стоп-слов, а также исключим все слова, длина которых превышает 20 символов."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"двадцать_NUM_Case=Nom технический_ADJ_Case=Nom|Degree=Pos|Gender=Neut|Number=Sing задание_NOUN_Animacy=Inan|Case=Nom|Gender=Neut|Number=Sing двести_NUM_Case=Nom двадцать_NUM_Case=Nom общий_ADJ_Case=Nom|Degree=Pos|Number=Plur сведение_NOUN_Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing полный_ADJ_Case=Nom|Degree=Pos|Gender=Neut|Number=Sing наименование_NOUN_Animacy=Inan|Case=Nom|Gender=Neut|Number=Sing система_NOUN_Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"from nltk.corpus import stopwords\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"stop_words = set(stopwords.words('russian'))\n",
|
||||
"\n",
|
||||
"def preprocess_text(text):\n",
|
||||
" doc = sp(text)\n",
|
||||
" \n",
|
||||
" filtered_tokens = [\n",
|
||||
"\n",
|
||||
" f\"{token.lemma_}_{token.pos_}_{token.morph}\" # Формирование строки с нужным форматом\n",
|
||||
" for token in doc\n",
|
||||
"\n",
|
||||
" if token.text not in stop_words and len(token.text) <= 20 # Фильтрация \n",
|
||||
"\n",
|
||||
" ]\n",
|
||||
" \n",
|
||||
"\n",
|
||||
" return \" \".join(filtered_tokens)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"df[\"preprocessed_text\"] = df[\"preprocessed_text\"].apply(preprocess_text)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Выведем 10 токенов из первого текста\n",
|
||||
"\n",
|
||||
"first_text_tokens = df[\"preprocessed_text\"].iloc[0].split()[:10]\n",
|
||||
"\n",
|
||||
"print(\" \".join(first_text_tokens))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Перейдем к этапу формирования N-грамм:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[nltk_data] Downloading package punkt_tab to\n",
|
||||
"[nltk_data] D:\\Users\\Leo\\AppData\\Roaming\\nltk_data...\n",
|
||||
"[nltk_data] Package punkt_tab is already up-to-date!\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" doc \\\n",
|
||||
"15 tz_16.docx \n",
|
||||
"16 tz_17.docx \n",
|
||||
"17 tz_18.docx \n",
|
||||
"18 tz_19.docx \n",
|
||||
"19 tz_20.docx \n",
|
||||
"20 Архитектура, управляемая модель.docx \n",
|
||||
"21 Введение в проектирование ИС.docx \n",
|
||||
"22 Встроенные операторы SQL.docx \n",
|
||||
"23 Методологии разработки программного обеспечени... \n",
|
||||
"24 Методологии разработки программного обеспечени... \n",
|
||||
"\n",
|
||||
" text type \\\n",
|
||||
"15 2.2\\tТехническое задание\\n2.2.1\\tОбщие сведени... 0 \n",
|
||||
"16 2.2 Техническое задание.\\n2.2.1 Общие сведения... 0 \n",
|
||||
"17 2.2. Техническое задание\\nОбщие сведения:\\nПол... 0 \n",
|
||||
"18 2.2. Техническое задание\\n2.2.1. Наименование ... 0 \n",
|
||||
"19 2.2. Техническое задание\\n2.2.1. Общие сведени... 0 \n",
|
||||
"20 Архитектура, управляемая модель\\nАббревиатура ... 1 \n",
|
||||
"21 1. ВВЕДЕНИЕ В ПРОЕКТИРОВАНИЕ ИНФОРМАЦИОННЫХ СИ... 1 \n",
|
||||
"22 Встроенные операторы SQL. \\nКак было отмечено ... 1 \n",
|
||||
"23 Методологии разработки программного обеспечени... 1 \n",
|
||||
"24 Методологии разработки программного обеспечени... 1 \n",
|
||||
"\n",
|
||||
" preprocessed_text \\\n",
|
||||
"15 двадцать_NUM_Case=Nom технический_ADJ_Case=Nom... \n",
|
||||
"16 двадцать_NUM_Case=Nom технический_ADJ_Case=Nom... \n",
|
||||
"17 двадцать_NUM_Case=Nom технический_ADJ_Case=Nom... \n",
|
||||
"18 двадцать_NUM_Case=Nom технический_ADJ_Case=Nom... \n",
|
||||
"19 двадцать_NUM_Case=Nom технический_ADJ_Case=Nom... \n",
|
||||
"20 архитектура_NOUN_Animacy=Inan|Case=Nom|Gender=... \n",
|
||||
"21 введение_NOUN_Animacy=Inan|Case=Nom|Gender=Neu... \n",
|
||||
"22 встроенные_ADJ_Case=Nom|Degree=Pos|Number=Plur... \n",
|
||||
"23 методология_NOUN_Animacy=Inan|Case=Gen|Gender=... \n",
|
||||
"24 методология_NOUN_Animacy=Inan|Case=Gen|Gender=... \n",
|
||||
"\n",
|
||||
" bigrams \\\n",
|
||||
"15 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n",
|
||||
"16 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n",
|
||||
"17 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n",
|
||||
"18 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n",
|
||||
"19 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n",
|
||||
"20 [(архитектура_NOUN_Animacy=Inan|Case=Nom|Gende... \n",
|
||||
"21 [(введение_NOUN_Animacy=Inan|Case=Nom|Gender=N... \n",
|
||||
"22 [(встроенные_ADJ_Case=Nom|Degree=Pos|Number=Pl... \n",
|
||||
"23 [(методология_NOUN_Animacy=Inan|Case=Gen|Gende... \n",
|
||||
"24 [(методология_NOUN_Animacy=Inan|Case=Gen|Gende... \n",
|
||||
"\n",
|
||||
" trigrams \n",
|
||||
"15 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n",
|
||||
"16 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n",
|
||||
"17 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n",
|
||||
"18 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n",
|
||||
"19 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n",
|
||||
"20 [(архитектура_NOUN_Animacy=Inan|Case=Nom|Gende... \n",
|
||||
"21 [(введение_NOUN_Animacy=Inan|Case=Nom|Gender=N... \n",
|
||||
"22 [(встроенные_ADJ_Case=Nom|Degree=Pos|Number=Pl... \n",
|
||||
"23 [(методология_NOUN_Animacy=Inan|Case=Gen|Gende... \n",
|
||||
"24 [(методология_NOUN_Animacy=Inan|Case=Gen|Gende... \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import nltk\n",
|
||||
"from nltk.util import ngrams\n",
|
||||
"from nltk.tokenize import word_tokenize\n",
|
||||
"nltk.download(\"punkt_tab\")\n",
|
||||
"def generate_ngrams(text: str, n: int = 2) -> list[tuple]:\n",
|
||||
" tokens: list[str] = word_tokenize(text, language=\"russian\")\n",
|
||||
" \n",
|
||||
" n_grams: list[tuple] = list(ngrams(tokens, n))\n",
|
||||
" return n_grams\n",
|
||||
"\n",
|
||||
"# Пример для биграмм (N=2)\n",
|
||||
"df[\"bigrams\"] = df[\"preprocessed_text\"].apply(lambda x: generate_ngrams(x, n=2))\n",
|
||||
"\n",
|
||||
"# Пример для триграмм (N=3)\n",
|
||||
"df[\"trigrams\"] = df[\"preprocessed_text\"].apply(lambda x: generate_ngrams(x, n=3))\n",
|
||||
"\n",
|
||||
"print(df.iloc[15:25])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Теперь применим методы для векторизации текста.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Мешок слов:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" тутто_noun_animacy взаимоотношение_noun_animacy иннкпп_propn_animacy \\\n",
|
||||
"15 0 0 0 \n",
|
||||
"16 0 0 0 \n",
|
||||
"17 0 0 0 \n",
|
||||
"18 0 0 0 \n",
|
||||
"19 0 0 0 \n",
|
||||
"20 0 0 0 \n",
|
||||
"21 0 1 0 \n",
|
||||
"22 0 0 0 \n",
|
||||
"23 0 0 0 \n",
|
||||
"24 0 0 0 \n",
|
||||
"25 0 0 0 \n",
|
||||
"\n",
|
||||
" gif_propn_foreign накладывать_verb_aspect \\\n",
|
||||
"15 0 0 \n",
|
||||
"16 0 0 \n",
|
||||
"17 0 0 \n",
|
||||
"18 0 0 \n",
|
||||
"19 0 0 \n",
|
||||
"20 0 0 \n",
|
||||
"21 0 0 \n",
|
||||
"22 0 0 \n",
|
||||
"23 0 0 \n",
|
||||
"24 0 0 \n",
|
||||
"25 0 1 \n",
|
||||
"\n",
|
||||
" метрологическому_propn_animacy связанность_noun_animacy \\\n",
|
||||
"15 0 0 \n",
|
||||
"16 1 0 \n",
|
||||
"17 0 0 \n",
|
||||
"18 0 0 \n",
|
||||
"19 0 0 \n",
|
||||
"20 0 0 \n",
|
||||
"21 0 0 \n",
|
||||
"22 0 0 \n",
|
||||
"23 0 0 \n",
|
||||
"24 0 0 \n",
|
||||
"25 0 0 \n",
|
||||
"\n",
|
||||
" модернизировать_verb_aspect инструментальный_adj_case \\\n",
|
||||
"15 0 0 \n",
|
||||
"16 0 0 \n",
|
||||
"17 0 0 \n",
|
||||
"18 0 0 \n",
|
||||
"19 0 0 \n",
|
||||
"20 0 0 \n",
|
||||
"21 0 1 \n",
|
||||
"22 0 0 \n",
|
||||
"23 0 0 \n",
|
||||
"24 0 0 \n",
|
||||
"25 0 1 \n",
|
||||
"\n",
|
||||
" достаточно_adv_degree \n",
|
||||
"15 0 \n",
|
||||
"16 0 \n",
|
||||
"17 0 \n",
|
||||
"18 0 \n",
|
||||
"19 0 \n",
|
||||
"20 0 \n",
|
||||
"21 6 \n",
|
||||
"22 1 \n",
|
||||
"23 8 \n",
|
||||
"24 3 \n",
|
||||
"25 15 \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from scipy import sparse\n",
|
||||
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"counts_vectorizer = CountVectorizer()\n",
|
||||
"counts_matrix = sparse.csr_matrix(counts_vectorizer.fit_transform(df[\"preprocessed_text\"]))\n",
|
||||
"counts_df = pd.DataFrame(\n",
|
||||
" counts_matrix.toarray(),\n",
|
||||
" columns=counts_vectorizer.get_feature_names_out(),\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"random_columns = np.random.choice(counts_df.columns, size=10, replace=False)\n",
|
||||
"\n",
|
||||
"print(counts_df.loc[15:25, random_columns]) "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Частотный портрет:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" тутто_noun_animacy взаимоотношение_noun_animacy иннкпп_propn_animacy \\\n",
|
||||
"15 0.0 0.000000 0.0 \n",
|
||||
"16 0.0 0.000000 0.0 \n",
|
||||
"17 0.0 0.000000 0.0 \n",
|
||||
"18 0.0 0.000000 0.0 \n",
|
||||
"19 0.0 0.000000 0.0 \n",
|
||||
"20 0.0 0.000000 0.0 \n",
|
||||
"21 0.0 0.022338 0.0 \n",
|
||||
"22 0.0 0.000000 0.0 \n",
|
||||
"23 0.0 0.000000 0.0 \n",
|
||||
"24 0.0 0.000000 0.0 \n",
|
||||
"25 0.0 0.000000 0.0 \n",
|
||||
"\n",
|
||||
" gif_propn_foreign накладывать_verb_aspect \\\n",
|
||||
"15 0.0 0.00000 \n",
|
||||
"16 0.0 0.00000 \n",
|
||||
"17 0.0 0.00000 \n",
|
||||
"18 0.0 0.00000 \n",
|
||||
"19 0.0 0.00000 \n",
|
||||
"20 0.0 0.00000 \n",
|
||||
"21 0.0 0.00000 \n",
|
||||
"22 0.0 0.00000 \n",
|
||||
"23 0.0 0.00000 \n",
|
||||
"24 0.0 0.00000 \n",
|
||||
"25 0.0 0.02162 \n",
|
||||
"\n",
|
||||
" метрологическому_propn_animacy связанность_noun_animacy \\\n",
|
||||
"15 0.000000 0.0 \n",
|
||||
"16 0.042399 0.0 \n",
|
||||
"17 0.000000 0.0 \n",
|
||||
"18 0.000000 0.0 \n",
|
||||
"19 0.000000 0.0 \n",
|
||||
"20 0.000000 0.0 \n",
|
||||
"21 0.000000 0.0 \n",
|
||||
"22 0.000000 0.0 \n",
|
||||
"23 0.000000 0.0 \n",
|
||||
"24 0.000000 0.0 \n",
|
||||
"25 0.000000 0.0 \n",
|
||||
"\n",
|
||||
" модернизировать_verb_aspect инструментальный_adj_case \\\n",
|
||||
"15 0.0 0.000000 \n",
|
||||
"16 0.0 0.000000 \n",
|
||||
"17 0.0 0.000000 \n",
|
||||
"18 0.0 0.000000 \n",
|
||||
"19 0.0 0.000000 \n",
|
||||
"20 0.0 0.000000 \n",
|
||||
"21 0.0 0.017277 \n",
|
||||
"22 0.0 0.000000 \n",
|
||||
"23 0.0 0.000000 \n",
|
||||
"24 0.0 0.000000 \n",
|
||||
"25 0.0 0.018585 \n",
|
||||
"\n",
|
||||
" достаточно_adv_degree \n",
|
||||
"15 0.000000 \n",
|
||||
"16 0.000000 \n",
|
||||
"17 0.000000 \n",
|
||||
"18 0.000000 \n",
|
||||
"19 0.000000 \n",
|
||||
"20 0.000000 \n",
|
||||
"21 0.033501 \n",
|
||||
"22 0.025389 \n",
|
||||
"23 0.047452 \n",
|
||||
"24 0.036795 \n",
|
||||
"25 0.047864 \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"\n",
|
||||
"tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True)\n",
|
||||
"tfidf_matrix = sparse.csr_matrix(tfidf_vectorizer.fit_transform(df[\"preprocessed_text\"]))\n",
|
||||
"tfidf_df = pd.DataFrame(\n",
|
||||
" tfidf_matrix.toarray(),\n",
|
||||
" columns=tfidf_vectorizer.get_feature_names_out(),\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(tfidf_df.loc[15:25, random_columns]) "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Обучение модели и проверка ее качества:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"TF-IDF Model\n",
|
||||
"Accuracy: 0.8889\n",
|
||||
"Precision: 0.7500\n",
|
||||
"Recall: 1.0000\n",
|
||||
"F1 Score: 0.8571\n",
|
||||
"ROC AUC: 0.9167\n",
|
||||
"Cross-validated F1 Score: 1.0000\n",
|
||||
"\n",
|
||||
"Count Vectorizer Model\n",
|
||||
"Accuracy: 1.0000\n",
|
||||
"Precision: 1.0000\n",
|
||||
"Recall: 1.0000\n",
|
||||
"F1 Score: 1.0000\n",
|
||||
"ROC AUC: 1.0000\n",
|
||||
"Cross-validated F1 Score: 0.9333\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV\n",
|
||||
"from sklearn.ensemble import RandomForestClassifier\n",
|
||||
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n",
|
||||
"\n",
|
||||
"def train_and_evaluate(X, y, test_size=0.2, cv=5, optimize=False):\n",
|
||||
" X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=9)\n",
|
||||
"\n",
|
||||
" if optimize:\n",
|
||||
" param_grid = {\n",
|
||||
" \"n_estimators\": [10, 20, 30, 40, 50, 100, 150, 200, 250, 500],\n",
|
||||
" \"max_features\": [\"sqrt\", \"log2\", 2],\n",
|
||||
" \"max_depth\": [2, 3, 4, 5, 6, 7, 8, 9, 10],\n",
|
||||
" \"criterion\": [\"gini\", \"entropy\", \"log_loss\"],\n",
|
||||
" \"class_weight\": [\"balanced\", \"balanced_subsample\"]\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" grid_search = GridSearchCV(RandomForestClassifier(random_state=9), param_grid, scoring=\"f1\", cv=cv, n_jobs=-1)\n",
|
||||
" grid_search.fit(X_train, y_train)\n",
|
||||
" model = grid_search.best_estimator_\n",
|
||||
" print(f\"Лучшие параметры: {grid_search.best_params_}\")\n",
|
||||
" else:\n",
|
||||
" model = RandomForestClassifier(n_estimators=100, random_state=9)\n",
|
||||
" model.fit(X_train, y_train)\n",
|
||||
"\n",
|
||||
" y_pred = model.predict(X_test)\n",
|
||||
"\n",
|
||||
" accuracy = accuracy_score(y_test, y_pred)\n",
|
||||
" precision = precision_score(y_test, y_pred)\n",
|
||||
" recall = recall_score(y_test, y_pred)\n",
|
||||
" f1 = f1_score(y_test, y_pred)\n",
|
||||
" roc_auc = roc_auc_score(y_test, y_pred)\n",
|
||||
"\n",
|
||||
" print(f\"Accuracy: {accuracy:.4f}\")\n",
|
||||
" print(f\"Precision: {precision:.4f}\")\n",
|
||||
" print(f\"Recall: {recall:.4f}\")\n",
|
||||
" print(f\"F1 Score: {f1:.4f}\")\n",
|
||||
" print(f\"ROC AUC: {roc_auc:.4f}\")\n",
|
||||
"\n",
|
||||
" scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1')\n",
|
||||
" f1_cv = scores.mean()\n",
|
||||
" print(f\"Cross-validated F1 Score: {f1_cv:.4f}\")\n",
|
||||
"\n",
|
||||
" return model\n",
|
||||
"\n",
|
||||
"X_tfidf = tfidf_df\n",
|
||||
"X_counts = counts_df\n",
|
||||
"y = df[\"type\"]\n",
|
||||
"\n",
|
||||
"print(\"TF-IDF Model\")\n",
|
||||
"model_tfidf = train_and_evaluate(X_tfidf, y)\n",
|
||||
"\n",
|
||||
"print(\"\\nCount Vectorizer Model\")\n",
|
||||
"model_counts = train_and_evaluate(X_counts, y)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Как видно, обе модели демонстрируют отличные результаты, причём вторая модель достигает практически идеальных показателей. Однако это может быть связано с небольшим объёмом данных в выборке (всего 41 документ). Вероятно, модель просто запомнила данные, что привело к её переобучению."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Кроме того, в соответствии с заданием, оценим решение, используя альтернативные гиперпараметры модели машинного обучения, которые будут подобраны с помощью метода поиска по сетке."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"TF-IDF Model (Optimized)\n",
|
||||
"Лучшие параметры: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 2, 'max_features': 'sqrt', 'n_estimators': 20}\n",
|
||||
"Accuracy: 0.7778\n",
|
||||
"Precision: 0.6000\n",
|
||||
"Recall: 1.0000\n",
|
||||
"F1 Score: 0.7500\n",
|
||||
"ROC AUC: 0.8333\n",
|
||||
"Cross-validated F1 Score: 1.0000\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(\"TF-IDF Model (Optimized)\")\n",
|
||||
"model_tfidf = train_and_evaluate(X_tfidf, y, optimize=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
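||||
"Можно сделать вывод, что подбор гиперпараметров поиском по сетке в данном случае не улучшил качество модели: F1 на кросс-валидации остался равным 1.0000, а показатели на тестовой выборке снизились (Accuracy 0.7778 против 0.8889, F1 0.7500 против 0.8571 у модели с параметрами по умолчанию). При таком малом объёме данных (тестовая выборка состоит всего из 9 документов) оценки метрик нестабильны, поэтому идеальные значения на кросс-валидации не гарантируют столь же высокого качества на новых данных."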
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
424
lab_9.ipynb
Normal file
424
lab_9.ipynb
Normal file
File diff suppressed because one or more lines are too long
2912
poetry.lock
generated
2912
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -7,19 +7,36 @@ readme = "readme.md"
|
||||
package-mode = false
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.12"
|
||||
python = ">=3.12,<3.13"
|
||||
jupyter = "^1.1.1"
|
||||
numpy = "^2.1.0"
|
||||
numpy = "^1.26.4"
|
||||
pandas = "^2.2.2"
|
||||
matplotlib = "^3.9.2"
|
||||
flask = "^3.0.3"
|
||||
apiflask = "^2.2.0"
|
||||
flask-cors = "^5.0.0"
|
||||
scikit-learn = "^1.5.2"
|
||||
imbalanced-learn = "^0.12.3"
|
||||
ipykernel = "^6.29.5"
|
||||
imbalanced-learn = "^0.12.4"
|
||||
seaborn = "^0.13.2"
|
||||
featuretools = "^1.31.0"
|
||||
gymnasium = "^1.0.0"
|
||||
scikit-fuzzy = "^0.5.0"
|
||||
networkx = "^3.4.2"
|
||||
spacy = "^3.7.5"
|
||||
docx = "^0.2.4"
|
||||
emoji = "^2.14.1"
|
||||
num2words = "^0.5.14"
|
||||
nltk = "^3.9.1"
|
||||
python-docx = "^1.1.2"
|
||||
opencv-python = "^4.11.0.86"
|
||||
mahotas = "^1.4.18"
|
||||
albumentations = "^2.0.5"
|
||||
ru_core_news_lg = {url = "https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.7.0/ru_core_news_lg-3.7.0-py3-none-any.whl"}
|
||||
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
ipykernel = "^6.29.5"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
|
||||
Reference in New Issue
Block a user