7 Commits
main ... lab9

Author SHA1 Message Date
fda87f53d5 lab9 2025-05-16 22:38:28 +04:00
f6bdab7f5b 1 2025-05-16 22:31:35 +04:00
e35a826ccd lab7 2025-04-18 21:26:06 +04:00
8e9ddc5b7c lab3 2024-11-29 18:17:46 +04:00
9ee8efec42 lab2 2024-11-29 18:04:52 +04:00
d59680bbe0 lab2done 2024-11-23 07:56:39 +04:00
278e197032 lab1 2024-11-22 22:56:37 +04:00
68 changed files with 85571 additions and 0 deletions

2
.flake8 Normal file
View File

@@ -0,0 +1,2 @@
[flake8]
max-line-length = 120

13
.vscode/extensions.json vendored Normal file
View File

@@ -0,0 +1,13 @@
{
"recommendations": [
"ms-python.black-formatter",
"ms-python.flake8",
"ms-python.isort",
"ms-toolsai.jupyter",
"ms-toolsai.datawrangler",
"ms-python.python",
"donjayamanne.python-environment-manager",
// optional
"usernamehw.errorlens"
]
}

16
.vscode/launch.json vendored Normal file
View File

@@ -0,0 +1,16 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "mai-service",
"type": "debugpy",
"request": "launch",
"program": "run.py",
"console": "integratedTerminal",
"justMyCode": true
}
]
}

38
.vscode/settings.json vendored Normal file
View File

@@ -0,0 +1,38 @@
{
"files.autoSave": "onFocusChange",
"files.exclude": {
"**/__pycache__": true
},
"editor.detectIndentation": false,
"editor.formatOnType": false,
"editor.formatOnPaste": true,
"editor.formatOnSave": true,
"editor.tabSize": 4,
"editor.insertSpaces": true,
"editor.codeActionsOnSave": {
"source.organizeImports": "explicit",
"source.sortImports": "explicit"
},
"editor.stickyScroll.enabled": false,
"diffEditor.ignoreTrimWhitespace": false,
"debug.showVariableTypes": true,
"workbench.editor.highlightModifiedTabs": true,
"git.suggestSmartCommit": false,
"git.autofetch": true,
"git.openRepositoryInParentFolders": "always",
"git.confirmSync": false,
"errorLens.gutterIconsEnabled": true,
"errorLens.messageEnabled": false,
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",
},
"python.languageServer": "Pylance",
"python.analysis.typeCheckingMode": "basic",
"python.analysis.autoImportCompletions": true,
"isort.args": [
"--profile",
"black"
],
"notebook.lineNumbers": "on",
"notebook.output.minimalErrorRendering": true,
}

View File

@@ -0,0 +1,8 @@
{
"folders": [
{
"path": "."
}
],
"settings": {}
}

52
backend/__init__.py Normal file
View File

@@ -0,0 +1,52 @@
import importlib
import os
import traceback

import matplotlib
from apiflask import APIBlueprint, APIFlask
from flask_cors import CORS

matplotlib.use("agg")

cors = CORS()
api_bp = APIBlueprint("api", __name__, url_prefix="/api/v1")

dataset_path: str | None = None


class Config:
    SECRET_KEY = "secret!"
    SEND_FILE_MAX_AGE_DEFAULT = -1


def create_app():
    global dataset_path

    # Create and configure app
    app = APIFlask(
        "MAI Service",
        title="MAI Service API",
        docs_path="/",
        version="1.0",
        static_folder="",
        template_folder="",
    )
    app.config.from_object(Config)

    dataset_path = os.path.join(app.instance_path, "dataset")
    os.makedirs(dataset_path, exist_ok=True)

    @app.errorhandler(Exception)
    def my_error_processor(error):
        traceback.print_exception(error)
        return {"message": str(error), "detail": "No details"}, 500

    # Import custom REST methods
    importlib.import_module("backend.api")

    # Enable REST API
    app.register_blueprint(api_bp)

    # Enable app extensions
    cors.init_app(app)

    return app
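
A quick way to sanity-check this application factory is Flask's built-in test client. The sketch below is for illustration only and assumes the backend package and its dependencies are importable:

# Illustration: exercise the app factory without starting a server.
from backend import create_app

app = create_app()

with app.test_client() as client:
    # The OpenAPI docs are mounted at the root path (docs_path="/").
    print(client.get("/").status_code)
    # The dataset listing endpoint lives under the blueprint prefix /api/v1.
    print(client.get("/api/v1/dataset").get_json())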

57
backend/api.py Normal file
View File

@@ -0,0 +1,57 @@
from apiflask import FileSchema, Schema, fields
from flask import send_file

from backend import api_bp, dataset_path
from backend.service import Service


class FileUpload(Schema):
    file = fields.File(required=True)


class ColumnInfoDto(Schema):
    datatype = fields.String()
    items = fields.List(fields.String())


class TableColumnDto(Schema):
    name = fields.String()
    datatype = fields.String()
    items = fields.List(fields.String())


service = Service(dataset_path)


@api_bp.post("/dataset")
@api_bp.input(FileUpload, location="files")
def upload_dataset(files_data):
    uploaded_file = files_data["file"]
    return service.upload_dataset(uploaded_file)


@api_bp.get("/dataset")
def get_all_datasets():
    return service.get_all_datasets()


@api_bp.get("/dataset/<string:name>")
@api_bp.output(TableColumnDto(many=True))
def get_dataset_info(name: str):
    return service.get_dataset_info(name)


@api_bp.get("/dataset/<string:name>/<string:column>")
@api_bp.output(ColumnInfoDto)
def get_column_info(name: str, column: str):
    return service.get_column_info(name, column)


@api_bp.get("/dataset/draw/hist/<string:name>/<string:column>")
@api_bp.output(
    FileSchema(type="string", format="binary"), content_type="image/png", example=""
)
def get_dataset_hist(name: str, column: str):
    data = service.get_hist(name, column)
    data.seek(0)
    return send_file(data, download_name=f"{name}.hist.png", mimetype="image/png")
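
Once the service is running (run.py binds it to http://127.0.0.1:8080), these endpoints can be exercised with any HTTP client. A minimal sketch with the requests library follows; the file name and column name in it are placeholders, not part of the API:

# Illustration: calling the REST API; "lab1.csv" and "price" are example names only.
import requests

BASE = "http://127.0.0.1:8080/api/v1"

# Upload a CSV dataset; the endpoint returns the stored file name as plain text.
with open("lab1.csv", "rb") as f:
    uploaded_name = requests.post(f"{BASE}/dataset", files={"file": f}).text.strip()

# List uploaded datasets and inspect the columns of the one just uploaded.
print(requests.get(f"{BASE}/dataset").json())
print(requests.get(f"{BASE}/dataset/{uploaded_name}").json())

# Fetch a histogram PNG for a single column and save it to disk.
png = requests.get(f"{BASE}/dataset/draw/hist/{uploaded_name}/price")
with open("hist.png", "wb") as out:
    out.write(png.content)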

59
backend/service.py Normal file
View File

@@ -0,0 +1,59 @@
import io
import os
import pathlib
from typing import BinaryIO, Dict, List

import pandas as pd
from matplotlib.figure import Figure
from werkzeug.datastructures import FileStorage
from werkzeug.utils import secure_filename


class Service:
    def __init__(self, dataset_path: str | None) -> None:
        if dataset_path is None:
            raise Exception("Dataset path is not defined")
        self.__path: str = dataset_path

    def __get_dataset(self, filename: str) -> pd.DataFrame:
        full_file_name = os.path.join(self.__path, secure_filename(filename))
        return pd.read_csv(full_file_name)

    def upload_dataset(self, file: FileStorage) -> str:
        if file.filename is None:
            raise Exception("Dataset upload error")
        file_name: str = file.filename
        full_file_name = os.path.join(self.__path, secure_filename(file_name))
        file.save(full_file_name)
        return file_name

    def get_all_datasets(self) -> List[str]:
        return [file.name for file in pathlib.Path(self.__path).glob("*.csv")]

    def get_dataset_info(self, filename) -> List[Dict]:
        dataset = self.__get_dataset(filename)
        dataset_info = []
        for column in dataset.columns:
            items = dataset[column].astype(str)
            column_info = {
                "name": column,
                "datatype": dataset.dtypes[column],
                "items": items,
            }
            dataset_info.append(column_info)
        return dataset_info

    def get_column_info(self, filename, column) -> Dict:
        dataset = self.__get_dataset(filename)
        datatype = dataset.dtypes[column]
        items = sorted(dataset[column].astype(str).unique())
        return {"datatype": datatype, "items": items}

    def get_hist(self, filename, column) -> BinaryIO:
        dataset = self.__get_dataset(filename)
        bytes = io.BytesIO()
        plot: Figure | None = dataset.plot.hist(column=[column], bins=80).get_figure()
        if plot is None:
            raise Exception("Can't create hist plot")
        plot.savefig(bytes, dpi=300, format="png")
        return bytes
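
The Service class can also be used on its own, outside the Flask app. A small sketch, assuming a local folder that already holds at least one CSV file (the path below is only an example; create_app() stores datasets under <instance_path>/dataset):

# Illustration: using Service directly; "instance/dataset" is an example path.
from backend.service import Service

service = Service("instance/dataset")

for dataset_name in service.get_all_datasets():
    # get_dataset_info returns one dict per column: name, dtype and stringified values.
    for column in service.get_dataset_info(dataset_name):
        print(dataset_name, column["name"], column["datatype"])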

53944
data/Diamonds-Prices.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,6 @@
Rank ,Name,Networth,Age,Country,Source,Industry
1,Elon Musk ,219,50,United States,"Tesla, SpaceX",Automotive
2,Jeff Bezos ,171,58,United States,Amazon,Technology
3,Bernard Arnault & family ,158,73,France,LVMH,Fashion & Retail
4,Bill Gates ,129,66,United States,Microsoft,Technology
5,Warren Buffett ,118,91,United States,Berkshire Hathaway,Finance & Investments

2601
data/Forbes Billionaires.csv Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

BIN
data/text/tz_01.docx Normal file

Binary file not shown.

BIN
data/text/tz_02.docx Normal file

Binary file not shown.

BIN
data/text/tz_03.docx Normal file

Binary file not shown.

BIN
data/text/tz_04.docx Normal file

Binary file not shown.

BIN
data/text/tz_05.docx Normal file

Binary file not shown.

BIN
data/text/tz_06.docx Normal file

Binary file not shown.

BIN
data/text/tz_07.docx Normal file

Binary file not shown.

BIN
data/text/tz_08.docx Normal file

Binary file not shown.

BIN
data/text/tz_09.docx Normal file

Binary file not shown.

BIN
data/text/tz_10.docx Normal file

Binary file not shown.

BIN
data/text/tz_11.docx Normal file

Binary file not shown.

BIN
data/text/tz_12.docx Normal file

Binary file not shown.

BIN
data/text/tz_13.docx Normal file

Binary file not shown.

BIN
data/text/tz_14.docx Normal file

Binary file not shown.

BIN
data/text/tz_15.docx Normal file

Binary file not shown.

BIN
data/text/tz_16.docx Normal file

Binary file not shown.

BIN
data/text/tz_17.docx Normal file

Binary file not shown.

BIN
data/text/tz_18.docx Normal file

Binary file not shown.

BIN
data/text/tz_19.docx Normal file

Binary file not shown.

BIN
data/text/tz_20.docx Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

5111
lab1.csv Normal file

File diff suppressed because it is too large Load Diff

653
lab1.ipynb Normal file

File diff suppressed because one or more lines are too long

2063
lab2.ipynb Normal file

File diff suppressed because one or more lines are too long

1112
lab3.ipynb Normal file

File diff suppressed because one or more lines are too long

3436
lab4.ipynb Normal file

File diff suppressed because one or more lines are too long

1477
lab5.ipynb Normal file

File diff suppressed because one or more lines are too long

1921
lab_7.ipynb Normal file

File diff suppressed because one or more lines are too long

693
lab_8.ipynb Normal file
View File

@@ -0,0 +1,693 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Лабораторная работа 8\n",
"\n",
"Выбранный датасет: Технические задания и статьи по ИТ (кластеризация, классификация).\n",
"\n",
"Выбранный метод машинного обучения: классификация.\n",
"\n",
"Задача анализа текстов: разработка модели, которая сможет автоматически определять категорию, к которой относится текст (в данном случае, ТЗ или статья)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Импорт библиотеки и инициализация модуля для анализа текста:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"import spacy\n",
"\n",
"sp = spacy.load(\"ru_core_news_lg\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Загрузка текстов из файлов с расширением .docx в датафрейм:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" doc \\\n",
"15 tz_16.docx \n",
"16 tz_17.docx \n",
"17 tz_18.docx \n",
"18 tz_19.docx \n",
"19 tz_20.docx \n",
"20 Архитектура, управляемая модель.docx \n",
"21 Введение в проектирование ИС.docx \n",
"22 Встроенные операторы SQL.docx \n",
"23 Методологии разработки программного обеспечени... \n",
"24 Методологии разработки программного обеспечени... \n",
"\n",
" text type \n",
"15 2.2\\tТехническое задание\\n2.2.1\\tОбщие сведени... 0 \n",
"16 2.2 Техническое задание.\\n2.2.1 Общие сведения... 0 \n",
"17 2.2. Техническое задание\\nОбщие сведения:\\nПол... 0 \n",
"18 2.2. Техническое задание\\n2.2.1. Наименование ... 0 \n",
"19 2.2. Техническое задание\\n2.2.1. Общие сведени... 0 \n",
"20 Архитектура, управляемая модель\\nАббревиатура ... 1 \n",
"21 1. ВВЕДЕНИЕ В ПРОЕКТИРОВАНИЕ ИНФОРМАЦИОННЫХ СИ... 1 \n",
"22 Встроенные операторы SQL. \\nКак было отмечено ... 1 \n",
"23 Методологии разработки программного обеспечени... 1 \n",
"24 Методологии разработки программного обеспечени... 1 \n"
]
}
],
"source": [
"import pandas as pd\n",
"from docx import Document\n",
"import os\n",
"\n",
"def read_docx(file_path):\n",
" doc = Document(file_path)\n",
" full_text = []\n",
" for paragraph in doc.paragraphs:\n",
" full_text.append(paragraph.text)\n",
" return \"\\n\".join(full_text)\n",
"\n",
"def load_docs(dataset_path):\n",
" df = pd.DataFrame(columns=[\"doc\", \"text\"])\n",
" for file_path in os.listdir(dataset_path):\n",
" if file_path.startswith(\"~$\"):\n",
" continue\n",
" text = read_docx(dataset_path + file_path)\n",
" df.loc[len(df.index)] = [file_path, text]\n",
" return df\n",
"\n",
"df = load_docs(\"./data/text/\")\n",
"df[\"type\"] = df.apply(\n",
" lambda row: 0 if str(row[\"doc\"]).startswith(\"tz_\") else 1, axis=1\n",
")\n",
"df.sort_values(by=[\"doc\"], inplace=True)\n",
"\n",
"print(df.iloc[15:25])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Предобработка текста.\n",
"\n",
"Трансформация:"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import emoji\n",
"from num2words import num2words\n",
"\n",
"# Функция для преобразования эмоджи в слова\n",
"def emojis_words(text):\n",
" \n",
" # Модуль emoji: преобразование эмоджи в их словесные описания\n",
" text = emoji.demojize(text, delimiters=(\" \", \" \"))\n",
" \n",
" # Редактирование текста путём замены \":\" и\" _\", а так же - путём добавления пробела между отдельными словами\n",
" text = text.replace(\":\", \"\").replace(\"_\", \" \")\n",
" \n",
" return text\n",
"\n",
"def transform_text(text):\n",
" # Удаление из текста всех HTML-тегов\n",
" text = re.sub(r'<[^<]+?>', '', text)\n",
" \n",
" # Удаление из текста всех URL и ссылок\n",
" text = re.sub(r'http\\S+', '', text)\n",
"\n",
" # Преобразование эмоджи в текст\n",
" text = emojis_words(text)\n",
"\n",
" # Приведение к нижнему регистру\n",
" text = text.lower()\n",
"\n",
" # Удаление лишних пробелов\n",
" text = re.sub(r'\\s+', ' ', text) \n",
" \n",
" # Преобразование \"ё\" в \"е\"\n",
" text = text.replace(\"ё\", \"е\")\n",
"\n",
" # Удаление всех специальных символов\n",
" text = re.sub(r'[^a-zA-Zа-яА-Я0-9\\s]', '', text)\n",
"\n",
" # Преобразование чисел в слова\n",
" words: list[str] = text.split()\n",
" words = [num2words(word, lang=\"ru\") if word.isdigit() else word for word in words]\n",
" text = \" \".join(words)\n",
"\n",
" # Удаление из текста всех знаков препинания\n",
" text = re.sub(r'[^\\w\\s]', '', text)\n",
"\n",
" return text\n",
"\n",
"df[\"preprocessed_text\"] = df[\"text\"].apply(transform_text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Для выполнения токенизации, определения частей речи (POS tagging), нормализации (в данном случае применяется лемматизация) и фильтрации данных будем использовать библиотеку spaCy. На этапе фильтрации с целью уменьшения размерности пространства признаков задействуем словарь стоп-слов, а также исключим все слова, длина которых превышает 20 символов."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"двадцать_NUM_Case=Nom технический_ADJ_Case=Nom|Degree=Pos|Gender=Neut|Number=Sing задание_NOUN_Animacy=Inan|Case=Nom|Gender=Neut|Number=Sing двести_NUM_Case=Nom двадцать_NUM_Case=Nom общий_ADJ_Case=Nom|Degree=Pos|Number=Plur сведение_NOUN_Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing полный_ADJ_Case=Nom|Degree=Pos|Gender=Neut|Number=Sing наименование_NOUN_Animacy=Inan|Case=Nom|Gender=Neut|Number=Sing система_NOUN_Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing\n"
]
}
],
"source": [
"\n",
"from nltk.corpus import stopwords\n",
"\n",
"\n",
"stop_words = set(stopwords.words('russian'))\n",
"\n",
"def preprocess_text(text):\n",
" doc = sp(text)\n",
" \n",
" filtered_tokens = [\n",
"\n",
" f\"{token.lemma_}_{token.pos_}_{token.morph}\" # Формирование строки с нужным форматом\n",
" for token in doc\n",
"\n",
" if token.text not in stop_words and len(token.text) <= 20 # Фильтрация \n",
"\n",
" ]\n",
" \n",
"\n",
" return \" \".join(filtered_tokens)\n",
"\n",
"\n",
"df[\"preprocessed_text\"] = df[\"preprocessed_text\"].apply(preprocess_text)\n",
"\n",
"\n",
"# Выведем 10 токенов из первого текста\n",
"\n",
"first_text_tokens = df[\"preprocessed_text\"].iloc[0].split()[:10]\n",
"\n",
"print(\" \".join(first_text_tokens))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Перейдем к этапу формирования N-грамм:\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt_tab to\n",
"[nltk_data] D:\\Users\\Leo\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt_tab is already up-to-date!\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" doc \\\n",
"15 tz_16.docx \n",
"16 tz_17.docx \n",
"17 tz_18.docx \n",
"18 tz_19.docx \n",
"19 tz_20.docx \n",
"20 Архитектура, управляемая модель.docx \n",
"21 Введение в проектирование ИС.docx \n",
"22 Встроенные операторы SQL.docx \n",
"23 Методологии разработки программного обеспечени... \n",
"24 Методологии разработки программного обеспечени... \n",
"\n",
" text type \\\n",
"15 2.2\\tТехническое задание\\n2.2.1\\tОбщие сведени... 0 \n",
"16 2.2 Техническое задание.\\n2.2.1 Общие сведения... 0 \n",
"17 2.2. Техническое задание\\nОбщие сведения:\\nПол... 0 \n",
"18 2.2. Техническое задание\\n2.2.1. Наименование ... 0 \n",
"19 2.2. Техническое задание\\n2.2.1. Общие сведени... 0 \n",
"20 Архитектура, управляемая модель\\nАббревиатура ... 1 \n",
"21 1. ВВЕДЕНИЕ В ПРОЕКТИРОВАНИЕ ИНФОРМАЦИОННЫХ СИ... 1 \n",
"22 Встроенные операторы SQL. \\nКак было отмечено ... 1 \n",
"23 Методологии разработки программного обеспечени... 1 \n",
"24 Методологии разработки программного обеспечени... 1 \n",
"\n",
" preprocessed_text \\\n",
"15 двадцать_NUM_Case=Nom технический_ADJ_Case=Nom... \n",
"16 двадцать_NUM_Case=Nom технический_ADJ_Case=Nom... \n",
"17 двадцать_NUM_Case=Nom технический_ADJ_Case=Nom... \n",
"18 двадцать_NUM_Case=Nom технический_ADJ_Case=Nom... \n",
"19 двадцать_NUM_Case=Nom технический_ADJ_Case=Nom... \n",
"20 архитектура_NOUN_Animacy=Inan|Case=Nom|Gender=... \n",
"21 введение_NOUN_Animacy=Inan|Case=Nom|Gender=Neu... \n",
"22 встроенные_ADJ_Case=Nom|Degree=Pos|Number=Plur... \n",
"23 методология_NOUN_Animacy=Inan|Case=Gen|Gender=... \n",
"24 методология_NOUN_Animacy=Inan|Case=Gen|Gender=... \n",
"\n",
" bigrams \\\n",
"15 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n",
"16 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n",
"17 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n",
"18 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n",
"19 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n",
"20 [(архитектура_NOUN_Animacy=Inan|Case=Nom|Gende... \n",
"21 [(введение_NOUN_Animacy=Inan|Case=Nom|Gender=N... \n",
"22 [(встроенные_ADJ_Case=Nom|Degree=Pos|Number=Pl... \n",
"23 [(методология_NOUN_Animacy=Inan|Case=Gen|Gende... \n",
"24 [(методология_NOUN_Animacy=Inan|Case=Gen|Gende... \n",
"\n",
" trigrams \n",
"15 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n",
"16 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n",
"17 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n",
"18 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n",
"19 [(двадцать_NUM_Case=Nom, технический_ADJ_Case=... \n",
"20 [(архитектура_NOUN_Animacy=Inan|Case=Nom|Gende... \n",
"21 [(введение_NOUN_Animacy=Inan|Case=Nom|Gender=N... \n",
"22 [(встроенные_ADJ_Case=Nom|Degree=Pos|Number=Pl... \n",
"23 [(методология_NOUN_Animacy=Inan|Case=Gen|Gende... \n",
"24 [(методология_NOUN_Animacy=Inan|Case=Gen|Gende... \n"
]
}
],
"source": [
"import nltk\n",
"from nltk.util import ngrams\n",
"from nltk.tokenize import word_tokenize\n",
"nltk.download(\"punkt_tab\")\n",
"def generate_ngrams(text: str, n: int = 2) -> list[tuple]:\n",
" tokens: list[str] = word_tokenize(text, language=\"russian\")\n",
" \n",
" n_grams: list[tuple] = list(ngrams(tokens, n))\n",
" return n_grams\n",
"\n",
"# Пример для биграмм (N=2)\n",
"df[\"bigrams\"] = df[\"preprocessed_text\"].apply(lambda x: generate_ngrams(x, n=2))\n",
"\n",
"# Пример для триграмм (N=3)\n",
"df[\"trigrams\"] = df[\"preprocessed_text\"].apply(lambda x: generate_ngrams(x, n=3))\n",
"\n",
"print(df.iloc[15:25])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Теперь применим методы для векторизации текста.\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Мешок слов:"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" тутто_noun_animacy взаимоотношение_noun_animacy иннкпп_propn_animacy \\\n",
"15 0 0 0 \n",
"16 0 0 0 \n",
"17 0 0 0 \n",
"18 0 0 0 \n",
"19 0 0 0 \n",
"20 0 0 0 \n",
"21 0 1 0 \n",
"22 0 0 0 \n",
"23 0 0 0 \n",
"24 0 0 0 \n",
"25 0 0 0 \n",
"\n",
" gif_propn_foreign накладывать_verb_aspect \\\n",
"15 0 0 \n",
"16 0 0 \n",
"17 0 0 \n",
"18 0 0 \n",
"19 0 0 \n",
"20 0 0 \n",
"21 0 0 \n",
"22 0 0 \n",
"23 0 0 \n",
"24 0 0 \n",
"25 0 1 \n",
"\n",
" метрологическому_propn_animacy связанность_noun_animacy \\\n",
"15 0 0 \n",
"16 1 0 \n",
"17 0 0 \n",
"18 0 0 \n",
"19 0 0 \n",
"20 0 0 \n",
"21 0 0 \n",
"22 0 0 \n",
"23 0 0 \n",
"24 0 0 \n",
"25 0 0 \n",
"\n",
" модернизировать_verb_aspect инструментальный_adj_case \\\n",
"15 0 0 \n",
"16 0 0 \n",
"17 0 0 \n",
"18 0 0 \n",
"19 0 0 \n",
"20 0 0 \n",
"21 0 1 \n",
"22 0 0 \n",
"23 0 0 \n",
"24 0 0 \n",
"25 0 1 \n",
"\n",
" достаточно_adv_degree \n",
"15 0 \n",
"16 0 \n",
"17 0 \n",
"18 0 \n",
"19 0 \n",
"20 0 \n",
"21 6 \n",
"22 1 \n",
"23 8 \n",
"24 3 \n",
"25 15 \n"
]
}
],
"source": [
"from scipy import sparse\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"import numpy as np\n",
"\n",
"counts_vectorizer = CountVectorizer()\n",
"counts_matrix = sparse.csr_matrix(counts_vectorizer.fit_transform(df[\"preprocessed_text\"]))\n",
"counts_df = pd.DataFrame(\n",
" counts_matrix.toarray(),\n",
" columns=counts_vectorizer.get_feature_names_out(),\n",
")\n",
"\n",
"random_columns = np.random.choice(counts_df.columns, size=10, replace=False)\n",
"\n",
"print(counts_df.loc[15:25, random_columns]) "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Частотный портрет:"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" тутто_noun_animacy взаимоотношение_noun_animacy иннкпп_propn_animacy \\\n",
"15 0.0 0.000000 0.0 \n",
"16 0.0 0.000000 0.0 \n",
"17 0.0 0.000000 0.0 \n",
"18 0.0 0.000000 0.0 \n",
"19 0.0 0.000000 0.0 \n",
"20 0.0 0.000000 0.0 \n",
"21 0.0 0.022338 0.0 \n",
"22 0.0 0.000000 0.0 \n",
"23 0.0 0.000000 0.0 \n",
"24 0.0 0.000000 0.0 \n",
"25 0.0 0.000000 0.0 \n",
"\n",
" gif_propn_foreign накладывать_verb_aspect \\\n",
"15 0.0 0.00000 \n",
"16 0.0 0.00000 \n",
"17 0.0 0.00000 \n",
"18 0.0 0.00000 \n",
"19 0.0 0.00000 \n",
"20 0.0 0.00000 \n",
"21 0.0 0.00000 \n",
"22 0.0 0.00000 \n",
"23 0.0 0.00000 \n",
"24 0.0 0.00000 \n",
"25 0.0 0.02162 \n",
"\n",
" метрологическому_propn_animacy связанность_noun_animacy \\\n",
"15 0.000000 0.0 \n",
"16 0.042399 0.0 \n",
"17 0.000000 0.0 \n",
"18 0.000000 0.0 \n",
"19 0.000000 0.0 \n",
"20 0.000000 0.0 \n",
"21 0.000000 0.0 \n",
"22 0.000000 0.0 \n",
"23 0.000000 0.0 \n",
"24 0.000000 0.0 \n",
"25 0.000000 0.0 \n",
"\n",
" модернизировать_verb_aspect инструментальный_adj_case \\\n",
"15 0.0 0.000000 \n",
"16 0.0 0.000000 \n",
"17 0.0 0.000000 \n",
"18 0.0 0.000000 \n",
"19 0.0 0.000000 \n",
"20 0.0 0.000000 \n",
"21 0.0 0.017277 \n",
"22 0.0 0.000000 \n",
"23 0.0 0.000000 \n",
"24 0.0 0.000000 \n",
"25 0.0 0.018585 \n",
"\n",
" достаточно_adv_degree \n",
"15 0.000000 \n",
"16 0.000000 \n",
"17 0.000000 \n",
"18 0.000000 \n",
"19 0.000000 \n",
"20 0.000000 \n",
"21 0.033501 \n",
"22 0.025389 \n",
"23 0.047452 \n",
"24 0.036795 \n",
"25 0.047864 \n"
]
}
],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True)\n",
"tfidf_matrix = sparse.csr_matrix(tfidf_vectorizer.fit_transform(df[\"preprocessed_text\"]))\n",
"tfidf_df = pd.DataFrame(\n",
" tfidf_matrix.toarray(),\n",
" columns=tfidf_vectorizer.get_feature_names_out(),\n",
")\n",
"\n",
"print(tfidf_df.loc[15:25, random_columns]) "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Обучение модели и проверка ее качества:"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TF-IDF Model\n",
"Accuracy: 0.8889\n",
"Precision: 0.7500\n",
"Recall: 1.0000\n",
"F1 Score: 0.8571\n",
"ROC AUC: 0.9167\n",
"Cross-validated F1 Score: 1.0000\n",
"\n",
"Count Vectorizer Model\n",
"Accuracy: 1.0000\n",
"Precision: 1.0000\n",
"Recall: 1.0000\n",
"F1 Score: 1.0000\n",
"ROC AUC: 1.0000\n",
"Cross-validated F1 Score: 0.9333\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n",
"\n",
"def train_and_evaluate(X, y, test_size=0.2, cv=5, optimize=False):\n",
" X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=9)\n",
"\n",
" if optimize:\n",
" param_grid = {\n",
" \"n_estimators\": [10, 20, 30, 40, 50, 100, 150, 200, 250, 500],\n",
" \"max_features\": [\"sqrt\", \"log2\", 2],\n",
" \"max_depth\": [2, 3, 4, 5, 6, 7, 8, 9, 10],\n",
" \"criterion\": [\"gini\", \"entropy\", \"log_loss\"],\n",
" \"class_weight\": [\"balanced\", \"balanced_subsample\"]\n",
" }\n",
"\n",
" grid_search = GridSearchCV(RandomForestClassifier(random_state=9), param_grid, scoring=\"f1\", cv=cv, n_jobs=-1)\n",
" grid_search.fit(X_train, y_train)\n",
" model = grid_search.best_estimator_\n",
" print(f\"Лучшие параметры: {grid_search.best_params_}\")\n",
" else:\n",
" model = RandomForestClassifier(n_estimators=100, random_state=9)\n",
" model.fit(X_train, y_train)\n",
"\n",
" y_pred = model.predict(X_test)\n",
"\n",
" accuracy = accuracy_score(y_test, y_pred)\n",
" precision = precision_score(y_test, y_pred)\n",
" recall = recall_score(y_test, y_pred)\n",
" f1 = f1_score(y_test, y_pred)\n",
" roc_auc = roc_auc_score(y_test, y_pred)\n",
"\n",
" print(f\"Accuracy: {accuracy:.4f}\")\n",
" print(f\"Precision: {precision:.4f}\")\n",
" print(f\"Recall: {recall:.4f}\")\n",
" print(f\"F1 Score: {f1:.4f}\")\n",
" print(f\"ROC AUC: {roc_auc:.4f}\")\n",
"\n",
" scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1')\n",
" f1_cv = scores.mean()\n",
" print(f\"Cross-validated F1 Score: {f1_cv:.4f}\")\n",
"\n",
" return model\n",
"\n",
"X_tfidf = tfidf_df\n",
"X_counts = counts_df\n",
"y = df[\"type\"]\n",
"\n",
"print(\"TF-IDF Model\")\n",
"model_tfidf = train_and_evaluate(X_tfidf, y)\n",
"\n",
"print(\"\\nCount Vectorizer Model\")\n",
"model_counts = train_and_evaluate(X_counts, y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Как видно, обе модели демонстрируют отличные результаты, причём вторая модель достигает практически идеальных показателей. Однако это может быть связано с небольшим объёмом данных в выборке (всего 41 документ). Вероятно, модель просто запомнила данные, что привело к её переобучению."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Кроме того, в соответствии с заданием, оценим решение, используя альтернативные гиперпараметры модели машинного обучения, которые будут подобраны с помощью метода поиска по сетке."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TF-IDF Model (Optimized)\n",
"Лучшие параметры: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 2, 'max_features': 'sqrt', 'n_estimators': 20}\n",
"Accuracy: 0.7778\n",
"Precision: 0.6000\n",
"Recall: 1.0000\n",
"F1 Score: 0.7500\n",
"ROC AUC: 0.8333\n",
"Cross-validated F1 Score: 1.0000\n"
]
}
],
"source": [
"print(\"TF-IDF Model (Optimized)\")\n",
"model_tfidf = train_and_evaluate(X_tfidf, y, optimize=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Можно сделать вывод, что в данном случае существует возможность настроить гиперпараметры модели таким образом, что её показатели согласно метрикам достигнут практически идеального уровня."
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

424
lab_9.ipynb Normal file

File diff suppressed because one or more lines are too long

5343
poetry.lock generated Normal file

File diff suppressed because it is too large Load Diff

2
poetry.toml Normal file
View File

@@ -0,0 +1,2 @@
[virtualenvs]
in-project = true

42
pyproject.toml Normal file
View File

@@ -0,0 +1,42 @@
[tool.poetry]
name = "mai"
version = "1.0.0"
description = "MAI labs"
authors = ["Tikhonenkov Alexey <tikhonenkov2015@gmail.com>"]
readme = "readme.md"
package-mode = false
[tool.poetry.dependencies]
python = ">=3.12,<3.13"
jupyter = "^1.1.1"
numpy = "^1.26.4"
pandas = "^2.2.2"
matplotlib = "^3.9.2"
flask = "^3.0.3"
apiflask = "^2.2.0"
flask-cors = "^5.0.0"
ipykernel = "^6.29.5"
imbalanced-learn = "^0.12.4"
seaborn = "^0.13.2"
featuretools = "^1.31.0"
gymnasium = "^1.0.0"
scikit-fuzzy = "^0.5.0"
networkx = "^3.4.2"
spacy = "^3.7.5"
docx = "^0.2.4"
emoji = "^2.14.1"
num2words = "^0.5.14"
nltk = "^3.9.1"
python-docx = "^1.1.2"
opencv-python = "^4.11.0.86"
mahotas = "^1.4.18"
albumentations = "^2.0.5"
ru_core_news_lg = {url = "https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.7.0/ru_core_news_lg-3.7.0-py3-none-any.whl"}
[tool.poetry.group.dev.dependencies]
ipykernel = "^6.29.5"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

0
readme copy.md Normal file
View File

16
run.py Normal file
View File

@@ -0,0 +1,16 @@
from backend import create_app

app = create_app()


def __main():
    app.run(
        host="127.0.0.1",
        port=8080,
        debug=True,
        use_reloader=False,
    )


if __name__ == "__main__":
    __main()