Compare commits

7 Commits

| Author | SHA1 | Date |
|---|---|---|
| | fda87f53d5 | |
| | f6bdab7f5b | |
| | e35a826ccd | |
| | 8e9ddc5b7c | |
| | 9ee8efec42 | |
| | d59680bbe0 | |
| | 278e197032 | |
`.vscode/extensions.json` (vendored, new file: 13 lines)

@@ -0,0 +1,13 @@
```jsonc
{
    "recommendations": [
        "ms-python.black-formatter",
        "ms-python.flake8",
        "ms-python.isort",
        "ms-toolsai.jupyter",
        "ms-toolsai.datawrangler",
        "ms-python.python",
        "donjayamanne.python-environment-manager",
        // optional
        "usernamehw.errorlens"
    ]
}
```
`.vscode/launch.json` (vendored, new file: 16 lines)

@@ -0,0 +1,16 @@
```jsonc
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "mai-service",
            "type": "debugpy",
            "request": "launch",
            "program": "run.py",
            "console": "integratedTerminal",
            "justMyCode": true
        }
    ]
}
```
`.vscode/settings.json` (vendored, new file: 38 lines)

@@ -0,0 +1,38 @@
```json
{
    "files.autoSave": "onFocusChange",
    "files.exclude": {
        "**/__pycache__": true
    },
    "editor.detectIndentation": false,
    "editor.formatOnType": false,
    "editor.formatOnPaste": true,
    "editor.formatOnSave": true,
    "editor.tabSize": 4,
    "editor.insertSpaces": true,
    "editor.codeActionsOnSave": {
        "source.organizeImports": "explicit",
        "source.sortImports": "explicit"
    },
    "editor.stickyScroll.enabled": false,
    "diffEditor.ignoreTrimWhitespace": false,
    "debug.showVariableTypes": true,
    "workbench.editor.highlightModifiedTabs": true,
    "git.suggestSmartCommit": false,
    "git.autofetch": true,
    "git.openRepositoryInParentFolders": "always",
    "git.confirmSync": false,
    "errorLens.gutterIconsEnabled": true,
    "errorLens.messageEnabled": false,
    "[python]": {
        "editor.defaultFormatter": "ms-python.black-formatter"
    },
    "python.languageServer": "Pylance",
    "python.analysis.typeCheckingMode": "basic",
    "python.analysis.autoImportCompletions": true,
    "isort.args": [
        "--profile",
        "black"
    ],
    "notebook.lineNumbers": "on",
    "notebook.output.minimalErrorRendering": true
}
```
`MAI_PIbd-33_Tikhonenkov_A_E.code-workspace` (new file: 8 lines)

@@ -0,0 +1,8 @@
```json
{
    "folders": [
        {
            "path": "."
        }
    ],
    "settings": {}
}
```
`backend/__init__.py` (new file: 52 lines)

@@ -0,0 +1,52 @@
```python
import importlib
import os
import traceback

import matplotlib
from apiflask import APIBlueprint, APIFlask
from flask_cors import CORS

matplotlib.use("agg")

cors = CORS()
api_bp = APIBlueprint("api", __name__, url_prefix="/api/v1")
dataset_path: str | None = None


class Config:
    SECRET_KEY = "secret!"
    SEND_FILE_MAX_AGE_DEFAULT = -1


def create_app():
    global dataset_path

    # Create and configure app
    app = APIFlask(
        "MAI Service",
        title="MAI Service API",
        docs_path="/",
        version="1.0",
        static_folder="",
        template_folder="",
    )
    app.config.from_object(Config)

    dataset_path = os.path.join(app.instance_path, "dataset")
    os.makedirs(dataset_path, exist_ok=True)

    @app.errorhandler(Exception)
    def my_error_processor(error):
        traceback.print_exception(error)
        return {"message": str(error), "detail": "No details"}, 500

    # Import custom REST methods
    importlib.import_module("backend.api")

    # Enable REST API
    app.register_blueprint(api_bp)

    # Enable app extensions
    cors.init_app(app)

    return app
```
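The debug configuration in `.vscode/launch.json` launches `run.py`, which is not included in this change set. A minimal sketch of what such an entry point could look like, assuming the package layout above and Flask's built-in development server (the file name, host, and port are assumptions, not part of the diff):

```python
# run.py: hypothetical entry point, not included in this diff.
# Assumes backend/__init__.py exposes create_app() as shown above.
from backend import create_app

app = create_app()

if __name__ == "__main__":
    # Flask's development server; host and port are illustrative defaults.
    app.run(host="127.0.0.1", port=5000, debug=True)
```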
`backend/api.py` (new file: 57 lines)

@@ -0,0 +1,57 @@
```python
from apiflask import FileSchema, Schema, fields
from flask import send_file

from backend import api_bp, dataset_path
from backend.service import Service


class FileUpload(Schema):
    file = fields.File(required=True)


class ColumnInfoDto(Schema):
    datatype = fields.String()
    items = fields.List(fields.String())


class TableColumnDto(Schema):
    name = fields.String()
    datatype = fields.String()
    items = fields.List(fields.String())


service = Service(dataset_path)


@api_bp.post("/dataset")
@api_bp.input(FileUpload, location="files")
def upload_dataset(files_data):
    uploaded_file = files_data["file"]
    return service.upload_dataset(uploaded_file)


@api_bp.get("/dataset")
def get_all_datasets():
    return service.get_all_datasets()


@api_bp.get("/dataset/<string:name>")
@api_bp.output(TableColumnDto(many=True))
def get_dataset_info(name: str):
    return service.get_dataset_info(name)


@api_bp.get("/dataset/<string:name>/<string:column>")
@api_bp.output(ColumnInfoDto)
def get_column_info(name: str, column: str):
    return service.get_column_info(name, column)


@api_bp.get("/dataset/draw/hist/<string:name>/<string:column>")
@api_bp.output(
    FileSchema(type="string", format="binary"), content_type="image/png", example=""
)
def get_dataset_hist(name: str, column: str):
    data = service.get_hist(name, column)
    data.seek(0)
    return send_file(data, download_name=f"{name}.hist.png", mimetype="image/png")
```
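For orientation, here is a hedged sketch of how the endpoints above could be exercised with the `requests` library; it assumes the service is running locally on Flask's default port and uses the sample CSV from `data/` (the base URL and file names are assumptions):

```python
# Hypothetical client walkthrough; not part of the diff.
import requests

BASE = "http://127.0.0.1:5000/api/v1"  # assumed host and port

# Upload a CSV as multipart/form-data (matches the FileUpload schema)
with open("data/Forbes Billionaires copy.csv", "rb") as f:
    print(requests.post(f"{BASE}/dataset", files={"file": f}).json())

# List uploaded datasets; werkzeug's secure_filename() will have replaced
# the spaces in the stored name with underscores
print(requests.get(f"{BASE}/dataset").json())

# Info for one column of the stored dataset
print(requests.get(f"{BASE}/dataset/Forbes_Billionaires_copy.csv/Country").json())
```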
`backend/service.py` (new file: 59 lines)

@@ -0,0 +1,59 @@
```python
import io
import os
import pathlib
from typing import BinaryIO, Dict, List

import pandas as pd
from matplotlib.figure import Figure
from werkzeug.datastructures import FileStorage
from werkzeug.utils import secure_filename


class Service:
    def __init__(self, dataset_path: str | None) -> None:
        if dataset_path is None:
            raise Exception("Dataset path is not defined")
        self.__path: str = dataset_path

    def __get_dataset(self, filename: str) -> pd.DataFrame:
        full_file_name = os.path.join(self.__path, secure_filename(filename))
        return pd.read_csv(full_file_name)

    def upload_dataset(self, file: FileStorage) -> str:
        if file.filename is None:
            raise Exception("Dataset upload error")
        file_name: str = file.filename
        full_file_name = os.path.join(self.__path, secure_filename(file_name))
        file.save(full_file_name)
        return file_name

    def get_all_datasets(self) -> List[str]:
        return [file.name for file in pathlib.Path(self.__path).glob("*.csv")]

    def get_dataset_info(self, filename) -> List[Dict]:
        dataset = self.__get_dataset(filename)
        dataset_info = []
        for column in dataset.columns:
            items = dataset[column].astype(str)
            column_info = {
                "name": column,
                "datatype": dataset.dtypes[column],
                "items": items,
            }
            dataset_info.append(column_info)
        return dataset_info

    def get_column_info(self, filename, column) -> Dict:
        dataset = self.__get_dataset(filename)
        datatype = dataset.dtypes[column]
        items = sorted(dataset[column].astype(str).unique())
        return {"datatype": datatype, "items": items}

    def get_hist(self, filename, column) -> BinaryIO:
        dataset = self.__get_dataset(filename)
        bytes = io.BytesIO()
        plot: Figure | None = dataset.plot.hist(column=[column], bins=80).get_figure()
        if plot is None:
            raise Exception("Can't create hist plot")
        plot.savefig(bytes, dpi=300, format="png")
        return bytes
```
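`Service` can also be smoke-tested without the Flask app; a sketch under the assumption that a temporary directory stands in for the instance path (the copied file name is chosen so that `secure_filename()` leaves it unchanged):

```python
# Hypothetical smoke test for Service; not part of the diff.
import os
import shutil
import tempfile

from backend.service import Service

tmp = tempfile.mkdtemp()
shutil.copy("data/Forbes Billionaires copy.csv", os.path.join(tmp, "billionaires.csv"))

service = Service(tmp)
print(service.get_all_datasets())   # ['billionaires.csv']
print(service.get_column_info("billionaires.csv", "Country"))
```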
`data/Diamonds-Prices.csv` (new file: 53,944 lines; diff suppressed because it is too large)
`data/Forbes Billionaires copy.csv` (new file: 6 lines)

@@ -0,0 +1,6 @@
```csv
Rank ,Name,Networth,Age,Country,Source,Industry
1,Elon Musk ,219,50,United States,"Tesla, SpaceX",Automotive
2,Jeff Bezos ,171,58,United States,Amazon,Technology
3,Bernard Arnault & family ,158,73,France,LVMH,Fashion & Retail
4,Bill Gates ,129,66,United States,Microsoft,Technology
5,Warren Buffett ,118,91,United States,Berkshire Hathaway,Finance & Investments
```
`data/Forbes Billionaires.csv` (new file: 2,601 lines; diff suppressed because it is too large)

`data/healthcare-dataset-stroke-data.csv` (new file: 5,111 lines; diff suppressed because it is too large)

`data/mobile-phone-price-prediction.csv` (new file: 1,371 lines; diff suppressed because it is too large)
New binary files, contents not shown:

- `data/text/tz_01.docx` through `data/text/tz_20.docx` (20 requirement-specification documents)
- `data/text/Архитектура, управляемая модель.docx`
- `data/text/Введение в проектирование ИС.docx`
- `data/text/Встроенные операторы SQL.docx`
- `data/text/Методологии разработки программного обеспечения 2.docx`
- `data/text/Методологии разработки программного обеспечения.docx`
- `data/text/Методы композиции и декомпозиции.docx`
- `data/text/Модели представления данных в СУБД.docx`
- `data/text/Некоторые особенности проектирования.docx`
- `data/text/Непроцедурный доступ к данным.docx`
- `data/text/Процедурное расширение языка SQL.docx`
- `data/text/Системные объекты базы данных.docx`
- `data/text/Технология создания распр ИС.docx`
- `data/text/Требования к проекту.docx`
- `data/text/Условия целостности БД.docx`
- `data/text/Характеристики СУБД.docx`
- `data/text/Этапы разработки проекта1.docx`
- `data/text/Этапы разработки проекта2.docx`
- `data/text/Этапы разработки проекта3.docx`
- `data/text/Этапы разработки проекта4.docx`
- `data/text/Этапы разработки проекта5.docx`
- `data/text/Язык манипуляции данными.docx`
`lab1.ipynb` (new file: 653 lines; diff suppressed because one or more lines are too long)

`lab2.ipynb` (new file: 2,063 lines; diff suppressed because one or more lines are too long)

`lab3.ipynb` (new file: 1,112 lines; diff suppressed because one or more lines are too long)

`lab4.ipynb` (new file: 3,436 lines; diff suppressed because one or more lines are too long)

`lab5.ipynb` (new file: 1,477 lines; diff suppressed because one or more lines are too long)

`lab_7.ipynb` (new file: 1,921 lines; diff suppressed because one or more lines are too long)
`lab_8.ipynb` (new file: 693 lines)

@@ -0,0 +1,693 @@

Rendered notebook cells (prose translated from Russian):

**Lab 8**

Chosen dataset: IT requirement specifications (TZ) and IT articles (clustering, classification).

Chosen machine-learning method: classification.

Text-analysis task: develop a model that can automatically determine which category a text belongs to (here: requirement specification or article).

Import the library and initialize the text-analysis module:
```python
import spacy

sp = spacy.load("ru_core_news_lg")
```
Load the texts from the `.docx` files into a dataframe:
```python
import pandas as pd
from docx import Document
import os

def read_docx(file_path):
    doc = Document(file_path)
    full_text = []
    for paragraph in doc.paragraphs:
        full_text.append(paragraph.text)
    return "\n".join(full_text)

def load_docs(dataset_path):
    df = pd.DataFrame(columns=["doc", "text"])
    for file_path in os.listdir(dataset_path):
        if file_path.startswith("~$"):
            continue
        text = read_docx(dataset_path + file_path)
        df.loc[len(df.index)] = [file_path, text]
    return df

df = load_docs("./data/text/")
df["type"] = df.apply(
    lambda row: 0 if str(row["doc"]).startswith("tz_") else 1, axis=1
)
df.sort_values(by=["doc"], inplace=True)

print(df.iloc[15:25])
```

Output:

```text
                                                  doc  \
15                                         tz_16.docx
16                                         tz_17.docx
17                                         tz_18.docx
18                                         tz_19.docx
19                                         tz_20.docx
20               Архитектура, управляемая модель.docx
21                  Введение в проектирование ИС.docx
22                     Встроенные операторы SQL.docx
23  Методологии разработки программного обеспечени...
24  Методологии разработки программного обеспечени...

                                                 text  type
15  2.2\tТехническое задание\n2.2.1\tОбщие сведени...     0
16  2.2 Техническое задание.\n2.2.1 Общие сведения...     0
17  2.2. Техническое задание\nОбщие сведения:\nПол...     0
18  2.2. Техническое задание\n2.2.1. Наименование ...     0
19  2.2. Техническое задание\n2.2.1. Общие сведени...     0
20  Архитектура, управляемая модель\nАббревиатура ...     1
21  1. ВВЕДЕНИЕ В ПРОЕКТИРОВАНИЕ ИНФОРМАЦИОННЫХ СИ...     1
22  Встроенные операторы SQL. \nКак было отмечено ...     1
23  Методологии разработки программного обеспечени...     1
24  Методологии разработки программного обеспечени...     1
```
Text preprocessing.

Transformation:
```python
import re
import emoji
from num2words import num2words

# Convert emojis into words
def emojis_words(text):

    # emoji module: replace each emoji with its textual description
    text = emoji.demojize(text, delimiters=(" ", " "))

    # Drop the ":" and "_" characters, leaving a space between the individual words
    text = text.replace(":", "").replace("_", " ")

    return text

def transform_text(text):
    # Remove all HTML tags
    text = re.sub(r'<[^<]+?>', '', text)

    # Remove all URLs and links
    text = re.sub(r'http\S+', '', text)

    # Convert emojis to text
    text = emojis_words(text)

    # Lowercase
    text = text.lower()

    # Collapse extra whitespace
    text = re.sub(r'\s+', ' ', text)

    # Replace "ё" with "е"
    text = text.replace("ё", "е")

    # Remove all special characters
    text = re.sub(r'[^a-zA-Zа-яА-Я0-9\s]', '', text)

    # Convert numbers into words
    words: list[str] = text.split()
    words = [num2words(word, lang="ru") if word.isdigit() else word for word in words]
    text = " ".join(words)

    # Remove all punctuation
    text = re.sub(r'[^\w\s]', '', text)

    return text

df["preprocessed_text"] = df["text"].apply(transform_text)
```
For tokenization, part-of-speech (POS) tagging, normalization (lemmatization in this case), and filtering we will use the spaCy library. At the filtering stage, to reduce the dimensionality of the feature space, we apply a stop-word list and also drop every word longer than 20 characters.
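The `lemma_POS_morphology` token format produced below can be previewed on a single phrase. A minimal illustration (this cell is not in the original notebook; it reuses the `sp` pipeline loaded earlier):

```python
# Illustration only: what f"{token.lemma_}_{token.pos_}_{token.morph}" yields per token.
for token in sp("Полное наименование системы"):
    print(f"{token.lemma_}_{token.pos_}_{token.morph}")
# e.g. полный_ADJ_Case=Nom|Degree=Pos|Gender=Neut|Number=Sing
```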
```python
from nltk.corpus import stopwords


stop_words = set(stopwords.words('russian'))

def preprocess_text(text):
    doc = sp(text)

    filtered_tokens = [
        f"{token.lemma_}_{token.pos_}_{token.morph}"  # Build the lemma_POS_morphology string
        for token in doc
        if token.text not in stop_words and len(token.text) <= 20  # Filtering
    ]

    return " ".join(filtered_tokens)


df["preprocessed_text"] = df["preprocessed_text"].apply(preprocess_text)


# Print 10 tokens from the first text
first_text_tokens = df["preprocessed_text"].iloc[0].split()[:10]
print(" ".join(first_text_tokens))
```

Output:

```text
двадцать_NUM_Case=Nom технический_ADJ_Case=Nom|Degree=Pos|Gender=Neut|Number=Sing задание_NOUN_Animacy=Inan|Case=Nom|Gender=Neut|Number=Sing двести_NUM_Case=Nom двадцать_NUM_Case=Nom общий_ADJ_Case=Nom|Degree=Pos|Number=Plur сведение_NOUN_Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing полный_ADJ_Case=Nom|Degree=Pos|Gender=Neut|Number=Sing наименование_NOUN_Animacy=Inan|Case=Nom|Gender=Neut|Number=Sing система_NOUN_Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing
```
Now move on to forming N-grams:
```python
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

nltk.download("punkt_tab")

def generate_ngrams(text: str, n: int = 2) -> list[tuple]:
    tokens: list[str] = word_tokenize(text, language="russian")

    n_grams: list[tuple] = list(ngrams(tokens, n))
    return n_grams

# Example for bigrams (N=2)
df["bigrams"] = df["preprocessed_text"].apply(lambda x: generate_ngrams(x, n=2))

# Example for trigrams (N=3)
df["trigrams"] = df["preprocessed_text"].apply(lambda x: generate_ngrams(x, n=3))

print(df.iloc[15:25])
```

stderr:

```text
[nltk_data] Downloading package punkt_tab to
[nltk_data]     D:\Users\Leo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
```

Output:

```text
                                                  doc  \
15                                         tz_16.docx
16                                         tz_17.docx
17                                         tz_18.docx
18                                         tz_19.docx
19                                         tz_20.docx
20               Архитектура, управляемая модель.docx
21                  Введение в проектирование ИС.docx
22                     Встроенные операторы SQL.docx
23  Методологии разработки программного обеспечени...
24  Методологии разработки программного обеспечени...

                                                 text  type  \
15  2.2\tТехническое задание\n2.2.1\tОбщие сведени...     0
16  2.2 Техническое задание.\n2.2.1 Общие сведения...     0
17  2.2. Техническое задание\nОбщие сведения:\nПол...     0
18  2.2. Техническое задание\n2.2.1. Наименование ...     0
19  2.2. Техническое задание\n2.2.1. Общие сведени...     0
20  Архитектура, управляемая модель\nАббревиатура ...     1
21  1. ВВЕДЕНИЕ В ПРОЕКТИРОВАНИЕ ИНФОРМАЦИОННЫХ СИ...     1
22  Встроенные операторы SQL. \nКак было отмечено ...     1
23  Методологии разработки программного обеспечени...     1
24  Методологии разработки программного обеспечени...     1

                                    preprocessed_text  \
15  двадцать_NUM_Case=Nom технический_ADJ_Case=Nom...
16  двадцать_NUM_Case=Nom технический_ADJ_Case=Nom...
17  двадцать_NUM_Case=Nom технический_ADJ_Case=Nom...
18  двадцать_NUM_Case=Nom технический_ADJ_Case=Nom...
19  двадцать_NUM_Case=Nom технический_ADJ_Case=Nom...
20  архитектура_NOUN_Animacy=Inan|Case=Nom|Gender=...
21  введение_NOUN_Animacy=Inan|Case=Nom|Gender=Neu...
22  встроенные_ADJ_Case=Nom|Degree=Pos|Number=Plur...
23  методология_NOUN_Animacy=Inan|Case=Gen|Gender=...
24  методология_NOUN_Animacy=Inan|Case=Gen|Gender=...

                                              bigrams  \
15  [(двадцать_NUM_Case=Nom, технический_ADJ_Case=...
16  [(двадцать_NUM_Case=Nom, технический_ADJ_Case=...
17  [(двадцать_NUM_Case=Nom, технический_ADJ_Case=...
18  [(двадцать_NUM_Case=Nom, технический_ADJ_Case=...
19  [(двадцать_NUM_Case=Nom, технический_ADJ_Case=...
20  [(архитектура_NOUN_Animacy=Inan|Case=Nom|Gende...
21  [(введение_NOUN_Animacy=Inan|Case=Nom|Gender=N...
22  [(встроенные_ADJ_Case=Nom|Degree=Pos|Number=Pl...
23  [(методология_NOUN_Animacy=Inan|Case=Gen|Gende...
24  [(методология_NOUN_Animacy=Inan|Case=Gen|Gende...

                                             trigrams
15  [(двадцать_NUM_Case=Nom, технический_ADJ_Case=...
16  [(двадцать_NUM_Case=Nom, технический_ADJ_Case=...
17  [(двадцать_NUM_Case=Nom, технический_ADJ_Case=...
18  [(двадцать_NUM_Case=Nom, технический_ADJ_Case=...
19  [(двадцать_NUM_Case=Nom, технический_ADJ_Case=...
20  [(архитектура_NOUN_Animacy=Inan|Case=Nom|Gende...
21  [(введение_NOUN_Animacy=Inan|Case=Nom|Gender=N...
22  [(встроенные_ADJ_Case=Nom|Degree=Pos|Number=Pl...
23  [(методология_NOUN_Animacy=Inan|Case=Gen|Gende...
24  [(методология_NOUN_Animacy=Inan|Case=Gen|Gende...
```
Now apply text-vectorization methods.

Bag of words:
```python
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

counts_vectorizer = CountVectorizer()
counts_matrix = sparse.csr_matrix(counts_vectorizer.fit_transform(df["preprocessed_text"]))
counts_df = pd.DataFrame(
    counts_matrix.toarray(),
    columns=counts_vectorizer.get_feature_names_out(),
)

random_columns = np.random.choice(counts_df.columns, size=10, replace=False)

print(counts_df.loc[15:25, random_columns])
```

Output:

```text
    тутто_noun_animacy  взаимоотношение_noun_animacy  иннкпп_propn_animacy  \
15                   0                             0                     0
16                   0                             0                     0
17                   0                             0                     0
18                   0                             0                     0
19                   0                             0                     0
20                   0                             0                     0
21                   0                             1                     0
22                   0                             0                     0
23                   0                             0                     0
24                   0                             0                     0
25                   0                             0                     0

    gif_propn_foreign  накладывать_verb_aspect  \
15                  0                        0
16                  0                        0
17                  0                        0
18                  0                        0
19                  0                        0
20                  0                        0
21                  0                        0
22                  0                        0
23                  0                        0
24                  0                        0
25                  0                        1

    метрологическому_propn_animacy  связанность_noun_animacy  \
15                               0                         0
16                               1                         0
17                               0                         0
18                               0                         0
19                               0                         0
20                               0                         0
21                               0                         0
22                               0                         0
23                               0                         0
24                               0                         0
25                               0                         0

    модернизировать_verb_aspect  инструментальный_adj_case  \
15                            0                          0
16                            0                          0
17                            0                          0
18                            0                          0
19                            0                          0
20                            0                          0
21                            0                          1
22                            0                          0
23                            0                          0
24                            0                          0
25                            0                          1

    достаточно_adv_degree
15                      0
16                      0
17                      0
18                      0
19                      0
20                      0
21                      6
22                      1
23                      8
24                      3
25                     15
```
Frequency portrait (TF-IDF):
```python
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True)
tfidf_matrix = sparse.csr_matrix(tfidf_vectorizer.fit_transform(df["preprocessed_text"]))
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

print(tfidf_df.loc[15:25, random_columns])
```

Output:

```text
    тутто_noun_animacy  взаимоотношение_noun_animacy  иннкпп_propn_animacy  \
15                 0.0                      0.000000                   0.0
16                 0.0                      0.000000                   0.0
17                 0.0                      0.000000                   0.0
18                 0.0                      0.000000                   0.0
19                 0.0                      0.000000                   0.0
20                 0.0                      0.000000                   0.0
21                 0.0                      0.022338                   0.0
22                 0.0                      0.000000                   0.0
23                 0.0                      0.000000                   0.0
24                 0.0                      0.000000                   0.0
25                 0.0                      0.000000                   0.0

    gif_propn_foreign  накладывать_verb_aspect  \
15                0.0                  0.00000
16                0.0                  0.00000
17                0.0                  0.00000
18                0.0                  0.00000
19                0.0                  0.00000
20                0.0                  0.00000
21                0.0                  0.00000
22                0.0                  0.00000
23                0.0                  0.00000
24                0.0                  0.00000
25                0.0                  0.02162

    метрологическому_propn_animacy  связанность_noun_animacy  \
15                        0.000000                       0.0
16                        0.042399                       0.0
17                        0.000000                       0.0
18                        0.000000                       0.0
19                        0.000000                       0.0
20                        0.000000                       0.0
21                        0.000000                       0.0
22                        0.000000                       0.0
23                        0.000000                       0.0
24                        0.000000                       0.0
25                        0.000000                       0.0

    модернизировать_verb_aspect  инструментальный_adj_case  \
15                          0.0                   0.000000
16                          0.0                   0.000000
17                          0.0                   0.000000
18                          0.0                   0.000000
19                          0.0                   0.000000
20                          0.0                   0.000000
21                          0.0                   0.017277
22                          0.0                   0.000000
23                          0.0                   0.000000
24                          0.0                   0.000000
25                          0.0                   0.018585

    достаточно_adv_degree
15               0.000000
16               0.000000
17               0.000000
18               0.000000
19               0.000000
20               0.000000
21               0.033501
22               0.025389
23               0.047452
24               0.036795
25               0.047864
```
Training the model and checking its quality:
```python
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def train_and_evaluate(X, y, test_size=0.2, cv=5, optimize=False):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=9)

    if optimize:
        param_grid = {
            "n_estimators": [10, 20, 30, 40, 50, 100, 150, 200, 250, 500],
            "max_features": ["sqrt", "log2", 2],
            "max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10],
            "criterion": ["gini", "entropy", "log_loss"],
            "class_weight": ["balanced", "balanced_subsample"]
        }

        grid_search = GridSearchCV(RandomForestClassifier(random_state=9), param_grid, scoring="f1", cv=cv, n_jobs=-1)
        grid_search.fit(X_train, y_train)
        model = grid_search.best_estimator_
        print(f"Лучшие параметры: {grid_search.best_params_}")
    else:
        model = RandomForestClassifier(n_estimators=100, random_state=9)
        model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")

    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1')
    f1_cv = scores.mean()
    print(f"Cross-validated F1 Score: {f1_cv:.4f}")

    return model

X_tfidf = tfidf_df
X_counts = counts_df
y = df["type"]

print("TF-IDF Model")
model_tfidf = train_and_evaluate(X_tfidf, y)

print("\nCount Vectorizer Model")
model_counts = train_and_evaluate(X_counts, y)
```

Output:

```text
TF-IDF Model
Accuracy: 0.8889
Precision: 0.7500
Recall: 1.0000
F1 Score: 0.8571
ROC AUC: 0.9167
Cross-validated F1 Score: 1.0000

Count Vectorizer Model
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC AUC: 1.0000
Cross-validated F1 Score: 0.9333
```
As can be seen, both models show excellent results, with the second model reaching nearly perfect scores. However, this is most likely a consequence of the small sample (only 41 documents): the model has probably simply memorized the data, i.e. it is overfitting.
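With only 41 documents, an 80/20 split leaves about nine test items, so the scores above are very noisy. A common sanity check on a corpus this small is leave-one-out cross-validation; a hedged sketch reusing the notebook's `X_counts` and `y` (not part of the original notebook):

```python
# Sketch: leave-one-out evaluation to reduce the variance of the estimate.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneOut, cross_val_score

model = RandomForestClassifier(n_estimators=100, random_state=9)
# Accuracy rather than F1: each fold holds a single document, so F1 is undefined per fold.
scores = cross_val_score(model, X_counts, y, cv=LeaveOneOut(), scoring="accuracy")
print(f"Leave-one-out accuracy: {scores.mean():.4f} over {len(scores)} folds")
```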
In addition, as the assignment requires, we evaluate the solution with alternative hyperparameters for the machine-learning model, selected by grid search.
```python
print("TF-IDF Model (Optimized)")
model_tfidf = train_and_evaluate(X_tfidf, y, optimize=True)
```

Output:

```text
TF-IDF Model (Optimized)
Лучшие параметры: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 2, 'max_features': 'sqrt', 'n_estimators': 20}
Accuracy: 0.7778
Precision: 0.6000
Recall: 1.0000
F1 Score: 0.7500
ROC AUC: 0.8333
Cross-validated F1 Score: 1.0000
```
We can conclude that in this case the model's hyperparameters can be tuned so that its metrics reach a nearly perfect level: the cross-validated F1 score comes out at 1.0000, although the held-out test scores are lower than for the default configuration, which again points to the small sample.

Notebook metadata: kernel `.venv` (python3), Python 3.12.5, nbformat 4 (minor 2).
`lab_9.ipynb` (new file: 424 lines; diff suppressed because one or more lines are too long)

`poetry.lock` (generated, new file: 5,343 lines; diff suppressed because it is too large)
`poetry.toml` (new file: 2 lines)

@@ -0,0 +1,2 @@
```toml
[virtualenvs]
in-project = true
```
`pyproject.toml` (new file: 42 lines)

@@ -0,0 +1,42 @@
```toml
[tool.poetry]
name = "mai"
version = "1.0.0"
description = "MAI labs"
authors = ["Tikhonenkov Alexey <tikhonenkov2015@gmail.com>"]
readme = "readme.md"
package-mode = false

[tool.poetry.dependencies]
python = ">=3.12,<3.13"
jupyter = "^1.1.1"
numpy = "^1.26.4"
pandas = "^2.2.2"
matplotlib = "^3.9.2"
flask = "^3.0.3"
apiflask = "^2.2.0"
flask-cors = "^5.0.0"
ipykernel = "^6.29.5"
imbalanced-learn = "^0.12.4"
seaborn = "^0.13.2"
featuretools = "^1.31.0"
gymnasium = "^1.0.0"
scikit-fuzzy = "^0.5.0"
networkx = "^3.4.2"
spacy = "^3.7.5"
docx = "^0.2.4"
emoji = "^2.14.1"
num2words = "^0.5.14"
nltk = "^3.9.1"
python-docx = "^1.1.2"
opencv-python = "^4.11.0.86"
mahotas = "^1.4.18"
albumentations = "^2.0.5"
ru_core_news_lg = {url = "https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.7.0/ru_core_news_lg-3.7.0-py3-none-any.whl"}

[tool.poetry.group.dev.dependencies]
ipykernel = "^6.29.5"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
```
`readme copy.md` (new file: 0 lines, empty)