This commit is contained in:
Вячеслав Иванов 2024-10-25 22:20:23 +04:00
parent 790b607e5e
commit 4960643e7c
18 changed files with 3015 additions and 506 deletions

File diff suppressed because one or more lines are too long

View File

@ -1,2 +0,0 @@
[flake8]
max-line-length = 120

View File

@ -1,13 +0,0 @@
{
"recommendations": [
"ms-python.black-formatter",
"ms-python.flake8",
"ms-python.isort",
"ms-toolsai.jupyter",
"ms-toolsai.datawrangler",
"ms-python.python",
"donjayamanne.python-environment-manager",
// optional
"usernamehw.errorlens"
]
}

View File

@ -1,16 +0,0 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "mai-service",
"type": "debugpy",
"request": "launch",
"program": "run.py",
"console": "integratedTerminal",
"justMyCode": true
}
]
}

View File

@ -1,38 +0,0 @@
{
"files.autoSave": "onFocusChange",
"files.exclude": {
"**/__pycache__": true
},
"editor.detectIndentation": false,
"editor.formatOnType": false,
"editor.formatOnPaste": true,
"editor.formatOnSave": true,
"editor.tabSize": 4,
"editor.insertSpaces": true,
"editor.codeActionsOnSave": {
"source.organizeImports": "explicit",
"source.sortImports": "explicit"
},
"editor.stickyScroll.enabled": false,
"diffEditor.ignoreTrimWhitespace": false,
"debug.showVariableTypes": true,
"workbench.editor.highlightModifiedTabs": true,
"git.suggestSmartCommit": false,
"git.autofetch": true,
"git.openRepositoryInParentFolders": "always",
"git.confirmSync": false,
"errorLens.gutterIconsEnabled": true,
"errorLens.messageEnabled": false,
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",
},
"python.languageServer": "Pylance",
"python.analysis.typeCheckingMode": "basic",
"python.analysis.autoImportCompletions": true,
"isort.args": [
"--profile",
"black"
],
"notebook.lineNumbers": "on",
"notebook.output.minimalErrorRendering": true,
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 111 KiB

View File

@ -1,52 +0,0 @@
import importlib
import os
import traceback
import matplotlib
from apiflask import APIBlueprint, APIFlask
from flask_cors import CORS
# Select the non-interactive "agg" backend before any plotting happens.
matplotlib.use("agg")

# CORS extension object; it is attached to the application in create_app().
cors = CORS()
# Blueprint that carries the REST endpoints, mounted under /api/v1.
api_bp = APIBlueprint("api", __name__, url_prefix="/api/v1")
# Directory holding the datasets; stays None until create_app() assigns it.
dataset_path: str | None = None
class Config:
    """Settings object loaded into the app with ``app.config.from_object``."""

    # Flask secret key. Read from the SECRET_KEY environment variable when it
    # is set, so a deployment does not have to rely on a value committed to the
    # repository; the previous hard-coded value is kept as the fallback so
    # existing local setups behave exactly as before.
    SECRET_KEY = os.environ.get("SECRET_KEY", "secret!")
    # Value handed to Flask for SEND_FILE_MAX_AGE_DEFAULT (unchanged).
    SEND_FILE_MAX_AGE_DEFAULT = -1
def create_app():
    """Build and configure the APIFlask application.

    Creates the app, loads ``Config``, makes sure the ``dataset`` directory
    exists under the instance path, installs a catch-all error handler,
    imports the REST endpoints, registers the API blueprint and initialises
    CORS.

    Returns:
        The configured ``APIFlask`` instance.
    """
    global dataset_path
    # Create and configure app
    app = APIFlask(
        "MAI Service",
        title="MAI Service API",
        docs_path="/",
        version="1.0",
        static_folder="",
        template_folder="",
    )
    app.config.from_object(Config)

    # The module-level dataset_path must be assigned before backend.api is
    # imported below, because that module imports the value when it is loaded.
    dataset_path = os.path.join(app.instance_path, "dataset")
    os.makedirs(dataset_path, exist_ok=True)

    @app.errorhandler(Exception)
    def my_error_processor(error):
        # Print the traceback and reply with the error text and status 500.
        traceback.print_exception(error)
        return {"message": str(error), "detail": "No details"}, 500

    # Import custom REST methods
    importlib.import_module("backend.api")
    # Enable REST API
    app.register_blueprint(api_bp)
    # Enable app extensions
    cors.init_app(app)
    return app

View File

@ -1,57 +0,0 @@
from apiflask import FileSchema, Schema, fields
from flask import send_file
from backend import api_bp, dataset_path
from backend.service import Service
# Input schema for the upload endpoint: one required file field named "file".
class FileUpload(Schema):
    file = fields.File(required=True)
# Output schema for a single column: its data type and a list of string items.
class ColumnInfoDto(Schema):
    datatype = fields.String()
    items = fields.List(fields.String())
# Output schema for one column of a dataset listing: column name, data type
# and a list of string items.
class TableColumnDto(Schema):
    name = fields.String()
    datatype = fields.String()
    items = fields.List(fields.String())
# Single Service instance shared by the endpoints in this module; it is built
# from the dataset_path value imported from the backend package.
service = Service(dataset_path)
# POST /dataset: hand the uploaded file to the service and return its result.
@api_bp.post("/dataset")
@api_bp.input(FileUpload, location="files")
def upload_dataset(files_data):
    return service.upload_dataset(files_data["file"])
# GET /dataset: return whatever the service reports as the available datasets.
@api_bp.get("/dataset")
def get_all_datasets():
    return service.get_all_datasets()
# GET /dataset/<name>: describe the columns of the named dataset; the result is
# serialized as a list of TableColumnDto.
@api_bp.get("/dataset/<string:name>")
@api_bp.output(TableColumnDto(many=True))
def get_dataset_info(name: str):
    return service.get_dataset_info(name)
# GET /dataset/<name>/<column>: describe one column of the named dataset; the
# result is serialized as ColumnInfoDto.
@api_bp.get("/dataset/<string:name>/<string:column>")
@api_bp.output(ColumnInfoDto)
def get_column_info(name: str, column: str):
    return service.get_column_info(name, column)
# GET /dataset/draw/hist/<name>/<column>: send the histogram of one column of
# the named dataset as a PNG file.
@api_bp.get("/dataset/draw/hist/<string:name>/<string:column>")
@api_bp.output(
    FileSchema(type="string", format="binary"), content_type="image/png", example=""
)
def get_dataset_hist(name: str, column: str):
    image_stream = service.get_hist(name, column)
    # Rewind so that send_file reads the image from its first byte.
    image_stream.seek(0)
    return send_file(
        image_stream,
        mimetype="image/png",
        download_name=f"{name}.hist.png",
    )

View File

@ -1,59 +0,0 @@
import io
import os
import pathlib
from typing import BinaryIO, Dict, List
import pandas as pd
from matplotlib.figure import Figure
from werkzeug.datastructures import FileStorage
from werkzeug.utils import secure_filename
class Service:
    """Dataset storage and inspection backed by CSV files in one directory.

    File names received from callers are passed through ``secure_filename``
    before they are joined with the dataset directory.
    """

    def __init__(self, dataset_path: str | None) -> None:
        """Remember the directory that holds the datasets.

        Raises:
            Exception: if ``dataset_path`` is ``None``.
        """
        if dataset_path is None:
            raise Exception("Dataset path is not defined")
        self.__path: str = dataset_path

    def __get_dataset(self, filename: str) -> pd.DataFrame:
        """Load the named CSV file from the dataset directory."""
        full_file_name = os.path.join(self.__path, secure_filename(filename))
        return pd.read_csv(full_file_name)

    def upload_dataset(self, file: FileStorage) -> str:
        """Save an uploaded file into the dataset directory.

        The file is stored under its sanitised name; the name as sent by the
        client is returned.

        Raises:
            Exception: if the upload has no file name, or the name is empty
                once sanitised.
        """
        if not file.filename:
            raise Exception("Dataset upload error")
        file_name: str = file.filename
        safe_name = secure_filename(file_name)
        if not safe_name:
            # secure_filename() can strip every character; joining an empty
            # name would point at the directory itself and fail on save with
            # an unrelated OS error.
            raise Exception("Dataset upload error")
        file.save(os.path.join(self.__path, safe_name))
        return file_name

    def get_all_datasets(self) -> List[str]:
        """Return the names of the ``*.csv`` files in the dataset directory."""
        return [file.name for file in pathlib.Path(self.__path).glob("*.csv")]

    def get_dataset_info(self, filename) -> List[Dict]:
        """Describe every column of a dataset.

        Returns:
            One dict per column with the keys ``name``, ``datatype`` (the
            pandas dtype of the column) and ``items`` (the column values
            converted to strings).
        """
        dataset = self.__get_dataset(filename)
        return [
            {
                "name": column,
                "datatype": dataset.dtypes[column],
                "items": dataset[column].astype(str),
            }
            for column in dataset.columns
        ]

    def get_column_info(self, filename, column) -> Dict:
        """Describe one column of a dataset.

        Returns:
            A dict with ``datatype`` (the pandas dtype of the column) and
            ``items`` (the sorted unique values of the column as strings).
        """
        dataset = self.__get_dataset(filename)
        datatype = dataset.dtypes[column]
        items = sorted(dataset[column].astype(str).unique())
        return {"datatype": datatype, "items": items}

    def get_hist(self, filename, column) -> BinaryIO:
        """Render a histogram of one column as a PNG image.

        Returns:
            An in-memory binary stream containing the PNG. The stream position
            is left at the end of the written data.

        Raises:
            Exception: if the plot has no figure to save.
        """
        # Imported here so the class needs no extra module-level import.
        import matplotlib.pyplot as plt

        dataset = self.__get_dataset(filename)
        buffer = io.BytesIO()
        plot: Figure | None = dataset.plot.hist(column=[column], bins=80).get_figure()
        if plot is None:
            raise Exception("Can't create hist plot")
        try:
            plot.savefig(buffer, dpi=300, format="png")
        finally:
            # Release the figure; otherwise every call leaves one more figure
            # registered with pyplot for the lifetime of the process.
            plt.close(plot)
        return buffer

Binary file not shown.

Before

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 74 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 129 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 38 KiB

View File

2936
mai/lab4.ipynb Normal file

File diff suppressed because one or more lines are too long

View File

@ -1,55 +0,0 @@
## Окружение и примеры для выполнения лабораторных работ по дисциплине "Методы ИИ"
### Python
Используется Python версии 3.12
Установщик https://www.python.org/ftp/python/3.12.5/python-3.12.5-amd64.exe
### Poetry
Для создания и настройки окружения проекта необходимо установить poetry
**Для Windows (Powershell)**
```
(Invoke-WebRequest -Uri https://install.python-poetry.org -UseBasicParsing).Content | python -
```
**Linux, macOS, Windows (WSL)**
```
curl -sSL https://install.python-poetry.org | python3 -
```
**Добавление poetry в PATH**
1. Открыть настройки переменных среды \
\
<img src="docs/path1.png" width="300"> \
\
<img src="docs/path2.png" width="400"> \
2. Изменить переменную Path текущего пользователя \
\
<img src="docs/path3.png" width="500"> \
3. Добавить путь `%APPDATA%\Python\Scripts` до исполняемого файла poetry \
\
<img src="docs/path4.png" width="400">
### Создание окружения
```
poetry install
```
### Запуск тестового сервиса
Запустить тестовый сервис можно с помощью VSCode (см. launch.json в каталоге .vscode).
Также запустить тестовый сервис можно с помощью командной строки:
1. Активация виртуального окружения -- `poetry shell`
2. Запуск сервиса -- `python run.py`
Для выхода из виртуального окружения используется команда `exit`

View File

@ -1,16 +0,0 @@
from backend import create_app
# Application instance, created as soon as this module is imported.
app = create_app()


def __main():
    # Start the server on 127.0.0.1:8080 with debug mode on and the
    # auto-reloader switched off.
    app.run(
        host="127.0.0.1",
        port=8080,
        debug=True,
        use_reloader=False,
    )


# Run the server only when the file is executed as a script.
if __name__ == "__main__":
    __main()

79
mai/utils.py Normal file
View File

@ -0,0 +1,79 @@
from typing import Tuple
import pandas as pd
from pandas import DataFrame
from sklearn.model_selection import train_test_split
def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
    """
    Splits a Pandas dataframe into three subsets (train, val, and test)
    following fractional ratios provided by the user, where each subset is
    stratified by the values in a specific column (that is, each subset has
    the same relative frequency of the values in the column). It performs this
    splitting by running train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val : float
    frac_test : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0 (compared with a floating-point tolerance).
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test, y_train, y_val, y_test :
        The three splits of the full dataframe followed by the matching
        single-column dataframes of the stratification column. When
        ``frac_val`` (or ``frac_test``) is not positive, the corresponding
        ``df_*``/``y_*`` pair is an empty dataframe.

    Raises
    ------
    ValueError
        If the fractions do not add up to 1.0 or ``stratify_colname`` is not a
        column of ``df_input``.
    """
    # Local import keeps this function self-contained.
    import math

    # Exact equality would reject valid inputs such as 0.7 + 0.2 + 0.1, whose
    # binary floating-point sum is 0.9999999999999999.
    if not math.isclose(frac_train + frac_val + frac_test, 1.0):
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    if frac_val <= 0:
        # No validation part requested: everything left over is the test set.
        assert len(df_input) == len(df_train) + len(df_temp)
        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp

    if frac_test <= 0:
        # No test part requested: everything left over is the validation set.
        # A second split with test_size=0 would be rejected by
        # train_test_split().
        assert len(df_input) == len(df_train) + len(df_temp)
        return df_train, df_temp, pd.DataFrame(), y_train, y_temp, pd.DataFrame()

    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test, y_train, y_val, y_test