done

2024-10-25 22:20:23 +04:00 · 2024-10-25 22:20:23 +04:00 · 4960643e7c
commit 4960643e7c
parent 790b607e5e
18 changed files with 3015 additions and 506 deletions
--- a/lab1.ipynb
+++ b/lab1.ipynb
--- a/mai/.flake8
+++ b/mai/.flake8
@ -1,2 +0,0 @@
-[flake8]
-max-line-length = 120
--- a/mai/.vscode/extensions.json
+++ b/mai/.vscode/extensions.json
@ -1,13 +0,0 @@
-{
-    "recommendations": [
-        "ms-python.black-formatter",
-        "ms-python.flake8",
-        "ms-python.isort",
-        "ms-toolsai.jupyter",
-        "ms-toolsai.datawrangler",
-        "ms-python.python",
-        "donjayamanne.python-environment-manager",
-        // optional
-        "usernamehw.errorlens"
-    ]
-}
--- a/mai/.vscode/launch.json
+++ b/mai/.vscode/launch.json
@ -1,16 +0,0 @@
-{
-    // Use IntelliSense to learn about possible attributes.
-    // Hover to view descriptions of existing attributes.
-    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
-    "version": "0.2.0",
-    "configurations": [
-        {
-            "name": "mai-service",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "run.py",
-            "console": "integratedTerminal",
-            "justMyCode": true
-        }
-    ]
-}
--- a/mai/.vscode/settings.json
+++ b/mai/.vscode/settings.json
@ -1,38 +0,0 @@
-{
-    "files.autoSave": "onFocusChange",
-    "files.exclude": {
-        "**/__pycache__": true
-    },
-    "editor.detectIndentation": false,
-    "editor.formatOnType": false,
-    "editor.formatOnPaste": true,
-    "editor.formatOnSave": true,
-    "editor.tabSize": 4,
-    "editor.insertSpaces": true,
-    "editor.codeActionsOnSave": {
-        "source.organizeImports": "explicit",
-        "source.sortImports": "explicit"
-    },
-    "editor.stickyScroll.enabled": false,
-    "diffEditor.ignoreTrimWhitespace": false,
-    "debug.showVariableTypes": true,
-    "workbench.editor.highlightModifiedTabs": true,
-    "git.suggestSmartCommit": false,
-    "git.autofetch": true,
-    "git.openRepositoryInParentFolders": "always",
-    "git.confirmSync": false,
-    "errorLens.gutterIconsEnabled": true,
-    "errorLens.messageEnabled": false,
-    "[python]": {
-        "editor.defaultFormatter": "ms-python.black-formatter",
-    },
-    "python.languageServer": "Pylance",
-    "python.analysis.typeCheckingMode": "basic",
-    "python.analysis.autoImportCompletions": true,
-    "isort.args": [
-        "--profile",
-        "black"
-    ],
-    "notebook.lineNumbers": "on",
-    "notebook.output.minimalErrorRendering": true,
-}
--- a/mai/assets/quantile.png
+++ b/mai/assets/quantile.png
--- a/mai/backend/init.py
+++ b/mai/backend/init.py
@ -1,52 +0,0 @@
-import importlib
-import os
-import traceback
-
-import matplotlib
-from apiflask import APIBlueprint, APIFlask
-from flask_cors import CORS
-
-matplotlib.use("agg")
-
-cors = CORS()
-api_bp = APIBlueprint("api", __name__, url_prefix="/api/v1")
-dataset_path: str | None = None
-
-
-class Config:
-    SECRET_KEY = "secret!"
-    SEND_FILE_MAX_AGE_DEFAULT = -1
-
-
-def create_app():
-    global dataset_path
-
-    # Create and configure app
-    app = APIFlask(
-        "MAI Service",
-        title="MAI Service API",
-        docs_path="/",
-        version="1.0",
-        static_folder="",
-        template_folder="",
-    )
-    app.config.from_object(Config)
-
-    dataset_path = os.path.join(app.instance_path, "dataset")
-    os.makedirs(dataset_path, exist_ok=True)
-
-    @app.errorhandler(Exception)
-    def my_error_processor(error):
-        traceback.print_exception(error)
-        return {"message": str(error), "detail": "No details"}, 500
-
-    # Import custom REST methods
-    importlib.import_module("backend.api")
-
-    # Enable REST API
-    app.register_blueprint(api_bp)
-
-    # Enable app extensions
-    cors.init_app(app)
-
-    return app
--- a/mai/backend/api.py
+++ b/mai/backend/api.py
@ -1,57 +0,0 @@
-from apiflask import FileSchema, Schema, fields
-from flask import send_file
-
-from backend import api_bp, dataset_path
-from backend.service import Service
-
-
-class FileUpload(Schema):
-    file = fields.File(required=True)
-
-
-class ColumnInfoDto(Schema):
-    datatype = fields.String()
-    items = fields.List(fields.String())
-
-
-class TableColumnDto(Schema):
-    name = fields.String()
-    datatype = fields.String()
-    items = fields.List(fields.String())
-
-
-service = Service(dataset_path)
-
-
-@api_bp.post("/dataset")
-@api_bp.input(FileUpload, location="files")
-def upload_dataset(files_data):
-    uploaded_file = files_data["file"]
-    return service.upload_dataset(uploaded_file)
-
-
-@api_bp.get("/dataset")
-def get_all_datasets():
-    return service.get_all_datasets()
-
-
-@api_bp.get("/dataset/<string:name>")
-@api_bp.output(TableColumnDto(many=True))
-def get_dataset_info(name: str):
-    return service.get_dataset_info(name)
-
-
-@api_bp.get("/dataset/<string:name>/<string:column>")
-@api_bp.output(ColumnInfoDto)
-def get_column_info(name: str, column: str):
-    return service.get_column_info(name, column)
-
-
-@api_bp.get("/dataset/draw/hist/<string:name>/<string:column>")
-@api_bp.output(
-    FileSchema(type="string", format="binary"), content_type="image/png", example=""
-)
-def get_dataset_hist(name: str, column: str):
-    data = service.get_hist(name, column)
-    data.seek(0)
-    return send_file(data, download_name=f"{name}.hist.png", mimetype="image/png")
--- a/mai/backend/service.py
+++ b/mai/backend/service.py
@ -1,59 +0,0 @@
-import io
-import os
-import pathlib
-from typing import BinaryIO, Dict, List
-
-import pandas as pd
-from matplotlib.figure import Figure
-from werkzeug.datastructures import FileStorage
-from werkzeug.utils import secure_filename
-
-
-class Service:
-    def __init__(self, dataset_path: str | None) -> None:
-        if dataset_path is None:
-            raise Exception("Dataset path is not defined")
-        self.__path: str = dataset_path
-
-    def __get_dataset(self, filename: str) -> pd.DataFrame:
-        full_file_name = os.path.join(self.__path, secure_filename(filename))
-        return pd.read_csv(full_file_name)
-
-    def upload_dataset(self, file: FileStorage) -> str:
-        if file.filename is None:
-            raise Exception("Dataset upload error")
-        file_name: str = file.filename
-        full_file_name = os.path.join(self.__path, secure_filename(file_name))
-        file.save(full_file_name)
-        return file_name
-
-    def get_all_datasets(self) -> List[str]:
-        return [file.name for file in pathlib.Path(self.__path).glob("*.csv")]
-
-    def get_dataset_info(self, filename) -> List[Dict]:
-        dataset = self.__get_dataset(filename)
-        dataset_info = []
-        for column in dataset.columns:
-            items = dataset[column].astype(str)
-            column_info = {
-                "name": column,
-                "datatype": dataset.dtypes[column],
-                "items": items,
-            }
-            dataset_info.append(column_info)
-        return dataset_info
-
-    def get_column_info(self, filename, column) -> Dict:
-        dataset = self.__get_dataset(filename)
-        datatype = dataset.dtypes[column]
-        items = sorted(dataset[column].astype(str).unique())
-        return {"datatype": datatype, "items": items}
-
-    def get_hist(self, filename, column) -> BinaryIO:
-        dataset = self.__get_dataset(filename)
-        bytes = io.BytesIO()
-        plot: Figure | None = dataset.plot.hist(column=[column], bins=80).get_figure()
-        if plot is None:
-            raise Exception("Can't create hist plot")
-        plot.savefig(bytes, dpi=300, format="png")
-        return bytes
--- a/mai/docs/path1.png
+++ b/mai/docs/path1.png
--- a/mai/docs/path2.png
+++ b/mai/docs/path2.png
--- a/mai/docs/path3.png
+++ b/mai/docs/path3.png
--- a/mai/docs/path4.png
+++ b/mai/docs/path4.png
--- a/mai/lab.ipynb
+++ b/mai/lab.ipynb
--- a/mai/lab4.ipynb
+++ b/mai/lab4.ipynb
--- a/mai/readme.md
+++ b/mai/readme.md
@ -1,55 +0,0 @@
-## Окружение и примеры для выполнения лабораторных работ по дисциплине "Методы ИИ"
-
-### Python
-
-Используется Python версии 3.12
-
-Установщик https://www.python.org/ftp/python/3.12.5/python-3.12.5-amd64.exe
-
-### Poetry
-
-Для создания и настройки окружения проекта необходимо установить poetry
-
-**Для Windows (Powershell)**
-
-```
-(Invoke-WebRequest -Uri https://install.python-poetry.org -UseBasicParsing).Content | python -
-```
-
-**Linux, macOS, Windows (WSL)**
-
-```
-curl -sSL https://install.python-poetry.org | python3 -
-```
-
-**Добавление poetry в PATH**
-
-1. Открыть настройки переменных среды \
-    \
-   <img src="docs/path1.png" width="300"> \
-    \
-   <img src="docs/path2.png" width="400"> \
-2. Изменить переменную Path текущего пользователя \
-    \
-   <img src="docs/path3.png" width="500"> \
-3. Добавление пути `%APPDATA%\Python\Scripts` до исполняемого файла poetry \
-    \
-   <img src="docs/path4.png" width="400">
-
-### Создание окружения
-
-```
-poetry install
-```
-
-### Запуск тестового сервиса
-
-Запустить тестовый сервис можно с помощью VSCode (см. launch.json в каталоге .vscode).
-
-Также запустить тестовый сервис можно с помощью командной строки:
-
-1. Активация виртуального окружения -- `poetry shell`
-
-2. Запуск сервиса -- `python run.py`
-
-Для выходы из виртуального окружения используется команду `exit`
--- a/mai/run.py
+++ b/mai/run.py
@ -1,16 +0,0 @@
-from backend import create_app
-
-app = create_app()
-
-
-def __main():
-    app.run(
-        host="127.0.0.1",
-        port=8080,
-        debug=True,
-        use_reloader=False,
-    )
-
-
-if __name__ == "__main__":
-    __main()
--- a/mai/utils.py
+++ b/mai/utils.py
@ -0,0 +1,79 @@
+import math
+from typing import Tuple
+
+import pandas as pd
+from pandas import DataFrame
+from sklearn.model_selection import train_test_split
+
+
+def split_stratified_into_train_val_test(
+    df_input,
+    stratify_colname="y",
+    frac_train=0.6,
+    frac_val=0.15,
+    frac_test=0.25,
+    random_state=None,
+) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
+    """
+    Splits a Pandas dataframe into three subsets (train, val, and test)
+    following fractional ratios provided by the user, where each subset is
+    stratified by the values in a specific column (that is, each subset has
+    the same relative frequency of the values in the column). It performs this
+    splitting by running train_test_split() twice.
+
+    Parameters
+    ----------
+    df_input : Pandas dataframe
+        Input dataframe to be split.
+    stratify_colname : str
+        The name of the column that will be used for stratification. Usually
+        this column would be for the label.
+    frac_train : float
+    frac_val   : float
+    frac_test  : float
+        The ratios with which the dataframe will be split into train, val, and
+        test data. The values should be expressed as float fractions and should
+        sum to 1.0.
+    random_state : int, None, or RandomStateInstance
+        Value to be passed to train_test_split().
+
+    Returns
+    -------
+    df_train, df_val, df_test :
+        Dataframes containing the three splits.
+    """
+
+    if frac_train + frac_val + frac_test != 1.0:
+        raise ValueError(
+            "fractions %f, %f, %f do not add up to 1.0"
+            % (frac_train, frac_val, frac_test)
+        )
+
+    if stratify_colname not in df_input.columns:
+        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))
+
+    X = df_input  # Contains all columns.
+    y = df_input[
+        [stratify_colname]
+    ]  # Dataframe of just the column on which to stratify.
+
+    # Split original dataframe into train and temp dataframes.
+    df_train, df_temp, y_train, y_temp = train_test_split(
+        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
+    )
+
+    if frac_val <= 0:
+        assert len(df_input) == len(df_train) + len(df_temp)
+        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp
+
+    # Split the temp dataframe into val and test dataframes.
+    relative_frac_test = frac_test / (frac_val + frac_test)
+    df_val, df_test, y_val, y_test = train_test_split(
+        df_temp,
+        y_temp,
+        stratify=y_temp,
+        test_size=relative_frac_test,
+        random_state=random_state,
+    )
+
+    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
+    return df_train, df_val, df_test, y_train, y_val, y_test