From 82ac1015453da7f291498ec02b0c43b04a6ee048 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=92=D1=8F=D1=87=D0=B5=D1=81=D0=BB=D0=B0=D0=B2=20=D0=98?= =?UTF-8?q?=D0=B2=D0=B0=D0=BD=D0=BE=D0=B2?= Date: Fri, 25 Oct 2024 22:21:37 +0400 Subject: [PATCH] drop --- mai/utils.py | 79 ---------------------------------------------------- 1 file changed, 79 deletions(-) delete mode 100644 mai/utils.py diff --git a/mai/utils.py b/mai/utils.py deleted file mode 100644 index 7190903..0000000 --- a/mai/utils.py +++ /dev/null @@ -1,79 +0,0 @@ -from typing import Tuple - -import pandas as pd -from pandas import DataFrame -from sklearn.model_selection import train_test_split - - -def split_stratified_into_train_val_test( - df_input, - stratify_colname="y", - frac_train=0.6, - frac_val=0.15, - frac_test=0.25, - random_state=None, -) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]: - """ - Splits a Pandas dataframe into three subsets (train, val, and test) - following fractional ratios provided by the user, where each subset is - stratified by the values in a specific column (that is, each subset has - the same relative frequency of the values in the column). It performs this - splitting by running train_test_split() twice. - - Parameters - ---------- - df_input : Pandas dataframe - Input dataframe to be split. - stratify_colname : str - The name of the column that will be used for stratification. Usually - this column would be for the label. - frac_train : float - frac_val : float - frac_test : float - The ratios with which the dataframe will be split into train, val, and - test data. The values should be expressed as float fractions and should - sum to 1.0. - random_state : int, None, or RandomStateInstance - Value to be passed to train_test_split(). - - Returns - ------- - df_train, df_val, df_test : - Dataframes containing the three splits. - """ - - if frac_train + frac_val + frac_test != 1.0: - raise ValueError( - "fractions %f, %f, %f do not add up to 1.0" - % (frac_train, frac_val, frac_test) - ) - - if stratify_colname not in df_input.columns: - raise ValueError("%s is not a column in the dataframe" % (stratify_colname)) - - X = df_input # Contains all columns. - y = df_input[ - [stratify_colname] - ] # Dataframe of just the column on which to stratify. - - # Split original dataframe into train and temp dataframes. - df_train, df_temp, y_train, y_temp = train_test_split( - X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state - ) - - if frac_val <= 0: - assert len(df_input) == len(df_train) + len(df_temp) - return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp - - # Split the temp dataframe into val and test dataframes. - relative_frac_test = frac_test / (frac_val + frac_test) - df_val, df_test, y_val, y_test = train_test_split( - df_temp, - y_temp, - stratify=y_temp, - test_size=relative_frac_test, - random_state=random_state, - ) - - assert len(df_input) == len(df_train) + len(df_val) + len(df_test) - return df_train, df_val, df_test, y_train, y_val, y_test \ No newline at end of file