From 41c4ab91edd307d6fc96e24670a1b7639b6a9bbb Mon Sep 17 00:00:00 2001 From: "annalyovushkina@yandex.ru" Date: Fri, 15 Nov 2024 22:35:48 +0400 Subject: [PATCH] =?UTF-8?q?=D0=BA=D0=BE=D0=BC=D0=BC=D0=B8=D1=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lab_1/lab1.ipynb | 2 +- lab_4/__pycache__/utils.cpython-312.pyc | Bin 0 -> 2826 bytes lab_4/lab4.ipynb | 240 ++++++++++++++++++++++++ lab_4/utils.py | 79 ++++++++ 4 files changed, 320 insertions(+), 1 deletion(-) create mode 100644 lab_4/__pycache__/utils.cpython-312.pyc create mode 100644 lab_4/lab4.ipynb create mode 100644 lab_4/utils.py diff --git a/lab_1/lab1.ipynb b/lab_1/lab1.ipynb index acd4cbb..716945b 100644 --- a/lab_1/lab1.ipynb +++ b/lab_1/lab1.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [ { diff --git a/lab_4/__pycache__/utils.cpython-312.pyc b/lab_4/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d965a85f693d9f63443297b54e49ad0ec6741b7 GIT binary patch literal 2826 zcma)8O>7fK6yEjP>$M&KBnX00RHg;e7#RZv8k!nV3IxKRfbi2sk*tk(94}e#nw@om zqqP*F9wKqct*Jn2r1p>s4v|__tw1VLFO_<+>7mvNq)ME63nC{@eKX#59Jny{?##}- z_vXFteQ$REiblg6w8YK9sb669l0JAX>_7AT41BF}}|KP9LHFpvEkzk-+qdj+2&ejOMOR(&byKe|zhcLLWcWes0~QRtLx z$s?v*!tju=Oj*@)7A97Xlr`0IL=#&TQ_qW3(QQHK1-M>^ebigqXA5)iX#441;im^V z&g`~$cwkNOQ3=~$5az(jLwEizZ}T>H3oN|dW_UB!p)J__TY6kgv^krf;%RiXpcU{e z*rFY@g9>l^<^W+k@W}trU7PTOq*M#l!nKGkD1qr6;NumGM#7ri_N!w~@_g->3wd)i z*69I`BffJS#3ET<4!K(_&#qy&d+7Fd$PT}!WTYUD0j3l=y4PF;BIxRfHDpKN*Y!@6 z(ORq)uO&Kse&5)low0T3$2#XbW45Dq%nsXeJ3)O-iE=-{97fQ_D0v8y(PddzWP%hR zS%H!ask&t#t7u}3NM({>kQ!jDOQRWCOCyj+lz%B^r(kHBF{|oR2(#%94V zO9aF`*C<*RjS@mJE4ogHZBPAY+tr#F8zV2dJVnx@71$jR2#oY;laP zjp`tx=W*N64N7uq3h*q;%mZ40Z=Lf4DVJ7P*>S&#Q_K_)is_^gj37GJGh~O_Tb=TX zoMuwD04T;|fYhybb||#snQ9$~YbWMXUS$)sr;v6pq!>syEF>!ms+3(M=63CYY9*}& zk1SBVo066i9LXhH)d{uUWa=Dtx;haw>1!t?I$?&+Y11@JC)naIG3hJA5j3njzOrJ% ziN>Y69m%UGC*fK5%EAe6kkXMnCOV;$VVXdMv- zbfU*QRdQ_;j@W|T)gqP?sM0jFbcjW|VGBy@P3=uKKQwV2jzuyd>u_>T$ZBa~c=TfK z^5~?p@6f^Ag{m=gofM~4T^`O1Pe7gJ4o_4pRU?^l)!EH>51jK3PB>F6qvb5!oP8&l z2zHj>BIn@EKF_T|C-p3zZ3O$4s*T{l%E@QJ{m*0Dn_Yv=cyj6J@>liP{#ODYJ@8r( z;^8$;2!+?eoV0!E&ObZ%H{j32>&fHwFRnI{V}Hl4)un6yN{RX5rj%%gl67gvE5R2M z=ZDwB+}@$46k0gAczbzlWvH>|$kT~+!M9gz`1|M2E=(=XE{kgtCnXjS-mN{4_beX0 zSNlO&KKp(2XJPfsPhAa^eHPEwrEEQ(ZN_^a^nBO%ZQuP5R(e-Ie)7rh+268{kN^2; zJ$|JwU0D;k9lMr`tI0q{f+39c)?aOd8!2Pc=M`xhUbTooUlZ}bg5+4bwx<8-5MEpgRtxn=w(ifq21vP7z}yPm)FIYho^S_$oN&vhe&SL}azv|IhDtN(H<=@r;V_Vi z8G{E#UZ`lsu&eHcIXMRY 46\u001b[0m \u001b[43mdf_transformed\u001b[49m, stratify_colname\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAge\u001b[39m\u001b[38;5;124m\"\u001b[39m, frac_train\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.80\u001b[39m, frac_val\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m, frac_test\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.20\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 47\u001b[0m )\n\u001b[0;32m 49\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mX_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, X_train)\n\u001b[0;32m 50\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124my_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, y_train)\n", + "\u001b[1;31mNameError\u001b[0m: name 'df_transformed' is not defined" + ] + } + ], + "source": [ + "from typing import Tuple\n", + "import pandas as pd\n", + "from pandas import DataFrame\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "def split_stratified_into_train_val_test(\n", + " df_input,\n", + " stratify_colname=\"y\",\n", + " frac_train=0.6,\n", + " frac_val=0.15,\n", + " frac_test=0.25,\n", + " random_state=None,\n", + ") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n", + " \n", + " if frac_train + frac_val + frac_test != 1.0:\n", + " raise ValueError(\n", + " \"fractions %f, %f, %f do not add up to 1.0\"\n", + " % (frac_train, frac_val, frac_test)\n", + " )\n", + " if stratify_colname not in df_input.columns:\n", + " raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n", + " X = df_input # Contains all columns.\n", + " y = df_input[\n", + " [stratify_colname]\n", + " ] # Dataframe of just the column on which to stratify.\n", + " # Split original dataframe into train and temp dataframes.\n", + " df_train, df_temp, y_train, y_temp = train_test_split(\n", + " X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n", + " )\n", + " if frac_val <= 0:\n", + " assert len(df_input) == len(df_train) + len(df_temp)\n", + " return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n", + " # Split the temp dataframe into val and test dataframes.\n", + " relative_frac_test = frac_test / (frac_val + frac_test)\n", + " df_val, df_test, y_val, y_test = train_test_split(\n", + " df_temp,\n", + " y_temp,\n", + " stratify=y_temp,\n", + " test_size=relative_frac_test,\n", + " random_state=random_state,\n", + " )\n", + " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n", + " return df_train, df_val, df_test, y_train, y_val, y_test\n", + "\n", + "X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n", + " df, stratify_colname=\"Age\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=None\n", + ")\n", + "\n", + "display(\"X_train\", X_train)\n", + "display(\"y_train\", y_train)\n", + "\n", + "display(\"X_test\", X_test)\n", + "display(\"y_test\", y_test)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/lab_4/utils.py b/lab_4/utils.py new file mode 100644 index 0000000..cb8c396 --- /dev/null +++ b/lab_4/utils.py @@ -0,0 +1,79 @@ +from typing import Tuple + +import pandas as pd +from pandas import DataFrame +from sklearn.model_selection import train_test_split + + +def split_stratified_into_train_val_test( + df_input, + stratify_colname="y", + frac_train=0.6, + frac_val=0.15, + frac_test=0.25, + random_state=None, +) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]: + """ + Splits a Pandas dataframe into three subsets (train, val, and test) + following fractional ratios provided by the user, where each subset is + stratified by the values in a specific column (that is, each subset has + the same relative frequency of the values in the column). It performs this + splitting by running train_test_split() twice. + + Parameters + ---------- + df_input : Pandas dataframe + Input dataframe to be split. + stratify_colname : str + The name of the column that will be used for stratification. Usually + this column would be for the label. + frac_train : float + frac_val : float + frac_test : float + The ratios with which the dataframe will be split into train, val, and + test data. The values should be expressed as float fractions and should + sum to 1.0. + random_state : int, None, or RandomStateInstance + Value to be passed to train_test_split(). + + Returns + ------- + df_train, df_val, df_test : + Dataframes containing the three splits. + """ + + if frac_train + frac_val + frac_test != 1.0: + raise ValueError( + "fractions %f, %f, %f do not add up to 1.0" + % (frac_train, frac_val, frac_test) + ) + + if stratify_colname not in df_input.columns: + raise ValueError("%s is not a column in the dataframe" % (stratify_colname)) + + X = df_input # Contains all columns. + y = df_input[ + [stratify_colname] + ] # Dataframe of just the column on which to stratify. + + # Split original dataframe into train and temp dataframes. + df_train, df_temp, y_train, y_temp = train_test_split( + X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state + ) + + if frac_val <= 0: + assert len(df_input) == len(df_train) + len(df_temp) + return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp + + # Split the temp dataframe into val and test dataframes. + relative_frac_test = frac_test / (frac_val + frac_test) + df_val, df_test, y_val, y_test = train_test_split( + df_temp, + y_temp, + stratify=y_temp, + test_size=relative_frac_test, + random_state=random_state, + ) + + assert len(df_input) == len(df_train) + len(df_val) + len(df_test) + return df_train, df_val, df_test, y_train, y_val, y_test