{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Загрузка данных в DataFrame" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "df = pd.read_csv(\"../data/kc_house_data.csv\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Получение сведений о пропущенных данных" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(df.isnull().sum())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(df.isnull().any())" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "for i in df.columns:\n", " null_rate = df[i].isnull().sum() / len(df) * 100\n", " if null_rate > 0:\n", " print(f\"{i} процент пустых значений: {null_rate:.2f}%\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Создание выборок данных" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "\n", "def split_stratified_into_train_val_test(\n", " df_input,\n", " stratify_colname=\"y\",\n", " frac_train=0.6,\n", " frac_val=0.15,\n", " frac_test=0.25,\n", " random_state=None,\n", "):\n", " \"\"\"\n", " Splits a Pandas dataframe into three subsets (train, val, and test)\n", " following fractional ratios provided by the user, where each subset is\n", " stratified by the values in a specific column (that is, each subset has\n", " the same relative frequency of the values in the column). It performs this\n", " splitting by running train_test_split() twice.\n", "\n", " Parameters\n", " ----------\n", " df_input : Pandas dataframe\n", " Input dataframe to be split.\n", " stratify_colname : str\n", " The name of the column that will be used for stratification. Usually\n", " this column would be for the label.\n", " frac_train : float\n", " frac_val : float\n", " frac_test : float\n", " The ratios with which the dataframe will be split into train, val, and\n", " test data. The values should be expressed as float fractions and should\n", " sum to 1.0.\n", " random_state : int, None, or RandomStateInstance\n", " Value to be passed to train_test_split().\n", "\n", " Returns\n", " -------\n", " df_train, df_val, df_test :\n", " Dataframes containing the three splits.\n", " \"\"\"\n", "\n", " if frac_train + frac_val + frac_test != 1.0:\n", " raise ValueError(\n", " \"fractions %f, %f, %f do not add up to 1.0\"\n", " % (frac_train, frac_val, frac_test)\n", " )\n", "\n", " if stratify_colname not in df_input.columns:\n", " raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n", "\n", " X = df_input # Contains all columns.\n", " y = df_input[\n", " [stratify_colname]\n", " ] # Dataframe of just the column on which to stratify.\n", "\n", " # Split original dataframe into train and temp dataframes.\n", " df_train, df_temp, y_train, y_temp = train_test_split(\n", " X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n", " )\n", "\n", " # Split the temp dataframe into val and test dataframes.\n", " relative_frac_test = frac_test / (frac_val + frac_test)\n", " df_val, df_test, y_val, y_test = train_test_split(\n", " df_temp,\n", " y_temp,\n", " stratify=y_temp,\n", " test_size=relative_frac_test,\n", " random_state=random_state,\n", " )\n", "\n", " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n", "\n", " return df_train, df_val, df_test" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[3 5 4 1 2]\n" ] } ], "source": [ "print(df.condition.unique())\n", "\n", "data = df[\n", " [\n", " \"price\",\n", " \"bedrooms\",\n", " \"bathrooms\",\n", " \"sqft_living\",\n", " \"sqft_lot\",\n", " \"floors\",\n", " \"view\",\n", " \"condition\",\n", " \"grade\",\n", " \"sqft_above\",\n", " \"sqft_basement\",\n", " \"yr_built\",\n", " \"yr_renovated\",\n", " \"zipcode\",\n", " \"lat\",\n", " \"long\",\n", " ]\n", "].copy()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Обучающая выборка: (12967, 16)\n", "condition\n", "3 8418\n", "4 3407\n", "5 1021\n", "2 103\n", "1 18\n", "Name: count, dtype: int64\n", "Контрольная выборка: (4323, 16)\n", "condition\n", "3 2806\n", "4 1136\n", "5 340\n", "2 35\n", "1 6\n", "Name: count, dtype: int64\n", "Тестовая выборка: (4323, 16)\n", "condition\n", "3 2807\n", "4 1136\n", "5 340\n", "2 34\n", "1 6\n", "Name: count, dtype: int64\n" ] } ], "source": [ "df_train, df_val, df_test = split_stratified_into_train_val_test(\n", " data,\n", " stratify_colname=\"condition\",\n", " frac_train=0.60,\n", " frac_val=0.20,\n", " frac_test=0.20,\n", ")\n", "\n", "print(\"Обучающая выборка: \", df_train.shape)\n", "print(df_train.condition.value_counts())\n", "\n", "print(\"Контрольная выборка: \", df_val.shape)\n", "print(df_val.condition.value_counts())\n", "\n", "print(\"Тестовая выборка: \", df_test.shape)\n", "print(df_test.condition.value_counts())" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Обучающая выборка: (12967, 16)\n", "condition\n", "3 8418\n", "4 3407\n", "5 1021\n", "2 103\n", "1 18\n", "Name: count, dtype: int64\n", "Обучающая выборка после oversampling: (42073, 16)\n", "condition\n", "5 8464\n", "2 8421\n", "1 8420\n", "3 8418\n", "4 8350\n", "Name: count, dtype: int64\n" ] } ], "source": [ "from imblearn.over_sampling import ADASYN\n", "\n", "ada = ADASYN()\n", "\n", "print(\"Обучающая выборка: \", df_train.shape)\n", "print(df_train.condition.value_counts())\n", "\n", "X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"condition\"])\n", "df_train_adasyn = pd.DataFrame(X_resampled)\n", "\n", "print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n", "print(df_train_adasyn.condition.value_counts())" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.7" } }, "nbformat": 4, "nbformat_minor": 2 }