2024-11-10 14:56:44 +04:00
|
|
|
|
{
|
|
|
|
|
"cells": [
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"Загрузка данных в DataFrame"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
"import pandas as pd\n",
"\n",
"# Load the raw dataset from the CSV file under ../data into a DataFrame.\n",
"df = pd.read_csv(filepath_or_buffer=\"../data/kc_house_data.csv\")"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
"# Preview the first five rows of the loaded frame.\n",
"df.head(n=5)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"Получение сведений о пропущенных данных"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
"# Number of missing values in each column.\n",
"print(df.isna().sum(axis=0))"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
"# Per-column flag: True when the column holds at least one missing value.\n",
"print(df.isna().any(axis=0))"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-11-23 15:06:07 +04:00
|
|
|
|
"execution_count": 3,
|
2024-11-10 14:56:44 +04:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
"# Print the percentage of missing values for every column that has any.\n",
"for column_name in df.columns:\n",
"    missing_count = df[column_name].isnull().sum()\n",
"    missing_percent = missing_count / len(df) * 100\n",
"    if missing_percent > 0:\n",
"        print(f\"{column_name} процент пустых значений: {missing_percent:.2f}%\")"
|
|
|
|
|
]
|
2024-11-23 15:06:07 +04:00
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"Создание выборок данных"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 9,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
"def split_stratified_into_train_val_test(\n",
"    df_input,\n",
"    stratify_colname=\"y\",\n",
"    frac_train=0.6,\n",
"    frac_val=0.15,\n",
"    frac_test=0.25,\n",
"    random_state=None,\n",
"):\n",
"    \"\"\"\n",
"    Split a dataframe into stratified train, validation and test subsets.\n",
"\n",
"    The split is done with two consecutive calls to train_test_split():\n",
"    the first separates the train part from the rest, the second divides\n",
"    the rest into validation and test parts. Both calls stratify on\n",
"    ``stratify_colname`` and receive the same ``random_state``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df_input : pandas.DataFrame\n",
"        Input dataframe to be split. All of its columns, including the\n",
"        stratification column, are kept in the returned subsets.\n",
"    stratify_colname : str\n",
"        Name of the column used for stratification.\n",
"    frac_train : float\n",
"    frac_val : float\n",
"    frac_test : float\n",
"        Fractions of the rows that go to the train, validation and test\n",
"        subsets. They must sum to 1.0 (up to floating-point rounding).\n",
"    random_state : int, RandomState instance or None\n",
"        Passed unchanged to both train_test_split() calls.\n",
"\n",
"    Returns\n",
"    -------\n",
"    df_train, df_val, df_test : pandas.DataFrame\n",
"        The three subsets.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If the fractions do not sum to 1.0 or ``stratify_colname`` is not\n",
"        a column of ``df_input``.\n",
"    \"\"\"\n",
"\n",
"    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 are not exactly\n",
"    # 1.0 in binary floating point and must not be rejected. The negated\n",
"    # \"<=\" form keeps rejecting NaN fractions, as the exact comparison did.\n",
"    sum_tolerance = 1e-9\n",
"    if not abs(frac_train + frac_val + frac_test - 1.0) <= sum_tolerance:\n",
"        raise ValueError(\n",
"            \"fractions %f, %f, %f do not add up to 1.0\"\n",
"            % (frac_train, frac_val, frac_test)\n",
"        )\n",
"\n",
"    if stratify_colname not in df_input.columns:\n",
"        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
"\n",
"    X = df_input  # Contains all columns.\n",
"    y = df_input[[stratify_colname]]  # Single-column frame used for stratification.\n",
"\n",
"    # First pass: separate the train part; everything else goes to a temp frame.\n",
"    df_train, df_temp, _, y_temp = train_test_split(\n",
"        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
"    )\n",
"\n",
"    # Second pass: divide the temp frame. The test share is expressed relative\n",
"    # to the temp frame, not to the whole input.\n",
"    relative_frac_test = frac_test / (frac_val + frac_test)\n",
"    df_val, df_test, _, _ = train_test_split(\n",
"        df_temp,\n",
"        y_temp,\n",
"        stratify=y_temp,\n",
"        test_size=relative_frac_test,\n",
"        random_state=random_state,\n",
"    )\n",
"\n",
"    # Every input row must land in exactly one of the three subsets.\n",
"    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"\n",
"    return df_train, df_val, df_test"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 15,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"[3 5 4 1 2]\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
"# Show which values occur in the stratification column.\n",
"print(df[\"condition\"].unique())\n",
"\n",
"# Keep a fixed subset of columns; .copy() detaches the subset from df.\n",
"data = df.loc[\n",
"    :,\n",
"    [\n",
"        \"price\",\n",
"        \"bedrooms\",\n",
"        \"bathrooms\",\n",
"        \"sqft_living\",\n",
"        \"sqft_lot\",\n",
"        \"floors\",\n",
"        \"view\",\n",
"        \"condition\",\n",
"        \"grade\",\n",
"        \"sqft_above\",\n",
"        \"sqft_basement\",\n",
"        \"yr_built\",\n",
"        \"yr_renovated\",\n",
"        \"zipcode\",\n",
"        \"lat\",\n",
"        \"long\",\n",
"    ],\n",
"].copy()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 16,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"Обучающая выборка: (12967, 16)\n",
|
|
|
|
|
"condition\n",
|
|
|
|
|
"3 8418\n",
|
|
|
|
|
"4 3407\n",
|
|
|
|
|
"5 1021\n",
|
|
|
|
|
"2 103\n",
|
|
|
|
|
"1 18\n",
|
|
|
|
|
"Name: count, dtype: int64\n",
|
|
|
|
|
"Контрольная выборка: (4323, 16)\n",
|
|
|
|
|
"condition\n",
|
|
|
|
|
"3 2806\n",
|
|
|
|
|
"4 1136\n",
|
|
|
|
|
"5 340\n",
|
|
|
|
|
"2 35\n",
|
|
|
|
|
"1 6\n",
|
|
|
|
|
"Name: count, dtype: int64\n",
|
|
|
|
|
"Тестовая выборка: (4323, 16)\n",
|
|
|
|
|
"condition\n",
|
|
|
|
|
"3 2807\n",
|
|
|
|
|
"4 1136\n",
|
|
|
|
|
"5 340\n",
|
|
|
|
|
"2 34\n",
|
|
|
|
|
"1 6\n",
|
|
|
|
|
"Name: count, dtype: int64\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
"# Stratify on the condition column so all three subsets keep its class\n",
"# proportions. A fixed random_state makes the split identical on every run.\n",
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
"    data,\n",
"    stratify_colname=\"condition\",\n",
"    frac_train=0.60,\n",
"    frac_val=0.20,\n",
"    frac_test=0.20,\n",
"    random_state=42,\n",
")\n",
"\n",
"# Report the shape and the class distribution of each subset.\n",
"for subset_label, subset in (\n",
"    (\"Обучающая выборка: \", df_train),\n",
"    (\"Контрольная выборка: \", df_val),\n",
"    (\"Тестовая выборка: \", df_test),\n",
"):\n",
"    print(subset_label, subset.shape)\n",
"    print(subset.condition.value_counts())"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 18,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"Обучающая выборка: (12967, 16)\n",
|
|
|
|
|
"condition\n",
|
|
|
|
|
"3 8418\n",
|
|
|
|
|
"4 3407\n",
|
|
|
|
|
"5 1021\n",
|
|
|
|
|
"2 103\n",
|
|
|
|
|
"1 18\n",
|
|
|
|
|
"Name: count, dtype: int64\n",
|
|
|
|
|
"Обучающая выборка после oversampling: (42073, 16)\n",
|
|
|
|
|
"condition\n",
|
|
|
|
|
"5 8464\n",
|
|
|
|
|
"2 8421\n",
|
|
|
|
|
"1 8420\n",
|
|
|
|
|
"3 8418\n",
|
|
|
|
|
"4 8350\n",
|
|
|
|
|
"Name: count, dtype: int64\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
"from imblearn.over_sampling import ADASYN\n",
"\n",
"# Seed the sampler so the synthetic rows are the same on every run.\n",
"ada = ADASYN(random_state=42)\n",
"\n",
"print(\"Обучающая выборка: \", df_train.shape)\n",
"print(df_train.condition.value_counts())\n",
"\n",
"# The target must not be part of the feature matrix handed to the sampler,\n",
"# otherwise it takes part in the nearest-neighbour distances.\n",
"X_train = df_train.drop(columns=[\"condition\"])\n",
"y_train = df_train[\"condition\"]\n",
"X_resampled, y_resampled = ada.fit_resample(X_train, y_train)\n",
"\n",
"# Re-attach the resampled target and restore the original column order.\n",
"df_train_adasyn = (\n",
"    pd.DataFrame(X_resampled, columns=X_train.columns)\n",
"    .assign(condition=pd.Series(y_resampled).to_numpy())\n",
"    .loc[:, df_train.columns]\n",
")\n",
"\n",
"print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
"print(df_train_adasyn.condition.value_counts())"
|
|
|
|
|
]
|
2024-11-10 14:56:44 +04:00
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"kernelspec": {
|
|
|
|
|
"display_name": ".venv",
|
|
|
|
|
"language": "python",
|
|
|
|
|
"name": "python3"
|
|
|
|
|
},
|
|
|
|
|
"language_info": {
|
|
|
|
|
"codemirror_mode": {
|
|
|
|
|
"name": "ipython",
|
|
|
|
|
"version": 3
|
|
|
|
|
},
|
|
|
|
|
"file_extension": ".py",
|
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
|
"name": "python",
|
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
|
"version": "3.12.7"
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"nbformat": 4,
|
|
|
|
|
"nbformat_minor": 2
|
|
|
|
|
}
|