1323 lines
44 KiB
Plaintext
1323 lines
44 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Загрузка данных в DataFrame"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"Index: 21613 entries, 7129300520 to 1523300157\n",
|
||
"Data columns (total 20 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 date 21613 non-null object \n",
|
||
" 1 price 21607 non-null float64\n",
|
||
" 2 bedrooms 21613 non-null int64 \n",
|
||
" 3 bathrooms 21613 non-null float64\n",
|
||
" 4 sqft_living 21613 non-null int64 \n",
|
||
" 5 sqft_lot 21613 non-null int64 \n",
|
||
" 6 floors 21613 non-null float64\n",
|
||
" 7 waterfront 21613 non-null int64 \n",
|
||
" 8 view 21613 non-null int64 \n",
|
||
" 9 condition 21613 non-null int64 \n",
|
||
" 10 grade 21613 non-null int64 \n",
|
||
" 11 sqft_above 21613 non-null int64 \n",
|
||
" 12 sqft_basement 21613 non-null int64 \n",
|
||
" 13 yr_built 21613 non-null int64 \n",
|
||
" 14 yr_renovated 21613 non-null int64 \n",
|
||
" 15 zipcode 21613 non-null int64 \n",
|
||
" 16 lat 21613 non-null float64\n",
|
||
" 17 long 21613 non-null float64\n",
|
||
" 18 sqft_living15 21613 non-null int64 \n",
|
||
" 19 sqft_lot15 21613 non-null int64 \n",
|
||
"dtypes: float64(5), int64(14), object(1)\n",
|
||
"memory usage: 3.5+ MB\n",
|
||
"(21613, 20)\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>date</th>\n",
|
||
" <th>price</th>\n",
|
||
" <th>bedrooms</th>\n",
|
||
" <th>bathrooms</th>\n",
|
||
" <th>sqft_living</th>\n",
|
||
" <th>sqft_lot</th>\n",
|
||
" <th>floors</th>\n",
|
||
" <th>waterfront</th>\n",
|
||
" <th>view</th>\n",
|
||
" <th>condition</th>\n",
|
||
" <th>grade</th>\n",
|
||
" <th>sqft_above</th>\n",
|
||
" <th>sqft_basement</th>\n",
|
||
" <th>yr_built</th>\n",
|
||
" <th>yr_renovated</th>\n",
|
||
" <th>zipcode</th>\n",
|
||
" <th>lat</th>\n",
|
||
" <th>long</th>\n",
|
||
" <th>sqft_living15</th>\n",
|
||
" <th>sqft_lot15</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>id</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>7129300520</th>\n",
|
||
" <td>20141013T000000</td>\n",
|
||
" <td>221900.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1180</td>\n",
|
||
" <td>5650</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1180</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1955</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98178</td>\n",
|
||
" <td>47.5112</td>\n",
|
||
" <td>-122.257</td>\n",
|
||
" <td>1340</td>\n",
|
||
" <td>5650</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6414100192</th>\n",
|
||
" <td>20141209T000000</td>\n",
|
||
" <td>538000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.25</td>\n",
|
||
" <td>2570</td>\n",
|
||
" <td>7242</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>2170</td>\n",
|
||
" <td>400</td>\n",
|
||
" <td>1951</td>\n",
|
||
" <td>1991</td>\n",
|
||
" <td>98125</td>\n",
|
||
" <td>47.7210</td>\n",
|
||
" <td>-122.319</td>\n",
|
||
" <td>1690</td>\n",
|
||
" <td>7639</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5631500400</th>\n",
|
||
" <td>20150225T000000</td>\n",
|
||
" <td>180000.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>770</td>\n",
|
||
" <td>10000</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>770</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1933</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98028</td>\n",
|
||
" <td>47.7379</td>\n",
|
||
" <td>-122.233</td>\n",
|
||
" <td>2720</td>\n",
|
||
" <td>8062</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2487200875</th>\n",
|
||
" <td>20141209T000000</td>\n",
|
||
" <td>604000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>3.00</td>\n",
|
||
" <td>1960</td>\n",
|
||
" <td>5000</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1050</td>\n",
|
||
" <td>910</td>\n",
|
||
" <td>1965</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98136</td>\n",
|
||
" <td>47.5208</td>\n",
|
||
" <td>-122.393</td>\n",
|
||
" <td>1360</td>\n",
|
||
" <td>5000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1954400510</th>\n",
|
||
" <td>20150218T000000</td>\n",
|
||
" <td>510000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.00</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>8080</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1987</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98074</td>\n",
|
||
" <td>47.6168</td>\n",
|
||
" <td>-122.045</td>\n",
|
||
" <td>1800</td>\n",
|
||
" <td>7503</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" date price bedrooms bathrooms sqft_living \\\n",
|
||
"id \n",
|
||
"7129300520 20141013T000000 221900.0 3 1.00 1180 \n",
|
||
"6414100192 20141209T000000 538000.0 3 2.25 2570 \n",
|
||
"5631500400 20150225T000000 180000.0 2 1.00 770 \n",
|
||
"2487200875 20141209T000000 604000.0 4 3.00 1960 \n",
|
||
"1954400510 20150218T000000 510000.0 3 2.00 1680 \n",
|
||
"\n",
|
||
" sqft_lot floors waterfront view condition grade sqft_above \\\n",
|
||
"id \n",
|
||
"7129300520 5650 1.0 0 0 3 7 1180 \n",
|
||
"6414100192 7242 2.0 0 0 3 7 2170 \n",
|
||
"5631500400 10000 1.0 0 0 3 6 770 \n",
|
||
"2487200875 5000 1.0 0 0 5 7 1050 \n",
|
||
"1954400510 8080 1.0 0 0 3 8 1680 \n",
|
||
"\n",
|
||
" sqft_basement yr_built yr_renovated zipcode lat long \\\n",
|
||
"id \n",
|
||
"7129300520 0 1955 0 98178 47.5112 -122.257 \n",
|
||
"6414100192 400 1951 1991 98125 47.7210 -122.319 \n",
|
||
"5631500400 0 1933 0 98028 47.7379 -122.233 \n",
|
||
"2487200875 910 1965 0 98136 47.5208 -122.393 \n",
|
||
"1954400510 0 1987 0 98074 47.6168 -122.045 \n",
|
||
"\n",
|
||
" sqft_living15 sqft_lot15 \n",
|
||
"id \n",
|
||
"7129300520 1340 5650 \n",
|
||
"6414100192 1690 7639 \n",
|
||
"5631500400 2720 8062 \n",
|
||
"2487200875 1360 5000 \n",
|
||
"1954400510 1800 7503 "
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"df1 = pd.read_csv(\"../data/kc_house_data.csv\", index_col=\"id\")\n",
|
||
"\n",
|
||
"df1.info()\n",
|
||
"\n",
|
||
"print(df1.shape)\n",
|
||
"\n",
|
||
"df1.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Получение сведений о пропущенных данных"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Типы пропущенных данных:\n",
|
||
"- None - представление пустых данных в Python\n",
|
||
"- NaN - представление пустых данных в Pandas\n",
|
||
"- '' - пустая строка"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"date 0\n",
|
||
"price 6\n",
|
||
"bedrooms 0\n",
|
||
"bathrooms 0\n",
|
||
"sqft_living 0\n",
|
||
"sqft_lot 0\n",
|
||
"floors 0\n",
|
||
"waterfront 0\n",
|
||
"view 0\n",
|
||
"condition 0\n",
|
||
"grade 0\n",
|
||
"sqft_above 0\n",
|
||
"sqft_basement 0\n",
|
||
"yr_built 0\n",
|
||
"yr_renovated 0\n",
|
||
"zipcode 0\n",
|
||
"lat 0\n",
|
||
"long 0\n",
|
||
"sqft_living15 0\n",
|
||
"sqft_lot15 0\n",
|
||
"dtype: int64\n",
|
||
"\n",
|
||
"date False\n",
|
||
"price True\n",
|
||
"bedrooms False\n",
|
||
"bathrooms False\n",
|
||
"sqft_living False\n",
|
||
"sqft_lot False\n",
|
||
"floors False\n",
|
||
"waterfront False\n",
|
||
"view False\n",
|
||
"condition False\n",
|
||
"grade False\n",
|
||
"sqft_above False\n",
|
||
"sqft_basement False\n",
|
||
"yr_built False\n",
|
||
"yr_renovated False\n",
|
||
"zipcode False\n",
|
||
"lat False\n",
|
||
"long False\n",
|
||
"sqft_living15 False\n",
|
||
"sqft_lot15 False\n",
|
||
"dtype: bool\n",
|
||
"\n",
|
||
"price процент пустых значений: %0.03\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Количество пустых значений признаков\n",
|
||
"print(df1.isnull().sum())\n",
|
||
"\n",
|
||
"print()\n",
|
||
"\n",
|
||
"# Есть ли пустые значения признаков\n",
|
||
"print(df1.isnull().any())\n",
|
||
"\n",
|
||
"print()\n",
|
||
"\n",
|
||
"# Процент пустых значений признаков\n",
|
||
"for i in df1.columns:\n",
|
||
" null_rate = df1[i].isnull().sum() / len(df1) * 100\n",
|
||
" if null_rate > 0:\n",
|
||
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Заполнение пропущенных данных\n",
|
||
"\n",
|
||
"https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values\n",
|
||
"\n",
|
||
"https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"(21613, 21)\n",
|
||
"date False\n",
|
||
"price False\n",
|
||
"bedrooms False\n",
|
||
"bathrooms False\n",
|
||
"sqft_living False\n",
|
||
"sqft_lot False\n",
|
||
"floors False\n",
|
||
"waterfront False\n",
|
||
"view False\n",
|
||
"condition False\n",
|
||
"grade False\n",
|
||
"sqft_above False\n",
|
||
"sqft_basement False\n",
|
||
"yr_built False\n",
|
||
"yr_renovated False\n",
|
||
"zipcode False\n",
|
||
"lat False\n",
|
||
"long False\n",
|
||
"sqft_living15 False\n",
|
||
"sqft_lot15 False\n",
|
||
"priceFillNa False\n",
|
||
"dtype: bool\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>date</th>\n",
|
||
" <th>price</th>\n",
|
||
" <th>bedrooms</th>\n",
|
||
" <th>bathrooms</th>\n",
|
||
" <th>sqft_living</th>\n",
|
||
" <th>sqft_lot</th>\n",
|
||
" <th>floors</th>\n",
|
||
" <th>waterfront</th>\n",
|
||
" <th>view</th>\n",
|
||
" <th>condition</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>sqft_basement</th>\n",
|
||
" <th>yr_built</th>\n",
|
||
" <th>yr_renovated</th>\n",
|
||
" <th>zipcode</th>\n",
|
||
" <th>lat</th>\n",
|
||
" <th>long</th>\n",
|
||
" <th>sqft_living15</th>\n",
|
||
" <th>sqft_lot15</th>\n",
|
||
" <th>priceFillNa</th>\n",
|
||
" <th>priceFillMedian</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>id</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>263000018</th>\n",
|
||
" <td>20140521T000000</td>\n",
|
||
" <td>360000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>1530</td>\n",
|
||
" <td>1131</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2009</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98103</td>\n",
|
||
" <td>47.6993</td>\n",
|
||
" <td>-122.346</td>\n",
|
||
" <td>1530</td>\n",
|
||
" <td>1509</td>\n",
|
||
" <td>360000.0</td>\n",
|
||
" <td>360000.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6600060120</th>\n",
|
||
" <td>20150223T000000</td>\n",
|
||
" <td>400000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>2310</td>\n",
|
||
" <td>5813</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2014</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98146</td>\n",
|
||
" <td>47.5107</td>\n",
|
||
" <td>-122.362</td>\n",
|
||
" <td>1830</td>\n",
|
||
" <td>7200</td>\n",
|
||
" <td>400000.0</td>\n",
|
||
" <td>400000.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1523300141</th>\n",
|
||
" <td>20140623T000000</td>\n",
|
||
" <td>402101.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>0.75</td>\n",
|
||
" <td>1020</td>\n",
|
||
" <td>1350</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2009</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98144</td>\n",
|
||
" <td>47.5944</td>\n",
|
||
" <td>-122.299</td>\n",
|
||
" <td>1020</td>\n",
|
||
" <td>2007</td>\n",
|
||
" <td>402101.0</td>\n",
|
||
" <td>402101.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>291310100</th>\n",
|
||
" <td>20150116T000000</td>\n",
|
||
" <td>400000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>1600</td>\n",
|
||
" <td>2388</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2004</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98027</td>\n",
|
||
" <td>47.5345</td>\n",
|
||
" <td>-122.069</td>\n",
|
||
" <td>1410</td>\n",
|
||
" <td>1287</td>\n",
|
||
" <td>400000.0</td>\n",
|
||
" <td>400000.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1523300157</th>\n",
|
||
" <td>20141015T000000</td>\n",
|
||
" <td>325000.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>0.75</td>\n",
|
||
" <td>1020</td>\n",
|
||
" <td>1076</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2008</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98144</td>\n",
|
||
" <td>47.5941</td>\n",
|
||
" <td>-122.299</td>\n",
|
||
" <td>1020</td>\n",
|
||
" <td>1357</td>\n",
|
||
" <td>325000.0</td>\n",
|
||
" <td>325000.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 22 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" date price bedrooms bathrooms sqft_living \\\n",
|
||
"id \n",
|
||
"263000018 20140521T000000 360000.0 3 2.50 1530 \n",
|
||
"6600060120 20150223T000000 400000.0 4 2.50 2310 \n",
|
||
"1523300141 20140623T000000 402101.0 2 0.75 1020 \n",
|
||
"291310100 20150116T000000 400000.0 3 2.50 1600 \n",
|
||
"1523300157 20141015T000000 325000.0 2 0.75 1020 \n",
|
||
"\n",
|
||
" sqft_lot floors waterfront view condition ... sqft_basement \\\n",
|
||
"id ... \n",
|
||
"263000018 1131 3.0 0 0 3 ... 0 \n",
|
||
"6600060120 5813 2.0 0 0 3 ... 0 \n",
|
||
"1523300141 1350 2.0 0 0 3 ... 0 \n",
|
||
"291310100 2388 2.0 0 0 3 ... 0 \n",
|
||
"1523300157 1076 2.0 0 0 3 ... 0 \n",
|
||
"\n",
|
||
" yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
|
||
"id \n",
|
||
"263000018 2009 0 98103 47.6993 -122.346 1530 \n",
|
||
"6600060120 2014 0 98146 47.5107 -122.362 1830 \n",
|
||
"1523300141 2009 0 98144 47.5944 -122.299 1020 \n",
|
||
"291310100 2004 0 98027 47.5345 -122.069 1410 \n",
|
||
"1523300157 2008 0 98144 47.5941 -122.299 1020 \n",
|
||
"\n",
|
||
" sqft_lot15 priceFillNa priceFillMedian \n",
|
||
"id \n",
|
||
"263000018 1509 360000.0 360000.0 \n",
|
||
"6600060120 7200 400000.0 400000.0 \n",
|
||
"1523300141 2007 402101.0 402101.0 \n",
|
||
"291310100 1287 400000.0 400000.0 \n",
|
||
"1523300157 1357 325000.0 325000.0 \n",
|
||
"\n",
|
||
"[5 rows x 22 columns]"
|
||
]
|
||
},
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"fillna_df = df1.fillna(0)\n",
|
||
"\n",
|
||
"print(fillna_df.shape)\n",
|
||
"\n",
|
||
"print(fillna_df.isnull().any())\n",
|
||
"\n",
|
||
"# Замена пустых данных на 0\n",
|
||
"df1[\"priceFillNa\"] = df1[\"price\"].fillna(0)\n",
|
||
"\n",
|
||
"# Замена пустых данных на медиану\n",
|
||
"df1[\"priceFillMedian\"] = df1[\"price\"].fillna(df1[\"priceFillNa\"].median())\n",
|
||
"\n",
|
||
"df1.tail()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>date</th>\n",
|
||
" <th>price</th>\n",
|
||
" <th>bedrooms</th>\n",
|
||
" <th>bathrooms</th>\n",
|
||
" <th>sqft_living</th>\n",
|
||
" <th>sqft_lot</th>\n",
|
||
" <th>floors</th>\n",
|
||
" <th>waterfront</th>\n",
|
||
" <th>view</th>\n",
|
||
" <th>condition</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>yr_built</th>\n",
|
||
" <th>yr_renovated</th>\n",
|
||
" <th>zipcode</th>\n",
|
||
" <th>lat</th>\n",
|
||
" <th>long</th>\n",
|
||
" <th>sqft_living15</th>\n",
|
||
" <th>sqft_lot15</th>\n",
|
||
" <th>priceFillNa</th>\n",
|
||
" <th>priceFillMedian</th>\n",
|
||
" <th>PriceCopy</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>id</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>263000018</th>\n",
|
||
" <td>20140521T000000</td>\n",
|
||
" <td>360000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>1530</td>\n",
|
||
" <td>1131</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2009</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98103</td>\n",
|
||
" <td>47.6993</td>\n",
|
||
" <td>-122.346</td>\n",
|
||
" <td>1530</td>\n",
|
||
" <td>1509</td>\n",
|
||
" <td>360000.0</td>\n",
|
||
" <td>360000.0</td>\n",
|
||
" <td>360000.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6600060120</th>\n",
|
||
" <td>20150223T000000</td>\n",
|
||
" <td>400000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>2310</td>\n",
|
||
" <td>5813</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2014</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98146</td>\n",
|
||
" <td>47.5107</td>\n",
|
||
" <td>-122.362</td>\n",
|
||
" <td>1830</td>\n",
|
||
" <td>7200</td>\n",
|
||
" <td>400000.0</td>\n",
|
||
" <td>400000.0</td>\n",
|
||
" <td>400000.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1523300141</th>\n",
|
||
" <td>20140623T000000</td>\n",
|
||
" <td>402101.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>0.75</td>\n",
|
||
" <td>1020</td>\n",
|
||
" <td>1350</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2009</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98144</td>\n",
|
||
" <td>47.5944</td>\n",
|
||
" <td>-122.299</td>\n",
|
||
" <td>1020</td>\n",
|
||
" <td>2007</td>\n",
|
||
" <td>402101.0</td>\n",
|
||
" <td>402101.0</td>\n",
|
||
" <td>402101.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>291310100</th>\n",
|
||
" <td>20150116T000000</td>\n",
|
||
" <td>400000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>1600</td>\n",
|
||
" <td>2388</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2004</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98027</td>\n",
|
||
" <td>47.5345</td>\n",
|
||
" <td>-122.069</td>\n",
|
||
" <td>1410</td>\n",
|
||
" <td>1287</td>\n",
|
||
" <td>400000.0</td>\n",
|
||
" <td>400000.0</td>\n",
|
||
" <td>400000.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1523300157</th>\n",
|
||
" <td>20141015T000000</td>\n",
|
||
" <td>325000.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>0.75</td>\n",
|
||
" <td>1020</td>\n",
|
||
" <td>1076</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2008</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98144</td>\n",
|
||
" <td>47.5941</td>\n",
|
||
" <td>-122.299</td>\n",
|
||
" <td>1020</td>\n",
|
||
" <td>1357</td>\n",
|
||
" <td>325000.0</td>\n",
|
||
" <td>325000.0</td>\n",
|
||
" <td>325000.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 23 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" date price bedrooms bathrooms sqft_living \\\n",
|
||
"id \n",
|
||
"263000018 20140521T000000 360000.0 3 2.50 1530 \n",
|
||
"6600060120 20150223T000000 400000.0 4 2.50 2310 \n",
|
||
"1523300141 20140623T000000 402101.0 2 0.75 1020 \n",
|
||
"291310100 20150116T000000 400000.0 3 2.50 1600 \n",
|
||
"1523300157 20141015T000000 325000.0 2 0.75 1020 \n",
|
||
"\n",
|
||
" sqft_lot floors waterfront view condition ... yr_built \\\n",
|
||
"id ... \n",
|
||
"263000018 1131 3.0 0 0 3 ... 2009 \n",
|
||
"6600060120 5813 2.0 0 0 3 ... 2014 \n",
|
||
"1523300141 1350 2.0 0 0 3 ... 2009 \n",
|
||
"291310100 2388 2.0 0 0 3 ... 2004 \n",
|
||
"1523300157 1076 2.0 0 0 3 ... 2008 \n",
|
||
"\n",
|
||
" yr_renovated zipcode lat long sqft_living15 \\\n",
|
||
"id \n",
|
||
"263000018 0 98103 47.6993 -122.346 1530 \n",
|
||
"6600060120 0 98146 47.5107 -122.362 1830 \n",
|
||
"1523300141 0 98144 47.5944 -122.299 1020 \n",
|
||
"291310100 0 98027 47.5345 -122.069 1410 \n",
|
||
"1523300157 0 98144 47.5941 -122.299 1020 \n",
|
||
"\n",
|
||
" sqft_lot15 priceFillNa priceFillMedian PriceCopy \n",
|
||
"id \n",
|
||
"263000018 1509 360000.0 360000.0 360000.0 \n",
|
||
"6600060120 7200 400000.0 400000.0 400000.0 \n",
|
||
"1523300141 2007 402101.0 402101.0 402101.0 \n",
|
||
"291310100 1287 400000.0 400000.0 400000.0 \n",
|
||
"1523300157 1357 325000.0 325000.0 325000.0 \n",
|
||
"\n",
|
||
"[5 rows x 23 columns]"
|
||
]
|
||
},
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1[\"PriceCopy\"] = df1[\"price\"]\n",
|
||
"\n",
|
||
"# Замена данных сразу в DataFrame без копирования\n",
|
||
"df1.fillna({\"PriceCopy\": 0}, inplace=True)\n",
|
||
"\n",
|
||
"df1.tail()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Удаление наблюдений с пропусками"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"(21607, 23)\n",
|
||
"date False\n",
|
||
"price False\n",
|
||
"bedrooms False\n",
|
||
"bathrooms False\n",
|
||
"sqft_living False\n",
|
||
"sqft_lot False\n",
|
||
"floors False\n",
|
||
"waterfront False\n",
|
||
"view False\n",
|
||
"condition False\n",
|
||
"grade False\n",
|
||
"sqft_above False\n",
|
||
"sqft_basement False\n",
|
||
"yr_built False\n",
|
||
"yr_renovated False\n",
|
||
"zipcode False\n",
|
||
"lat False\n",
|
||
"long False\n",
|
||
"sqft_living15 False\n",
|
||
"sqft_lot15 False\n",
|
||
"priceFillNa False\n",
|
||
"dtype: bool\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"dropna_df = df1.dropna()\n",
|
||
"\n",
|
||
"print(dropna_df.shape)\n",
|
||
"\n",
|
||
"print(fillna_df.isnull().any())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Создание выборок данных\n",
|
||
"\n",
|
||
"Библиотека scikit-learn\n",
|
||
"\n",
|
||
"https://scikit-learn.org/stable/index.html"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"<img src=\"assets/lec2-split.png\" width=\"600\" style=\"background-color: white\">"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Функция для создания выборок\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"\n",
|
||
"\n",
|
||
"def split_stratified_into_train_val_test(\n",
|
||
" df_input,\n",
|
||
" stratify_colname=\"y\",\n",
|
||
" frac_train=0.6,\n",
|
||
" frac_val=0.15,\n",
|
||
" frac_test=0.25,\n",
|
||
" random_state=None,\n",
|
||
"):\n",
|
||
" \"\"\"\n",
|
||
" Splits a Pandas dataframe into three subsets (train, val, and test)\n",
|
||
" following fractional ratios provided by the user, where each subset is\n",
|
||
" stratified by the values in a specific column (that is, each subset has\n",
|
||
" the same relative frequency of the values in the column). It performs this\n",
|
||
" splitting by running train_test_split() twice.\n",
|
||
"\n",
|
||
" Parameters\n",
|
||
" ----------\n",
|
||
" df_input : Pandas dataframe\n",
|
||
" Input dataframe to be split.\n",
|
||
" stratify_colname : str\n",
|
||
" The name of the column that will be used for stratification. Usually\n",
|
||
" this column would be for the label.\n",
|
||
" frac_train : float\n",
|
||
" frac_val : float\n",
|
||
" frac_test : float\n",
|
||
" The ratios with which the dataframe will be split into train, val, and\n",
|
||
" test data. The values should be expressed as float fractions and should\n",
|
||
" sum to 1.0.\n",
|
||
" random_state : int, None, or RandomStateInstance\n",
|
||
" Value to be passed to train_test_split().\n",
|
||
"\n",
|
||
" Returns\n",
|
||
" -------\n",
|
||
" df_train, df_val, df_test :\n",
|
||
" Dataframes containing the three splits.\n",
|
||
" \"\"\"\n",
|
||
"\n",
|
||
" if frac_train + frac_val + frac_test != 1.0:\n",
|
||
" raise ValueError(\n",
|
||
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
|
||
" % (frac_train, frac_val, frac_test)\n",
|
||
" )\n",
|
||
"\n",
|
||
" if stratify_colname not in df_input.columns:\n",
|
||
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
|
||
"\n",
|
||
" X = df_input # Contains all columns.\n",
|
||
" y = df_input[\n",
|
||
" [stratify_colname]\n",
|
||
" ] # Dataframe of just the column on which to stratify.\n",
|
||
"\n",
|
||
" # Split original dataframe into train and temp dataframes.\n",
|
||
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
|
||
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
|
||
" )\n",
|
||
"\n",
|
||
" # Split the temp dataframe into val and test dataframes.\n",
|
||
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
|
||
" df_val, df_test, y_val, y_test = train_test_split(\n",
|
||
" df_temp,\n",
|
||
" y_temp,\n",
|
||
" stratify=y_temp,\n",
|
||
" test_size=relative_frac_test,\n",
|
||
" random_state=random_state,\n",
|
||
" )\n",
|
||
"\n",
|
||
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
|
||
"\n",
|
||
" return df_train, df_val, df_test"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 27,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"waterfront\n",
|
||
"0 21450\n",
|
||
"1 163\n",
|
||
"Name: count, dtype: int64\n",
|
||
"Обучающая выборка: (12967, 3)\n",
|
||
"waterfront\n",
|
||
"0 12869\n",
|
||
"1 98\n",
|
||
"Name: count, dtype: int64\n",
|
||
"Контрольная выборка: (4323, 3)\n",
|
||
"waterfront\n",
|
||
"0 4290\n",
|
||
"1 33\n",
|
||
"Name: count, dtype: int64\n",
|
||
"Тестовая выборка: (4323, 3)\n",
|
||
"waterfront\n",
|
||
"0 4291\n",
|
||
"1 32\n",
|
||
"Name: count, dtype: int64\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Вывод распределения количества наблюдений по меткам (классам)\n",
|
||
"print(df1.waterfront.value_counts())\n",
|
||
"\n",
|
||
"data = df1[[\"waterfront\", \"priceFillMedian\", \"bedrooms\"]].copy()\n",
|
||
"\n",
|
||
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
|
||
" data, stratify_colname=\"waterfront\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
|
||
")\n",
|
||
"\n",
|
||
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
||
"print(df_train.waterfront.value_counts())\n",
|
||
"\n",
|
||
"print(\"Контрольная выборка: \", df_val.shape)\n",
|
||
"print(df_val.waterfront.value_counts())\n",
|
||
"\n",
|
||
"print(\"Тестовая выборка: \", df_test.shape)\n",
|
||
"print(df_test.waterfront.value_counts())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Выборка с избытком (oversampling)\n",
|
||
"\n",
|
||
"https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/\n",
|
||
"\n",
|
||
"https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/\n",
|
||
"\n",
|
||
"Выборка с недостатком (undersampling)\n",
|
||
"\n",
|
||
"https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/\n",
|
||
"\n",
|
||
"Библиотека imbalanced-learn\n",
|
||
"\n",
|
||
"https://imbalanced-learn.org/stable/"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 28,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Обучающая выборка: (12967, 3)\n",
|
||
"waterfront\n",
|
||
"0 12869\n",
|
||
"1 98\n",
|
||
"Name: count, dtype: int64\n",
|
||
"Обучающая выборка после oversampling: (25754, 3)\n",
|
||
"waterfront\n",
|
||
"1 12885\n",
|
||
"0 12869\n",
|
||
"Name: count, dtype: int64\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>waterfront</th>\n",
|
||
" <th>priceFillMedian</th>\n",
|
||
" <th>bedrooms</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>360000.000000</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>488250.000000</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>699000.000000</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>405000.000000</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>343000.000000</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25749</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>528600.011954</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25750</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>537978.366911</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25751</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>522391.975863</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25752</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>537575.496160</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25753</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>538006.679050</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>25754 rows × 3 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" waterfront priceFillMedian bedrooms\n",
|
||
"0 0 360000.000000 4\n",
|
||
"1 0 488250.000000 4\n",
|
||
"2 0 699000.000000 4\n",
|
||
"3 0 405000.000000 2\n",
|
||
"4 0 343000.000000 2\n",
|
||
"... ... ... ...\n",
|
||
"25749 1 528600.011954 2\n",
|
||
"25750 1 537978.366911 1\n",
|
||
"25751 1 522391.975863 3\n",
|
||
"25752 1 537575.496160 2\n",
|
||
"25753 1 538006.679050 2\n",
|
||
"\n",
|
||
"[25754 rows x 3 columns]"
|
||
]
|
||
},
|
||
"execution_count": 28,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from imblearn.over_sampling import ADASYN\n",
|
||
"\n",
|
||
"ada = ADASYN()\n",
|
||
"\n",
|
||
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
||
"print(df_train.waterfront.value_counts())\n",
|
||
"\n",
|
||
"X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"waterfront\"])\n",
|
||
"df_train_adasyn = pd.DataFrame(X_resampled)\n",
|
||
"\n",
|
||
"print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
|
||
"print(df_train_adasyn.waterfront.value_counts())\n",
|
||
"\n",
|
||
"df_train_adasyn"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": ".venv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.6"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|