MII_Yunusov_Niyaz/notebooks/lec2 copy.ipynb
2024-09-30 23:02:17 +04:00

1323 lines
44 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Загрузка данных в DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 21613 entries, 7129300520 to 1523300157\n",
"Data columns (total 20 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 date 21613 non-null object \n",
" 1 price 21607 non-null float64\n",
" 2 bedrooms 21613 non-null int64 \n",
" 3 bathrooms 21613 non-null float64\n",
" 4 sqft_living 21613 non-null int64 \n",
" 5 sqft_lot 21613 non-null int64 \n",
" 6 floors 21613 non-null float64\n",
" 7 waterfront 21613 non-null int64 \n",
" 8 view 21613 non-null int64 \n",
" 9 condition 21613 non-null int64 \n",
" 10 grade 21613 non-null int64 \n",
" 11 sqft_above 21613 non-null int64 \n",
" 12 sqft_basement 21613 non-null int64 \n",
" 13 yr_built 21613 non-null int64 \n",
" 14 yr_renovated 21613 non-null int64 \n",
" 15 zipcode 21613 non-null int64 \n",
" 16 lat 21613 non-null float64\n",
" 17 long 21613 non-null float64\n",
" 18 sqft_living15 21613 non-null int64 \n",
" 19 sqft_lot15 21613 non-null int64 \n",
"dtypes: float64(5), int64(14), object(1)\n",
"memory usage: 3.5+ MB\n",
"(21613, 20)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>condition</th>\n",
" <th>grade</th>\n",
" <th>sqft_above</th>\n",
" <th>sqft_basement</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>7129300520</th>\n",
" <td>20141013T000000</td>\n",
" <td>221900.0</td>\n",
" <td>3</td>\n",
" <td>1.00</td>\n",
" <td>1180</td>\n",
" <td>5650</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>1180</td>\n",
" <td>0</td>\n",
" <td>1955</td>\n",
" <td>0</td>\n",
" <td>98178</td>\n",
" <td>47.5112</td>\n",
" <td>-122.257</td>\n",
" <td>1340</td>\n",
" <td>5650</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6414100192</th>\n",
" <td>20141209T000000</td>\n",
" <td>538000.0</td>\n",
" <td>3</td>\n",
" <td>2.25</td>\n",
" <td>2570</td>\n",
" <td>7242</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>2170</td>\n",
" <td>400</td>\n",
" <td>1951</td>\n",
" <td>1991</td>\n",
" <td>98125</td>\n",
" <td>47.7210</td>\n",
" <td>-122.319</td>\n",
" <td>1690</td>\n",
" <td>7639</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5631500400</th>\n",
" <td>20150225T000000</td>\n",
" <td>180000.0</td>\n",
" <td>2</td>\n",
" <td>1.00</td>\n",
" <td>770</td>\n",
" <td>10000</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>770</td>\n",
" <td>0</td>\n",
" <td>1933</td>\n",
" <td>0</td>\n",
" <td>98028</td>\n",
" <td>47.7379</td>\n",
" <td>-122.233</td>\n",
" <td>2720</td>\n",
" <td>8062</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2487200875</th>\n",
" <td>20141209T000000</td>\n",
" <td>604000.0</td>\n",
" <td>4</td>\n",
" <td>3.00</td>\n",
" <td>1960</td>\n",
" <td>5000</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>7</td>\n",
" <td>1050</td>\n",
" <td>910</td>\n",
" <td>1965</td>\n",
" <td>0</td>\n",
" <td>98136</td>\n",
" <td>47.5208</td>\n",
" <td>-122.393</td>\n",
" <td>1360</td>\n",
" <td>5000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1954400510</th>\n",
" <td>20150218T000000</td>\n",
" <td>510000.0</td>\n",
" <td>3</td>\n",
" <td>2.00</td>\n",
" <td>1680</td>\n",
" <td>8080</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>8</td>\n",
" <td>1680</td>\n",
" <td>0</td>\n",
" <td>1987</td>\n",
" <td>0</td>\n",
" <td>98074</td>\n",
" <td>47.6168</td>\n",
" <td>-122.045</td>\n",
" <td>1800</td>\n",
" <td>7503</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" date price bedrooms bathrooms sqft_living \\\n",
"id \n",
"7129300520 20141013T000000 221900.0 3 1.00 1180 \n",
"6414100192 20141209T000000 538000.0 3 2.25 2570 \n",
"5631500400 20150225T000000 180000.0 2 1.00 770 \n",
"2487200875 20141209T000000 604000.0 4 3.00 1960 \n",
"1954400510 20150218T000000 510000.0 3 2.00 1680 \n",
"\n",
" sqft_lot floors waterfront view condition grade sqft_above \\\n",
"id \n",
"7129300520 5650 1.0 0 0 3 7 1180 \n",
"6414100192 7242 2.0 0 0 3 7 2170 \n",
"5631500400 10000 1.0 0 0 3 6 770 \n",
"2487200875 5000 1.0 0 0 5 7 1050 \n",
"1954400510 8080 1.0 0 0 3 8 1680 \n",
"\n",
" sqft_basement yr_built yr_renovated zipcode lat long \\\n",
"id \n",
"7129300520 0 1955 0 98178 47.5112 -122.257 \n",
"6414100192 400 1951 1991 98125 47.7210 -122.319 \n",
"5631500400 0 1933 0 98028 47.7379 -122.233 \n",
"2487200875 910 1965 0 98136 47.5208 -122.393 \n",
"1954400510 0 1987 0 98074 47.6168 -122.045 \n",
"\n",
" sqft_living15 sqft_lot15 \n",
"id \n",
"7129300520 1340 5650 \n",
"6414100192 1690 7639 \n",
"5631500400 2720 8062 \n",
"2487200875 1360 5000 \n",
"1954400510 1800 7503 "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"df1 = pd.read_csv(\"../data/kc_house_data.csv\", index_col=\"id\")\n",
"\n",
"df1.info()\n",
"\n",
"print(df1.shape)\n",
"\n",
"df1.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Получение сведений о пропущенных данных"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Типы пропущенных данных:\n",
"- None - представление пустых данных в Python\n",
"- NaN - представление пустых данных в Pandas\n",
"- '' - пустая строка"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"date 0\n",
"price 6\n",
"bedrooms 0\n",
"bathrooms 0\n",
"sqft_living 0\n",
"sqft_lot 0\n",
"floors 0\n",
"waterfront 0\n",
"view 0\n",
"condition 0\n",
"grade 0\n",
"sqft_above 0\n",
"sqft_basement 0\n",
"yr_built 0\n",
"yr_renovated 0\n",
"zipcode 0\n",
"lat 0\n",
"long 0\n",
"sqft_living15 0\n",
"sqft_lot15 0\n",
"dtype: int64\n",
"\n",
"date False\n",
"price True\n",
"bedrooms False\n",
"bathrooms False\n",
"sqft_living False\n",
"sqft_lot False\n",
"floors False\n",
"waterfront False\n",
"view False\n",
"condition False\n",
"grade False\n",
"sqft_above False\n",
"sqft_basement False\n",
"yr_built False\n",
"yr_renovated False\n",
"zipcode False\n",
"lat False\n",
"long False\n",
"sqft_living15 False\n",
"sqft_lot15 False\n",
"dtype: bool\n",
"\n",
"price процент пустых значений: %0.03\n"
]
}
],
"source": [
"# Количество пустых значений признаков\n",
"print(df1.isnull().sum())\n",
"\n",
"print()\n",
"\n",
"# Есть ли пустые значения признаков\n",
"print(df1.isnull().any())\n",
"\n",
"print()\n",
"\n",
"# Процент пустых значений признаков\n",
"for i in df1.columns:\n",
" null_rate = df1[i].isnull().sum() / len(df1) * 100\n",
" if null_rate > 0:\n",
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Заполнение пропущенных данных\n",
"\n",
"https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values\n",
"\n",
"https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(21613, 21)\n",
"date False\n",
"price False\n",
"bedrooms False\n",
"bathrooms False\n",
"sqft_living False\n",
"sqft_lot False\n",
"floors False\n",
"waterfront False\n",
"view False\n",
"condition False\n",
"grade False\n",
"sqft_above False\n",
"sqft_basement False\n",
"yr_built False\n",
"yr_renovated False\n",
"zipcode False\n",
"lat False\n",
"long False\n",
"sqft_living15 False\n",
"sqft_lot15 False\n",
"priceFillNa False\n",
"dtype: bool\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>condition</th>\n",
" <th>...</th>\n",
" <th>sqft_basement</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" <th>priceFillNa</th>\n",
" <th>priceFillMedian</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>263000018</th>\n",
" <td>20140521T000000</td>\n",
" <td>360000.0</td>\n",
" <td>3</td>\n",
" <td>2.50</td>\n",
" <td>1530</td>\n",
" <td>1131</td>\n",
" <td>3.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>2009</td>\n",
" <td>0</td>\n",
" <td>98103</td>\n",
" <td>47.6993</td>\n",
" <td>-122.346</td>\n",
" <td>1530</td>\n",
" <td>1509</td>\n",
" <td>360000.0</td>\n",
" <td>360000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6600060120</th>\n",
" <td>20150223T000000</td>\n",
" <td>400000.0</td>\n",
" <td>4</td>\n",
" <td>2.50</td>\n",
" <td>2310</td>\n",
" <td>5813</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>2014</td>\n",
" <td>0</td>\n",
" <td>98146</td>\n",
" <td>47.5107</td>\n",
" <td>-122.362</td>\n",
" <td>1830</td>\n",
" <td>7200</td>\n",
" <td>400000.0</td>\n",
" <td>400000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1523300141</th>\n",
" <td>20140623T000000</td>\n",
" <td>402101.0</td>\n",
" <td>2</td>\n",
" <td>0.75</td>\n",
" <td>1020</td>\n",
" <td>1350</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>2009</td>\n",
" <td>0</td>\n",
" <td>98144</td>\n",
" <td>47.5944</td>\n",
" <td>-122.299</td>\n",
" <td>1020</td>\n",
" <td>2007</td>\n",
" <td>402101.0</td>\n",
" <td>402101.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>291310100</th>\n",
" <td>20150116T000000</td>\n",
" <td>400000.0</td>\n",
" <td>3</td>\n",
" <td>2.50</td>\n",
" <td>1600</td>\n",
" <td>2388</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>2004</td>\n",
" <td>0</td>\n",
" <td>98027</td>\n",
" <td>47.5345</td>\n",
" <td>-122.069</td>\n",
" <td>1410</td>\n",
" <td>1287</td>\n",
" <td>400000.0</td>\n",
" <td>400000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1523300157</th>\n",
" <td>20141015T000000</td>\n",
" <td>325000.0</td>\n",
" <td>2</td>\n",
" <td>0.75</td>\n",
" <td>1020</td>\n",
" <td>1076</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>2008</td>\n",
" <td>0</td>\n",
" <td>98144</td>\n",
" <td>47.5941</td>\n",
" <td>-122.299</td>\n",
" <td>1020</td>\n",
" <td>1357</td>\n",
" <td>325000.0</td>\n",
" <td>325000.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 22 columns</p>\n",
"</div>"
],
"text/plain": [
" date price bedrooms bathrooms sqft_living \\\n",
"id \n",
"263000018 20140521T000000 360000.0 3 2.50 1530 \n",
"6600060120 20150223T000000 400000.0 4 2.50 2310 \n",
"1523300141 20140623T000000 402101.0 2 0.75 1020 \n",
"291310100 20150116T000000 400000.0 3 2.50 1600 \n",
"1523300157 20141015T000000 325000.0 2 0.75 1020 \n",
"\n",
" sqft_lot floors waterfront view condition ... sqft_basement \\\n",
"id ... \n",
"263000018 1131 3.0 0 0 3 ... 0 \n",
"6600060120 5813 2.0 0 0 3 ... 0 \n",
"1523300141 1350 2.0 0 0 3 ... 0 \n",
"291310100 2388 2.0 0 0 3 ... 0 \n",
"1523300157 1076 2.0 0 0 3 ... 0 \n",
"\n",
" yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
"id \n",
"263000018 2009 0 98103 47.6993 -122.346 1530 \n",
"6600060120 2014 0 98146 47.5107 -122.362 1830 \n",
"1523300141 2009 0 98144 47.5944 -122.299 1020 \n",
"291310100 2004 0 98027 47.5345 -122.069 1410 \n",
"1523300157 2008 0 98144 47.5941 -122.299 1020 \n",
"\n",
" sqft_lot15 priceFillNa priceFillMedian \n",
"id \n",
"263000018 1509 360000.0 360000.0 \n",
"6600060120 7200 400000.0 400000.0 \n",
"1523300141 2007 402101.0 402101.0 \n",
"291310100 1287 400000.0 400000.0 \n",
"1523300157 1357 325000.0 325000.0 \n",
"\n",
"[5 rows x 22 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fillna_df = df1.fillna(0)\n",
"\n",
"print(fillna_df.shape)\n",
"\n",
"print(fillna_df.isnull().any())\n",
"\n",
"# Замена пустых данных на 0\n",
"df1[\"priceFillNa\"] = df1[\"price\"].fillna(0)\n",
"\n",
"# Замена пустых данных на медиану\n",
"df1[\"priceFillMedian\"] = df1[\"price\"].fillna(df1[\"priceFillNa\"].median())\n",
"\n",
"df1.tail()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>condition</th>\n",
" <th>...</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" <th>priceFillNa</th>\n",
" <th>priceFillMedian</th>\n",
" <th>PriceCopy</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>263000018</th>\n",
" <td>20140521T000000</td>\n",
" <td>360000.0</td>\n",
" <td>3</td>\n",
" <td>2.50</td>\n",
" <td>1530</td>\n",
" <td>1131</td>\n",
" <td>3.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>2009</td>\n",
" <td>0</td>\n",
" <td>98103</td>\n",
" <td>47.6993</td>\n",
" <td>-122.346</td>\n",
" <td>1530</td>\n",
" <td>1509</td>\n",
" <td>360000.0</td>\n",
" <td>360000.0</td>\n",
" <td>360000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6600060120</th>\n",
" <td>20150223T000000</td>\n",
" <td>400000.0</td>\n",
" <td>4</td>\n",
" <td>2.50</td>\n",
" <td>2310</td>\n",
" <td>5813</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>2014</td>\n",
" <td>0</td>\n",
" <td>98146</td>\n",
" <td>47.5107</td>\n",
" <td>-122.362</td>\n",
" <td>1830</td>\n",
" <td>7200</td>\n",
" <td>400000.0</td>\n",
" <td>400000.0</td>\n",
" <td>400000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1523300141</th>\n",
" <td>20140623T000000</td>\n",
" <td>402101.0</td>\n",
" <td>2</td>\n",
" <td>0.75</td>\n",
" <td>1020</td>\n",
" <td>1350</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>2009</td>\n",
" <td>0</td>\n",
" <td>98144</td>\n",
" <td>47.5944</td>\n",
" <td>-122.299</td>\n",
" <td>1020</td>\n",
" <td>2007</td>\n",
" <td>402101.0</td>\n",
" <td>402101.0</td>\n",
" <td>402101.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>291310100</th>\n",
" <td>20150116T000000</td>\n",
" <td>400000.0</td>\n",
" <td>3</td>\n",
" <td>2.50</td>\n",
" <td>1600</td>\n",
" <td>2388</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>2004</td>\n",
" <td>0</td>\n",
" <td>98027</td>\n",
" <td>47.5345</td>\n",
" <td>-122.069</td>\n",
" <td>1410</td>\n",
" <td>1287</td>\n",
" <td>400000.0</td>\n",
" <td>400000.0</td>\n",
" <td>400000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1523300157</th>\n",
" <td>20141015T000000</td>\n",
" <td>325000.0</td>\n",
" <td>2</td>\n",
" <td>0.75</td>\n",
" <td>1020</td>\n",
" <td>1076</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>2008</td>\n",
" <td>0</td>\n",
" <td>98144</td>\n",
" <td>47.5941</td>\n",
" <td>-122.299</td>\n",
" <td>1020</td>\n",
" <td>1357</td>\n",
" <td>325000.0</td>\n",
" <td>325000.0</td>\n",
" <td>325000.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 23 columns</p>\n",
"</div>"
],
"text/plain": [
" date price bedrooms bathrooms sqft_living \\\n",
"id \n",
"263000018 20140521T000000 360000.0 3 2.50 1530 \n",
"6600060120 20150223T000000 400000.0 4 2.50 2310 \n",
"1523300141 20140623T000000 402101.0 2 0.75 1020 \n",
"291310100 20150116T000000 400000.0 3 2.50 1600 \n",
"1523300157 20141015T000000 325000.0 2 0.75 1020 \n",
"\n",
" sqft_lot floors waterfront view condition ... yr_built \\\n",
"id ... \n",
"263000018 1131 3.0 0 0 3 ... 2009 \n",
"6600060120 5813 2.0 0 0 3 ... 2014 \n",
"1523300141 1350 2.0 0 0 3 ... 2009 \n",
"291310100 2388 2.0 0 0 3 ... 2004 \n",
"1523300157 1076 2.0 0 0 3 ... 2008 \n",
"\n",
" yr_renovated zipcode lat long sqft_living15 \\\n",
"id \n",
"263000018 0 98103 47.6993 -122.346 1530 \n",
"6600060120 0 98146 47.5107 -122.362 1830 \n",
"1523300141 0 98144 47.5944 -122.299 1020 \n",
"291310100 0 98027 47.5345 -122.069 1410 \n",
"1523300157 0 98144 47.5941 -122.299 1020 \n",
"\n",
" sqft_lot15 priceFillNa priceFillMedian PriceCopy \n",
"id \n",
"263000018 1509 360000.0 360000.0 360000.0 \n",
"6600060120 7200 400000.0 400000.0 400000.0 \n",
"1523300141 2007 402101.0 402101.0 402101.0 \n",
"291310100 1287 400000.0 400000.0 400000.0 \n",
"1523300157 1357 325000.0 325000.0 325000.0 \n",
"\n",
"[5 rows x 23 columns]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1[\"PriceCopy\"] = df1[\"price\"]\n",
"\n",
"# Замена данных сразу в DataFrame без копирования\n",
"df1.fillna({\"PriceCopy\": 0}, inplace=True)\n",
"\n",
"df1.tail()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Удаление наблюдений с пропусками"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(21607, 23)\n",
"date False\n",
"price False\n",
"bedrooms False\n",
"bathrooms False\n",
"sqft_living False\n",
"sqft_lot False\n",
"floors False\n",
"waterfront False\n",
"view False\n",
"condition False\n",
"grade False\n",
"sqft_above False\n",
"sqft_basement False\n",
"yr_built False\n",
"yr_renovated False\n",
"zipcode False\n",
"lat False\n",
"long False\n",
"sqft_living15 False\n",
"sqft_lot15 False\n",
"priceFillNa False\n",
"dtype: bool\n"
]
}
],
"source": [
"dropna_df = df1.dropna()\n",
"\n",
"print(dropna_df.shape)\n",
"\n",
"print(fillna_df.isnull().any())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Создание выборок данных\n",
"\n",
"Библиотека scikit-learn\n",
"\n",
"https://scikit-learn.org/stable/index.html"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<img src=\"assets/lec2-split.png\" width=\"600\" style=\"background-color: white\">"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# Функция для создания выборок\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"\n",
"def split_stratified_into_train_val_test(\n",
" df_input,\n",
" stratify_colname=\"y\",\n",
" frac_train=0.6,\n",
" frac_val=0.15,\n",
" frac_test=0.25,\n",
" random_state=None,\n",
"):\n",
" \"\"\"\n",
" Splits a Pandas dataframe into three subsets (train, val, and test)\n",
" following fractional ratios provided by the user, where each subset is\n",
" stratified by the values in a specific column (that is, each subset has\n",
" the same relative frequency of the values in the column). It performs this\n",
" splitting by running train_test_split() twice.\n",
"\n",
" Parameters\n",
" ----------\n",
" df_input : Pandas dataframe\n",
" Input dataframe to be split.\n",
" stratify_colname : str\n",
" The name of the column that will be used for stratification. Usually\n",
" this column would be for the label.\n",
" frac_train : float\n",
" frac_val : float\n",
" frac_test : float\n",
" The ratios with which the dataframe will be split into train, val, and\n",
" test data. The values should be expressed as float fractions and should\n",
" sum to 1.0.\n",
" random_state : int, None, or RandomStateInstance\n",
" Value to be passed to train_test_split().\n",
"\n",
" Returns\n",
" -------\n",
" df_train, df_val, df_test :\n",
" Dataframes containing the three splits.\n",
" \"\"\"\n",
"\n",
" if frac_train + frac_val + frac_test != 1.0:\n",
" raise ValueError(\n",
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
" % (frac_train, frac_val, frac_test)\n",
" )\n",
"\n",
" if stratify_colname not in df_input.columns:\n",
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
"\n",
" X = df_input # Contains all columns.\n",
" y = df_input[\n",
" [stratify_colname]\n",
" ] # Dataframe of just the column on which to stratify.\n",
"\n",
" # Split original dataframe into train and temp dataframes.\n",
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
" )\n",
"\n",
" # Split the temp dataframe into val and test dataframes.\n",
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
" df_val, df_test, y_val, y_test = train_test_split(\n",
" df_temp,\n",
" y_temp,\n",
" stratify=y_temp,\n",
" test_size=relative_frac_test,\n",
" random_state=random_state,\n",
" )\n",
"\n",
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"\n",
" return df_train, df_val, df_test"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"waterfront\n",
"0 21450\n",
"1 163\n",
"Name: count, dtype: int64\n",
"Обучающая выборка: (12967, 3)\n",
"waterfront\n",
"0 12869\n",
"1 98\n",
"Name: count, dtype: int64\n",
"Контрольная выборка: (4323, 3)\n",
"waterfront\n",
"0 4290\n",
"1 33\n",
"Name: count, dtype: int64\n",
"Тестовая выборка: (4323, 3)\n",
"waterfront\n",
"0 4291\n",
"1 32\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"# Вывод распределения количества наблюдений по меткам (классам)\n",
"print(df1.waterfront.value_counts())\n",
"\n",
"data = df1[[\"waterfront\", \"priceFillMedian\", \"bedrooms\"]].copy()\n",
"\n",
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
" data, stratify_colname=\"waterfront\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
")\n",
"\n",
"print(\"Обучающая выборка: \", df_train.shape)\n",
"print(df_train.waterfront.value_counts())\n",
"\n",
"print(\"Контрольная выборка: \", df_val.shape)\n",
"print(df_val.waterfront.value_counts())\n",
"\n",
"print(\"Тестовая выборка: \", df_test.shape)\n",
"print(df_test.waterfront.value_counts())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выборка с избытком (oversampling)\n",
"\n",
"https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/\n",
"\n",
"https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/\n",
"\n",
"Выборка с недостатком (undersampling)\n",
"\n",
"https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/\n",
"\n",
"Библиотека imbalanced-learn\n",
"\n",
"https://imbalanced-learn.org/stable/"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Обучающая выборка: (12967, 3)\n",
"waterfront\n",
"0 12869\n",
"1 98\n",
"Name: count, dtype: int64\n",
"Обучающая выборка после oversampling: (25754, 3)\n",
"waterfront\n",
"1 12885\n",
"0 12869\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>waterfront</th>\n",
" <th>priceFillMedian</th>\n",
" <th>bedrooms</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>360000.000000</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>488250.000000</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>699000.000000</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>405000.000000</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>343000.000000</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25749</th>\n",
" <td>1</td>\n",
" <td>528600.011954</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25750</th>\n",
" <td>1</td>\n",
" <td>537978.366911</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25751</th>\n",
" <td>1</td>\n",
" <td>522391.975863</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25752</th>\n",
" <td>1</td>\n",
" <td>537575.496160</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25753</th>\n",
" <td>1</td>\n",
" <td>538006.679050</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>25754 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" waterfront priceFillMedian bedrooms\n",
"0 0 360000.000000 4\n",
"1 0 488250.000000 4\n",
"2 0 699000.000000 4\n",
"3 0 405000.000000 2\n",
"4 0 343000.000000 2\n",
"... ... ... ...\n",
"25749 1 528600.011954 2\n",
"25750 1 537978.366911 1\n",
"25751 1 522391.975863 3\n",
"25752 1 537575.496160 2\n",
"25753 1 538006.679050 2\n",
"\n",
"[25754 rows x 3 columns]"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from imblearn.over_sampling import ADASYN\n",
"\n",
"ada = ADASYN()\n",
"\n",
"print(\"Обучающая выборка: \", df_train.shape)\n",
"print(df_train.waterfront.value_counts())\n",
"\n",
"X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"waterfront\"])\n",
"df_train_adasyn = pd.DataFrame(X_resampled)\n",
"\n",
"print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
"print(df_train_adasyn.waterfront.value_counts())\n",
"\n",
"df_train_adasyn"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}