MII_Yunusov_Niyaz/notebooks/lec2 copy.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Загрузка данных в DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 21613 entries, 7129300520 to 1523300157\n",
      "Data columns (total 20 columns):\n",
      " #   Column         Non-Null Count  Dtype  \n",
      "---  ------         --------------  -----  \n",
      " 0   date           21613 non-null  object \n",
      " 1   price          21607 non-null  float64\n",
      " 2   bedrooms       21613 non-null  int64  \n",
      " 3   bathrooms      21613 non-null  float64\n",
      " 4   sqft_living    21613 non-null  int64  \n",
      " 5   sqft_lot       21613 non-null  int64  \n",
      " 6   floors         21613 non-null  float64\n",
      " 7   waterfront     21613 non-null  int64  \n",
      " 8   view           21613 non-null  int64  \n",
      " 9   condition      21613 non-null  int64  \n",
      " 10  grade          21613 non-null  int64  \n",
      " 11  sqft_above     21613 non-null  int64  \n",
      " 12  sqft_basement  21613 non-null  int64  \n",
      " 13  yr_built       21613 non-null  int64  \n",
      " 14  yr_renovated   21613 non-null  int64  \n",
      " 15  zipcode        21613 non-null  int64  \n",
      " 16  lat            21613 non-null  float64\n",
      " 17  long           21613 non-null  float64\n",
      " 18  sqft_living15  21613 non-null  int64  \n",
      " 19  sqft_lot15     21613 non-null  int64  \n",
      "dtypes: float64(5), int64(14), object(1)\n",
      "memory usage: 3.5+ MB\n",
      "(21613, 20)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>price</th>\n",
       "      <th>bedrooms</th>\n",
       "      <th>bathrooms</th>\n",
       "      <th>sqft_living</th>\n",
       "      <th>sqft_lot</th>\n",
       "      <th>floors</th>\n",
       "      <th>waterfront</th>\n",
       "      <th>view</th>\n",
       "      <th>condition</th>\n",
       "      <th>grade</th>\n",
       "      <th>sqft_above</th>\n",
       "      <th>sqft_basement</th>\n",
       "      <th>yr_built</th>\n",
       "      <th>yr_renovated</th>\n",
       "      <th>zipcode</th>\n",
       "      <th>lat</th>\n",
       "      <th>long</th>\n",
       "      <th>sqft_living15</th>\n",
       "      <th>sqft_lot15</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>7129300520</th>\n",
       "      <td>20141013T000000</td>\n",
       "      <td>221900.0</td>\n",
       "      <td>3</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1180</td>\n",
       "      <td>5650</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>1180</td>\n",
       "      <td>0</td>\n",
       "      <td>1955</td>\n",
       "      <td>0</td>\n",
       "      <td>98178</td>\n",
       "      <td>47.5112</td>\n",
       "      <td>-122.257</td>\n",
       "      <td>1340</td>\n",
       "      <td>5650</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6414100192</th>\n",
       "      <td>20141209T000000</td>\n",
       "      <td>538000.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2.25</td>\n",
       "      <td>2570</td>\n",
       "      <td>7242</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>2170</td>\n",
       "      <td>400</td>\n",
       "      <td>1951</td>\n",
       "      <td>1991</td>\n",
       "      <td>98125</td>\n",
       "      <td>47.7210</td>\n",
       "      <td>-122.319</td>\n",
       "      <td>1690</td>\n",
       "      <td>7639</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5631500400</th>\n",
       "      <td>20150225T000000</td>\n",
       "      <td>180000.0</td>\n",
       "      <td>2</td>\n",
       "      <td>1.00</td>\n",
       "      <td>770</td>\n",
       "      <td>10000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "      <td>770</td>\n",
       "      <td>0</td>\n",
       "      <td>1933</td>\n",
       "      <td>0</td>\n",
       "      <td>98028</td>\n",
       "      <td>47.7379</td>\n",
       "      <td>-122.233</td>\n",
       "      <td>2720</td>\n",
       "      <td>8062</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2487200875</th>\n",
       "      <td>20141209T000000</td>\n",
       "      <td>604000.0</td>\n",
       "      <td>4</td>\n",
       "      <td>3.00</td>\n",
       "      <td>1960</td>\n",
       "      <td>5000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>7</td>\n",
       "      <td>1050</td>\n",
       "      <td>910</td>\n",
       "      <td>1965</td>\n",
       "      <td>0</td>\n",
       "      <td>98136</td>\n",
       "      <td>47.5208</td>\n",
       "      <td>-122.393</td>\n",
       "      <td>1360</td>\n",
       "      <td>5000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1954400510</th>\n",
       "      <td>20150218T000000</td>\n",
       "      <td>510000.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2.00</td>\n",
       "      <td>1680</td>\n",
       "      <td>8080</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>8</td>\n",
       "      <td>1680</td>\n",
       "      <td>0</td>\n",
       "      <td>1987</td>\n",
       "      <td>0</td>\n",
       "      <td>98074</td>\n",
       "      <td>47.6168</td>\n",
       "      <td>-122.045</td>\n",
       "      <td>1800</td>\n",
       "      <td>7503</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       date     price  bedrooms  bathrooms  sqft_living  \\\n",
       "id                                                                        \n",
       "7129300520  20141013T000000  221900.0         3       1.00         1180   \n",
       "6414100192  20141209T000000  538000.0         3       2.25         2570   \n",
       "5631500400  20150225T000000  180000.0         2       1.00          770   \n",
       "2487200875  20141209T000000  604000.0         4       3.00         1960   \n",
       "1954400510  20150218T000000  510000.0         3       2.00         1680   \n",
       "\n",
       "            sqft_lot  floors  waterfront  view  condition  grade  sqft_above  \\\n",
       "id                                                                             \n",
       "7129300520      5650     1.0           0     0          3      7        1180   \n",
       "6414100192      7242     2.0           0     0          3      7        2170   \n",
       "5631500400     10000     1.0           0     0          3      6         770   \n",
       "2487200875      5000     1.0           0     0          5      7        1050   \n",
       "1954400510      8080     1.0           0     0          3      8        1680   \n",
       "\n",
       "            sqft_basement  yr_built  yr_renovated  zipcode      lat     long  \\\n",
       "id                                                                             \n",
       "7129300520              0      1955             0    98178  47.5112 -122.257   \n",
       "6414100192            400      1951          1991    98125  47.7210 -122.319   \n",
       "5631500400              0      1933             0    98028  47.7379 -122.233   \n",
       "2487200875            910      1965             0    98136  47.5208 -122.393   \n",
       "1954400510              0      1987             0    98074  47.6168 -122.045   \n",
       "\n",
       "            sqft_living15  sqft_lot15  \n",
       "id                                     \n",
       "7129300520           1340        5650  \n",
       "6414100192           1690        7639  \n",
       "5631500400           2720        8062  \n",
       "2487200875           1360        5000  \n",
       "1954400510           1800        7503  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df1 = pd.read_csv(\"../data/kc_house_data.csv\", index_col=\"id\")\n",
    "\n",
    "df1.info()\n",
    "\n",
    "print(df1.shape)\n",
    "\n",
    "df1.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Получение сведений о пропущенных данных"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Типы пропущенных данных:\n",
    "- None - представление пустых данных в Python\n",
    "- NaN - представление пустых данных в Pandas\n",
    "- '' - пустая строка"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "date             0\n",
      "price            6\n",
      "bedrooms         0\n",
      "bathrooms        0\n",
      "sqft_living      0\n",
      "sqft_lot         0\n",
      "floors           0\n",
      "waterfront       0\n",
      "view             0\n",
      "condition        0\n",
      "grade            0\n",
      "sqft_above       0\n",
      "sqft_basement    0\n",
      "yr_built         0\n",
      "yr_renovated     0\n",
      "zipcode          0\n",
      "lat              0\n",
      "long             0\n",
      "sqft_living15    0\n",
      "sqft_lot15       0\n",
      "dtype: int64\n",
      "\n",
      "date             False\n",
      "price             True\n",
      "bedrooms         False\n",
      "bathrooms        False\n",
      "sqft_living      False\n",
      "sqft_lot         False\n",
      "floors           False\n",
      "waterfront       False\n",
      "view             False\n",
      "condition        False\n",
      "grade            False\n",
      "sqft_above       False\n",
      "sqft_basement    False\n",
      "yr_built         False\n",
      "yr_renovated     False\n",
      "zipcode          False\n",
      "lat              False\n",
      "long             False\n",
      "sqft_living15    False\n",
      "sqft_lot15       False\n",
      "dtype: bool\n",
      "\n",
      "price процент пустых значений: %0.03\n"
     ]
    }
   ],
   "source": [
    "# Количество пустых значений признаков\n",
    "print(df1.isnull().sum())\n",
    "\n",
    "print()\n",
    "\n",
    "# Есть ли пустые значения признаков\n",
    "print(df1.isnull().any())\n",
    "\n",
    "print()\n",
    "\n",
    "# Процент пустых значений признаков\n",
    "for i in df1.columns:\n",
    "    null_rate = df1[i].isnull().sum() / len(df1) * 100\n",
    "    if null_rate > 0:\n",
    "        print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Заполнение пропущенных данных\n",
    "\n",
    "https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values\n",
    "\n",
    "https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(21613, 21)\n",
      "date             False\n",
      "price            False\n",
      "bedrooms         False\n",
      "bathrooms        False\n",
      "sqft_living      False\n",
      "sqft_lot         False\n",
      "floors           False\n",
      "waterfront       False\n",
      "view             False\n",
      "condition        False\n",
      "grade            False\n",
      "sqft_above       False\n",
      "sqft_basement    False\n",
      "yr_built         False\n",
      "yr_renovated     False\n",
      "zipcode          False\n",
      "lat              False\n",
      "long             False\n",
      "sqft_living15    False\n",
      "sqft_lot15       False\n",
      "priceFillNa      False\n",
      "dtype: bool\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>price</th>\n",
       "      <th>bedrooms</th>\n",
       "      <th>bathrooms</th>\n",
       "      <th>sqft_living</th>\n",
       "      <th>sqft_lot</th>\n",
       "      <th>floors</th>\n",
       "      <th>waterfront</th>\n",
       "      <th>view</th>\n",
       "      <th>condition</th>\n",
       "      <th>...</th>\n",
       "      <th>sqft_basement</th>\n",
       "      <th>yr_built</th>\n",
       "      <th>yr_renovated</th>\n",
       "      <th>zipcode</th>\n",
       "      <th>lat</th>\n",
       "      <th>long</th>\n",
       "      <th>sqft_living15</th>\n",
       "      <th>sqft_lot15</th>\n",
       "      <th>priceFillNa</th>\n",
       "      <th>priceFillMedian</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>263000018</th>\n",
       "      <td>20140521T000000</td>\n",
       "      <td>360000.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2.50</td>\n",
       "      <td>1530</td>\n",
       "      <td>1131</td>\n",
       "      <td>3.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>2009</td>\n",
       "      <td>0</td>\n",
       "      <td>98103</td>\n",
       "      <td>47.6993</td>\n",
       "      <td>-122.346</td>\n",
       "      <td>1530</td>\n",
       "      <td>1509</td>\n",
       "      <td>360000.0</td>\n",
       "      <td>360000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6600060120</th>\n",
       "      <td>20150223T000000</td>\n",
       "      <td>400000.0</td>\n",
       "      <td>4</td>\n",
       "      <td>2.50</td>\n",
       "      <td>2310</td>\n",
       "      <td>5813</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>2014</td>\n",
       "      <td>0</td>\n",
       "      <td>98146</td>\n",
       "      <td>47.5107</td>\n",
       "      <td>-122.362</td>\n",
       "      <td>1830</td>\n",
       "      <td>7200</td>\n",
       "      <td>400000.0</td>\n",
       "      <td>400000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1523300141</th>\n",
       "      <td>20140623T000000</td>\n",
       "      <td>402101.0</td>\n",
       "      <td>2</td>\n",
       "      <td>0.75</td>\n",
       "      <td>1020</td>\n",
       "      <td>1350</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>2009</td>\n",
       "      <td>0</td>\n",
       "      <td>98144</td>\n",
       "      <td>47.5944</td>\n",
       "      <td>-122.299</td>\n",
       "      <td>1020</td>\n",
       "      <td>2007</td>\n",
       "      <td>402101.0</td>\n",
       "      <td>402101.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>291310100</th>\n",
       "      <td>20150116T000000</td>\n",
       "      <td>400000.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2.50</td>\n",
       "      <td>1600</td>\n",
       "      <td>2388</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>2004</td>\n",
       "      <td>0</td>\n",
       "      <td>98027</td>\n",
       "      <td>47.5345</td>\n",
       "      <td>-122.069</td>\n",
       "      <td>1410</td>\n",
       "      <td>1287</td>\n",
       "      <td>400000.0</td>\n",
       "      <td>400000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1523300157</th>\n",
       "      <td>20141015T000000</td>\n",
       "      <td>325000.0</td>\n",
       "      <td>2</td>\n",
       "      <td>0.75</td>\n",
       "      <td>1020</td>\n",
       "      <td>1076</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>2008</td>\n",
       "      <td>0</td>\n",
       "      <td>98144</td>\n",
       "      <td>47.5941</td>\n",
       "      <td>-122.299</td>\n",
       "      <td>1020</td>\n",
       "      <td>1357</td>\n",
       "      <td>325000.0</td>\n",
       "      <td>325000.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 22 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                       date     price  bedrooms  bathrooms  sqft_living  \\\n",
       "id                                                                        \n",
       "263000018   20140521T000000  360000.0         3       2.50         1530   \n",
       "6600060120  20150223T000000  400000.0         4       2.50         2310   \n",
       "1523300141  20140623T000000  402101.0         2       0.75         1020   \n",
       "291310100   20150116T000000  400000.0         3       2.50         1600   \n",
       "1523300157  20141015T000000  325000.0         2       0.75         1020   \n",
       "\n",
       "            sqft_lot  floors  waterfront  view  condition  ...  sqft_basement  \\\n",
       "id                                                         ...                  \n",
       "263000018       1131     3.0           0     0          3  ...              0   \n",
       "6600060120      5813     2.0           0     0          3  ...              0   \n",
       "1523300141      1350     2.0           0     0          3  ...              0   \n",
       "291310100       2388     2.0           0     0          3  ...              0   \n",
       "1523300157      1076     2.0           0     0          3  ...              0   \n",
       "\n",
       "            yr_built  yr_renovated  zipcode      lat     long  sqft_living15  \\\n",
       "id                                                                             \n",
       "263000018       2009             0    98103  47.6993 -122.346           1530   \n",
       "6600060120      2014             0    98146  47.5107 -122.362           1830   \n",
       "1523300141      2009             0    98144  47.5944 -122.299           1020   \n",
       "291310100       2004             0    98027  47.5345 -122.069           1410   \n",
       "1523300157      2008             0    98144  47.5941 -122.299           1020   \n",
       "\n",
       "            sqft_lot15  priceFillNa  priceFillMedian  \n",
       "id                                                    \n",
       "263000018         1509     360000.0         360000.0  \n",
       "6600060120        7200     400000.0         400000.0  \n",
       "1523300141        2007     402101.0         402101.0  \n",
       "291310100         1287     400000.0         400000.0  \n",
       "1523300157        1357     325000.0         325000.0  \n",
       "\n",
       "[5 rows x 22 columns]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fillna_df = df1.fillna(0)\n",
    "\n",
    "print(fillna_df.shape)\n",
    "\n",
    "print(fillna_df.isnull().any())\n",
    "\n",
    "# Замена пустых данных на 0\n",
    "df1[\"priceFillNa\"] = df1[\"price\"].fillna(0)\n",
    "\n",
    "# Замена пустых данных на медиану\n",
    "df1[\"priceFillMedian\"] = df1[\"price\"].fillna(df1[\"priceFillNa\"].median())\n",
    "\n",
    "df1.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>price</th>\n",
       "      <th>bedrooms</th>\n",
       "      <th>bathrooms</th>\n",
       "      <th>sqft_living</th>\n",
       "      <th>sqft_lot</th>\n",
       "      <th>floors</th>\n",
       "      <th>waterfront</th>\n",
       "      <th>view</th>\n",
       "      <th>condition</th>\n",
       "      <th>...</th>\n",
       "      <th>yr_built</th>\n",
       "      <th>yr_renovated</th>\n",
       "      <th>zipcode</th>\n",
       "      <th>lat</th>\n",
       "      <th>long</th>\n",
       "      <th>sqft_living15</th>\n",
       "      <th>sqft_lot15</th>\n",
       "      <th>priceFillNa</th>\n",
       "      <th>priceFillMedian</th>\n",
       "      <th>PriceCopy</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>263000018</th>\n",
       "      <td>20140521T000000</td>\n",
       "      <td>360000.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2.50</td>\n",
       "      <td>1530</td>\n",
       "      <td>1131</td>\n",
       "      <td>3.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>...</td>\n",
       "      <td>2009</td>\n",
       "      <td>0</td>\n",
       "      <td>98103</td>\n",
       "      <td>47.6993</td>\n",
       "      <td>-122.346</td>\n",
       "      <td>1530</td>\n",
       "      <td>1509</td>\n",
       "      <td>360000.0</td>\n",
       "      <td>360000.0</td>\n",
       "      <td>360000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6600060120</th>\n",
       "      <td>20150223T000000</td>\n",
       "      <td>400000.0</td>\n",
       "      <td>4</td>\n",
       "      <td>2.50</td>\n",
       "      <td>2310</td>\n",
       "      <td>5813</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>...</td>\n",
       "      <td>2014</td>\n",
       "      <td>0</td>\n",
       "      <td>98146</td>\n",
       "      <td>47.5107</td>\n",
       "      <td>-122.362</td>\n",
       "      <td>1830</td>\n",
       "      <td>7200</td>\n",
       "      <td>400000.0</td>\n",
       "      <td>400000.0</td>\n",
       "      <td>400000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1523300141</th>\n",
       "      <td>20140623T000000</td>\n",
       "      <td>402101.0</td>\n",
       "      <td>2</td>\n",
       "      <td>0.75</td>\n",
       "      <td>1020</td>\n",
       "      <td>1350</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>...</td>\n",
       "      <td>2009</td>\n",
       "      <td>0</td>\n",
       "      <td>98144</td>\n",
       "      <td>47.5944</td>\n",
       "      <td>-122.299</td>\n",
       "      <td>1020</td>\n",
       "      <td>2007</td>\n",
       "      <td>402101.0</td>\n",
       "      <td>402101.0</td>\n",
       "      <td>402101.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>291310100</th>\n",
       "      <td>20150116T000000</td>\n",
       "      <td>400000.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2.50</td>\n",
       "      <td>1600</td>\n",
       "      <td>2388</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>...</td>\n",
       "      <td>2004</td>\n",
       "      <td>0</td>\n",
       "      <td>98027</td>\n",
       "      <td>47.5345</td>\n",
       "      <td>-122.069</td>\n",
       "      <td>1410</td>\n",
       "      <td>1287</td>\n",
       "      <td>400000.0</td>\n",
       "      <td>400000.0</td>\n",
       "      <td>400000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1523300157</th>\n",
       "      <td>20141015T000000</td>\n",
       "      <td>325000.0</td>\n",
       "      <td>2</td>\n",
       "      <td>0.75</td>\n",
       "      <td>1020</td>\n",
       "      <td>1076</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>...</td>\n",
       "      <td>2008</td>\n",
       "      <td>0</td>\n",
       "      <td>98144</td>\n",
       "      <td>47.5941</td>\n",
       "      <td>-122.299</td>\n",
       "      <td>1020</td>\n",
       "      <td>1357</td>\n",
       "      <td>325000.0</td>\n",
       "      <td>325000.0</td>\n",
       "      <td>325000.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 23 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                       date     price  bedrooms  bathrooms  sqft_living  \\\n",
       "id                                                                        \n",
       "263000018   20140521T000000  360000.0         3       2.50         1530   \n",
       "6600060120  20150223T000000  400000.0         4       2.50         2310   \n",
       "1523300141  20140623T000000  402101.0         2       0.75         1020   \n",
       "291310100   20150116T000000  400000.0         3       2.50         1600   \n",
       "1523300157  20141015T000000  325000.0         2       0.75         1020   \n",
       "\n",
       "            sqft_lot  floors  waterfront  view  condition  ...  yr_built  \\\n",
       "id                                                         ...             \n",
       "263000018       1131     3.0           0     0          3  ...      2009   \n",
       "6600060120      5813     2.0           0     0          3  ...      2014   \n",
       "1523300141      1350     2.0           0     0          3  ...      2009   \n",
       "291310100       2388     2.0           0     0          3  ...      2004   \n",
       "1523300157      1076     2.0           0     0          3  ...      2008   \n",
       "\n",
       "            yr_renovated  zipcode      lat     long  sqft_living15  \\\n",
       "id                                                                   \n",
       "263000018              0    98103  47.6993 -122.346           1530   \n",
       "6600060120             0    98146  47.5107 -122.362           1830   \n",
       "1523300141             0    98144  47.5944 -122.299           1020   \n",
       "291310100              0    98027  47.5345 -122.069           1410   \n",
       "1523300157             0    98144  47.5941 -122.299           1020   \n",
       "\n",
       "            sqft_lot15  priceFillNa  priceFillMedian  PriceCopy  \n",
       "id                                                               \n",
       "263000018         1509     360000.0         360000.0   360000.0  \n",
       "6600060120        7200     400000.0         400000.0   400000.0  \n",
       "1523300141        2007     402101.0         402101.0   402101.0  \n",
       "291310100         1287     400000.0         400000.0   400000.0  \n",
       "1523300157        1357     325000.0         325000.0   325000.0  \n",
       "\n",
       "[5 rows x 23 columns]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1[\"PriceCopy\"] = df1[\"price\"]\n",
    "\n",
    "# Замена данных сразу в DataFrame без копирования\n",
    "df1.fillna({\"PriceCopy\": 0}, inplace=True)\n",
    "\n",
    "df1.tail()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Удаление наблюдений с пропусками"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(21607, 23)\n",
      "date             False\n",
      "price            False\n",
      "bedrooms         False\n",
      "bathrooms        False\n",
      "sqft_living      False\n",
      "sqft_lot         False\n",
      "floors           False\n",
      "waterfront       False\n",
      "view             False\n",
      "condition        False\n",
      "grade            False\n",
      "sqft_above       False\n",
      "sqft_basement    False\n",
      "yr_built         False\n",
      "yr_renovated     False\n",
      "zipcode          False\n",
      "lat              False\n",
      "long             False\n",
      "sqft_living15    False\n",
      "sqft_lot15       False\n",
      "priceFillNa      False\n",
      "dtype: bool\n"
     ]
    }
   ],
   "source": [
    "dropna_df = df1.dropna()\n",
    "\n",
    "print(dropna_df.shape)\n",
    "\n",
    "print(fillna_df.isnull().any())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Создание выборок данных\n",
    "\n",
    "Библиотека scikit-learn\n",
    "\n",
    "https://scikit-learn.org/stable/index.html"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img src=\"assets/lec2-split.png\" width=\"600\" style=\"background-color: white\">"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Функция для создания выборок\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "\n",
    "def split_stratified_into_train_val_test(\n",
    "    df_input,\n",
    "    stratify_colname=\"y\",\n",
    "    frac_train=0.6,\n",
    "    frac_val=0.15,\n",
    "    frac_test=0.25,\n",
    "    random_state=None,\n",
    "):\n",
    "    \"\"\"\n",
    "    Splits a Pandas dataframe into three subsets (train, val, and test)\n",
    "    following fractional ratios provided by the user, where each subset is\n",
    "    stratified by the values in a specific column (that is, each subset has\n",
    "    the same relative frequency of the values in the column). It performs this\n",
    "    splitting by running train_test_split() twice.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df_input : Pandas dataframe\n",
    "        Input dataframe to be split.\n",
    "    stratify_colname : str\n",
    "        The name of the column that will be used for stratification. Usually\n",
    "        this column would be for the label.\n",
    "    frac_train : float\n",
    "    frac_val   : float\n",
    "    frac_test  : float\n",
    "        The ratios with which the dataframe will be split into train, val, and\n",
    "        test data. The values should be expressed as float fractions and should\n",
    "        sum to 1.0.\n",
    "    random_state : int, None, or RandomStateInstance\n",
    "        Value to be passed to train_test_split().\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    df_train, df_val, df_test :\n",
    "        Dataframes containing the three splits.\n",
    "    \"\"\"\n",
    "\n",
    "    if frac_train + frac_val + frac_test != 1.0:\n",
    "        raise ValueError(\n",
    "            \"fractions %f, %f, %f do not add up to 1.0\"\n",
    "            % (frac_train, frac_val, frac_test)\n",
    "        )\n",
    "\n",
    "    if stratify_colname not in df_input.columns:\n",
    "        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
    "\n",
    "    X = df_input  # Contains all columns.\n",
    "    y = df_input[\n",
    "        [stratify_colname]\n",
    "    ]  # Dataframe of just the column on which to stratify.\n",
    "\n",
    "    # Split original dataframe into train and temp dataframes.\n",
    "    df_train, df_temp, y_train, y_temp = train_test_split(\n",
    "        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
    "    )\n",
    "\n",
    "    # Split the temp dataframe into val and test dataframes.\n",
    "    relative_frac_test = frac_test / (frac_val + frac_test)\n",
    "    df_val, df_test, y_val, y_test = train_test_split(\n",
    "        df_temp,\n",
    "        y_temp,\n",
    "        stratify=y_temp,\n",
    "        test_size=relative_frac_test,\n",
    "        random_state=random_state,\n",
    "    )\n",
    "\n",
    "    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
    "\n",
    "    return df_train, df_val, df_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "waterfront\n",
      "0    21450\n",
      "1      163\n",
      "Name: count, dtype: int64\n",
      "Обучающая выборка:  (12967, 3)\n",
      "waterfront\n",
      "0    12869\n",
      "1       98\n",
      "Name: count, dtype: int64\n",
      "Контрольная выборка:  (4323, 3)\n",
      "waterfront\n",
      "0    4290\n",
      "1      33\n",
      "Name: count, dtype: int64\n",
      "Тестовая выборка:  (4323, 3)\n",
      "waterfront\n",
      "0    4291\n",
      "1      32\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Вывод распределения количества наблюдений по меткам (классам)\n",
    "print(df1.waterfront.value_counts())\n",
    "\n",
    "data = df1[[\"waterfront\", \"priceFillMedian\", \"bedrooms\"]].copy()\n",
    "\n",
    "df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
    "    data, stratify_colname=\"waterfront\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
    ")\n",
    "\n",
    "print(\"Обучающая выборка: \", df_train.shape)\n",
    "print(df_train.waterfront.value_counts())\n",
    "\n",
    "print(\"Контрольная выборка: \", df_val.shape)\n",
    "print(df_val.waterfront.value_counts())\n",
    "\n",
    "print(\"Тестовая выборка: \", df_test.shape)\n",
    "print(df_test.waterfront.value_counts())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Выборка с избытком (oversampling)\n",
    "\n",
    "https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/\n",
    "\n",
    "https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/\n",
    "\n",
    "Выборка с недостатком (undersampling)\n",
    "\n",
    "https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/\n",
    "\n",
    "Библиотека imbalanced-learn\n",
    "\n",
    "https://imbalanced-learn.org/stable/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Обучающая выборка:  (12967, 3)\n",
      "waterfront\n",
      "0    12869\n",
      "1       98\n",
      "Name: count, dtype: int64\n",
      "Обучающая выборка после oversampling:  (25754, 3)\n",
      "waterfront\n",
      "1    12885\n",
      "0    12869\n",
      "Name: count, dtype: int64\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>waterfront</th>\n",
       "      <th>priceFillMedian</th>\n",
       "      <th>bedrooms</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>360000.000000</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>488250.000000</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>699000.000000</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>405000.000000</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>343000.000000</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25749</th>\n",
       "      <td>1</td>\n",
       "      <td>528600.011954</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25750</th>\n",
       "      <td>1</td>\n",
       "      <td>537978.366911</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25751</th>\n",
       "      <td>1</td>\n",
       "      <td>522391.975863</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25752</th>\n",
       "      <td>1</td>\n",
       "      <td>537575.496160</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25753</th>\n",
       "      <td>1</td>\n",
       "      <td>538006.679050</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>25754 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       waterfront  priceFillMedian  bedrooms\n",
       "0               0    360000.000000         4\n",
       "1               0    488250.000000         4\n",
       "2               0    699000.000000         4\n",
       "3               0    405000.000000         2\n",
       "4               0    343000.000000         2\n",
       "...           ...              ...       ...\n",
       "25749           1    528600.011954         2\n",
       "25750           1    537978.366911         1\n",
       "25751           1    522391.975863         3\n",
       "25752           1    537575.496160         2\n",
       "25753           1    538006.679050         2\n",
       "\n",
       "[25754 rows x 3 columns]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from imblearn.over_sampling import ADASYN\n",
    "\n",
    "ada = ADASYN()\n",
    "\n",
    "print(\"Обучающая выборка: \", df_train.shape)\n",
    "print(df_train.waterfront.value_counts())\n",
    "\n",
    "X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"waterfront\"])\n",
    "df_train_adasyn = pd.DataFrame(X_resampled)\n",
    "\n",
    "print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
    "print(df_train_adasyn.waterfront.value_counts())\n",
    "\n",
    "df_train_adasyn"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}