mai_pi-33_zakharov/notebooks/lab2_1.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Загрузка данных в DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"../data/kc_house_data.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Получение сведений о пропущенных данных"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(df.isnull().sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(df.isnull().any())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in df.columns:\n",
    "    null_rate = df[i].isnull().sum() / len(df) * 100\n",
    "    if null_rate > 0:\n",
    "        print(f\"{i} процент пустых значений: {null_rate:.2f}%\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Создание выборок данных"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "\n",
    "def split_stratified_into_train_val_test(\n",
    "    df_input,\n",
    "    stratify_colname=\"y\",\n",
    "    frac_train=0.6,\n",
    "    frac_val=0.15,\n",
    "    frac_test=0.25,\n",
    "    random_state=None,\n",
    "):\n",
    "    \"\"\"\n",
    "    Splits a Pandas dataframe into three subsets (train, val, and test)\n",
    "    following fractional ratios provided by the user, where each subset is\n",
    "    stratified by the values in a specific column (that is, each subset has\n",
    "    the same relative frequency of the values in the column). It performs this\n",
    "    splitting by running train_test_split() twice.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df_input : Pandas dataframe\n",
    "        Input dataframe to be split.\n",
    "    stratify_colname : str\n",
    "        The name of the column that will be used for stratification. Usually\n",
    "        this column would be for the label.\n",
    "    frac_train : float\n",
    "    frac_val   : float\n",
    "    frac_test  : float\n",
    "        The ratios with which the dataframe will be split into train, val, and\n",
    "        test data. The values should be expressed as float fractions and should\n",
    "        sum to 1.0.\n",
    "    random_state : int, None, or RandomStateInstance\n",
    "        Value to be passed to train_test_split().\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    df_train, df_val, df_test :\n",
    "        Dataframes containing the three splits.\n",
    "    \"\"\"\n",
    "\n",
    "    if frac_train + frac_val + frac_test != 1.0:\n",
    "        raise ValueError(\n",
    "            \"fractions %f, %f, %f do not add up to 1.0\"\n",
    "            % (frac_train, frac_val, frac_test)\n",
    "        )\n",
    "\n",
    "    if stratify_colname not in df_input.columns:\n",
    "        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
    "\n",
    "    X = df_input  # Contains all columns.\n",
    "    y = df_input[\n",
    "        [stratify_colname]\n",
    "    ]  # Dataframe of just the column on which to stratify.\n",
    "\n",
    "    # Split original dataframe into train and temp dataframes.\n",
    "    df_train, df_temp, y_train, y_temp = train_test_split(\n",
    "        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
    "    )\n",
    "\n",
    "    # Split the temp dataframe into val and test dataframes.\n",
    "    relative_frac_test = frac_test / (frac_val + frac_test)\n",
    "    df_val, df_test, y_val, y_test = train_test_split(\n",
    "        df_temp,\n",
    "        y_temp,\n",
    "        stratify=y_temp,\n",
    "        test_size=relative_frac_test,\n",
    "        random_state=random_state,\n",
    "    )\n",
    "\n",
    "    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
    "\n",
    "    return df_train, df_val, df_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[3 5 4 1 2]\n"
     ]
    }
   ],
   "source": [
    "print(df.condition.unique())\n",
    "\n",
    "data = df[\n",
    "    [\n",
    "        \"price\",\n",
    "        \"bedrooms\",\n",
    "        \"bathrooms\",\n",
    "        \"sqft_living\",\n",
    "        \"sqft_lot\",\n",
    "        \"floors\",\n",
    "        \"view\",\n",
    "        \"condition\",\n",
    "        \"grade\",\n",
    "        \"sqft_above\",\n",
    "        \"sqft_basement\",\n",
    "        \"yr_built\",\n",
    "        \"yr_renovated\",\n",
    "        \"zipcode\",\n",
    "        \"lat\",\n",
    "        \"long\",\n",
    "    ]\n",
    "].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Обучающая выборка:  (12967, 16)\n",
      "condition\n",
      "3    8418\n",
      "4    3407\n",
      "5    1021\n",
      "2     103\n",
      "1      18\n",
      "Name: count, dtype: int64\n",
      "Контрольная выборка:  (4323, 16)\n",
      "condition\n",
      "3    2806\n",
      "4    1136\n",
      "5     340\n",
      "2      35\n",
      "1       6\n",
      "Name: count, dtype: int64\n",
      "Тестовая выборка:  (4323, 16)\n",
      "condition\n",
      "3    2807\n",
      "4    1136\n",
      "5     340\n",
      "2      34\n",
      "1       6\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
    "    data,\n",
    "    stratify_colname=\"condition\",\n",
    "    frac_train=0.60,\n",
    "    frac_val=0.20,\n",
    "    frac_test=0.20,\n",
    ")\n",
    "\n",
    "print(\"Обучающая выборка: \", df_train.shape)\n",
    "print(df_train.condition.value_counts())\n",
    "\n",
    "print(\"Контрольная выборка: \", df_val.shape)\n",
    "print(df_val.condition.value_counts())\n",
    "\n",
    "print(\"Тестовая выборка: \", df_test.shape)\n",
    "print(df_test.condition.value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Обучающая выборка:  (12967, 16)\n",
      "condition\n",
      "3    8418\n",
      "4    3407\n",
      "5    1021\n",
      "2     103\n",
      "1      18\n",
      "Name: count, dtype: int64\n",
      "Обучающая выборка после oversampling:  (42073, 16)\n",
      "condition\n",
      "5    8464\n",
      "2    8421\n",
      "1    8420\n",
      "3    8418\n",
      "4    8350\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "from imblearn.over_sampling import ADASYN\n",
    "\n",
    "ada = ADASYN()\n",
    "\n",
    "print(\"Обучающая выборка: \", df_train.shape)\n",
    "print(df_train.condition.value_counts())\n",
    "\n",
    "X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"condition\"])\n",
    "df_train_adasyn = pd.DataFrame(X_resampled)\n",
    "\n",
    "print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
    "print(df_train_adasyn.condition.value_counts())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}