feat(lab-2): do lab-2, part 2

2024-11-23 15:06:07 +04:00 · 2024-11-23 15:06:07 +04:00 · f7672b7625
commit f7672b7625
parent f249d643dc
3 changed files with 792 additions and 10 deletions
--- a/data/car_price_prediction.csv
+++ b/data/car_price_prediction.csv
@ -1,4 +1,4 @@
-ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
+ID,Price,Levy,Manufacturer,Model,Prod_year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear_box_type,Drive_wheels,Doors,Wheel,Color,Airbags
 45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
 44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
 45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
--- a/notebooks/lab2_1.ipynb
+++ b/notebooks/lab2_1.ipynb
@ -54,7 +54,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
@ -63,6 +63,229 @@
    "    if null_rate > 0:\n",
    "        print(f\"{i} процент пустых значений: {null_rate:.2f}%\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Создание выборок данных"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "\n",
    "def split_stratified_into_train_val_test(\n",
    "    df_input,\n",
    "    stratify_colname=\"y\",\n",
    "    frac_train=0.6,\n",
    "    frac_val=0.15,\n",
    "    frac_test=0.25,\n",
    "    random_state=None,\n",
    "):\n",
    "    \"\"\"\n",
    "    Splits a Pandas dataframe into three subsets (train, val, and test)\n",
    "    following fractional ratios provided by the user, where each subset is\n",
    "    stratified by the values in a specific column (that is, each subset has\n",
    "    the same relative frequency of the values in the column). It performs this\n",
    "    splitting by running train_test_split() twice.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df_input : Pandas dataframe\n",
    "        Input dataframe to be split.\n",
    "    stratify_colname : str\n",
    "        The name of the column that will be used for stratification. Usually\n",
    "        this column would be for the label.\n",
    "    frac_train : float\n",
    "    frac_val   : float\n",
    "    frac_test  : float\n",
    "        The ratios with which the dataframe will be split into train, val, and\n",
    "        test data. The values should be expressed as float fractions and should\n",
    "        sum to 1.0.\n",
    "    random_state : int, None, or RandomStateInstance\n",
    "        Value to be passed to train_test_split().\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    df_train, df_val, df_test :\n",
    "        Dataframes containing the three splits.\n",
    "    \"\"\"\n",
    "\n",
    "    if frac_train + frac_val + frac_test != 1.0:\n",
    "        raise ValueError(\n",
    "            \"fractions %f, %f, %f do not add up to 1.0\"\n",
    "            % (frac_train, frac_val, frac_test)\n",
    "        )\n",
    "\n",
    "    if stratify_colname not in df_input.columns:\n",
    "        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
    "\n",
    "    X = df_input  # Contains all columns.\n",
    "    y = df_input[\n",
    "        [stratify_colname]\n",
    "    ]  # Dataframe of just the column on which to stratify.\n",
    "\n",
    "    # Split original dataframe into train and temp dataframes.\n",
    "    df_train, df_temp, y_train, y_temp = train_test_split(\n",
    "        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
    "    )\n",
    "\n",
    "    # Split the temp dataframe into val and test dataframes.\n",
    "    relative_frac_test = frac_test / (frac_val + frac_test)\n",
    "    df_val, df_test, y_val, y_test = train_test_split(\n",
    "        df_temp,\n",
    "        y_temp,\n",
    "        stratify=y_temp,\n",
    "        test_size=relative_frac_test,\n",
    "        random_state=random_state,\n",
    "    )\n",
    "\n",
    "    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
    "\n",
    "    return df_train, df_val, df_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[3 5 4 1 2]\n"
     ]
    }
   ],
   "source": [
    "print(df.condition.unique())\n",
    "\n",
    "data = df[\n",
    "    [\n",
    "        \"price\",\n",
    "        \"bedrooms\",\n",
    "        \"bathrooms\",\n",
    "        \"sqft_living\",\n",
    "        \"sqft_lot\",\n",
    "        \"floors\",\n",
    "        \"view\",\n",
    "        \"condition\",\n",
    "        \"grade\",\n",
    "        \"sqft_above\",\n",
    "        \"sqft_basement\",\n",
    "        \"yr_built\",\n",
    "        \"yr_renovated\",\n",
    "        \"zipcode\",\n",
    "        \"lat\",\n",
    "        \"long\",\n",
    "    ]\n",
    "].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Обучающая выборка:  (12967, 16)\n",
      "condition\n",
      "3    8418\n",
      "4    3407\n",
      "5    1021\n",
      "2     103\n",
      "1      18\n",
      "Name: count, dtype: int64\n",
      "Контрольная выборка:  (4323, 16)\n",
      "condition\n",
      "3    2806\n",
      "4    1136\n",
      "5     340\n",
      "2      35\n",
      "1       6\n",
      "Name: count, dtype: int64\n",
      "Тестовая выборка:  (4323, 16)\n",
      "condition\n",
      "3    2807\n",
      "4    1136\n",
      "5     340\n",
      "2      34\n",
      "1       6\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
    "    data,\n",
    "    stratify_colname=\"condition\",\n",
    "    frac_train=0.60,\n",
    "    frac_val=0.20,\n",
    "    frac_test=0.20,\n",
    ")\n",
    "\n",
    "print(\"Обучающая выборка: \", df_train.shape)\n",
    "print(df_train.condition.value_counts())\n",
    "\n",
    "print(\"Контрольная выборка: \", df_val.shape)\n",
    "print(df_val.condition.value_counts())\n",
    "\n",
    "print(\"Тестовая выборка: \", df_test.shape)\n",
    "print(df_test.condition.value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Обучающая выборка:  (12967, 16)\n",
      "condition\n",
      "3    8418\n",
      "4    3407\n",
      "5    1021\n",
      "2     103\n",
      "1      18\n",
      "Name: count, dtype: int64\n",
      "Обучающая выборка после oversampling:  (42073, 16)\n",
      "condition\n",
      "5    8464\n",
      "2    8421\n",
      "1    8420\n",
      "3    8418\n",
      "4    8350\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "from imblearn.over_sampling import ADASYN\n",
    "\n",
    "ada = ADASYN()\n",
    "\n",
    "print(\"Обучающая выборка: \", df_train.shape)\n",
    "print(df_train.condition.value_counts())\n",
    "\n",
    "X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"condition\"])\n",
    "df_train_adasyn = pd.DataFrame(X_resampled)\n",
    "\n",
    "print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
    "print(df_train_adasyn.condition.value_counts())"
   ]
  }
 ],
 "metadata": {
--- a/notebooks/lab2_2.ipynb
+++ b/notebooks/lab2_2.ipynb
@ -9,7 +9,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
@ -20,9 +20,188 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>Price</th>\n",
       "      <th>Levy</th>\n",
       "      <th>Manufacturer</th>\n",
       "      <th>Model</th>\n",
       "      <th>Prod_year</th>\n",
       "      <th>Category</th>\n",
       "      <th>Leather interior</th>\n",
       "      <th>Fuel type</th>\n",
       "      <th>Engine volume</th>\n",
       "      <th>Mileage</th>\n",
       "      <th>Cylinders</th>\n",
       "      <th>Gear_box_type</th>\n",
       "      <th>Drive_wheels</th>\n",
       "      <th>Doors</th>\n",
       "      <th>Wheel</th>\n",
       "      <th>Color</th>\n",
       "      <th>Airbags</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>45654403</td>\n",
       "      <td>13328</td>\n",
       "      <td>1399</td>\n",
       "      <td>LEXUS</td>\n",
       "      <td>RX 450</td>\n",
       "      <td>2010</td>\n",
       "      <td>Jeep</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Hybrid</td>\n",
       "      <td>3.5</td>\n",
       "      <td>186005 km</td>\n",
       "      <td>6.0</td>\n",
       "      <td>Automatic</td>\n",
       "      <td>4x4</td>\n",
       "      <td>04-May</td>\n",
       "      <td>Left wheel</td>\n",
       "      <td>Silver</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>44731507</td>\n",
       "      <td>16621</td>\n",
       "      <td>1018</td>\n",
       "      <td>CHEVROLET</td>\n",
       "      <td>Equinox</td>\n",
       "      <td>2011</td>\n",
       "      <td>Jeep</td>\n",
       "      <td>No</td>\n",
       "      <td>Petrol</td>\n",
       "      <td>3</td>\n",
       "      <td>192000 km</td>\n",
       "      <td>6.0</td>\n",
       "      <td>Tiptronic</td>\n",
       "      <td>4x4</td>\n",
       "      <td>04-May</td>\n",
       "      <td>Left wheel</td>\n",
       "      <td>Black</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>45774419</td>\n",
       "      <td>8467</td>\n",
       "      <td>-</td>\n",
       "      <td>HONDA</td>\n",
       "      <td>FIT</td>\n",
       "      <td>2006</td>\n",
       "      <td>Hatchback</td>\n",
       "      <td>No</td>\n",
       "      <td>Petrol</td>\n",
       "      <td>1.3</td>\n",
       "      <td>200000 km</td>\n",
       "      <td>4.0</td>\n",
       "      <td>Variator</td>\n",
       "      <td>Front</td>\n",
       "      <td>04-May</td>\n",
       "      <td>Right-hand drive</td>\n",
       "      <td>Black</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>45769185</td>\n",
       "      <td>3607</td>\n",
       "      <td>862</td>\n",
       "      <td>FORD</td>\n",
       "      <td>Escape</td>\n",
       "      <td>2011</td>\n",
       "      <td>Jeep</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Hybrid</td>\n",
       "      <td>2.5</td>\n",
       "      <td>168966 km</td>\n",
       "      <td>4.0</td>\n",
       "      <td>Automatic</td>\n",
       "      <td>4x4</td>\n",
       "      <td>04-May</td>\n",
       "      <td>Left wheel</td>\n",
       "      <td>White</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>45809263</td>\n",
       "      <td>11726</td>\n",
       "      <td>446</td>\n",
       "      <td>HONDA</td>\n",
       "      <td>FIT</td>\n",
       "      <td>2014</td>\n",
       "      <td>Hatchback</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Petrol</td>\n",
       "      <td>1.3</td>\n",
       "      <td>91901 km</td>\n",
       "      <td>4.0</td>\n",
       "      <td>Automatic</td>\n",
       "      <td>Front</td>\n",
       "      <td>04-May</td>\n",
       "      <td>Left wheel</td>\n",
       "      <td>Silver</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         ID  Price  Levy Manufacturer    Model  Prod_year   Category  \\\n",
       "0  45654403  13328  1399        LEXUS   RX 450       2010       Jeep   \n",
       "1  44731507  16621  1018    CHEVROLET  Equinox       2011       Jeep   \n",
       "2  45774419   8467     -        HONDA      FIT       2006  Hatchback   \n",
       "3  45769185   3607   862         FORD   Escape       2011       Jeep   \n",
       "4  45809263  11726   446        HONDA      FIT       2014  Hatchback   \n",
       "\n",
       "  Leather interior Fuel type Engine volume    Mileage  Cylinders  \\\n",
       "0              Yes    Hybrid           3.5  186005 km        6.0   \n",
       "1               No    Petrol             3  192000 km        6.0   \n",
       "2               No    Petrol           1.3  200000 km        4.0   \n",
       "3              Yes    Hybrid           2.5  168966 km        4.0   \n",
       "4              Yes    Petrol           1.3   91901 km        4.0   \n",
       "\n",
       "  Gear_box_type Drive_wheels   Doors             Wheel   Color  Airbags  \n",
       "0     Automatic          4x4  04-May        Left wheel  Silver       12  \n",
       "1     Tiptronic          4x4  04-May        Left wheel   Black        8  \n",
       "2      Variator        Front  04-May  Right-hand drive   Black        2  \n",
       "3     Automatic          4x4  04-May        Left wheel   White        0  \n",
       "4     Automatic        Front  04-May        Left wheel  Silver        4  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
@ -36,33 +215,413 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ID                  0\n",
      "Price               0\n",
      "Levy                0\n",
      "Manufacturer        0\n",
      "Model               0\n",
      "Prod_year           0\n",
      "Category            0\n",
      "Leather interior    0\n",
      "Fuel type           0\n",
      "Engine volume       0\n",
      "Mileage             0\n",
      "Cylinders           0\n",
      "Gear_box_type       0\n",
      "Drive_wheels        0\n",
      "Doors               0\n",
      "Wheel               0\n",
      "Color               0\n",
      "Airbags             0\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "print(df.isnull().sum())"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ID                  False\n",
      "Price               False\n",
      "Levy                False\n",
      "Manufacturer        False\n",
      "Model               False\n",
      "Prod_year           False\n",
      "Category            False\n",
      "Leather interior    False\n",
      "Fuel type           False\n",
      "Engine volume       False\n",
      "Mileage             False\n",
      "Cylinders           False\n",
      "Gear_box_type       False\n",
      "Drive_wheels        False\n",
      "Doors               False\n",
      "Wheel               False\n",
      "Color               False\n",
      "Airbags             False\n",
      "dtype: bool\n"
     ]
    }
   ],
   "source": [
    "print(df.isnull().any())"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['1399' '1018' '-' '862' '446' '891' '761' '751' '394' '1053' '1055'\n",
      " '1079' '810' '2386' '1850' '531' '586' '1249' '2455' '583' '1537' '1288'\n",
      " '915' '1750' '707' '1077' '1486' '1091' '650' '382' '1436' '1194' '503'\n",
      " '1017' '1104' '639' '629' '919' '781' '530' '640' '765' '777' '779' '934'\n",
      " '769' '645' '1185' '1324' '830' '1187' '1111' '760' '642' '1604' '1095'\n",
      " '966' '473' '1138' '1811' '988' '917' '1156' '687' '11714' '836' '1347'\n",
      " '2866' '1646' '259' '609' '697' '585' '475' '690' '308' '1823' '1361'\n",
      " '1273' '924' '584' '2078' '831' '1172' '893' '1872' '1885' '1266' '447'\n",
      " '2148' '1730' '730' '289' '502' '333' '1325' '247' '879' '1342' '1327'\n",
      " '1598' '1514' '1058' '738' '1935' '481' '1522' '1282' '456' '880' '900'\n",
      " '798' '1277' '442' '1051' '790' '1292' '1047' '528' '1211' '1493' '1793'\n",
      " '574' '930' '1998' '271' '706' '1481' '1677' '1661' '1286' '1408' '1090'\n",
      " '595' '1451' '1267' '993' '1714' '878' '641' '749' '1511' '603' '353'\n",
      " '877' '1236' '1141' '397' '784' '1024' '1357' '1301' '770' '922' '1438'\n",
      " '753' '607' '1363' '638' '490' '431' '565' '517' '833' '489' '1760' '986'\n",
      " '1841' '1620' '1360' '474' '1099' '978' '1624' '1946' '1268' '1307' '696'\n",
      " '649' '666' '2151' '551' '800' '971' '1323' '2377' '1845' '1083' '694'\n",
      " '463' '419' '345' '1515' '1505' '2056' '1203' '729' '460' '1356' '876'\n",
      " '911' '1190' '780' '448' '2410' '1848' '1148' '834' '1275' '1028' '1197'\n",
      " '724' '890' '1705' '505' '789' '2959' '518' '461' '1719' '2858' '3156'\n",
      " '2225' '2177' '1968' '1888' '1308' '2736' '1103' '557' '2195' '843'\n",
      " '1664' '723' '4508' '562' '501' '2018' '1076' '1202' '3301' '691' '1440'\n",
      " '1869' '1178' '418' '1820' '1413' '488' '1304' '363' '2108' '521' '1659'\n",
      " '87' '1411' '1528' '3292' '7058' '1578' '627' '874' '1996' '1488' '5679'\n",
      " '1234' '5603' '400' '889' '3268' '875' '949' '2265' '441' '742' '425'\n",
      " '2476' '2971' '614' '1816' '1375' '1405' '2297' '1062' '1113' '420'\n",
      " '2469' '658' '1951' '2670' '2578' '1995' '1032' '994' '1011' '2421'\n",
      " '1296' '155' '494' '426' '1086' '961' '2236' '1829' '764' '1834' '1054'\n",
      " '617' '1529' '2266' '637' '626' '1832' '1016' '2002' '1756' '746' '1285'\n",
      " '2690' '1118' '5332' '980' '1807' '970' '1228' '1195' '1132' '1768'\n",
      " '1384' '1080' '7063' '1817' '1452' '1975' '1368' '702' '1974' '1781'\n",
      " '1036' '944' '663' '364' '1539' '1345' '1680' '2209' '741' '1575' '695'\n",
      " '1317' '294' '1525' '424' '997' '1473' '1552' '2819' '2188' '1668' '3057'\n",
      " '799' '1502' '2606' '552' '1694' '1759' '1110' '399' '1470' '1174' '5877'\n",
      " '1474' '1688' '526' '686' '5908' '1107' '2070' '1468' '1246' '1685' '556'\n",
      " '1533' '1917' '1346' '732' '692' '579' '421' '362' '3505' '1855' '2711'\n",
      " '1586' '3739' '681' '1708' '2278' '1701' '722' '1482' '928' '827' '832'\n",
      " '527' '604' '173' '1341' '3329' '1553' '859' '167' '916' '828' '2082'\n",
      " '1176' '1108' '975' '3008' '1516' '2269' '1699' '2073' '1031' '1503'\n",
      " '2364' '1030' '1442' '5666' '2715' '1437' '2067' '1426' '2908' '1279'\n",
      " '866' '4283' '279' '2658' '3015' '2004' '1391' '4736' '748' '1466' '644'\n",
      " '683' '2705' '1297' '731' '1252' '2216' '3141' '3273' '1518' '1723'\n",
      " '1588' '972' '682' '1094' '668' '175' '967' '402' '3894' '1960' '1599'\n",
      " '2000' '2084' '1621' '714' '1109' '3989' '873' '1572' '1163' '1991'\n",
      " '1716' '1673' '2562' '2874' '965' '462' '605' '1948' '1736' '3518' '2054'\n",
      " '2467' '1681' '1272' '1205' '750' '2156' '2566' '115' '524' '3184' '676'\n",
      " '1678' '612' '328' '955' '1441' '1675' '3965' '2909' '623' '822' '867'\n",
      " '3025' '1993' '792' '636' '4057' '3743' '2337' '2570' '2418' '2472'\n",
      " '3910' '1662' '2123' '2628' '3208' '2080' '3699' '2913' '864' '2505'\n",
      " '870' '7536' '1924' '1671' '1064' '1836' '1866' '4741' '841' '1369'\n",
      " '5681' '3112' '1366' '2223' '1198' '1039' '3811' '3571' '1387' '1171'\n",
      " '1365' '1531' '1590' '11706' '2308' '4860' '1641' '1045' '1901']\n"
     ]
    }
   ],
   "source": [
    "print(df[\"Levy\"].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[\"Levy\"] = df[\"Levy\"].replace({'-' : None})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Levy процент пустых значений: 30.25%\n"
     ]
    }
   ],
   "source": [
    "for i in df.columns:\n",
    "    null_rate = df[i].isnull().sum() / len(df) * 100\n",
    "    if null_rate > 0:\n",
    "        print(f\"{i} процент пустых значений: {null_rate:.2f}%\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Заполнение пропущенных данных"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.fillna({\"Levy\": 0}, inplace=True)\n",
    "for i in df.columns:\n",
    "    null_rate = df[i].isnull().sum() / len(df) * 100\n",
    "    if null_rate > 0:\n",
    "        print(f\"{i} процент пустых значений: {null_rate:.2f}%\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Создание выборок данных"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "\n",
    "def split_stratified_into_train_val_test(\n",
    "    df_input,\n",
    "    stratify_colname=\"y\",\n",
    "    frac_train=0.6,\n",
    "    frac_val=0.15,\n",
    "    frac_test=0.25,\n",
    "    random_state=None,\n",
    "):\n",
    "    \"\"\"\n",
    "    Splits a Pandas dataframe into three subsets (train, val, and test)\n",
    "    following fractional ratios provided by the user, where each subset is\n",
    "    stratified by the values in a specific column (that is, each subset has\n",
    "    the same relative frequency of the values in the column). It performs this\n",
    "    splitting by running train_test_split() twice.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df_input : Pandas dataframe\n",
    "        Input dataframe to be split.\n",
    "    stratify_colname : str\n",
    "        The name of the column that will be used for stratification. Usually\n",
    "        this column would be for the label.\n",
    "    frac_train : float\n",
    "    frac_val   : float\n",
    "    frac_test  : float\n",
    "        The ratios with which the dataframe will be split into train, val, and\n",
    "        test data. The values should be expressed as float fractions and should\n",
    "        sum to 1.0.\n",
    "    random_state : int, None, or RandomStateInstance\n",
    "        Value to be passed to train_test_split().\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    df_train, df_val, df_test :\n",
    "        Dataframes containing the three splits.\n",
    "    \"\"\"\n",
    "\n",
    "    if frac_train + frac_val + frac_test != 1.0:\n",
    "        raise ValueError(\n",
    "            \"fractions %f, %f, %f do not add up to 1.0\"\n",
    "            % (frac_train, frac_val, frac_test)\n",
    "        )\n",
    "\n",
    "    if stratify_colname not in df_input.columns:\n",
    "        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
    "\n",
    "    X = df_input  # Contains all columns.\n",
    "    y = df_input[\n",
    "        [stratify_colname]\n",
    "    ]  # Dataframe of just the column on which to stratify.\n",
    "\n",
    "    # Split original dataframe into train and temp dataframes.\n",
    "    df_train, df_temp, y_train, y_temp = train_test_split(\n",
    "        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
    "    )\n",
    "\n",
    "    # Split the temp dataframe into val and test dataframes.\n",
    "    relative_frac_test = frac_test / (frac_val + frac_test)\n",
    "    df_val, df_test, y_val, y_test = train_test_split(\n",
    "        df_temp,\n",
    "        y_temp,\n",
    "        stratify=y_temp,\n",
    "        test_size=relative_frac_test,\n",
    "        random_state=random_state,\n",
    "    )\n",
    "\n",
    "    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
    "\n",
    "    return df_train, df_val, df_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Automatic' 'Tiptronic' 'Variator' 'Manual']\n"
     ]
    }
   ],
   "source": [
    "print(df.Gear_box_type.unique())\n",
    "\n",
    "data = df[\n",
    "    [\n",
    "        \"Price\",\n",
    "        \"Gear_box_type\",\n",
    "    ]\n",
    "].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Обучающая выборка:  (11542, 2)\n",
      "Gear_box_type\n",
      "Automatic    8108\n",
      "Tiptronic    1861\n",
      "Manual       1125\n",
      "Variator      448\n",
      "Name: count, dtype: int64\n",
      "Контрольная выборка:  (3847, 2)\n",
      "Gear_box_type\n",
      "Automatic    2703\n",
      "Tiptronic     620\n",
      "Manual        375\n",
      "Variator      149\n",
      "Name: count, dtype: int64\n",
      "Тестовая выборка:  (3848, 2)\n",
      "Gear_box_type\n",
      "Automatic    2703\n",
      "Tiptronic     621\n",
      "Manual        375\n",
      "Variator      149\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
    "    data,\n",
    "    stratify_colname=\"Gear_box_type\",\n",
    "    frac_train=0.60,\n",
    "    frac_val=0.20,\n",
    "    frac_test=0.20,\n",
    ")\n",
    "\n",
    "print(\"Обучающая выборка: \", df_train.shape)\n",
    "print(df_train.Gear_box_type.value_counts())\n",
    "\n",
    "print(\"Контрольная выборка: \", df_val.shape)\n",
    "print(df_val.Gear_box_type.value_counts())\n",
    "\n",
    "print(\"Тестовая выборка: \", df_test.shape)\n",
    "print(df_test.Gear_box_type.value_counts())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Выборка с избытком (oversampling)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Обучающая выборка:  (11542, 2)\n",
      "Gear_box_type\n",
      "Automatic    8108\n",
      "Tiptronic    1861\n",
      "Manual       1125\n",
      "Variator      448\n",
      "Name: count, dtype: int64\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "could not convert string to float: 'Automatic'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_9996\\2277749880.py\u001b[0m in \u001b[0;36m?\u001b[1;34m()\u001b[0m\n\u001b[0;32m      4\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Обучающая выборка: \"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      6\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mGear_box_type\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalue_counts\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 8\u001b[1;33m \u001b[0mX_resampled\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_resampled\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mada\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit_resample\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"Gear_box_type\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      9\u001b[0m \u001b[0mdf_train_adasyn\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_resampled\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     10\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     11\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Обучающая выборка после oversampling: \"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdf_train_adasyn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\imblearn\\base.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m    204\u001b[0m         \u001b[0my_resampled\u001b[0m \u001b[1;33m:\u001b[0m \u001b[0marray\u001b[0m\u001b[1;33m-\u001b[0m\u001b[0mlike\u001b[0m \u001b[0mof\u001b[0m \u001b[0mshape\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mn_samples_new\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    205\u001b[0m             \u001b[0mThe\u001b[0m \u001b[0mcorresponding\u001b[0m \u001b[0mlabel\u001b[0m \u001b[0mof\u001b[0m \u001b[1;33m`\u001b[0m\u001b[0mX_resampled\u001b[0m\u001b[1;33m`\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    206\u001b[0m         \"\"\"\n\u001b[0;32m    207\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_params\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 208\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit_resample\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\imblearn\\base.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m    102\u001b[0m             \u001b[0mThe\u001b[0m \u001b[0mcorresponding\u001b[0m \u001b[0mlabel\u001b[0m \u001b[0mof\u001b[0m \u001b[1;33m`\u001b[0m\u001b[0mX_resampled\u001b[0m\u001b[1;33m`\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    103\u001b[0m         \"\"\"\n\u001b[0;32m    104\u001b[0m         \u001b[0mcheck_classification_targets\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    105\u001b[0m         \u001b[0marrays_transformer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mArraysTransformer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 106\u001b[1;33m         \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbinarize_y\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_check_X_y\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    107\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    108\u001b[0m         self.sampling_strategy_ = check_sampling_strategy(\n\u001b[0;32m    109\u001b[0m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msampling_strategy\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sampling_type\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\imblearn\\base.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, X, y, accept_sparse)\u001b[0m\n\u001b[0;32m    157\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m_check_X_y\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    158\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0maccept_sparse\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    159\u001b[0m             \u001b[0maccept_sparse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;34m\"csr\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"csc\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    160\u001b[0m         \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbinarize_y\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcheck_target_type\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindicate_one_vs_all\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 161\u001b[1;33m         \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_data\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreset\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maccept_sparse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    162\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbinarize_y\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\sklearn\\base.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)\u001b[0m\n\u001b[0;32m    646\u001b[0m                 \u001b[1;32mif\u001b[0m \u001b[1;34m\"estimator\"\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mcheck_y_params\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    647\u001b[0m                     \u001b[0mcheck_y_params\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mdefault_check_params\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    648\u001b[0m                 \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minput_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"y\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    649\u001b[0m             \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 650\u001b[1;33m                 \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mcheck_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    651\u001b[0m             \u001b[0mout\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    652\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    653\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mno_val_X\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mcheck_params\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"ensure_2d\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\sklearn\\utils\\validation.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[0;32m   1297\u001b[0m         raise ValueError(\n\u001b[0;32m   1298\u001b[0m             \u001b[1;33mf\"\u001b[0m\u001b[1;33m{\u001b[0m\u001b[0mestimator_name\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m requires y to be passed, but the target y is None\u001b[0m\u001b[1;33m\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1299\u001b[0m         \u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1300\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1301\u001b[1;33m     X = check_array(\n\u001b[0m\u001b[0;32m   1302\u001b[0m         \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1303\u001b[0m         \u001b[0maccept_sparse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maccept_sparse\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1304\u001b[0m         \u001b[0maccept_large_sparse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maccept_large_sparse\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\sklearn\\utils\\validation.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[0;32m   1009\u001b[0m                         \u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1010\u001b[0m                     \u001b[0marray\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mxp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1011\u001b[0m                 \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1012\u001b[0m                     \u001b[0marray\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_asarray_with_order\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mxp\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mxp\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1013\u001b[1;33m             \u001b[1;32mexcept\u001b[0m \u001b[0mComplexWarning\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mcomplex_warning\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1014\u001b[0m                 raise ValueError(\n\u001b[0;32m   1015\u001b[0m                     \u001b[1;34m\"Complex data not supported\\n{}\\n\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1016\u001b[0m                 \u001b[1;33m)\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mcomplex_warning\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\sklearn\\utils\\_array_api.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(array, dtype, order, copy, xp, device)\u001b[0m\n\u001b[0;32m    741\u001b[0m         \u001b[1;31m# Use NumPy API to support order\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    742\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mcopy\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    743\u001b[0m             \u001b[0marray\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    744\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 745\u001b[1;33m             \u001b[0marray\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    746\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    747\u001b[0m         \u001b[1;31m# At this point array is a NumPy ndarray. We convert it to an array\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    748\u001b[0m         \u001b[1;31m# container that is consistent with the input's namespace.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, dtype, copy)\u001b[0m\n\u001b[0;32m   2149\u001b[0m     def __array__(\n\u001b[0;32m   2150\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mnpt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mDTypeLike\u001b[0m \u001b[1;33m|\u001b[0m \u001b[1;32mNone\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mbool_t\u001b[0m \u001b[1;33m|\u001b[0m \u001b[1;32mNone\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2151\u001b[0m     \u001b[1;33m)\u001b[0m \u001b[1;33m->\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mndarray\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2152\u001b[0m         \u001b[0mvalues\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_values\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2153\u001b[1;33m         \u001b[0marr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   2154\u001b[0m         if (\n\u001b[0;32m   2155\u001b[0m             \u001b[0mastype_is_view\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0marr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2156\u001b[0m             \u001b[1;32mand\u001b[0m \u001b[0musing_copy_on_write\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mValueError\u001b[0m: could not convert string to float: 'Automatic'"
     ]
    }
   ],
   "source": [
    "from imblearn.over_sampling import ADASYN\n",
    "\n",
    "ada = ADASYN()\n",
    "\n",
    "print(\"Обучающая выборка: \", df_train.shape)\n",
    "print(df_train.Gear_box_type.value_counts())\n",
    "\n",
    "X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"Gear_box_type\"])\n",
    "df_train_adasyn = pd.DataFrame(X_resampled)\n",
    "\n",
    "print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
    "print(df_train_adasyn.Gear_box_type.value_counts())"
   ]
  }
 ],
 "metadata": {