MII_Salin_Oleg_PIbd-33/lec2.ipynb
2024-10-22 18:54:39 +04:00

763 lines
24 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Загрузка данных в DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 19237 entries, 45654403 to 45813273\n",
"Data columns (total 17 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Price 19237 non-null int64 \n",
" 1 Levy 19237 non-null object \n",
" 2 Manufacturer 19237 non-null object \n",
" 3 Model 19237 non-null object \n",
" 4 Prodyear 19237 non-null int64 \n",
" 5 Category 19237 non-null object \n",
" 6 Leatherinterior 19237 non-null object \n",
" 7 Fueltype 19237 non-null object \n",
" 8 Engine volume 19237 non-null object \n",
" 9 Mileage 19237 non-null object \n",
" 10 Cylinders 19237 non-null float64\n",
" 11 Gear box type 19237 non-null object \n",
" 12 Drive wheels 19237 non-null object \n",
" 13 Doors 19237 non-null object \n",
" 14 Wheel 19237 non-null object \n",
" 15 Color 19237 non-null object \n",
" 16 Airbags 19237 non-null int64 \n",
"dtypes: float64(1), int64(3), object(13)\n",
"memory usage: 2.6+ MB\n",
"(19237, 17)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Price</th>\n",
" <th>Levy</th>\n",
" <th>Manufacturer</th>\n",
" <th>Model</th>\n",
" <th>Prodyear</th>\n",
" <th>Category</th>\n",
" <th>Leatherinterior</th>\n",
" <th>Fueltype</th>\n",
" <th>Engine volume</th>\n",
" <th>Mileage</th>\n",
" <th>Cylinders</th>\n",
" <th>Gear box type</th>\n",
" <th>Drive wheels</th>\n",
" <th>Doors</th>\n",
" <th>Wheel</th>\n",
" <th>Color</th>\n",
" <th>Airbags</th>\n",
" </tr>\n",
" <tr>\n",
" <th>ID</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>45654403</th>\n",
" <td>13328</td>\n",
" <td>1399</td>\n",
" <td>LEXUS</td>\n",
" <td>RX 450</td>\n",
" <td>2010</td>\n",
" <td>Jeep</td>\n",
" <td>1</td>\n",
" <td>Hybrid</td>\n",
" <td>3.5</td>\n",
" <td>186005 km</td>\n",
" <td>6.0</td>\n",
" <td>Automatic</td>\n",
" <td>4x4</td>\n",
" <td>04-May</td>\n",
" <td>Left wheel</td>\n",
" <td>Silver</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44731507</th>\n",
" <td>16621</td>\n",
" <td>1018</td>\n",
" <td>CHEVROLET</td>\n",
" <td>Equinox</td>\n",
" <td>2011</td>\n",
" <td>Jeep</td>\n",
" <td>0</td>\n",
" <td>Petrol</td>\n",
" <td>3</td>\n",
" <td>192000 km</td>\n",
" <td>6.0</td>\n",
" <td>Tiptronic</td>\n",
" <td>4x4</td>\n",
" <td>04-May</td>\n",
" <td>Left wheel</td>\n",
" <td>Black</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45774419</th>\n",
" <td>8467</td>\n",
" <td>-</td>\n",
" <td>HONDA</td>\n",
" <td>FIT</td>\n",
" <td>2006</td>\n",
" <td>Hatchback</td>\n",
" <td>0</td>\n",
" <td>Petrol</td>\n",
" <td>1.3</td>\n",
" <td>200000 km</td>\n",
" <td>4.0</td>\n",
" <td>Variator</td>\n",
" <td>Front</td>\n",
" <td>04-May</td>\n",
" <td>Right-hand drive</td>\n",
" <td>Black</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45769185</th>\n",
" <td>3607</td>\n",
" <td>862</td>\n",
" <td>FORD</td>\n",
" <td>Escape</td>\n",
" <td>2011</td>\n",
" <td>Jeep</td>\n",
" <td>1</td>\n",
" <td>Hybrid</td>\n",
" <td>2.5</td>\n",
" <td>168966 km</td>\n",
" <td>4.0</td>\n",
" <td>Automatic</td>\n",
" <td>4x4</td>\n",
" <td>04-May</td>\n",
" <td>Left wheel</td>\n",
" <td>White</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45809263</th>\n",
" <td>11726</td>\n",
" <td>446</td>\n",
" <td>HONDA</td>\n",
" <td>FIT</td>\n",
" <td>2014</td>\n",
" <td>Hatchback</td>\n",
" <td>1</td>\n",
" <td>Petrol</td>\n",
" <td>1.3</td>\n",
" <td>91901 km</td>\n",
" <td>4.0</td>\n",
" <td>Automatic</td>\n",
" <td>Front</td>\n",
" <td>04-May</td>\n",
" <td>Left wheel</td>\n",
" <td>Silver</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Price Levy Manufacturer Model Prodyear Category \\\n",
"ID \n",
"45654403 13328 1399 LEXUS RX 450 2010 Jeep \n",
"44731507 16621 1018 CHEVROLET Equinox 2011 Jeep \n",
"45774419 8467 - HONDA FIT 2006 Hatchback \n",
"45769185 3607 862 FORD Escape 2011 Jeep \n",
"45809263 11726 446 HONDA FIT 2014 Hatchback \n",
"\n",
" Leatherinterior Fueltype Engine volume Mileage Cylinders \\\n",
"ID \n",
"45654403 1 Hybrid 3.5 186005 km 6.0 \n",
"44731507 0 Petrol 3 192000 km 6.0 \n",
"45774419 0 Petrol 1.3 200000 km 4.0 \n",
"45769185 1 Hybrid 2.5 168966 km 4.0 \n",
"45809263 1 Petrol 1.3 91901 km 4.0 \n",
"\n",
" Gear box type Drive wheels Doors Wheel Color Airbags \n",
"ID \n",
"45654403 Automatic 4x4 04-May Left wheel Silver 12 \n",
"44731507 Tiptronic 4x4 04-May Left wheel Black 8 \n",
"45774419 Variator Front 04-May Right-hand drive Black 2 \n",
"45769185 Automatic 4x4 04-May Left wheel White 0 \n",
"45809263 Automatic Front 04-May Left wheel Silver 4 "
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\"data/car_price_prediction.csv\", index_col=\"ID\")\n",
"\n",
"df.info()\n",
"\n",
"df[\"Leatherinterior\"] = df[\"Leatherinterior\"].apply(\n",
" lambda x: 1 if x == 'Yes' else 0,\n",
")\n",
"\n",
"print(df.shape)\n",
"\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Получение сведений о пропущенных данных"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Типы пропущенных данных:\n",
"- None - представление пустых данных в Python\n",
"- NaN - представление пустых данных в Pandas\n",
"- '' - пустая строка"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Price 0\n",
"Levy 0\n",
"Manufacturer 0\n",
"Model 0\n",
"Prodyear 0\n",
"Category 0\n",
"Leatherinterior 0\n",
"Fueltype 0\n",
"Engine volume 0\n",
"Mileage 0\n",
"Cylinders 0\n",
"Gear box type 0\n",
"Drive wheels 0\n",
"Doors 0\n",
"Wheel 0\n",
"Color 0\n",
"Airbags 0\n",
"dtype: int64\n",
"\n",
"Price False\n",
"Levy False\n",
"Manufacturer False\n",
"Model False\n",
"Prodyear False\n",
"Category False\n",
"Leatherinterior False\n",
"Fueltype False\n",
"Engine volume False\n",
"Mileage False\n",
"Cylinders False\n",
"Gear box type False\n",
"Drive wheels False\n",
"Doors False\n",
"Wheel False\n",
"Color False\n",
"Airbags False\n",
"dtype: bool\n",
"\n"
]
}
],
"source": [
"# Количество пустых значений признаков\n",
"print(df.isnull().sum())\n",
"\n",
"print()\n",
"\n",
"# Есть ли пустые значения признаков\n",
"print(df.isnull().any())\n",
"\n",
"print()\n",
"\n",
"# Процент пустых значений признаков\n",
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
" if null_rate > 0:\n",
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Заполнение пропущенных данных\n",
"\n",
"https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values\n",
"\n",
"https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# fillna_df = df.fillna(0)\n",
"\n",
"# print(fillna_df.shape)\n",
"\n",
"# print(fillna_df.isnull().any())\n",
"\n",
"# # Замена пустых данных на 0\n",
"# df[\"AgeFillNA\"] = df[\"Age\"].fillna(0)\n",
"\n",
"# # Замена пустых данных на медиану\n",
"# df[\"AgeFillMedian\"] = df[\"Age\"].fillna(df[\"Age\"].median())\n",
"\n",
"# df.tail()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# df[\"AgeCopy\"] = df[\"Age\"]\n",
"\n",
"# # Замена данных сразу в DataFrame без копирования\n",
"# df.fillna({\"AgeCopy\": 0}, inplace=True)\n",
"\n",
"# df.tail()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Удаление наблюдений с пропусками"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# dropna_df = df.dropna()\n",
"\n",
"# print(dropna_df.shape)\n",
"\n",
"# print(fillna_df.isnull().any())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Создание выборок данных\n",
"\n",
"Библиотека scikit-learn\n",
"\n",
"https://scikit-learn.org/stable/index.html"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<img src=\"assets/lec2-split.png\" width=\"600\" style=\"background-color: white\">"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Функция для создания выборок\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"\n",
"def split_stratified_into_train_val_test(\n",
" df_input,\n",
" stratify_colname=\"y\",\n",
" frac_train=0.6,\n",
" frac_val=0.15,\n",
" frac_test=0.25,\n",
" random_state=None,\n",
"):\n",
" \"\"\"\n",
" Splits a Pandas dataframe into three subsets (train, val, and test)\n",
" following fractional ratios provided by the user, where each subset is\n",
" stratified by the values in a specific column (that is, each subset has\n",
" the same relative frequency of the values in the column). It performs this\n",
" splitting by running train_test_split() twice.\n",
"\n",
" Parameters\n",
" ----------\n",
" df_input : Pandas dataframe\n",
" Input dataframe to be split.\n",
" stratify_colname : str\n",
" The name of the column that will be used for stratification. Usually\n",
" this column would be for the label.\n",
" frac_train : float\n",
" frac_val : float\n",
" frac_test : float\n",
" The ratios with which the dataframe will be split into train, val, and\n",
" test data. The values should be expressed as float fractions and should\n",
" sum to 1.0.\n",
" random_state : int, None, or RandomStateInstance\n",
" Value to be passed to train_test_split().\n",
"\n",
" Returns\n",
" -------\n",
" df_train, df_val, df_test :\n",
" Dataframes containing the three splits.\n",
" \"\"\"\n",
"\n",
" if frac_train + frac_val + frac_test != 1.0:\n",
" raise ValueError(\n",
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
" % (frac_train, frac_val, frac_test)\n",
" )\n",
"\n",
" if stratify_colname not in df_input.columns:\n",
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
"\n",
" X = df_input # Contains all columns.\n",
" y = df_input[\n",
" [stratify_colname]\n",
" ] # Dataframe of just the column on which to stratify.\n",
"\n",
" # Split original dataframe into train and temp dataframes.\n",
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
" )\n",
"\n",
" # Split the temp dataframe into val and test dataframes.\n",
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
" df_val, df_test, y_val, y_test = train_test_split(\n",
" df_temp,\n",
" y_temp,\n",
" stratify=y_temp,\n",
" test_size=relative_frac_test,\n",
" random_state=random_state,\n",
" )\n",
"\n",
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"\n",
" return df_train, df_val, df_test"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Leatherinterior\n",
"1 13954\n",
"0 5283\n",
"Name: count, dtype: int64\n",
"Обучающая выборка: (11542, 3)\n",
"Leatherinterior\n",
"1 8372\n",
"0 3170\n",
"Name: count, dtype: int64\n",
"Контрольная выборка: (3847, 3)\n",
"Leatherinterior\n",
"1 2791\n",
"0 1056\n",
"Name: count, dtype: int64\n",
"Тестовая выборка: (3848, 3)\n",
"Leatherinterior\n",
"1 2791\n",
"0 1057\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"# Вывод распределения количества наблюдений по меткам (классам)\n",
"print(df.Leatherinterior.value_counts())\n",
"\n",
"data = df[[\"Leatherinterior\", \"Price\", \"Airbags\"]].copy()\n",
"\n",
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
" data,\n",
" stratify_colname=\"Leatherinterior\",\n",
" frac_train=0.60,\n",
" frac_val=0.20,\n",
" frac_test=0.20,\n",
")\n",
"\n",
"print(\"Обучающая выборка: \", df_train.shape)\n",
"print(df_train.Leatherinterior.value_counts())\n",
"\n",
"print(\"Контрольная выборка: \", df_val.shape)\n",
"print(df_val.Leatherinterior.value_counts())\n",
"\n",
"print(\"Тестовая выборка: \", df_test.shape)\n",
"print(df_test.Leatherinterior.value_counts())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выборка с избытком (oversampling)\n",
"\n",
"https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/\n",
"\n",
"https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/\n",
"\n",
"Выборка с недостатком (undersampling)\n",
"\n",
"https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/\n",
"\n",
"Библиотека imbalanced-learn\n",
"\n",
"https://imbalanced-learn.org/stable/"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Обучающая выборка: (11542, 3)\n",
"Leatherinterior\n",
"1 8372\n",
"0 3170\n",
"Name: count, dtype: int64\n",
"Обучающая выборка после oversampling: (16585, 3)\n",
"Leatherinterior\n",
"1 8372\n",
"0 8213\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Leatherinterior</th>\n",
" <th>Price</th>\n",
" <th>Airbags</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>12231</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>18817</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>15053</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>470</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>19914</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16580</th>\n",
" <td>0</td>\n",
" <td>13015</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16581</th>\n",
" <td>0</td>\n",
" <td>8799</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16582</th>\n",
" <td>0</td>\n",
" <td>2057</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16583</th>\n",
" <td>0</td>\n",
" <td>2000</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16584</th>\n",
" <td>0</td>\n",
" <td>1910</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>16585 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" Leatherinterior Price Airbags\n",
"0 1 12231 8\n",
"1 1 18817 10\n",
"2 1 15053 4\n",
"3 1 470 0\n",
"4 1 19914 6\n",
"... ... ... ...\n",
"16580 0 13015 4\n",
"16581 0 8799 2\n",
"16582 0 2057 2\n",
"16583 0 2000 2\n",
"16584 0 1910 1\n",
"\n",
"[16585 rows x 3 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from imblearn.over_sampling import ADASYN\n",
"\n",
"ada = ADASYN()\n",
"\n",
"print(\"Обучающая выборка: \", df_train.shape)\n",
"print(df_train.Leatherinterior.value_counts())\n",
"\n",
"X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"Leatherinterior\"]) # type: ignore\n",
"df_train_adasyn = pd.DataFrame(X_resampled)\n",
"\n",
"print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
"print(df_train_adasyn.Leatherinterior.value_counts())\n",
"\n",
"df_train_adasyn"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}