From ca83b3c591689681cfb1a883aee6214b8bc74ea3 Mon Sep 17 00:00:00 2001 From: Timourka Date: Sat, 19 Oct 2024 02:38:29 +0400 Subject: [PATCH] =?UTF-8?q?3=20=D0=BB=D0=B0=D0=B1=D0=BE=D1=80=D0=B0=D1=82?= =?UTF-8?q?=D0=BE=D1=80=D0=BD=D0=B0=D1=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lab_3/laba3.ipynb | 1122 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1122 insertions(+) create mode 100644 lab_3/laba3.ipynb diff --git a/lab_3/laba3.ipynb b/lab_3/laba3.ipynb new file mode 100644 index 0000000..541c713 --- /dev/null +++ b/lab_3/laba3.ipynb @@ -0,0 +1,1122 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['ID', 'Price', 'Levy', 'Manufacturer', 'Model', 'Prod. year',\n", + " 'Category', 'Leather interior', 'Fuel type', 'Engine volume', 'Mileage',\n", + " 'Cylinders', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color',\n", + " 'Airbags'],\n", + " dtype='object')" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "df = pd.read_csv(\"..//static//csv//car_price_prediction.csv\", sep=\",\")\n", + "df.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Определение бизнес-целей:\n", + "1. Прогнозирование цены автомобиля.\n", + "2. Оценка факторов, влияющих на цену автомобиля." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Определение целей технического проекта:\n", + "1. Построить модель машинного обучения для регрессии, которая будет прогнозировать стоимость автомобиля на основе предоставленных данных о его характеристиках.\n", + "2. Провести анализ данных для выявления ключевых факторов, влияющих на стоимость автомобиля." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "преобразуем пробег в число" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "df['Mileage'] = df['Mileage'].str.replace(r'\\D+', '', regex=True).astype(float)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "посмотрим выбросы и усредним их:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Колонка Price:\n", + " Есть выбросы: Да\n", + " Количество выбросов: 1073\n", + " Минимальное значение: 1.0\n", + " Максимальное значение: 47191.0\n", + " 1-й квартиль (Q1): 5331.0\n", + " 3-й квартиль (Q3): 22075.0\n", + "\n", + "Колонка Mileage:\n", + " Есть выбросы: Да\n", + " Количество выбросов: 640\n", + " Минимальное значение: 0.0\n", + " Максимальное значение: 367011.5\n", + " 1-й квартиль (Q1): 70139.0\n", + " 3-й квартиль (Q3): 188888.0\n", + "\n" + ] + } + ], + "source": [ + "numeric_columns = ['Price', 'Mileage']\n", + "for column in numeric_columns:\n", + " if pd.api.types.is_numeric_dtype(df[column]): # Проверяем, является ли колонка числовой\n", + " q1 = df[column].quantile(0.25) # Находим 1-й квартиль (Q1)\n", + " q3 = df[column].quantile(0.75) # Находим 3-й квартиль (Q3)\n", + " iqr = q3 - q1 # Вычисляем межквартильный размах (IQR)\n", + "\n", + " # Определяем границы для выбросов\n", + " lower_bound = q1 - 1.5 * iqr # Нижняя граница\n", + " upper_bound = q3 + 1.5 * iqr # Верхняя граница\n", + "\n", + " # Подсчитываем количество выбросов\n", + " outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]\n", + " outlier_count = outliers.shape[0]\n", + "\n", + " # Устраняем выбросы: заменяем значения ниже нижней границы на саму нижнюю границу, а выше верхней — на верхнюю\n", + " df[column] = df[column].apply(lambda x: lower_bound if x < lower_bound else upper_bound if x > 
upper_bound else x)\n", + "\n", + " print(f\"Колонка {column}:\")\n", + " print(f\" Есть выбросы: {'Да' if outlier_count > 0 else 'Нет'}\")\n", + " print(f\" Количество выбросов: {outlier_count}\")\n", + " print(f\" Минимальное значение: {df[column].min()}\")\n", + " print(f\" Максимальное значение: {df[column].max()}\")\n", + " print(f\" 1-й квартиль (Q1): {q1}\")\n", + " print(f\" 3-й квартиль (Q3): {q3}\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "превращаем тире во чтото\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\bingo\\AppData\\Local\\Temp\\ipykernel_13744\\3336777531.py:3: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df['Levy'].replace(\"-\", np.nan, inplace=True)\n", + "C:\\Users\\bingo\\AppData\\Local\\Temp\\ipykernel_13744\\3336777531.py:9: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. 
This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df['Levy'].fillna(df['Levy'].median(), inplace=True)\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "# Замена прочерков \"-\" в столбце Levy на NaN\n", + "df['Levy'].replace(\"-\", np.nan, inplace=True)\n", + "\n", + "# Преобразование столбца Levy в числовой формат (если он был строковым)\n", + "df['Levy'] = pd.to_numeric(df['Levy'], errors='coerce')\n", + "\n", + "# Заполнение пропусков в столбце Levy медианой\n", + "df['Levy'].fillna(df['Levy'].median(), inplace=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "код конструирования полей, все номинальные превращаем в числовые\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ID Price Levy Model Mileage Cylinders Airbags Age \\\n", + "0 45654403 13328.0 1399.0 1242 186005.0 6.0 12 14 \n", + "1 44731507 16621.0 1018.0 658 192000.0 6.0 8 13 \n", + "2 45774419 8467.0 781.0 684 200000.0 4.0 2 18 \n", + "3 45769185 3607.0 862.0 661 168966.0 4.0 0 13 \n", + "4 45809263 11726.0 446.0 684 91901.0 4.0 4 10 \n", + "\n", + " Leather Interior Manufacturer_ALFA ROMEO ... Color_Orange Color_Pink \\\n", + "0 1 0.0 ... 0.0 0.0 \n", + "1 0 0.0 ... 0.0 0.0 \n", + "2 0 0.0 ... 0.0 0.0 \n", + "3 1 0.0 ... 0.0 0.0 \n", + "4 1 0.0 ... 
0.0 0.0 \n", + "\n", + " Color_Purple Color_Red Color_Silver Color_Sky blue Color_White \\\n", + "0 0.0 0.0 1.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 0.0 1.0 \n", + "4 0.0 0.0 1.0 0.0 0.0 \n", + "\n", + " Color_Yellow Drive wheels_Front Drive wheels_Rear \n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 1.0 0.0 \n", + "3 0.0 0.0 0.0 \n", + "4 0.0 1.0 0.0 \n", + "\n", + "[5 rows x 218 columns]\n" + ] + } + ], + "source": [ + "from sklearn.preprocessing import OneHotEncoder, LabelEncoder\n", + "#Пример: Создание нового признака \"Age\" (возраст автомобиля)\n", + "df['Age'] = 2024 - df['Prod. year']\n", + "\n", + "df['Leather Interior'] = df['Leather interior'].apply(lambda x: 1 if x == 'Yes' else 0)\n", + "\n", + "# Удаление ненужного столбца 'Prod. year', так как он был использован для создания 'Age'\n", + "df.drop(columns=['Prod. year'], inplace=True)\n", + "df.drop(columns=['Leather interior'], inplace=True)\n", + "\n", + "# Определение категориальных признаков для преобразования\n", + "categorical_columns = ['Manufacturer', 'Engine volume', 'Doors', 'Wheel', 'Category', 'Fuel type', 'Gear box type', 'Color', 'Drive wheels']\n", + "\n", + "# Инициализация OneHotEncoder\n", + "encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n", + "\n", + "# Применение OneHotEncoder к выбранным категориальным признакам\n", + "encoded_values = encoder.fit_transform(df[categorical_columns])\n", + "\n", + "# Получение имен новых закодированных столбцов\n", + "encoded_columns = encoder.get_feature_names_out(categorical_columns)\n", + "\n", + "# Преобразование в DataFrame\n", + "encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n", + "\n", + "# Объединение закодированных значений с оригинальным DataFrame, исключив исходные категориальные столбцы\n", + "df = df.drop(columns=categorical_columns)\n", + "df = pd.concat([df.reset_index(drop=True), encoded_values_df.reset_index(drop=True)], 
axis=1)\n", + "\n", + "# Применение Label Encoding для столбца 'Model'\n", + "label_encoder = LabelEncoder()\n", + "df['Model'] = label_encoder.fit_transform(df['Model'])\n", + "\n", + "print(df.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Создадим выборки данных. разбивать будем относительно параметра цены, ведь это тот самый параметр по которому наша выборка разбивается на классы. И собственно его нам и надо будет предсказывать" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Размер обучающей выборки: (11542, 217)\n", + "Размер контрольной выборки: (3847, 217)\n", + "Размер тестовой выборки: (3848, 217)\n" + ] + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Выделение признаков (X) и целевой переменной (y)\n", + "X = df.drop(columns=['Price']) # Признаки\n", + "y = df['Price'] # Целевая переменная (цена автомобиля)\n", + "\n", + "# Разделение данных на обучающую и временную выборки\n", + "X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n", + "\n", + "# Разделение временной выборки на контрольную и тестовую выборки\n", + "X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n", + "\n", + "# Проверка размеров выборок\n", + "print(f\"Размер обучающей выборки: {X_train.shape}\")\n", + "print(f\"Размер контрольной выборки: {X_val.shape}\")\n", + "print(f\"Размер тестовой выборки: {X_test.shape}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "# Функция для оценки распределения цены\n", + "def plot_distribution(y_data, title):\n", + " plt.figure(figsize=(10, 6))\n", + " sns.histplot(y_data, kde=True, bins=50)\n", + " plt.title(title)\n", + " plt.xlabel('Price')\n", + " plt.ylabel('Frequency')\n", + " plt.grid(True)\n", + " plt.show()\n", + "\n", + "# Оценка распределения цены в каждой выборке\n", + "plot_distribution(y_train, \"Распределение цены в обучающей выборке\")\n", + "plot_distribution(y_val, \"Распределение цены в контрольной выборке\")\n", + "plot_distribution(y_test, \"Распределение цены в тестовой выборке\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Нормированные данные:\n", + " ID Levy Model Mileage Cylinders Airbags Age \\\n", + "0 0.993528 0.112841 0.781624 0.506810 0.333333 0.750 0.123457 \n", + "1 0.956715 0.080072 0.414097 0.523144 0.333333 0.500 0.111111 \n", + "2 0.998315 0.059689 0.430459 0.544942 0.200000 0.125 0.172840 \n", + "3 0.998107 0.066655 0.415985 0.460383 0.200000 0.000 0.111111 \n", + "4 0.999705 0.030876 0.430459 0.250404 0.200000 0.250 0.074074 \n", + "\n", + " Leather Interior Manufacturer_ALFA ROMEO Manufacturer_ASTON MARTIN ... \\\n", + "0 1.0 0.0 0.0 ... \n", + "1 0.0 0.0 0.0 ... \n", + "2 0.0 0.0 0.0 ... \n", + "3 1.0 0.0 0.0 ... \n", + "4 1.0 0.0 0.0 ... 
\n", + "\n", + " Color_Orange Color_Pink Color_Purple Color_Red Color_Silver \\\n", + "0 0.0 0.0 0.0 0.0 1.0 \n", + "1 0.0 0.0 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 0.0 1.0 \n", + "\n", + " Color_Sky blue Color_White Color_Yellow Drive wheels_Front \\\n", + "0 0.0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 1.0 \n", + "3 0.0 1.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 1.0 \n", + "\n", + " Drive wheels_Rear \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 \n", + "\n", + "[5 rows x 217 columns]\n", + "\n", + "Стандартизированные данные:\n", + " ID Levy Model Mileage Cylinders Airbags Age \\\n", + "0 0.083141 1.359450 0.923579 0.537156 1.180937 1.254005 0.161034 \n", + "1 -0.902262 0.382603 -0.498032 0.603282 1.180937 0.328091 -0.015379 \n", + "2 0.211286 -0.225042 -0.434741 0.691523 -0.485866 -1.060780 0.866685 \n", + "3 0.205697 -0.017366 -0.490729 0.349214 -0.485866 -1.523737 -0.015379 \n", + "4 0.248490 -1.083950 -0.434741 -0.500823 -0.485866 -0.597823 -0.544617 \n", + "\n", + " Leather Interior Manufacturer_ALFA ROMEO Manufacturer_ASTON MARTIN ... \\\n", + "0 0.615306 -0.014421 -0.00721 ... \n", + "1 -1.625208 -0.014421 -0.00721 ... \n", + "2 -1.625208 -0.014421 -0.00721 ... \n", + "3 0.615306 -0.014421 -0.00721 ... \n", + "4 0.615306 -0.014421 -0.00721 ... 
\n", + "\n", + " Color_Orange Color_Pink Color_Purple Color_Red Color_Silver \\\n", + "0 -0.115443 -0.036788 -0.045072 -0.185361 2.018180 \n", + "1 -0.115443 -0.036788 -0.045072 -0.185361 -0.495496 \n", + "2 -0.115443 -0.036788 -0.045072 -0.185361 -0.495496 \n", + "3 -0.115443 -0.036788 -0.045072 -0.185361 -0.495496 \n", + "4 -0.115443 -0.036788 -0.045072 -0.185361 2.018180 \n", + "\n", + " Color_Sky blue Color_White Color_Yellow Drive wheels_Front \\\n", + "0 -0.07989 -0.551707 -0.074436 -1.422413 \n", + "1 -0.07989 -0.551707 -0.074436 -1.422413 \n", + "2 -0.07989 -0.551707 -0.074436 0.703031 \n", + "3 -0.07989 1.812557 -0.074436 -1.422413 \n", + "4 -0.07989 -0.551707 -0.074436 0.703031 \n", + "\n", + " Drive wheels_Rear \n", + "0 -0.368962 \n", + "1 -0.368962 \n", + "2 -0.368962 \n", + "3 -0.368962 \n", + "4 -0.368962 \n", + "\n", + "[5 rows x 217 columns]\n" + ] + } + ], + "source": [ + "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n", + "\n", + "# Предполагаем, что вы уже выделили ваши признаки X\n", + "# Применение нормировки Min-Max к всем числовым признакам\n", + "min_max_scaler = MinMaxScaler()\n", + "X_normalized = pd.DataFrame(min_max_scaler.fit_transform(X), columns=X.columns)\n", + "\n", + "# Применение стандартизации к всем числовым признакам\n", + "standard_scaler = StandardScaler()\n", + "X_standardized = pd.DataFrame(standard_scaler.fit_transform(X), columns=X.columns)\n", + "\n", + "# Проверка первых 5 строк после нормировки\n", + "print(\"Нормированные данные:\")\n", + "print(X_normalized.head())\n", + "\n", + "# Проверка первых 5 строк после стандартизации\n", + "print(\"\\nСтандартизированные данные:\")\n", + "print(X_standardized.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## пример использование feature tools \n", + "попытаюсь вынести model в отдельную таблицу" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "Requirement already satisfied: featuretools in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (1.31.0)Note: you may need to restart the kernel to use updated packages.\n", + "\n", + "Requirement already satisfied: cloudpickle>=1.5.0 in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from featuretools) (3.1.0)\n", + "Requirement already satisfied: holidays>=0.17 in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from featuretools) (0.58)\n", + "Requirement already satisfied: numpy>=1.25.0 in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from featuretools) (2.1.1)\n", + "Requirement already satisfied: packaging>=20.0 in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from featuretools) (24.1)\n", + "Requirement already satisfied: pandas>=2.0.0 in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from featuretools) (2.2.2)\n", + "Requirement already satisfied: psutil>=5.7.0 in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from featuretools) (6.0.0)\n", + "Requirement already satisfied: scipy>=1.10.0 in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from featuretools) (1.14.1)\n", + "Requirement already satisfied: tqdm>=4.66.3 in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from featuretools) (4.66.5)\n", + "Requirement already satisfied: woodwork>=0.28.0 in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from featuretools) (0.31.0)\n", + "Requirement already satisfied: python-dateutil in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from holidays>=0.17->featuretools) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from pandas>=2.0.0->featuretools) (2024.1)\n", + "Requirement already satisfied: tzdata>=2022.7 in 
d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from pandas>=2.0.0->featuretools) (2024.1)\n", + "Requirement already satisfied: colorama in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from tqdm>=4.66.3->featuretools) (0.4.6)\n", + "Requirement already satisfied: scikit-learn>=1.1.0 in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from woodwork>=0.28.0->featuretools) (1.5.2)\n", + "Requirement already satisfied: importlib-resources>=5.10.0 in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from woodwork>=0.28.0->featuretools) (6.4.5)\n", + "Requirement already satisfied: six>=1.5 in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from python-dateutil->holidays>=0.17->featuretools) (1.16.0)\n", + "Requirement already satisfied: joblib>=1.2.0 in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from scikit-learn>=1.1.0->woodwork>=0.28.0->featuretools) (1.4.2)\n", + "Requirement already satisfied: threadpoolctl>=3.1.0 in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from scikit-learn>=1.1.0->woodwork>=0.28.0->featuretools) (3.5.0)\n" + ] + } + ], + "source": [ + "pip install --upgrade featuretools" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: setuptools in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (75.2.0)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install --upgrade setuptools" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\МИИ\\AIM-PIbd-31-Kouvshinoff-T-A\\laba\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index car_id not found in dataframe, creating new integer column\n", + 
" warnings.warn(\n", + "d:\\МИИ\\AIM-PIbd-31-Kouvshinoff-T-A\\laba\\Lib\\site-packages\\featuretools\\synthesis\\dfs.py:321: UnusedPrimitiveWarning: Some specified primitives were not used during DFS:\n", + " trans_primitives: ['hour', 'weekday']\n", + " agg_primitives: ['any', 'mode']\n", + "This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.\n", + " warnings.warn(warning_msg, UnusedPrimitiveWarning)\n", + "d:\\МИИ\\AIM-PIbd-31-Kouvshinoff-T-A\\laba\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n", + " ).agg(to_agg)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDPriceLevyMileageCylindersAirbagsAgeLeather InteriorManufacturer_ALFA ROMEOManufacturer_ASTON MARTIN...model_table.MEAN(car_features.Manufacturer_TOYOTA)model_table.MEAN(car_features.Manufacturer_UAZ)model_table.MEAN(car_features.Manufacturer_VAZ)model_table.MEAN(car_features.Manufacturer_VOLKSWAGEN)model_table.MEAN(car_features.Manufacturer_VOLVO)model_table.MEAN(car_features.Manufacturer_ZAZ)model_table.MEAN(car_features.Manufacturer_სხვა)model_table.MEAN(car_features.Mileage)model_table.MEAN(car_features.Price)model_table.MEAN(car_features.Wheel_Right-hand drive)
car_id
045654403133281399.01860056.0121410.00.0...0.00.00.00.00.00.00.0186005.013328.00.0
144731507166211018.01920006.081300.00.0...0.00.00.00.00.00.00.0192000.016621.00.0
2457744198467781.02000004.021800.00.0...0.00.00.00.00.00.00.0200000.08467.01.0
3457691853607862.01689664.001310.00.0...0.00.00.00.00.00.00.0168966.03607.00.0
44580926311726446.0919014.041010.00.0...0.00.00.00.00.00.00.091901.011726.00.0
..................................................................
19232457983558467781.03000004.052510.00.0...0.00.00.00.00.00.00.0300000.08467.00.0
192334577885615681831.01616004.081310.00.0...0.00.00.00.00.00.00.0161600.015681.00.0
192344580499726108836.01163654.041410.00.0...0.00.00.00.00.00.00.0116365.026108.00.0
192354579352653311288.0512584.041710.00.0...0.00.00.00.00.00.00.051258.05331.00.0
1923645813273470753.01869234.0121210.00.0...0.00.00.00.00.00.00.0186923.0470.00.0
\n", + "

19237 rows × 438 columns

\n", + "
" + ], + "text/plain": [ + " ID Price Levy Mileage Cylinders Airbags Age \\\n", + "car_id \n", + "0 45654403 13328 1399.0 186005 6.0 12 14 \n", + "1 44731507 16621 1018.0 192000 6.0 8 13 \n", + "2 45774419 8467 781.0 200000 4.0 2 18 \n", + "3 45769185 3607 862.0 168966 4.0 0 13 \n", + "4 45809263 11726 446.0 91901 4.0 4 10 \n", + "... ... ... ... ... ... ... ... \n", + "19232 45798355 8467 781.0 300000 4.0 5 25 \n", + "19233 45778856 15681 831.0 161600 4.0 8 13 \n", + "19234 45804997 26108 836.0 116365 4.0 4 14 \n", + "19235 45793526 5331 1288.0 51258 4.0 4 17 \n", + "19236 45813273 470 753.0 186923 4.0 12 12 \n", + "\n", + " Leather Interior Manufacturer_ALFA ROMEO Manufacturer_ASTON MARTIN \\\n", + "car_id \n", + "0 1 0.0 0.0 \n", + "1 0 0.0 0.0 \n", + "2 0 0.0 0.0 \n", + "3 1 0.0 0.0 \n", + "4 1 0.0 0.0 \n", + "... ... ... ... \n", + "19232 1 0.0 0.0 \n", + "19233 1 0.0 0.0 \n", + "19234 1 0.0 0.0 \n", + "19235 1 0.0 0.0 \n", + "19236 1 0.0 0.0 \n", + "\n", + " ... model_table.MEAN(car_features.Manufacturer_TOYOTA) \\\n", + "car_id ... \n", + "0 ... 0.0 \n", + "1 ... 0.0 \n", + "2 ... 0.0 \n", + "3 ... 0.0 \n", + "4 ... 0.0 \n", + "... ... ... \n", + "19232 ... 0.0 \n", + "19233 ... 0.0 \n", + "19234 ... 0.0 \n", + "19235 ... 0.0 \n", + "19236 ... 0.0 \n", + "\n", + " model_table.MEAN(car_features.Manufacturer_UAZ) \\\n", + "car_id \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 \n", + "... ... \n", + "19232 0.0 \n", + "19233 0.0 \n", + "19234 0.0 \n", + "19235 0.0 \n", + "19236 0.0 \n", + "\n", + " model_table.MEAN(car_features.Manufacturer_VAZ) \\\n", + "car_id \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 \n", + "... ... \n", + "19232 0.0 \n", + "19233 0.0 \n", + "19234 0.0 \n", + "19235 0.0 \n", + "19236 0.0 \n", + "\n", + " model_table.MEAN(car_features.Manufacturer_VOLKSWAGEN) \\\n", + "car_id \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 \n", + "... ... 
\n", + "19232 0.0 \n", + "19233 0.0 \n", + "19234 0.0 \n", + "19235 0.0 \n", + "19236 0.0 \n", + "\n", + " model_table.MEAN(car_features.Manufacturer_VOLVO) \\\n", + "car_id \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 \n", + "... ... \n", + "19232 0.0 \n", + "19233 0.0 \n", + "19234 0.0 \n", + "19235 0.0 \n", + "19236 0.0 \n", + "\n", + " model_table.MEAN(car_features.Manufacturer_ZAZ) \\\n", + "car_id \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 \n", + "... ... \n", + "19232 0.0 \n", + "19233 0.0 \n", + "19234 0.0 \n", + "19235 0.0 \n", + "19236 0.0 \n", + "\n", + " model_table.MEAN(car_features.Manufacturer_სხვა) \\\n", + "car_id \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 \n", + "... ... \n", + "19232 0.0 \n", + "19233 0.0 \n", + "19234 0.0 \n", + "19235 0.0 \n", + "19236 0.0 \n", + "\n", + " model_table.MEAN(car_features.Mileage) \\\n", + "car_id \n", + "0 186005.0 \n", + "1 192000.0 \n", + "2 200000.0 \n", + "3 168966.0 \n", + "4 91901.0 \n", + "... ... \n", + "19232 300000.0 \n", + "19233 161600.0 \n", + "19234 116365.0 \n", + "19235 51258.0 \n", + "19236 186923.0 \n", + "\n", + " model_table.MEAN(car_features.Price) \\\n", + "car_id \n", + "0 13328.0 \n", + "1 16621.0 \n", + "2 8467.0 \n", + "3 3607.0 \n", + "4 11726.0 \n", + "... ... \n", + "19232 8467.0 \n", + "19233 15681.0 \n", + "19234 26108.0 \n", + "19235 5331.0 \n", + "19236 470.0 \n", + "\n", + " model_table.MEAN(car_features.Wheel_Right-hand drive) \n", + "car_id \n", + "0 0.0 \n", + "1 0.0 \n", + "2 1.0 \n", + "3 0.0 \n", + "4 0.0 \n", + "... ... 
\n", + "19232 0.0 \n", + "19233 0.0 \n", + "19234 0.0 \n", + "19235 0.0 \n", + "19236 0.0 \n", + "\n", + "[19237 rows x 438 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import featuretools as ft\n", + "from woodwork.logical_types import Categorical, Integer\n", + "# Создание двух таблиц: одна с моделью, другая с остальными данными\n", + "model_df = df[['ID', 'Model']].drop_duplicates().reset_index(drop=True)\n", + "other_df = df.drop(columns=['Model'])\n", + "\n", + "# Создание уникального идентификатора для связи\n", + "model_df['model_id'] = model_df.index\n", + "other_df['model_id'] = other_df['ID'].map(model_df.set_index('ID')['model_id'])\n", + "\n", + "es = ft.EntitySet(id=\"orders\")\n", + "es = es.add_dataframe(\n", + " dataframe_name=\"model_table\",\n", + " dataframe=model_df,\n", + " index=\"model_id\", # Индекс для уникальной идентификации моделей\n", + " logical_types={\n", + " \"Model\": Categorical # Определяем логический тип для модели\n", + " },\n", + ")\n", + "es = es.add_dataframe(\n", + " dataframe_name=\"car_features\",\n", + " dataframe=other_df,\n", + " index=\"car_id\", # Индекс для уникальной идентификации автомобилей\n", + " logical_types={\n", + " \"Price\": Integer, # Целевая переменная (цена)\n", + " \"Mileage\": Integer, # Пробег (числовой признак)\n", + " \"model_id\": Integer, # Пробег (числовой признак)\n", + " },\n", + ")\n", + "es = es.add_relationship(\"model_table\", \"model_id\", \"car_features\", \"model_id\")\n", + "\n", + "feature_matrix, feature_defs = ft.dfs(\n", + " entityset=es,\n", + " target_dataframe_name=\"car_features\",\n", + " agg_primitives=[\"mean\", \"count\", \"mode\", \"any\"],\n", + " trans_primitives=[\"hour\", \"weekday\"],\n", + " max_depth=2,\n", + ")\n", + "\n", + "feature_matrix" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "laba", + "language": "python", + "name": "python3" + }, + "language_info": { + 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}