diff --git a/lab_3/lab3.ipynb b/lab_3/lab3.ipynb new file mode 100644 index 0000000..02e514a --- /dev/null +++ b/lab_3/lab3.ipynb @@ -0,0 +1,570 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "df = pd.read_csv(\"C://Users//annal//aim//static//csv//Forbes_Billionaires.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Определим бизнес цели:\n", + "## 1- Прогнозирование места в рейтинге\n", + "## 2- Оценка факторов, влияющих на место в рейтинге" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Определим цели технического проекта:\n", + "## Построить модель, которая будет прогнозировать место в рейтинге на основе представленных данных об участнике\n", + "## Провести анализ данных для выявления важнейших характеристик для прогнозирования" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Проверим выбросы и усредним" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Колонка Networth:\n", + " Есть выбросы: Да\n", + " Количество выбросов: 226\n", + " Минимальное значение: 1.0\n", + " Максимальное значение: 9.0\n", + " 1-й квантиль (Q1): 1.5\n", + " 3-й квантиль (Q3): 4.5\n", + "\n", + "Колонка Age:\n", + " Есть выбросы: Да\n", + " Количество выбросов: 6\n", + " Минимальное значение: 26.5\n", + " Максимальное значение: 100.0\n", + " 1-й квантиль (Q1): 55.0\n", + " 3-й квантиль (Q3): 74.0\n", + "\n" + ] + } + ], + "source": [ + "numeric_columns = ['Networth', 'Age']\n", + "for column in numeric_columns:\n", + " if pd.api.types.is_numeric_dtype(df[column]): # Проверяем, является ли колонка числовой\n", + " q1 = df[column].quantile(0.25) # Находим 1-й квантиль (Q1)\n", + " q3 = df[column].quantile(0.75) # Находим 3-й квантиль (Q3)\n", + " iqr = q3 - q1 # Вычисляем межквантильный размах (IQR)\n", + "\n", + " # Определяем границы для выбросов\n", + " lower_bound = q1 - 1.5 * iqr # Нижняя граница\n", + " upper_bound = q3 + 1.5 * iqr # Верхняя граница\n", + "\n", + " # Подсчитываем количество выбросов\n", + " outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]\n", + " outlier_count = outliers.shape[0]\n", + "\n", + " # Устраняем выбросы: заменяем значения ниже нижней границы на саму нижнюю границу, а выше верхней — на верхнюю\n", + " df[column] = df[column].apply(lambda x: lower_bound if x < lower_bound else upper_bound if x > upper_bound else x)\n", + "\n", + " print(f\"Колонка {column}:\")\n", + " print(f\" Есть выбросы: {'Да' if outlier_count > 0 else 'Нет'}\")\n", + " print(f\" Количество выбросов: {outlier_count}\")\n", + " print(f\" Минимальное значение: {df[column].min()}\")\n", + " print(f\" Максимальное значение: {df[column].max()}\")\n", + " print(f\" 1-й квантиль (Q1): {q1}\")\n", + " print(f\" 3-й квантиль (Q3): {q3}\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Превратим номинальные столбцы в числовые" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Rank Networth Age Country Source Industry \\\n", + "0 1 9.0 50.0 70 123 0 \n", + "1 2 9.0 58.0 70 5 15 \n", + "2 3 9.0 73.0 20 73 3 \n", + "3 4 9.0 66.0 70 81 15 \n", + "4 5 9.0 91.0 70 11 4 \n", + "\n", + " Name_Abdulla Al Futtaim & family \\\n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 \n", + "\n", + " Name_Abdulla bin Ahmad Al Ghurair & family Name_Abdulsamad Rabiu \\\n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "\n", + " Name_Abhay Firodia ... Name_Zhu Yan & family Name_Zhu Yiming \\\n", + "0 0.0 ... 0.0 0.0 \n", + "1 0.0 ... 0.0 0.0 \n", + "2 0.0 ... 0.0 0.0 \n", + "3 0.0 ... 0.0 0.0 \n", + "4 0.0 ... 0.0 0.0 \n", + "\n", + " Name_Zhu Yiwen & family Name_Zhuo Jun Name_Ziv Aviram \\\n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 \n", + "\n", + " Name_Zong Qinghou Name_Zong Yanmin Name_Zugen Ni Name_Zuowen Song \\\n", + "0 0.0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 0.0 \n", + "\n", + " Name_Zygmunt Solorz-Zak \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 \n", + "\n", + "[5 rows x 2603 columns]\n" + ] + } + ], + "source": [ + "from sklearn.preprocessing import OneHotEncoder, LabelEncoder\n", + "\n", + "# Определение категориальных признаков для преобразования\n", + "categorical_columns = ['Name']\n", + "\n", + "# Инициализация OneHotEncoder\n", + "encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n", + "\n", + "# Применение OneHotEncoder к выбранным категориальным признакам\n", + "encoded_values = encoder.fit_transform(df[categorical_columns])\n", + "\n", + "# Получение имен новых закодированных столбцов\n", + "encoded_columns = encoder.get_feature_names_out(categorical_columns)\n", + "\n", + "# Преобразование в DataFrame\n", + "encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n", + "\n", + "# Объединение закодированных значений с оригинальным DataFrame, исключив исходные категориальные столбцы\n", + "df = df.drop(columns=categorical_columns)\n", + "df = pd.concat([df.reset_index(drop=True), encoded_values_df.reset_index(drop=True)], axis=1)\n", + "\n", + "# Применение Label Encoding для столбца 'Country', 'Source', 'Industry'\n", + "label_encoder = LabelEncoder()\n", + "df['Country'] = label_encoder.fit_transform(df['Country'])\n", + "df['Source'] = label_encoder.fit_transform(df['Source'])\n", + "df['Industry'] = label_encoder.fit_transform(df['Industry'])\n", + "\n", + "\n", + "print(df.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Создадим выборки данных по параметру места в рейтинге" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Размер обучающей выборки: (1560, 2602)\n", + "Размер контрольной выборки: (520, 2602)\n", + "Размер тестовой выборки: (520, 2602)\n" + ] + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Выделение признаков (X) и целевой переменной (y)\n", + "X = df.drop(columns=['Rank ']) # Признаки\n", + "y = df['Rank '] # Целевая переменная\n", + "\n", + "# Разделение данных на обучающую и временную выборки\n", + "X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n", + "\n", + "# Разделение временной выборки на контрольную и тестовую выборки\n", + "X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n", + "\n", + "# Проверка размеров выборок\n", + "print(f\"Размер обучающей выборки: {X_train.shape}\")\n", + "print(f\"Размер контрольной выборки: {X_val.shape}\")\n", + "print(f\"Размер тестовой выборки: {X_test.shape}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "# Функция для оценки распределения цены\n", + "def plot_distribution(y_data, title):\n", + " plt.figure(figsize=(10, 6))\n", + " sns.histplot(y_data, kde=True, bins=50)\n", + " plt.title(title)\n", + " plt.xlabel('Rank ')\n", + " plt.ylabel('Frequency')\n", + " plt.grid(True)\n", + " plt.show()\n", + "\n", + "# Оценка распределения цены в каждой выборке\n", + "plot_distribution(y_train, \"Распределение места в обучающей выборке\")\n", + "plot_distribution(y_val, \"Распределение места в контрольной выборке\")\n", + "plot_distribution(y_test, \"Распределение места в тестовой выборке\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Применим min-max нормировку для улучшения качества работы модели" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Нормированные данные:\n", + " Networth Age Country Source Industry \\\n", + "0 1.0 0.319728 0.945946 0.137584 0.000000 \n", + "1 1.0 0.428571 0.945946 0.005593 0.882353 \n", + "2 1.0 0.632653 0.270270 0.081655 0.176471 \n", + "3 1.0 0.537415 0.945946 0.090604 0.882353 \n", + "4 1.0 0.877551 0.945946 0.012304 0.235294 \n", + "\n", + " Name_Abdulla Al Futtaim & family \\\n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 \n", + "\n", + " Name_Abdulla bin Ahmad Al Ghurair & family Name_Abdulsamad Rabiu \\\n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "\n", + " Name_Abhay Firodia Name_Abigail Johnson ... Name_Zhu Yan & family \\\n", + "0 0.0 0.0 ... 0.0 \n", + "1 0.0 0.0 ... 0.0 \n", + "2 0.0 0.0 ... 0.0 \n", + "3 0.0 0.0 ... 0.0 \n", + "4 0.0 0.0 ... 0.0 \n", + "\n", + " Name_Zhu Yiming Name_Zhu Yiwen & family Name_Zhuo Jun \\\n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 \n", + "\n", + " Name_Ziv Aviram Name_Zong Qinghou Name_Zong Yanmin Name_Zugen Ni \\\n", + "0 0.0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 0.0 \n", + "\n", + " Name_Zuowen Song Name_Zygmunt Solorz-Zak \n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "\n", + "[5 rows x 2602 columns]\n", + "\n", + "Стандартизированные данные:\n", + " Networth Age Country Source Industry \\\n", + "0 2.266803 -1.081352 1.173910 -1.505003 -1.701719 \n", + "1 2.266803 -0.475422 1.173910 -2.004526 1.339990 \n", + "2 2.266803 0.660697 -0.805574 -1.716665 -1.093377 \n", + "3 2.266803 0.130508 1.173910 -1.682800 1.339990 \n", + "4 2.266803 2.024040 1.173910 -1.979126 -0.890597 \n", + "\n", + " Name_Abdulla Al Futtaim & family \\\n", + "0 -0.019615 \n", + "1 -0.019615 \n", + "2 -0.019615 \n", + "3 -0.019615 \n", + "4 -0.019615 \n", + "\n", + " Name_Abdulla bin Ahmad Al Ghurair & family Name_Abdulsamad Rabiu \\\n", + "0 -0.019615 -0.019615 \n", + "1 -0.019615 -0.019615 \n", + "2 -0.019615 -0.019615 \n", + "3 -0.019615 -0.019615 \n", + "4 -0.019615 -0.019615 \n", + "\n", + " Name_Abhay Firodia Name_Abigail Johnson ... Name_Zhu Yan & family \\\n", + "0 -0.019615 -0.019615 ... -0.019615 \n", + "1 -0.019615 -0.019615 ... -0.019615 \n", + "2 -0.019615 -0.019615 ... -0.019615 \n", + "3 -0.019615 -0.019615 ... -0.019615 \n", + "4 -0.019615 -0.019615 ... -0.019615 \n", + "\n", + " Name_Zhu Yiming Name_Zhu Yiwen & family Name_Zhuo Jun \\\n", + "0 -0.019615 -0.019615 -0.019615 \n", + "1 -0.019615 -0.019615 -0.019615 \n", + "2 -0.019615 -0.019615 -0.019615 \n", + "3 -0.019615 -0.019615 -0.019615 \n", + "4 -0.019615 -0.019615 -0.019615 \n", + "\n", + " Name_Ziv Aviram Name_Zong Qinghou Name_Zong Yanmin Name_Zugen Ni \\\n", + "0 -0.019615 -0.019615 -0.019615 -0.019615 \n", + "1 -0.019615 -0.019615 -0.019615 -0.019615 \n", + "2 -0.019615 -0.019615 -0.019615 -0.019615 \n", + "3 -0.019615 -0.019615 -0.019615 -0.019615 \n", + "4 -0.019615 -0.019615 -0.019615 -0.019615 \n", + "\n", + " Name_Zuowen Song Name_Zygmunt Solorz-Zak \n", + "0 -0.019615 -0.019615 \n", + "1 -0.019615 -0.019615 \n", + "2 -0.019615 -0.019615 \n", + "3 -0.019615 -0.019615 \n", + "4 -0.019615 -0.019615 \n", + "\n", + "[5 rows x 2602 columns]\n" + ] + } + ], + "source": [ + "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n", + "\n", + "# Предполагаем, что вы уже выделили ваши признаки X\n", + "# Применение нормировки Min-Max к всем числовым признакам\n", + "min_max_scaler = MinMaxScaler()\n", + "X_normalized = pd.DataFrame(min_max_scaler.fit_transform(X), columns=X.columns)\n", + "\n", + "# Применение стандартизации к всем числовым признакам\n", + "standard_scaler = StandardScaler()\n", + "X_standardized = pd.DataFrame(standard_scaler.fit_transform(X), columns=X.columns)\n", + "\n", + "# Проверка первых 5 строк после нормировки\n", + "print(\"Нормированные данные:\")\n", + "print(X_normalized.head())\n", + "\n", + "# Проверка первых 5 строк после стандартизации\n", + "print(\"\\nСтандартизированные данные:\")\n", + "print(X_standardized.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Приведём пример использования future tools" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting featuretoolsNote: you may need to restart the kernel to use updated packages.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "[notice] A new release of pip is available: 24.2 -> 24.3.1\n", + "[notice] To update, run: python.exe -m pip install --upgrade pip\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Downloading featuretools-1.31.0-py3-none-any.whl.metadata (15 kB)\n", + "Collecting cloudpickle>=1.5.0 (from featuretools)\n", + " Downloading cloudpickle-3.1.0-py3-none-any.whl.metadata (7.0 kB)\n", + "Collecting holidays>=0.17 (from featuretools)\n", + " Downloading holidays-0.59-py3-none-any.whl.metadata (25 kB)\n", + "Requirement already satisfied: numpy>=1.25.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from featuretools) (2.1.1)\n", + "Requirement already satisfied: packaging>=20.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from featuretools) (24.1)\n", + "Requirement already satisfied: pandas>=2.0.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from featuretools) (2.2.2)\n", + "Requirement already satisfied: psutil>=5.7.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from featuretools) (6.0.0)\n", + "Requirement already satisfied: scipy>=1.10.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from featuretools) (1.14.1)\n", + "Collecting tqdm>=4.66.3 (from featuretools)\n", + " Downloading tqdm-4.66.6-py3-none-any.whl.metadata (57 kB)\n", + "Collecting woodwork>=0.28.0 (from featuretools)\n", + " Downloading woodwork-0.31.0-py3-none-any.whl.metadata (10 kB)\n", + "Requirement already satisfied: python-dateutil in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from holidays>=0.17->featuretools) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from pandas>=2.0.0->featuretools) (2024.1)\n", + "Requirement already satisfied: tzdata>=2022.7 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from pandas>=2.0.0->featuretools) (2024.1)\n", + "Requirement already satisfied: colorama in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from tqdm>=4.66.3->featuretools) (0.4.6)\n", + "Requirement already satisfied: scikit-learn>=1.1.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from woodwork>=0.28.0->featuretools) (1.5.2)\n", + "Collecting importlib-resources>=5.10.0 (from woodwork>=0.28.0->featuretools)\n", + " Downloading importlib_resources-6.4.5-py3-none-any.whl.metadata (4.0 kB)\n", + "Requirement already satisfied: six>=1.5 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from python-dateutil->holidays>=0.17->featuretools) (1.16.0)\n", + "Requirement already satisfied: joblib>=1.2.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from scikit-learn>=1.1.0->woodwork>=0.28.0->featuretools) (1.4.2)\n", + "Requirement already satisfied: threadpoolctl>=3.1.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from scikit-learn>=1.1.0->woodwork>=0.28.0->featuretools) (3.5.0)\n", + "Downloading featuretools-1.31.0-py3-none-any.whl (587 kB)\n", + " ---------------------------------------- 0.0/587.9 kB ? eta -:--:--\n", + " ----------------- ---------------------- 262.1/587.9 kB ? eta -:--:--\n", + " ---------------------------------------- 587.9/587.9 kB 1.5 MB/s eta 0:00:00\n", + "Downloading cloudpickle-3.1.0-py3-none-any.whl (22 kB)\n", + "Downloading holidays-0.59-py3-none-any.whl (1.1 MB)\n", + " ---------------------------------------- 0.0/1.1 MB ? eta -:--:--\n", + " --------- ------------------------------ 0.3/1.1 MB ? eta -:--:--\n", + " ---------------------------- ----------- 0.8/1.1 MB 1.9 MB/s eta 0:00:01\n", + " ---------------------------------------- 1.1/1.1 MB 2.2 MB/s eta 0:00:00\n", + "Downloading tqdm-4.66.6-py3-none-any.whl (78 kB)\n", + "Downloading woodwork-0.31.0-py3-none-any.whl (215 kB)\n", + "Downloading importlib_resources-6.4.5-py3-none-any.whl (36 kB)\n", + "Installing collected packages: tqdm, importlib-resources, cloudpickle, holidays, woodwork, featuretools\n", + "Successfully installed cloudpickle-3.1.0 featuretools-1.31.0 holidays-0.59 importlib-resources-6.4.5 tqdm-4.66.6 woodwork-0.31.0\n" + ] + } + ], + "source": [ + "pip install --upgrade featuretools" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting setuptools\n", + " Downloading setuptools-75.3.0-py3-none-any.whl.metadata (6.9 kB)\n", + "Downloading setuptools-75.3.0-py3-none-any.whl (1.3 MB)\n", + " ---------------------------------------- 0.0/1.3 MB ? eta -:--:--\n", + " ---------------- ----------------------- 0.5/1.3 MB 3.4 MB/s eta 0:00:01\n", + " ---------------------------------------- 1.3/1.3 MB 3.7 MB/s eta 0:00:00\n", + "Installing collected packages: setuptools\n", + "Successfully installed setuptools-75.3.0\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "[notice] A new release of pip is available: 24.2 -> 24.3.1\n", + "[notice] To update, run: python.exe -m pip install --upgrade pip\n" + ] + } + ], + "source": [ + "pip install --upgrade setuptools" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}