571 lines
147 KiB
Plaintext
571 lines
147 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 1,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"df = pd.read_csv(\"C://Users//annal//aim//static//csv//Forbes_Billionaires.csv\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Определим бизнес-цели:\n",
|
|||
|
"## 1- Прогнозирование места в рейтинге\n",
|
|||
|
"## 2- Оценка факторов, влияющих на место в рейтинге"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Определим цели технического проекта:\n",
|
|||
|
"## Построить модель, которая будет прогнозировать место в рейтинге на основе представленных данных об участнике\n",
|
|||
|
"## Провести анализ данных для выявления важнейших характеристик для прогнозирования"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Проверим выбросы и заменим их граничными значениями (Q1 − 1.5·IQR и Q3 + 1.5·IQR)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 2,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Колонка Networth:\n",
|
|||
|
" Есть выбросы: Да\n",
|
|||
|
" Количество выбросов: 226\n",
|
|||
|
" Минимальное значение: 1.0\n",
|
|||
|
" Максимальное значение: 9.0\n",
|
|||
|
" 1-й квантиль (Q1): 1.5\n",
|
|||
|
" 3-й квантиль (Q3): 4.5\n",
|
|||
|
"\n",
|
|||
|
"Колонка Age:\n",
|
|||
|
" Есть выбросы: Да\n",
|
|||
|
" Количество выбросов: 6\n",
|
|||
|
" Минимальное значение: 26.5\n",
|
|||
|
" Максимальное значение: 100.0\n",
|
|||
|
" 1-й квантиль (Q1): 55.0\n",
|
|||
|
" 3-й квантиль (Q3): 74.0\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"numeric_columns = ['Networth', 'Age']\n",
|
|||
|
"for column in numeric_columns:\n",
|
|||
|
" if pd.api.types.is_numeric_dtype(df[column]): # Проверяем, является ли колонка числовой\n",
|
|||
|
" q1 = df[column].quantile(0.25) # Находим 1-й квантиль (Q1)\n",
|
|||
|
" q3 = df[column].quantile(0.75) # Находим 3-й квантиль (Q3)\n",
|
|||
|
" iqr = q3 - q1 # Вычисляем межквантильный размах (IQR)\n",
|
|||
|
"\n",
|
|||
|
" # Определяем границы для выбросов\n",
|
|||
|
" lower_bound = q1 - 1.5 * iqr # Нижняя граница\n",
|
|||
|
" upper_bound = q3 + 1.5 * iqr # Верхняя граница\n",
|
|||
|
"\n",
|
|||
|
" # Подсчитываем количество выбросов\n",
|
|||
|
" outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]\n",
|
|||
|
" outlier_count = outliers.shape[0]\n",
|
|||
|
"\n",
|
|||
|
" # Устраняем выбросы: заменяем значения ниже нижней границы на саму нижнюю границу, а выше верхней — на верхнюю\n",
|
|||
|
" df[column] = df[column].apply(lambda x: lower_bound if x < lower_bound else upper_bound if x > upper_bound else x)\n",
|
|||
|
"\n",
|
|||
|
" print(f\"Колонка {column}:\")\n",
|
|||
|
" print(f\" Есть выбросы: {'Да' if outlier_count > 0 else 'Нет'}\")\n",
|
|||
|
" print(f\" Количество выбросов: {outlier_count}\")\n",
|
|||
|
" print(f\" Минимальное значение: {df[column].min()}\")\n",
|
|||
|
" print(f\" Максимальное значение: {df[column].max()}\")\n",
|
|||
|
" print(f\" 1-й квантиль (Q1): {q1}\")\n",
|
|||
|
" print(f\" 3-й квантиль (Q3): {q3}\\n\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Превратим номинальные столбцы в числовые"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 3,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" Rank Networth Age Country Source Industry \\\n",
|
|||
|
"0 1 9.0 50.0 70 123 0 \n",
|
|||
|
"1 2 9.0 58.0 70 5 15 \n",
|
|||
|
"2 3 9.0 73.0 20 73 3 \n",
|
|||
|
"3 4 9.0 66.0 70 81 15 \n",
|
|||
|
"4 5 9.0 91.0 70 11 4 \n",
|
|||
|
"\n",
|
|||
|
" Name_Abdulla Al Futtaim & family \\\n",
|
|||
|
"0 0.0 \n",
|
|||
|
"1 0.0 \n",
|
|||
|
"2 0.0 \n",
|
|||
|
"3 0.0 \n",
|
|||
|
"4 0.0 \n",
|
|||
|
"\n",
|
|||
|
" Name_Abdulla bin Ahmad Al Ghurair & family Name_Abdulsamad Rabiu \\\n",
|
|||
|
"0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" Name_Abhay Firodia ... Name_Zhu Yan & family Name_Zhu Yiming \\\n",
|
|||
|
"0 0.0 ... 0.0 0.0 \n",
|
|||
|
"1 0.0 ... 0.0 0.0 \n",
|
|||
|
"2 0.0 ... 0.0 0.0 \n",
|
|||
|
"3 0.0 ... 0.0 0.0 \n",
|
|||
|
"4 0.0 ... 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" Name_Zhu Yiwen & family Name_Zhuo Jun Name_Ziv Aviram \\\n",
|
|||
|
"0 0.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" Name_Zong Qinghou Name_Zong Yanmin Name_Zugen Ni Name_Zuowen Song \\\n",
|
|||
|
"0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" Name_Zygmunt Solorz-Zak \n",
|
|||
|
"0 0.0 \n",
|
|||
|
"1 0.0 \n",
|
|||
|
"2 0.0 \n",
|
|||
|
"3 0.0 \n",
|
|||
|
"4 0.0 \n",
|
|||
|
"\n",
|
|||
|
"[5 rows x 2603 columns]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.preprocessing import OneHotEncoder, LabelEncoder\n",
|
|||
|
"\n",
|
|||
|
"# Определение категориальных признаков для преобразования\n",
|
|||
|
"categorical_columns = ['Name']\n",
|
|||
|
"\n",
|
|||
|
"# Инициализация OneHotEncoder\n",
|
|||
|
"encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
|
|||
|
"\n",
|
|||
|
"# Применение OneHotEncoder к выбранным категориальным признакам\n",
|
|||
|
"encoded_values = encoder.fit_transform(df[categorical_columns])\n",
|
|||
|
"\n",
|
|||
|
"# Получение имен новых закодированных столбцов\n",
|
|||
|
"encoded_columns = encoder.get_feature_names_out(categorical_columns)\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование в DataFrame\n",
|
|||
|
"encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n",
|
|||
|
"\n",
|
|||
|
"# Объединение закодированных значений с оригинальным DataFrame, исключив исходные категориальные столбцы\n",
|
|||
|
"df = df.drop(columns=categorical_columns)\n",
|
|||
|
"df = pd.concat([df.reset_index(drop=True), encoded_values_df.reset_index(drop=True)], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"# Применение Label Encoding для столбцов 'Country', 'Source', 'Industry'\n",
|
|||
|
"label_encoder = LabelEncoder()\n",
|
|||
|
"df['Country'] = label_encoder.fit_transform(df['Country'])\n",
|
|||
|
"df['Source'] = label_encoder.fit_transform(df['Source'])\n",
|
|||
|
"df['Industry'] = label_encoder.fit_transform(df['Industry'])\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"print(df.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Разделим данные на обучающую, контрольную и тестовую выборки (целевая переменная — место в рейтинге)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 4,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: (1560, 2602)\n",
|
|||
|
"Размер контрольной выборки: (520, 2602)\n",
|
|||
|
"Размер тестовой выборки: (520, 2602)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"# Выделение признаков (X) и целевой переменной (y)\n",
|
|||
|
"X = df.drop(columns=['Rank ']) # Признаки\n",
|
|||
|
"y = df['Rank '] # Целевая переменная\n",
|
|||
|
"\n",
|
|||
|
"# Разделение данных на обучающую и временную выборки\n",
|
|||
|
"X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение временной выборки на контрольную и тестовую выборки\n",
|
|||
|
"X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Проверка размеров выборок\n",
|
|||
|
"print(f\"Размер обучающей выборки: {X_train.shape}\")\n",
|
|||
|
"print(f\"Размер контрольной выборки: {X_val.shape}\")\n",
|
|||
|
"print(f\"Размер тестовой выборки: {X_test.shape}\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 5,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0kAAAIjCAYAAADWYVDIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB2gklEQVR4nO3dd3wUdf7H8ffuZtMbENKA0HsVFEUQUKoVxS4qop7e2bty6oHlTpETsWL5KaCCBXs5UUTEAgJSRUIvoSSBBdLbZvf7+yNk3SUBkpBkk/B6PswDd3a+M5+ZfHez752Z71iMMUYAAAAAAEmS1d8FAAAAAEBdQkgCAAAAAC+EJAAAAADwQkgCAAAAAC+EJAAAAADwQkgCAAAAAC+EJAAAAADwQkgCAAAAAC+EJAAAaklubq527typgwcP+rsUVLPs7Gxt375dubm5/i4FQDUgJAEAUIPmzJmjIUOGKCIiQuHh4UpKStIzzzzj77LqhZycHE2dOtXzOCMjQy+//LL/CvJijNHrr7+u0047TaGhoYqMjFTr1q317rvv+rs0ANXAYowx/i4CQM2ZMWOGxo0b53kcFBSkpKQkDR8+XI8++qji4uL8WB3QsD300EOaNGmSRo0apSuuuEIxMTGyWCzq0KGDWrRo4e/y6jyXy6WoqCi99tprGjhwoJ599lmtX79ec+fO9XdpuvLKK/XBBx9o7NixOu+88xQVFSWLxaIePXqoadOm/i4PwHEK8HcBAGrH448/rtatW6ugoEC//PKLpk2bpv/9739au3atQkND/V0e0OAsXLhQkyZN0lNPPaWHHnrI3+XUSzabTY899piuvfZaud1uRUZG6uuvv/Z3WXr77bf1wQcf6N1339VVV13l73IA1ACOJAENXOmRpGXLlunkk0/2TL/33ns1ZcoUzZ49W1deeaUfKwQapvPPP18HDhzQr7/+6u9S6r1du3Zp586d6ty5s6Kjo/1djrp3764ePXpo1qxZ/i4FQA3hmiTgBHXWWWdJkrZt2yZJOnDggO677z51795d4eHhioyM1Nlnn63Vq1eXaVtQUKCJEyeqQ4cOCg4OVkJCgkaPHq0tW7ZIkrZv3y6LxXLEn8GDB3uW9eOPP8piseiDDz7QP//5T8XHxyssLEwXXHCBdu7cWWbdS5Ys0ciRIxUVFaXQ0FANGjToiB9CBw8eXO76J06cWGbed999V3369FFISIgaN26sK664otz1H23bvLndbk2dOlVdu3ZVcHCw4uLidPPNN5e5YL9Vq1Y677zzyqzntttuK7PM8mqfPHlymX0qSYWFhZowYYLatWunoKAgtWjRQg888IAKCwvL3VfeSvfbhRdeWOa5m2++WRaLRd26davS9krSN998o0GDBikiIkKRkZE65ZRTNHv2bJ91H+2n1PTp03XWWWcpNjZWQUFB6tKli6ZNm3bM7ZOk6667zmeZjRo10uDBg/Xzzz9XqP0PP/ygM844Q2FhYYqOjtaoUaOUnJzsM89vv/2mbt266YorrlDjxo0VEhKiU045RZ999plnnpycHIWFhenOO+8ss45du3bJZrPpqaee8tTcqlWrMvMd3i927NihW265RR07dlRISIiaNGmiSy+9VNu3b/dpV/ra+/HHHz3Tli1bpmHDhikiIkJhYWHl7pMZM2bIYrHo999/90xzOBzl9s/zzjuv3Jor8jqeOHGi5/fdvHlz9evXTwEBAYqPjy9Td3lK25f+REREqG/fvj77Xyrpc4f3Z2+lr/kZM2ZIKhl8Y+3atWrRooXOPfdcRUZGHnFfSdLWrVt16aWXqnHjxgoNDdVpp51W5mhYZd4HBw8eXOb1/u9//1tWq9XzOipVmfdLAL443Q44QZUGmiZNmkgq+UP+2Wef6dJLL1Xr1q2Vnp6u1157TYMGDdK6deuUmJgoqeQagfPOO0/z58/XFVdcoTvvvFPZ2dmaN2+e1q5dq7Zt23rWceWVV+qcc87xWe
/48ePLreff//63LBaLHnzwQe3du1dTp07V0KFDtWrVKoWEhEgq+WB69tlnq0+fPpowYYKsVqvng/LPP/+svn37lllu8+bNPR8yc3Jy9I9//KPcdT/66KO67LLLdOONN2rfvn168cUXNXDgQK1cubLcb65vuukmnXHGGZKkTz75RJ9++qnP8zfffLPnKN4dd9yhbdu26aWXXtLKlSv166+/ym63l7sfKiMjI8Ozbd7cbrcuuOAC/fLLL7rpppvUuXNn/fHHH3ruuee0cePGMh8SyxMcHKyvv/5ae/fuVWxsrCQpPz9fH3zwgYKDg8vMX9HtnTFjhq6//np17dpV48ePV3R0tFauXKm5c+fqqquu0sMPP6wbb7xRUskH77vvvttnX3ubNm2aunbtqgsuuEABAQH68ssvdcstt8jtduvWW2895jbGxMToueeek1QSSJ5//nmdc8452rlz51GPVnz//fc6++yz1aZNG02cOFH5+fl68cUX1b9/f61YscITCvbv36/XX39d4eHhuuOOO9S0aVO9++67Gj16tGbNmqUrr7xS4eHhuuiii/TBBx9oypQpstlsnvW89957MsZozJgxx9wWb8uWLdOiRYt0xRVXqHnz5tq+fbumTZumwYMHa926dUc8vXbz5s0aPHiwQkNDdf/99ys0NFRvvPGGhg4dqnnz5mngwIGVquNIqvI6LvXss88qPT29Uut75513JJX0p1deeUWXXnqp1q5dq44dO1ap/v3790uSJk2apPj4eN1///0KDg4ud1+lp6fr9NNPV15enu644w41adJEM2fO1AUXXKCPPvpIF110kc+yK/I+eLjp06frkUce0bPPPutz6t/x7GcAkgyABm369OlGkvn+++/Nvn37zM6dO837779vmjRpYkJCQsyuXbuMMcYUFBQYl8vl03bbtm0mKCjIPP74455pb731lpFkpkyZUmZdbrfb006SmTx5cpl5unbtagYNGuR5vGDBAiPJNGvWzGRlZXmmf/jhh0aSef755z3Lbt++vRkxYoRnPcYYk5eXZ1q3bm2GDRtWZl2nn3666datm+fxvn37jCQzYcIEz7Tt27cbm81m/v3vf/u0/eOPP0xAQECZ6Zs2bTKSzMyZMz3TJkyYYLzfTn/++WcjycyaNcun7dy5c8tMb9mypTn33HPL1H7rrbeaw9+iD6/9gQceMLGxsaZPnz4++/Sdd94xVqvV/Pzzzz7tX331VSPJ/Prrr2XW523QoEGma9eupkePHua///2vz3KbN29uzjjjDNO1a9dKb29GRoaJiIgwp556qsnPz/eZ1/t3Wqq0H02fPr3cOvPy8spMGzFihGnTps1Rt88YY8aOHWtatmzpM+311183kszSpUuP2rZXr14mNjbW7N+/3zNt9erVxmq1mmuvvdYzTZKRZH788Uefmjt37mzi4+NNUVGRMcaYb7/91kgy33zzjc96evTo4fN7HTdunElKSipTz+H9orz9snjxYiPJvP32255ppa+9BQsWGGOMufjii43NZjNr1671zONwOEyTJk1Mnz59PNNK31OWLVvmmVbea8sYY84991yf/VyZ1/Hhr6u9e/eaiIgIc/bZZ/vUfSSHtzfGmO+++85IMh9++KFnWml/P5LD+2Hp48DAQLNx40affXD4vrrrrruMJJ/XYnZ2tmndurVp1aqV5z23ou+DpfWW9ouvv/7aBAQEmHvvvden5qq8XwLwxel2wAli6NChatq0qVq0aKErrrhC4eHh+vTTT9WsWTNJJaPeWa0lbwkul0v79+9XeHi4OnbsqBUrVniW8/HHHysmJka33357mXUcfnpYZVx77bWKiIjwPL7kkkuUkJCg//3vf5KkVatWadOmTbrqqqu0f/9+ORwOORwO5ebmasiQIfrpp5/kdrt9lllQUFDuUQ9vn3zyidxuty677DLPMh0Oh+Lj49W+fXstWLDAZ/6ioiJJJfvrSObMmaOoqCgNGzbMZ5l9+vRReH
h4mWU6nU6f+RwOhwoKCo5a9+7du/Xiiy/q0UcfVXh4eJn1d+7cWZ06dfJZZukploev/0jGjRun6dOnex5Pnz5dY8e
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0kAAAIjCAYAAADWYVDIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB7+UlEQVR4nO3dd3xUVf7/8ffMZDKZNEIS0giBEKpURVEs2KhWRNcCKmIBV3BVbF/bAuqubde2urjubwXcFQsquhZQRAFRQOmCEUMoERICk5DeJjP390fI7AwJpJBkkvB6Ph55wNy5597PvXNmMu/ce881GYZhCAAAAAAgSTL7uwAAAAAAaE0ISQAAAADghZAEAAAAAF4ISQAAAADghZAEAAAAAF4ISQAAAADghZAEAAAAAF4ISQAAAADghZAEAADQTNxutxwOh3bu3OnvUgA0ACEJAAC0OatWrdLy5cs9j5cvX67vvvvOfwV52b9/v+6++2517dpVgYGB6tSpk0466SQVFBT4uzQA9URIAtqJefPmyWQyeX6CgoLUq1cvTZ8+XdnZ2f4uDwCa1G+//aY77rhDP/30k3766Sfdcccd+u233/xdlnbs2KHTTjtN77zzjqZOnapPP/1US5cu1bJlyxQSEuLv8gDUU4C/CwDQtB5//HElJyerrKxMq1at0pw5c/T5559r69atCg4O9nd5ANAkxo8frxdffFEDBw6UJA0bNkzjx4/3c1XS1KlTFRgYqDVr1qhz587+LgdAIxGSgHZm7NixOvXUUyVJt956q6KiovT888/r448/1nXXXefn6gCgadhsNn3//ffaunWrJKl///6yWCx+rWn9+vX6+uuv9eWXXxKQgDaO0+2Adu6CCy6QJO3atUuSlJubq/vuu08DBgxQaGiowsPDNXbsWG3evLlG27KyMs2aNUu9evVSUFCQ4uPjNX78eKWnp0uSdu/e7XOK35E/5513nmdZy5cvl8lk0rvvvquHH35YcXFxCgkJ0WWXXVbrKTJr167VmDFj1KFDBwUHB+vcc8896vUG5513Xq3rnzVrVo15//Of/2jIkCGy2+2KjIzUtddeW+v6j7Vt3txut1588UX169dPQUFBio2N1dSpU3Xo0CGf+bp166ZLLrmkxnqmT59eY5m11f7cc8/V2KeSVF5erpkzZ6pHjx6y2Wzq0qWLHnjgAZWXl9e6r7xV77dx48bVeG7q1KkymUzq379/o7ZXkhYvXqxzzz1XYWFhCg8P12mnnaYFCxb4rPtYP9Xmzp2rCy64QDExMbLZbDrppJM0Z86cOrdPkm666SafZXbs2FHnnXeevv3223q17datm8+0//znPzKbzXr66ad9pn/99dc655xzFBISooiICF1++eVKTU31mWfWrFkymUxyOBw+09etWyeTyaR58+bVWnNtP7t375b0v3715ZdfavDgwQoKCtJJJ52kDz/8sMb27Ny5U7/73e8UGRmp4OBgnXHGGfrss8/qtd9qe0/ddNNNCg0NrXM/NqTvV1ZW6oknnlBKSopsNpu6deumhx9+uEZ/7tatm2666SZZLBYNGjRIgwYN0ocffiiTyVTjNTtaTdXbZDabFRcXp2uuuUYZGRmeeao/A/7yl78cdTnVr2m1NWvWKCgoSOnp6erXr59sNpvi4uI0depU5ebm1mi/cOFCz+dRdHS0rr/+eu3bt89nnur9vHPnTo0ePVohISFKSEjQ448/LsMwatRb3Y8kqbCwUEOGDFFycrKysrI80xvyPgZOVBxJAtq56kATFRUlqeqL0kcffaTf/e53Sk5OVnZ2tv7xj3/o3HPP1c8//6yEhARJksvl0iWXXKJly5bp2muv1V133aXCwkItXbpUW7duVUpKimcd1113nS666CKf9T700EO11vOnP/1JJpNJDz74oA4cOKAXX3xRI0aM0KZNm2S32yVVfeEcO3ashgwZopkzZ8psNnu+KH/77bcaOnRojeUmJi
bqqaeekiQVFRXp97//fa3rfuyxx3T11Vfr1ltv1cGDB/W3v/1Nw4cP18aNGxUREVGjzZQpU3TOOedIkj788EMtWrTI5/mpU6dq3rx5mjx5sv7whz9o165deuWVV7Rx40Z99913slqtte6HhsjLy/Nsmze3263LLrtMq1at0pQpU9S3b1/99NNPeuGFF/Trr7/qo48+qnPZQUFB+uyzz3TgwAHFxMRIkkpLS/Xuu+8qKCioxvz13d558+bp5ptvVr9+/fTQQw8pIiJCGzdu1JIlSzRhwgQ98sgjuvXWWyVJDodD99xzj8++9jZnzhz169dPl112mQICAvTJJ5/ojjvukNvt1rRp0+rcxujoaL3wwguSpL179+qll17SRRddpN9++63W1/xovvzyS918882aPn26/u///s8z/auvvtLYsWPVvXt3zZo1S6Wlpfrb3/6ms846Sxs2bKjXl3ZvU6dO1YgRIzyPb7jhBl1xxRU+p5J16tTJ8/+0tDRdc801uv322zVp0iTNnTtXv/vd77RkyRKNHDlSkpSdna0zzzxTJSUl+sMf/qCoqCjNnz9fl112md5//31dccUVNerw3m/VdTS3W2+9VfPnz9dVV12le++9V2vXrtVTTz2l1NTUGu89b5WVlXrkkUcatK5zzjlHU6ZMkdvt1tatW/Xiiy8qMzOzXgH6aHJyclRWVqbf//73uuCCC3T77bcrPT1dr776qtauXau1a9fKZrNJkud9dNppp+mpp55Sdna2XnrpJX333Xc1Po9cLpfGjBmjM844Q88++6yWLFmimTNnqrKyUo8//nittTidTl155ZXKyMjQd999p/j4eM9zLfG5BbR5BoB2Ye7cuYYk46uvvjIOHjxo/Pbbb8Y777xjREVFGXa73di7d69hGIZRVlZmuFwun7a7du0ybDab8fjjj3umvfHGG4Yk4/nnn6+xLrfb7WknyXjuuedqzNOvXz/j3HPP9Tz+5ptvDElG586djYKCAs/09957z5BkvPTSS55l9+zZ0xg9erRnPYZhGCUlJUZycrIxcuTIGus688wzjf79+3seHzx40JBkzJw50zNt9+7dhsViMf70pz/5tP3pp5+MgICAGtPT0tIMScb8+fM902bOnGl4f2x+++23hiTjrbfe8mm7ZMmSGtO7du1qXHzxxTVqnzZtmnHkR/GRtT/wwANGTEyMMWTIEJ99+u9//9swm83Gt99+69P+tddeMyQZ3333XY31eTv33HONfv36GQMHDjT+8pe/+Cw3MTHROOecc4x+/fo1eHvz8vKMsLAw4/TTTzdKS0t95vV+TatV96O5c+fWWmdJSUmNaaNHjza6d+9+zO0zDMOYNGmS0bVrV59pr7/+uiHJ+OGHH+rddt26dUZoaKjxu9/9rsb7Z/DgwUZMTIyRk5PjmbZ582bDbDYbN954o2dadf85ePCgT/sff/zxmNt/ZH/w1rVrV0OS8cEHH3im5efnG/Hx8cbJJ5/smXb33Xcbknz6SmFhoZGcnGx069atxjZNnDjRSE5OPmYdkyZNMkJCQmqt68ga69P3N23aZEgybr31Vp/57rvvPkOS8fXXX/ssc9KkSZ7Hf//73w2bzWacf/75NV7vo9Xk3d4wDGPChAlGcHCw5/GxPt+qHfmZUP34wgsvNCorKz3Tqz+f//a3vxmGYRgVFRVGTEyM0b9/f5/3yKeffmpIMv74xz96pk2aNMmQZNx5552eaW6327j44ouNwMBAT3/yfh+53W5j4sSJRnBwsLF27VqfmhvyuQWcyDjdDmhnRowYoU6dOqlLly669tprFRoaqkWLFnnOj7fZbDKbq976LpdLOTk5Cg0NVe/evbVhwwbPcj744ANFR0frzjvvrLGOI0+RaYgbb7xRYWFhnsdXXXWV4uPj9fnnn0uSNm3apLS0NE2YMEE5OTlyOBxyOBwqLi7WhRdeqJUrV8rtdvsss6ysrNajHt4+/PBDud1uXX311Z5lOhwOxc
XFqWfPnvrmm2985q+oqJAkz199a7Nw4UJ16NBBI0eO9FnmkCFDFBoaWmOZTqfTZz6Hw6GysrJj1r1v3z797W9/02O
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0kAAAIjCAYAAADWYVDIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB2TUlEQVR4nO3dd3hUZf7+8Xtmkkx6QhLSCKE3aQoIXxZFVKpd3EUBFRHFdWHXFdsPK7juorLWVdHdVcBe1rZ2USmi2OhIQIhA6Gmkt8nM8/sjZHaGBEiGJJOE9+u65krmzHnO+ZxznszMndMsxhgjAAAAAIAkyervAgAAAACgOSEkAQAAAIAHQhIAAAAAeCAkAQAAAIAHQhIAAAAAeCAkAQAAAIAHQhIAAAAAeCAkAQAAAIAHQhIAAEAtjDHKzc3Vtm3b/F0KgCZGSAIAAE1q06ZNeu+999zP161bp48++sh/BXkoLCzU3XffrR49eigoKEixsbHq3r27tm7d6u/SADQhQhIAt0WLFslisbgfwcHB6t69u2bOnKmDBw/6uzwArURhYaFuuOEGfffdd9q2bZtuuukmbdy40d9lKScnR0OHDtWTTz6p3/72t3r//fe1ZMkSLVu2TB07dvR3eQCaUIC/CwDQ/Nx///3q1KmTysrKtHLlSi1YsEAff/yxNm3apNDQUH+XB6CFGzp0qPshSd27d9f111/v56qk2267Tfv379eqVavUu3dvf5cDwI8ISQBqGDdunAYNGiRJuu666xQbG6tHH31U77//viZOnOjn6gC0Bu+99542b96s0tJS9e3bV0FBQX6tJzMzU4sXL9azzz5LQALA4XYAju+cc86RJO3YsUOSlJubq1tvvVV9+/ZVeHi4IiMjNW7cOK1fv75G27KyMs2ZM0fdu3dXcHCwkpKSNH78eKWnp0uSdu7c6XWI35GPESNGuKe1bNkyWSwWvfHGG7rzzjuVmJiosLAwXXTRRdq9e3eNeX///fcaO3asoqKiFBoaqrPOOkvffPNNrcs4YsSIWuc/Z86cGuO+/PLLGjhwoEJCQhQTE6Mrrrii1vkfa9k8uVwuPf744+rdu7eCg4OVkJCgG264QYcOHfIar2PHjrrgggtqzGfmzJk1pllb7fPnz6+xTiWpvLxc9913n7p27Sq73a727dvr9ttvV3l5ea3rylP1ervkkktqvHbDDTfIYrGoT58+Pi2vJH3yySc666yzFBERocjISJ1++ul69dVXveZ9rEe1hQsX6pxzzlF8fLzsdrtOOeUULViw4LjLJ0nXXHON1zTbtGmjESNG6Ouvv65Xu9oeO3fu9FrWM888U2FhYYqIiND555+vn3/+ucZ0t2zZogkTJqht27YKCQlRjx49dNddd0mS5syZc9x5Llu2zD2tt956y92X4+LidOWVV2rv3r0+L/8zzzyj3r17y263Kzk5WTNmzFBeXp7XOCNGjHD3wVNOOUUDBw7U+vXra/3bqM2R2z0uLk7nn3++Nm3a5DWexWLRzJkzjzqd6sOLq7fBjz/+KJfLpYqKCg0aNEjBwcGKjY3VxIkTlZGRUaP9V1995d5e0dHRuvjii5WWluY1TvX2qN5mkZGRio2N1U033aSysrIa9Xr+zVZWVuq8885TTEyMNm/e7DVuXd+DAPiOPUkAjqs60MTGxkqSfv31V7333nv63e9+p06dOungwYN67rnndNZZZ2nz5s1KTk6WJDmdTl1wwQX68ssvdcUVV+imm25SYWGhlixZok2bNqlLly7ueUycOFHnnXee13xnz55daz1//etfZbFYdMcddygzM1OPP/64Ro4cqXXr1ikkJERS1ReYcePGaeDAgbrvvvtktVrdX5S//vprDR48uMZ0U1JSNG/ePElSUVGRbrzxxlrnfc8992jChAm67rrrlJWVpX/84x8aPny41q5dq+jo6Bptpk+frjPPPFOS9M477+jdd9/1ev2GG27QokWLNHXqVP
3pT3/Sjh079NRTT2nt2rX65ptvFBgYWOt6qI+8vDz3snlyuVy66KKLtHLlSk2fPl29evXSxo0b9dhjj+mXX37xOrn+aIKDg/XRRx8pMzNT8fHxkqTS0lK98cYbCg4OrjF+XZd30aJFuvbaa9W7d2/Nnj1b0dHRWrt2rT799FNNmjRJd911l6677jpJUnZ2tm6++Wavde1pwYIF6t27ty666CIFBATogw8+0B/+8Ae5XC7NmDHjuMsYFxenxx57TJK0Z88ePfHEEzrvvPO0e/fuWrd59XKOHDnS/fyqq67SpZdeqvHjx7uHtW3bVpL00ksvacqUKRozZoweeughlZSUaMGCBTrjjDO0du1a9/kwGzZs0JlnnqnAwEBNnz5dHTt2VHp6uj744AP99a9/1fjx49W1a1f39G+++Wb16tVL06dPdw/r1auXe/1OnTpVp59+uubNm6eDBw/qiSee0DfffFOjL9dl+efMmaO5c+dq5MiRuvHGG7V161YtWLBAP/7443H78R133HGcLeCtZ8+euuuuu2SMUXp6uh599FGdd955tYaZusrJyZFU9U+HgQMH6sEHH1RWVpaefPJJrVy5UmvXrlVcXJwk6YsvvtC4cePUuXNnzZkzR6WlpfrHP/6hYcOGac2aNTXOX5owYYI6duyoefPm6bvvvtOTTz6pQ4cO6cUXXzxqPdddd52WLVumJUuW6JRTTnEP9+U9CIAPDAActnDhQiPJfPHFFyYrK8vs3r3bvP766yY2NtaEhISYPXv2GGOMKSsrM06n06vtjh07jN1uN/fff7972AsvvGAkmUcffbTGvFwul7udJDN//vwa4/Tu3ducddZZ7udLly41kky7du1MQUGBe/ibb75pJJknnnjCPe1u3bqZMWPGuOdjjDElJSWmU6dOZtSoUTXm9Zvf/Mb06dPH/TwrK8tIMvfdd5972M6dO43NZjN//etfvdpu3LjRBAQE1Bi+bds2I8ksXrzYPey+++4znm+9X3/9tZFkXnnlFa+2n376aY3hHTp0MOeff36N2mfMmGGOfDs/svbbb7/dxMfHm4EDB3qt05deeslYrVbz9ddfe7V/9tlnjSTzzTff1Jifp7POOsv07t3b9OvXz/z973/3mm5KSoo588wzTe/eveu9vHl5eSYiIsIMGTLElJaWeo3ruU2rVfejhQsX1lpnSUlJjWFjxowxnTt3PubyGWPMlClTTIcOHbyG/fOf/zSSzA8//HDc9tWO3CbVCgsLTXR0tLn++uu9hh84cMBERUV5DR8+fLiJiIgwu3bt8hq3tnViTFWfmTJlSo3hFRUVJj4+3vTp08dr/X744YdGkrn33nvdw+qy/JmZmSYoKMiMHj3a673hqaeeMpLMCy+84B521llnefXBjz/+2EgyY8eOrdGPa3Nke2OMufPOO40kk5mZ6R4mycyYMeOo06l+v9uxY4fX81NOOcWrv1S/79xyyy3uYaeeeqqJj483OTk57mHr1683VqvVXH311e5h1X/vF110kde8//CHPxhJZv369V71VveP2bNnG5vNZt577z2vdvV9DwLgOw63A1DDyJEj1bZtW7Vv315XXHGFwsPD9e6776pdu3aSJLvdLqu16u3D6XQqJydH4eHh6tGjh9asWeOezttvv624uDj98Y9/rDGPuhxWczRXX321IiIi3M9/+9vfKikpSR9//LGkqssJb9u2TZMmTVJOTo6ys7OVnZ2t4uJinXvuuVqxYoVcLpfXNMvKymrd6+HpnXfekcvl0oQJE9zTzM7OVmJiorp166alS5d6jV9RUSGpan0dzVtvvaWoqCiNGjXKa5oDBw5UeHh4jWk6HA6v8bKzs2sctnOkvXv36h//+IfuuecehYeH15h/r1691LNnT69pVh9ieeT8j2bq1KlauHCh+/nChQs1ZcoUdz+p7/IuWbJEhYWF+n//7//V2C6+9J3qPYySlJ+fr+zsbJ111ln69ddflZ+ff9z2LpfLXe
u6dev04osvKikpyb1X5kQsWbJEeXl5mjhxotc6sdlsGjJkiHudZGVlacWKFbr22muVmprqNY36rpOffvpJmZmZ+sM
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import seaborn as sns\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"# Функция для оценки распределения места в рейтинге\n",
|
|||
|
"def plot_distribution(y_data, title):\n",
|
|||
|
" plt.figure(figsize=(10, 6))\n",
|
|||
|
" sns.histplot(y_data, kde=True, bins=50)\n",
|
|||
|
" plt.title(title)\n",
|
|||
|
" plt.xlabel('Rank ')\n",
|
|||
|
" plt.ylabel('Frequency')\n",
|
|||
|
" plt.grid(True)\n",
|
|||
|
" plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Оценка распределения места в рейтинге в каждой выборке\n",
|
|||
|
"plot_distribution(y_train, \"Распределение места в обучающей выборке\")\n",
|
|||
|
"plot_distribution(y_val, \"Распределение места в контрольной выборке\")\n",
|
|||
|
"plot_distribution(y_test, \"Распределение места в тестовой выборке\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Применим min-max нормировку и стандартизацию для улучшения качества работы модели"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 6,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Нормированные данные:\n",
|
|||
|
" Networth Age Country Source Industry \\\n",
|
|||
|
"0 1.0 0.319728 0.945946 0.137584 0.000000 \n",
|
|||
|
"1 1.0 0.428571 0.945946 0.005593 0.882353 \n",
|
|||
|
"2 1.0 0.632653 0.270270 0.081655 0.176471 \n",
|
|||
|
"3 1.0 0.537415 0.945946 0.090604 0.882353 \n",
|
|||
|
"4 1.0 0.877551 0.945946 0.012304 0.235294 \n",
|
|||
|
"\n",
|
|||
|
" Name_Abdulla Al Futtaim & family \\\n",
|
|||
|
"0 0.0 \n",
|
|||
|
"1 0.0 \n",
|
|||
|
"2 0.0 \n",
|
|||
|
"3 0.0 \n",
|
|||
|
"4 0.0 \n",
|
|||
|
"\n",
|
|||
|
" Name_Abdulla bin Ahmad Al Ghurair & family Name_Abdulsamad Rabiu \\\n",
|
|||
|
"0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" Name_Abhay Firodia Name_Abigail Johnson ... Name_Zhu Yan & family \\\n",
|
|||
|
"0 0.0 0.0 ... 0.0 \n",
|
|||
|
"1 0.0 0.0 ... 0.0 \n",
|
|||
|
"2 0.0 0.0 ... 0.0 \n",
|
|||
|
"3 0.0 0.0 ... 0.0 \n",
|
|||
|
"4 0.0 0.0 ... 0.0 \n",
|
|||
|
"\n",
|
|||
|
" Name_Zhu Yiming Name_Zhu Yiwen & family Name_Zhuo Jun \\\n",
|
|||
|
"0 0.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" Name_Ziv Aviram Name_Zong Qinghou Name_Zong Yanmin Name_Zugen Ni \\\n",
|
|||
|
"0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" Name_Zuowen Song Name_Zygmunt Solorz-Zak \n",
|
|||
|
"0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
"[5 rows x 2602 columns]\n",
|
|||
|
"\n",
|
|||
|
"Стандартизированные данные:\n",
|
|||
|
" Networth Age Country Source Industry \\\n",
|
|||
|
"0 2.266803 -1.081352 1.173910 -1.505003 -1.701719 \n",
|
|||
|
"1 2.266803 -0.475422 1.173910 -2.004526 1.339990 \n",
|
|||
|
"2 2.266803 0.660697 -0.805574 -1.716665 -1.093377 \n",
|
|||
|
"3 2.266803 0.130508 1.173910 -1.682800 1.339990 \n",
|
|||
|
"4 2.266803 2.024040 1.173910 -1.979126 -0.890597 \n",
|
|||
|
"\n",
|
|||
|
" Name_Abdulla Al Futtaim & family \\\n",
|
|||
|
"0 -0.019615 \n",
|
|||
|
"1 -0.019615 \n",
|
|||
|
"2 -0.019615 \n",
|
|||
|
"3 -0.019615 \n",
|
|||
|
"4 -0.019615 \n",
|
|||
|
"\n",
|
|||
|
" Name_Abdulla bin Ahmad Al Ghurair & family Name_Abdulsamad Rabiu \\\n",
|
|||
|
"0 -0.019615 -0.019615 \n",
|
|||
|
"1 -0.019615 -0.019615 \n",
|
|||
|
"2 -0.019615 -0.019615 \n",
|
|||
|
"3 -0.019615 -0.019615 \n",
|
|||
|
"4 -0.019615 -0.019615 \n",
|
|||
|
"\n",
|
|||
|
" Name_Abhay Firodia Name_Abigail Johnson ... Name_Zhu Yan & family \\\n",
|
|||
|
"0 -0.019615 -0.019615 ... -0.019615 \n",
|
|||
|
"1 -0.019615 -0.019615 ... -0.019615 \n",
|
|||
|
"2 -0.019615 -0.019615 ... -0.019615 \n",
|
|||
|
"3 -0.019615 -0.019615 ... -0.019615 \n",
|
|||
|
"4 -0.019615 -0.019615 ... -0.019615 \n",
|
|||
|
"\n",
|
|||
|
" Name_Zhu Yiming Name_Zhu Yiwen & family Name_Zhuo Jun \\\n",
|
|||
|
"0 -0.019615 -0.019615 -0.019615 \n",
|
|||
|
"1 -0.019615 -0.019615 -0.019615 \n",
|
|||
|
"2 -0.019615 -0.019615 -0.019615 \n",
|
|||
|
"3 -0.019615 -0.019615 -0.019615 \n",
|
|||
|
"4 -0.019615 -0.019615 -0.019615 \n",
|
|||
|
"\n",
|
|||
|
" Name_Ziv Aviram Name_Zong Qinghou Name_Zong Yanmin Name_Zugen Ni \\\n",
|
|||
|
"0 -0.019615 -0.019615 -0.019615 -0.019615 \n",
|
|||
|
"1 -0.019615 -0.019615 -0.019615 -0.019615 \n",
|
|||
|
"2 -0.019615 -0.019615 -0.019615 -0.019615 \n",
|
|||
|
"3 -0.019615 -0.019615 -0.019615 -0.019615 \n",
|
|||
|
"4 -0.019615 -0.019615 -0.019615 -0.019615 \n",
|
|||
|
"\n",
|
|||
|
" Name_Zuowen Song Name_Zygmunt Solorz-Zak \n",
|
|||
|
"0 -0.019615 -0.019615 \n",
|
|||
|
"1 -0.019615 -0.019615 \n",
|
|||
|
"2 -0.019615 -0.019615 \n",
|
|||
|
"3 -0.019615 -0.019615 \n",
|
|||
|
"4 -0.019615 -0.019615 \n",
|
|||
|
"\n",
|
|||
|
"[5 rows x 2602 columns]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
|
|||
|
"\n",
|
|||
|
"# Предполагаем, что вы уже выделили ваши признаки X\n",
|
|||
|
"# Применение нормировки Min-Max ко всем числовым признакам\n",
|
|||
|
"min_max_scaler = MinMaxScaler()\n",
|
|||
|
"X_normalized = pd.DataFrame(min_max_scaler.fit_transform(X), columns=X.columns)\n",
|
|||
|
"\n",
|
|||
|
"# Применение стандартизации ко всем числовым признакам\n",
|
|||
|
"standard_scaler = StandardScaler()\n",
|
|||
|
"X_standardized = pd.DataFrame(standard_scaler.fit_transform(X), columns=X.columns)\n",
|
|||
|
"\n",
|
|||
|
"# Проверка первых 5 строк после нормировки\n",
|
|||
|
"print(\"Нормированные данные:\")\n",
|
|||
|
"print(X_normalized.head())\n",
|
|||
|
"\n",
|
|||
|
"# Проверка первых 5 строк после стандартизации\n",
|
|||
|
"print(\"\\nСтандартизированные данные:\")\n",
|
|||
|
"print(X_standardized.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Приведём пример использования featuretools"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 7,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Collecting featuretoolsNote: you may need to restart the kernel to use updated packages.\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"\n",
|
|||
|
"[notice] A new release of pip is available: 24.2 -> 24.3.1\n",
|
|||
|
"[notice] To update, run: python.exe -m pip install --upgrade pip\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"\n",
|
|||
|
" Downloading featuretools-1.31.0-py3-none-any.whl.metadata (15 kB)\n",
|
|||
|
"Collecting cloudpickle>=1.5.0 (from featuretools)\n",
|
|||
|
" Downloading cloudpickle-3.1.0-py3-none-any.whl.metadata (7.0 kB)\n",
|
|||
|
"Collecting holidays>=0.17 (from featuretools)\n",
|
|||
|
" Downloading holidays-0.59-py3-none-any.whl.metadata (25 kB)\n",
|
|||
|
"Requirement already satisfied: numpy>=1.25.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from featuretools) (2.1.1)\n",
|
|||
|
"Requirement already satisfied: packaging>=20.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from featuretools) (24.1)\n",
|
|||
|
"Requirement already satisfied: pandas>=2.0.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from featuretools) (2.2.2)\n",
|
|||
|
"Requirement already satisfied: psutil>=5.7.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from featuretools) (6.0.0)\n",
|
|||
|
"Requirement already satisfied: scipy>=1.10.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from featuretools) (1.14.1)\n",
|
|||
|
"Collecting tqdm>=4.66.3 (from featuretools)\n",
|
|||
|
" Downloading tqdm-4.66.6-py3-none-any.whl.metadata (57 kB)\n",
|
|||
|
"Collecting woodwork>=0.28.0 (from featuretools)\n",
|
|||
|
" Downloading woodwork-0.31.0-py3-none-any.whl.metadata (10 kB)\n",
|
|||
|
"Requirement already satisfied: python-dateutil in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from holidays>=0.17->featuretools) (2.9.0.post0)\n",
|
|||
|
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from pandas>=2.0.0->featuretools) (2024.1)\n",
|
|||
|
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from pandas>=2.0.0->featuretools) (2024.1)\n",
|
|||
|
"Requirement already satisfied: colorama in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from tqdm>=4.66.3->featuretools) (0.4.6)\n",
|
|||
|
"Requirement already satisfied: scikit-learn>=1.1.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from woodwork>=0.28.0->featuretools) (1.5.2)\n",
|
|||
|
"Collecting importlib-resources>=5.10.0 (from woodwork>=0.28.0->featuretools)\n",
|
|||
|
" Downloading importlib_resources-6.4.5-py3-none-any.whl.metadata (4.0 kB)\n",
|
|||
|
"Requirement already satisfied: six>=1.5 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from python-dateutil->holidays>=0.17->featuretools) (1.16.0)\n",
|
|||
|
"Requirement already satisfied: joblib>=1.2.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from scikit-learn>=1.1.0->woodwork>=0.28.0->featuretools) (1.4.2)\n",
|
|||
|
"Requirement already satisfied: threadpoolctl>=3.1.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from scikit-learn>=1.1.0->woodwork>=0.28.0->featuretools) (3.5.0)\n",
|
|||
|
"Downloading featuretools-1.31.0-py3-none-any.whl (587 kB)\n",
|
|||
|
" ---------------------------------------- 0.0/587.9 kB ? eta -:--:--\n",
|
|||
|
" ----------------- ---------------------- 262.1/587.9 kB ? eta -:--:--\n",
|
|||
|
" ---------------------------------------- 587.9/587.9 kB 1.5 MB/s eta 0:00:00\n",
|
|||
|
"Downloading cloudpickle-3.1.0-py3-none-any.whl (22 kB)\n",
|
|||
|
"Downloading holidays-0.59-py3-none-any.whl (1.1 MB)\n",
|
|||
|
" ---------------------------------------- 0.0/1.1 MB ? eta -:--:--\n",
|
|||
|
" --------- ------------------------------ 0.3/1.1 MB ? eta -:--:--\n",
|
|||
|
" ---------------------------- ----------- 0.8/1.1 MB 1.9 MB/s eta 0:00:01\n",
|
|||
|
" ---------------------------------------- 1.1/1.1 MB 2.2 MB/s eta 0:00:00\n",
|
|||
|
"Downloading tqdm-4.66.6-py3-none-any.whl (78 kB)\n",
|
|||
|
"Downloading woodwork-0.31.0-py3-none-any.whl (215 kB)\n",
|
|||
|
"Downloading importlib_resources-6.4.5-py3-none-any.whl (36 kB)\n",
|
|||
|
"Installing collected packages: tqdm, importlib-resources, cloudpickle, holidays, woodwork, featuretools\n",
|
|||
|
"Successfully installed cloudpickle-3.1.0 featuretools-1.31.0 holidays-0.59 importlib-resources-6.4.5 tqdm-4.66.6 woodwork-0.31.0\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"pip install --upgrade featuretools"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 8,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Collecting setuptools\n",
|
|||
|
" Downloading setuptools-75.3.0-py3-none-any.whl.metadata (6.9 kB)\n",
|
|||
|
"Downloading setuptools-75.3.0-py3-none-any.whl (1.3 MB)\n",
|
|||
|
" ---------------------------------------- 0.0/1.3 MB ? eta -:--:--\n",
|
|||
|
" ---------------- ----------------------- 0.5/1.3 MB 3.4 MB/s eta 0:00:01\n",
|
|||
|
" ---------------------------------------- 1.3/1.3 MB 3.7 MB/s eta 0:00:00\n",
|
|||
|
"Installing collected packages: setuptools\n",
|
|||
|
"Successfully installed setuptools-75.3.0\n",
|
|||
|
"Note: you may need to restart the kernel to use updated packages.\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"\n",
|
|||
|
"[notice] A new release of pip is available: 24.2 -> 24.3.1\n",
|
|||
|
"[notice] To update, run: python.exe -m pip install --upgrade pip\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"pip install --upgrade setuptools"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": ".venv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.6"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|