1194 lines
178 KiB
Plaintext
Raw Normal View History

2024-11-01 17:36:00 +04:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"\n",
"# Location of the Forbes billionaires CSV. Set the FORBES_CSV environment\n",
"# variable to run the notebook on another machine; the default below is the\n",
"# local path the notebook was originally executed with.\n",
"DATA_PATH = os.environ.get(\n",
"    \"FORBES_CSV\",\n",
"    \"C://Users//annal//aim//static//csv//Forbes_Billionaires.csv\",\n",
")\n",
"df = pd.read_csv(DATA_PATH)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Определим бизнес-цели:\n",
"## 1- Прогнозирование места в рейтинге\n",
"## 2- Оценка факторов, влияющих на место в рейтинге"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Определим цели технического проекта:\n",
"## Построить модель, которая будет прогнозировать место в рейтинге на основе представленных данных об участнике\n",
"## Провести анализ данных для выявления важнейших характеристик для прогнозирования"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Проверим выбросы и заменим их граничными значениями (правило 1,5·IQR)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Колонка Networth:\n",
" Есть выбросы: Да\n",
" Количество выбросов: 226\n",
" Минимальное значение: 1.0\n",
" Максимальное значение: 9.0\n",
" 1-й квантиль (Q1): 1.5\n",
" 3-й квантиль (Q3): 4.5\n",
"\n",
"Колонка Age:\n",
" Есть выбросы: Да\n",
" Количество выбросов: 6\n",
" Минимальное значение: 26.5\n",
" Максимальное значение: 100.0\n",
" 1-й квантиль (Q1): 55.0\n",
" 3-й квантиль (Q3): 74.0\n",
"\n"
]
}
],
"source": [
"numeric_columns = ['Networth', 'Age']\n",
"for column in numeric_columns:\n",
"    # Quantiles are only meaningful for numeric columns; skip anything else.\n",
"    if not pd.api.types.is_numeric_dtype(df[column]):\n",
"        continue\n",
"\n",
"    # Tukey fences: anything further than 1.5 * IQR from the quartiles\n",
"    # is treated as an outlier.\n",
"    q1 = df[column].quantile(0.25)\n",
"    q3 = df[column].quantile(0.75)\n",
"    iqr = q3 - q1\n",
"    lower_bound = q1 - 1.5 * iqr\n",
"    upper_bound = q3 + 1.5 * iqr\n",
"\n",
"    # Count the outliers before they are clipped away.\n",
"    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]\n",
"    outlier_count = len(outliers)\n",
"\n",
"    # Pull every outlier back to the nearest fence. Series.clip is the\n",
"    # vectorized form of the per-element comparison and leaves NaN untouched.\n",
"    df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)\n",
"\n",
"    # The min/max printed below are the values after clipping.\n",
"    print(f\"Колонка {column}:\")\n",
"    print(f\" Есть выбросы: {'Да' if outlier_count > 0 else 'Нет'}\")\n",
"    print(f\" Количество выбросов: {outlier_count}\")\n",
"    print(f\" Минимальное значение: {df[column].min()}\")\n",
"    print(f\" Максимальное значение: {df[column].max()}\")\n",
"    print(f\" 1-й квантиль (Q1): {q1}\")\n",
"    print(f\" 3-й квантиль (Q3): {q3}\\n\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Превратим номинальные столбцы в числовые"
]
},
{
"cell_type": "code",
2024-11-02 04:40:32 +04:00
"execution_count": null,
2024-11-01 17:36:00 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Rank Networth Age Country Source Industry \\\n",
"0 1 9.0 50.0 70 123 0 \n",
"1 2 9.0 58.0 70 5 15 \n",
"2 3 9.0 73.0 20 73 3 \n",
"3 4 9.0 66.0 70 81 15 \n",
"4 5 9.0 91.0 70 11 4 \n",
"\n",
" Name_Abdulla Al Futtaim & family \\\n",
"0 0.0 \n",
"1 0.0 \n",
"2 0.0 \n",
"3 0.0 \n",
"4 0.0 \n",
"\n",
" Name_Abdulla bin Ahmad Al Ghurair & family Name_Abdulsamad Rabiu \\\n",
"0 0.0 0.0 \n",
"1 0.0 0.0 \n",
"2 0.0 0.0 \n",
"3 0.0 0.0 \n",
"4 0.0 0.0 \n",
"\n",
" Name_Abhay Firodia ... Name_Zhu Yan & family Name_Zhu Yiming \\\n",
"0 0.0 ... 0.0 0.0 \n",
"1 0.0 ... 0.0 0.0 \n",
"2 0.0 ... 0.0 0.0 \n",
"3 0.0 ... 0.0 0.0 \n",
"4 0.0 ... 0.0 0.0 \n",
"\n",
" Name_Zhu Yiwen & family Name_Zhuo Jun Name_Ziv Aviram \\\n",
"0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 \n",
"\n",
" Name_Zong Qinghou Name_Zong Yanmin Name_Zugen Ni Name_Zuowen Song \\\n",
"0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 \n",
"\n",
" Name_Zygmunt Solorz-Zak \n",
"0 0.0 \n",
"1 0.0 \n",
"2 0.0 \n",
"3 0.0 \n",
"4 0.0 \n",
"\n",
"[5 rows x 2603 columns]\n"
]
}
],
"source": [
"from sklearn.preprocessing import OneHotEncoder, LabelEncoder\n",
"\n",
"# Nominal column that is expanded into one indicator column per category.\n",
"# NOTE(review): the recorded run shows about 2,600 Name_* columns for 2,600\n",
"# rows, so 'Name' behaves like a row identifier - confirm it should be a feature.\n",
"categorical_columns = ['Name']\n",
"\n",
"# drop=\"first\" omits the indicator of the first category of each column.\n",
"encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
"encoded_values = encoder.fit_transform(df[categorical_columns])\n",
"encoded_columns = encoder.get_feature_names_out(categorical_columns)\n",
"encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n",
"\n",
"# Swap the raw nominal column for its indicator columns. Both frames are\n",
"# re-indexed from 0 so that concat lines the rows up by position.\n",
"df = pd.concat(\n",
"    [\n",
"        df.drop(columns=categorical_columns).reset_index(drop=True),\n",
"        encoded_values_df.reset_index(drop=True),\n",
"    ],\n",
"    axis=1,\n",
")\n",
"\n",
"# Replace the remaining text columns with integer category codes. The same\n",
"# encoder object is refitted for every column, so afterwards it only holds\n",
"# the classes of the last one.\n",
"label_encoder = LabelEncoder()\n",
"for label_column in ['Country', 'Source', 'Industry']:\n",
"    df[label_column] = label_encoder.fit_transform(df[label_column])\n",
"\n",
"\n",
"print(df.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Создадим выборки данных по параметру места в рейтинге"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: (1560, 2602)\n",
"Размер контрольной выборки: (520, 2602)\n",
"Размер тестовой выборки: (520, 2602)\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# The ranking position is the target; every other column is a feature.\n",
"y = df['Rank ']\n",
"X = df.drop(columns=['Rank '])\n",
"\n",
"# 60 / 20 / 20 split: hold out 40% of the rows first, then cut that part in\n",
"# half to obtain the validation and test subsets.\n",
"X_train, X_temp, y_train, y_temp = train_test_split(\n",
"    X, y, test_size=0.4, random_state=42\n",
")\n",
"X_val, X_test, y_val, y_test = train_test_split(\n",
"    X_temp, y_temp, test_size=0.5, random_state=42\n",
")\n",
"\n",
"# Report the shape of every subset.\n",
"for subset_label, subset in (\n",
"    (\"обучающей\", X_train),\n",
"    (\"контрольной\", X_val),\n",
"    (\"тестовой\", X_test),\n",
"):\n",
"    print(f\"Размер {subset_label} выборки: {subset.shape}\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0kAAAIjCAYAAADWYVDIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB2gklEQVR4nO3dd3wUdf7H8ffuZtMbENKA0HsVFEUQUKoVxS4qop7e2bty6oHlTpETsWL5KaCCBXs5UUTEAgJSRUIvoSSBBdLbZvf7+yNk3SUBkpBkk/B6PswDd3a+M5+ZfHez752Z71iMMUYAAAAAAEmS1d8FAAAAAEBdQkgCAAAAAC+EJAAAAADwQkgCAAAAAC+EJAAAAADwQkgCAAAAAC+EJAAAAADwQkgCAAAAAC+EJAAAaklubq527typgwcP+rsUVLPs7Gxt375dubm5/i4FQDUgJAEAUIPmzJmjIUOGKCIiQuHh4UpKStIzzzzj77LqhZycHE2dOtXzOCMjQy+//LL/CvJijNHrr7+u0047TaGhoYqMjFTr1q317rvv+rs0ANXAYowx/i4CQM2ZMWOGxo0b53kcFBSkpKQkDR8+XI8++qji4uL8WB3QsD300EOaNGmSRo0apSuuuEIxMTGyWCzq0KGDWrRo4e/y6jyXy6WoqCi99tprGjhwoJ599lmtX79ec+fO9XdpuvLKK/XBBx9o7NixOu+88xQVFSWLxaIePXqoadOm/i4PwHEK8HcBAGrH448/rtatW6ugoEC//PKLpk2bpv/9739au3atQkND/V0e0OAsXLhQkyZN0lNPPaWHHnrI3+XUSzabTY899piuvfZaud1uRUZG6uuvv/Z3WXr77bf1wQcf6N1339VVV13l73IA1ACOJAENXOmRpGXLlunkk0/2TL/33ns1ZcoUzZ49W1deeaUfKwQapvPPP18HDhzQr7/+6u9S6r1du3Zp586d6ty5s6Kjo/1djrp3764ePXpo1qxZ/i4FQA3hmiTgBHXWWWdJkrZt2yZJOnDggO677z51795d4eHhioyM1Nlnn63Vq1eXaVtQUKCJEyeqQ4cOCg4OVkJCgkaPHq0tW7ZIkrZv3y6LxXLEn8GDB3uW9eOPP8piseiDDz7QP//5T8XHxyssLEwXXHCBdu7cWWbdS5Ys0ciRIxUVFaXQ0FANGjToiB9CBw8eXO76J06cWGbed999V3369FFISIgaN26sK664otz1H23bvLndbk2dOlVdu3ZVcHCw4uLidPPNN5e5YL9Vq1Y677zzyqzntttuK7PM8mqfPHlymX0qSYWFhZowYYLatWunoKAgtWjRQg888IAKCwvL3VfeSvfbhRdeWOa5m2++WRaLRd26davS9krSN998o0GDBikiIkKRkZE65ZRTNHv2bJ91H+2n1PTp03XWWWcpNjZWQUFB6tKli6ZNm3bM7ZOk6667zmeZjRo10uDBg/Xzzz9XqP0PP/ygM844Q2FhYYqOjtaoUaOUnJzsM89vv/2mbt266YorrlDjxo0VEhKiU045RZ999plnnpycHIWFhenOO+8ss45du3bJZrPpqaee8tTcqlWrMvMd3i927NihW265RR07dlRISIiaNGmiSy+9VNu3b/dpV/ra+/HHHz3Tli1bpmHDhikiIkJhYWHl7pMZM2bIYrHo999/90xzOBzl9s/zzjuv3Jor8jqeOHGi5/fdvHlz9evXTwEBAYqPjy9Td3lK25f+REREqG/fvj77Xyrpc4f3Z2+lr/kZM2ZIKhl8Y+3atWrRooXOPfdcRUZGHnFfSdLWrVt16aWXqnHjxgoNDdVpp51W5mhYZd4HBw8eXOb1/u9//1tWq9XzOipVmfdLAL443Q44QZUGmiZNmkgq+UP+2Wef6dJLL1Xr1q2Vnp6u1157TYMGDdK6deuUmJgoqeQagfPOO0/z58/XFVdcoTvvvFPZ2dmaN2+e1q5dq7Zt23rWceWVV+qcc87xWe
/48ePLreff//63LBaLHnzwQe3du1dTp07V0KFDtWrVKoWEhEgq+WB69tlnq0+fPpowYYKsVqvng/LPP/+svn37lllu8+bNPR8yc3Jy9I9//KPcdT/66KO67LLLdOONN2rfvn168cUXNXDgQK1cubLcb65vuukmnXHGGZKkTz75RJ9++qnP8zfffLPnKN4dd9yhbdu26aWXXtLKlSv166+/ym63l7sfKiMjI8Ozbd7cbrcuuOAC/fLLL7rpppvUuXNn/fHHH3ruuee0cePGMh8SyxMcHKyvv/5ae/fuVWxsrCQpPz9fH3zwgYKDg8vMX9HtnTFjhq6//np17dpV48ePV3R0tFauXKm5c+fqqquu0sMPP6wbb7xRUskH77vvvttnX3ubNm2aunbtqgsuuEABAQH68ssvdcstt8jtduvWW2895jbGxMToueeek1QSSJ5//nmdc8452rlz51GPVnz//fc6++yz1aZNG02cOFH5+fl68cUX1b9/f61YscITCvbv36/XX39d4eHhuuOOO9S0aVO9++67Gj16tGbNmqUrr7xS4eHhuuiii/TBBx9oypQpstlsnvW89957MsZozJgxx9wWb8uWLdOiRYt0xRVXqHnz5tq+fbumTZumwYMHa926dUc8vXbz5s0aPHiwQkNDdf/99ys0NFRvvPGGhg4dqnnz5mngwIGVquNIqvI6LvXss88qPT29Uut75513JJX0p1deeUWXXnqp1q5dq44dO1ap/v3790uSJk2apPj4eN1///0KDg4ud1+lp6fr9NNPV15enu644w41adJEM2fO1AUXXKCPPvpIF110kc+yK/I+eLjp06frkUce0bPPPutz6t/x7GcAkgyABm369OlGkvn+++/Nvn37zM6dO837779vmjRpYkJCQsyuXbuMMcYUFBQYl8vl03bbtm0mKCjIPP74455pb731lpFkpkyZUmZdbrfb006SmTx5cpl5unbtagYNGuR5vGDBAiPJNGvWzGRlZXmmf/jhh0aSef755z3Lbt++vRkxYoRnPcYYk5eXZ1q3bm2GDRtWZl2nn3666datm+fxvn37jCQzYcIEz7Tt27cbm81m/v3vf/u0/eOPP0xAQECZ6Zs2bTKSzMyZMz3TJkyYYLzfTn/++WcjycyaNcun7dy5c8tMb9mypTn33HPL1H7rrbeaw9+iD6/9gQceMLGxsaZPnz4++/Sdd94xVqvV/Pzzzz7tX331VSPJ/Prrr2XW523QoEGma9eupkePHua///2vz3KbN29uzjjjDNO1a9dKb29GRoaJiIgwp556qsnPz/eZ1/t3Wqq0H02fPr3cOvPy8spMGzFihGnTps1Rt88YY8aOHWtatmzpM+311183kszSpUuP2rZXr14mNjbW7N+/3zNt9erVxmq1mmuvvdYzTZKRZH788Uefmjt37mzi4+NNUVGRMcaYb7/91kgy33zzjc96evTo4fN7HTdunElKSipTz+H9orz9snjxYiPJvP32255ppa+9BQsWGGOMufjii43NZjNr1671zONwOEyTJk1Mnz59PNNK31OWLVvmmVbea8sYY84991yf/VyZ1/Hhr6u9e/eaiIgIc/bZZ/vUfSSHtzfGmO+++85IMh9++KFnWml/P5LD+2Hp48DAQLNx40affXD4vrrrrruMJJ/XYnZ2tmndurVp1aqV5z23ou+DpfWW9ouvv/7aBAQEmHvvvden5qq8XwLwxel2wAli6NChatq0qVq0aKErrrhC4eHh+vTTT9WsWTNJJaPeWa0lbwkul0v79+9XeHi4OnbsqBUrVniW8/HHHysmJka33357mXUcfnpYZVx77bWKiIjwPL7kkkuUkJCg//3vf5KkVatWadOmTbrqqqu0f/9+ORwOORwO5ebmasiQIfrpp5/kdrt9lllQUFDuUQ9vn3zyidxuty677DLPMh0Oh+Lj49W+fXstWLDAZ/6ioiJJJfvrSObMmaOoqCgNGzbMZ5l9+vRReH
h4mWU6nU6f+RwOhwoKCo5a9+7du/Xiiy/q0UcfVXh4eJn1d+7cWZ06dfJZZukploev/0jGjRun6dOnex5Pnz5dY8e
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0kAAAIjCAYAAADWYVDIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB7+UlEQVR4nO3dd3xUVf7/8ffMZDKZNEIS0giBEKpURVEs2KhWRNcCKmIBV3BVbF/bAuqubde2urjubwXcFQsquhZQRAFRQOmCEUMoERICk5DeJjP390fI7AwJpJBkkvB6Ph55wNy5597PvXNmMu/ce881GYZhCAAAAAAgSTL7uwAAAAAAaE0ISQAAAADghZAEAAAAAF4ISQAAAADghZAEAAAAAF4ISQAAAADghZAEAAAAAF4ISQAAAADghZAEAADQTNxutxwOh3bu3OnvUgA0ACEJAAC0OatWrdLy5cs9j5cvX67vvvvOfwV52b9/v+6++2517dpVgYGB6tSpk0466SQVFBT4uzQA9URIAtqJefPmyWQyeX6CgoLUq1cvTZ8+XdnZ2f4uDwCa1G+//aY77rhDP/30k3766Sfdcccd+u233/xdlnbs2KHTTjtN77zzjqZOnapPP/1US5cu1bJlyxQSEuLv8gDUU4C/CwDQtB5//HElJyerrKxMq1at0pw5c/T5559r69atCg4O9nd5ANAkxo8frxdffFEDBw6UJA0bNkzjx4/3c1XS1KlTFRgYqDVr1qhz587+LgdAIxGSgHZm7NixOvXUUyVJt956q6KiovT888/r448/1nXXXefn6gCgadhsNn3//ffaunWrJKl///6yWCx+rWn9+vX6+uuv9eWXXxKQgDaO0+2Adu6CCy6QJO3atUuSlJubq/vuu08DBgxQaGiowsPDNXbsWG3evLlG27KyMs2aNUu9evVSUFCQ4uPjNX78eKWnp0uSdu/e7XOK35E/5513nmdZy5cvl8lk0rvvvquHH35YcXFxCgkJ0WWXXVbrKTJr167VmDFj1KFDBwUHB+vcc8896vUG5513Xq3rnzVrVo15//Of/2jIkCGy2+2KjIzUtddeW+v6j7Vt3txut1588UX169dPQUFBio2N1dSpU3Xo0CGf+bp166ZLLrmkxnqmT59eY5m11f7cc8/V2KeSVF5erpkzZ6pHjx6y2Wzq0qWLHnjgAZWXl9e6r7xV77dx48bVeG7q1KkymUzq379/o7ZXkhYvXqxzzz1XYWFhCg8P12mnnaYFCxb4rPtYP9Xmzp2rCy64QDExMbLZbDrppJM0Z86cOrdPkm666SafZXbs2FHnnXeevv3223q17datm8+0//znPzKbzXr66ad9pn/99dc655xzFBISooiICF1++eVKTU31mWfWrFkymUxyOBw+09etWyeTyaR58+bVWnNtP7t375b0v3715ZdfavDgwQoKCtJJJ52kDz/8sMb27Ny5U7/73e8UGRmp4OBgnXHGGfrss8/qtd9qe0/ddNNNCg0NrXM/NqTvV1ZW6oknnlBKSopsNpu6deumhx9+uEZ/7tatm2666SZZLBYNGjRIgwYN0ocffiiTyVTjNTtaTdXbZDabFRcXp2uuuUYZGRmeeao/A/7yl78cdTnVr2m1NWvWKCgoSOnp6erXr59sNpvi4uI0depU5ebm1mi/cOFCz+dRdHS0rr/+eu3bt89nnur9vHPnTo0ePVohISFKSEjQ448/LsMwatRb3Y8kqbCwUEOGDFFycrKysrI80xvyPgZOVBxJAtq56kATFRUlqeqL0kcffaTf/e53Sk5OVnZ2tv7xj3/o3HPP1c8//6yEhARJksvl0iWXXKJly5bp2muv1V133aXCwkItXbpUW7duVUpKimcd1113nS666CKf9T700EO11vOnP/1JJpNJDz74oA4cOKAXX3xRI0aM0KZNm2S32yVVfeEcO3ashgwZopkzZ8psNnu+KH/77bcaOnRojeUmJi
bqqaeekiQVFRXp97//fa3rfuyxx3T11Vfr1ltv1cGDB/W3v/1Nw4cP18aNGxUREVGjzZQpU3TOOedIkj788EMtWrTI5/mpU6dq3rx5mjx5sv7whz9o165deuWVV7Rx40Z99913slqtte6HhsjLy/Nsmze3263LLrtMq1at0pQpU9S3b1/99NNPeuGFF/Trr7/qo48+qnPZQUFB+uyzz3TgwAHFxMRIkkpLS/Xuu+8qKCioxvz13d558+bp5ptvVr9+/fTQQw8pIiJCGzdu1JIlSzRhwgQ98sgjuvXWWyVJDodD99xzj8++9jZnzhz169dPl112mQICAvTJJ5/ojjvukNvt1rRp0+rcxujoaL3wwguSpL179+qll17SRRddpN9++63W1/xovvzyS918882aPn26/u///s8z/auvvtLYsWPVvXt3zZo1S6Wlpfrb3/6ms846Sxs2bKjXl3ZvU6dO1YgRIzyPb7jhBl1xxRU+p5J16tTJ8/+0tDRdc801uv322zVp0iTNnTtXv/vd77RkyRKNHDlSkpSdna0zzzxTJSUl+sMf/qCoqCjNnz9fl112md5//31dccUVNerw3m/VdTS3W2+9VfPnz9dVV12le++9V2vXrtVTTz2l1NTUGu89b5WVlXrkkUcatK5zzjlHU6ZMkdvt1tatW/Xiiy8qMzOzXgH6aHJyclRWVqbf//73uuCCC3T77bcrPT1dr776qtauXau1a9fKZrNJkud9dNppp+mpp55Sdna2XnrpJX333Xc1Po9cLpfGjBmjM844Q88++6yWLFmimTNnqrKyUo8//nittTidTl155ZXKyMjQd999p/j4eM9zLfG5BbR5BoB2Ye7cuYYk46uvvjIOHjxo/Pbbb8Y777xjREVFGXa73di7d69hGIZRVlZmuFwun7a7du0ybDab8fjjj3umvfHGG4Yk4/nnn6+xLrfb7WknyXjuuedqzNOvXz/j3HPP9Tz+5ptvDElG586djYKCAs/09957z5BkvPTSS55l9+zZ0xg9erRnPYZhGCUlJUZycrIxcuTIGus688wzjf79+3seHzx40JBkzJw50zNt9+7dhsViMf70pz/5tP3pp5+MgICAGtPT0tIMScb8+fM902bOnGl4f2x+++23hiTjrbfe8mm7ZMmSGtO7du1qXHzxxTVqnzZtmnHkR/GRtT/wwANGTEyMMWTIEJ99+u9//9swm83Gt99+69P+tddeMyQZ3333XY31eTv33HONfv36GQMHDjT+8pe/+Cw3MTHROOecc4x+/fo1eHvz8vKMsLAw4/TTTzdKS0t95vV+TatV96O5c+fWWmdJSUmNaaNHjza6d+9+zO0zDMOYNGmS0bVrV59pr7/+uiHJ+OGHH+rddt26dUZoaKjxu9/9rsb7Z/DgwUZMTIyRk5PjmbZ582bDbDYbN954o2dadf85ePCgT/sff/zxmNt/ZH/w1rVrV0OS8cEHH3im5efnG/Hx8cbJJ5/smXb33Xcbknz6SmFhoZGcnGx069atxjZNnDjRSE5OPmYdkyZNMkJCQmqt68ga69P3N23aZEgybr31Vp/57rvvPkOS8fXXX/ssc9KkSZ7Hf//73w2bzWacf/75NV7vo9Xk3d4wDGPChAlGcHCw5/GxPt+qHfmZUP34wgsvNCorKz3Tqz+f//a3vxmGYRgVFRVGTEyM0b9/f5/3yKeffmpIMv74xz96pk2aNMmQZNx5552eaW6327j44ouNwMBAT3/yfh+53W5j4sSJRnBwsLF27VqfmhvyuQWcyDjdDmhnRowYoU6dOqlLly669tprFRoaqkWLFnnOj7fZbDKbq976LpdLOTk5Cg0NVe/evbVhwwbPcj744ANFR0frzjvvrLGOI0+RaYgbb7xRYWFhnsdXXXWV4uPj9fnnn0uSNm3apLS0NE2YMEE5OTlyOBxyOBwqLi7WhRdeqJUrV8rtdvsss6ysrNajHt4+/PBDud1uXX311Z5lOhwOxc
XFqWfPnvrmm2985q+oqJAkz199a7Nw4UJ16NBBI0eO9FnmkCFDFBoaWmOZTqfTZz6Hw6GysrJj1r1v3z797W9/02O
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0kAAAIjCAYAAADWYVDIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB2TUlEQVR4nO3dd3hUZf7+8Xtmkkx6QhLSCKE3aQoIXxZFVKpd3EUBFRHFdWHXFdsPK7juorLWVdHdVcBe1rZ2USmi2OhIQIhA6Gmkt8nM8/sjZHaGBEiGJJOE9+u65krmzHnO+ZxznszMndMsxhgjAAAAAIAkyervAgAAAACgOSEkAQAAAIAHQhIAAAAAeCAkAQAAAIAHQhIAAAAAeCAkAQAAAIAHQhIAAAAAeCAkAQAAAIAHQhIAAEAtjDHKzc3Vtm3b/F0KgCZGSAIAAE1q06ZNeu+999zP161bp48++sh/BXkoLCzU3XffrR49eigoKEixsbHq3r27tm7d6u/SADQhQhIAt0WLFslisbgfwcHB6t69u2bOnKmDBw/6uzwArURhYaFuuOEGfffdd9q2bZtuuukmbdy40d9lKScnR0OHDtWTTz6p3/72t3r//fe1ZMkSLVu2TB07dvR3eQCaUIC/CwDQ/Nx///3q1KmTysrKtHLlSi1YsEAff/yxNm3apNDQUH+XB6CFGzp0qPshSd27d9f111/v56qk2267Tfv379eqVavUu3dvf5cDwI8ISQBqGDdunAYNGiRJuu666xQbG6tHH31U77//viZOnOjn6gC0Bu+99542b96s0tJS9e3bV0FBQX6tJzMzU4sXL9azzz5LQALA4XYAju+cc86RJO3YsUOSlJubq1tvvVV9+/ZVeHi4IiMjNW7cOK1fv75G27KyMs2ZM0fdu3dXcHCwkpKSNH78eKWnp0uSdu7c6XWI35GPESNGuKe1bNkyWSwWvfHGG7rzzjuVmJiosLAwXXTRRdq9e3eNeX///fcaO3asoqKiFBoaqrPOOkvffPNNrcs4YsSIWuc/Z86cGuO+/PLLGjhwoEJCQhQTE6Mrrrii1vkfa9k8uVwuPf744+rdu7eCg4OVkJCgG264QYcOHfIar2PHjrrgggtqzGfmzJk1pllb7fPnz6+xTiWpvLxc9913n7p27Sq73a727dvr9ttvV3l5ea3rylP1ervkkktqvHbDDTfIYrGoT58+Pi2vJH3yySc666yzFBERocjISJ1++ul69dVXveZ9rEe1hQsX6pxzzlF8fLzsdrtOOeUULViw4LjLJ0nXXHON1zTbtGmjESNG6Ouvv65Xu9oeO3fu9FrWM888U2FhYYqIiND555+vn3/+ucZ0t2zZogkTJqht27YKCQlRjx49dNddd0mS5syZc9x5Llu2zD2tt956y92X4+LidOWVV2rv3r0+L/8zzzyj3r17y263Kzk5WTNmzFBeXp7XOCNGjHD3wVNOOUUDBw7U+vXra/3bqM2R2z0uLk7nn3++Nm3a5DWexWLRzJkzjzqd6sOLq7fBjz/+KJfLpYqKCg0aNEjBwcGKjY3VxIkTlZGRUaP9V1995d5e0dHRuvjii5WWluY1TvX2qN5mkZGRio2N1U033aSysrIa9Xr+zVZWVuq8885TTEyMNm/e7DVuXd+DAPiOPUkAjqs60MTGxkqSfv31V7333nv63e9+p06dOungwYN67rnndNZZZ2nz5s1KTk6WJDmdTl1wwQX68ssvdcUVV+imm25SYWGhlixZok2bNqlLly7ueUycOFHnnXee13xnz55daz1//etfZbFYdMcddygzM1OPP/64Ro4cqXXr1ikkJERS1ReYcePGaeDAgbrvvvtktVrdX5S//vprDR48uMZ0U1JSNG/ePElSUVGRbrzxxlrnfc8992jChAm67rrrlJWVpX/84x8aPny41q5dq+jo6Bptpk+frjPPPFOS9M477+jdd9/1ev2GG27QokWLNHXqVP
3pT3/Sjh079NRTT2nt2rX65ptvFBgYWOt6qI+8vDz3snlyuVy66KKLtHLlSk2fPl29evXSxo0b9dhjj+mXX37xOrn+aIKDg/XRRx8pMzNT8fHxkqTS0lK98cYbCg4OrjF+XZd30aJFuvbaa9W7d2/Nnj1b0dHRWrt2rT799FNNmjRJd911l6677jpJUnZ2tm6++Wavde1pwYIF6t27ty666CIFBATogw8+0B/+8Ae5XC7NmDHjuMsYFxenxx57TJK0Z88ePfHEEzrvvPO0e/fuWrd59XKOHDnS/fyqq67SpZdeqvHjx7uHtW3bVpL00ksvacqUKRozZoweeughlZSUaMGCBTrjjDO0du1a9/kwGzZs0JlnnqnAwEBNnz5dHTt2VHp6uj744AP99a9/1fjx49W1a1f39G+++Wb16tVL06dPdw/r1auXe/1OnTpVp59+uubNm6eDBw/qiSee0DfffFOjL9dl+efMmaO5c+dq5MiRuvHGG7V161YtWLBAP/7443H78R133HGcLeCtZ8+euuuuu2SMUXp6uh599FGdd955tYaZusrJyZFU9U+HgQMH6sEHH1RWVpaefPJJrVy5UmvXrlVcXJwk6YsvvtC4cePUuXNnzZkzR6WlpfrHP/6hYcOGac2aNTXOX5owYYI6duyoefPm6bvvvtOTTz6pQ4cO6cUXXzxqPdddd52WLVumJUuW6JRTTnEP9+U9CIAPDAActnDhQiPJfPHFFyYrK8vs3r3bvP766yY2NtaEhISYPXv2GGOMKSsrM06n06vtjh07jN1uN/fff7972AsvvGAkmUcffbTGvFwul7udJDN//vwa4/Tu3ducddZZ7udLly41kky7du1MQUGBe/ibb75pJJknnnjCPe1u3bqZMWPGuOdjjDElJSWmU6dOZtSoUTXm9Zvf/Mb06dPH/TwrK8tIMvfdd5972M6dO43NZjN//etfvdpu3LjRBAQE1Bi+bds2I8ksXrzYPey+++4znm+9X3/9tZFkXnnlFa+2n376aY3hHTp0MOeff36N2mfMmGGOfDs/svbbb7/dxMfHm4EDB3qt05deeslYrVbz9ddfe7V/9tlnjSTzzTff1Jifp7POOsv07t3b9OvXz/z973/3mm5KSoo588wzTe/eveu9vHl5eSYiIsIMGTLElJaWeo3ruU2rVfejhQsX1lpnSUlJjWFjxowxnTt3PubyGWPMlClTTIcOHbyG/fOf/zSSzA8//HDc9tWO3CbVCgsLTXR0tLn++uu9hh84cMBERUV5DR8+fLiJiIgwu3bt8hq3tnViTFWfmTJlSo3hFRUVJj4+3vTp08dr/X744YdGkrn33nvdw+qy/JmZmSYoKMiMHj3a673hqaeeMpLMCy+84B521llnefXBjz/+2EgyY8eOrdGPa3Nke2OMufPOO40kk5mZ6R4mycyYMeOo06l+v9uxY4fX81NOOcWrv1S/79xyyy3uYaeeeqqJj483OTk57mHr1683VqvVXH311e5h1X/vF110kde8//CHPxhJZv369V71VveP2bNnG5vNZt577z2vdvV9DwLgOw63A1DDyJEj1bZtW7Vv315XXHGFwsPD9e6776pdu3aSJLvdLqu16u3D6XQqJydH4eHh6tGjh9asWeOezttvv624uDj98Y9/rDGPuhxWczRXX321IiIi3M9/+9vfKikpSR9//LGkqssJb9u2TZMmTVJOTo6ys7OVnZ2t4uJinXvuuVqxYoVcLpfXNMvKymrd6+HpnXfekcvl0oQJE9zTzM7OVmJiorp166alS5d6jV9RUSGpan0dzVtvvaWoqCiNGjXKa5oDBw5UeHh4jWk6HA6v8bKzs2sctnOkvXv36h//+IfuuecehYeH15h/r1691LNnT69pVh9ieeT8j2bq1KlauHCh+/nChQs1ZcoUdz+p7/IuWbJEhYWF+n//7//V2C6+9J3qPYySlJ+fr+zsbJ111ln69ddflZ+ff9z2LpfLXe
u6dev04osvKikpyb1X5kQsWbJEeXl5mjhxotc6sdlsGjJkiHudZGVlacWKFbr22muVmprqNY36rpOffvpJmZmZ+sM
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"def plot_distribution(y_data, title, xlabel='Rank '):\n",
"    \"\"\"Show a 50-bin histogram with a KDE curve for one sample of values.\n",
"\n",
"    Args:\n",
"        y_data: Values to plot (in this notebook: the rank target of a subset).\n",
"        title: Title placed above the figure.\n",
"        xlabel: Label of the x axis; defaults to the target column name.\n",
"    \"\"\"\n",
"    # Explicit Axes object instead of the implicit pyplot state machine.\n",
"    fig, ax = plt.subplots(figsize=(10, 6))\n",
"    sns.histplot(y_data, kde=True, bins=50, ax=ax)\n",
"    ax.set_title(title)\n",
"    ax.set_xlabel(xlabel)\n",
"    ax.set_ylabel('Frequency')\n",
"    ax.grid(True)\n",
"    plt.show()\n",
"\n",
"\n",
"# Compare the distribution of the rank target across the three subsets.\n",
"plot_distribution(y_train, \"Распределение места в обучающей выборке\")\n",
"plot_distribution(y_val, \"Распределение места в контрольной выборке\")\n",
"plot_distribution(y_test, \"Распределение места в тестовой выборке\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Применим min-max нормировку и стандартизацию признаков для улучшения качества работы модели"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Нормированные данные:\n",
" Networth Age Country Source Industry \\\n",
"0 1.0 0.319728 0.945946 0.137584 0.000000 \n",
"1 1.0 0.428571 0.945946 0.005593 0.882353 \n",
"2 1.0 0.632653 0.270270 0.081655 0.176471 \n",
"3 1.0 0.537415 0.945946 0.090604 0.882353 \n",
"4 1.0 0.877551 0.945946 0.012304 0.235294 \n",
"\n",
" Name_Abdulla Al Futtaim & family \\\n",
"0 0.0 \n",
"1 0.0 \n",
"2 0.0 \n",
"3 0.0 \n",
"4 0.0 \n",
"\n",
" Name_Abdulla bin Ahmad Al Ghurair & family Name_Abdulsamad Rabiu \\\n",
"0 0.0 0.0 \n",
"1 0.0 0.0 \n",
"2 0.0 0.0 \n",
"3 0.0 0.0 \n",
"4 0.0 0.0 \n",
"\n",
" Name_Abhay Firodia Name_Abigail Johnson ... Name_Zhu Yan & family \\\n",
"0 0.0 0.0 ... 0.0 \n",
"1 0.0 0.0 ... 0.0 \n",
"2 0.0 0.0 ... 0.0 \n",
"3 0.0 0.0 ... 0.0 \n",
"4 0.0 0.0 ... 0.0 \n",
"\n",
" Name_Zhu Yiming Name_Zhu Yiwen & family Name_Zhuo Jun \\\n",
"0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 \n",
"\n",
" Name_Ziv Aviram Name_Zong Qinghou Name_Zong Yanmin Name_Zugen Ni \\\n",
"0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 \n",
"\n",
" Name_Zuowen Song Name_Zygmunt Solorz-Zak \n",
"0 0.0 0.0 \n",
"1 0.0 0.0 \n",
"2 0.0 0.0 \n",
"3 0.0 0.0 \n",
"4 0.0 0.0 \n",
"\n",
"[5 rows x 2602 columns]\n",
"\n",
"Стандартизированные данные:\n",
" Networth Age Country Source Industry \\\n",
"0 2.266803 -1.081352 1.173910 -1.505003 -1.701719 \n",
"1 2.266803 -0.475422 1.173910 -2.004526 1.339990 \n",
"2 2.266803 0.660697 -0.805574 -1.716665 -1.093377 \n",
"3 2.266803 0.130508 1.173910 -1.682800 1.339990 \n",
"4 2.266803 2.024040 1.173910 -1.979126 -0.890597 \n",
"\n",
" Name_Abdulla Al Futtaim & family \\\n",
"0 -0.019615 \n",
"1 -0.019615 \n",
"2 -0.019615 \n",
"3 -0.019615 \n",
"4 -0.019615 \n",
"\n",
" Name_Abdulla bin Ahmad Al Ghurair & family Name_Abdulsamad Rabiu \\\n",
"0 -0.019615 -0.019615 \n",
"1 -0.019615 -0.019615 \n",
"2 -0.019615 -0.019615 \n",
"3 -0.019615 -0.019615 \n",
"4 -0.019615 -0.019615 \n",
"\n",
" Name_Abhay Firodia Name_Abigail Johnson ... Name_Zhu Yan & family \\\n",
"0 -0.019615 -0.019615 ... -0.019615 \n",
"1 -0.019615 -0.019615 ... -0.019615 \n",
"2 -0.019615 -0.019615 ... -0.019615 \n",
"3 -0.019615 -0.019615 ... -0.019615 \n",
"4 -0.019615 -0.019615 ... -0.019615 \n",
"\n",
" Name_Zhu Yiming Name_Zhu Yiwen & family Name_Zhuo Jun \\\n",
"0 -0.019615 -0.019615 -0.019615 \n",
"1 -0.019615 -0.019615 -0.019615 \n",
"2 -0.019615 -0.019615 -0.019615 \n",
"3 -0.019615 -0.019615 -0.019615 \n",
"4 -0.019615 -0.019615 -0.019615 \n",
"\n",
" Name_Ziv Aviram Name_Zong Qinghou Name_Zong Yanmin Name_Zugen Ni \\\n",
"0 -0.019615 -0.019615 -0.019615 -0.019615 \n",
"1 -0.019615 -0.019615 -0.019615 -0.019615 \n",
"2 -0.019615 -0.019615 -0.019615 -0.019615 \n",
"3 -0.019615 -0.019615 -0.019615 -0.019615 \n",
"4 -0.019615 -0.019615 -0.019615 -0.019615 \n",
"\n",
" Name_Zuowen Song Name_Zygmunt Solorz-Zak \n",
"0 -0.019615 -0.019615 \n",
"1 -0.019615 -0.019615 \n",
"2 -0.019615 -0.019615 \n",
"3 -0.019615 -0.019615 \n",
"4 -0.019615 -0.019615 \n",
"\n",
"[5 rows x 2602 columns]\n"
]
}
],
"source": [
"from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
"\n",
"# Fit both scalers on the training subset only. Fitting on the full matrix X\n",
"# would let validation/test rows influence the min/max and mean/std used for\n",
"# preprocessing (data leakage).\n",
"min_max_scaler = MinMaxScaler().fit(X_train)\n",
"standard_scaler = StandardScaler().fit(X_train)\n",
"\n",
"# Transform the whole feature matrix with the train-fitted parameters.\n",
"# The original index is kept, so the rows of a subset can be selected with\n",
"# e.g. X_normalized.loc[X_train.index]. Rows outside the training subset may\n",
"# fall slightly outside [0, 1] after min-max scaling.\n",
"X_normalized = pd.DataFrame(\n",
"    min_max_scaler.transform(X), columns=X.columns, index=X.index\n",
")\n",
"X_standardized = pd.DataFrame(\n",
"    standard_scaler.transform(X), columns=X.columns, index=X.index\n",
")\n",
"\n",
"# First 5 rows after min-max normalization.\n",
"print(\"Нормированные данные:\")\n",
"print(X_normalized.head())\n",
"\n",
"# First 5 rows after standardization.\n",
"print(\"\\nСтандартизированные данные:\")\n",
"print(X_standardized.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-11-02 00:19:57 +04:00
"# Приведём пример использования Featuretools\n",
"## Попробую вынести страну в отдельную таблицу"
2024-11-01 17:36:00 +04:00
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting featuretoolsNote: you may need to restart the kernel to use updated packages.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"[notice] A new release of pip is available: 24.2 -> 24.3.1\n",
"[notice] To update, run: python.exe -m pip install --upgrade pip\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
" Downloading featuretools-1.31.0-py3-none-any.whl.metadata (15 kB)\n",
"Collecting cloudpickle>=1.5.0 (from featuretools)\n",
" Downloading cloudpickle-3.1.0-py3-none-any.whl.metadata (7.0 kB)\n",
"Collecting holidays>=0.17 (from featuretools)\n",
" Downloading holidays-0.59-py3-none-any.whl.metadata (25 kB)\n",
"Requirement already satisfied: numpy>=1.25.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from featuretools) (2.1.1)\n",
"Requirement already satisfied: packaging>=20.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from featuretools) (24.1)\n",
"Requirement already satisfied: pandas>=2.0.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from featuretools) (2.2.2)\n",
"Requirement already satisfied: psutil>=5.7.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from featuretools) (6.0.0)\n",
"Requirement already satisfied: scipy>=1.10.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from featuretools) (1.14.1)\n",
"Collecting tqdm>=4.66.3 (from featuretools)\n",
" Downloading tqdm-4.66.6-py3-none-any.whl.metadata (57 kB)\n",
"Collecting woodwork>=0.28.0 (from featuretools)\n",
" Downloading woodwork-0.31.0-py3-none-any.whl.metadata (10 kB)\n",
"Requirement already satisfied: python-dateutil in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from holidays>=0.17->featuretools) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from pandas>=2.0.0->featuretools) (2024.1)\n",
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from pandas>=2.0.0->featuretools) (2024.1)\n",
"Requirement already satisfied: colorama in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from tqdm>=4.66.3->featuretools) (0.4.6)\n",
"Requirement already satisfied: scikit-learn>=1.1.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from woodwork>=0.28.0->featuretools) (1.5.2)\n",
"Collecting importlib-resources>=5.10.0 (from woodwork>=0.28.0->featuretools)\n",
" Downloading importlib_resources-6.4.5-py3-none-any.whl.metadata (4.0 kB)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from python-dateutil->holidays>=0.17->featuretools) (1.16.0)\n",
"Requirement already satisfied: joblib>=1.2.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from scikit-learn>=1.1.0->woodwork>=0.28.0->featuretools) (1.4.2)\n",
"Requirement already satisfied: threadpoolctl>=3.1.0 in c:\\users\\annal\\aim\\.venv\\lib\\site-packages (from scikit-learn>=1.1.0->woodwork>=0.28.0->featuretools) (3.5.0)\n",
"Downloading featuretools-1.31.0-py3-none-any.whl (587 kB)\n",
" ---------------------------------------- 0.0/587.9 kB ? eta -:--:--\n",
" ----------------- ---------------------- 262.1/587.9 kB ? eta -:--:--\n",
" ---------------------------------------- 587.9/587.9 kB 1.5 MB/s eta 0:00:00\n",
"Downloading cloudpickle-3.1.0-py3-none-any.whl (22 kB)\n",
"Downloading holidays-0.59-py3-none-any.whl (1.1 MB)\n",
" ---------------------------------------- 0.0/1.1 MB ? eta -:--:--\n",
" --------- ------------------------------ 0.3/1.1 MB ? eta -:--:--\n",
" ---------------------------- ----------- 0.8/1.1 MB 1.9 MB/s eta 0:00:01\n",
" ---------------------------------------- 1.1/1.1 MB 2.2 MB/s eta 0:00:00\n",
"Downloading tqdm-4.66.6-py3-none-any.whl (78 kB)\n",
"Downloading woodwork-0.31.0-py3-none-any.whl (215 kB)\n",
"Downloading importlib_resources-6.4.5-py3-none-any.whl (36 kB)\n",
"Installing collected packages: tqdm, importlib-resources, cloudpickle, holidays, woodwork, featuretools\n",
"Successfully installed cloudpickle-3.1.0 featuretools-1.31.0 holidays-0.59 importlib-resources-6.4.5 tqdm-4.66.6 woodwork-0.31.0\n"
]
}
],
"source": [
"# %pip installs into the environment of the running kernel; the version is\n",
"# pinned to the one this notebook was executed with.\n",
"%pip install --upgrade featuretools==1.31.0"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting setuptools\n",
" Downloading setuptools-75.3.0-py3-none-any.whl.metadata (6.9 kB)\n",
"Downloading setuptools-75.3.0-py3-none-any.whl (1.3 MB)\n",
" ---------------------------------------- 0.0/1.3 MB ? eta -:--:--\n",
" ---------------- ----------------------- 0.5/1.3 MB 3.4 MB/s eta 0:00:01\n",
" ---------------------------------------- 1.3/1.3 MB 3.7 MB/s eta 0:00:00\n",
"Installing collected packages: setuptools\n",
"Successfully installed setuptools-75.3.0\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"[notice] A new release of pip is available: 24.2 -> 24.3.1\n",
"[notice] To update, run: python.exe -m pip install --upgrade pip\n"
]
}
],
"source": [
"# %pip installs into the environment of the running kernel.\n",
"%pip install --upgrade setuptools"
]
2024-11-02 00:19:57 +04:00
},
{
"cell_type": "code",
2024-11-02 04:40:32 +04:00
"execution_count": null,
2024-11-02 00:19:57 +04:00
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index billioner_id not found in dataframe, creating new integer column\n",
" warnings.warn(\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function max at 0x000001952157A520> is currently using SeriesGroupBy.max. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"max\" instead.\n",
" ).agg(to_agg)\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function std at 0x000001952157B060> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"std\" instead.\n",
" ).agg(to_agg)\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function sum at 0x0000019521579B20> is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n",
" ).agg(to_agg)\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function min at 0x000001952157A660> is currently using SeriesGroupBy.min. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"min\" instead.\n",
" ).agg(to_agg)\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function mean at 0x000001952157AF20> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
" ).agg(to_agg)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Rank</th>\n",
" <th>Networth</th>\n",
" <th>Age</th>\n",
" <th>Industry</th>\n",
" <th>id</th>\n",
" <th>country_id</th>\n",
" <th>country_table.id</th>\n",
" <th>country_table.Country</th>\n",
" <th>country_table.COUNT(other_about_billioner)</th>\n",
" <th>country_table.MAX(other_about_billioner.Age)</th>\n",
" <th>...</th>\n",
" <th>country_table.SKEW(other_about_billioner.Rank )</th>\n",
" <th>country_table.SKEW(other_about_billioner.id)</th>\n",
" <th>country_table.STD(other_about_billioner.Age)</th>\n",
" <th>country_table.STD(other_about_billioner.Networth)</th>\n",
" <th>country_table.STD(other_about_billioner.Rank )</th>\n",
" <th>country_table.STD(other_about_billioner.id)</th>\n",
" <th>country_table.SUM(other_about_billioner.Age)</th>\n",
" <th>country_table.SUM(other_about_billioner.Networth)</th>\n",
" <th>country_table.SUM(other_about_billioner.Rank )</th>\n",
" <th>country_table.SUM(other_about_billioner.id)</th>\n",
" </tr>\n",
" <tr>\n",
" <th>billioner_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>219</td>\n",
" <td>50</td>\n",
" <td>Automotive</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>United States</td>\n",
" <td>1</td>\n",
" <td>50.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>50.0</td>\n",
" <td>219.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>171</td>\n",
" <td>58</td>\n",
" <td>Technology</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>United States</td>\n",
" <td>1</td>\n",
" <td>58.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>58.0</td>\n",
" <td>171.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>158</td>\n",
" <td>73</td>\n",
" <td>Fashion &amp; Retail</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>France</td>\n",
" <td>1</td>\n",
" <td>73.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>73.0</td>\n",
" <td>158.0</td>\n",
" <td>3.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>129</td>\n",
" <td>66</td>\n",
" <td>Technology</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>United States</td>\n",
" <td>1</td>\n",
" <td>66.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>66.0</td>\n",
" <td>129.0</td>\n",
" <td>4.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>118</td>\n",
" <td>91</td>\n",
" <td>Finance &amp; Investments</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>United States</td>\n",
" <td>1</td>\n",
" <td>91.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>91.0</td>\n",
" <td>118.0</td>\n",
" <td>5.0</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2595</th>\n",
" <td>2578</td>\n",
" <td>1</td>\n",
" <td>80</td>\n",
" <td>Healthcare</td>\n",
" <td>2595</td>\n",
" <td>2595</td>\n",
" <td>2595</td>\n",
" <td>Spain</td>\n",
" <td>1</td>\n",
" <td>80.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>80.0</td>\n",
" <td>1.0</td>\n",
" <td>2578.0</td>\n",
" <td>2595.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2596</th>\n",
" <td>2578</td>\n",
" <td>1</td>\n",
" <td>82</td>\n",
" <td>Fashion &amp; Retail</td>\n",
" <td>2596</td>\n",
" <td>2596</td>\n",
" <td>2596</td>\n",
" <td>Philippines</td>\n",
" <td>1</td>\n",
" <td>82.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>82.0</td>\n",
" <td>1.0</td>\n",
" <td>2578.0</td>\n",
" <td>2596.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2597</th>\n",
" <td>2578</td>\n",
" <td>1</td>\n",
" <td>71</td>\n",
" <td>Fashion &amp; Retail</td>\n",
" <td>2597</td>\n",
" <td>2597</td>\n",
" <td>2597</td>\n",
" <td>Philippines</td>\n",
" <td>1</td>\n",
" <td>71.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>71.0</td>\n",
" <td>1.0</td>\n",
" <td>2578.0</td>\n",
" <td>2597.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2598</th>\n",
" <td>2578</td>\n",
" <td>1</td>\n",
" <td>68</td>\n",
" <td>Fashion &amp; Retail</td>\n",
" <td>2598</td>\n",
" <td>2598</td>\n",
" <td>2598</td>\n",
" <td>Philippines</td>\n",
" <td>1</td>\n",
" <td>68.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>68.0</td>\n",
" <td>1.0</td>\n",
" <td>2578.0</td>\n",
" <td>2598.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2599</th>\n",
" <td>2578</td>\n",
" <td>1</td>\n",
" <td>69</td>\n",
" <td>Food &amp; Beverage</td>\n",
" <td>2599</td>\n",
" <td>2599</td>\n",
" <td>2599</td>\n",
" <td>Germany</td>\n",
" <td>1</td>\n",
" <td>69.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>69.0</td>\n",
" <td>1.0</td>\n",
" <td>2578.0</td>\n",
" <td>2599.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2600 rows × 35 columns</p>\n",
"</div>"
],
"text/plain": [
" Rank Networth Age Industry id country_id \\\n",
"billioner_id \n",
"0 1 219 50 Automotive 0 0 \n",
"1 2 171 58 Technology 1 1 \n",
"2 3 158 73 Fashion & Retail 2 2 \n",
"3 4 129 66 Technology 3 3 \n",
"4 5 118 91 Finance & Investments 4 4 \n",
"... ... ... ... ... ... ... \n",
"2595 2578 1 80 Healthcare 2595 2595 \n",
"2596 2578 1 82 Fashion & Retail 2596 2596 \n",
"2597 2578 1 71 Fashion & Retail 2597 2597 \n",
"2598 2578 1 68 Fashion & Retail 2598 2598 \n",
"2599 2578 1 69 Food & Beverage 2599 2599 \n",
"\n",
" country_table.id country_table.Country \\\n",
"billioner_id \n",
"0 0 United States \n",
"1 1 United States \n",
"2 2 France \n",
"3 3 United States \n",
"4 4 United States \n",
"... ... ... \n",
"2595 2595 Spain \n",
"2596 2596 Philippines \n",
"2597 2597 Philippines \n",
"2598 2598 Philippines \n",
"2599 2599 Germany \n",
"\n",
" country_table.COUNT(other_about_billioner) \\\n",
"billioner_id \n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"... ... \n",
"2595 1 \n",
"2596 1 \n",
"2597 1 \n",
"2598 1 \n",
"2599 1 \n",
"\n",
" country_table.MAX(other_about_billioner.Age) ... \\\n",
"billioner_id ... \n",
"0 50.0 ... \n",
"1 58.0 ... \n",
"2 73.0 ... \n",
"3 66.0 ... \n",
"4 91.0 ... \n",
"... ... ... \n",
"2595 80.0 ... \n",
"2596 82.0 ... \n",
"2597 71.0 ... \n",
"2598 68.0 ... \n",
"2599 69.0 ... \n",
"\n",
" country_table.SKEW(other_about_billioner.Rank ) \\\n",
"billioner_id \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"... ... \n",
"2595 NaN \n",
"2596 NaN \n",
"2597 NaN \n",
"2598 NaN \n",
"2599 NaN \n",
"\n",
" country_table.SKEW(other_about_billioner.id) \\\n",
"billioner_id \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"... ... \n",
"2595 NaN \n",
"2596 NaN \n",
"2597 NaN \n",
"2598 NaN \n",
"2599 NaN \n",
"\n",
" country_table.STD(other_about_billioner.Age) \\\n",
"billioner_id \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"... ... \n",
"2595 NaN \n",
"2596 NaN \n",
"2597 NaN \n",
"2598 NaN \n",
"2599 NaN \n",
"\n",
" country_table.STD(other_about_billioner.Networth) \\\n",
"billioner_id \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"... ... \n",
"2595 NaN \n",
"2596 NaN \n",
"2597 NaN \n",
"2598 NaN \n",
"2599 NaN \n",
"\n",
" country_table.STD(other_about_billioner.Rank ) \\\n",
"billioner_id \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"... ... \n",
"2595 NaN \n",
"2596 NaN \n",
"2597 NaN \n",
"2598 NaN \n",
"2599 NaN \n",
"\n",
" country_table.STD(other_about_billioner.id) \\\n",
"billioner_id \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"... ... \n",
"2595 NaN \n",
"2596 NaN \n",
"2597 NaN \n",
"2598 NaN \n",
"2599 NaN \n",
"\n",
" country_table.SUM(other_about_billioner.Age) \\\n",
"billioner_id \n",
"0 50.0 \n",
"1 58.0 \n",
"2 73.0 \n",
"3 66.0 \n",
"4 91.0 \n",
"... ... \n",
"2595 80.0 \n",
"2596 82.0 \n",
"2597 71.0 \n",
"2598 68.0 \n",
"2599 69.0 \n",
"\n",
" country_table.SUM(other_about_billioner.Networth) \\\n",
"billioner_id \n",
"0 219.0 \n",
"1 171.0 \n",
"2 158.0 \n",
"3 129.0 \n",
"4 118.0 \n",
"... ... \n",
"2595 1.0 \n",
"2596 1.0 \n",
"2597 1.0 \n",
"2598 1.0 \n",
"2599 1.0 \n",
"\n",
" country_table.SUM(other_about_billioner.Rank ) \\\n",
"billioner_id \n",
"0 1.0 \n",
"1 2.0 \n",
"2 3.0 \n",
"3 4.0 \n",
"4 5.0 \n",
"... ... \n",
"2595 2578.0 \n",
"2596 2578.0 \n",
"2597 2578.0 \n",
"2598 2578.0 \n",
"2599 2578.0 \n",
"\n",
" country_table.SUM(other_about_billioner.id) \n",
"billioner_id \n",
"0 0.0 \n",
"1 1.0 \n",
"2 2.0 \n",
"3 3.0 \n",
"4 4.0 \n",
"... ... \n",
"2595 2595.0 \n",
"2596 2596.0 \n",
"2597 2597.0 \n",
"2598 2598.0 \n",
"2599 2599.0 \n",
"\n",
"[2600 rows x 35 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import featuretools as ft\n",
"from woodwork.logical_types import Categorical, Integer\n",
"import pandas as pd\n",
"df = pd.read_csv(\"C://Users//annal//aim//static//csv//Forbes_Billionaires.csv\")\n",
"df['id'] = pd.Series(range(len(df))) \n",
"# Создание двух таблиц: одна с моделью, другая с остальными данными\n",
"country_df = df[['id', 'Country']].drop_duplicates().reset_index(drop=True)\n",
"other_df = df.drop(columns=['Country'])\n",
"\n",
"# Создание уникального идентификатора для связи\n",
"country_df['country_id'] = country_df.index\n",
"other_df['country_id'] = other_df['id'].map(country_df.set_index('id')['country_id'])\n",
"\n",
"es = ft.EntitySet(id=\"orders\")\n",
"es = es.add_dataframe(\n",
" dataframe_name=\"country_table\",\n",
" dataframe=country_df,\n",
" index=\"country_id\", # Индекс для уникальной идентификации моделей\n",
" logical_types={\n",
" \"Country\": Categorical # Определяем логический тип для модели\n",
" },\n",
")\n",
"es = es.add_dataframe(\n",
" dataframe_name=\"other_about_billioner\",\n",
" dataframe=other_df,\n",
2024-11-02 04:40:32 +04:00
" index=\"billioner_id\", # Индекс для уникальной идентификации миллиардеров\n",
2024-11-02 00:19:57 +04:00
" logical_types={\n",
" \"Rank \": Integer, # Целевая переменная (цена)\n",
2024-11-02 04:40:32 +04:00
" \"Networth\": Integer, \n",
2024-11-02 00:19:57 +04:00
" \"Age\": Integer,\n",
2024-11-02 04:40:32 +04:00
" \"country_id\": Integer, \n",
2024-11-02 00:19:57 +04:00
" },\n",
")\n",
"es = es.add_relationship(\"country_table\", \"country_id\", \"other_about_billioner\", \"country_id\")\n",
"\n",
"feature_matrix, feature_defs = ft.dfs(\n",
" entityset=es,\n",
" target_dataframe_name=\"other_about_billioner\"\n",
")\n",
"\n",
"feature_matrix"
]
2024-11-01 17:36:00 +04:00
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}