1201 lines
211 KiB
Plaintext
1201 lines
211 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"1. Определить бизнес-цели\n",
|
|||
|
"Бизнес-цели:\n",
|
|||
|
" а. Прогнозирование цены страховки\n",
|
|||
|
" б. Оценка влияния данных страхователя на цену страховки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"2. Определить цели технического проекта для каждой бизнес-цели\n",
|
|||
|
" а. Построить модель, которая на основе данных страхователя будет предсказывать цену страховки\n",
|
|||
|
" б. Провести анализ для выявления факторов, которые наиболее сильно влияют на итоговую цену страховки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"3. Подготовка данных"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 592,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"2772\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"df = pd.read_csv(\"../dataset.csv\")\n",
|
|||
|
"print(df.shape[0])"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"данных достаточно, чтобы шумы усреднились"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 593,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"было 2772\n",
|
|||
|
"age 39.10966810966811 14.081459420836477\n",
|
|||
|
"bmi 30.70134920634921 6.1294486949652205\n",
|
|||
|
"children 1.1026753434562546 1.2157555494600176\n",
|
|||
|
"charges 13325.498588795157 12200.175109274192\n",
|
|||
|
"стало 2710\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(\"было \", df.shape[0])\n",
|
|||
|
"for column in df.select_dtypes(include=['int', 'float']).columns:\n",
|
|||
|
" mean = df[column].mean()\n",
|
|||
|
" std_dev = df[column].std()\n",
|
|||
|
" print(column, mean, std_dev)\n",
|
|||
|
" \n",
|
|||
|
" lower_bound = mean - 3 * std_dev\n",
|
|||
|
" upper_bound = mean + 3 * std_dev\n",
|
|||
|
" \n",
|
|||
|
" df = df[(df[column] <= upper_bound) & (df[column] >= lower_bound)]\n",
|
|||
|
" \n",
|
|||
|
"print(\"стало \", df.shape[0])\n",
|
|||
|
"df = df.reset_index(drop=True)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"были устранены выбросы, отобранные по правилу трех сигм"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 594,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"age 0\n",
|
|||
|
"sex 0\n",
|
|||
|
"bmi 0\n",
|
|||
|
"children 0\n",
|
|||
|
"smoker 0\n",
|
|||
|
"region 0\n",
|
|||
|
"charges 0\n",
|
|||
|
"dtype: int64\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(df.isnull().sum())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Пропущенных значений нет"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"4. Разбиение на выборки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 595,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"1897 406 407\n",
|
|||
|
"2710 2710\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"train_df, temp_df = train_test_split(df, test_size=0.3, random_state=52)\n",
|
|||
|
"val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=52)\n",
|
|||
|
"\n",
|
|||
|
"print(train_df.shape[0], val_df.shape[0], test_df.shape[0])\n",
|
|||
|
"print(df.shape[0], train_df.shape[0] + val_df.shape[0] + test_df.shape[0])\n",
|
|||
|
"\n",
|
|||
|
"test_df = test_df.reset_index(drop=True)\n",
|
|||
|
"val_df = val_df.reset_index(drop=True)\n",
|
|||
|
"train_df = train_df.reset_index(drop=True)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"данные были разделены на обучающую (70%), контрольную (15%) и тестовую (15%) выборки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"5. Оценка сбалансированности выборок"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 596,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAj4AAAHHCAYAAAC/R1LgAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABpZ0lEQVR4nO3deXhTVf4G8DdJs3VJ932nLaVlpywWZJNKQXRkwFERBB0VdMANRAYVQVBR8eegiDqOM4IjiqKoIwLKvpZ9K7SUUlpauqelTfclOb8/aiOhBbonbd7P8+Shues3J0n7cu8590qEEAJEREREVkBq7gKIiIiIOgqDDxEREVkNBh8iIiKyGgw+REREZDUYfIiIiMhqMPgQERGR1WDwISIiIqvB4ENERERWg8GHiKiFhBAoLCxEcnKyuUuhNmYwGKDVanHp0iVzl0JtjMGHiKgZSkpK8MorryA8PBwKhQKurq7o3r07kpKSzF1ap7B//37s3r3b+Hz37t04cOCA+Qq6Rk5ODp577jkEBgZCoVDA3d0dkZGR0Ol05i6N2pCNuQsg81qzZg0effRR43OlUomAgACMHTsWixYtgqenpxmrI7IsBQUFGDlyJNLT0/H0009j2LBhUCgUkMvlCAoKMnd5nUJGRgaWLVuGb775BgDwt7/9Da+++qqZqwIuXryI0aNHo6amBs888wwGDBgAGxsbqNVq2NnZmbs8akMMPgQAWLp0KYKDg1FZWYn9+/fj448/xubNm3H27FnY2tqauzwiizB//nxkZ2cjLi4OPXv2NHc5ndKkSZOwcuVK9OnTBwAQHR2NSZMmmbkqYNasWVAoFDh06BB8fX3NXQ61IwYfAgCMHz8eAwcOBAA8/vjjcHV1xXvvvYeffvoJU6ZMMXN1ROaXl5eHtWvX4pNPPmHoaQWlUomDBw/i7NmzAIBevXpBJpOZtabjx49j586d+O233xh6rAD7+FCj7rjjDgBAamoqAKCwsBAvvPACevfuDXt7e2g0GowfPx6nT59usG5lZSWWLFmC7t27Q6VSwdvbG5MmTUJKSgoAIC0tDRKJ5IaPUaNGGbe1e/duSCQSfPPNN3jppZfg5eUFOzs7/OlPf0JGRkaDfR8+fBjjxo2Do6MjbG1tMXLkyBv2Hxg1alSj+1+yZEmDZb/88ktERUVBrVbDxcUFDz74YKP7v9lru5bBYMDKlSvRs2dPqFQqeHp6YtasWbh69arJckFBQbj77rsb7GfOnDkNttlY7StWrGjQpgBQVVWFxYsXIzQ0FEqlEv7+/njxxRdRVVXVaFtda9SoUejVq1eD6e+++y4kEgnS0tJMphcVFeG5556Dv78/lEolQkND8fbbb8NgMBiXqW+3d999t8F2e/Xq1aD+xlzb1jKZDL6+vpg5cyaKiopuuW5tbS2WLVuGkJAQKJVKBAUF4aWXXjJpj6NHj8JgMKC6uhoDBw6ESqWCq6srpkyZgvT0dONyn3/+OSQSCU6ePNlgP2+++SZkMhkyMzONNV//nq1Zs6ZBO/7000+YMGECfHx8oFQqERISgmXLlkGv15us+8gjjzQ45bZy5Ur06NEDSqUSXl5emDVrFgoLC02Waew9bez91Gq1jdbcnM/zI488AplMhr59+6Jv377YuHEjJBJJk04VBgUFGd9jqVQKLy8vPPDAAybtf7PPUr0lS5aYfH8OHToElUqFlJQU9OzZ86ZtBQAbNmww/j5wc3PDtGnTjO9pvUceeQT29va4dOkSYmNjYWdnBx8fHyxduhRCiAb1rlmzxjitpKQEUVFRCA4ORnZ2drPbmW6OR3yoUfUhxdXVFQBw6dIl/Pjjj/jLX/6C4OBg5Obm4p///CdGjhyJhIQE+Pj4AAD0ej3uvvtu7NixAw8++CCeffZZlJSUYNu2bTh79ixCQkKM+5gyZQruuusuk/0uXLiw0XreeOMNSCQSLFiwAHl5eVi5ciViYmJw6tQpqNVqAMDOnT
sxfvx4REVFYfHixZBKpfj8889xxx13YN++fRg8eHCD7fr5+WH58uUAgNLSUjz11FON7nvRokW4//778fjjjyM/Px+rVq3CiBEjcPLkSTg5OTVYZ+bMmRg+fDgAYOPGjfjhhx9M5s+aNcvYv+qZZ55BamoqPvzwQ5w8eRIHDhyAXC5vtB2ao6ioyPjarmUwGPCnP/0J+/fvx8yZMxEREYH4+Hj84x//wIULF/Djjz+2et/1ysvLMXLkSGRmZmLWrFkICAjAwYMHsXDhQmRnZ2PlypVtti8A+POf/4xJkyahtrYWcXFx+PTTT1FRUYH//ve/N13v8ccfx9q1a3Hfffdh3rx5OHz4MJYvX47ExETje1dQUACgLnRGRUXhrbfeQn5+Pj744APs378fJ0+ehJubG+677z7Mnj0b69atQ//+/U32s27dOowaNarZRxXWrFkDe3t7zJ07F/b29ti5cydeffVV6HQ6rFix4obrvfnmm3j55ZcxYsQIzJ492/g5O3z4MA4fPgylUtmsOm6kpZ/n2tpavPzyy83a1/DhwzFz5kwYDAacPXsWK1euRFZWFvbt29fi+gsKClBZWYmnnnoKd9xxB5588kmkpKRg9erVDdqq/nUOGjQIy5cvR25uLt5//30cOHCgwe8DvV6PcePG4bbbbsM777yDrVu3YvHixaitrcXSpUsbraWmpgaTJ09Geno6Dhw4AG9vb+O8jvi9YRUEWbXPP/9cABDbt28X+fn5IiMjQ6xfv164uroKtVotrly5IoQQorKyUuj1epN1U1NThVKpFEuXLjVO+89//iMAiPfee6/BvgwGg3E9AGLFihUNlunZs6cYOXKk8fmuXbsEAOHr6yt0Op1x+rfffisAiPfff9+47bCwMBEbG2vcjxBClJeXi+DgYHHnnXc22NfQoUNFr169jM/z8/MFALF48WLjtLS0NCGTycQbb7xhsm58fLywsbFpMD05OVkAEGvXrjVOW7x4sbj2q7Zv3z4BQKxbt85k3a1btzaYHhgYKCZMmNCg9tmzZ4vrv77X1/7iiy8KDw8PERUVZdKm//3vf4VUKhX79u0zWf+TTz4RAMSBAwca7O9aI0eOFD179mwwfcWKFQKASE1NNU5btmyZsLOzExcuXDBZ9u9//7uQyWQiPT1dCNG8z8SNXP/6hah7jyMjI2+63qlTpwQA8fjjj5tMf+GFFwQAsXPnTiHEH9+VyMhIUV5eblyu/jM6b94847QpU6YIHx8fk+/MiRMnBADx+eefG6dJJBLx6quvmuy3fj/XtuO1+6s3a9YsYWtrKyorK43TZsyYIQIDA4UQdZ9nlUolbr/9dlFTU2NcZs2aNQKAWLVqlXFaY+9pY+9nY9+R5n6eZ8yYYXz+0UcfCaVSKUaPHm2s+2auX18IIR566CFha2trfH6zz1K967+T9c/HjBkjamtrjdPr34v6tqqurhYeHh6iV69eoqKiwrjcpk2bBACT93LGjBkCgHj66aeN0wwGg5gwYYJQKBQiPz/fpN7PP/9cGAwGMXXqVGFraysOHz5sUnNz2plujqe6CAAQExMDd3d3+Pv748EHH4S9vT1++OEH4/9MlUolpNK6j4ter0dBQQHs7e0RHh6OEydOGLfz/fffw83NDU8//XSDfVx/aqY5pk+fDgcHB+Pz++67D97e3ti8eTMA4NSpU0hOTsZDDz2EgoICaLVaaLValJWVYcyYMdi7d6/JqRWg7pScSqW66X43btwIg8GA+++/37hNrVYLLy8vhIWFYdeuXSbLV1dXA8BN/ye9YcMGODo64s477zTZZlRUFOzt7Rtss6amxmQ5rVaLysrKm9admZmJVatWYdGiRbC3t2+w/4iICPTo0cNkm/WnN6/ff2ts2LABw4cPh7Ozs8m+YmJioNfrsXfvXpPly8vLG7zW60/n3Ez9+jk5Ofj+++9x+vRpjBkz5qbr1H+G5s6dazJ93rx5AIBffvnFZPrs2bONRxmButNEUV
FRJstNnz4dWVlZJm25bt06qNVqTJ482TjNw8MDV65cueXrunZ/JSUl0Gq1GD58OMrLy3H+/HmTZeuvP7Np0yZUVlb
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjYAAAHHCAYAAACskBIUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABlL0lEQVR4nO3dd3xT5f4H8M/JaJKOdNI9oQMouwxBERGQqSAo6g/HVRS8gohwRVEBxQGIFxFFuDgAr7gQwQnKlo2MAoVSCrS0FDrSla505fn9UZpLaFml7UnTz/v1ygty5jcnTfrpOc9zHkkIIUBERERkBxRyF0BERERUXxhsiIiIyG4w2BAREZHdYLAhIiIiu8FgQ0RERHaDwYaIiIjsBoMNERER2Q0GGyIiIrIbKrkLICIishUmkwk5OTlQqVTw9vaWuxyqA56xISKiBvfxxx8jLy/P8nzhwoUoKiqSr6DLbNq0Cffddx/c3Nyg0+kQEBCAF154Qe6yqI4YbOzAihUrIEmS5aHVahEZGYmJEyciIyND7vKIiPDLL7/gjTfeQGpqKlatWoUZM2ZAp9PJXRY++eQTDBw4EPn5+fjwww+xceNGbNy4EbNnz5a7NKojXoqyI7Nnz0ZYWBhMJhN27tyJJUuW4Pfff0dcXBwcHR3lLo+ImrFXX30V9913Hz788EMoFAr8+9//hkIh79/WiYmJmDJlCsaNG4dPPvkEkiTJWg/VDwYbOzJ48GB07doVAPD000/D09MTCxYswE8//YRHHnlE5uqIqDnr06cPzp07h/j4eAQFBSEwMFDukrBo0SL4+vpi0aJFDDV2hJei7Njdd98NAEhKSgIA5OTk4F//+hfat28PZ2dn6PV6DB48GEeOHKmxrslkwhtvvIHIyEhotVr4+flh5MiROHPmDAAgOTnZ6vLXlY+77rrLsq1t27ZBkiR89913ePXVV+Hr6wsnJyfcd999SE1NrbHvffv2YdCgQXB1dYWjoyP69OmDXbt21foa77rrrlr3/8Ybb9RY9quvvkJMTAx0Oh08PDzw8MMP17r/a722y5nNZixcuBDR0dHQarXw8fHB+PHjkZuba7VcaGgohg0bVmM/EydOrLHN2mqfP39+jWMKAKWlpZg1axbCw8Oh0WgQFBSEadOmobS0tNZjdbm77roL7dq1qzH9/fffhyRJSE5Otpqel5eHyZMnIygoCBqNBuHh4Zg3bx7MZrNlmerj9v7779fYbrt27WrUX5vLj7VSqURAQADGjRtn1TbjWutefuwqKiowZMgQeHh44MSJE1bT33rrLbRq1QoajQahoaF49dVXaxy3G33frvU5uPx9u9nPwerVqy0/r15eXnj00UeRlpZ23eN2+ePy91GSJEycOPGax7C6xh9++KHGPGdnZ/zjH/+wmnb27Fk8+OCD8PDwgKOjI2677Tb89ttvtW5z27ZtcHNzQ8+ePREYGIihQ4de9bNa2/rVD41Gg8jISMyZMwdCCMtyb7zxBiRJgsFguOq2QkNDrV7D3r17ERMTg+eeew4+Pj7QaDRo164dPv300xrrFhUVYerUqZbPQFRUFN5//32rGoD/HedVq1YhKioKWq0WMTEx+Ouvv6yWq673clu3boVGo8Gzzz5rNT0tLQ1PPfWUpcbo6Gh88cUX1zxuzRnP2Nix6hDi6ekJoOpLaN26dXjwwQcRFhaGjIwM/Oc//0GfPn1w4sQJ+Pv7AwAqKysxbNgwbN68GQ8//DBeeOEFFBQUYOPGjYiLi0OrVq0s+3jkkUcwZMgQq/1Onz691nreeecdSJKEl19+GZmZmVi4cCH69++P2NhYy7X2LVu2YPDgwYiJicGsWbOgUCiwfPly3H333dixYwe6d+9eY7uBgYGYM2cOAKCwsBD//Oc/a933jBkzMHr0aDz99NPIysrCRx99hDvvvBOHDx+Gm5tbjXXGjRuH3r17AwB+/PFHrF271mr++PHjsWLFCjz55JOYNG
kSkpKS8PHHH+Pw4cPYtWsX1Gp1rcfhZuTl5Vle2+XMZjPuu+8+7Ny5E+PGjUObNm1w7NgxfPDBBzh16hTWrVt3y/uuVlxcjD59+iAtLQ3jx49HcHAwdu/ejenTp+PixYtYuHBhve0LAO6//36MHDkSFRUV2LNnD5YtW4aSkhL897//vantPP3009i2bRs2btyItm3bWk1fuXIlHnjgAUydOhX79u3DnDlzEB8fX+M9vhGX17Vjxw4sW7YMH3zwAby8vAAAPj4+VsvfyOeg+ueqW7dumDNnDjIyMvDhhx9i165dV/15rT5ul9fRkDIyMtCrVy8UFxdj0qRJ8PT0xMqVK3Hffffhhx9+wP3333/Vdf/66y/8/vvvN7W/V199FW3atEFJSYklHHp7e2Ps2LF1fg3Z2dk4cOAAVCoVJkyYgFatWmHdunUYN24csrOz8corrwAAhBC47777sHXrVowdOxadOnXCH3/8gZdeeglpaWn44IMPrLa7fft2fPfdd5g0aRI0Gg0++eQTDBo0CPv376/1DwoAOHLkCEaMGIEhQ4Zg8eLFlukZGRm47bbbLIGpRYsWWL9+PcaOHQuj0YjJkyfX+fXbLUFN3vLlywUAsWnTJpGVlSVSU1PFt99+Kzw9PYVOpxPnz58XQghhMplEZWWl1bpJSUlCo9GI2bNnW6Z98cUXAoBYsGBBjX2ZzWbLegDE/PnzaywTHR0t+vTpY3m+detWAUAEBAQIo9Fomf79998LAOLDDz+0bDsiIkIMHDjQsh8hhCguLhZhYWFiwIABNfbVq1cv0a5dO8vzrKwsAUDMmjXLMi05OVkolUrxzjvvWK177NgxoVKpakxPTEwUAMTKlSst02bNmiUu/7js2LFDABCrVq2yWnfDhg01poeEhIihQ4fWqH3ChAniyo/glbVPmzZNeHt7i5iYGKtj+t///lcoFAqxY8cOq/WXLl0qAIhdu3bV2N/l+vTpI6Kjo2tMnz9/vgAgkpKSLNPeeust4eTkJE6dOmW17CuvvCKUSqVISUkRQtzcz8TVXPn6hah6j9u2bXtT606fPl0olUqxbt06q2ViY2MFAPH0009bTf/Xv/4lAIgtW7ZYpt3M+1at+rN4+fGrdqOfg7KyMuHt7S3atWsnSkpKLMv9+uuvAoCYOXOm1XbLy8sFAPHmm29esw4AYsKECbXWfWWNq1evrjHPyclJPPHEE5bnkydPFgCsfgYLCgpEWFiYCA0NtXzXVG9z69atluV69OghBg8eXOv7fbWaLl/fZDIJhUIhnnvuOcu06s9oVlbWVbcVEhJi9RpCQkIEALFixQrLtIqKCtGvXz+h0WiEwWAQQgixbt06AUC8/fbbVtt74IEHhCRJ4vTp05ZpAAQAceDAAcu0c+fOCa1WK+6///4a9QpR9R3l5+cn7rjjDqv3XAghxo4dK/z8/Cy1VHv44YeFq6urKC4uvurrba54KcqO9O/fHy1atEBQUBAefvhhODs7Y+3atQgICAAAaDQaS2O9yspKZGdnw9nZGVFRUTh06JBlO2vWrIGXlxeef/75Gvu4levQjz/+OFxcXCzPH3jgAfj5+Vn+couNjUViYiL+7//+D9nZ2TAYDDAYDCgqKkK/fv3w119/WV36AKoumWm12mvu98cff4TZbMbo0aMt2zQYDPD19UVERAS2bt1qtXxZWRmAquN1NatXr4arqysGDBhgtc2YmBg4OzvX2GZ5ebnVcgaDASaT6Zp1p6Wl4aOPPsKMGTPg7OxcY/9t2rRB69atrbZZffnxyv3fitWrV6N3795wd3e32lf//v1RWVlZ4xR7cXFxjddaWVl5w/urXj89PR1r1qzBkSNH0K9fvxte/+OPP8acOXOwaNEiDB8+3Gpe9c/alClTrKZPnToVAGpcRqnL+3Y91/scHDhwAJmZmXjuueesfraHDh2K1q1b16jxRn5eq5lMJhgMBmRnZ9f4LF2uoKCgxuu+0u+//47u3bvjjjvusE
xzdnbGuHHjkJycbHX573I//vgj/v77b8ydO/e69V4uPz8fBoMBKSkpeO+992A2my0/75fLycmxfG/cCB8fHzz22GO
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjYAAAHHCAYAAACskBIUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABpZElEQVR4nO3deVhUZf8G8PvMMDMM+yYDKJuA4r7gEu4apaaVaetrZWZpvVqZZcVrZtliZb+0xbTF1F4r0zLbXFJccsElFFdEVBBEVpUZ1gFmnt8fxLyO4IbAGYb7c13n0jnrdw4D3JzzPM+RhBACRERERHZAIXcBRERERPWFwYaIiIjsBoMNERER2Q0GGyIiIrIbDDZERERkNxhsiIiIyG4w2BAREZHdYLAhIiIiu+EgdwFERET1oby8HBcuXIDZbEZAQIDc5ZBMeMWGiIiuavny5UhLS7O8Xrp0KTIzM+Ur6BJ///03/vWvf8HHxwcajQb+/v4YM2aM3GWRjBhsCEDVDypJkiyTo6Mj2rRpgylTpiAnJ0fu8ohIRtu3b8dLL72EtLQ0bNiwAZMnT4ZCIf+vj19++QX9+vXDsWPH8Pbbb2Pjxo3YuHEjPv/8c7lLIxnxVhRZmT17NkJDQ1FWVoYdO3Zg4cKFWLt2LY4cOQInJye5yyMiGTz//PMYNGgQQkNDAQDTpk2Dv7+/rDVduHABTzzxBIYOHYpVq1ZBrVbLWg/ZDgYbsjJ8+HD06NEDAPDEE0/A29sbH374IX755Rc89NBDMldHRHKIjIzEqVOncOTIEfj4+CAsLEzukrBkyRKUlZVh6dKlDDVkRf5riWTThgwZAgBITU0FUPVX0osvvohOnTrBxcUFbm5uGD58OA4ePFhj27KyMrz++uto06YNHB0d4e/vj9GjR+PUqVMAgLS0NKvbX5dPgwYNsuxr69atkCQJP/zwA/7zn//Az88Pzs7OuOuuu5CRkVHj2Hv27MGwYcPg7u4OJycnDBw4EDt37qz1PQ4aNKjW47/++us11l2+fDmioqKg1Wrh5eWFBx98sNbjX+29XcpsNmP+/Pno0KEDHB0dodPpMGnSJFy8eNFqvZCQEIwcObLGcaZMmVJjn7XVPnfu3BrnFACMRiNmzZqF8PBwaDQaBAYG4qWXXoLRaKz1XF1q0KBB6NixY435H3zwASRJsmqTAQAFBQWYOnUqAgMDodFoEB4ejvfeew9ms9myTvV5++CDD2rst2PHjjXqr82l51qpVKJly5aYOHEiCgoKrnu7a30eb+S8LV++HL169YKTkxM8PT0xYMAA/PnnnwCqvq5XO2ZISIhlP8XFxXjhhRcs569t27b44IMPIISo0/vPzc3FhAkToNPp4OjoiC5dumDZsmVW61R/PZYuXQpnZ2f07t0bYWFhmDx5MiRJwmOPPXbVc3r594FKpUJISAimT5+O8vJyy3rVt8L//vvvK+5r0KBBVl+D3bt3o2vXrnjnnXcs5yQiIgLvvvuu1WcKACorK/Hmm28iLCwMGo0GISEh+M9//lPj61X9ffbnn3+ia9eucHR0RPv27bF69Wqr9arrvfQzfvToUXh6emLkyJGorKy0zL+ezz3VL16xoauqDiHe3t4AgNOnT2PNmjW47777EBoaipycHHz++ecYOHAgjh07ZumJYDKZMHLkSMTFxeHBBx/Ec889h8LCQmzcuBFHjhyx+ovvoYcewh133GF13NjY2FrrefvttyFJEl5++WXk5uZi/vz5iImJQWJiIrRaLQBg8+bNGD58OKKiojBr1iwoFAosWbIEQ4YMwfbt29GrV68a+23VqhXmzJkDACgqKsLTTz9d67FnzpyJ+++/H0888QTy8vLwySefYMCAAThw4AA8PDxqbDNx4kT0798fALB69Wr8/PPPVssnTZqEpUuXYvz48Xj22WeRmpqKTz/9FAcOHMDOnTuhUqlqPQ83oqCgwPLeLmU2m3HXXXdhx44dmDhxIt
q1a4fDhw9j3rx5OHHiBNasWXPTx65WUlKCgQMHIjMzE5MmTUJQUBB27dqF2NhYZGVlYf78+fV2LAC45557MHr0aFRWViI+Ph5ffPEFSktL8d///veK21y6bPv27fjiiy8wb948+Pj4AAB0Oh2AGztvb7zxBl5//XX06dMHs2fPhlqtxp49e7B582bcfvvtmD9/PoqKigAASUlJeOedd/Cf//wH7dq1AwC4uLgAAIQQuOuuu7BlyxZMmDABXbt2xYYNGzB9+nRkZmZi3rx5N/T+S0tLMWjQIJw8eRJTpkxBaGgoVq1ahcceewwFBQV47rnnrnieTp48iS+//PJ6vxQA/vd9YDQasWHDBnzwwQdwdHTEm2++eUP7udT58+exY8cO7NixA48//jiioqIQFxeH2NhYpKWlYdGiRZZ1n3jiCSxbtgz33nsvXnjhBezZswdz5sxBUlJSje/JlJQUPPDAA3jqqacwbtw4LFmyBPfddx/Wr1+P2267rdZaMjIyMGzYMERGRmLlypVwcKj61drYn3v6hyASQixZskQAEJs2bRJ5eXkiIyNDrFixQnh7ewutVivOnj0rhBCirKxMmEwmq21TU1OFRqMRs2fPtsz7+uuvBQDx4Ycf1jiW2Wy2bAdAzJ07t8Y6HTp0EAMHDrS83rJliwAgWrZsKQwGg2X+ypUrBQDx0UcfWfYdEREhhg4dajmOEEKUlJSI0NBQcdttt9U4Vp8+fUTHjh0tr/Py8gQAMWvWLMu8tLQ0oVQqxdtvv2217eHDh4WDg0ON+SkpKQKAWLZsmWXerFmzxKXfctu3bxcAxLfffmu17fr162vMDw4OFiNGjKhR++TJk8Xl38aX1/7SSy8JX19fERUVZXVO//vf/wqFQiG2b99utf2iRYsEALFz584ax7vUwIEDRYcOHWrMnzt3rgAgUlNTLfPefPNN4ezsLE6cOGG17iuvvCKUSqVIT08XQtzYZ+JKLn//QlR9jdu3b3/NbatVfz9c+h6qXe95S0lJEQqFQtxzzz01vmcu/WxWq/6Mb9mypcayNWvWCADirbfespp/7733CkmSxMmTJy3zruf9z58/XwAQy5cvt8wrLy8X0dHRwsXFxfI9Vv31WLJkiWW9+++/X3Ts2FEEBgaKcePG1aj1UrVtL4QQAQEB4o477rC8rj7f+/btu+K+Bg4caPX1HzhwoAAgXn/9dav1HnvsMQFAHD58WAghRGJiogAgnnjiCav1XnzxRQFAbN682TIvODhYABA//fSTZZ5erxf+/v6iW7duNepNTU0VFy5cEO3btxdt27YV+fn5Vse43s891S/eiiIrMTExaNGiBQIDA/Hggw/CxcUFP//8M1q2bAkA0Gg0lt4QJpMJ58+fh4uLC9q2bYv9+/db9vPTTz/Bx8cHzzzzTI1jXH7r5EY8+uijcHV1tby+99574e/vj7Vr1wIAEhMTkZKSgn/96184f/488vPzkZ+fj+LiYtx6663466+/alwCLisrg6Oj41WPu3r1apjNZtx///2Wfebn58PPzw8RERHYsmWL1frVl9k1Gs0V97lq1Sq4u7vjtttus9pnVFQUXFxcauyzoqLCar38/HyUlZVdte7MzEx88sknmDlzpuWv/0uP365dO0RGRlrts/r24+XHvxmrVq1C//794enpaXWsmJgYmEwm/PXXX1brl5SU1HivJpPpuo9XvX12djZ++uknHDx4ELfeemu9vZfrOW9r1qyB2WzGa6+9VqMH0Y1+D6xduxZKpRLPPvus1fwXXngBQgisW7fOav613v/atWvh5+dn1W5OpVLh2WefRVFREbZt21ZrHQkJCVi1ahXmzJlzQ72iioqKkJ+fj8zMTHzxxRfIzs6u9euh1+uRn5+PwsLC69qvUqnE888/bzXvhRdeAAD88ccfAGD52TBt2rSrrlctICAA99xzj+W1m5sbHn30URw4cADZ2dlW65aVleGuu+5CXl4e1q9fb7myXe1GP/dUP3
griqwsWLAAbdq0gYODA3Q6Hdq2bWv1A8xsNuOjjz7CZ599htTUVKtfNpd+U586dQpt27a1XJKtLxEREVavJUlCeHi
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"def draw(data, title):\n",
|
|||
|
" sns.histplot(data['charges'], kde=True)\n",
|
|||
|
" plt.title(title)\n",
|
|||
|
" plt.show()\n",
|
|||
|
" \n",
|
|||
|
"draw(train_df, 'Распределение цен в обучающей выборке')\n",
|
|||
|
"draw(val_df, 'Распределение цен в контрольной выборке')\n",
|
|||
|
"draw(test_df, 'Распределение цен в тестовой выборке')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"6,7. Конструирование признаков"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 597,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"['southwest' 'southeast' 'northwest' 'northeast']\n",
|
|||
|
"[0 1 3 2 4]\n",
|
|||
|
"Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges',\n",
|
|||
|
" 'smoker_yes', 'sex_male', 'region_northwest', 'region_southeast',\n",
|
|||
|
" 'region_southwest', 'children_1', 'children_2', 'children_3',\n",
|
|||
|
" 'children_4'],\n",
|
|||
|
" dtype='object')\n",
|
|||
|
"Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges',\n",
|
|||
|
" 'smoker_yes', 'sex_male', 'region_northwest', 'region_southeast',\n",
|
|||
|
" 'region_southwest', 'children_1', 'children_2', 'children_3',\n",
|
|||
|
" 'children_4'],\n",
|
|||
|
" dtype='object')\n",
|
|||
|
"Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges',\n",
|
|||
|
" 'smoker_yes', 'sex_male', 'region_northwest', 'region_southeast',\n",
|
|||
|
" 'region_southwest', 'children_1', 'children_2', 'children_3',\n",
|
|||
|
" 'children_4'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(df['region'].unique())\n",
|
|||
|
"print(df['children'].unique())\n",
|
|||
|
"\n",
|
|||
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"\n",
|
|||
|
"encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
|
|||
|
"\n",
|
|||
|
"encoded_values = encoder.fit_transform(train_df[[\"smoker\", \"sex\", \"region\", \"children\"]])\n",
|
|||
|
"encoded_columns = encoder.get_feature_names_out([\"smoker\", \"sex\", \"region\", \"children\"])\n",
|
|||
|
"encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n",
|
|||
|
"train_df = pd.concat([train_df, encoded_values_df], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"encoded_values = encoder.fit_transform(test_df[[\"smoker\", \"sex\", \"region\", \"children\"]])\n",
|
|||
|
"encoded_columns = encoder.get_feature_names_out([\"smoker\", \"sex\", \"region\", \"children\"])\n",
|
|||
|
"encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n",
|
|||
|
"test_df = pd.concat([test_df, encoded_values_df], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"encoded_values = encoder.fit_transform(val_df[[\"smoker\", \"sex\", \"region\", \"children\"]])\n",
|
|||
|
"encoded_columns = encoder.get_feature_names_out([\"smoker\", \"sex\", \"region\", \"children\"])\n",
|
|||
|
"encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n",
|
|||
|
"val_df = pd.concat([val_df, encoded_values_df], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"print(test_df.columns)\n",
|
|||
|
"print(val_df.columns)\n",
|
|||
|
"print(train_df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Было выполнено унитарное кодирование признаков Пол (sex), Курильщик (smoker), Регион (region) и Количество детей (children). Полученные признаки были добавлены в исходные выборки."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 598,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"age 18 64\n",
|
|||
|
"bmi 15.96 49.06\n",
|
|||
|
"bmi_category\n",
|
|||
|
"overweight 335\n",
|
|||
|
"normal weight 66\n",
|
|||
|
"underweight 5\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"========================\n",
|
|||
|
"bmi_category\n",
|
|||
|
"overweight 332\n",
|
|||
|
"normal weight 70\n",
|
|||
|
"underweight 5\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"========================\n",
|
|||
|
"bmi_category\n",
|
|||
|
"overweight 1543\n",
|
|||
|
"normal weight 324\n",
|
|||
|
"underweight 30\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print('age', min(df['age']), max(df['age']))\n",
|
|||
|
"#print('charges', min(df['charges']), max(df['charges']))\n",
|
|||
|
"print('bmi', min(df['bmi']), max(df['bmi']))\n",
|
|||
|
"\n",
|
|||
|
"labels_age = ['young', 'middle-aged', 'old']\n",
|
|||
|
"labels_bmi = ['underweight', 'normal weight', 'overweight']\n",
|
|||
|
"#labels_charges = ['low_charges', 'medium_charges', 'high_charges']\n",
|
|||
|
"\n",
|
|||
|
"hist_age, bins_age = np.histogram(test_df['age'], bins = [0, 27, 45, 100])\n",
|
|||
|
"age_df = pd.concat([test_df['age'], pd.cut(test_df['age'], list(bins_age), labels = labels_age)], axis=1)\n",
|
|||
|
"test_df['age_category'] = pd.cut(test_df['age'], bins=bins_age, labels=labels_age)\n",
|
|||
|
"\n",
|
|||
|
"hist_bmi, bins_bmi = np.histogram(test_df['bmi'], bins = [0, 18.5, 25, 100])\n",
|
|||
|
"bmi_df = pd.concat([test_df['bmi'], pd.cut(test_df['bmi'], list(bins_bmi), labels = labels_bmi)], axis=1)\n",
|
|||
|
"test_df['bmi_category'] = pd.cut(test_df['bmi'], bins=bins_bmi, labels=labels_bmi)\n",
|
|||
|
"\n",
|
|||
|
"hist_age, bins_age = np.histogram(train_df['age'], bins = [0, 27, 45, 100])\n",
|
|||
|
"age_df = pd.concat([train_df['age'], pd.cut(train_df['age'], list(bins_age), labels = labels_age)], axis=1)\n",
|
|||
|
"train_df['age_category'] = pd.cut(train_df['age'], bins=bins_age, labels=labels_age)\n",
|
|||
|
"\n",
|
|||
|
"hist_bmi, bins_bmi = np.histogram(train_df['bmi'], bins = [0, 18.5, 25, 100])\n",
|
|||
|
"bmi_df = pd.concat([train_df['bmi'], pd.cut(train_df['bmi'], list(bins_bmi), labels = labels_bmi)], axis=1)\n",
|
|||
|
"train_df['bmi_category'] = pd.cut(train_df['bmi'], bins=bins_bmi, labels=labels_bmi)\n",
|
|||
|
"\n",
|
|||
|
"hist_age, bins_age = np.histogram(val_df['age'], bins = [0, 27, 45, 100])\n",
|
|||
|
"age_df = pd.concat([val_df['age'], pd.cut(val_df['age'], list(bins_age), labels = labels_age)], axis=1)\n",
|
|||
|
"val_df['age_category'] = pd.cut(val_df['age'], bins=bins_age, labels=labels_age)\n",
|
|||
|
"\n",
|
|||
|
"hist_bmi, bins_bmi = np.histogram(val_df['bmi'], bins = [0, 18.5, 25, 100])\n",
|
|||
|
"bmi_df = pd.concat([val_df['bmi'], pd.cut(val_df['bmi'], list(bins_bmi), labels = labels_bmi)], axis=1)\n",
|
|||
|
"val_df['bmi_category'] = pd.cut(val_df['bmi'], bins=bins_bmi, labels=labels_bmi)\n",
|
|||
|
"\n",
|
|||
|
"category_counts = val_df['bmi_category'].value_counts()\n",
|
|||
|
"print(category_counts)\n",
|
|||
|
"print('========================')\n",
|
|||
|
"category_counts = test_df['bmi_category'].value_counts()\n",
|
|||
|
"print(category_counts)\n",
|
|||
|
"print('========================')\n",
|
|||
|
"category_counts = train_df['bmi_category'].value_counts()\n",
|
|||
|
"print(category_counts)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Была выполнена дискретизация числовых признаков Индекс массы тела (bmi) и Возраст (age)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 599,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"0 1.0\n",
|
|||
|
"1 1.0\n",
|
|||
|
"2 0.0\n",
|
|||
|
"3 0.0\n",
|
|||
|
"4 1.0\n",
|
|||
|
"5 0.0\n",
|
|||
|
"6 0.0\n",
|
|||
|
"7 0.0\n",
|
|||
|
"8 1.0\n",
|
|||
|
"9 1.0\n",
|
|||
|
"10 0.0\n",
|
|||
|
"11 0.0\n",
|
|||
|
"12 1.0\n",
|
|||
|
"13 0.0\n",
|
|||
|
"14 0.0\n",
|
|||
|
"15 0.0\n",
|
|||
|
"16 1.0\n",
|
|||
|
"17 1.0\n",
|
|||
|
"18 1.0\n",
|
|||
|
"19 0.0\n",
|
|||
|
"Name: parent_yes, dtype: float64\n",
|
|||
|
"========================\n",
|
|||
|
"0 1.0\n",
|
|||
|
"1 1.0\n",
|
|||
|
"2 0.0\n",
|
|||
|
"3 1.0\n",
|
|||
|
"4 0.0\n",
|
|||
|
"5 1.0\n",
|
|||
|
"6 0.0\n",
|
|||
|
"7 0.0\n",
|
|||
|
"8 1.0\n",
|
|||
|
"9 0.0\n",
|
|||
|
"10 1.0\n",
|
|||
|
"11 1.0\n",
|
|||
|
"12 0.0\n",
|
|||
|
"13 0.0\n",
|
|||
|
"14 0.0\n",
|
|||
|
"15 1.0\n",
|
|||
|
"16 1.0\n",
|
|||
|
"17 0.0\n",
|
|||
|
"18 1.0\n",
|
|||
|
"19 1.0\n",
|
|||
|
"Name: parent_yes, dtype: float64\n",
|
|||
|
"========================\n",
|
|||
|
"0 1.0\n",
|
|||
|
"1 0.0\n",
|
|||
|
"2 1.0\n",
|
|||
|
"3 1.0\n",
|
|||
|
"4 1.0\n",
|
|||
|
"5 0.0\n",
|
|||
|
"6 0.0\n",
|
|||
|
"7 1.0\n",
|
|||
|
"8 0.0\n",
|
|||
|
"9 1.0\n",
|
|||
|
"10 0.0\n",
|
|||
|
"11 1.0\n",
|
|||
|
"12 1.0\n",
|
|||
|
"13 0.0\n",
|
|||
|
"14 1.0\n",
|
|||
|
"15 1.0\n",
|
|||
|
"16 1.0\n",
|
|||
|
"17 0.0\n",
|
|||
|
"18 0.0\n",
|
|||
|
"19 1.0\n",
|
|||
|
"Name: parent_yes, dtype: float64\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"train_df['parent_yes'] = train_df['children'] > 0\n",
|
|||
|
"train_df['parent_yes'] = train_df['parent_yes'].map({True: 1.0, False: 0.0})\n",
|
|||
|
"\n",
|
|||
|
"test_df['parent_yes'] = test_df['children'] > 0\n",
|
|||
|
"test_df['parent_yes'] = test_df['parent_yes'].map({True: 1.0, False: 0.0})\n",
|
|||
|
"\n",
|
|||
|
"val_df['parent_yes'] = val_df['children'] > 0\n",
|
|||
|
"val_df['parent_yes'] = val_df['parent_yes'].map({True: 1.0, False: 0.0})\n",
|
|||
|
"\n",
|
|||
|
"print(train_df['parent_yes'].head(20))\n",
|
|||
|
"print('========================')\n",
|
|||
|
"print(test_df['parent_yes'].head(20))\n",
|
|||
|
"print('========================')\n",
|
|||
|
"print(val_df['parent_yes'].head(20))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Был выполнен ручной синтез признака Родитель на основе того, есть ли у страхователя дети"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 600,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"0 0.695652\n",
|
|||
|
"1 0.500000\n",
|
|||
|
"2 0.239130\n",
|
|||
|
"3 0.565217\n",
|
|||
|
"4 0.978261\n",
|
|||
|
"5 0.304348\n",
|
|||
|
"6 0.847826\n",
|
|||
|
"7 0.630435\n",
|
|||
|
"8 0.804348\n",
|
|||
|
"9 0.173913\n",
|
|||
|
"Name: age_norm, dtype: float64\n",
|
|||
|
"========================\n",
|
|||
|
"0 0.586957\n",
|
|||
|
"1 0.413043\n",
|
|||
|
"2 0.847826\n",
|
|||
|
"3 0.956522\n",
|
|||
|
"4 0.521739\n",
|
|||
|
"5 0.869565\n",
|
|||
|
"6 0.021739\n",
|
|||
|
"7 0.130435\n",
|
|||
|
"8 0.565217\n",
|
|||
|
"9 0.500000\n",
|
|||
|
"Name: age_norm, dtype: float64\n",
|
|||
|
"========================\n",
|
|||
|
"0 0.217391\n",
|
|||
|
"1 0.173913\n",
|
|||
|
"2 0.760870\n",
|
|||
|
"3 0.108696\n",
|
|||
|
"4 0.326087\n",
|
|||
|
"5 0.065217\n",
|
|||
|
"6 0.282609\n",
|
|||
|
"7 0.260870\n",
|
|||
|
"8 0.152174\n",
|
|||
|
"9 0.565217\n",
|
|||
|
"Name: age_norm, dtype: float64\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn import preprocessing\n",
|
|||
|
"\n",
|
|||
|
"scaler = preprocessing.MinMaxScaler()\n",
|
|||
|
"test_df['age_norm'] = scaler.fit_transform(test_df[['age']])\n",
|
|||
|
"print(test_df['age_norm'].head(10))\n",
|
|||
|
"print('========================')\n",
|
|||
|
"train_df['age_norm'] = scaler.fit_transform(train_df[['age']])\n",
|
|||
|
"print(train_df['age_norm'].head(10))\n",
|
|||
|
"print('========================')\n",
|
|||
|
"val_df['age_norm'] = scaler.fit_transform(val_df[['age']])\n",
|
|||
|
"print(val_df['age_norm'].head(10))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Было выполнено масштабирование признака Возраст (age) на основе нормировки."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 601,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"0 0.414950\n",
|
|||
|
"1 -0.153961\n",
|
|||
|
"2 1.268317\n",
|
|||
|
"3 1.623887\n",
|
|||
|
"4 0.201608\n",
|
|||
|
"5 1.339431\n",
|
|||
|
"6 -1.434012\n",
|
|||
|
"7 -1.078443\n",
|
|||
|
"8 0.343836\n",
|
|||
|
"9 0.130494\n",
|
|||
|
"Name: age_stand, dtype: float64\n",
|
|||
|
"========================\n",
|
|||
|
"0 0.727622\n",
|
|||
|
"1 0.101529\n",
|
|||
|
"2 -0.733262\n",
|
|||
|
"3 0.310226\n",
|
|||
|
"4 1.631978\n",
|
|||
|
"5 -0.524564\n",
|
|||
|
"6 1.214583\n",
|
|||
|
"7 0.518924\n",
|
|||
|
"8 1.075451\n",
|
|||
|
"9 -0.941960\n",
|
|||
|
"Name: age_stand, dtype: float64\n",
|
|||
|
"========================\n",
|
|||
|
"0 -0.766548\n",
|
|||
|
"1 -0.907530\n",
|
|||
|
"2 0.995731\n",
|
|||
|
"3 -1.119003\n",
|
|||
|
"4 -0.414092\n",
|
|||
|
"5 -1.259986\n",
|
|||
|
"6 -0.555074\n",
|
|||
|
"7 -0.625565\n",
|
|||
|
"8 -0.978021\n",
|
|||
|
"9 0.361310\n",
|
|||
|
"Name: age_stand, dtype: float64\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"scaler = preprocessing.StandardScaler()\n",
|
|||
|
"train_df['age_stand'] = scaler.fit_transform(train_df[['age']])\n",
|
|||
|
"print(train_df['age_stand'].head(10))\n",
|
|||
|
"print('========================')\n",
|
|||
|
"test_df['age_stand'] = scaler.fit_transform(test_df[['age']])\n",
|
|||
|
"print(test_df['age_stand'].head(10))\n",
|
|||
|
"print('========================')\n",
|
|||
|
"val_df['age_stand'] = scaler.fit_transform(val_df[['age']])\n",
|
|||
|
"print(val_df['age_stand'].head(10))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Было выполнено масштабирование признака Возраст (age) на основе стандартизации."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"8. Использование Featuretools"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 602,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\ulstu\\mii\\AIM-PIbd-31-Barsukov-P-O\\aimenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index index not found in dataframe, creating new integer column\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"c:\\ulstu\\mii\\AIM-PIbd-31-Barsukov-P-O\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\ulstu\\mii\\AIM-PIbd-31-Barsukov-P-O\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\ulstu\\mii\\AIM-PIbd-31-Barsukov-P-O\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\ulstu\\mii\\AIM-PIbd-31-Barsukov-P-O\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\ulstu\\mii\\AIM-PIbd-31-Barsukov-P-O\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\ulstu\\mii\\AIM-PIbd-31-Barsukov-P-O\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\ulstu\\mii\\AIM-PIbd-31-Barsukov-P-O\\aimenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"c:\\ulstu\\mii\\AIM-PIbd-31-Barsukov-P-O\\aimenv\\Lib\\site-packages\\featuretools\\synthesis\\dfs.py:321: UnusedPrimitiveWarning: Some specified primitives were not used during DFS:\n",
|
|||
|
" agg_primitives: ['count', 'max', 'mean', 'median', 'min', 'std', 'sum']\n",
|
|||
|
"This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.\n",
|
|||
|
" warnings.warn(warning_msg, UnusedPrimitiveWarning)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>age</th>\n",
|
|||
|
" <th>sex</th>\n",
|
|||
|
" <th>bmi</th>\n",
|
|||
|
" <th>children</th>\n",
|
|||
|
" <th>smoker</th>\n",
|
|||
|
" <th>region</th>\n",
|
|||
|
" <th>charges</th>\n",
|
|||
|
" <th>age + bmi</th>\n",
|
|||
|
" <th>age + charges</th>\n",
|
|||
|
" <th>age + children</th>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <th>age * children</th>\n",
|
|||
|
" <th>bmi * charges</th>\n",
|
|||
|
" <th>bmi * children</th>\n",
|
|||
|
" <th>charges * children</th>\n",
|
|||
|
" <th>age - bmi</th>\n",
|
|||
|
" <th>age - charges</th>\n",
|
|||
|
" <th>age - children</th>\n",
|
|||
|
" <th>bmi - charges</th>\n",
|
|||
|
" <th>bmi - children</th>\n",
|
|||
|
" <th>charges - children</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>index</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>19</td>\n",
|
|||
|
" <td>female</td>\n",
|
|||
|
" <td>27.900</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>southwest</td>\n",
|
|||
|
" <td>16884.92400</td>\n",
|
|||
|
" <td>46.900</td>\n",
|
|||
|
" <td>16903.92400</td>\n",
|
|||
|
" <td>19.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>471089.379600</td>\n",
|
|||
|
" <td>0.00</td>\n",
|
|||
|
" <td>0.0000</td>\n",
|
|||
|
" <td>-8.900</td>\n",
|
|||
|
" <td>-16865.92400</td>\n",
|
|||
|
" <td>19.0</td>\n",
|
|||
|
" <td>-16857.02400</td>\n",
|
|||
|
" <td>27.900</td>\n",
|
|||
|
" <td>16884.92400</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>18</td>\n",
|
|||
|
" <td>male</td>\n",
|
|||
|
" <td>33.770</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>southeast</td>\n",
|
|||
|
" <td>1725.55230</td>\n",
|
|||
|
" <td>51.770</td>\n",
|
|||
|
" <td>1743.55230</td>\n",
|
|||
|
" <td>19.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>18.0</td>\n",
|
|||
|
" <td>58271.901171</td>\n",
|
|||
|
" <td>33.77</td>\n",
|
|||
|
" <td>1725.5523</td>\n",
|
|||
|
" <td>-15.770</td>\n",
|
|||
|
" <td>-1707.55230</td>\n",
|
|||
|
" <td>17.0</td>\n",
|
|||
|
" <td>-1691.78230</td>\n",
|
|||
|
" <td>32.770</td>\n",
|
|||
|
" <td>1724.55230</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>28</td>\n",
|
|||
|
" <td>male</td>\n",
|
|||
|
" <td>33.000</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>southeast</td>\n",
|
|||
|
" <td>4449.46200</td>\n",
|
|||
|
" <td>61.000</td>\n",
|
|||
|
" <td>4477.46200</td>\n",
|
|||
|
" <td>31.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>84.0</td>\n",
|
|||
|
" <td>146832.246000</td>\n",
|
|||
|
" <td>99.00</td>\n",
|
|||
|
" <td>13348.3860</td>\n",
|
|||
|
" <td>-5.000</td>\n",
|
|||
|
" <td>-4421.46200</td>\n",
|
|||
|
" <td>25.0</td>\n",
|
|||
|
" <td>-4416.46200</td>\n",
|
|||
|
" <td>30.000</td>\n",
|
|||
|
" <td>4446.46200</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>33</td>\n",
|
|||
|
" <td>male</td>\n",
|
|||
|
" <td>22.705</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>northwest</td>\n",
|
|||
|
" <td>21984.47061</td>\n",
|
|||
|
" <td>55.705</td>\n",
|
|||
|
" <td>22017.47061</td>\n",
|
|||
|
" <td>33.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>499157.405200</td>\n",
|
|||
|
" <td>0.00</td>\n",
|
|||
|
" <td>0.0000</td>\n",
|
|||
|
" <td>10.295</td>\n",
|
|||
|
" <td>-21951.47061</td>\n",
|
|||
|
" <td>33.0</td>\n",
|
|||
|
" <td>-21961.76561</td>\n",
|
|||
|
" <td>22.705</td>\n",
|
|||
|
" <td>21984.47061</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>32</td>\n",
|
|||
|
" <td>male</td>\n",
|
|||
|
" <td>28.880</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>northwest</td>\n",
|
|||
|
" <td>3866.85520</td>\n",
|
|||
|
" <td>60.880</td>\n",
|
|||
|
" <td>3898.85520</td>\n",
|
|||
|
" <td>32.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>111674.778176</td>\n",
|
|||
|
" <td>0.00</td>\n",
|
|||
|
" <td>0.0000</td>\n",
|
|||
|
" <td>3.120</td>\n",
|
|||
|
" <td>-3834.85520</td>\n",
|
|||
|
" <td>32.0</td>\n",
|
|||
|
" <td>-3837.97520</td>\n",
|
|||
|
" <td>28.880</td>\n",
|
|||
|
" <td>3866.85520</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2705</th>\n",
|
|||
|
" <td>47</td>\n",
|
|||
|
" <td>female</td>\n",
|
|||
|
" <td>45.320</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>southeast</td>\n",
|
|||
|
" <td>8569.86180</td>\n",
|
|||
|
" <td>92.320</td>\n",
|
|||
|
" <td>8616.86180</td>\n",
|
|||
|
" <td>48.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>47.0</td>\n",
|
|||
|
" <td>388386.136776</td>\n",
|
|||
|
" <td>45.32</td>\n",
|
|||
|
" <td>8569.8618</td>\n",
|
|||
|
" <td>1.680</td>\n",
|
|||
|
" <td>-8522.86180</td>\n",
|
|||
|
" <td>46.0</td>\n",
|
|||
|
" <td>-8524.54180</td>\n",
|
|||
|
" <td>44.320</td>\n",
|
|||
|
" <td>8568.86180</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2706</th>\n",
|
|||
|
" <td>21</td>\n",
|
|||
|
" <td>female</td>\n",
|
|||
|
" <td>34.600</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>southwest</td>\n",
|
|||
|
" <td>2020.17700</td>\n",
|
|||
|
" <td>55.600</td>\n",
|
|||
|
" <td>2041.17700</td>\n",
|
|||
|
" <td>21.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>69898.124200</td>\n",
|
|||
|
" <td>0.00</td>\n",
|
|||
|
" <td>0.0000</td>\n",
|
|||
|
" <td>-13.600</td>\n",
|
|||
|
" <td>-1999.17700</td>\n",
|
|||
|
" <td>21.0</td>\n",
|
|||
|
" <td>-1985.57700</td>\n",
|
|||
|
" <td>34.600</td>\n",
|
|||
|
" <td>2020.17700</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2707</th>\n",
|
|||
|
" <td>19</td>\n",
|
|||
|
" <td>male</td>\n",
|
|||
|
" <td>26.030</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>northwest</td>\n",
|
|||
|
" <td>16450.89470</td>\n",
|
|||
|
" <td>45.030</td>\n",
|
|||
|
" <td>16469.89470</td>\n",
|
|||
|
" <td>20.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>19.0</td>\n",
|
|||
|
" <td>428216.789041</td>\n",
|
|||
|
" <td>26.03</td>\n",
|
|||
|
" <td>16450.8947</td>\n",
|
|||
|
" <td>-7.030</td>\n",
|
|||
|
" <td>-16431.89470</td>\n",
|
|||
|
" <td>18.0</td>\n",
|
|||
|
" <td>-16424.86470</td>\n",
|
|||
|
" <td>25.030</td>\n",
|
|||
|
" <td>16449.89470</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2708</th>\n",
|
|||
|
" <td>23</td>\n",
|
|||
|
" <td>male</td>\n",
|
|||
|
" <td>18.715</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>northwest</td>\n",
|
|||
|
" <td>21595.38229</td>\n",
|
|||
|
" <td>41.715</td>\n",
|
|||
|
" <td>21618.38229</td>\n",
|
|||
|
" <td>23.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>404157.579557</td>\n",
|
|||
|
" <td>0.00</td>\n",
|
|||
|
" <td>0.0000</td>\n",
|
|||
|
" <td>4.285</td>\n",
|
|||
|
" <td>-21572.38229</td>\n",
|
|||
|
" <td>23.0</td>\n",
|
|||
|
" <td>-21576.66729</td>\n",
|
|||
|
" <td>18.715</td>\n",
|
|||
|
" <td>21595.38229</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2709</th>\n",
|
|||
|
" <td>54</td>\n",
|
|||
|
" <td>male</td>\n",
|
|||
|
" <td>31.600</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>southwest</td>\n",
|
|||
|
" <td>9850.43200</td>\n",
|
|||
|
" <td>85.600</td>\n",
|
|||
|
" <td>9904.43200</td>\n",
|
|||
|
" <td>54.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>311273.651200</td>\n",
|
|||
|
" <td>0.00</td>\n",
|
|||
|
" <td>0.0000</td>\n",
|
|||
|
" <td>22.400</td>\n",
|
|||
|
" <td>-9796.43200</td>\n",
|
|||
|
" <td>54.0</td>\n",
|
|||
|
" <td>-9818.83200</td>\n",
|
|||
|
" <td>31.600</td>\n",
|
|||
|
" <td>9850.43200</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>2710 rows × 37 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" age sex bmi children smoker region charges \\\n",
|
|||
|
"index \n",
|
|||
|
"0 19 female 27.900 0 True southwest 16884.92400 \n",
|
|||
|
"1 18 male 33.770 1 False southeast 1725.55230 \n",
|
|||
|
"2 28 male 33.000 3 False southeast 4449.46200 \n",
|
|||
|
"3 33 male 22.705 0 False northwest 21984.47061 \n",
|
|||
|
"4 32 male 28.880 0 False northwest 3866.85520 \n",
|
|||
|
"... ... ... ... ... ... ... ... \n",
|
|||
|
"2705 47 female 45.320 1 False southeast 8569.86180 \n",
|
|||
|
"2706 21 female 34.600 0 False southwest 2020.17700 \n",
|
|||
|
"2707 19 male 26.030 1 True northwest 16450.89470 \n",
|
|||
|
"2708 23 male 18.715 0 False northwest 21595.38229 \n",
|
|||
|
"2709 54 male 31.600 0 False southwest 9850.43200 \n",
|
|||
|
"\n",
|
|||
|
" age + bmi age + charges age + children ... age * children \\\n",
|
|||
|
"index ... \n",
|
|||
|
"0 46.900 16903.92400 19.0 ... 0.0 \n",
|
|||
|
"1 51.770 1743.55230 19.0 ... 18.0 \n",
|
|||
|
"2 61.000 4477.46200 31.0 ... 84.0 \n",
|
|||
|
"3 55.705 22017.47061 33.0 ... 0.0 \n",
|
|||
|
"4 60.880 3898.85520 32.0 ... 0.0 \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"2705 92.320 8616.86180 48.0 ... 47.0 \n",
|
|||
|
"2706 55.600 2041.17700 21.0 ... 0.0 \n",
|
|||
|
"2707 45.030 16469.89470 20.0 ... 19.0 \n",
|
|||
|
"2708 41.715 21618.38229 23.0 ... 0.0 \n",
|
|||
|
"2709 85.600 9904.43200 54.0 ... 0.0 \n",
|
|||
|
"\n",
|
|||
|
" bmi * charges bmi * children charges * children age - bmi \\\n",
|
|||
|
"index \n",
|
|||
|
"0 471089.379600 0.00 0.0000 -8.900 \n",
|
|||
|
"1 58271.901171 33.77 1725.5523 -15.770 \n",
|
|||
|
"2 146832.246000 99.00 13348.3860 -5.000 \n",
|
|||
|
"3 499157.405200 0.00 0.0000 10.295 \n",
|
|||
|
"4 111674.778176 0.00 0.0000 3.120 \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"2705 388386.136776 45.32 8569.8618 1.680 \n",
|
|||
|
"2706 69898.124200 0.00 0.0000 -13.600 \n",
|
|||
|
"2707 428216.789041 26.03 16450.8947 -7.030 \n",
|
|||
|
"2708 404157.579557 0.00 0.0000 4.285 \n",
|
|||
|
"2709 311273.651200 0.00 0.0000 22.400 \n",
|
|||
|
"\n",
|
|||
|
" age - charges age - children bmi - charges bmi - children \\\n",
|
|||
|
"index \n",
|
|||
|
"0 -16865.92400 19.0 -16857.02400 27.900 \n",
|
|||
|
"1 -1707.55230 17.0 -1691.78230 32.770 \n",
|
|||
|
"2 -4421.46200 25.0 -4416.46200 30.000 \n",
|
|||
|
"3 -21951.47061 33.0 -21961.76561 22.705 \n",
|
|||
|
"4 -3834.85520 32.0 -3837.97520 28.880 \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"2705 -8522.86180 46.0 -8524.54180 44.320 \n",
|
|||
|
"2706 -1999.17700 21.0 -1985.57700 34.600 \n",
|
|||
|
"2707 -16431.89470 18.0 -16424.86470 25.030 \n",
|
|||
|
"2708 -21572.38229 23.0 -21576.66729 18.715 \n",
|
|||
|
"2709 -9796.43200 54.0 -9818.83200 31.600 \n",
|
|||
|
"\n",
|
|||
|
" charges - children \n",
|
|||
|
"index \n",
|
|||
|
"0 16884.92400 \n",
|
|||
|
"1 1724.55230 \n",
|
|||
|
"2 4446.46200 \n",
|
|||
|
"3 21984.47061 \n",
|
|||
|
"4 3866.85520 \n",
|
|||
|
"... ... \n",
|
|||
|
"2705 8568.86180 \n",
|
|||
|
"2706 2020.17700 \n",
|
|||
|
"2707 16449.89470 \n",
|
|||
|
"2708 21595.38229 \n",
|
|||
|
"2709 9850.43200 \n",
|
|||
|
"\n",
|
|||
|
"[2710 rows x 37 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 602,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import featuretools as ft\n",
|
|||
|
"\n",
|
|||
|
"es = ft.EntitySet(id='insurance')\n",
|
|||
|
"\n",
|
|||
|
"es = es.add_dataframe(dataframe_name=\"insurance_data\", dataframe=df, index='index')\n",
|
|||
|
"\n",
|
|||
|
"agg_primitives = [\"sum\", \"mean\", \"median\", \"std\", \"max\", \"min\", \"count\"]\n",
|
|||
|
"trans_primitives = [\"add_numeric\", \"multiply_numeric\", \"divide_numeric\", \"subtract_numeric\"]\n",
|
|||
|
"\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(\n",
|
|||
|
" entityset=es,\n",
|
|||
|
" target_dataframe_name='insurance_data',\n",
|
|||
|
" agg_primitives=agg_primitives,\n",
|
|||
|
" trans_primitives=trans_primitives,\n",
|
|||
|
" max_depth=2\n",
|
|||
|
")\n",
|
|||
|
"feature_matrix"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Были сконструированы признаки с помощью Featuretools"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 603,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['age', 'bmi', 'children', 'smoker_yes', 'sex_male', 'region_northwest',\n",
|
|||
|
" 'region_southeast', 'region_southwest', 'children_1', 'children_2',\n",
|
|||
|
" 'children_3', 'children_4', 'parent_yes', 'age_norm', 'age_stand'],\n",
|
|||
|
" dtype='object')\n",
|
|||
|
"0.02249455451965332 33039788.648656577 0.7496335938888106\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"Text(0, 0.5, 'Прогнозируемая цена')"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 603,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAlAAAAGwCAYAAABmTltaAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADBQUlEQVR4nOzdd3zV9fX48ddn3H2zIYQRNooiDkARR7WVSluctVXrwlVHcRStrbbWDm21tnWWatUqturX0W/bX53IF1eLqAUEZSlTRsgg496buz7z98dNLgkEzQ0JWef5eOQh3Hxyc3K95J77fp/3OYrrui5CCCGEEKLd1O4OQAghhBCit5EESgghhBAiR5JACSGEEELkSBIoIYQQQogcSQIlhBBCCJEjSaCEEEIIIXIkCZQQQgghRI707g6gr3Ach4qKCvLy8lAUpbvDEUIIIUQ7uK5LLBZjyJAhqGr715UkgeokFRUVlJeXd3cYQgghhOiArVu3MmzYsHZfLwlUJ8nLywMy/wPy8/O7ORohhBBCtEc0GqW8vDz7Ot5ekkB1kuZtu/z8fEmghBBCiF4m1/IbKSIXQgghhMiRJFBCCCGEEDmSBEoIIYQQIkeSQAkhhBBC5EgSKCGEEEKIHEkCJYQQQgiRI0mghBBCCCFyJAmUEEIIIUSOujWB+vnPf46iKK0+xo8fn/18KpVi9uzZlJSUEA6HOeuss6iqqmp1H1u2bGHmzJkEg0FKS0u56aabsCyr1TVvvfUWkyZNwufzMXbsWObNm7dHLHPnzmXkyJH4/X6mTp3KBx980CU/sxBCCCF6v25fgZowYQI7duzIfvznP//Jfm7OnDm8+OKLvPDCC7z99ttUVFTwzW9+M/t527aZOXMmhmHw7rvv8uSTTzJv3jxuu+227DWbNm1i5syZfPnLX2b58uV8//vf5/LLL2f+/PnZa5577jluuOEGfvazn7Fs2TIOO+wwZsyYQXV19f55EIQQQgjRu7jd6Gc/+5l72GGHtfm5hoYG1+PxuC+88EL2tjVr1riAu3jxYtd1XfeVV15xVVV1Kysrs9c89NBDbn5+vptOp13Xdd0f/vCH7oQJE1rd9znnnOPOmDEj+/ejjjrKnT17dvbvtm27Q4YMce+88869xp5KpdxIJJL92Lp1qwu4kUik/Q+AEEIIIbpVJBLp0Ot3t69ArVu3jiFDhjB69GjOP/98tmzZAsDSpUsxTZPp06dnrx0/fjzDhw9n8eLFACxevJiJEycyaNCg7DUzZswgGo2yatWq7DUt76P5mub7MAyDpUuXtrpGVVWmT5+evaYtd955JwUFBdmP8vLyfXwkhBBCCNFbdGsCNXXqVObNm8drr73GQw89xKZNmzj++OOJxWJUVlbi9XopLCxs9TWDBg2isrISgMrKylbJU/Pnmz/3eddEo1GSySQ7d+7Etu02r2m+j7bccsstRCKR7MfWrVs79BgIIYQQovfRu/Obf/3rX8/++dBDD2Xq1KmMGDGC559/nkAg0I2RfTGfz4fP5+vuMIQQQgjRDbp9C6+lwsJCDjjgANavX09ZWRmGYdDQ0NDqmqqqKsrKygAoKyvb41Re89+/6Jr8/HwCgQADBgxA07Q2r2m+DyGEEELsf47jsHDhwu4Oo009KoFqbGxkw4YNDB48mMmTJ+PxeFo9cJ988glbtmxh2rRpAEybNo2PP/641Wm5BQsWkJ+fz8EHH5y9ZvcHf8GCBdn78Hq9TJ48udU1zf/Dmq8RQgghxP61fft2Tj75ZL761a+ydu3a7g5nT11U1N4uN954o/vWW2+5mzZtchctWuROnz7dHTBggFtdXe26ruteddVV7vDhw9033njDXbJkiTtt2jR32rRp2a+3LMs95JBD3JNPPtldvny5+9prr7kDBw50b7nlluw1GzdudIPBoHvTTTe5a9ascefOnetqmua+9tpr2WueffZZ1+fzufPmzXNXr17tXnHFFW5hYWGr03
1fpKNV/EIIIYRo7YUXXnCLiorcIUOGuAsWLOjS79XR1+9uTaDOOeccd/Dgwa7X63WHDh3qnnPOOe769euzn08mk+73vvc9t6ioyA0Gg+6ZZ57p7tixo9V9bN682f3617/uBgIBd8CAAe6NN97omqbZ6po333zTPfzww12v1+uOHj3afeKJJ/aI5cEHH3SHDx/uer1e96ijjnLfe++9nH4WSaCEEEKIfffkk0+6gPutb33Lra2t7fLv19HXb8V1Xbdbl8D6iGg0SkFBAZFIhPz8/O4ORwghhOhV6urqKC4uJh6P89JLL3H22WejKEqXf9+Ovn73qBooIYQQQvQvpmly2223MWrUKDZu3EgoFOKcc87ZL8nTvujWNgZCCCGE6L/WrVvHBRdcwNKlS/nZz37G8OHDuzukdpMESgghhBD73Ysvvsh3vvMdBg8ezKJFi5g6dWp3h5QT2cITQgghxH7TXHo9YcIEZs2axYcfftjrkieQBEoIIYQQ+8n8+fOzI9tGjx7N3LlzCYfD3R1Wh0gCJYQQQogulUwmuf766/na175GKBQilUp1d0j7TGqghBBCCNFlPvroI8477zzWr1/P/fffzzXXXIOq9v71G0mghBBCCNFlqqqq0HWdJUuWcMghh3R3OJ2m96eAQgghhOhRtm3bxq233orjOHz1q19l6dKlfSp5AkmghBBCCNGJXnjhBQ499FDmzZvH1q1bAdA0rZuj6nySQAkhhBBin0WjUWbNmsXZZ5/N9OnT+eijjxgxYkR3h9VlpAZKCCGEEPvsmWee4e9//zvz5s3joosu6vGjWPaVrEAJIYQQokNM02T+/PkAXHHFFaxevZpZs2b1+eQJJIESQgghRAd8+umnHHvssZx66qls3boVVVUpLy/v7rD2G0mghBBCCNFuruvyyCOPcMQRR9DQ0MB//vOffpU4NZMESgghhBDt9sADD3DllVdy/vnns2zZMo466qjuDqlbSBG5EEIIIb5QdXU1paWlXHzxxYwZM4ZTTjmlu0PqVrICJYQQQoi9SiaTXHvttRxwwAHs2LGDgoKCfp88gaxACSGEEGIvli9fznnnncemTZv47W9/S1lZWXeH1GPICpQQQggh9vD0009z1FFH4fV6WbJkCddcc02/aE/QXpJACSGEECLLdV0AJk+ezA033MD777/PhAkTujmqnkcSKCGEEEIA8Nxzz3HccceRSCQYP348d911Fz6fr7vD6pEkgRJCCCH6uUgkwkUXXcS5555LeXk5pml2d0g9nhSRCyGEEP3Yu+++y3nnnUddXR1/+ctfuOCCC6TWqR0kgRJCCCH6sZ07dzJs2DDefPNNRo0a1d3h9BqyhSeEEEL0M59++ik//vGPcV2X0047jXfeeUeSpxxJAiWEEEL0Ey3n2P3tb3+jpqYGAFWVdCBX8ogJIYQQ/UBNTQ1nnHEGV155JRdccAEffvghpaWl3R1WryU1UEIIIUQ/MG/ePN59913+3//7f5x22mndHU6vp7jNHbPEPolGoxQUFBCJRMjPz+/ucIQQQgiSySRvvfUWX//617Esi7q6Oll12k1HX79lC08IIYTogz788EMmT57M2WefTW1tLbquS/LUiSSBEkIIIfoQ27a5++67mTp1Kj6fj/fff5+SkpLuDqvPkQRKCCGE6EPuuOMObr75ZubMmcN7773HwQcf3N0h9UlSRC6EEEL0ARUVFQwZMoRrrrmGE044gRNPPLG7Q+rTZAVKCCGE6MUikQgXXHABhx56KHV1dZSUlEjytB/ICpQQQgjRS73zzjtceOGFNDQ0MHfuXIqKiro7pH5DVqCEEEKIXmju3LmceOKJjBgxghUrVsgQ4P1MEighhBCiF3EcB4Bjjz2WX/3qV7z55puMHDmye4PqhySBEkIIIXoB13V5+OGH+dKXvoRhGBx++OHccsstaJrW3aH1S5JACSGEED1cdXU1p512GldffTUTJ07Etu3uDqnfkyJyIYQQogd77b
XXmDVrFq7r8q9//YtTTz21u0MSSAIlhBBC9GgNDQ1MmTKFxx9/nEGDBnV3OKKJbOEJIYQQPcyyZcv4yU9+AsC5557
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"train_X = train_df.drop(\"charges\", axis=1)\n",
|
|||
|
"train_Y = train_df['charges']\n",
|
|||
|
"\n",
|
|||
|
"test_X = test_df.drop(\"charges\", axis=1)\n",
|
|||
|
"test_Y = test_df['charges']\n",
|
|||
|
"\n",
|
|||
|
"val_X = val_df.drop(\"charges\", axis=1)\n",
|
|||
|
"val_Y = val_df['charges']\n",
|
|||
|
"\n",
|
|||
|
"train_X = train_X.drop(['smoker', 'sex', 'region', 'age_category', 'bmi_category'], axis=1)\n",
|
|||
|
"test_X = test_X.drop(['smoker', 'sex', 'region', 'age_category', 'bmi_category'], axis=1)\n",
|
|||
|
"val_X = val_X.drop(['smoker', 'sex', 'region', 'age_category', 'bmi_category'], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"print(train_X.columns)\n",
|
|||
|
"\n",
|
|||
|
"from sklearn.linear_model import LinearRegression\n",
|
|||
|
"import time \n",
|
|||
|
"from sklearn.metrics import mean_squared_error, r2_score\n",
|
|||
|
"\n",
|
|||
|
"model = LinearRegression()\n",
|
|||
|
"\n",
|
|||
|
"start_time = time.time()\n",
|
|||
|
"model.fit(train_X, train_Y)\n",
|
|||
|
"train_time = time.time() - start_time\n",
|
|||
|
"\n",
|
|||
|
"val_predictions = model.predict(val_X)\n",
|
|||
|
"mse = mean_squared_error(val_Y, val_predictions)\n",
|
|||
|
"r2 = r2_score(val_Y, val_predictions)\n",
|
|||
|
"\n",
|
|||
|
"print(train_time, mse, r2)\n",
|
|||
|
"\n",
|
|||
|
"plt.scatter(val_Y, val_predictions, alpha=0.5)\n",
|
|||
|
"plt.plot([val_Y.min(), val_Y.max()], [val_Y.min(), val_Y.max()], 'k--', lw=1)\n",
|
|||
|
"plt.xlabel('Фактическая цена')\n",
|
|||
|
"plt.ylabel('Прогнозируемая цена')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Обученная модель довольно точно предсказывает цены ниже двух тысяч, для цен более двух тысяч модель занижает или завышает цены"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "aimenv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.5"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|