commit 54a8c00fe118568c75ba6410371f9e4210cc1a1e Author: ksenia_nevaeva Date: Fri Jan 17 06:33:19 2025 +0400 1 and 2 lab diff --git a/lec1.ipynb b/lec1.ipynb new file mode 100644 index 0000000..d3e30f8 --- /dev/null +++ b/lec1.ipynb @@ -0,0 +1,713 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Работа с NumPy" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "matrix = \n", + " [[4 5 0]\n", + " [9 9 9]] \n", + "\n", + "tmatrix = \n", + " [[4 9]\n", + " [5 9]\n", + " [0 9]] \n", + "\n", + "vector = \n", + " [4 5 0 9 9 9] \n", + "\n", + "tvector = \n", + " [[4]\n", + " [5]\n", + " [0]\n", + " [9]\n", + " [9]\n", + " [9]] \n", + "\n", + "list_matrix = \n", + " [array([4, 5, 0]), array([9, 9, 9])] \n", + "\n", + "matrix as str = \n", + " [[4 5 0]\n", + " [9 9 9]] \n", + "\n", + "matrix type is \n", + "\n", + "vector type is \n", + "\n", + "list_matrix type is \n", + "\n", + "str_matrix type is \n", + "\n", + "formatted_vector = \n", + " 4; 5; 0; 9; 9; 9 \n", + "\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "matrix = np.array([[4, 5, 0], [9, 9, 9]])\n", + "print(\"matrix = \\n\", matrix, \"\\n\")\n", + "\n", + "tmatrix = matrix.T\n", + "print(\"tmatrix = \\n\", tmatrix, \"\\n\")\n", + "\n", + "vector = np.ravel(matrix)\n", + "print(\"vector = \\n\", vector, \"\\n\")\n", + "\n", + "tvector = np.reshape(vector, (6, 1))\n", + "print(\"tvector = \\n\", tvector, \"\\n\")\n", + "\n", + "list_matrix = list(matrix)\n", + "print(\"list_matrix = \\n\", list_matrix, \"\\n\")\n", + "\n", + "str_matrix = str(matrix)\n", + "print(\"matrix as str = \\n\", str_matrix, \"\\n\")\n", + "\n", + "print(\"matrix type is\", type(matrix), \"\\n\")\n", + "\n", + "print(\"vector type is\", type(vector), \"\\n\")\n", + "\n", + "print(\"list_matrix type is\", type(list_matrix), \"\\n\")\n", + "\n", + "print(\"str_matrix type is\", 
type(str_matrix), \"\\n\")\n", + "\n", + "formatted_vector = \"; \".join(map(str, vector))\n", + "print(\"formatted_vector = \\n\", formatted_vector, \"\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Работа с Pandas DataFrame\n", + "\n", + "https://pandas.pydata.org/docs/user_guide/10min.html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Работа с данными - чтение и запись CSV" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\"data/titanic.csv\", index_col=\"PassengerId\")\n", + "\n", + "df.to_csv(\"test.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Работа с данными - основные команды" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 891 entries, 1 to 891\n", + "Data columns (total 11 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Survived 891 non-null int64 \n", + " 1 Pclass 891 non-null int64 \n", + " 2 Name 891 non-null object \n", + " 3 Sex 891 non-null object \n", + " 4 Age 714 non-null float64\n", + " 5 SibSp 891 non-null int64 \n", + " 6 Parch 891 non-null int64 \n", + " 7 Ticket 891 non-null object \n", + " 8 Fare 891 non-null float64\n", + " 9 Cabin 204 non-null object \n", + " 10 Embarked 889 non-null object \n", + "dtypes: float64(2), int64(4), object(5)\n", + "memory usage: 83.5+ KB\n", + " count mean std min 25% 50% 75% max\n", + "Survived 891.0 0.383838 0.486592 0.00 0.0000 0.0000 1.0 1.0000\n", + "Pclass 891.0 2.308642 0.836071 1.00 2.0000 3.0000 3.0 3.0000\n", + "Age 714.0 29.699118 14.526497 0.42 20.1250 28.0000 38.0 80.0000\n", + "SibSp 891.0 0.523008 1.102743 0.00 0.0000 0.0000 1.0 8.0000\n", + "Parch 891.0 0.381594 0.806057 0.00 0.0000 0.0000 0.0 6.0000\n", + 
"Fare 891.0 32.204208 49.693429 0.00 7.9104 14.4542 31.0 512.3292\n", + " Survived Pclass Sex Age SibSp Parch Fare Cabin\n", + "PassengerId \n", + "1 0 3 male 22.0 1 0 7.2500 NaN\n", + "2 1 1 female 38.0 1 0 71.2833 C85\n", + "3 1 3 female 26.0 0 0 7.9250 NaN\n", + "4 1 1 female 35.0 1 0 53.1000 C123\n", + "5 0 3 male 35.0 0 0 8.0500 NaN\n", + " Survived Pclass Sex Age SibSp Parch Fare Cabin\n", + "PassengerId \n", + "887 0 2 male 27.0 0 0 13.00 NaN\n", + "888 1 1 female 19.0 0 0 30.00 B42\n", + "889 0 3 female NaN 1 2 23.45 NaN\n", + "890 1 1 male 26.0 0 0 30.00 C148\n", + "891 0 3 male 32.0 0 0 7.75 NaN\n", + " Survived Pclass Sex Age SibSp Parch Fare Cabin\n", + "PassengerId \n", + "804 1 3 male 0.42 0 1 8.5167 NaN\n", + "756 1 2 male 0.67 1 1 14.5000 NaN\n", + "470 1 3 female 0.75 2 1 19.2583 NaN\n", + "645 1 3 female 0.75 2 1 19.2583 NaN\n", + "79 1 2 male 0.83 0 2 29.0000 NaN\n", + " Survived Pclass Sex Age SibSp Parch Fare Cabin\n", + "PassengerId \n", + "860 0 3 male NaN 0 0 7.2292 NaN\n", + "864 0 3 female NaN 8 2 69.5500 NaN\n", + "869 0 3 male NaN 0 0 9.5000 NaN\n", + "879 0 3 male NaN 0 0 7.8958 NaN\n", + "889 0 3 female NaN 1 2 23.4500 NaN\n" + ] + } + ], + "source": [ + "df.info()\n", + "\n", + "print(df.describe().transpose())\n", + "\n", + "cleared_df = df.drop([\"Name\", \"Ticket\", \"Embarked\"], axis=1)\n", + "print(cleared_df.head())\n", + "print(cleared_df.tail())\n", + "\n", + "sorted_df = cleared_df.sort_values(by=\"Age\")\n", + "print(sorted_df.head())\n", + "print(sorted_df.tail())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Работа с данными - работа с элементами" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PassengerId\n", + "1 22.0\n", + "2 38.0\n", + "3 26.0\n", + "4 35.0\n", + "5 35.0\n", + " ... 
\n", + "887 27.0\n", + "888 19.0\n", + "889 NaN\n", + "890 26.0\n", + "891 32.0\n", + "Name: Age, Length: 891, dtype: float64\n", + "Survived 0\n", + "Pclass 2\n", + "Name Kantor, Mr. Sinai\n", + "Sex male\n", + "Age 34.0\n", + "SibSp 1\n", + "Parch 0\n", + "Ticket 244367\n", + "Fare 26.0\n", + "Cabin NaN\n", + "Embarked S\n", + "Name: 100, dtype: object\n", + "Kantor, Mr. Sinai\n", + " Age Name\n", + "PassengerId \n", + "100 34.0 Kantor, Mr. Sinai\n", + "101 28.0 Petranec, Miss. Matilda\n", + "102 NaN Petroff, Mr. Pastcho (\"Pentcho\")\n", + "103 21.0 White, Mr. Richard Frasar\n", + "104 33.0 Johansson, Mr. Gustaf Joel\n", + "... ... ...\n", + "196 58.0 Lurette, Miss. Elise\n", + "197 NaN Mernagh, Mr. Robert\n", + "198 42.0 Olsen, Mr. Karl Siegwart Andreas\n", + "199 NaN Madigan, Miss. Margaret \"Maggie\"\n", + "200 24.0 Yrois, Miss. Henriette (\"Mrs Harbeck\")\n", + "\n", + "[101 rows x 2 columns]\n", + " Survived Pclass \\\n", + "PassengerId \n", + "1 0 3 \n", + "2 1 1 \n", + "3 1 3 \n", + "\n", + " Name Sex Age \\\n", + "PassengerId \n", + "1 Braund, Mr. Owen Harris male 22.0 \n", + "2 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 \n", + "3 Heikkinen, Miss. Laina female 26.0 \n", + "\n", + " SibSp Parch Ticket Fare Cabin Embarked \n", + "PassengerId \n", + "1 1 0 A/5 21171 7.2500 NaN S \n", + "2 1 0 PC 17599 71.2833 C85 C \n", + "3 0 0 STON/O2. 3101282 7.9250 NaN S \n", + "Survived 0\n", + "Pclass 3\n", + "Name Braund, Mr. 
Owen Harris\n", + "Sex male\n", + "Age 22.0\n", + "SibSp 1\n", + "Parch 0\n", + "Ticket A/5 21171\n", + "Fare 7.25\n", + "Cabin NaN\n", + "Embarked S\n", + "Name: 1, dtype: object\n", + " Survived Pclass\n", + "PassengerId \n", + "4 1 1\n", + "5 0 3\n", + " Survived Pclass\n", + "PassengerId \n", + "4 1 1\n", + "5 0 3\n" + ] + } + ], + "source": [ + "print(df[\"Age\"])\n", + "\n", + "print(df.loc[100])\n", + "\n", + "print(df.loc[100, \"Name\"])\n", + "\n", + "print(df.loc[100:200, [\"Age\", \"Name\"]])\n", + "\n", + "print(df[0:3])\n", + "\n", + "print(df.iloc[0])\n", + "\n", + "print(df.iloc[3:5, 0:2])\n", + "\n", + "print(df.iloc[[3, 4], [0, 1]])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Работа с данными - отбор и группировка" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['male' 'female']\n", + "male count = 577\n", + "female count = 314\n", + "Total count = 891\n", + " Pclass Survived Count\n", + "0 1 0 80\n", + "1 1 1 136\n", + "2 2 0 97\n", + "3 2 1 87\n", + "4 3 0 372\n", + "5 3 1 119\n" + ] + } + ], + "source": [ + "s_values = df[\"Sex\"].unique()\n", + "print(s_values)\n", + "\n", + "s_total = 0\n", + "for s_value in s_values:\n", + " count = df[df[\"Sex\"] == s_value].shape[0]\n", + " s_total += count\n", + " print(s_value, \"count =\", count)\n", + "print(\"Total count = \", s_total)\n", + "\n", + "print(df.groupby([\"Pclass\", \"Survived\"]).size().reset_index(name=\"Count\")) # type: ignore" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Визуализация - Исходные данные" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Pclass Survived Age\n", + "PassengerId \n", + "1 3 0 22.0\n", + "2 1 1 38.0\n", + "3 3 1 26.0\n", + "4 1 1 35.0\n", + "5 3 0 35.0\n", + "... ... ... 
...\n", + "886 3 0 39.0\n", + "887 2 0 27.0\n", + "888 1 1 19.0\n", + "890 1 1 26.0\n", + "891 3 0 32.0\n", + "\n", + "[714 rows x 3 columns]\n" + ] + } + ], + "source": [ + "data = df[[\"Pclass\", \"Survived\", \"Age\"]].copy()\n", + "data.dropna(subset=[\"Age\"], inplace=True)\n", + "print(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Визуализация - Сводка пяти чисел\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Age \n", + " min q1 q2 median q3 max\n", + "Pclass \n", + "1 0.92 27.0 37.0 37.0 49.0 80.0\n", + "2 0.67 23.0 29.0 29.0 36.0 70.0\n", + "3 0.42 18.0 24.0 24.0 32.0 74.0\n", + " Age \n", + " low_iqr iqr high_iqr\n", + "Pclass \n", + "1 0.0 22.0 82.0\n", + "2 3.5 13.0 55.5\n", + "3 0.0 14.0 53.0\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def q1(x):\n", + " return x.quantile(0.25)\n", + "\n", + "\n", + "# median = quantile(0.5)\n", + "def q2(x):\n", + " return x.quantile(0.5)\n", + "\n", + "\n", + "def q3(x):\n", + " return x.quantile(0.75)\n", + "\n", + "\n", + "def iqr(x):\n", + " return q3(x) - q1(x)\n", + "\n", + "\n", + "def low_iqr(x):\n", + " return max(0, q1(x) - 1.5 * iqr(x))\n", + "\n", + "\n", + "def high_iqr(x):\n", + " return q3(x) + 1.5 * iqr(x)\n", + "\n", + "\n", + "quantiles = data[[\"Pclass\", \"Age\"]].groupby([\"Pclass\"]).aggregate([\"min\", q1, q2, \"median\", q3, \"max\"])\n", + "print(quantiles)\n", + "\n", + "iqrs = data[[\"Pclass\", \"Age\"]].groupby([\"Pclass\"]).aggregate([low_iqr, iqr, high_iqr])\n", + "print(iqrs)\n", + "\n", + "data.boxplot(column=\"Age\", by=\"Pclass\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Визуализация - Гистограмма" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "data.plot.hist(column=[\"Age\"], bins=80)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Визуализация - Точечная диаграмма" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlEAAAGwCAYAAACJjDBkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/GU6VOAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAw8UlEQVR4nO3deXRUVb728acyQghJIGEmgUAYRAwEAxIG0QZRRL04cmlEaBVbhRbQRkARnBB9udxmUnEAAbVlukLT0toiQhSkhQgBgRYCBIIyQwZIEITs9w+bMkVVhdQmsZL4/ayVtahdu3b9dp1zdj1UTp04jDFGAAAA8EmAvwsAAACoiAhRAAAAFghRAAAAFghRAAAAFghRAAAAFghRAAAAFghRAAAAFoL8XUBlVVhYqAMHDqh69epyOBz+LgcAAJSAMUYnT55U/fr1FRBQ/GdNhKgycuDAAcXGxvq7DAAAYGH//v1q2LBhsX0IUWWkevXqkn7eCBEREX6uBgAAlEReXp5iY2Od7+PFIUSVkQu/wouIiCBEAQBQwZTkVBxOLAcAALBAiAIAALBAiAIAALBAiAIAALBAiAIAALBAiAIAALBAiAIAALBAiAIAALBAiAIAALDg1xBljNFDDz2kmjVryuFwKD093S917N2716/PDwAAKh6//tmXTz75RHPmzNHq1avVpEkTxcTE+LOcCid1xxEtTf9BDkm3t2uors1qOe97YkG6vt57XClNYjTp7jbO/unf56hdXA2Xvs8v26av9hxTl4RaGntLK2f7jJUZWrv7mLo2q6VHr08odow9R09p34kCNY6upviYai41Xtzf07jFjeGt3dtr4qm+os95U+u62neiQF/uPKp/H8xzq8PTa+etveOEFTp88qzqRYTqq6d6OPs+NHeDvtmfrfaNamrmgORi6/PWt+vLK/VD7o+Kjaqq1FG/K3Z7JY7/RHlnzisyNFCbn7up2L7e2r3N21t9ntq7T1qlfdkFio+upjfuS3Zut4nLt3scY8H6LK3LPK7OTWN0daMa2neiQB98vU/bD+a51eFtv+n/5jptOZCrtg2j9O6DHYut2du+5Mvr4a3dU31F53d38i9/kNzbfuqpv7fn88aXY86XNcGXY9xbf2+vhy/Hvre+3sb2xJc1xVt/b/PGb4vDGGP89eQzZszQpEmTtG/fPn+VIOnnT6Li4+O1adMmtW3btlTGzMvLU2RkpHJzc0v9b+ftO56vW6evUd6P51zaI6oE6k/XJ2jCxzvcHhMW5FDBuV82dY2wYP35huZ6+m/b3PqO7NFMkz7LcGsPDw3UqTPnXcZ474EOeuWTnfoi46iz/dpmtfTkTc01YNZ6ZRf85GyvFhKg/LOFbuO+fV+y5q3b5zbGi31aa+zSrW7t0/slKTIs2O016fPqWpfnqxEWrHG3XKERC7e4PacnQ69tohlf7HFrv7d9Q7234fsSjdGuQXVt/OGkW/ufezbXrDWZLvWFBUoF5926KjkuQmlZeW
7t1ybU1Be7TpSojsT64dpy4JRb+5Bu8Xo1NbNEY9yVVF+LNx1wa+/Tpq6Wbj5UojG8GXp9vGam7tW5wksvP0Oui9erq91r7t+hod5fX7LtMq53S63eedxtX7o9qV6J94/+1zTU+1+7P9/Qbk00I9V9vwl0SOeLTC8owKGZ/dtp5P9tcdtPJ92ZqIff3+jyejgkeXp1pvVtq9uSGri1ezoGLj5mL5h6Txs9+9F2tzpG3tBcT3lYE66sH6FtB37ZJ69tVktP3thcA2avdxvD05rQLjZSW37I1bkih39QgEPvPdBBr63eU6Jjv1PTaBkjrdtz3KXvI92aaMDs9S6vXVCAQ8uGdFarBpEu88gpOKvHPkgv0ZrirX+HxjW04/BJ5Z7+Zf2tERasZUO6KDY6zG0MVDy+vH/7LUQNGjRIc+fOdd5u1KiR9uzZo1deeUVvvvmmDh06pObNm+uZZ57RXXfdJUlavXq1rr/+en3yyScaPXq0vvvuO6WkpGj+/Pn65ptv9Pjjj+uHH37QLbfcorffflthYT/v0J988olefPFFbd26VYGBgUpJSdHUqVPVtGlTSZ5D1NatWzVy5Eh9+eWXqlatmnr27Km//OUvJf60rCxDVNLzn7osXP4UFOCQMdL5IrtRoMMhh0MleoMs+piLx4ioGqS80+fc2jsnxGjeAx1cHl+eXhOUT572sfP++z/kZdn7cm+3tl/zGCjuGPe0JlxqrJIc+yV57MV17HrpZpe2+2at19pdx0q0pnjr702NsGBtGtfzkv1Q/vny/u23c6KmTp2q559/Xg0bNtTBgwe1YcMGTZw4UfPmzdPMmTO1bds2jRgxQvfee69SU1NdHvvss89qxowZ+uqrr7R//37dc889mjJliv76179q+fLl+vTTTzV9+nRn//z8fD3++ONKS0vTypUrFRAQoNtvv12Fhe6fikhSTk6Ofve73ykpKUlpaWn65JNPdPjwYd1zzz1e53PmzBnl5eW5/JSF1B1HylVYOFdo3BaY88b4FKAuPObi29kFP3ls/yLjqDKP5TvbyttrgvLJ075UUY1ctNnl9q99DBR3jHtaEy411sW3PR37JXnsxXUsStvvvL3n6Cl9kXG0RGtKcf29yS74SV8W+cQKvw1+OycqMjJS1atXV2BgoOrWraszZ87opZde0meffaaUlBRJUpMmTbRmzRq98cYb6tatm/OxL774ojp37ixJeuCBBzRmzBjt3r1bTZo0kSTdddddWrVqlUaNGiVJuvPOO12ee/bs2apVq5a2b9+u1q1bu9U2Y8YMJSUl6aWXXnJ5TGxsrHbu3KnmzZu7PWbixIl67rnnLvNVubT073PK/Dkqgr3H853nJvCa4Ldm3Z5jLrc5Bjxbu/uY8/yofScKiu1bdE0pSX9PNmZlc37Ub0y5ucTBrl27VFBQoBtuuEHh4eHOn3nz5mn37t0ufRMTE53/rlOnjsLCwpwB6kLbkSNHnLczMjLUr18/NWnSRBEREWrcuLEkKSsry2Mtmzdv1qpVq1zqaNmypSS51XLBmDFjlJub6/zZv3+/x36Xq23DqDIZt6JpHP3LYsdrgt+alCaupxVwDHjWuekvr1OjmsWfr1R0TSlJf0/axdXw+TGo2Pz67byiTp36+STY5cuXq0ED15MmQ0NDXW4HB/9yAqDD4XC5faGt6K/qbr31VjVq1EhvvfWW6tevr8LCQrVu3Vpnz571Wsutt96qV155xe2+evXqeXxMaGioW51loVuL2qoRFlxufn3lr3Oiiv6Psby9JiifKtM5URd/S+/XPgYqyjlRRb+l16RWuK5tVsvrOVEXf0vPW39vaoQF8ynUb1C5+SSqVatWCg0NVVZWlhISElx+YmOL/7pqcY4fP64dO3Zo7Nix6t69u6644gplZ2cX+5h27dpp27Ztaty4sVst1apd+uuwZW3ZkC6KqOKefyOqBGpc7ys8PiYsyOFyu0ZYsF6+3f1XmZI0sqf7ryuln7/pc/
EYy4Z0VucE1/8Vd06I0bIhnVXjom+7hId43t1m3ZfsZYwuHtun90tyG2PZkC5uz1cjLFjT+rb1+Jye/On6ph7b7+tY8v0vOdbzSYgjezZ3qy8s0GNXdWwc6bH9+ubRJa6jbYPqHtu9zdGTvsnu3wCTpNvbev6PhC/+dH1TBQU4Lt1RpbNdxvW+wuO+5Mv+4e35vNUXeNH0ggIcmnVfssf9dNZ9yW6vh7dXx1vNno6Bi4/ZomN4qsPbmnBlfdf92tsx7m1NaBcbpaCLDv+gAIcWDO5Y4mO/U9NopTSJduu7YHBHt9fuwrfzLja9X1KJ1xRv/Ts0rqnIqq7r74Vv5+G3x6+XOJgyZYqmTJmivXv3SpLGjh2rmTNnavLkyerSpYtyc3O1du1aRUREaODAgc5v52VnZysqKkqSNGfOHA0fPlw5OTnOcZ999lktXbpU6enpKiwsVO3atdWrVy+NHz9eWVlZGj16tDZs2KAlS5aoT58+bt/OO3DggNq2batu3brpySefVM2aNbVr1y7Nnz9fb7/9tgIDvbz7FVGW38674MuMo1qy8QdJxu06USMXbda6Pcdcri3zZcZRbczKdruuyYsfbdeaXUfdriX02qpd+jLjqMu1ZbyNkXksX3uP57tdd8VTf0/jFjeGt3Zvr4mn+oo+Z6+r6mnv8XytzTimbQdy3erw9Np5a+/00mc6mHfG7TpRD7+bpg37Trhdp8hTfd76dnvlc+3POe12nShP26vN+E+U6+E6Ud62rad2b/P2Vp+n9hsmr1bm8XzFR1fTmwPbO7fbKx//2+MYi9L2a+3uY+rcNEbJjWtq7/F8LVifpa0Hct3q8LbfDHj7X0r/PsflOlHeava2L/nyenhr91Rf0fkV/VTE237qqb+35/PGl2POlzXBl2PcW39vr4cvx763vt7G9sSXNcVbf2/zRsXn0/u38aO//OUvplGjRs7bhYWFZsqUKaZFixYmODjY1KpVy9x4440mNTXVGGPMqlWrjCSTnZ3tfMw777xjIiMjXcYdP368adOmjfP2ihUrzBVXXGFCQ0NNYmKiWb16tZFklixZYowxJjMz00gymzZtcj5m586d5vbbbzdRUVGmatWqpmXLlmb48OGmsLCwRHPLzc01kkxubq4vLwkAAPAjX96//fpJVGX2a3wSBQAASleFuE4UAABARUaIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAg
AAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsECIAgAAsGAVolatWuX1vjfeeMO6GAAAgIrCKkTddNNNGjlypH766Sdn27Fjx3Trrbdq9OjRpVYcAABAeWX9SdSSJUvUvn17bd++XcuXL1fr1q2Vl5en9PT0Ui4RAACg/LEKUZ06dVJ6erpat26tdu3a6fbbb9eIESO0evVqNWrUqLRrBAAAKHesTyzfuXOn0tLS1LBhQwUFBWnHjh0qKCgozdoAAADKLasQ9fLLLyslJUU33HCDtm7dqvXr12vTpk1KTEzUunXrSrtGAACAcscqRE2dOlVLly7V9OnTVaVKFbVu3Vrr16/XHXfcoeuuu66USwQAACh/gmwe9O233yomJsalLTg4WJMmTdItt9xSKoUBAACUZ1afRMXExCgnJ0dvv/22xowZoxMnTkiSNm7cqISEhFItEAAAoDyy+iRqy5Yt6tGjhyIjI7V3714NHjxYNWvW1IcffqisrCzNmzevtOsEAAAoV6w+iRoxYoQGDRqkjIwMValSxdl+880364svvii14gAAAMorq0+i0tLS9Oabb7q1N2jQQIcOHbrsogAAAMo7q0+iQkNDlZeX59a+c+dO1apV67KLAgAAKO+sQtRtt92m559/3vm38xwOh7KysjRq1CjdeeedpVogAABAeWQVoiZPnqxTp06pdu3aOn36tLp166amTZsqPDxcEyZMKO0aAQAAyh2rc6IiIyO1YsUKrVmzRlu2bNGpU6d09dVXq3v37qVdHwAAQLnk0ydR69at00cffeS83aVLF1WrVk2vvfaa+vXrp4ceekhnzpwp9SIBAADKG59C1PPPP69t27Y5b3/77bcaPHiwbrjhBo0ePVp///vfNXHixFIvEgAAoLzxKUSlp6e7/Mpu/vz56tChg9566y09/vjjmjZtmhYuXFjqRQIAAJQ3PoWo7Oxs1alTx3k7NTVVvXr1ct5u37699u/fX3rVAQAAlFM+hag6deooMzNTknT27Flt3LhRHTt2dN5/8uRJBQcHl26FAAAA5ZBPIermm2/W6NGj9eWXX2rMmDEKCwtT165dnfdv2bJFTZs2LfUiAQAAyhufLnHwwgsv6I477lC3bt0UHh6uuXPnKiQkxHn/7Nmz1bNnz1IvEgAAoLxxGGOMrw/Kzc1VeHi4AgMDXdpPnDih8PBwl2D1W5WXl6fIyEjl5uYqIiLC3+UAAIAS8OX92/pim57UrFnTZjgAAIAKx+rPvgAAAPzWEaIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEK
IAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIAAAAsEKIkDRo0SH369PF3GQAAoAIJ8ncB8N2eo6f0deZx7Tx8Ul98d0y5P57V9S3raNLdbZx9ZqzM0Nrdx9S1WS09en2C1zZJen7ZNn2155i6JNTS2FtaXfK5950oUOPoaoqPqeZsT91xROnf56hdXA11bVar2HZvdXgbe8H6LK3LPK7OTWN0d3JssX298VSHtzG6T1qlfdkFio+uphVPXOdsf2juBn2zP1vtG9XUzAHJkqSOE1bo8MmzqhcRqq+e6uHse8XYf+j0OaOwIIe2v3izs71o//cGd9S+EwV69N00nT5nFB4coK0v9HL2TXrun8o+fU41qwZp4/gbne03/yVVu47lq3ntcE37fTvtO1Ggx+dvUvbpc6pdLVjzH+7knNfNU1IvWceFuru+vFI/5P6o2KiqSh31O49zeX1AsvN1HLkw3W2M5Bc+1bH8n9zqGPJumrPmj4Zd63GOix/trH0nCjTp438r83iBrqwXocWPdva4Xd64L7nYsT2N2zi6mt79aq/Hff2JBen6eu9xpTSJcR5H/d9cpy0HctW2YZTefbCjx/2gX4c45+tRWGjc9rGi26rovD3t097qKNp2S2I953McyD7tcQxP+6m3Y9yXNcFTbcWN4Ym3Oooei8YY57/X7zle4mPflzXI2+vva7sn3uoojXXMF0XraBBV1fk8WcfzPdbni7Ksu6LV4TDGGL88czkyaNAg5eTkaOnSpaU2Zl5eniIjI5Wbm6uIiIhSGTOn4KwefX+jvtp93Gufodc10YzVe0o03hM9EjT5s11u7bPuS1b3VnXcnvuxD9L1RcZRZ9u1zWrpyZuaa8Cs9cou+MnZXiMsWK//vp0e+etGl/bw0ECdOnPe/fkGJGvuv/a5jf1ItyYaMHu9zhX+sosGBTiU2CBCG/fnuvSd3i9JkWHBbmPvO56vPq+udakjqmqwmtUO14Z92S5jNIwM0V/TfnAbo1N8lL7KzHFr96Rm1QCdOF3o1h4dFqDjBe7tnkRVlXJOu7fH16yizBM/lmgMb2LCAnSshHXUCgvU0QL37VUa4qOqKDOnZHNp17C6Nn5/skzqeKBTnGZ9lVWivr1a1dbH24+UqG9oUIDOnHN/ne9p10Afph9w26eH/a6px2OxpIICHBrctbFeT80sUf8/90jQ/3h4vie6J2jyypLVMfS6eM1Y7f58CwZ31DVNo13aPt16UA+9t9Gt77R72mrxph
9cjn1vggKkxAZR2rg/x9l2bbNaevLG5how28Ma1L+dHnn/4jUoQKd/KtT5wqLjOvQ/d1+lPy/61m27TL4rUU8s3uLWvmxIZ7VqEOlSn6e1pkZYsP7fXVfpkfc2uY1xVf0Ibfq+ZOuYLzzV4U2NsGAtG9JFsdFhJRrb2/tAadTti7Kuw5f37woXoq677jpdddVVCgwM1Ny5cxUSEqIXX3xRv//97zV06FAtXrxYderU0fTp09WrVy+dP39eDz30kD7//HMdOnRIcXFxevTRRzVs2DDnmBeHqMLCQr3yyit68803dejQITVv3lzPPPOM7rrrrhLXWRYh6r5Z60u02JSGvS/3dnvutbuO6XyR3SXQ4ZDDIZfFwVagw+E29vkS7pqBDoc6J8Ro3gMd3O5Lev7TEi0mvjwfgOJdvH40Hr3ca9/LOfZKcw3yRVCAQ7teutmlraRrjTfFrWO+8LWOGmHB2jSuZ4n6ensfKI26fVHWdfjy/l0hz4maO3euYmJitH79ev3pT3/SI488orvvvludOnXSxo0b1bNnTw0YMEAFBQUqLCxUw4YNtWjRIm3fvl3jxo3TU089pYULF3odf+LEiZo3b55mzpypbdu2acSIEbr33nuVmprq9TFnzpxRXl6ey09p2nP01K8WoCTpxY+2uz33xQvdeWNKbfHyNLYvj/0i46gyj+W7tKfuOFLixYQABZSe11b98mnW88u2Fdv3co690lyDfHGu0GhR2n7nbV/WGm+8rWO+sKkju+AnfVmC95bi3gcut25flJc6LqiQIapNmzYaO3asmjVrpjFjxqhKlSqKiYnR4MGD1axZM40bN07Hjx/Xli1bFBwcrOeee07JycmKj49X//799Yc//MFriDpz5oxeeuklzZ49WzfeeKOaNGmiQYMG6d5779Ubb7zhtaaJEycqMjLS+RMbW/zvzX2170RBqY53KWt2/XJQ/drPbWvvcdeDJ/37HP8UAvzGFX1T/mrPMT9WUnbW7v5lXqW51ly8jvnCto6NWdmX7HOp94HLqdsX5aWOCyrkieWJiYnOfwcGBio6OlpXXXWVs61OnZ/P5zly5OfzF1599VXNnj1bWVlZOn36tM6ePau2bdt6HHvXrl0qKCjQDTfc4NJ+9uxZJSUlea1pzJgxevzxx5238/LySjVINapZst9Zl5YuCb+ccPhrP7etxtGuJxa2bRjln0KA37iiJyx3ahKj7w6d8mM1ZaNz0xjnv0tzrbl4HfOFbR3t4mpcss+l3gcup25flJc6LqiQn0QFB7ueOOZwOFzaHA6HpJ/PbZo/f77+/Oc/64EHHtCnn36q9PR0/eEPf9DZs2c9jn3q1M8H+/Lly5Wenu782b59uxYvXuy1ptDQUEVERLj8lKYmtcJ1reU3KWwU/ebMhecO/M/rekGgw6GgAMfFD7XiaWxfHntts1pu387o1qK2apTwJENfng9A8Yp+S2/cbVcW2/dyjr3SXIN8ERTgcPmWni9rjTfe1jFf2NRRIyy4RN/SK+594HLr9kV5qeOCChmifLF27Vp16tRJjz76qJKSkpSQkKDdu3d77d+qVSuFhoYqKytLCQkJLj+l/Ss6X03vl6ROF33r5WJ/ur5piccb2bO5x/ZZ9yV7fO7OCTEubZ0TYrRsSGe3g7ZGWLAWDO7o1l491PMHn7PuS/Y49oLBHd0WyKAAh9rFRrr1nd7P86eEy4Z0casjqmqw2jdy/Z9X54QY3dfR8/bt0vTS/0u7oFa1QJ/aPYkO83xYNo2uWuIxvPGljjrhZfdBtS9zSY4t3f+QFDW4S+MS9+3dus6lO/1HaJDnbdg3uYHHfdrbsVhSQQGOUjn2fanD2/MtGNzRrc3TmiJJ0/q2dTv2vQkKkNrFRrm0+b4GBSjwok0TFODQtL5tPW4Xb+3LhnTWxTytNTXCgjXrvmSPYyQ1LPk65gtPdXhz4dt5JeXtfaA06vZFealDqqDfzmvbtq2mTJnibGvcuLGGDx+u4c
OHO9scDoeWLFmirKwsPfPMM1q4cKHi4+P17rvvatq0aYqPj1d6erok92/njR07VjNnztTkyZPVpUsX5ebmau3atYqIiNDAgQNLVGdZfDvvgsxj+fp6z3FlHD6p1TuOKve0+3WiXlu1S19mHHW5dounNunnk8jX7DpaoutEZR7L197j+W7X5fgy46g2ZmW7XXvEU7u3OryNvShtv9buPuZyfRVvfb3xVIe3MW6YvFqZx/PdrhP18Ltp2rDvhMv1dzq99JkO5p1xu05Uq7H/UIGH6zMV7f/+QynaezxfQ95NU4GH60S1e+6fOuHhOlG3TP1CO4+cUvPa4Zre/2rtPZ6vJ+Zv0on/XCdqwSOdnfPqPSX1knVcqLvbK59rf85pt+tEFZ3LGwPbO1/HUYs2u43R4YVPdeQ/14kqWsef3v/GWXPR6yUVneP/DemivcfzNfmT77T7WL7bdaKKbpc3B7YvdmxP4zaOrqb3/7XP474+ctFmrdtzzOUaSAPe/pfSv89xu05U0f2gf8dGztdDkts+VnRbFZ23p33aWx1F225rW9/5HIdyf/Q4hqf91Nsx7sua4Km24sbwxFsdRY9FSc5/p+09UeJj35c1yNvr72u7J97qKI11zBdF62hYI8z5PN9nF3iszxdlWXd5qKPSX+LAlxDVq1cvPfzww1qyZIkcDof69eunyMhIffzxx15DlDFG06ZN0+uvv649e/YoKipK7dq101NPPaVrr/1lISxOWYYoAABQNip1iKooCFEAAFQ8lf46UQAAAP5GiAIAALBAiAIAALBAiAIAALBAiAIAALBAiAIAALBAiAIAALBAiAIAALBAiAIAALBQdn9h9DfuwoXg8/Ly/FwJAAAoqQvv2yX5gy6EqDJy8uRJSVJsbPF/rBIAAJQ/J0+eVGRkZLF9+Nt5ZaSwsFAHDhxQ9erV5XA4Lnu8vLw8xcbGav/+/ZX2b/Exx4qvss9PYo6VQWWfn8QcL4cxRidPnlT9+vUVEFD8WU98ElVGAgIC1LBhw1IfNyIiotIeEBcwx4qvss9PYo6VQWWfn8QcbV3qE6gLOLEcAADAAiEKAADAAiGqgggNDdX48eMVGhrq71LKDHOs+Cr7/CTmWBlU9vlJzPHXwonlAAAAFvgkCgAAwAIhCgAAwAIhCgAAwAIhCgAAwAIhqoJ49dVX1bhxY1WpUkXXXHON1q9f7++SrH3xxRe69dZbVb9+fTkcDi1dutTlfmOMxo0bp3r16qlq1arq0aOHMjIy/FOshYkTJ6p9+/aqXr26ateurT59+mjHjh0ufX788UcNGTJE0dHRCg8P15133qnDhw/7qWLfvf7660pMTHRe5C4lJUUff/yx8/6KPr+Lvfzyy3I4HBo+fLizraLP8dlnn5XD4XD5admypfP+ij6/C3744Qfde++9io6OVtWqVXXVVVcpLS3NeX9FXm8aN27stg0dDoeGDBkiqXJsw/Pnz+uZZ55RfHy8qlatqqZNm+qFF15w+bt2ft2GBuXe/PnzTUhIiJk9e7bZtm2bGTx4sImKijKHDx/2d2lW/vGPf5inn37afPjhh0aSWbJkicv9L7/8somMjDRLly41mzdvNrfddpuJj483p0+f9k/BPrrxxhvNO++8Y7Zu3WrS09PNzTffbOLi4sypU6ecfR5++GETGxtrVq5cadLS0kzHjh1Np06d/Fi1b5YtW2aWL19udu7caXbs2GGeeuopExwcbLZu3WqMqfjzK2r9+vWmcePGJjEx0QwbNszZXtHnOH78eHPllVeagwcPOn+OHj3qvL+iz88YY06cOGEaNWpkBg0aZL7++muzZ88e889//tPs2rXL2acirzdHjhxx2X4rVqwwksyqVauMMZVjG06YMMFER0ebjz76yGRmZppFixaZ8PBwM3XqVGcff25DQlQF0KFDBzNkyBDn7fPnz5v69eubiRMn+rGq0nFxiCosLDR169Y1kyZNcrbl5OSY0NBQ88EHH/ihwst35M
gRI8mkpqYaY36eT3BwsFm0aJGzz7///W8jyaxbt85fZV62GjVqmLfffrtSze/kyZOmWbNmZsWKFaZbt27OEFUZ5jh+/HjTpk0bj/dVhvkZY8yoUaNMly5dvN5f2dabYcOGmaZNm5rCwsJKsw179+5t7r//fpe2O+64w/Tv398Y4/9tyK/zyrmzZ8/qm2++UY8ePZxtAQEB6tGjh9atW+fHyspGZmamDh065DLfyMhIXXPNNRV2vrm5uZKkmjVrSpK++eYb/fTTTy5zbNmypeLi4irkHM+fP6/58+crPz9fKSkplWp+Q4YMUe/evV3mIlWebZiRkaH69eurSZMm6t+/v7KysiRVnvktW7ZMycnJuvvuu1W7dm0lJSXprbfect5fmdabs2fP6r333tP9998vh8NRabZhp06dtHLlSu3cuVOStHnzZq1Zs0a9evWS5P9tyB8gLueOHTum8+fPq06dOi7tderU0XfffeenqsrOoUOHJMnjfC/cV5EUFhZq+PDh6ty5s1q3bi3p5zmGhIQoKirKpW9Fm+O3336rlJQU/fjjjwoPD9eSJUvUqlUrpaenV4r5zZ8/Xxs3btSGDRvc7qsM2/Caa67RnDlz1KJFCx08eFDPPfecunbtqq1bt1aK+UnSnj179Prrr+vxxx/XU089pQ0bNuixxx5TSEiIBg4cWKnWm6VLlyonJ0eDBg2SVDn2UUkaPXq08vLy1LJlSwUGBur8+fOaMGGC+vfvL8n/7xmEKKAMDRkyRFu3btWaNWv8XUqpa9GihdLT05Wbm6vFixdr4MCBSk1N9XdZpWL//v0aNmyYVqxYoSpVqvi7nDJx4X/ykpSYmKhrrrlGjRo10sKFC1W1alU/VlZ6CgsLlZycrJdeekmSlJSUpK1bt2rmzJkaOHCgn6srXbNmzVKvXr1Uv359f5dSqhYuXKj3339ff/3rX3XllVcqPT1dw4cPV/369cvFNuTXeeVcTEyMAgMD3b5RcfjwYdWtW9dPVZWdC3OqDPMdOnSoPvroI61atUoNGzZ0ttetW1dnz55VTk6OS/+KNseQkBAlJCTo6quv1sSJE9WmTRtNnTq1Uszvm2++0ZEjR9SuXTsFBQUpKChIqampmjZtmoKCglSnTp0KP8eLRUVFqXnz5tq1a1el2IaSVK9ePbVq1cql7YorrnD+2rKyrDf79u3TZ599pgcffNDZVlm24ciRIzV69Gj993//t6666ioNGDBAI0aM0MSJEyX5fxsSosq5kJAQXX311Vq5cqWzrbCwUCtXrlRKSoofKysb8fHxqlu3rst88/Ly9PXXX1eY+RpjNHToUC1ZskSff/654uPjXe6/+uqrFRwc7DLHHTt2KCsrq8LM0ZPCwkKdOXOmUsyve/fu+vbbb5Wenu78SU5OVv/+/Z3/ruhzvNipU6e0e/du1atXr1JsQ0nq3Lmz2+VFdu7cqUaNGkmqHOuNJL3zzjuqXbu2evfu7WyrLNuwoKBAAQGuUSUwMFCFhYWSysE2LPNT13HZ5s+fb0JDQ82cOXPM9u3bzUMPPWSioqLMoUOH/F2alZMnT5pNmzaZTZs2GUnmf//3f82mTZvMvn37jDE/f101KirK/O1vfzNbtmwx//Vf/1VhvnJsjDGPPPKIiYyMNKtXr3b5+nFBQYGzz8MPP2zi4uLM559/btLS0kxKSopJSUnxY9W+GT16tElNTTWZmZlmy5YtZvTo0cbhcJhPP/3UGFPx5+dJ0W/nGVPx5/jEE0+Y1atXm8zMTLN27VrTo0cPExMTY44cOWKMqfjzM+bny1MEBQWZCRMmmIyMDPP++++bsLAw89577zn7VPT15vz58yYuLs6MGjXK7b7KsA0HDhxoGjRo4LzEwYcffmhiYmLMk08+6ezjz21IiKogpk+fbuLi4kxISIjp0KGD+de//uXvkqytWrXKSHL7GThwoDHm56+sPvPMM6ZOnTomNDTUdO/e3ezYscO/RfvA09wkmXfeecfZ5/
Tp0+bRRx81NWrUMGFhYeb22283Bw8e9F/RPrr//vtNo0aNTEhIiKlVq5bp3r27M0AZU/Hn58nFIaqiz7Fv376mXr16JiQkxDRo0MD07dvX5fpJFX1+F/z97383rVu3NqGhoaZly5bmzTffdLm/oq83//znP40kjzVXhm2Yl5dnhg0bZuLi4kyVKlVMkyZNzNNPP23OnDnj7OPPbegwpshlPwEAAFAinBMFAABggRAFAABggRAFAABggRAFAABggRAFAABggRAFAABggRAFAABggRAFAABggRAFAABggRAFAEWsW7dOgYGBLn/MFQA84c++AEARDz74oMLDwzVr1izt2LFD9evX93dJAMopPokCgP84deqUFixYoEceeUS9e/fWnDlzXO5ftmyZmjVrpipVquj666/X3Llz5XA4lJOT4+yzZs0ade3aVVWrVlVsbKwee+wx5efn/7oTAfCrIEQBwH8sXLhQLVu2VIsWLXTvvfdq9uzZuvBhfWZmpu666y716dNHmzdv1h//+Ec9/fTTLo/fvXu3brrpJt15553asmWLFixYoDVr1mjo0KH+mA6AMsav8wDgPzp37qx77rlHw4YN07lz51SvXj0tWrRI1113nUaPHq3ly5fr22+/dfYfO3asJkyYoOzsbEVFRenBBx9UYGCg3njjDWefNWvWqFu3bsrPz1eVKlX8MS0AZYRPogBA0o4dO7R+/Xr169dPkhQUFKS+fftq1qxZzvvbt2/v8pgOHTq43N68ebPmzJmj8PBw58+NN96owsJCZWZm/joTAfCrCfJ3AQBQHsyaNUvnzp1zOZHcGKPQ0FDNmDGjRGOcOnVKf/zjH/XYY4+53RcXF1dqtQIoHwhRAH7zzp07p3nz5mny5Mnq2bOny319+vTRBx98oBYtWugf//iHy30bNmxwud2uXTtt375dCQkJZV4zAP/jnCgAv3lLly5V3759deTIEUVGRrrcN2rUKH3++edauHChWrRooREjRuiBBx5Qenq6nnjiCX3//ffKyclRZGSktmzZoo4dO+r+++/Xgw8+qGrVqmn79u1asWJFiT/NAlBxcE4UgN+8WbNmqUePHm4BSpLuvPNOpaWl6eTJk1q8eLE+/PBDJSYm6vXXX3d+Oy80NFSSlJiYqNTUVO3cuVNdu3ZVUlKSxo0bx7WmgEqKT6IAwNKECRM0c+ZM7d+/39+lAPADzokCgBJ67bXX1L59e0VHR2vt2rWaNGkS14ACfsMIUQBQQhkZGXrxxRd14sQJxcXF6YknntCYMWP8XRYAP+HXeQAAABY4sRwAAMACIQoAAMACIQoAAMACIQoAAMACIQoAAMACIQoAAMACIQoAAMACIQoAAMDC/wfGGnH4UTTIJwAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.plot.scatter(x=\"Age\", y=\"Sex\")\n", + "\n", + "df.plot.scatter(x=\"Pclass\", y=\"Age\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Визуализация - Столбчатая диаграмма" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot = data.groupby([\"Pclass\", \"Survived\"]).size().unstack().plot.bar(color=[\"pink\", \"green\"])\n", + "plot.legend([\"Not survived\", \"Survived\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Визуализация - Временные ряды" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 243 entries, 0 to 242\n", + "Data columns (total 6 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 my_date 243 non-null object \n", + " 1 my_value 243 non-null float64 \n", + " 2 bullet 2 non-null object \n", + " 3 bulletClass 2 non-null object \n", + " 4 label 2 non-null object \n", + " 5 date 243 non-null datetime64[ns]\n", + "dtypes: datetime64[ns](1), float64(1), object(4)\n", + "memory usage: 11.5+ KB\n", + " my_date my_value bullet bulletClass label date\n", + "0 28.03.2023 76.5662 NaN NaN NaN 2023-03-28\n", + "1 31.03.2023 77.0863 NaN NaN NaN 2023-03-31\n", + "2 01.04.2023 77.3233 NaN NaN NaN 2023-04-01\n", + "3 04.04.2023 77.9510 NaN NaN NaN 2023-04-04\n", + "4 05.04.2023 79.3563 NaN NaN NaN 2023-04-05\n", + ".. ... ... ... ... ... ...\n", + "238 20.03.2024 92.2243 NaN NaN NaN 2024-03-20\n", + "239 21.03.2024 92.6861 NaN NaN NaN 2024-03-21\n", + "240 22.03.2024 91.9499 NaN NaN NaN 2024-03-22\n", + "241 23.03.2024 92.6118 NaN NaN NaN 2024-03-23\n", + "242 26.03.2024 92.7761 NaN NaN NaN 2024-03-26\n", + "\n", + "[243 rows x 6 columns]\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from datetime import datetime\n", + "import matplotlib.dates as md\n", + "\n", + "ts = pd.read_csv(\"data/dollar.csv\")\n", + "ts[\"date\"] = ts.apply(lambda row: datetime.strptime(row[\"my_date\"], \"%d.%m.%Y\"), axis=1)\n", + "ts.info()\n", + "\n", + "print(ts)\n", + "\n", + "plot = ts.plot.line(x=\"date\", y=\"my_value\")\n", + "plot.xaxis.set_major_locator(md.DayLocator(interval=10))\n", + "plot.xaxis.set_major_formatter(md.DateFormatter(\"%d.%m.%Y\"))\n", + "plot.tick_params(axis=\"x\", labelrotation=90)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/lec2.ipynb b/lec2.ipynb new file mode 100644 index 0000000..e201765 --- /dev/null +++ b/lec2.ipynb @@ -0,0 +1,1823 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Лабораторная 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "ДАТАСЕТ СПИСОК ФОРБС\n", + "Объектами наблюдения в данном наборе данных являются миллиардеры, чье состояние оценивается и документируется в ежегодном рейтинге Forbes. 
Каждая запись в наборе данных представляет собой отдельного миллиардера с его оцененным состоянием.\n", + "Атрибуты объектов\n", + "\n", + "Атрибутами объектов (миллиардеров) являются:\n", + "Имя: имя миллиардера.\n", + "Страна: страна, в которой проживает миллиардер.\n", + "Состояние: оцененное состояние миллиардера в миллиардах долларов США.\n", + "Источник богатства: источник, из которого миллиардер получил свое состояние (например, технологии, финансы, недвижимость и т.д.).\n", + "Возраст: возраст миллиардера на момент публикации списка.\n", + "Ранг: позиция миллиардера в рейтинге по сравнению с другими миллиардерами.\n", + "\n", + "Связи между объектами могут быть определены через общие источники богатства или страны проживания. Например, миллиардеры из одной страны могут иметь схожие источники дохода, а также могут быть связаны через бизнес-партнерства или семейные связи.\n", + "\n", + "Примеры бизнес-целей\n", + "Привлечение инвестиций: Компании могут использовать данные о миллиардерах для целенаправленного маркетинга и привлечения инвестиций от состоятельных индивидуумов.\n", + "Анализ рынка: Понимание источников богатства и распределения состояния может помочь в анализе рыночных трендов и потребительских предпочтений.\n", + "\n", + "Эффект для бизнеса\n", + "Эти бизнес-цели могут привести к увеличению инвестиций, улучшению репутации компании, расширению клиентской базы и повышению финансовой устойчивости организаций, работающих в различных секторах.\n", + "\n", + "Примеры целей технического проекта\n", + "Для привлечения инвестиций: Разработка платформы для анализа данных о миллиардерах, которая поможет компаниям находить потенциальных инвесторов на основе их интересов и источников богатства.\n", + "Для анализа рынка: Создание аналитической панели, которая визуализирует данные о миллиардерах и их источниках богатства, позволяя компаниям лучше понимать рыночные тренды.\n", + "\n", + "Входные данные: Данные о миллиардерах, включая имя, страну, состояние, источник 
богатства, возраст и ранг.\n", + "\n", + "Целевой признак: Целевым признаком может быть состояние миллиардера, что позволит строить модели для прогнозирования изменений в состоянии или ранге в будущем." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 2600 entries, 1 to 2578\n", + "Data columns (total 6 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Name 2600 non-null object \n", + " 1 Networth 2600 non-null float64\n", + " 2 Age 2600 non-null int64 \n", + " 3 Country 2600 non-null object \n", + " 4 Source 2600 non-null object \n", + " 5 Industry 2600 non-null object \n", + "dtypes: float64(1), int64(1), object(4)\n", + "memory usage: 142.2+ KB\n" + ] + }, + { + "data": { + "text/plain": [ + "(2600, 6)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameNetworthAgeCountrySourceIndustry
RankID
1Elon Musk219.050United StatesTesla, SpaceXAutomotive
2Jeff Bezos171.058United StatesAmazonTechnology
3Bernard Arnault & family158.073FranceLVMHFashion & Retail
4Bill Gates129.066United StatesMicrosoftTechnology
5Warren Buffett118.091United StatesBerkshire HathawayFinance & Investments
\n", + "
" + ], + "text/plain": [ + " Name Networth Age Country \\\n", + "RankID \n", + "1 Elon Musk 219.0 50 United States \n", + "2 Jeff Bezos 171.0 58 United States \n", + "3 Bernard Arnault & family 158.0 73 France \n", + "4 Bill Gates 129.0 66 United States \n", + "5 Warren Buffett 118.0 91 United States \n", + "\n", + " Source Industry \n", + "RankID \n", + "1 Tesla, SpaceX Automotive \n", + "2 Amazon Technology \n", + "3 LVMH Fashion & Retail \n", + "4 Microsoft Technology \n", + "5 Berkshire Hathaway Finance & Investments " + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\"data/forbes.csv\", index_col=\"RankID\")\n", + "\n", + "df.info()\n", + "\n", + "display(df.shape)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Получение сведений о пропущенных данных" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Типы пропущенных данных:\n", + "- None - представление пустых данных в Python\n", + "- NaN - представление пустых данных в Pandas\n", + "- '' - пустая строка" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Name 0\n", + "Networth 0\n", + "Age 0\n", + "Country 0\n", + "Source 0\n", + "Industry 0\n", + "dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Name False\n", + "Networth False\n", + "Age False\n", + "Country False\n", + "Source False\n", + "Industry False\n", + "dtype: bool" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Количество пустых значений признаков\n", + "display(df.isnull().sum())\n", + "display()\n", + "\n", + "# Есть ли пустые значения признаков\n", + "display(df.isnull().any())\n", + "display()\n", + "\n", + "# Процент пустых значений признаков\n", + "for i in 
df.columns:\n", + " null_rate = df[i].isnull().sum() / len(df) * 100\n", + " if null_rate > 0:\n", + " display(f\"{i} процент пустых значений: %{null_rate:.2f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Для данного датасета количество пустых значений для каждого из признаков = 0, т.е. не пропущено ни одного значения -> заполнение и корректировка не нужны." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2600, 7)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Name False\n", + "Networth False\n", + "Age False\n", + "Country False\n", + "Source False\n", + "Industry False\n", + "AgeFillMedian False\n", + "dtype: bool" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameNetworthAgeCountrySourceIndustryAgeFillMedianAgeFillNA
RankID
2578Jorge Gallardo Ballart1.080SpainpharmaceuticalsHealthcare8080
2578Nari Genomal1.082PhilippinesapparelFashion & Retail8282
2578Ramesh Genomal1.071PhilippinesapparelFashion & Retail7171
2578Sunder Genomal1.068PhilippinesgarmentsFashion & Retail6868
2578Horst-Otto Gerberding1.069Germanyflavors and fragrancesFood & Beverage6969
\n", + "
" + ], + "text/plain": [ + " Name Networth Age Country \\\n", + "RankID \n", + "2578 Jorge Gallardo Ballart 1.0 80 Spain \n", + "2578 Nari Genomal 1.0 82 Philippines \n", + "2578 Ramesh Genomal 1.0 71 Philippines \n", + "2578 Sunder Genomal 1.0 68 Philippines \n", + "2578 Horst-Otto Gerberding 1.0 69 Germany \n", + "\n", + " Source Industry AgeFillMedian AgeFillNA \n", + "RankID \n", + "2578 pharmaceuticals Healthcare 80 80 \n", + "2578 apparel Fashion & Retail 82 82 \n", + "2578 apparel Fashion & Retail 71 71 \n", + "2578 garments Fashion & Retail 68 68 \n", + "2578 flavors and fragrances Food & Beverage 69 69 " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fillna_df = df.fillna(0)\n", + "\n", + "display(fillna_df.shape)\n", + "\n", + "display(fillna_df.isnull().any())\n", + "\n", + "# Замена пустых данных на 0\n", + "df[\"AgeFillNA\"] = df[\"Age\"].fillna(0) \n", + "\n", + "# Замена пустых данных на медиану\n", + "df[\"AgeFillMedian\"] = df[\"Age\"].fillna(df[\"Age\"].median())\n", + "\n", + "df.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameNetworthAgeCountrySourceIndustryAgeFillMedianAgeFillNAAgeCopy
RankID
2578Jorge Gallardo Ballart1.080SpainpharmaceuticalsHealthcare808080
2578Nari Genomal1.082PhilippinesapparelFashion & Retail828282
2578Ramesh Genomal1.071PhilippinesapparelFashion & Retail717171
2578Sunder Genomal1.068PhilippinesgarmentsFashion & Retail686868
2578Horst-Otto Gerberding1.069Germanyflavors and fragrancesFood & Beverage696969
\n", + "
" + ], + "text/plain": [ + " Name Networth Age Country \\\n", + "RankID \n", + "2578 Jorge Gallardo Ballart 1.0 80 Spain \n", + "2578 Nari Genomal 1.0 82 Philippines \n", + "2578 Ramesh Genomal 1.0 71 Philippines \n", + "2578 Sunder Genomal 1.0 68 Philippines \n", + "2578 Horst-Otto Gerberding 1.0 69 Germany \n", + "\n", + " Source Industry AgeFillMedian AgeFillNA \\\n", + "RankID \n", + "2578 pharmaceuticals Healthcare 80 80 \n", + "2578 apparel Fashion & Retail 82 82 \n", + "2578 apparel Fashion & Retail 71 71 \n", + "2578 garments Fashion & Retail 68 68 \n", + "2578 flavors and fragrances Food & Beverage 69 69 \n", + "\n", + " AgeCopy \n", + "RankID \n", + "2578 80 \n", + "2578 82 \n", + "2578 71 \n", + "2578 68 \n", + "2578 69 " + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"AgeCopy\"] = df[\"Age\"]\n", + "\n", + "# Замена данных сразу в DataFrame без копирования\n", + "df.fillna({\"AgeCopy\": 0}, inplace=True)\n", + "\n", + "df.tail()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Удаление наблюдений с пропусками" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2600, 9)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Name False\n", + "Networth False\n", + "Age False\n", + "Country False\n", + "Source False\n", + "Industry False\n", + "AgeFillMedian False\n", + "dtype: bool" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "dropna_df = df.dropna()\n", + "\n", + "display(dropna_df.shape)\n", + "\n", + "display(fillna_df.isnull().any())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Создание выборок данных\n", + "\n", + "Библиотека scikit-learn\n", + "\n", + "https://scikit-learn.org/stable/index.html" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Industry\n", + "Finance & Investments 386\n", + "Technology 329\n", + "Manufacturing 322\n", + "Fashion & Retail 246\n", + "Healthcare 212\n", + "Food & Beverage 201\n", + "Real Estate 189\n", + "diversified 178\n", + "Media & Entertainment 95\n", + "Energy 93\n", + "Automotive 69\n", + "Metals & Mining 67\n", + "Service 51\n", + "Construction & Engineering 43\n", + "Logistics 35\n", + "Telecom 35\n", + "Sports 26\n", + "Gambling & Casinos 23\n", + "Name: count, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'Обучающая выборка: '" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "(1560, 3)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Industry\n", + "Finance & Investments 231\n", + "Technology 197\n", + "Manufacturing 193\n", + "Fashion & Retail 148\n", + "Healthcare 127\n", + "Food & Beverage 121\n", + "Real Estate 113\n", + "diversified 107\n", + "Media & Entertainment 57\n", + "Energy 56\n", + "Automotive 41\n", + "Metals & Mining 40\n", + "Service 31\n", + "Construction & Engineering 26\n", + "Logistics 21\n", + "Telecom 21\n", + "Sports 16\n", + "Gambling & Casinos 14\n", + "Name: count, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'Контрольная выборка: '" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "(520, 3)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Industry\n", + "Finance & Investments 77\n", + "Technology 66\n", + "Manufacturing 64\n", + "Fashion & Retail 49\n", + "Healthcare 43\n", + "Food & Beverage 40\n", + "Real Estate 38\n", + "diversified 35\n", + "Media & Entertainment 19\n", + "Energy 18\n", + "Automotive 14\n", + "Metals & 
Mining 14\n", + "Service 10\n", + "Construction & Engineering 9\n", + "Telecom 7\n", + "Logistics 7\n", + "Sports 5\n", + "Gambling & Casinos 5\n", + "Name: count, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'Тестовая выборка: '" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "(520, 3)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Industry\n", + "Finance & Investments 78\n", + "Technology 66\n", + "Manufacturing 65\n", + "Fashion & Retail 49\n", + "Healthcare 42\n", + "Food & Beverage 40\n", + "Real Estate 38\n", + "diversified 36\n", + "Media & Entertainment 19\n", + "Energy 19\n", + "Automotive 14\n", + "Metals & Mining 13\n", + "Service 10\n", + "Construction & Engineering 8\n", + "Logistics 7\n", + "Telecom 7\n", + "Sports 5\n", + "Gambling & Casinos 4\n", + "Name: count, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Вывод распределения количества наблюдений по Индустрии\n", + "from src.utils import split_stratified_into_train_val_test\n", + "\n", + "\n", + "display(df.Industry.value_counts())\n", + "display()\n", + "\n", + "data = df[[\"Networth\", \"Age\", \"Industry\"]].copy()\n", + "\n", + "df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n", + " data, stratify_colname=\"Industry\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n", + ")\n", + "\n", + "display(\"Обучающая выборка: \", df_train.shape)\n", + "display(df_train.Industry.value_counts())\n", + "\n", + "display(\"Контрольная выборка: \", df_val.shape)\n", + "display(df_val.Industry.value_counts())\n", + "\n", + "display(\"Тестовая выборка: \", df_test.shape)\n", + "display(df_test.Industry.value_counts())" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "'Обучающая выборка: '" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "(1560, 3)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Industry\n", + "Finance & Investments 231\n", + "Technology 197\n", + "Manufacturing 193\n", + "Fashion & Retail 148\n", + "Healthcare 127\n", + "Food & Beverage 121\n", + "Real Estate 113\n", + "diversified 107\n", + "Media & Entertainment 57\n", + "Energy 56\n", + "Automotive 41\n", + "Metals & Mining 40\n", + "Service 31\n", + "Construction & Engineering 26\n", + "Logistics 21\n", + "Telecom 21\n", + "Sports 16\n", + "Gambling & Casinos 14\n", + "Name: count, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "ename": "ValueError", + "evalue": "could not convert string to float: 'Technology '", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_1348\\420769102.py\u001b[0m in \u001b[0;36m?\u001b[1;34m()\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mdisplay\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Обучающая выборка: \"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[0mdisplay\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mIndustry\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalue_counts\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 8\u001b[1;33m 
\u001b[0mX_resampled\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_resampled\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mada\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit_resample\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"Industry\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# type: ignore\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 9\u001b[0m \u001b[0mdf_train_adasyn\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_resampled\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 10\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[0mdisplay\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Обучающая выборка после oversampling: \"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdf_train_adasyn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\Users\\ateks\\Courses\\Courses\\.venv\\Lib\\site-packages\\imblearn\\base.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 204\u001b[0m \u001b[0my_resampled\u001b[0m \u001b[1;33m:\u001b[0m \u001b[0marray\u001b[0m\u001b[1;33m-\u001b[0m\u001b[0mlike\u001b[0m \u001b[0mof\u001b[0m \u001b[0mshape\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mn_samples_new\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 205\u001b[0m \u001b[0mThe\u001b[0m \u001b[0mcorresponding\u001b[0m \u001b[0mlabel\u001b[0m \u001b[0mof\u001b[0m \u001b[1;33m`\u001b[0m\u001b[0mX_resampled\u001b[0m\u001b[1;33m`\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 206\u001b[0m \"\"\"\n\u001b[0;32m 207\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_params\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 208\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit_resample\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32mc:\\Users\\ateks\\Courses\\Courses\\.venv\\Lib\\site-packages\\imblearn\\base.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 102\u001b[0m \u001b[0mThe\u001b[0m \u001b[0mcorresponding\u001b[0m \u001b[0mlabel\u001b[0m \u001b[0mof\u001b[0m \u001b[1;33m`\u001b[0m\u001b[0mX_resampled\u001b[0m\u001b[1;33m`\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 103\u001b[0m \"\"\"\n\u001b[0;32m 104\u001b[0m \u001b[0mcheck_classification_targets\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 105\u001b[0m \u001b[0marrays_transformer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mArraysTransformer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 106\u001b[1;33m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbinarize_y\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_check_X_y\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 107\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 108\u001b[0m self.sampling_strategy_ = 
check_sampling_strategy(\n\u001b[0;32m 109\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msampling_strategy\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sampling_type\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\Users\\ateks\\Courses\\Courses\\.venv\\Lib\\site-packages\\imblearn\\base.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, X, y, accept_sparse)\u001b[0m\n\u001b[0;32m 157\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_check_X_y\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 158\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0maccept_sparse\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 159\u001b[0m \u001b[0maccept_sparse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;34m\"csr\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"csc\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 160\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbinarize_y\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcheck_target_type\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindicate_one_vs_all\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 161\u001b[1;33m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_data\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreset\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maccept_sparse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 162\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbinarize_y\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\Users\\ateks\\Courses\\Courses\\.venv\\Lib\\site-packages\\sklearn\\base.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)\u001b[0m\n\u001b[0;32m 646\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;34m\"estimator\"\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mcheck_y_params\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 647\u001b[0m \u001b[0mcheck_y_params\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mdefault_check_params\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 648\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minput_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"y\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 649\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 650\u001b[1;33m 
\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mcheck_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 651\u001b[0m \u001b[0mout\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 652\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 653\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mno_val_X\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mcheck_params\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"ensure_2d\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\Users\\ateks\\Courses\\Courses\\.venv\\Lib\\site-packages\\sklearn\\utils\\validation.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[0;32m 1297\u001b[0m raise ValueError(\n\u001b[0;32m 1298\u001b[0m \u001b[1;33mf\"\u001b[0m\u001b[1;33m{\u001b[0m\u001b[0mestimator_name\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m requires y to be passed, but the target y is None\u001b[0m\u001b[1;33m\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1299\u001b[0m \u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1300\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1301\u001b[1;33m X = check_array(\n\u001b[0m\u001b[0;32m 1302\u001b[0m 
\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1303\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maccept_sparse\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1304\u001b[0m \u001b[0maccept_large_sparse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maccept_large_sparse\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\Users\\ateks\\Courses\\Courses\\.venv\\Lib\\site-packages\\sklearn\\utils\\validation.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[0;32m 1009\u001b[0m \u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1010\u001b[0m \u001b[0marray\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mxp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1011\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1012\u001b[0m \u001b[0marray\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_asarray_with_order\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mxp\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mxp\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1013\u001b[1;33m 
\u001b[1;32mexcept\u001b[0m \u001b[0mComplexWarning\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mcomplex_warning\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1014\u001b[0m raise ValueError(\n\u001b[0;32m 1015\u001b[0m \u001b[1;34m\"Complex data not supported\\n{}\\n\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1016\u001b[0m \u001b[1;33m)\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mcomplex_warning\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\Users\\ateks\\Courses\\Courses\\.venv\\Lib\\site-packages\\sklearn\\utils\\_array_api.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(array, dtype, order, copy, xp, device)\u001b[0m\n\u001b[0;32m 741\u001b[0m \u001b[1;31m# Use NumPy API to support order\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 742\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mcopy\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 743\u001b[0m \u001b[0marray\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 744\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 745\u001b[1;33m \u001b[0marray\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 746\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 747\u001b[0m \u001b[1;31m# At this point array is a NumPy ndarray. We convert it to an array\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 748\u001b[0m \u001b[1;31m# container that is consistent with the input's namespace.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\Users\\ateks\\Courses\\Courses\\.venv\\Lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, dtype, copy)\u001b[0m\n\u001b[0;32m 2149\u001b[0m def __array__(\n\u001b[0;32m 2150\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mnpt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mDTypeLike\u001b[0m \u001b[1;33m|\u001b[0m \u001b[1;32mNone\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mbool_t\u001b[0m \u001b[1;33m|\u001b[0m \u001b[1;32mNone\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2151\u001b[0m \u001b[1;33m)\u001b[0m \u001b[1;33m->\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mndarray\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2152\u001b[0m \u001b[0mvalues\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_values\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2153\u001b[1;33m \u001b[0marr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2154\u001b[0m if (\n\u001b[0;32m 2155\u001b[0m \u001b[0mastype_is_view\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0marr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2156\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0musing_copy_on_write\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mValueError\u001b[0m: could not convert string to float: 'Technology '" + ] + } + ], + "source": [ + "from imblearn.over_sampling import ADASYN\n", + "\n", + "ada = ADASYN()\n", + "\n", + "display(\"Обучающая выборка: \", df_train.shape)\n", + "display(df_train.Industry.value_counts())\n", + "\n", + "X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"Industry\"]) # type: ignore\n", + "df_train_adasyn = pd.DataFrame(X_resampled)\n", + "\n", + "display(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n", + "display(df_train_adasyn.Industry.value_counts())\n", + "\n", + "df_train_adasyn" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "________________________________________________________________________________________________________________________________________" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "ДАТАСЕТ ЦЕНЫ НА ЗОЛОТО\n", + "\n", + "Объектами наблюдения в данном наборе данных являются цены на золото, представленные через Gold ETF (Exchange-Traded Fund). 
Каждая запись в наборе данных соответствует отдельному дню торговли золотыми активами.\n", + "Атрибуты объектов\n", + "\n", + "Атрибутами объектов (цен на золото) являются:\n", + "Дата: дата, когда происходила торговля.\n", + "Цена открытия (Open): цена, по которой золото открывалось в начале торгового дня.\n", + "Максимальная цена (High): наивысшая цена золота в течение дня.\n", + "Минимальная цена (Low): наименьшая цена золота в течение дня.\n", + "Цена закрытия (Close): цена, по которой золото закрылось в конце торгового дня.\n", + "Скорректированная цена закрытия (Adjusted Close): цена закрытия, скорректированная с учетом факторов, таких как дивиденды и сплиты акций.\n", + "Объем (Volume): количество золота, которое было куплено и продано в течение дня.\n", + "\n", + "Связи между объектами могут быть определены через временные последовательности. Например, изменение цен на золото в один день может зависеть от цен в предыдущие дни, а также от внешних факторов, таких как цены на другие драгоценные металлы, цены на нефть, экономические условия и рыночные тренды.\n", + "\n", + "Примеры бизнес-целей\n", + "Оптимизация инвестиционных решений: Анализ исторических данных о ценах на золото может помочь инвесторам принимать более обоснованные решения о покупке или продаже золота.\n", + "Управление рисками: Понимание факторов, влияющих на цены на золото, может помочь компаниям и инвесторам минимизировать риски, связанные с колебаниями цен.\n", + "\n", + "Эффект для бизнеса\n", + "Эти бизнес-цели могут привести к увеличению доходов, привлечению новых инвесторов и повышению общей финансовой устойчивости компаний, работающих с золотом.\n", + "\n", + "Примеры целей технического проекта\n", + "Для управления рисками: Создание системы мониторинга, которая будет отслеживать изменения цен на золото и другие факторы, влияющие на рынок, и предоставлять рекомендации по управлению рисками.\n", + "\n", + "Входные данные: Данные о ценах на золото, включая дату, цену открытия, 
максимальную и минимальную цены, цену закрытия, скорректированную цену закрытия и объем торгов.\n", + "\n", + "Целевой признак: Целевым признаком может быть скорректированная цена закрытия золота на следующий день, что позволит строить модели для прогнозирования будущих цен." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 1718 entries, 2011-12-15 to 2018-12-31\n", + "Data columns (total 80 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Open 1718 non-null float64\n", + " 1 High 1718 non-null float64\n", + " 2 Low 1718 non-null float64\n", + " 3 Close 1718 non-null float64\n", + " 4 Adj Close 1718 non-null float64\n", + " 5 Volume 1718 non-null int64 \n", + " 6 SP_open 1718 non-null float64\n", + " 7 SP_high 1718 non-null float64\n", + " 8 SP_low 1718 non-null float64\n", + " 9 SP_close 1718 non-null float64\n", + " 10 SP_Ajclose 1718 non-null float64\n", + " 11 SP_volume 1718 non-null int64 \n", + " 12 DJ_open 1718 non-null float64\n", + " 13 DJ_high 1718 non-null float64\n", + " 14 DJ_low 1718 non-null float64\n", + " 15 DJ_close 1718 non-null float64\n", + " 16 DJ_Ajclose 1718 non-null float64\n", + " 17 DJ_volume 1718 non-null int64 \n", + " 18 EG_open 1718 non-null float64\n", + " 19 EG_high 1718 non-null float64\n", + " 20 EG_low 1718 non-null float64\n", + " 21 EG_close 1718 non-null float64\n", + " 22 EG_Ajclose 1718 non-null float64\n", + " 23 EG_volume 1718 non-null int64 \n", + " 24 EU_Price 1718 non-null float64\n", + " 25 EU_open 1718 non-null float64\n", + " 26 EU_high 1718 non-null float64\n", + " 27 EU_low 1718 non-null float64\n", + " 28 EU_Trend 1718 non-null int64 \n", + " 29 OF_Price 1718 non-null float64\n", + " 30 OF_Open 1718 non-null float64\n", + " 31 OF_High 1718 non-null float64\n", + " 32 OF_Low 1718 non-null float64\n", + " 33 OF_Volume 1718 non-null int64 \n", 
+ " 34 OF_Trend 1718 non-null int64 \n", + " 35 OS_Price 1718 non-null float64\n", + " 36 OS_Open 1718 non-null float64\n", + " 37 OS_High 1718 non-null float64\n", + " 38 OS_Low 1718 non-null float64\n", + " 39 OS_Trend 1718 non-null int64 \n", + " 40 SF_Price 1718 non-null int64 \n", + " 41 SF_Open 1718 non-null int64 \n", + " 42 SF_High 1718 non-null int64 \n", + " 43 SF_Low 1718 non-null int64 \n", + " 44 SF_Volume 1718 non-null int64 \n", + " 45 SF_Trend 1718 non-null int64 \n", + " 46 USB_Price 1718 non-null float64\n", + " 47 USB_Open 1718 non-null float64\n", + " 48 USB_High 1718 non-null float64\n", + " 49 USB_Low 1718 non-null float64\n", + " 50 USB_Trend 1718 non-null int64 \n", + " 51 PLT_Price 1718 non-null float64\n", + " 52 PLT_Open 1718 non-null float64\n", + " 53 PLT_High 1718 non-null float64\n", + " 54 PLT_Low 1718 non-null float64\n", + " 55 PLT_Trend 1718 non-null int64 \n", + " 56 PLD_Price 1718 non-null float64\n", + " 57 PLD_Open 1718 non-null float64\n", + " 58 PLD_High 1718 non-null float64\n", + " 59 PLD_Low 1718 non-null float64\n", + " 60 PLD_Trend 1718 non-null int64 \n", + " 61 RHO_PRICE 1718 non-null int64 \n", + " 62 USDI_Price 1718 non-null float64\n", + " 63 USDI_Open 1718 non-null float64\n", + " 64 USDI_High 1718 non-null float64\n", + " 65 USDI_Low 1718 non-null float64\n", + " 66 USDI_Volume 1718 non-null int64 \n", + " 67 USDI_Trend 1718 non-null int64 \n", + " 68 GDX_Open 1718 non-null float64\n", + " 69 GDX_High 1718 non-null float64\n", + " 70 GDX_Low 1718 non-null float64\n", + " 71 GDX_Close 1718 non-null float64\n", + " 72 GDX_Adj Close 1718 non-null float64\n", + " 73 GDX_Volume 1718 non-null int64 \n", + " 74 USO_Open 1718 non-null float64\n", + " 75 USO_High 1718 non-null float64\n", + " 76 USO_Low 1718 non-null float64\n", + " 77 USO_Close 1718 non-null float64\n", + " 78 USO_Adj Close 1718 non-null float64\n", + " 79 USO_Volume 1718 non-null int64 \n", + "dtypes: float64(58), int64(22)\n", + "memory usage: 1.1+ 
MB\n" + ] + }, + { + "data": { + "text/plain": [ + "(1718, 80)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
OpenHighLowCloseAdj CloseVolumeSP_openSP_highSP_lowSP_close...GDX_LowGDX_CloseGDX_Adj CloseGDX_VolumeUSO_OpenUSO_HighUSO_LowUSO_CloseUSO_Adj CloseUSO_Volume
Date
2011-12-15154.740005154.949997151.710007152.330002152.33000221521900123.029999123.199997121.989998122.180000...51.57000051.68000048.9738772060560036.90000236.93999936.04999936.13000136.13000112616700
2011-12-16154.309998155.369995153.899994155.229996155.22999618124300122.230003122.949997121.300003121.589996...52.04000152.68000049.9215131628540036.18000036.50000035.73000036.27000036.27000012578800
2011-12-19155.479996155.860001154.360001154.869995154.86999512547200122.059998122.320000120.029999120.290001...51.02999951.16999848.4905781512020036.38999936.45000135.93000036.20000136.2000017418200
2011-12-20156.820007157.429993156.580002156.979996156.9799969136300122.180000124.139999120.370003123.930000...52.36999952.99000250.2152821164490037.29999937.61000137.22000137.56000137.56000110041600
2011-12-21156.979996157.529999156.130005157.160004157.16000411996100123.930000124.360001122.750000124.169998...52.41999852.95999950.186852872430037.66999838.24000237.52000038.11000138.11000110728000
\n", + "

5 rows × 80 columns

\n", + "
" + ], + "text/plain": [ + " Open High Low Close Adj Close \\\n", + "Date \n", + "2011-12-15 154.740005 154.949997 151.710007 152.330002 152.330002 \n", + "2011-12-16 154.309998 155.369995 153.899994 155.229996 155.229996 \n", + "2011-12-19 155.479996 155.860001 154.360001 154.869995 154.869995 \n", + "2011-12-20 156.820007 157.429993 156.580002 156.979996 156.979996 \n", + "2011-12-21 156.979996 157.529999 156.130005 157.160004 157.160004 \n", + "\n", + " Volume SP_open SP_high SP_low SP_close ... \\\n", + "Date ... \n", + "2011-12-15 21521900 123.029999 123.199997 121.989998 122.180000 ... \n", + "2011-12-16 18124300 122.230003 122.949997 121.300003 121.589996 ... \n", + "2011-12-19 12547200 122.059998 122.320000 120.029999 120.290001 ... \n", + "2011-12-20 9136300 122.180000 124.139999 120.370003 123.930000 ... \n", + "2011-12-21 11996100 123.930000 124.360001 122.750000 124.169998 ... \n", + "\n", + " GDX_Low GDX_Close GDX_Adj Close GDX_Volume USO_Open \\\n", + "Date \n", + "2011-12-15 51.570000 51.680000 48.973877 20605600 36.900002 \n", + "2011-12-16 52.040001 52.680000 49.921513 16285400 36.180000 \n", + "2011-12-19 51.029999 51.169998 48.490578 15120200 36.389999 \n", + "2011-12-20 52.369999 52.990002 50.215282 11644900 37.299999 \n", + "2011-12-21 52.419998 52.959999 50.186852 8724300 37.669998 \n", + "\n", + " USO_High USO_Low USO_Close USO_Adj Close USO_Volume \n", + "Date \n", + "2011-12-15 36.939999 36.049999 36.130001 36.130001 12616700 \n", + "2011-12-16 36.500000 35.730000 36.270000 36.270000 12578800 \n", + "2011-12-19 36.450001 35.930000 36.200001 36.200001 7418200 \n", + "2011-12-20 37.610001 37.220001 37.560001 37.560001 10041600 \n", + "2011-12-21 38.240002 37.520000 38.110001 38.110001 10728000 \n", + "\n", + "[5 rows x 80 columns]" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfGold = pd.read_csv(\"data/gold.csv\", index_col=\"Date\")\n", + "\n", + "dfGold.info()\n", + "\n", +
"display(dfGold.shape)\n", + "\n", + "dfGold.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Пустые значения" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Open 0\n", + "High 0\n", + "Low 0\n", + "Close 0\n", + "Adj Close 0\n", + " ..\n", + "USO_High 0\n", + "USO_Low 0\n", + "USO_Close 0\n", + "USO_Adj Close 0\n", + "USO_Volume 0\n", + "Length: 80, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Open False\n", + "High False\n", + "Low False\n", + "Close False\n", + "Adj Close False\n", + " ... \n", + "USO_High False\n", + "USO_Low False\n", + "USO_Close False\n", + "USO_Adj Close False\n", + "USO_Volume False\n", + "Length: 80, dtype: bool" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Number of missing values per feature\n", + "display(dfGold.isnull().sum())\n", + "\n", + "# Whether each feature contains at least one missing value\n", + "display(dfGold.isnull().any())\n", + "\n", + "# Percentage of missing values, reported only for features that have any;\n", + "# isnull().mean() is the vectorised form of sum() / len() per column\n", + "null_rates = dfGold.isnull().mean() * 100\n", + "for column, null_rate in null_rates[null_rates > 0].items():\n", + "    display(f\"{column} процент пустых значений: {null_rate:.2f}%\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Заполнение пустых значений для данного набора также не требуется."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Создание выборок данных" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "USB_Trend\n", + "0 876\n", + "1 842\n", + "Name: count, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'Обучающая выборка: '" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "(1030, 4)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "USB_Trend\n", + "0 525\n", + "1 505\n", + "Name: count, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'Контрольная выборка: '" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "(344, 4)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "USB_Trend\n", + "0 176\n", + "1 168\n", + "Name: count, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'Тестовая выборка: '" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "(344, 4)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "USB_Trend\n", + "0 175\n", + "1 169\n", + "Name: count, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Class balance of the stratification target before splitting\n", + "from src.utils import split_stratified_into_train_val_test\n", + "\n", + "display(dfGold.USB_Trend.value_counts())\n", + "\n", + "selected_columns = [\"Open\", \"High\", \"Low\", \"USB_Trend\"]\n", + "# Round the target on the working copy only, so dfGold itself is never mutated\n", + "# and re-running this cell always starts from the same input\n", + "data = dfGold[selected_columns].assign(\n", + "    USB_Trend=lambda frame: frame[\"USB_Trend\"].round()\n", + ")\n", + "\n", +
"# Stratified 60/20/20 split into train / validation / test subsets\n", + "# NOTE(review): no seed is passed here; if the helper accepts one, pass it so the\n", + "# split is reproducible - confirm against src.utils\n", + "dfGold_train, dfGold_val, dfGold_test, y_train, y_val, y_test = (\n", + "    split_stratified_into_train_val_test(\n", + "        data,\n", + "        stratify_colname=\"USB_Trend\",\n", + "        frac_train=0.60,\n", + "        frac_val=0.20,\n", + "        frac_test=0.20,\n", + "    )\n", + ")\n", + "\n", + "# Report the shape and class balance of every subset; the target was already\n", + "# rounded above, so no further rounding is needed here\n", + "for title, subset in (\n", + "    (\"Обучающая выборка: \", dfGold_train),\n", + "    (\"Контрольная выборка: \", dfGold_val),\n", + "    (\"Тестовая выборка: \", dfGold_test),\n", + "):\n", + "    display(title, subset.shape)\n", + "    display(subset.USB_Trend.value_counts())" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Обучающая выборка: '" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "(1030, 4)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "USB_Trend\n", + "0 525\n", + "1 505\n", + "Name: count, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'Обучающая выборка после undersampling: '" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "(1010, 4)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "USB_Trend\n", + "0 505\n", + "1 505\n", + "Name: count, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
OpenHighLowUSB_Trend
Date
2016-04-27118.970001119.699997118.4300000
2017-02-16117.930000118.349998117.8300020
2016-11-15116.459999117.239998116.2900010
2016-11-07122.660004122.709999121.8799970
2018-04-30124.410004125.199997124.1900020
...............
2012-07-13153.449997154.940002153.4400021
2016-05-25116.589996117.059998116.3200001
2016-03-02118.339996118.970001118.0700001
2013-08-05126.510002126.639999125.3399961
2013-06-20125.220001126.379997123.3300021
\n", + "

1010 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " Open High Low USB_Trend\n", + "Date \n", + "2016-04-27 118.970001 119.699997 118.430000 0\n", + "2017-02-16 117.930000 118.349998 117.830002 0\n", + "2016-11-15 116.459999 117.239998 116.290001 0\n", + "2016-11-07 122.660004 122.709999 121.879997 0\n", + "2018-04-30 124.410004 125.199997 124.190002 0\n", + "... ... ... ... ...\n", + "2012-07-13 153.449997 154.940002 153.440002 1\n", + "2016-05-25 116.589996 117.059998 116.320000 1\n", + "2016-03-02 118.339996 118.970001 118.070000 1\n", + "2013-08-05 126.510002 126.639999 125.339996 1\n", + "2013-06-20 125.220001 126.379997 123.330002 1\n", + "\n", + "[1010 rows x 4 columns]" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from imblearn.under_sampling import RandomUnderSampler\n", + "\n", + "# 'auto' under-samples every class except the minority one; the fixed\n", + "# random_state makes the choice of dropped rows reproducible across re-runs\n", + "rus = RandomUnderSampler(sampling_strategy=\"auto\", random_state=42)\n", + "\n", + "display(\"Обучающая выборка: \", dfGold_train.shape)\n", + "display(dfGold_train.USB_Trend.value_counts())\n", + "\n", + "# Separate the features from the target column\n", + "X = dfGold_train.drop(columns=[\"USB_Trend\"])\n", + "y = dfGold_train[\"USB_Trend\"]\n", + "\n", + "# Apply under-sampling to the training subset only\n", + "X_resampled, y_resampled = rus.fit_resample(X, y)\n", + "\n", + "# Re-attach the resampled target to the resampled features\n", + "dfGold_train_undersampled = pd.DataFrame(X_resampled)\n", + "dfGold_train_undersampled[\"USB_Trend\"] = y_resampled\n", + "\n", + "display(\"Обучающая выборка после undersampling: \", dfGold_train_undersampled.shape)\n", + "display(dfGold_train_undersampled.USB_Trend.value_counts())\n", + "\n", + "dfGold_train_undersampled" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "ДАТАСЕТ МАРКЕТИНГОВАЯ КОМПАНИЯ" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, +
"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}