645 lines
153 KiB
Plaintext
645 lines
153 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
"Выбранные темы: цены на акции Starbucks, магазины, оценки студентов.\n",
|
||
"Далее идут выбранные таблицы"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 55,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"import math\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import LabelEncoder\n",
|
||
"\n",
|
||
"label_encoder = LabelEncoder()\n",
|
||
"\n",
|
"def apply_oversampling(X, y):\n",
"    \"\"\"Resample ``X`` and ``y`` with imblearn's RandomOverSampler (seed 42).\n",
"\n",
"    Returns the ``(X_resampled, y_resampled)`` pair produced by\n",
"    ``fit_resample``.\n",
"    \"\"\"\n",
"    return RandomOverSampler(random_state=42).fit_resample(X, y)\n",
|
||
"\n",
|
"def apply_undersampling(X, y):\n",
"    \"\"\"Resample ``X`` and ``y`` with imblearn's RandomUnderSampler (seed 42).\n",
"\n",
"    Returns the ``(X_resampled, y_resampled)`` pair produced by\n",
"    ``fit_resample``.\n",
"    \"\"\"\n",
"    return RandomUnderSampler(random_state=42).fit_resample(X, y)\n",
|
||
"\n",
|
"def split_stratified_into_train_val_test(\n",
"    df_input,\n",
"    stratify_colname=\"y\",\n",
"    frac_train=0.6,\n",
"    frac_val=0.15,\n",
"    frac_test=0.25,\n",
"    random_state=None,\n",
"):\n",
"    \"\"\"\n",
"    Split a dataframe into stratified train, validation and test subsets.\n",
"\n",
"    The split is done by calling train_test_split() twice: first train vs.\n",
"    the remainder, then the remainder into validation vs. test. Both calls\n",
"    stratify on ``stratify_colname``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df_input : pandas.DataFrame\n",
"        Input dataframe to be split.\n",
"    stratify_colname : str\n",
"        Name of the column used for stratification (usually the label).\n",
"    frac_train, frac_val, frac_test : float\n",
"        Fractions of rows for each subset. They must sum to 1.0; the sum is\n",
"        checked with a small tolerance because exact float equality rejects\n",
"        valid inputs such as 0.7 + 0.2 + 0.1.\n",
"    random_state : int, None, or RandomState instance\n",
"        Passed to both train_test_split() calls.\n",
"\n",
"    Returns\n",
"    -------\n",
"    df_train, df_val, df_test : pandas.DataFrame\n",
"        The three splits; their lengths add up to ``len(df_input)``.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If the fractions do not sum to 1.0 or ``stratify_colname`` is not a\n",
"        column of ``df_input``.\n",
"    \"\"\"\n",
"\n",
"    if not math.isclose(frac_train + frac_val + frac_test, 1.0, abs_tol=1e-9):\n",
"        raise ValueError(\n",
"            \"fractions %f, %f, %f do not add up to 1.0\"\n",
"            % (frac_train, frac_val, frac_test)\n",
"        )\n",
"\n",
"    if stratify_colname not in df_input.columns:\n",
"        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
"\n",
"    X = df_input  # Contains all columns.\n",
"    y = df_input[[stratify_colname]]  # Dataframe of just the stratification column.\n",
"\n",
"    # First split: train vs. everything else.\n",
"    df_train, df_temp, _, y_temp = train_test_split(\n",
"        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
"    )\n",
"\n",
"    # Second split: divide the remainder into val and test. The test share is\n",
"    # re-expressed relative to the remainder, not to the full dataframe.\n",
"    relative_frac_test = frac_test / (frac_val + frac_test)\n",
"    df_val, df_test, _, _ = train_test_split(\n",
"        df_temp,\n",
"        y_temp,\n",
"        stratify=y_temp,\n",
"        test_size=relative_frac_test,\n",
"        random_state=random_state,\n",
"    )\n",
"\n",
"    # Sanity check: no row was lost or duplicated by the two splits.\n",
"    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"\n",
"    return df_train, df_val, df_test"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
"Отслеживание цен на акции Starbucks. Объекты связаны между собой датой, т.е. каждая следующая строка — это новый торговый день. Можно узнать, как объём торгов зависит от изменения цены акций. Будет полезно трейдинговым компаниям. Целевой признак — объём торгов (Volume)."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 8036 entries, 0 to 8035\n",
|
||
"Data columns (total 7 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 Date 8036 non-null object \n",
|
||
" 1 Open 8036 non-null float64\n",
|
||
" 2 High 8036 non-null float64\n",
|
||
" 3 Low 8036 non-null float64\n",
|
||
" 4 Close 8036 non-null float64\n",
|
||
" 5 Adj Close 8036 non-null float64\n",
|
||
" 6 Volume 8036 non-null int64 \n",
|
||
"dtypes: float64(5), int64(1), object(1)\n",
|
||
"memory usage: 439.6+ KB\n",
|
||
"Date 0\n",
|
||
"Open 0\n",
|
||
"High 0\n",
|
||
"Low 0\n",
|
||
"Close 0\n",
|
||
"Adj Close 0\n",
|
||
"Volume 0\n",
|
||
"dtype: int64\n",
|
||
" Open High Low Close Adj Close \\\n",
|
||
"count 8036.000000 8036.000000 8036.000000 8036.000000 8036.000000 \n",
|
||
"mean 30.054280 30.351487 29.751322 30.058857 26.674025 \n",
|
||
"std 33.615577 33.906613 33.314569 33.615911 31.728090 \n",
|
||
"min 0.328125 0.347656 0.320313 0.335938 0.260703 \n",
|
||
"25% 4.392031 4.531250 4.304922 4.399610 3.414300 \n",
|
||
"50% 13.325000 13.493750 13.150000 13.330000 10.352452 \n",
|
||
"75% 55.250000 55.722501 54.852499 55.267499 47.464829 \n",
|
||
"max 126.080002 126.320000 124.809998 126.059998 118.010414 \n",
|
||
"\n",
|
||
" Volume \n",
|
||
"count 8.036000e+03 \n",
|
||
"mean 1.470459e+07 \n",
|
||
"std 1.340021e+07 \n",
|
||
"min 1.504000e+06 \n",
|
||
"25% 7.817750e+06 \n",
|
||
"50% 1.169815e+07 \n",
|
||
"75% 1.778795e+07 \n",
|
||
"max 5.855088e+08 \n",
|
||
"\n",
|
||
"[1]\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
"df1 = pd.read_csv(\"../data/coffee.csv\")\n",
"df1.info()\n",
"print(df1.isnull().sum())\n",
"print(df1.describe())\n",
"print()\n",
"# Prints [1] when every date occurs exactly once.\n",
"print(df1[\"Date\"].value_counts().unique())\n",
"print()\n",
"\n",
"# Date is loaded as text (object dtype). Convert it for plotting only, so\n",
"# matplotlib draws a real time axis instead of one categorical tick per row.\n",
"fig, ax = plt.subplots()\n",
"ax.plot(pd.to_datetime(df1[\"Date\"]), df1[\"High\"])\n",
"ax.set(title=\"Максимальная цена акций Starbucks по дням\", xlabel=\"Дата\", ylabel=\"High\")\n",
"plt.show()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Данные по всем параметрам являются правильными, без шумов, без выбросов, актуальными."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Магазины. Каждая строка представляет собой магазин, его площадь, количество продуктов, количество покупателей и объем продаж. Позволяет увидеть изменения количества продаж относительно размеров магазина и количества покупателей. Ключевой признак - количество продаж"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 896 entries, 0 to 895\n",
|
||
"Data columns (total 5 columns):\n",
|
||
" # Column Non-Null Count Dtype\n",
|
||
"--- ------ -------------- -----\n",
|
||
" 0 Store ID 896 non-null int64\n",
|
||
" 1 Store_Area 896 non-null int64\n",
|
||
" 2 Items_Available 896 non-null int64\n",
|
||
" 3 Daily_Customer_Count 896 non-null int64\n",
|
||
" 4 Store_Sales 896 non-null int64\n",
|
||
"dtypes: int64(5)\n",
|
||
"memory usage: 35.1 KB\n",
|
||
"Store ID 0\n",
|
||
"Store_Area 0\n",
|
||
"Items_Available 0\n",
|
||
"Daily_Customer_Count 0\n",
|
||
"Store_Sales 0\n",
|
||
"dtype: int64\n",
|
||
" Store ID Store_Area Items_Available Daily_Customer_Count \\\n",
|
||
"count 896.000000 896.000000 896.000000 896.000000 \n",
|
||
"mean 448.500000 1485.409598 1782.035714 786.350446 \n",
|
||
"std 258.797218 250.237011 299.872053 265.389281 \n",
|
||
"min 1.000000 775.000000 932.000000 10.000000 \n",
|
||
"25% 224.750000 1316.750000 1575.500000 600.000000 \n",
|
||
"50% 448.500000 1477.000000 1773.500000 780.000000 \n",
|
||
"75% 672.250000 1653.500000 1982.750000 970.000000 \n",
|
||
"max 896.000000 2229.000000 2667.000000 1560.000000 \n",
|
||
"\n",
|
||
" Store_Sales \n",
|
||
"count 896.000000 \n",
|
||
"mean 59351.305804 \n",
|
||
"std 17190.741895 \n",
|
||
"min 14920.000000 \n",
|
||
"25% 46530.000000 \n",
|
||
"50% 58605.000000 \n",
|
||
"75% 71872.500000 \n",
|
||
"max 116320.000000 \n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
"df2 = pd.read_csv(\"../data/store.csv\")\n",
"df2.info()\n",
"print(df2.isnull().sum())\n",
"print(df2.describe())\n",
"print()\n",
"\n",
"# Label the figure so it can be read without looking at the code.\n",
"fig, ax = plt.subplots()\n",
"ax.scatter(df2[\"Store_Sales\"], df2[\"Daily_Customer_Count\"])\n",
"ax.set(\n",
"    title=\"Продажи и количество покупателей\",\n",
"    xlabel=\"Store_Sales\",\n",
"    ylabel=\"Daily_Customer_Count\",\n",
")\n",
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Данные имеют некоторое количество выбросов, что видно на графике."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
"Оценки студентов. Каждая строка содержит оценки конкретного студента. Позволяет анализировать результаты в зависимости от пола, расы и уровня образования родителей. Пригодится онлайн-школам для определения контингента покупателей курсов. Ключевыми значениями являются оценки по предметам."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 1000 entries, 0 to 999\n",
|
||
"Data columns (total 8 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 gender 1000 non-null object\n",
|
||
" 1 race/ethnicity 1000 non-null object\n",
|
||
" 2 parental level of education 1000 non-null object\n",
|
||
" 3 lunch 1000 non-null object\n",
|
||
" 4 test preparation course 1000 non-null object\n",
|
||
" 5 math score 1000 non-null int64 \n",
|
||
" 6 reading score 1000 non-null int64 \n",
|
||
" 7 writing score 1000 non-null int64 \n",
|
||
"dtypes: int64(3), object(5)\n",
|
||
"memory usage: 62.6+ KB\n",
|
||
" gender race/ethnicity parental level of education lunch \\\n",
|
||
"0 female group B bachelor's degree standard \n",
|
||
"1 female group C some college standard \n",
|
||
"2 female group B master's degree standard \n",
|
||
"3 male group A associate's degree free/reduced \n",
|
||
"4 male group C some college standard \n",
|
||
"\n",
|
||
" test preparation course math score reading score writing score score \n",
|
||
"0 none 72 72 74 72.666667 \n",
|
||
"1 completed 69 90 88 82.333333 \n",
|
||
"2 none 90 95 93 92.666667 \n",
|
||
"3 none 47 57 44 49.333333 \n",
|
||
"4 none 76 78 75 76.333333 \n",
|
||
"gender 0\n",
|
||
"race/ethnicity 0\n",
|
||
"parental level of education 0\n",
|
||
"lunch 0\n",
|
||
"test preparation course 0\n",
|
||
"math score 0\n",
|
||
"reading score 0\n",
|
||
"writing score 0\n",
|
||
"score 0\n",
|
||
"dtype: int64\n",
|
||
" math score reading score writing score score\n",
|
||
"count 1000.00000 1000.000000 1000.000000 1000.000000\n",
|
||
"mean 66.08900 69.169000 68.054000 67.770667\n",
|
||
"std 15.16308 14.600192 15.195657 14.257326\n",
|
||
"min 0.00000 17.000000 10.000000 9.000000\n",
|
||
"25% 57.00000 59.000000 57.750000 58.333333\n",
|
||
"50% 66.00000 70.000000 69.000000 68.333333\n",
|
||
"75% 77.00000 79.000000 79.000000 77.666667\n",
|
||
"max 100.00000 100.000000 100.000000 100.000000\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
"df3 = pd.read_csv(\"../data/student.csv\")\n",
"df3.info()\n",
"# Average of the three subject scores; the split cell further down reads this column.\n",
"df3[\"score\"] = (df3[\"math score\"] + df3[\"reading score\"] + df3[\"writing score\"]) / 3\n",
"print(df3.head())\n",
"print(df3.isnull().sum())\n",
"print(df3.describe())\n",
"print()\n",
"\n",
"# Label the figure so it can be read without looking at the code.\n",
"fig, ax = plt.subplots()\n",
"ax.scatter(df3[\"score\"], df3[\"parental level of education\"])\n",
"ax.set(\n",
"    title=\"Средний балл и уровень образования родителей\",\n",
"    xlabel=\"score\",\n",
"    ylabel=\"parental level of education\",\n",
")\n",
"plt.show()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Для всех выбранных тем отсутствуют пустые ячейки. Заполнение пустых ячеек не требуется. Данные вполне реальные."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Разбиение наборов на выборки."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Акции старбакс."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 63,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Обучающая выборка: (4821, 4)\n",
|
||
"Volume_Grouped\n",
|
||
" 0 2802\n",
|
||
" 1 1460\n",
|
||
" 2 369\n",
|
||
" 3 111\n",
|
||
" 4 40\n",
|
||
" 5 18\n",
|
||
"-1 10\n",
|
||
" 6 7\n",
|
||
" 7 4\n",
|
||
"Name: count, dtype: int64\n"
|
||
]
|
||
},
|
||
{
|
||
"ename": "RuntimeError",
|
||
"evalue": "Not any neigbours belong to the majority class. This case will induce a NaN case with a division by zero. ADASYN is not suited for this specific dataset. Use SMOTE instead.",
|
||
"output_type": "error",
|
||
"traceback": [
|
||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||
"\u001b[1;31mRuntimeError\u001b[0m Traceback (most recent call last)",
|
||
"Cell \u001b[1;32mIn[63], line 18\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mОбучающая выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_coffee_train\u001b[38;5;241m.\u001b[39mshape)\n\u001b[0;32m 16\u001b[0m \u001b[38;5;28mprint\u001b[39m(df_coffee_train[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mVolume_Grouped\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mvalue_counts())\n\u001b[1;32m---> 18\u001b[0m X_resampled, y_resampled \u001b[38;5;241m=\u001b[39m \u001b[43mada\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_resample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf_coffee_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdf_coffee_train\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mVolume_Grouped\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 19\u001b[0m df_coffee_train_adasyn \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(X_resampled)\n\u001b[0;32m 21\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mОбучающая выборка после oversampling: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_coffee_train_adasyn\u001b[38;5;241m.\u001b[39mshape)\n",
|
||
"File \u001b[1;32mc:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\imblearn\\base.py:208\u001b[0m, in \u001b[0;36mBaseSampler.fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 187\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Resample the dataset.\u001b[39;00m\n\u001b[0;32m 188\u001b[0m \n\u001b[0;32m 189\u001b[0m \u001b[38;5;124;03mParameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 205\u001b[0m \u001b[38;5;124;03m The corresponding label of `X_resampled`.\u001b[39;00m\n\u001b[0;32m 206\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[1;32m--> 208\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_resample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n",
|
||
"File \u001b[1;32mc:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\imblearn\\base.py:112\u001b[0m, in \u001b[0;36mSamplerMixin.fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 106\u001b[0m X, y, binarize_y \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_X_y(X, y)\n\u001b[0;32m 108\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msampling_strategy_ \u001b[38;5;241m=\u001b[39m check_sampling_strategy(\n\u001b[0;32m 109\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msampling_strategy, y, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sampling_type\n\u001b[0;32m 110\u001b[0m )\n\u001b[1;32m--> 112\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit_resample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 114\u001b[0m y_ \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 115\u001b[0m label_binarize(output[\u001b[38;5;241m1\u001b[39m], classes\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39munique(y)) \u001b[38;5;28;01mif\u001b[39;00m binarize_y \u001b[38;5;28;01melse\u001b[39;00m output[\u001b[38;5;241m1\u001b[39m]\n\u001b[0;32m 116\u001b[0m )\n\u001b[0;32m 118\u001b[0m X_, y_ \u001b[38;5;241m=\u001b[39m arrays_transformer\u001b[38;5;241m.\u001b[39mtransform(output[\u001b[38;5;241m0\u001b[39m], y_)\n",
|
||
"File \u001b[1;32mc:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\imblearn\\over_sampling\\_adasyn.py:183\u001b[0m, in \u001b[0;36mADASYN._fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 181\u001b[0m ratio_nn \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39msum(y[nns] \u001b[38;5;241m!=\u001b[39m class_sample, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m) \u001b[38;5;241m/\u001b[39m n_neighbors\n\u001b[0;32m 182\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m np\u001b[38;5;241m.\u001b[39msum(ratio_nn):\n\u001b[1;32m--> 183\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[0;32m 184\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNot any neigbours belong to the majority\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 185\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m class. This case will induce a NaN case\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 186\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m with a division by zero. ADASYN is not\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 187\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m suited for this specific dataset.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 188\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m Use SMOTE instead.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 189\u001b[0m )\n\u001b[0;32m 190\u001b[0m ratio_nn \u001b[38;5;241m/\u001b[39m\u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39msum(ratio_nn)\n\u001b[0;32m 191\u001b[0m n_samples_generate \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mrint(ratio_nn \u001b[38;5;241m*\u001b[39m n_samples)\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mint\u001b[39m)\n",
|
||
"\u001b[1;31mRuntimeError\u001b[0m: Not any neigbours belong to the majority class. This case will induce a NaN case with a division by zero. ADASYN is not suited for this specific dataset. Use SMOTE instead."
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"coffee_binned = df1[[\"Volume\", \"High\", \"Low\"]].copy()\n",
"coffee_binned[\"Volume_Grouped\"] = pd.cut(coffee_binned[\"Volume\"], bins=50, labels=False)\n",
"\n",
"# Merge every bin with fewer than min_samples_per_interval rows into one\n",
"# catch-all class (-1).\n",
"min_samples_per_interval = 5\n",
"interval_counts = coffee_binned[\"Volume_Grouped\"].value_counts()\n",
"rare_intervals = interval_counts[interval_counts < min_samples_per_interval].index\n",
"coffee_binned.loc[coffee_binned[\"Volume_Grouped\"].isin(rare_intervals), \"Volume_Grouped\"] = -1\n",
"\n",
"# A fixed seed keeps the split identical across Restart & Run All.\n",
"df_coffee_train, df_coffee_val, df_coffee_test = split_stratified_into_train_val_test(\n",
"    coffee_binned,\n",
"    stratify_colname=\"Volume_Grouped\",\n",
"    frac_train=0.60,\n",
"    frac_val=0.20,\n",
"    frac_test=0.20,\n",
"    random_state=42,\n",
")\n",
"\n",
"print(\"Обучающая выборка: \", df_coffee_train.shape)\n",
"print(df_coffee_train[\"Volume_Grouped\"].value_counts())\n",
"\n",
"# Oversample the training split only, using the helper defined in the first\n",
"# cell. The previous code called an undefined `ada` object and counted a\n",
"# non-existent `Pclass` column. The whole frame is passed as X so the\n",
"# resampled result still carries the Volume_Grouped column.\n",
"df_coffee_train_oversampled, _ = apply_oversampling(\n",
"    df_coffee_train, df_coffee_train[\"Volume_Grouped\"]\n",
")\n",
"\n",
"print(\"Обучающая выборка после oversampling: \", df_coffee_train_oversampled.shape)\n",
"print(df_coffee_train_oversampled[\"Volume_Grouped\"].value_counts())\n",
"\n",
"print(\"Контрольная выборка: \", df_coffee_val.shape)\n",
"print(df_coffee_val[\"Volume_Grouped\"].value_counts())\n",
"\n",
"print(\"Тестовая выборка: \", df_coffee_test.shape)\n",
"print(df_coffee_test[\"Volume_Grouped\"].value_counts())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Магазины"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 46,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Обучающая выборка: (537, 4)\n",
|
||
"Sales_Grouped\n",
|
||
" 2 184\n",
|
||
" 3 148\n",
|
||
" 1 135\n",
|
||
" 4 45\n",
|
||
" 0 20\n",
|
||
"-1 5\n",
|
||
"Name: count, dtype: int64\n",
|
||
"Контрольная выборка: (179, 4)\n",
|
||
"Sales_Grouped\n",
|
||
" 2 61\n",
|
||
" 3 49\n",
|
||
" 1 45\n",
|
||
" 4 15\n",
|
||
" 0 7\n",
|
||
"-1 2\n",
|
||
"Name: count, dtype: int64\n",
|
||
"Тестовая выборка: (180, 4)\n",
|
||
"Sales_Grouped\n",
|
||
" 2 61\n",
|
||
" 3 50\n",
|
||
" 1 45\n",
|
||
" 4 15\n",
|
||
" 0 7\n",
|
||
"-1 2\n",
|
||
"Name: count, dtype: int64\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"shop_binned = df2[[\"Store_Sales\", \"Store_Area\", \"Daily_Customer_Count\"]].copy()\n",
"shop_binned[\"Sales_Grouped\"] = pd.cut(shop_binned[\"Store_Sales\"], bins=6, labels=False)\n",
"\n",
"# Merge every bin with fewer than min_samples_per_interval rows into one\n",
"# catch-all class (-1).\n",
"min_samples_per_interval = 10\n",
"interval_counts = shop_binned[\"Sales_Grouped\"].value_counts()\n",
"rare_intervals = interval_counts[interval_counts < min_samples_per_interval].index\n",
"shop_binned.loc[shop_binned[\"Sales_Grouped\"].isin(rare_intervals), \"Sales_Grouped\"] = -1\n",
"\n",
"# A fixed seed keeps the split identical across Restart & Run All.\n",
"df_shop_train, df_shop_val, df_shop_test = split_stratified_into_train_val_test(\n",
"    shop_binned,\n",
"    stratify_colname=\"Sales_Grouped\",\n",
"    frac_train=0.60,\n",
"    frac_val=0.20,\n",
"    frac_test=0.20,\n",
"    random_state=42,\n",
")\n",
"\n",
"print(\"Обучающая выборка: \", df_shop_train.shape)\n",
"print(df_shop_train[\"Sales_Grouped\"].value_counts())\n",
"\n",
"# Oversample the shop training split only. The previous code resampled\n",
"# df_mark_train (created in a later cell) through an undefined `ada` object\n",
"# and counted a non-existent `Pclass` column. The whole frame is passed as X\n",
"# so the resampled result still carries the Sales_Grouped column.\n",
"df_shop_train_oversampled, _ = apply_oversampling(\n",
"    df_shop_train, df_shop_train[\"Sales_Grouped\"]\n",
")\n",
"\n",
"print(\"Обучающая выборка после oversampling: \", df_shop_train_oversampled.shape)\n",
"print(df_shop_train_oversampled[\"Sales_Grouped\"].value_counts())\n",
"\n",
"print(\"Контрольная выборка: \", df_shop_val.shape)\n",
"print(df_shop_val[\"Sales_Grouped\"].value_counts())\n",
"\n",
"print(\"Тестовая выборка: \", df_shop_test.shape)\n",
"print(df_shop_test[\"Sales_Grouped\"].value_counts())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Оценки студентов"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 58,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Обучающая выборка: (600, 4)\n",
|
||
"score_grouped\n",
|
||
" 3 283\n",
|
||
" 2 181\n",
|
||
" 4 101\n",
|
||
" 1 31\n",
|
||
"-1 4\n",
|
||
"Name: count, dtype: int64\n"
|
||
]
|
||
},
|
||
{
|
||
"ename": "ValueError",
|
||
"evalue": "Expected n_neighbors <= n_samples_fit, but n_neighbors = 6, n_samples_fit = 4, n_samples = 4",
|
||
"output_type": "error",
|
||
"traceback": [
|
||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
|
||
"Cell \u001b[1;32mIn[58], line 23\u001b[0m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mОбучающая выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_mark_train\u001b[38;5;241m.\u001b[39mshape)\n\u001b[0;32m 21\u001b[0m \u001b[38;5;28mprint\u001b[39m(df_mark_train[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mscore_grouped\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mvalue_counts())\n\u001b[1;32m---> 23\u001b[0m X_resampled, y_resampled \u001b[38;5;241m=\u001b[39m \u001b[43mada\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_resample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf_mark_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdf_mark_train\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mscore_grouped\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 24\u001b[0m df_mark_train_adasyn \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(X_resampled)\n\u001b[0;32m 26\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mОбучающая выборка после oversampling: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_mark_train_adasyn\u001b[38;5;241m.\u001b[39mshape)\n",
|
||
"File \u001b[1;32mc:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\imblearn\\base.py:208\u001b[0m, in \u001b[0;36mBaseSampler.fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 187\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Resample the dataset.\u001b[39;00m\n\u001b[0;32m 188\u001b[0m \n\u001b[0;32m 189\u001b[0m \u001b[38;5;124;03mParameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 205\u001b[0m \u001b[38;5;124;03m The corresponding label of `X_resampled`.\u001b[39;00m\n\u001b[0;32m 206\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[1;32m--> 208\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_resample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n",
|
||
"File \u001b[1;32mc:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\imblearn\\base.py:112\u001b[0m, in \u001b[0;36mSamplerMixin.fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 106\u001b[0m X, y, binarize_y \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_X_y(X, y)\n\u001b[0;32m 108\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msampling_strategy_ \u001b[38;5;241m=\u001b[39m check_sampling_strategy(\n\u001b[0;32m 109\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msampling_strategy, y, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sampling_type\n\u001b[0;32m 110\u001b[0m )\n\u001b[1;32m--> 112\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit_resample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 114\u001b[0m y_ \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 115\u001b[0m label_binarize(output[\u001b[38;5;241m1\u001b[39m], classes\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39munique(y)) \u001b[38;5;28;01mif\u001b[39;00m binarize_y \u001b[38;5;28;01melse\u001b[39;00m output[\u001b[38;5;241m1\u001b[39m]\n\u001b[0;32m 116\u001b[0m )\n\u001b[0;32m 118\u001b[0m X_, y_ \u001b[38;5;241m=\u001b[39m arrays_transformer\u001b[38;5;241m.\u001b[39mtransform(output[\u001b[38;5;241m0\u001b[39m], y_)\n",
|
||
"File \u001b[1;32mc:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\imblearn\\over_sampling\\_adasyn.py:202\u001b[0m, in \u001b[0;36mADASYN._fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 199\u001b[0m \u001b[38;5;66;03m# the nearest neighbors need to be fitted only on the current class\u001b[39;00m\n\u001b[0;32m 200\u001b[0m \u001b[38;5;66;03m# to find the class NN to generate new samples\u001b[39;00m\n\u001b[0;32m 201\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnn_\u001b[38;5;241m.\u001b[39mfit(X_class)\n\u001b[1;32m--> 202\u001b[0m nns \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnn_\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkneighbors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_class\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreturn_distance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m[:, \u001b[38;5;241m1\u001b[39m:]\n\u001b[0;32m 204\u001b[0m enumerated_class_indices \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39marange(\u001b[38;5;28mlen\u001b[39m(target_class_indices))\n\u001b[0;32m 205\u001b[0m rows \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mrepeat(enumerated_class_indices, n_samples_generate)\n",
|
||
"File \u001b[1;32mc:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\sklearn\\neighbors\\_base.py:834\u001b[0m, in \u001b[0;36mKNeighborsMixin.kneighbors\u001b[1;34m(self, X, n_neighbors, return_distance)\u001b[0m\n\u001b[0;32m 832\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 833\u001b[0m inequality_str \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mn_neighbors <= n_samples_fit\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m--> 834\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 835\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExpected \u001b[39m\u001b[38;5;132;01m{\u001b[39;00minequality_str\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, but \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 836\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mn_neighbors = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mn_neighbors\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, n_samples_fit = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mn_samples_fit\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 837\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mn_samples = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mX\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;66;03m# include n_samples for common tests\u001b[39;00m\n\u001b[0;32m 838\u001b[0m )\n\u001b[0;32m 840\u001b[0m n_jobs \u001b[38;5;241m=\u001b[39m effective_n_jobs(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_jobs)\n\u001b[0;32m 841\u001b[0m chunked_results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
||
"\u001b[1;31mValueError\u001b[0m: Expected n_neighbors <= n_samples_fit, but n_neighbors = 6, n_samples_fit = 4, n_samples = 4"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"mark_binned = df3[[\"score\", \"gender\", \"race/ethnicity\"]].copy()\n",
"mark_binned[\"score_grouped\"] = pd.cut(mark_binned[\"score\"], bins=5, labels=False)\n",
"\n",
"# fit_transform refits the shared encoder on each call, so one instance can\n",
"# encode both columns; only the last fit stays stored in label_encoder.\n",
"mark_binned[\"gender\"] = label_encoder.fit_transform(mark_binned[\"gender\"])\n",
"mark_binned[\"race/ethnicity\"] = label_encoder.fit_transform(mark_binned[\"race/ethnicity\"])\n",
"\n",
"# Merge every bin with fewer than min_samples_per_interval rows into one\n",
"# catch-all class (-1).\n",
"min_samples_per_interval = 10\n",
"interval_counts = mark_binned[\"score_grouped\"].value_counts()\n",
"rare_intervals = interval_counts[interval_counts < min_samples_per_interval].index\n",
"mark_binned.loc[mark_binned[\"score_grouped\"].isin(rare_intervals), \"score_grouped\"] = -1\n",
"\n",
"# A fixed seed keeps the split identical across Restart & Run All.\n",
"df_mark_train, df_mark_val, df_mark_test = split_stratified_into_train_val_test(\n",
"    mark_binned,\n",
"    stratify_colname=\"score_grouped\",\n",
"    frac_train=0.60,\n",
"    frac_val=0.20,\n",
"    frac_test=0.20,\n",
"    random_state=42,\n",
")\n",
"\n",
"print(\"Обучающая выборка: \", df_mark_train.shape)\n",
"print(df_mark_train[\"score_grouped\"].value_counts())\n",
"\n",
"# Oversample the training split only, using the helper defined in the first\n",
"# cell. The previous code called an undefined `ada` object and counted a\n",
"# non-existent `Pclass` column. The whole frame is passed as X so the\n",
"# resampled result still carries the score_grouped column.\n",
"df_mark_train_oversampled, _ = apply_oversampling(\n",
"    df_mark_train, df_mark_train[\"score_grouped\"]\n",
")\n",
"\n",
"print(\"Обучающая выборка после oversampling: \", df_mark_train_oversampled.shape)\n",
"print(df_mark_train_oversampled[\"score_grouped\"].value_counts())\n",
"\n",
"print(\"Контрольная выборка: \", df_mark_val.shape)\n",
"print(df_mark_val[\"score_grouped\"].value_counts())\n",
"\n",
"print(\"Тестовая выборка: \", df_mark_test.shape)\n",
"print(df_mark_test[\"score_grouped\"].value_counts())"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "aimvenv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.5"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|