lab 2 is done
This commit is contained in:
parent
792fbe75b9
commit
9b2346eabc
@ -1,3 +1,4 @@
|
||||
# AIM_PIbd-31_Tabeev_A.P
|
||||
# Табеев Александр Павлович
|
||||
# Вариант 3
|
||||
# https://clck.yandex.ru/redir/nWO_r1F33ck?data=NnBZTWRhdFZKOHRaTENSMFc4S0VQUGMtSXk0bDRzNnVpakFkYjNNRE5ZNFRuVk4yRGpQaHlFSHNPNVpscDY1RGxPdmF0UFlUU3V4cmpoNDBvcE5vQVAxUzRxUzFpU1YzejluV1ozdUpoS1ZMeWdjcHktYS1IT2diWFhTLWpZcVhkdzF4a25GUGRYZGtSQ3RsclBRV1RiTWJsdFlyNVFIV1MyVHp0NDJTY21Z&b64e=2&sign=0e97a68a5fb67b83ca9ea592b182bbae&keyno=17
|
@ -11,7 +11,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
700
lab_2/lab2.ipynb
Normal file
700
lab_2/lab2.ipynb
Normal file
@ -0,0 +1,700 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Загрузка датасетов и вывод информации"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"RangeIndex: 8036 entries, 0 to 8035\n",
|
||||
"Data columns (total 7 columns):\n",
|
||||
" # Column Non-Null Count Dtype \n",
|
||||
"--- ------ -------------- ----- \n",
|
||||
" 0 Date 8036 non-null object \n",
|
||||
" 1 Open 8036 non-null float64\n",
|
||||
" 2 High 8036 non-null float64\n",
|
||||
" 3 Low 8036 non-null float64\n",
|
||||
" 4 Close 8036 non-null float64\n",
|
||||
" 5 Adj Close 8036 non-null float64\n",
|
||||
" 6 Volume 8036 non-null int64 \n",
|
||||
"dtypes: float64(5), int64(1), object(1)\n",
|
||||
"memory usage: 439.6+ KB\n",
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"RangeIndex: 896 entries, 0 to 895\n",
|
||||
"Data columns (total 5 columns):\n",
|
||||
" # Column Non-Null Count Dtype\n",
|
||||
"--- ------ -------------- -----\n",
|
||||
" 0 Store ID 896 non-null int64\n",
|
||||
" 1 Store_Area 896 non-null int64\n",
|
||||
" 2 Items_Available 896 non-null int64\n",
|
||||
" 3 Daily_Customer_Count 896 non-null int64\n",
|
||||
" 4 Store_Sales 896 non-null int64\n",
|
||||
"dtypes: int64(5)\n",
|
||||
"memory usage: 35.1 KB\n",
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"RangeIndex: 1000 entries, 0 to 999\n",
|
||||
"Data columns (total 8 columns):\n",
|
||||
" # Column Non-Null Count Dtype \n",
|
||||
"--- ------ -------------- ----- \n",
|
||||
" 0 gender 1000 non-null object\n",
|
||||
" 1 race/ethnicity 1000 non-null object\n",
|
||||
" 2 parental level of education 1000 non-null object\n",
|
||||
" 3 lunch 1000 non-null object\n",
|
||||
" 4 test preparation course 1000 non-null object\n",
|
||||
" 5 math score 1000 non-null int64 \n",
|
||||
" 6 reading score 1000 non-null int64 \n",
|
||||
" 7 writing score 1000 non-null int64 \n",
|
||||
"dtypes: int64(3), object(5)\n",
|
||||
"memory usage: 62.6+ KB\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"df = pd.read_csv(\"datasets_lab2/coffee.csv\")\n",
|
||||
"df2 = pd.read_csv(\"datasets_lab2/Stores.csv\")\n",
|
||||
"df3 = pd.read_csv(\"datasets_lab2/StudentsPerformance.csv\")\n",
|
||||
"df.info()\n",
|
||||
"df2.info()\n",
|
||||
"df3.info()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Проблемная область"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Первый датасет coffee.csv содержит историю котировок акций кофейни Starbucks и позволяет прогнозировать их будущие показатели. Второй датасет Stores.csv — сведения о магазинах (площадь, ассортимент, посещаемость, продажи). Третий датасет StudentsPerformance.csv — успеваемость студентов на экзаменах."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Анализ набора данных"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Объекты: \n",
|
||||
"1. Акции\n",
|
||||
"2. Магазины\n",
|
||||
"3. Успеваемость студентов\n",
|
||||
"Атрибуты: \n",
|
||||
"1. Дата; начальная цена за день; максимальная цена; минимальная цена; цена на момент закрытия продаж; скорректированная цена на момент закрытия; объем торговли акций за день.\n",
|
||||
"2. Идентификатор магазина; физическая площадь; кол-во доступных товаров; количество покупателей, посетивших магазины в среднем за месяц; продажи в магазинах (в долларах США).\n",
|
||||
"3. Пол, раса/этническая принадлежность, уровень образования родителей, обед, курс подготовки к тестированию, оценка по математике, оценка по чтению, оценка по письму."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Бизнес-цели"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"1. Анализ показателей акций Starbucks — прогнозирование будущей стоимости акций для улучшения финансовых решений, предотвращения резких убытков и повышения прибыли.\n",
|
||||
"2. Оценка эффективности магазинов — понимание факторов, влияющих на успешность магазинов, включая выручку и посещаемость, что поможет принимать решения о возможном расширении сети или оптимизации текущей стратегии размещения.\n",
|
||||
"3. Анализ образовательной успеваемости студентов — выявление факторов, влияющих на академические успехи для разработки программ поддержки студентов."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Примеры целей технического проекта. Что поступает на вход, что является целевым признаком."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"На вход подаются следующие датасеты; для каждого указан целевой признак:\n",
|
||||
"1. coffee.csv — максимальная цена акций за день.\n",
|
||||
"2. Stores.csv — выручка магазина.\n",
|
||||
"3. StudentsPerformance.csv — общий средний балл (среднее значение по математике, чтению и письму)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Проблемы набора данных и их решения"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"1. Устаревшие данные: Важно проверять актуальность данных, особенно для финансовых временных рядов и данных о магазинах. Для этого старые записи могут быть удалены или обновлены, если доступны более свежие данные.\n",
|
||||
"2. Выбросы: Необходимо выявить аномалии, такие как резкие изменения в ценах акций или посещаемости, и принять решение об их удалении или сглаживании, исходя из размера выборки."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Качество набора данных"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Наборы данных содержат достаточно примеров и признаков для обучения модели. Учтены различные ситуации проблемной области. Данные соответствуют данным, которые будут подаваться в производственной среде. Все метки согласованы."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Поиск аномалий"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Open High Low Close Adj Close \\\n",
|
||||
"count 8036.000000 8036.000000 8036.000000 8036.000000 8036.000000 \n",
|
||||
"mean 30.054280 30.351487 29.751322 30.058857 26.674025 \n",
|
||||
"std 33.615577 33.906613 33.314569 33.615911 31.728090 \n",
|
||||
"min 0.328125 0.347656 0.320313 0.335938 0.260703 \n",
|
||||
"25% 4.392031 4.531250 4.304922 4.399610 3.414300 \n",
|
||||
"50% 13.325000 13.493750 13.150000 13.330000 10.352452 \n",
|
||||
"75% 55.250000 55.722501 54.852499 55.267499 47.464829 \n",
|
||||
"max 126.080002 126.320000 124.809998 126.059998 118.010414 \n",
|
||||
"\n",
|
||||
" Volume \n",
|
||||
"count 8.036000e+03 \n",
|
||||
"mean 1.470459e+07 \n",
|
||||
"std 1.340021e+07 \n",
|
||||
"min 1.504000e+06 \n",
|
||||
"25% 7.817750e+06 \n",
|
||||
"50% 1.169815e+07 \n",
|
||||
"75% 1.778795e+07 \n",
|
||||
"max 5.855088e+08 \n",
|
||||
" Store ID Store_Area Items_Available Daily_Customer_Count \\\n",
|
||||
"count 896.000000 896.000000 896.000000 896.000000 \n",
|
||||
"mean 448.500000 1485.409598 1782.035714 786.350446 \n",
|
||||
"std 258.797218 250.237011 299.872053 265.389281 \n",
|
||||
"min 1.000000 775.000000 932.000000 10.000000 \n",
|
||||
"25% 224.750000 1316.750000 1575.500000 600.000000 \n",
|
||||
"50% 448.500000 1477.000000 1773.500000 780.000000 \n",
|
||||
"75% 672.250000 1653.500000 1982.750000 970.000000 \n",
|
||||
"max 896.000000 2229.000000 2667.000000 1560.000000 \n",
|
||||
"\n",
|
||||
" Store_Sales \n",
|
||||
"count 896.000000 \n",
|
||||
"mean 59351.305804 \n",
|
||||
"std 17190.741895 \n",
|
||||
"min 14920.000000 \n",
|
||||
"25% 46530.000000 \n",
|
||||
"50% 58605.000000 \n",
|
||||
"75% 71872.500000 \n",
|
||||
"max 116320.000000 \n",
|
||||
" math score reading score writing score\n",
|
||||
"count 1000.00000 1000.000000 1000.000000\n",
|
||||
"mean 66.08900 69.169000 68.054000\n",
|
||||
"std 15.16308 14.600192 15.195657\n",
|
||||
"min 0.00000 17.000000 10.000000\n",
|
||||
"25% 57.00000 59.000000 57.750000\n",
|
||||
"50% 66.00000 70.000000 69.000000\n",
|
||||
"75% 77.00000 79.000000 79.000000\n",
|
||||
"max 100.00000 100.000000 100.000000\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(df.describe())\n",
|
||||
"print(df2.describe())\n",
|
||||
"print(df3.describe())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"При просмотре вывода не было замечено аномалий в столбцах датасетов."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Проблема пропущенных данных"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Первый датасет coffee.csv\n",
|
||||
"Второй датасет Stores.csv\n",
|
||||
"Третий датасет StudentsPerformance.csv\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(\"Первый датасет coffee.csv\")\n",
|
||||
"for i in df.columns:\n",
|
||||
" null_rate = df[i].isnull().sum() / len(df)*100\n",
|
||||
" if null_rate > 0:\n",
|
||||
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")\n",
|
||||
"print(\"Второй датасет Stores.csv\")\n",
|
||||
"for i in df2.columns:\n",
|
||||
" null_rate = df2[i].isnull().sum() / len(df2)*100\n",
|
||||
" if null_rate > 0:\n",
|
||||
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")\n",
|
||||
"print(\"Третий датасет StudentsPerformance.csv\")\n",
|
||||
"for i in df3.columns:\n",
|
||||
" null_rate = df3[i].isnull().sum() / len(df3)*100\n",
|
||||
" if null_rate > 0:\n",
|
||||
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Во всех трех датасетах пустых значений не найдено, по наполненности датасеты удовлетворительны. Но можно добавить побольше столбцов в датасет магазинов."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"В первом датасете coffee.csv удалим строки с датой раньше 2000 года."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Date Open High Low Close Adj Close \\\n",
|
||||
"1899 2000-01-03 2.984375 3.085938 2.906250 3.082031 2.391797 \n",
|
||||
"1900 2000-01-04 3.007813 3.109375 2.968750 2.984375 2.316012 \n",
|
||||
"1901 2000-01-05 2.992188 3.078125 2.960938 3.023438 2.346326 \n",
|
||||
"1902 2000-01-06 3.000000 3.203125 3.000000 3.132813 2.431207 \n",
|
||||
"1903 2000-01-07 3.093750 3.125000 3.031250 3.117188 2.419082 \n",
|
||||
"... ... ... ... ... ... ... \n",
|
||||
"8031 2024-05-17 75.269997 78.000000 74.919998 77.849998 77.849998 \n",
|
||||
"8032 2024-05-20 77.680000 78.320000 76.709999 77.540001 77.540001 \n",
|
||||
"8033 2024-05-21 77.559998 78.220001 77.500000 77.720001 77.720001 \n",
|
||||
"8034 2024-05-22 77.699997 81.019997 77.440002 80.720001 80.720001 \n",
|
||||
"8035 2024-05-23 80.099998 80.699997 79.169998 79.260002 79.260002 \n",
|
||||
"\n",
|
||||
" Volume \n",
|
||||
"1899 24232000 \n",
|
||||
"1900 21564800 \n",
|
||||
"1901 28206400 \n",
|
||||
"1902 30825600 \n",
|
||||
"1903 26044800 \n",
|
||||
"... ... \n",
|
||||
"8031 14436500 \n",
|
||||
"8032 11183800 \n",
|
||||
"8033 8916600 \n",
|
||||
"8034 22063400 \n",
|
||||
"8035 4651418 \n",
|
||||
"\n",
|
||||
"[6137 rows x 7 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df_filtered = df[df['Date'] > '2000-01-01']\n",
|
||||
"print(df_filtered)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Во втором датасете Stores.csv всем магазинам поставим среднее значение площади."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Store ID Store_Area Items_Available Daily_Customer_Count Store_Sales\n",
|
||||
"0 1 1500.0 1961 530 66490\n",
|
||||
"1 2 1500.0 1752 210 39820\n",
|
||||
"2 3 1500.0 1609 720 54010\n",
|
||||
"3 4 1500.0 1748 620 53730\n",
|
||||
"4 5 1500.0 2111 450 46620\n",
|
||||
".. ... ... ... ... ...\n",
|
||||
"891 892 1500.0 1910 1080 66390\n",
|
||||
"892 893 1500.0 1663 850 82080\n",
|
||||
"893 894 1500.0 1436 1060 76440\n",
|
||||
"894 895 1500.0 1560 770 96610\n",
|
||||
"895 896 1500.0 1429 1110 54340\n",
|
||||
"\n",
|
||||
"[896 rows x 5 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"store_area_mean = df2['Store_Area'].mean()\n",
|
||||
"df2['Store_Area'] = store_area_mean\n",
|
||||
"print(df2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"В третьем датасете StudentsPerformance.csv всем студентам сделаем ланч стандартным."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" gender race/ethnicity parental level of education lunch \\\n",
|
||||
"0 female group B bachelor's degree standard \n",
|
||||
"1 female group C some college standard \n",
|
||||
"2 female group B master's degree standard \n",
|
||||
"3 male group A associate's degree standard \n",
|
||||
"4 male group C some college standard \n",
|
||||
".. ... ... ... ... \n",
|
||||
"995 female group E master's degree standard \n",
|
||||
"996 male group C high school standard \n",
|
||||
"997 female group C high school standard \n",
|
||||
"998 female group D some college standard \n",
|
||||
"999 female group D some college standard \n",
|
||||
"\n",
|
||||
" test preparation course math score reading score writing score \n",
|
||||
"0 none 72 72 74 \n",
|
||||
"1 completed 69 90 88 \n",
|
||||
"2 none 90 95 93 \n",
|
||||
"3 none 47 57 44 \n",
|
||||
"4 none 76 78 75 \n",
|
||||
".. ... ... ... ... \n",
|
||||
"995 completed 88 99 95 \n",
|
||||
"996 none 62 55 55 \n",
|
||||
"997 completed 59 71 65 \n",
|
||||
"998 completed 68 78 77 \n",
|
||||
"999 none 77 86 86 \n",
|
||||
"\n",
|
||||
"[1000 rows x 8 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df3['lunch'] = 'standard'\n",
|
||||
"print(df3)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"def split_stratified_into_train_val_test(\n",
|
||||
" df_input,\n",
|
||||
" stratify_colname=\"y\",\n",
|
||||
" frac_train=0.6,\n",
|
||||
" frac_val=0.15,\n",
|
||||
" frac_test=0.25,\n",
|
||||
" random_state=None,\n",
|
||||
"):\n",
|
||||
" if frac_train + frac_val + frac_test != 1.0:\n",
|
||||
" raise ValueError(\n",
|
||||
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
|
||||
" % (frac_train, frac_val, frac_test)\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" if stratify_colname not in df_input.columns:\n",
|
||||
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
|
||||
"\n",
|
||||
" X = df_input # Contains all columns.\n",
|
||||
" y = df_input[\n",
|
||||
" [stratify_colname]\n",
|
||||
" ] # Dataframe of just the column on which to stratify.\n",
|
||||
"\n",
|
||||
" # Split original dataframe into train and temp dataframes.\n",
|
||||
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
|
||||
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Split the temp dataframe into val and test dataframes.\n",
|
||||
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
|
||||
" df_val, df_test, y_val, y_test = train_test_split(\n",
|
||||
" df_temp,\n",
|
||||
" y_temp,\n",
|
||||
" stratify=y_temp,\n",
|
||||
" test_size=relative_frac_test,\n",
|
||||
" random_state=random_state,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
|
||||
"\n",
|
||||
" return df_train, df_val, df_test"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Train df: (4821, 7), Validation df: (1607, 7), Test df: (1608, 7)\n",
|
||||
"Train df2: (537, 5), Validation df2: (179, 5), Test df2: (180, 5)\n",
|
||||
"Train df3: (600, 8), Validation df3: (200, 8), Test df3: (200, 8)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)\n",
|
||||
"val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42) \n",
|
||||
"\n",
|
||||
"train_df2, temp_df2 = train_test_split(df2, test_size=0.4, random_state=42)\n",
|
||||
"val_df2, test_df2 = train_test_split(temp_df2, test_size=0.5, random_state=42)\n",
|
||||
"\n",
|
||||
"train_df3, temp_df3 = train_test_split(df3, test_size=0.4, random_state=42)\n",
|
||||
"val_df3, test_df3 = train_test_split(temp_df3, test_size=0.5, random_state=42)\n",
|
||||
"print(f\"Train df: {train_df.shape}, Validation df: {val_df.shape}, Test df: {test_df.shape}\")\n",
|
||||
"print(f\"Train df2: {train_df2.shape}, Validation df2: {val_df2.shape}, Test df2: {test_df2.shape}\")\n",
|
||||
"print(f\"Train df3: {train_df3.shape}, Validation df3: {val_df3.shape}, Test df3: {test_df3.shape}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Было сделано разбиение на три выборки: 60%, 20% и 20% при помощи библиотеки scikit-learn и функции train_test_split. На первый взгляд выборки сбалансированы."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Приращение данных методами выборки с избытком (oversampling) и выборки с недостатком (undersampling)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Выборка после oversampling (df):\n",
|
||||
"High_category\n",
|
||||
"Low 4835\n",
|
||||
"Medium 4835\n",
|
||||
"High 4835\n",
|
||||
"Luxury 4835\n",
|
||||
"Name: count, dtype: int64\n",
|
||||
"Выборка после undersampling (df):\n",
|
||||
"High_category\n",
|
||||
"Low 321\n",
|
||||
"Medium 321\n",
|
||||
"High 321\n",
|
||||
"Luxury 321\n",
|
||||
"Name: count, dtype: int64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from imblearn.over_sampling import RandomOverSampler\n",
|
||||
"from imblearn.under_sampling import RandomUnderSampler\n",
|
||||
"\n",
|
||||
"df = pd.read_csv(\"datasets_lab2/coffee.csv\")\n",
|
||||
"df['High_category'] = pd.cut(df['High'], bins=[0.1, 20, 60, 105, float('inf')],\n",
|
||||
" labels=['Low', 'Medium', 'High', 'Luxury'])\n",
|
||||
"\n",
|
||||
"y = df['High_category']\n",
|
||||
"X = df.drop(columns=['High', 'High_category'])\n",
|
||||
"\n",
|
||||
"oversampler = RandomOverSampler(random_state=42)\n",
|
||||
"X_resampled, y_resampled = oversampler.fit_resample(X, y)\n",
|
||||
"\n",
|
||||
"undersampler = RandomUnderSampler(random_state=42)\n",
|
||||
"X_resampled_under, y_resampled_under = undersampler.fit_resample(X, y)\n",
|
||||
"\n",
|
||||
"print(\"Выборка после oversampling (df):\")\n",
|
||||
"print(pd.Series(y_resampled).value_counts())\n",
|
||||
"\n",
|
||||
"print(\"Выборка после undersampling (df):\")\n",
|
||||
"print(pd.Series(y_resampled_under).value_counts())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Выборка после oversampling (df2):\n",
|
||||
"Sales_category\n",
|
||||
"Low 598\n",
|
||||
"Medium 598\n",
|
||||
"High 598\n",
|
||||
"Luxury 0\n",
|
||||
"Name: count, dtype: int64\n",
|
||||
"Выборка после undersampling (df2):\n",
|
||||
"Sales_category\n",
|
||||
"Low 7\n",
|
||||
"Medium 7\n",
|
||||
"High 7\n",
|
||||
"Luxury 0\n",
|
||||
"Name: count, dtype: int64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df2 = pd.read_csv(\"datasets_lab2/Stores.csv\")\n",
|
||||
"\n",
|
||||
"df2['Sales_category'] = pd.cut(df2['Store_Sales'], bins=[0, 50000, 100000, 200000, float('inf')],\n",
|
||||
" labels=['Low', 'Medium', 'High', 'Luxury'])\n",
|
||||
"\n",
|
||||
"y2 = df2['Sales_category']\n",
|
||||
"X2 = df2.drop(columns=['Store_Sales', 'Sales_category'])\n",
|
||||
"\n",
|
||||
"oversampler2 = RandomOverSampler(random_state=42)\n",
|
||||
"X_resampled_2, y_resampled_2 = oversampler2.fit_resample(X2, y2)\n",
|
||||
"\n",
|
||||
"undersampler2 = RandomUnderSampler(random_state=42)\n",
|
||||
"X_resampled_2_under, y_resampled_2_under = undersampler2.fit_resample(X2, y2)\n",
|
||||
"\n",
|
||||
"print(\"Выборка после oversampling (df2):\")\n",
|
||||
"print(pd.Series(y_resampled_2).value_counts())\n",
|
||||
"\n",
|
||||
"print(\"Выборка после undersampling (df2):\")\n",
|
||||
"print(pd.Series(y_resampled_2_under).value_counts())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Выборка после oversampling (df3):\n",
|
||||
"reading_score_category\n",
|
||||
"Low 903\n",
|
||||
"Medium 903\n",
|
||||
"High 903\n",
|
||||
"Luxury 0\n",
|
||||
"Name: count, dtype: int64\n",
|
||||
"Выборка после undersampling (df3):\n",
|
||||
"reading_score_category\n",
|
||||
"Low 1\n",
|
||||
"Medium 1\n",
|
||||
"High 1\n",
|
||||
"Luxury 0\n",
|
||||
"Name: count, dtype: int64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df3 = pd.read_csv(\"datasets_lab2/StudentsPerformance.csv\")\n",
|
||||
"\n",
|
||||
"df3['reading_score_category'] = pd.cut(df3['reading score'], bins=[0, 20, 50, 100, float('inf')],\n",
|
||||
" labels=['Low', 'Medium', 'High', 'Luxury'])\n",
|
||||
"\n",
|
||||
"y3 = df3['reading_score_category']\n",
|
||||
"X3 = df3.drop(columns=['reading score', 'reading_score_category'])\n",
|
||||
"\n",
|
||||
"oversampler3 = RandomOverSampler(random_state=42)\n",
|
||||
"X_resampled_3, y_resampled_3 = oversampler3.fit_resample(X3, y3)\n",
|
||||
"\n",
|
||||
"undersampler3 = RandomUnderSampler(random_state=42)\n",
|
||||
"X_resampled_3_under, y_resampled_3_under = undersampler3.fit_resample(X3, y3)\n",
|
||||
"\n",
|
||||
"print(\"Выборка после oversampling (df3):\")\n",
|
||||
"print(pd.Series(y_resampled_3).value_counts())\n",
|
||||
"\n",
|
||||
"print(\"Выборка после undersampling (df3):\")\n",
|
||||
"print(pd.Series(y_resampled_3_under).value_counts())"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
BIN
lab_2/requirements.txt
Normal file
BIN
lab_2/requirements.txt
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user