{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Lab2 Pibd-31 Malafeev**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Loading the three other datasets (not my assigned variant)*"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"df = pd.read_csv(\".//datasetlab2//kc_house_data.csv\", sep=\",\")\n",
"df2 = pd.read_csv(\".//datasetlab2//Stores.csv\", sep=\",\")\n",
"df3 = pd.read_csv(\".//datasetlab2//Forbes Billionaires.csv\", sep=\",\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Items 2-8 of the lab assignment are completed below in Markdown, with each item labelled."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2. The problem domains are: Stores.csv - retail stores, kc_house_data.csv - house sales, and Forbes Billionaires.csv - billionaires."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"3. The observation objects are stores, houses, and billionaires. There is no relationship between the objects; the only similarity is that a store and a house are both buildings."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"4. The billionaires dataset is needed, for example, by the Forbes website to put together a tier list. The other datasets are also suitable for building tier lists."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"5. The technical project is a tier list, with a dataset as the input."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"6. This item is done in code, with a comment on each step:"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id 0\n",
"date 0\n",
"price 0\n",
"bedrooms 0\n",
"bathrooms 0\n",
"sqft_living 0\n",
"sqft_lot 0\n",
"floors 0\n",
"waterfront 0\n",
"view 0\n",
"condition 0\n",
"grade 0\n",
"sqft_above 0\n",
"sqft_basement 0\n",
"yr_built 0\n",
"yr_renovated 0\n",
"zipcode 0\n",
"lat 0\n",
"long 0\n",
"sqft_living15 0\n",
"sqft_lot15 0\n",
"dtype: int64\n",
"Store ID 0\n",
"Store_Area 0\n",
"Items_Available 0\n",
"Daily_Customer_Count 0\n",
"Store_Sales 0\n",
"dtype: int64\n",
"Rank 0\n",
"Name 0\n",
"Networth 0\n",
"Age 0\n",
"Country 0\n",
"Source 0\n",
"Industry 0\n",
"dtype: int64\n"
]
}
],
"source": [
"# Check for missing values\n",
"print(df.isnull().sum())\n",
"print(df2.isnull().sum())\n",
"print(df3.isnull().sum())"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" id price bedrooms bathrooms sqft_living \\\n",
"count 2.161300e+04 2.161300e+04 21613.000000 21613.000000 21613.000000 \n",
"mean 4.580302e+09 5.400881e+05 3.370842 2.114757 2079.899736 \n",
"std 2.876566e+09 3.671272e+05 0.930062 0.770163 918.440897 \n",
"min 1.000102e+06 7.500000e+04 0.000000 0.000000 290.000000 \n",
"25% 2.123049e+09 3.219500e+05 3.000000 1.750000 1427.000000 \n",
"50% 3.904930e+09 4.500000e+05 3.000000 2.250000 1910.000000 \n",
"75% 7.308900e+09 6.450000e+05 4.000000 2.500000 2550.000000 \n",
"max 9.900000e+09 7.700000e+06 33.000000 8.000000 13540.000000 \n",
"\n",
" sqft_lot floors waterfront view condition \\\n",
"count 2.161300e+04 21613.000000 21613.000000 21613.000000 21613.000000 \n",
"mean 1.510697e+04 1.494309 0.007542 0.234303 3.409430 \n",
"std 4.142051e+04 0.539989 0.086517 0.766318 0.650743 \n",
"min 5.200000e+02 1.000000 0.000000 0.000000 1.000000 \n",
"25% 5.040000e+03 1.000000 0.000000 0.000000 3.000000 \n",
"50% 7.618000e+03 1.500000 0.000000 0.000000 3.000000 \n",
"75% 1.068800e+04 2.000000 0.000000 0.000000 4.000000 \n",
"max 1.651359e+06 3.500000 1.000000 4.000000 5.000000 \n",
"\n",
" grade sqft_above sqft_basement yr_built yr_renovated \\\n",
"count 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 \n",
"mean 7.656873 1788.390691 291.509045 1971.005136 84.402258 \n",
"std 1.175459 828.090978 442.575043 29.373411 401.679240 \n",
"min 1.000000 290.000000 0.000000 1900.000000 0.000000 \n",
"25% 7.000000 1190.000000 0.000000 1951.000000 0.000000 \n",
"50% 7.000000 1560.000000 0.000000 1975.000000 0.000000 \n",
"75% 8.000000 2210.000000 560.000000 1997.000000 0.000000 \n",
"max 13.000000 9410.000000 4820.000000 2015.000000 2015.000000 \n",
"\n",
" zipcode lat long sqft_living15 sqft_lot15 \n",
"count 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 \n",
"mean 98077.939805 47.560053 -122.213896 1986.552492 12768.455652 \n",
"std 53.505026 0.138564 0.140828 685.391304 27304.179631 \n",
"min 98001.000000 47.155900 -122.519000 399.000000 651.000000 \n",
"25% 98033.000000 47.471000 -122.328000 1490.000000 5100.000000 \n",
"50% 98065.000000 47.571800 -122.230000 1840.000000 7620.000000 \n",
"75% 98118.000000 47.678000 -122.125000 2360.000000 10083.000000 \n",
"max 98199.000000 47.777600 -121.315000 6210.000000 871200.000000 \n",
" Store ID Store_Area Items_Available Daily_Customer_Count \\\n",
"count 896.000000 896.000000 896.000000 896.000000 \n",
"mean 448.500000 1485.409598 1782.035714 786.350446 \n",
"std 258.797218 250.237011 299.872053 265.389281 \n",
"min 1.000000 775.000000 932.000000 10.000000 \n",
"25% 224.750000 1316.750000 1575.500000 600.000000 \n",
"50% 448.500000 1477.000000 1773.500000 780.000000 \n",
"75% 672.250000 1653.500000 1982.750000 970.000000 \n",
"max 896.000000 2229.000000 2667.000000 1560.000000 \n",
"\n",
" Store_Sales \n",
"count 896.000000 \n",
"mean 59351.305804 \n",
"std 17190.741895 \n",
"min 14920.000000 \n",
"25% 46530.000000 \n",
"50% 58605.000000 \n",
"75% 71872.500000 \n",
"max 116320.000000 \n",
" Rank Networth Age\n",
"count 2600.000000 2600.000000 2600.000000\n",
"mean 1269.570769 4.860750 64.271923\n",
"std 728.146364 10.659671 13.220607\n",
"min 1.000000 1.000000 19.000000\n",
"25% 637.000000 1.500000 55.000000\n",
"50% 1292.000000 2.400000 64.000000\n",
"75% 1929.000000 4.500000 74.000000\n",
"max 2578.000000 219.000000 100.000000\n"
]
}
],
"source": [
"# Descriptive statistics for the numeric columns, to spot anomalous distributions\n",
"print(df.describe())\n",
"print(df2.describe())\n",
"print(df3.describe())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"7. The first three lines of code, i.e. the missing-value check, show that there are no missing values. Further on I found an anomaly: the billionaires dataset has a Networth column - total wealth in all forms (in billions) - whose minimum value is 1, the median is around 2.4, and the maximum is 219. This column contains extreme outliers; a quick check is sketched in the next cell."
]
},
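{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of how that anomaly could be confirmed programmatically. This is an addition to the original lab and assumes the common 1.5*IQR rule for flagging outliers; the threshold is an assumption, not part of the assignment."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch (assumption): flag Networth outliers with the 1.5*IQR rule\n",
"q1 = df3['Networth'].quantile(0.25)\n",
"q3 = df3['Networth'].quantile(0.75)\n",
"iqr = q3 - q1\n",
"upper_bound = q3 + 1.5 * iqr\n",
"outliers = df3[df3['Networth'] > upper_bound]\n",
"print(f\"Upper bound: {upper_bound:.2f} bn, rows above it: {len(outliers)}\")"
]
},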
{
"cell_type": "markdown",
"metadata": {},
"source": [
"8. Our datasets are fairly informative. For example, the billionaires dataset shows each person's name, net worth, and the source of the wealth. The house sales dataset is far more detailed, the most complete of the three, and matches reality. The stores dataset, however, is rather thin; it could be extended, for example, with the number of branches, the number of employees, and the profit (see the sketch after this cell)."
]
},
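{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of how the Stores dataset could be extended with the columns suggested above. The column names and the empty placeholder values are assumptions for illustration only, since the real data is not available."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch (assumption): add the suggested columns as empty placeholders\n",
"df2_extended = df2.copy()\n",
"for col in ['Branches', 'Employees', 'Profit']:\n",
"    df2_extended[col] = pd.NA  # no real values available, so keep them missing\n",
"print(df2_extended.columns.tolist())"
]
},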
{
"cell_type": "markdown",
"metadata": {},
"source": [
"9. Take the stores dataset and drop the rows where the store area is below 1500 (required by the assignment spec)."
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Store ID Store_Area Items_Available Daily_Customer_Count Store_Sales\n",
"0 1 1659 1961 530 66490\n",
"4 5 1770 2111 450 46620\n",
"6 7 1542 1858 1030 72240\n",
"11 12 1751 2098 720 57620\n",
"12 13 1746 2064 1050 60470\n",
".. ... ... ... ... ...\n",
"882 883 1819 2187 590 47920\n",
"886 887 1655 1986 1150 77430\n",
"889 890 1539 1829 650 46580\n",
"890 891 1549 1851 1220 70620\n",
"891 892 1582 1910 1080 66390\n",
"\n",
"[415 rows x 5 columns]\n"
]
}
],
"source": [
"df2_filtered = df2[df2['Store_Area'] >= 1500]\n",
"print(df2_filtered)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, in the house sales dataset, replace price for every row with the constant value 1,500,000."
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" id date price bedrooms bathrooms sqft_living \\\n",
"0 7129300520 20141013T000000 1500000 3 1.00 1180 \n",
"1 6414100192 20141209T000000 1500000 3 2.25 2570 \n",
"2 5631500400 20150225T000000 1500000 2 1.00 770 \n",
"3 2487200875 20141209T000000 1500000 4 3.00 1960 \n",
"4 1954400510 20150218T000000 1500000 3 2.00 1680 \n",
"... ... ... ... ... ... ... \n",
"21608 263000018 20140521T000000 1500000 3 2.50 1530 \n",
"21609 6600060120 20150223T000000 1500000 4 2.50 2310 \n",
"21610 1523300141 20140623T000000 1500000 2 0.75 1020 \n",
"21611 291310100 20150116T000000 1500000 3 2.50 1600 \n",
"21612 1523300157 20141015T000000 1500000 2 0.75 1020 \n",
"\n",
" sqft_lot floors waterfront view ... grade sqft_above \\\n",
"0 5650 1.0 0 0 ... 7 1180 \n",
"1 7242 2.0 0 0 ... 7 2170 \n",
"2 10000 1.0 0 0 ... 6 770 \n",
"3 5000 1.0 0 0 ... 7 1050 \n",
"4 8080 1.0 0 0 ... 8 1680 \n",
"... ... ... ... ... ... ... ... \n",
"21608 1131 3.0 0 0 ... 8 1530 \n",
"21609 5813 2.0 0 0 ... 8 2310 \n",
"21610 1350 2.0 0 0 ... 7 1020 \n",
"21611 2388 2.0 0 0 ... 8 1600 \n",
"21612 1076 2.0 0 0 ... 7 1020 \n",
"\n",
" sqft_basement yr_built yr_renovated zipcode lat long \\\n",
"0 0 1955 0 98178 47.5112 -122.257 \n",
"1 400 1951 1991 98125 47.7210 -122.319 \n",
"2 0 1933 0 98028 47.7379 -122.233 \n",
"3 910 1965 0 98136 47.5208 -122.393 \n",
"4 0 1987 0 98074 47.6168 -122.045 \n",
"... ... ... ... ... ... ... \n",
"21608 0 2009 0 98103 47.6993 -122.346 \n",
"21609 0 2014 0 98146 47.5107 -122.362 \n",
"21610 0 2009 0 98144 47.5944 -122.299 \n",
"21611 0 2004 0 98027 47.5345 -122.069 \n",
"21612 0 2008 0 98144 47.5941 -122.299 \n",
"\n",
" sqft_living15 sqft_lot15 \n",
"0 1340 5650 \n",
"1 1690 7639 \n",
"2 2720 8062 \n",
"3 1360 5000 \n",
"4 1800 7503 \n",
"... ... ... \n",
"21608 1530 1509 \n",
"21609 1830 7200 \n",
"21610 1020 2007 \n",
"21611 1410 1287 \n",
"21612 1020 1357 \n",
"\n",
"[21613 rows x 21 columns]\n"
]
}
],
"source": [
"df['price'] = 1500000\n",
"print(df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, in the billionaires dataset, substitute the column mean into Networth:"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Networth\n",
"0 4.86075\n",
"1 4.86075\n",
"2 4.86075\n",
"3 4.86075\n",
"4 4.86075\n",
"... ...\n",
"2595 4.86075\n",
"2596 4.86075\n",
"2597 4.86075\n",
"2598 4.86075\n",
"2599 4.86075\n",
"\n",
"[2600 rows x 1 columns]\n"
]
}
],
"source": [
"networth_mean = df3['Networth'].mean()\n",
"df3['Networth'] = networth_mean\n",
"print(df3[['Networth']])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"10. Code:"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train df: (15129, 21), Validation df: (3242, 21), Test df: (3242, 21)\n",
"Train df2: (627, 5), Validation df2: (134, 5), Test df2: (135, 5)\n",
"Train df3: (1820, 7), Validation df3: (390, 7), Test df3: (390, 7)\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)\n",
"val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n",
"\n",
"train_df2, temp_df2 = train_test_split(df2, test_size=0.3, random_state=42)\n",
"val_df2, test_df2 = train_test_split(temp_df2, test_size=0.5, random_state=42)\n",
"\n",
"train_df3, temp_df3 = train_test_split(df3, test_size=0.3, random_state=42)\n",
"val_df3, test_df3 = train_test_split(temp_df3, test_size=0.5, random_state=42)\n",
"print(f\"Train df: {train_df.shape}, Validation df: {val_df.shape}, Test df: {test_df.shape}\")\n",
"print(f\"Train df2: {train_df2.shape}, Validation df2: {val_df2.shape}, Test df2: {test_df2.shape}\")\n",
"print(f\"Train df3: {train_df3.shape}, Validation df3: {val_df3.shape}, Test df3: {test_df3.shape}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"11. The data was split into three sets: 70%, 15%, and 15%. The scikit-learn library and its train_test_split function were used, as required in item 15. The splits look quite balanced; a quick check of the actual proportions is sketched in the next cell."
]
},
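{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch (an addition, not part of the original lab) that prints each split's share of the houses dataframe, to back up the 70/15/15 claim:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: report each split size as a share of the full houses dataframe\n",
"for name, part in [('train', train_df), ('val', val_df), ('test', test_df)]:\n",
"    print(f\"{name}: {len(part)} rows, {len(part) / len(df):.1%}\")"
]
},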
{
"cell_type": "markdown",
"metadata": {},
"source": [
"12. Install the imbalanced-learn library (pip install imbalanced-learn), import the needed functions, and off we go:"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Class distribution after oversampling (df):\n",
"price_category\n",
"Low 10787\n",
"Medium 10787\n",
"High 10787\n",
"Luxury 10787\n",
"Name: count, dtype: int64\n",
"Class distribution after undersampling (df):\n",
"price_category\n",
"Low 1465\n",
"Medium 1465\n",
"High 1465\n",
"Luxury 1465\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"df = pd.read_csv(\".//datasetlab2//kc_house_data.csv\", sep=\",\")\n",
"df['price_category'] = pd.cut(df['price'], bins=[0, 300000, 600000, 1000000, float('inf')],\n",
" labels=['Low', 'Medium', 'High', 'Luxury'])\n",
"\n",
"y = df['price_category']\n",
"X = df.drop(columns=['price', 'price_category'])\n",
"\n",
"oversampler = RandomOverSampler(random_state=42)\n",
"X_resampled, y_resampled = oversampler.fit_resample(X, y)\n",
"\n",
"undersampler = RandomUnderSampler(random_state=42)\n",
"X_resampled_under, y_resampled_under = undersampler.fit_resample(X, y)\n",
"\n",
"print(\"Class distribution after oversampling (df):\")\n",
"print(pd.Series(y_resampled).value_counts())\n",
"\n",
"print(\"Class distribution after undersampling (df):\")\n",
"print(pd.Series(y_resampled_under).value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Class distribution after oversampling (df3):\n",
"AGE_category\n",
"Young 1401\n",
"Middle-aged 1401\n",
"Senior 1401\n",
"Elderly 1401\n",
"Name: count, dtype: int64\n",
"Class distribution after undersampling (df3):\n",
"AGE_category\n",
"Young 15\n",
"Middle-aged 15\n",
"Senior 15\n",
"Elderly 15\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"df3 = pd.read_csv(\".//datasetlab2//Forbes Billionaires.csv\", sep=\",\")\n",
"\n",
"df3['AGE_category'] = pd.cut(df3['Age'], bins=[0, 30, 50, 70, float('inf')],\n",
" labels=['Young', 'Middle-aged', 'Senior', 'Elderly'])\n",
"\n",
"y3 = df3['AGE_category']\n",
"X3 = df3.drop(columns=['Age', 'AGE_category'])\n",
"\n",
"oversampler3 = RandomOverSampler(random_state=42)\n",
"X_resampled_3, y_resampled_3 = oversampler3.fit_resample(X3, y3)\n",
"\n",
"undersampler3 = RandomUnderSampler(random_state=42)\n",
"X_resampled_3_under, y_resampled_3_under = undersampler3.fit_resample(X3, y3)\n",
"\n",
"print(\"Class distribution after oversampling (df3):\")\n",
"print(pd.Series(y_resampled_3).value_counts())\n",
"\n",
"print(\"Class distribution after undersampling (df3):\")\n",
"print(pd.Series(y_resampled_3_under).value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Class distribution after oversampling (df2):\n",
"Sales_category\n",
"Low 598\n",
"Medium 598\n",
"High 598\n",
"Luxury 0\n",
"Name: count, dtype: int64\n",
"Class distribution after undersampling (df2):\n",
"Sales_category\n",
"Low 7\n",
"Medium 7\n",
"High 7\n",
"Luxury 0\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"df2 = pd.read_csv(\".//datasetlab2//Stores.csv\", sep=\",\")\n",
"\n",
"df2['Sales_category'] = pd.cut(df2['Store_Sales'], bins=[0, 50000, 100000, 200000, float('inf')],\n",
" labels=['Low', 'Medium', 'High', 'Luxury'])\n",
"\n",
"y2 = df2['Sales_category']\n",
"X2 = df2.drop(columns=['Store_Sales', 'Sales_category'])\n",
"\n",
"oversampler2 = RandomOverSampler(random_state=42)\n",
"X_resampled_2, y_resampled_2 = oversampler2.fit_resample(X2, y2)\n",
"\n",
"undersampler2 = RandomUnderSampler(random_state=42)\n",
"X_resampled_2_under, y_resampled_2_under = undersampler2.fit_resample(X2, y2)\n",
"\n",
"print(\"Class distribution after oversampling (df2):\")\n",
"print(pd.Series(y_resampled_2).value_counts())\n",
"\n",
"print(\"Class distribution after undersampling (df2):\")\n",
"print(pd.Series(y_resampled_2_under).value_counts())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "miivenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}