{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Lab2 Pibd-31 Malafeev**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Загрузка трёх других датасетов (не своего варианта)*"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"df = pd.read_csv(\".//datasetlab2//kc_house_data.csv\", sep=\",\")\n",
"df2 = pd.read_csv(\".//datasetlab2//Stores.csv\", sep=\",\")\n",
"df3 = pd.read_csv(\".//datasetlab2//Forbes Billionaires.csv\", sep=\",\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Далее будут выполнены в Markdown пункты лабораторной 2-8 с пометкой каждого пункта."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2.Проблемными областями являются: датасет stores.csv - магазины, kc_house_data.csv - датасет продажи домов и Forbes Billionaires.csv - датасет миллионеров."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"3.Объектами наблюдения являются магазины, дома и миллионеры. Связи между объектами нет, единственная схожесть - магазин и дом являются зданиями."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"4.Датасет миллионеров нужен, например, для сайта Forbes - чтобы составить тир-лист. В целом другие датасеты тоже подходят для составления тир-листа)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"5.Технический проект - тир лист, на входе датасет"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"6.Пункт будем выполнять в коде, оставлю к каждому комменты:"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id 0\n",
"date 0\n",
"price 0\n",
"bedrooms 0\n",
"bathrooms 0\n",
"sqft_living 0\n",
"sqft_lot 0\n",
"floors 0\n",
"waterfront 0\n",
"view 0\n",
"condition 0\n",
"grade 0\n",
"sqft_above 0\n",
"sqft_basement 0\n",
"yr_built 0\n",
"yr_renovated 0\n",
"zipcode 0\n",
"lat 0\n",
"long 0\n",
"sqft_living15 0\n",
"sqft_lot15 0\n",
"dtype: int64\n",
"Store ID 0\n",
"Store_Area 0\n",
"Items_Available 0\n",
"Daily_Customer_Count 0\n",
"Store_Sales 0\n",
"dtype: int64\n",
"Rank 0\n",
"Name 0\n",
"Networth 0\n",
"Age 0\n",
"Country 0\n",
"Source 0\n",
"Industry 0\n",
"dtype: int64\n"
]
}
],
"source": [
"# Проверка на пропущенные значения\n",
"print(df.isnull().sum())\n",
"print(df2.isnull().sum())\n",
"print(df3.isnull().sum())"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" id price bedrooms bathrooms sqft_living \\\n",
"count 2.161300e+04 2.161300e+04 21613.000000 21613.000000 21613.000000 \n",
"mean 4.580302e+09 5.400881e+05 3.370842 2.114757 2079.899736 \n",
"std 2.876566e+09 3.671272e+05 0.930062 0.770163 918.440897 \n",
"min 1.000102e+06 7.500000e+04 0.000000 0.000000 290.000000 \n",
"25% 2.123049e+09 3.219500e+05 3.000000 1.750000 1427.000000 \n",
"50% 3.904930e+09 4.500000e+05 3.000000 2.250000 1910.000000 \n",
"75% 7.308900e+09 6.450000e+05 4.000000 2.500000 2550.000000 \n",
"max 9.900000e+09 7.700000e+06 33.000000 8.000000 13540.000000 \n",
"\n",
" sqft_lot floors waterfront view condition \\\n",
"count 2.161300e+04 21613.000000 21613.000000 21613.000000 21613.000000 \n",
"mean 1.510697e+04 1.494309 0.007542 0.234303 3.409430 \n",
"std 4.142051e+04 0.539989 0.086517 0.766318 0.650743 \n",
"min 5.200000e+02 1.000000 0.000000 0.000000 1.000000 \n",
"25% 5.040000e+03 1.000000 0.000000 0.000000 3.000000 \n",
"50% 7.618000e+03 1.500000 0.000000 0.000000 3.000000 \n",
"75% 1.068800e+04 2.000000 0.000000 0.000000 4.000000 \n",
"max 1.651359e+06 3.500000 1.000000 4.000000 5.000000 \n",
"\n",
" grade sqft_above sqft_basement yr_built yr_renovated \\\n",
"count 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 \n",
"mean 7.656873 1788.390691 291.509045 1971.005136 84.402258 \n",
"std 1.175459 828.090978 442.575043 29.373411 401.679240 \n",
"min 1.000000 290.000000 0.000000 1900.000000 0.000000 \n",
"25% 7.000000 1190.000000 0.000000 1951.000000 0.000000 \n",
"50% 7.000000 1560.000000 0.000000 1975.000000 0.000000 \n",
"75% 8.000000 2210.000000 560.000000 1997.000000 0.000000 \n",
"max 13.000000 9410.000000 4820.000000 2015.000000 2015.000000 \n",
"\n",
" zipcode lat long sqft_living15 sqft_lot15 \n",
"count 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 \n",
"mean 98077.939805 47.560053 -122.213896 1986.552492 12768.455652 \n",
"std 53.505026 0.138564 0.140828 685.391304 27304.179631 \n",
"min 98001.000000 47.155900 -122.519000 399.000000 651.000000 \n",
"25% 98033.000000 47.471000 -122.328000 1490.000000 5100.000000 \n",
"50% 98065.000000 47.571800 -122.230000 1840.000000 7620.000000 \n",
"75% 98118.000000 47.678000 -122.125000 2360.000000 10083.000000 \n",
"max 98199.000000 47.777600 -121.315000 6210.000000 871200.000000 \n",
" Store ID Store_Area Items_Available Daily_Customer_Count \\\n",
"count 896.000000 896.000000 896.000000 896.000000 \n",
"mean 448.500000 1485.409598 1782.035714 786.350446 \n",
"std 258.797218 250.237011 299.872053 265.389281 \n",
"min 1.000000 775.000000 932.000000 10.000000 \n",
"25% 224.750000 1316.750000 1575.500000 600.000000 \n",
"50% 448.500000 1477.000000 1773.500000 780.000000 \n",
"75% 672.250000 1653.500000 1982.750000 970.000000 \n",
"max 896.000000 2229.000000 2667.000000 1560.000000 \n",
"\n",
" Store_Sales \n",
"count 896.000000 \n",
"mean 59351.305804 \n",
"std 17190.741895 \n",
"min 14920.000000 \n",
"25% 46530.000000 \n",
"50% 58605.000000 \n",
"75% 71872.500000 \n",
"max 116320.000000 \n",
" Rank Networth Age\n",
"count 2600.000000 2600.000000 2600.000000\n",
"mean 1269.570769 4.860750 64.271923\n",
"std 728.146364 10.659671 13.220607\n",
"min 1.000000 1.000000 19.000000\n",
"25% 637.000000 1.500000 55.000000\n",
"50% 1292.000000 2.400000 64.000000\n",
"75% 1929.000000 4.500000 74.000000\n",
"max 2578.000000 219.000000 100.000000\n"
]
}
],
"source": [
"# Статистика по числовым данным для выявления аномальных распределений\n",
"print(df.describe())\n",
"print(df2.describe())\n",
"print(df3.describe())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"7.По первым трём строкам кода, т.е. после проверки на пропущенные значения выявлено, что их нет. А дальше я обнаружил аномалию: в датасете миллионеров есть столбец Networth - чистое количество денег во всех формах (в миллиардах), в этом столбце минимальное значение является единицей, медиана в районе 2.4, а максимальное - 219. В ЭТОМ СТОЛБЦЕ АНОМАЛИЯ"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"8.Наши датасеты довольно информативные. Например, у датасета миллионеров можно посмотреть ФИО, сколько денег, что он сделал. Датасет по продаже домов гораздо информативнее, является лидером по наполненности и соответствует реальности. А вот датасет магазинов слабоват, можно, например, добавить: количество филиалов, работников, прибыль"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"9.Возьмём датасет магазинов, будем удалять строки, где площадь ниже 1500 (по ТЗ надо)"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Store ID Store_Area Items_Available Daily_Customer_Count Store_Sales\n",
"0 1 1659 1961 530 66490\n",
"4 5 1770 2111 450 46620\n",
"6 7 1542 1858 1030 72240\n",
"11 12 1751 2098 720 57620\n",
"12 13 1746 2064 1050 60470\n",
".. ... ... ... ... ...\n",
"882 883 1819 2187 590 47920\n",
"886 887 1655 1986 1150 77430\n",
"889 890 1539 1829 650 46580\n",
"890 891 1549 1851 1220 70620\n",
"891 892 1582 1910 1080 66390\n",
"\n",
"[415 rows x 5 columns]\n"
]
}
],
"source": [
"df2_filtered = df2[df2['Store_Area'] >= 1500]\n",
"print(df2_filtered)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Теперь в датасете домов price заменим у всех на константное значение - 1 500 000"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" id date price bedrooms bathrooms sqft_living \\\n",
"0 7129300520 20141013T000000 1500000 3 1.00 1180 \n",
"1 6414100192 20141209T000000 1500000 3 2.25 2570 \n",
"2 5631500400 20150225T000000 1500000 2 1.00 770 \n",
"3 2487200875 20141209T000000 1500000 4 3.00 1960 \n",
"4 1954400510 20150218T000000 1500000 3 2.00 1680 \n",
"... ... ... ... ... ... ... \n",
"21608 263000018 20140521T000000 1500000 3 2.50 1530 \n",
"21609 6600060120 20150223T000000 1500000 4 2.50 2310 \n",
"21610 1523300141 20140623T000000 1500000 2 0.75 1020 \n",
"21611 291310100 20150116T000000 1500000 3 2.50 1600 \n",
"21612 1523300157 20141015T000000 1500000 2 0.75 1020 \n",
"\n",
" sqft_lot floors waterfront view ... grade sqft_above \\\n",
"0 5650 1.0 0 0 ... 7 1180 \n",
"1 7242 2.0 0 0 ... 7 2170 \n",
"2 10000 1.0 0 0 ... 6 770 \n",
"3 5000 1.0 0 0 ... 7 1050 \n",
"4 8080 1.0 0 0 ... 8 1680 \n",
"... ... ... ... ... ... ... ... \n",
"21608 1131 3.0 0 0 ... 8 1530 \n",
"21609 5813 2.0 0 0 ... 8 2310 \n",
"21610 1350 2.0 0 0 ... 7 1020 \n",
"21611 2388 2.0 0 0 ... 8 1600 \n",
"21612 1076 2.0 0 0 ... 7 1020 \n",
"\n",
" sqft_basement yr_built yr_renovated zipcode lat long \\\n",
"0 0 1955 0 98178 47.5112 -122.257 \n",
"1 400 1951 1991 98125 47.7210 -122.319 \n",
"2 0 1933 0 98028 47.7379 -122.233 \n",
"3 910 1965 0 98136 47.5208 -122.393 \n",
"4 0 1987 0 98074 47.6168 -122.045 \n",
"... ... ... ... ... ... ... \n",
"21608 0 2009 0 98103 47.6993 -122.346 \n",
"21609 0 2014 0 98146 47.5107 -122.362 \n",
"21610 0 2009 0 98144 47.5944 -122.299 \n",
"21611 0 2004 0 98027 47.5345 -122.069 \n",
"21612 0 2008 0 98144 47.5941 -122.299 \n",
"\n",
" sqft_living15 sqft_lot15 \n",
"0 1340 5650 \n",
"1 1690 7639 \n",
"2 2720 8062 \n",
"3 1360 5000 \n",
"4 1800 7503 \n",
"... ... ... \n",
"21608 1530 1509 \n",
"21609 1830 7200 \n",
"21610 1020 2007 \n",
"21611 1410 1287 \n",
"21612 1020 1357 \n",
"\n",
"[21613 rows x 21 columns]\n"
]
}
],
"source": [
"df['price'] = 1500000\n",
"print(df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Теперь у миллионеров в networth подставим среднее по столбцу:"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Networth\n",
"0 4.86075\n",
"1 4.86075\n",
"2 4.86075\n",
"3 4.86075\n",
"4 4.86075\n",
"... ...\n",
"2595 4.86075\n",
"2596 4.86075\n",
"2597 4.86075\n",
"2598 4.86075\n",
"2599 4.86075\n",
"\n",
"[2600 rows x 1 columns]\n"
]
}
],
"source": [
"networth_mean = df3['Networth'].mean()\n",
"df3['Networth'] = networth_mean\n",
"print(df3[['Networth']])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"10.КОД"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train df: (15129, 21), Validation df: (3242, 21), Test df: (3242, 21)\n",
"Train df2: (627, 5), Validation df2: (134, 5), Test df2: (135, 5)\n",
"Train df3: (1820, 7), Validation df3: (390, 7), Test df3: (390, 7)\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)\n",
"val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42) \n",
"\n",
"train_df2, temp_df2 = train_test_split(df2, test_size=0.3, random_state=42)\n",
"val_df2, test_df2 = train_test_split(temp_df2, test_size=0.5, random_state=42)\n",
"\n",
"train_df3, temp_df3 = train_test_split(df3, test_size=0.3, random_state=42)\n",
"val_df3, test_df3 = train_test_split(temp_df3, test_size=0.5, random_state=42)\n",
"print(f\"Train df: {train_df.shape}, Validation df: {val_df.shape}, Test df: {test_df.shape}\")\n",
"print(f\"Train df2: {train_df2.shape}, Validation df2: {val_df2.shape}, Test df2: {test_df2.shape}\")\n",
"print(f\"Train df3: {train_df3.shape}, Validation df3: {val_df3.shape}, Test df3: {test_df3.shape}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Было сделано разбиение на три выборки: 70%, 15% и 15%. Подключена была библиотека scikit-learn и функция train_test_split, как сказано в пункте 15. Выборки вполне сбалансированные"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"12.Качаем библиотеку imbalanced-learn, достаём нужные функции и погнали"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Class distribution after oversampling (df):\n",
"price_category\n",
"Low 10787\n",
"Medium 10787\n",
"High 10787\n",
"Luxury 10787\n",
"Name: count, dtype: int64\n",
"Class distribution after undersampling (df):\n",
"price_category\n",
"Low 1465\n",
"Medium 1465\n",
"High 1465\n",
"Luxury 1465\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"df = pd.read_csv(\".//datasetlab2//kc_house_data.csv\", sep=\",\")\n",
"df['price_category'] = pd.cut(df['price'], bins=[0, 300000, 600000, 1000000, float('inf')],\n",
" labels=['Low', 'Medium', 'High', 'Luxury'])\n",
"\n",
"y = df['price_category']\n",
"X = df.drop(columns=['price', 'price_category'])\n",
"\n",
"oversampler = RandomOverSampler(random_state=42)\n",
"X_resampled, y_resampled = oversampler.fit_resample(X, y)\n",
"\n",
"undersampler = RandomUnderSampler(random_state=42)\n",
"X_resampled_under, y_resampled_under = undersampler.fit_resample(X, y)\n",
"\n",
"print(\"Class distribution after oversampling (df):\")\n",
"print(pd.Series(y_resampled).value_counts())\n",
"\n",
"print(\"Class distribution after undersampling (df):\")\n",
"print(pd.Series(y_resampled_under).value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Class distribution after oversampling (df3):\n",
"AGE_category\n",
"Young 1401\n",
"Middle-aged 1401\n",
"Senior 1401\n",
"Elderly 1401\n",
"Name: count, dtype: int64\n",
"Class distribution after undersampling (df3):\n",
"AGE_category\n",
"Young 15\n",
"Middle-aged 15\n",
"Senior 15\n",
"Elderly 15\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"df3 = pd.read_csv(\".//datasetlab2//Forbes Billionaires.csv\", sep=\",\")\n",
"\n",
"df3['AGE_category'] = pd.cut(df3['Age'], bins=[0, 30, 50, 70, float('inf')],\n",
" labels=['Young', 'Middle-aged', 'Senior', 'Elderly'])\n",
"\n",
"y3 = df3['AGE_category']\n",
"X3 = df3.drop(columns=['Age', 'AGE_category'])\n",
"\n",
"oversampler3 = RandomOverSampler(random_state=42)\n",
"X_resampled_3, y_resampled_3 = oversampler3.fit_resample(X3, y3)\n",
"\n",
"undersampler3 = RandomUnderSampler(random_state=42)\n",
"X_resampled_3_under, y_resampled_3_under = undersampler3.fit_resample(X3, y3)\n",
"\n",
"print(\"Class distribution after oversampling (df3):\")\n",
"print(pd.Series(y_resampled_3).value_counts())\n",
"\n",
"print(\"Class distribution after undersampling (df3):\")\n",
"print(pd.Series(y_resampled_3_under).value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Class distribution after oversampling (df2):\n",
"Sales_category\n",
"Low 598\n",
"Medium 598\n",
"High 598\n",
"Luxury 0\n",
"Name: count, dtype: int64\n",
"Class distribution after undersampling (df2):\n",
"Sales_category\n",
"Low 7\n",
"Medium 7\n",
"High 7\n",
"Luxury 0\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"df2 = pd.read_csv(\".//datasetlab2//Stores.csv\", sep=\",\")\n",
"\n",
"df2['Sales_category'] = pd.cut(df2['Store_Sales'], bins=[0, 50000, 100000, 200000, float('inf')],\n",
" labels=['Low', 'Medium', 'High', 'Luxury'])\n",
"\n",
"y2 = df2['Sales_category']\n",
"X2 = df2.drop(columns=['Store_Sales', 'Sales_category'])\n",
"\n",
"oversampler2 = RandomOverSampler(random_state=42)\n",
"X_resampled_2, y_resampled_2 = oversampler2.fit_resample(X2, y2)\n",
"\n",
"undersampler2 = RandomUnderSampler(random_state=42)\n",
"X_resampled_2_under, y_resampled_2_under = undersampler2.fit_resample(X2, y2)\n",
"\n",
"print(\"Class distribution after oversampling (df2):\")\n",
"print(pd.Series(y_resampled_2).value_counts())\n",
"\n",
"print(\"Class distribution after undersampling (df2):\")\n",
"print(pd.Series(y_resampled_2_under).value_counts())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "miivenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}