Наконец то все (╥﹏╥)
This commit is contained in:
parent
fdbdb674e8
commit
d19b7e0793
@ -4,13 +4,13 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"### DataSet - \"Gaming Laptop Specs and Price\"\n",
|
"### Lab2 PIbd-31 Yakovlev\n",
|
||||||
"Данный датасет содержит данные о игровых ноутбуках."
|
"Загрузим три датасета"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 110,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -41,201 +41,58 @@
|
|||||||
" 16 operating_system 502 non-null float64\n",
|
" 16 operating_system 502 non-null float64\n",
|
||||||
" 17 SSD_storage 532 non-null object \n",
|
" 17 SSD_storage 532 non-null object \n",
|
||||||
"dtypes: float64(3), int64(3), object(12)\n",
|
"dtypes: float64(3), int64(3), object(12)\n",
|
||||||
"memory usage: 74.9+ KB\n"
|
"memory usage: 74.9+ KB\n",
|
||||||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||||
|
"RangeIndex: 8036 entries, 0 to 8035\n",
|
||||||
|
"Data columns (total 7 columns):\n",
|
||||||
|
" # Column Non-Null Count Dtype \n",
|
||||||
|
"--- ------ -------------- ----- \n",
|
||||||
|
" 0 Date 8036 non-null object \n",
|
||||||
|
" 1 Open 8036 non-null float64\n",
|
||||||
|
" 2 High 8036 non-null float64\n",
|
||||||
|
" 3 Low 8036 non-null float64\n",
|
||||||
|
" 4 Close 8036 non-null float64\n",
|
||||||
|
" 5 Adj Close 8036 non-null float64\n",
|
||||||
|
" 6 Volume 8036 non-null int64 \n",
|
||||||
|
"dtypes: float64(5), int64(1), object(1)\n",
|
||||||
|
"memory usage: 439.6+ KB\n",
|
||||||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||||
|
"RangeIndex: 19237 entries, 0 to 19236\n",
|
||||||
|
"Data columns (total 18 columns):\n",
|
||||||
|
" # Column Non-Null Count Dtype \n",
|
||||||
|
"--- ------ -------------- ----- \n",
|
||||||
|
" 0 ID 19237 non-null int64 \n",
|
||||||
|
" 1 Price 19237 non-null int64 \n",
|
||||||
|
" 2 Levy 19237 non-null object \n",
|
||||||
|
" 3 Manufacturer 19237 non-null object \n",
|
||||||
|
" 4 Model 19237 non-null object \n",
|
||||||
|
" 5 Prod. year 19237 non-null int64 \n",
|
||||||
|
" 6 Category 19237 non-null object \n",
|
||||||
|
" 7 Leather interior 19237 non-null object \n",
|
||||||
|
" 8 Fuel type 19237 non-null object \n",
|
||||||
|
" 9 Engine volume 19237 non-null object \n",
|
||||||
|
" 10 Mileage 19237 non-null object \n",
|
||||||
|
" 11 Cylinders 19237 non-null float64\n",
|
||||||
|
" 12 Gear box type 19237 non-null object \n",
|
||||||
|
" 13 Drive wheels 19237 non-null object \n",
|
||||||
|
" 14 Doors 19237 non-null object \n",
|
||||||
|
" 15 Wheel 19237 non-null object \n",
|
||||||
|
" 16 Color 19237 non-null object \n",
|
||||||
|
" 17 Airbags 19237 non-null int64 \n",
|
||||||
|
"dtypes: float64(1), int64(4), object(13)\n",
|
||||||
|
"memory usage: 2.6+ MB\n"
|
||||||
]
|
]
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/html": [
|
|
||||||
"<div>\n",
|
|
||||||
"<style scoped>\n",
|
|
||||||
" .dataframe tbody tr th:only-of-type {\n",
|
|
||||||
" vertical-align: middle;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe tbody tr th {\n",
|
|
||||||
" vertical-align: top;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe thead th {\n",
|
|
||||||
" text-align: right;\n",
|
|
||||||
" }\n",
|
|
||||||
"</style>\n",
|
|
||||||
"<table border=\"1\" class=\"dataframe\">\n",
|
|
||||||
" <thead>\n",
|
|
||||||
" <tr style=\"text-align: right;\">\n",
|
|
||||||
" <th></th>\n",
|
|
||||||
" <th>brand_name</th>\n",
|
|
||||||
" <th>price</th>\n",
|
|
||||||
" <th>rating</th>\n",
|
|
||||||
" <th>processor_gen</th>\n",
|
|
||||||
" <th>processor_brand</th>\n",
|
|
||||||
" <th>processor_segment</th>\n",
|
|
||||||
" <th>CPU_mark</th>\n",
|
|
||||||
" <th>CPU_performance</th>\n",
|
|
||||||
" <th>Graphic_card_memory</th>\n",
|
|
||||||
" <th>graphic_card_name</th>\n",
|
|
||||||
" <th>graphic_card_num</th>\n",
|
|
||||||
" <th>Core</th>\n",
|
|
||||||
" <th>threads</th>\n",
|
|
||||||
" <th>display_inches</th>\n",
|
|
||||||
" <th>ram_storage</th>\n",
|
|
||||||
" <th>ram_type</th>\n",
|
|
||||||
" <th>operating_system</th>\n",
|
|
||||||
" <th>SSD_storage</th>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </thead>\n",
|
|
||||||
" <tbody>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>0</th>\n",
|
|
||||||
" <td>hp</td>\n",
|
|
||||||
" <td>49490</td>\n",
|
|
||||||
" <td>70</td>\n",
|
|
||||||
" <td>5th</td>\n",
|
|
||||||
" <td>amd</td>\n",
|
|
||||||
" <td>5</td>\n",
|
|
||||||
" <td>5600H</td>\n",
|
|
||||||
" <td>maximum performance</td>\n",
|
|
||||||
" <td>4 GB</td>\n",
|
|
||||||
" <td>amd radeon</td>\n",
|
|
||||||
" <td>other</td>\n",
|
|
||||||
" <td>6.0</td>\n",
|
|
||||||
" <td>12.0</td>\n",
|
|
||||||
" <td>15.6</td>\n",
|
|
||||||
" <td>8</td>\n",
|
|
||||||
" <td>DDR4</td>\n",
|
|
||||||
" <td>11.0</td>\n",
|
|
||||||
" <td>512 GB SSD</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>1</th>\n",
|
|
||||||
" <td>xiaomi</td>\n",
|
|
||||||
" <td>102990</td>\n",
|
|
||||||
" <td>78</td>\n",
|
|
||||||
" <td>14th</td>\n",
|
|
||||||
" <td>intel</td>\n",
|
|
||||||
" <td>i9</td>\n",
|
|
||||||
" <td>14900HX</td>\n",
|
|
||||||
" <td>maximum performance</td>\n",
|
|
||||||
" <td>8 GB</td>\n",
|
|
||||||
" <td>nvidia geforce</td>\n",
|
|
||||||
" <td>4060</td>\n",
|
|
||||||
" <td>24.0</td>\n",
|
|
||||||
" <td>32.0</td>\n",
|
|
||||||
" <td>other</td>\n",
|
|
||||||
" <td>16</td>\n",
|
|
||||||
" <td>DDR5</td>\n",
|
|
||||||
" <td>11.0</td>\n",
|
|
||||||
" <td>1 TB SSD</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>2</th>\n",
|
|
||||||
" <td>hp</td>\n",
|
|
||||||
" <td>81490</td>\n",
|
|
||||||
" <td>73</td>\n",
|
|
||||||
" <td>7th</td>\n",
|
|
||||||
" <td>amd</td>\n",
|
|
||||||
" <td>7</td>\n",
|
|
||||||
" <td>7840HS</td>\n",
|
|
||||||
" <td>high efficiency</td>\n",
|
|
||||||
" <td>6 GB</td>\n",
|
|
||||||
" <td>nvidia geforce</td>\n",
|
|
||||||
" <td>3050</td>\n",
|
|
||||||
" <td>8.0</td>\n",
|
|
||||||
" <td>16.0</td>\n",
|
|
||||||
" <td>other</td>\n",
|
|
||||||
" <td>16</td>\n",
|
|
||||||
" <td>DDR5</td>\n",
|
|
||||||
" <td>11.0</td>\n",
|
|
||||||
" <td>1 TB SSD</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>3</th>\n",
|
|
||||||
" <td>asus</td>\n",
|
|
||||||
" <td>49990</td>\n",
|
|
||||||
" <td>64</td>\n",
|
|
||||||
" <td>11th</td>\n",
|
|
||||||
" <td>intel</td>\n",
|
|
||||||
" <td>i5</td>\n",
|
|
||||||
" <td>11400H</td>\n",
|
|
||||||
" <td>maximum performance</td>\n",
|
|
||||||
" <td>4 GB</td>\n",
|
|
||||||
" <td>nvidia geforce</td>\n",
|
|
||||||
" <td>2050</td>\n",
|
|
||||||
" <td>6.0</td>\n",
|
|
||||||
" <td>12.0</td>\n",
|
|
||||||
" <td>15.6</td>\n",
|
|
||||||
" <td>8</td>\n",
|
|
||||||
" <td>DDR4</td>\n",
|
|
||||||
" <td>11.0</td>\n",
|
|
||||||
" <td>512 GB SSD</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>4</th>\n",
|
|
||||||
" <td>asus</td>\n",
|
|
||||||
" <td>52990</td>\n",
|
|
||||||
" <td>66</td>\n",
|
|
||||||
" <td>11th</td>\n",
|
|
||||||
" <td>intel</td>\n",
|
|
||||||
" <td>i5</td>\n",
|
|
||||||
" <td>11400H</td>\n",
|
|
||||||
" <td>maximum performance</td>\n",
|
|
||||||
" <td>4 GB</td>\n",
|
|
||||||
" <td>nvidia geforce</td>\n",
|
|
||||||
" <td>2050</td>\n",
|
|
||||||
" <td>6.0</td>\n",
|
|
||||||
" <td>12.0</td>\n",
|
|
||||||
" <td>15.6</td>\n",
|
|
||||||
" <td>16</td>\n",
|
|
||||||
" <td>DDR4</td>\n",
|
|
||||||
" <td>11.0</td>\n",
|
|
||||||
" <td>512 GB SSD</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </tbody>\n",
|
|
||||||
"</table>\n",
|
|
||||||
"</div>"
|
|
||||||
],
|
|
||||||
"text/plain": [
|
|
||||||
" brand_name price rating processor_gen processor_brand processor_segment \\\n",
|
|
||||||
"0 hp 49490 70 5th amd 5 \n",
|
|
||||||
"1 xiaomi 102990 78 14th intel i9 \n",
|
|
||||||
"2 hp 81490 73 7th amd 7 \n",
|
|
||||||
"3 asus 49990 64 11th intel i5 \n",
|
|
||||||
"4 asus 52990 66 11th intel i5 \n",
|
|
||||||
"\n",
|
|
||||||
" CPU_mark CPU_performance Graphic_card_memory graphic_card_name \\\n",
|
|
||||||
"0 5600H maximum performance 4 GB amd radeon \n",
|
|
||||||
"1 14900HX maximum performance 8 GB nvidia geforce \n",
|
|
||||||
"2 7840HS high efficiency 6 GB nvidia geforce \n",
|
|
||||||
"3 11400H maximum performance 4 GB nvidia geforce \n",
|
|
||||||
"4 11400H maximum performance 4 GB nvidia geforce \n",
|
|
||||||
"\n",
|
|
||||||
" graphic_card_num Core threads display_inches ram_storage ram_type \\\n",
|
|
||||||
"0 other 6.0 12.0 15.6 8 DDR4 \n",
|
|
||||||
"1 4060 24.0 32.0 other 16 DDR5 \n",
|
|
||||||
"2 3050 8.0 16.0 other 16 DDR5 \n",
|
|
||||||
"3 2050 6.0 12.0 15.6 8 DDR4 \n",
|
|
||||||
"4 2050 6.0 12.0 15.6 16 DDR4 \n",
|
|
||||||
"\n",
|
|
||||||
" operating_system SSD_storage \n",
|
|
||||||
"0 11.0 512 GB SSD \n",
|
|
||||||
"1 11.0 1 TB SSD \n",
|
|
||||||
"2 11.0 1 TB SSD \n",
|
|
||||||
"3 11.0 512 GB SSD \n",
|
|
||||||
"4 11.0 512 GB SSD "
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 1,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
"\n",
|
"\n",
|
||||||
"df = pd.read_csv(\"datasets/laptop.csv\")\n",
|
"df = pd.read_csv(\"datasets/laptop.csv\")\n",
|
||||||
|
"df2 = pd.read_csv(\"datasets/coffee.csv\")\n",
|
||||||
|
"df3 = pd.read_csv(\"datasets/car_price_prediction.csv\")\n",
|
||||||
"df.info()\n",
|
"df.info()\n",
|
||||||
"df.head()"
|
"df2.info()\n",
|
||||||
|
"df3.info()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -243,56 +100,169 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"#### Проблемная область\n",
|
"#### Проблемная область\n",
|
||||||
"Данный датасет позволяет проанализировать данные, и понять какие ноутбуки имеют превосходство на рынке техники, и какие чаще всего выбирают пользователи.\n",
|
"Первый датасет позволяет проанализировать данные, и понять какие ноутбуки имеют превосходство на рынке техники, и какие чаще всего выбирают пользователи.\n",
|
||||||
|
"Второй датасет позволяет при помощи данных акций за последние 25 лет спрогнозировать будущие показатели акций кофейни Starbucks\n",
|
||||||
|
"Третий датасет позволяет проанализировать данные, и спрогнозировать категорию цены для машины, по ее комплектующим.\n",
|
||||||
"#### Анализ набора данных\n",
|
"#### Анализ набора данных\n",
|
||||||
"Объекты наблюдения - игровые ноутбуки\n",
|
"Объекты наблюдения - игровые ноутбуки, акции, машины\n",
|
||||||
"Атрибуты - Имя бренда, цена, рейтинг, поколение процессора, марка процессора, сегмент процессора, специальная метка, производительность процессора, видеокарта, видеопамять, номер видеокарты, кол-во потоков видеокарты, размер дисплея, оперативная память, тип оперативной памяти, ОС, SSD.\n",
|
"Атрибуты - \n",
|
||||||
|
"1. Имя бренда, цена, рейтинг, поколение процессора, марка процессора, сегмент процессора, специальная метка, производительность процессора, видеокарта, видеопамять, номер видеокарты, кол-во потоков видеокарты, размер дисплея, оперативная память, тип оперативной памяти, ОС, SSD.\n",
|
||||||
|
"2. Дата, начальная цена за день, максимальная цена, минимальная цена, цена на момент закрытия продаж, скорректированая цена на момент закрытия, объем торговли акций за день.\n",
|
||||||
|
"3. Цена обслуживания, производитель, модель, год выпуска, категория, кожанный салон, тип топлива, объем двигателя.\n",
|
||||||
"Связи между объектами - нет\n",
|
"Связи между объектами - нет\n",
|
||||||
"#### Бизнес-цели\n",
|
"#### Бизнес-цели\n",
|
||||||
"Данный набор данных может помочь определить лидеров на рынке игровых ноутбуков.\n",
|
"1. Какие модели игровых ноутбуков более выгодно продавать магазинам техники. Какие комплектующие наиболее популярны у производителей и покупателей, для дальнейшего увеличения производства данных комплектующих.\n",
|
||||||
"В свою очередь определение лидеров поможет определить:\n",
|
"2. Прогноз цен, для дальнешей покупки, продажи акций. Прогнозирование, для предотвращения упадка.\n",
|
||||||
"1. Какие модели игровых ноутбуков более выгодно продавать магазинам техники.\n",
|
"3. Для составления списка лучших моделей автомобилей. Определения наилучшего буджетного автомобиля, который не будет часто ломаться и приносить убытки.\n",
|
||||||
"2. Какие комплектующие наиболее популярны у производителей и покупателей, для дальнейшего увеличения производства данных комплектующих.\n",
|
"#### Примеры целей технического проекта. Что поступает на вход, что является целевым признаком.\n",
|
||||||
"3. Определение популярных комлпектующих, для дальнейшей сборки других игровых ноутбуков новых версий.\n",
|
"На входе всегда датасет, целевые признаки:\n",
|
||||||
"#### Примеры целей технического проекта. Что поступает на вход, что является целевым признаком?????????\n",
|
"1. Рейтинг ноутбука\n",
|
||||||
|
"2. Максимальная цена за день\n",
|
||||||
|
"3. Цена обслуживания\n",
|
||||||
"#### Проблемы набора данных и их решения\n",
|
"#### Проблемы набора данных и их решения\n",
|
||||||
"1. Возможны устаревшие данные, т.к. новые комплектующие выходят довольно часто. Для решения данной проблемы требуется удаление самых старых записей о ноутбуках, и добавление более новых моделей.\n",
|
"1. Возможны устаревшие данные. Для решения данной проблемы требуется удаление самых старых записей, и добавление более новых.\n",
|
||||||
"2. Возможны выбросы, какие-то \"сверхестественные сборки\". Решить эту проблему помогает удаление таких выбросов. В маленькой выборке не рекомендуется удалять выбросы. В большой выборке выбросы усредняются.\n",
|
"2. Возможны выбросы. Решить эту проблему помогает удаление таких выбросов. В маленькой выборке не рекомендуется удалять выбросы. В большой выборке выбросы усредняются.\n",
|
||||||
"#### Качество набора данных\n",
|
"#### Качество набора данных\n",
|
||||||
"Набор данных содержит достаточно примеров и признаков для обучения модели. Учтены различные ситуации проблемной области. Данные соответствуют данным, которые будут\n",
|
"Наборы данных содержат достаточно примеров и признаков для обучения модели. Учтены различные ситуации проблемной области. Данные соответствуют данным, которые будут\n",
|
||||||
"подаваться в производственной среде. Все метки согласованы.\n",
|
"подаваться в производственной среде. Все метки согласованы.\n"
|
||||||
"#### Проблема пропущенных данных"
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Поиск аномалий"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": 111,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
|
" price rating Core threads ram_storage \\\n",
|
||||||
|
"count 532.000000 532.000000 530.000000 514.000000 532.000000 \n",
|
||||||
|
"mean 107684.492481 67.781955 9.035849 15.089494 15.676692 \n",
|
||||||
|
"std 80187.648965 8.161356 4.413487 5.216162 8.901257 \n",
|
||||||
|
"min 30999.000000 43.000000 2.000000 4.000000 4.000000 \n",
|
||||||
|
"25% 62371.750000 63.000000 6.000000 12.000000 8.000000 \n",
|
||||||
|
"50% 83745.000000 66.000000 8.000000 16.000000 16.000000 \n",
|
||||||
|
"75% 114040.000000 72.000000 10.000000 16.000000 16.000000 \n",
|
||||||
|
"max 599990.000000 98.000000 24.000000 32.000000 64.000000 \n",
|
||||||
|
"\n",
|
||||||
|
" operating_system \n",
|
||||||
|
"count 502.000000 \n",
|
||||||
|
"mean 10.842629 \n",
|
||||||
|
"std 0.364513 \n",
|
||||||
|
"min 10.000000 \n",
|
||||||
|
"25% 11.000000 \n",
|
||||||
|
"50% 11.000000 \n",
|
||||||
|
"75% 11.000000 \n",
|
||||||
|
"max 11.000000 \n",
|
||||||
|
" Open High Low Close Adj Close \\\n",
|
||||||
|
"count 8036.000000 8036.000000 8036.000000 8036.000000 8036.000000 \n",
|
||||||
|
"mean 30.054280 30.351487 29.751322 30.058857 26.674025 \n",
|
||||||
|
"std 33.615577 33.906613 33.314569 33.615911 31.728090 \n",
|
||||||
|
"min 0.328125 0.347656 0.320313 0.335938 0.260703 \n",
|
||||||
|
"25% 4.392031 4.531250 4.304922 4.399610 3.414300 \n",
|
||||||
|
"50% 13.325000 13.493750 13.150000 13.330000 10.352452 \n",
|
||||||
|
"75% 55.250000 55.722501 54.852499 55.267499 47.464829 \n",
|
||||||
|
"max 126.080002 126.320000 124.809998 126.059998 118.010414 \n",
|
||||||
|
"\n",
|
||||||
|
" Volume \n",
|
||||||
|
"count 8.036000e+03 \n",
|
||||||
|
"mean 1.470459e+07 \n",
|
||||||
|
"std 1.340021e+07 \n",
|
||||||
|
"min 1.504000e+06 \n",
|
||||||
|
"25% 7.817750e+06 \n",
|
||||||
|
"50% 1.169815e+07 \n",
|
||||||
|
"75% 1.778795e+07 \n",
|
||||||
|
"max 5.855088e+08 \n",
|
||||||
|
" ID Price Prod. year Cylinders Airbags\n",
|
||||||
|
"count 1.923700e+04 1.923700e+04 19237.000000 19237.000000 19237.000000\n",
|
||||||
|
"mean 4.557654e+07 1.855593e+04 2010.912824 4.582991 6.582627\n",
|
||||||
|
"std 9.365914e+05 1.905813e+05 5.668673 1.199933 4.320168\n",
|
||||||
|
"min 2.074688e+07 1.000000e+00 1939.000000 1.000000 0.000000\n",
|
||||||
|
"25% 4.569837e+07 5.331000e+03 2009.000000 4.000000 4.000000\n",
|
||||||
|
"50% 4.577231e+07 1.317200e+04 2012.000000 4.000000 6.000000\n",
|
||||||
|
"75% 4.580204e+07 2.207500e+04 2015.000000 4.000000 12.000000\n",
|
||||||
|
"max 4.581665e+07 2.630750e+07 2020.000000 16.000000 16.000000\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(df.describe())\n",
|
||||||
|
"print(df2.describe())\n",
|
||||||
|
"print(df3.describe())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"При просмотре вывода не было замечено аномалий в столбцах датасетов."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"#### Проблема пропущенных данных"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 112,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"DATASET 1\n",
|
||||||
"processor_gen процент пустых значений: %2.26\n",
|
"processor_gen процент пустых значений: %2.26\n",
|
||||||
"processor_segment процент пустых значений: %0.75\n",
|
"processor_segment процент пустых значений: %0.75\n",
|
||||||
"Graphic_card_memory процент пустых значений: %0.38\n",
|
"Graphic_card_memory процент пустых значений: %0.38\n",
|
||||||
"graphic_card_name процент пустых значений: %0.38\n",
|
"graphic_card_name процент пустых значений: %0.38\n",
|
||||||
"Core процент пустых значений: %0.38\n",
|
"Core процент пустых значений: %0.38\n",
|
||||||
"threads процент пустых значений: %3.38\n",
|
"threads процент пустых значений: %3.38\n",
|
||||||
"operating_system процент пустых значений: %5.64\n"
|
"operating_system процент пустых значений: %5.64\n",
|
||||||
|
"DATASET 2\n",
|
||||||
|
"DATASET 3\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
|
"print(\"DATASET 1\")\n",
|
||||||
"for i in df.columns:\n",
|
"for i in df.columns:\n",
|
||||||
" null_rate = df[i].isnull().sum() / len(df)*100\n",
|
" null_rate = df[i].isnull().sum() / len(df)*100\n",
|
||||||
" if null_rate > 0:\n",
|
" if null_rate > 0:\n",
|
||||||
|
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")\n",
|
||||||
|
"print(\"DATASET 2\")\n",
|
||||||
|
"for i in df2.columns:\n",
|
||||||
|
" null_rate = df2[i].isnull().sum() / len(df2)*100\n",
|
||||||
|
" if null_rate > 0:\n",
|
||||||
|
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")\n",
|
||||||
|
"print(\"DATASET 3\")\n",
|
||||||
|
"for i in df3.columns:\n",
|
||||||
|
" null_rate = df3[i].isnull().sum() / len(df3)*100\n",
|
||||||
|
" if null_rate > 0:\n",
|
||||||
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
|
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"В первом датасете были поля с пустыми значениями, в остальных пустых значений не найдено."
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 10,
|
"execution_count": 113,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -501,7 +471,7 @@
|
|||||||
"531 8 DDR4 0.0 other "
|
"531 8 DDR4 0.0 other "
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 10,
|
"execution_count": 113,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -516,31 +486,31 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"#### Разбиение на выборки"
|
"#### Разбиение на выборки\n",
|
||||||
|
"Для разбиения на выборке для начала уменьшим количество уникальных значений в столбцах с целевыми признаками, путем добавления новых столбцов с малым количеством уникальных значений."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 24,
|
"execution_count": 114,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
"source": [
|
||||||
"ename": "ValueError",
|
"#У первого дата сета добавим новый столбец с рейтингом от 1 до 5 на основе столбца от 1 до 100.\n",
|
||||||
"evalue": "The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.",
|
"df['new_rating'] = pd.cut(df['rating'], bins=[0,20,40,60,80,100], labels=[1,2,3,4,5], include_lowest=True)\n",
|
||||||
"output_type": "error",
|
"#У второго добавим столбец с наибольшей ценой от 1 до 10, на основе столбца от 1 до 127.\n",
|
||||||
"traceback": [
|
"df2['new_high'] = pd.cut(df2['High'], bins=[0,13,26,39,52,65,78,91,104,117,130], labels=[1,2,3,4,5,6,7,8,9,10], include_lowest=True)\n",
|
||||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
"#У третьего удалим слишком большие значения обслуживания и слишком маленькие и добавим новый столбец с категориями цен от 1 до 5.\n",
|
||||||
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
|
"df3_filtered = df3[df3['Price'] >= 10000]\n",
|
||||||
"Cell \u001b[1;32mIn[24], line 46\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df_train, df_val, df_test\n\u001b[0;32m 44\u001b[0m data \u001b[38;5;241m=\u001b[39m df[[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrating\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprice\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mram_storage\u001b[39m\u001b[38;5;124m\"\u001b[39m]]\u001b[38;5;241m.\u001b[39mcopy()\n\u001b[1;32m---> 46\u001b[0m df_train, df_val, df_test \u001b[38;5;241m=\u001b[39m \u001b[43msplit_stratified_into_train_val_test\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 47\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstratify_colname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrating\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_train\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.60\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_val\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.20\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_test\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.20\u001b[39;49m\n\u001b[0;32m 48\u001b[0m \u001b[43m)\u001b[49m\n\u001b[0;32m 50\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mОбучающая выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_train\u001b[38;5;241m.\u001b[39mshape)\n\u001b[0;32m 52\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mКонтрольная выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_val\u001b[38;5;241m.\u001b[39mshape)\n",
|
"df3_filtered = df3_filtered[df3_filtered['Price'] <= 100000]\n",
|
||||||
"Cell \u001b[1;32mIn[24], line 26\u001b[0m, in \u001b[0;36msplit_stratified_into_train_val_test\u001b[1;34m(df_input, stratify_colname, frac_train, frac_val, frac_test, random_state)\u001b[0m\n\u001b[0;32m 21\u001b[0m y \u001b[38;5;241m=\u001b[39m df_input[\n\u001b[0;32m 22\u001b[0m [stratify_colname]\n\u001b[0;32m 23\u001b[0m ] \u001b[38;5;66;03m# Dataframe of just the column on which to stratify.\u001b[39;00m\n\u001b[0;32m 25\u001b[0m \u001b[38;5;66;03m# Split original dataframe into train and temp dataframes.\u001b[39;00m\n\u001b[1;32m---> 26\u001b[0m df_train, df_temp, y_train, y_temp \u001b[38;5;241m=\u001b[39m \u001b[43mtrain_test_split\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 27\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstratify\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m1.0\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mfrac_train\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrandom_state\u001b[49m\n\u001b[0;32m 28\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 30\u001b[0m \u001b[38;5;66;03m# Split the temp dataframe into val and test dataframes.\u001b[39;00m\n\u001b[0;32m 31\u001b[0m relative_frac_test \u001b[38;5;241m=\u001b[39m frac_test \u001b[38;5;241m/\u001b[39m (frac_val \u001b[38;5;241m+\u001b[39m frac_test)\n",
|
"df3_filtered['new_price'] = pd.cut(df3_filtered['Price'], bins=[10000,28000,46000,64000,82000,100000], labels=[1,2,3,4,5], include_lowest=True)"
|
||||||
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:213\u001b[0m, in \u001b[0;36mvalidate_params.<locals>.decorator.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 208\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 209\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 210\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 211\u001b[0m )\n\u001b[0;32m 212\u001b[0m ):\n\u001b[1;32m--> 213\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 215\u001b[0m \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m 216\u001b[0m \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m 217\u001b[0m \u001b[38;5;66;03m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[0;32m 218\u001b[0m \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m 219\u001b[0m msg \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msub(\n\u001b[0;32m 220\u001b[0m \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+ must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 221\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 222\u001b[0m \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[0;32m 223\u001b[0m )\n",
|
|
||||||
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:2806\u001b[0m, in \u001b[0;36mtrain_test_split\u001b[1;34m(test_size, train_size, random_state, shuffle, stratify, *arrays)\u001b[0m\n\u001b[0;32m 2802\u001b[0m CVClass \u001b[38;5;241m=\u001b[39m ShuffleSplit\n\u001b[0;32m 2804\u001b[0m cv \u001b[38;5;241m=\u001b[39m CVClass(test_size\u001b[38;5;241m=\u001b[39mn_test, train_size\u001b[38;5;241m=\u001b[39mn_train, random_state\u001b[38;5;241m=\u001b[39mrandom_state)\n\u001b[1;32m-> 2806\u001b[0m train, test \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcv\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43marrays\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstratify\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2808\u001b[0m train, test \u001b[38;5;241m=\u001b[39m ensure_common_namespace_device(arrays[\u001b[38;5;241m0\u001b[39m], train, test)\n\u001b[0;32m 2810\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(\n\u001b[0;32m 2811\u001b[0m chain\u001b[38;5;241m.\u001b[39mfrom_iterable(\n\u001b[0;32m 2812\u001b[0m (_safe_indexing(a, train), _safe_indexing(a, test)) \u001b[38;5;28;01mfor\u001b[39;00m a \u001b[38;5;129;01min\u001b[39;00m arrays\n\u001b[0;32m 2813\u001b[0m )\n\u001b[0;32m 2814\u001b[0m )\n",
|
|
||||||
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:1843\u001b[0m, in \u001b[0;36mBaseShuffleSplit.split\u001b[1;34m(self, X, y, groups)\u001b[0m\n\u001b[0;32m 1813\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Generate indices to split data into training and test set.\u001b[39;00m\n\u001b[0;32m 1814\u001b[0m \n\u001b[0;32m 1815\u001b[0m \u001b[38;5;124;03mParameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1840\u001b[0m \u001b[38;5;124;03mto an integer.\u001b[39;00m\n\u001b[0;32m 1841\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 1842\u001b[0m X, y, groups \u001b[38;5;241m=\u001b[39m indexable(X, y, groups)\n\u001b[1;32m-> 1843\u001b[0m \u001b[43m\u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_iter_indices\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgroups\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[0;32m 1844\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01myield\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\n",
|
|
||||||
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:2252\u001b[0m, in \u001b[0;36mStratifiedShuffleSplit._iter_indices\u001b[1;34m(self, X, y, groups)\u001b[0m\n\u001b[0;32m 2250\u001b[0m class_counts \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mbincount(y_indices)\n\u001b[0;32m 2251\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m np\u001b[38;5;241m.\u001b[39mmin(class_counts) \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m2\u001b[39m:\n\u001b[1;32m-> 2252\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 2253\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe least populated class in y has only 1\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2254\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m member, which is too few. The minimum\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2255\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m number of groups for any class cannot\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2256\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m be less than 2.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2257\u001b[0m )\n\u001b[0;32m 2259\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m n_train \u001b[38;5;241m<\u001b[39m n_classes:\n\u001b[0;32m 2260\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 2261\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe train_size = \u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m should be greater or \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2262\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mequal to the number of classes = \u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m (n_train, n_classes)\n\u001b[0;32m 2263\u001b[0m )\n",
|
|
||||||
"\u001b[1;31mValueError\u001b[0m: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2."
|
|
||||||
]
|
]
|
||||||
}
|
},
|
||||||
],
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 115,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.model_selection import train_test_split\n",
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -583,19 +553,234 @@
|
|||||||
"\n",
|
"\n",
|
||||||
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
|
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
|
||||||
"\n",
|
"\n",
|
||||||
" return df_train, df_val, df_test\n",
|
" return df_train, df_val, df_test"
|
||||||
"\n",
|
]
|
||||||
"data = df[[\"rating\", \"price\", \"ram_storage\"]].copy()\n",
|
},
|
||||||
"\n",
|
{
|
||||||
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
|
"cell_type": "markdown",
|
||||||
" data, stratify_colname=\"rating\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Выборки датасетов"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 116,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"train df: (319, 19), Val df: (106, 19), Test df:(107, 19)\n",
|
||||||
|
"train df2: (4821, 8), Val df2: (1607, 8), Test df2:(1608, 8)\n",
|
||||||
|
"train df3_filtered: (6931, 19), Val df3_filtered: (2310, 19), Test df3_filtered:(2311, 19)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df_train1, df_val1, df_test1 = split_stratified_into_train_val_test(\n",
|
||||||
|
" df, stratify_colname=\"new_rating\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
"df_train2, df_val2, df_test2 = split_stratified_into_train_val_test(\n",
|
||||||
|
" df2, stratify_colname=\"new_high\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
|
||||||
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"print(\"Контрольная выборка: \", df_val.shape)\n",
|
"df_train3, df_val3, df_test3 = split_stratified_into_train_val_test(\n",
|
||||||
|
" df3_filtered, stratify_colname=\"new_price\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
|
||||||
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"print(\"Тестовая выборка: \", df_test.shape)"
|
"print(f\"train df: {df_train1.shape}, Val df: {df_val1.shape}, Test df:{df_test1.shape}\")\n",
|
||||||
|
"print(f\"train df2: {df_train2.shape}, Val df2: {df_val2.shape}, Test df2:{df_test2.shape}\")\n",
|
||||||
|
"print(f\"train df3_filtered: {df_train3.shape}, Val df3_filtered: {df_val3.shape}, Test df3_filtered:{df_test3.shape}\")\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Каждый датасет был разбит на три выборки (обучающую, контрольную и тестовую) в соотношении 60%, 20% и 20% с помощью функции train_test_split из библиотеки scikit-learn. На первый взгляд выборки сбалансированы.\n",
|
||||||
|
"### Приращение данных методами выборки с избытком (oversampling) и выборки с недостатком (undersampling)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 121,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Выборка до oversampling и undersampling: (319, 6)\n",
|
||||||
|
"new_rating\n",
|
||||||
|
"4 249\n",
|
||||||
|
"3 44\n",
|
||||||
|
"5 26\n",
|
||||||
|
"1 0\n",
|
||||||
|
"2 0\n",
|
||||||
|
"Name: count, dtype: int64\n",
|
||||||
|
"Выборка после oversampling: (748, 6)\n",
|
||||||
|
"new_rating\n",
|
||||||
|
"5 252\n",
|
||||||
|
"4 249\n",
|
||||||
|
"3 247\n",
|
||||||
|
"1 0\n",
|
||||||
|
"2 0\n",
|
||||||
|
"Name: count, dtype: int64\n",
|
||||||
|
"Выборка после undersampling: (78, 6)\n",
|
||||||
|
"new_rating\n",
|
||||||
|
"3 26\n",
|
||||||
|
"5 26\n",
|
||||||
|
"4 26\n",
|
||||||
|
"1 0\n",
|
||||||
|
"2 0\n",
|
||||||
|
"Name: count, dtype: int64\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from imblearn.over_sampling import ADASYN\n",
|
||||||
|
"from imblearn.under_sampling import RandomUnderSampler\n",
|
||||||
|
"\n",
|
||||||
|
"df_train1 = df_train1[['price', 'rating', 'threads', 'ram_storage', 'operating_system', 'new_rating']].copy()\n",
|
||||||
|
"\n",
|
||||||
|
"ada = ADASYN()\n",
|
||||||
|
"undersampler = RandomUnderSampler(random_state=42)\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Выборка до oversampling и undersampling:\", df_train1.shape)\n",
|
||||||
|
"print(df_train1.new_rating.value_counts())\n",
|
||||||
|
"\n",
|
||||||
|
"X_resampled, y_resampled = ada.fit_resample(df_train1, df_train1['new_rating'])\n",
|
||||||
|
"df_train1_adasyn = pd.DataFrame(X_resampled)\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Выборка после oversampling: \", df_train1_adasyn.shape)\n",
|
||||||
|
"print(df_train1_adasyn.new_rating.value_counts())\n",
|
||||||
|
"\n",
|
||||||
|
"X_resampled_under, y_resampled_under = undersampler.fit_resample(df_train1, df_train1['new_rating'])\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Выборка после undersampling: \", pd.DataFrame(X_resampled_under).shape)\n",
|
||||||
|
"print(pd.DataFrame(X_resampled_under).new_rating.value_counts())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 122,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Выборка до oversampling: (4821, 7)\n",
|
||||||
|
"new_high\n",
|
||||||
|
"1 2326\n",
|
||||||
|
"2 704\n",
|
||||||
|
"5 519\n",
|
||||||
|
"3 299\n",
|
||||||
|
"8 242\n",
|
||||||
|
"7 222\n",
|
||||||
|
"9 181\n",
|
||||||
|
"4 151\n",
|
||||||
|
"6 146\n",
|
||||||
|
"10 31\n",
|
||||||
|
"Name: count, dtype: int64\n",
|
||||||
|
"Выборка после oversampling: (22990, 7)\n",
|
||||||
|
"new_high\n",
|
||||||
|
"6 2375\n",
|
||||||
|
"2 2353\n",
|
||||||
|
"10 2327\n",
|
||||||
|
"1 2326\n",
|
||||||
|
"9 2306\n",
|
||||||
|
"8 2296\n",
|
||||||
|
"4 2293\n",
|
||||||
|
"3 2265\n",
|
||||||
|
"7 2254\n",
|
||||||
|
"5 2195\n",
|
||||||
|
"Name: count, dtype: int64\n",
|
||||||
|
"Выборка после undersampling: (310, 7)\n",
|
||||||
|
"new_high\n",
|
||||||
|
"1 31\n",
|
||||||
|
"2 31\n",
|
||||||
|
"3 31\n",
|
||||||
|
"4 31\n",
|
||||||
|
"5 31\n",
|
||||||
|
"6 31\n",
|
||||||
|
"7 31\n",
|
||||||
|
"8 31\n",
|
||||||
|
"9 31\n",
|
||||||
|
"10 31\n",
|
||||||
|
"Name: count, dtype: int64\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df_train2 = df_train2[['Open', 'High', 'new_high', 'Low', 'Close', 'Adj Close', 'Volume']].copy()\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Выборка до oversampling и undersampling:\", df_train2.shape)\n",
|
||||||
|
"print(df_train2.new_high.value_counts())\n",
|
||||||
|
"\n",
|
||||||
|
"X_resampled, y_resampled = ada.fit_resample(df_train2, df_train2['new_high'])\n",
|
||||||
|
"df_train2_adasyn = pd.DataFrame(X_resampled)\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Выборка после oversampling: \", df_train2_adasyn.shape)\n",
|
||||||
|
"print(df_train2_adasyn.new_high.value_counts())\n",
|
||||||
|
"\n",
|
||||||
|
"X_resampled_under2, y_resampled_under2 = undersampler.fit_resample(df_train2, df_train2['new_high'])\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Выборка после undersampling: \", pd.DataFrame(X_resampled_under2).shape)\n",
|
||||||
|
"print(pd.DataFrame(X_resampled_under2).new_high.value_counts())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 120,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Выборка до oversampling: (6931, 5)\n",
|
||||||
|
"new_price\n",
|
||||||
|
"1 5008\n",
|
||||||
|
"2 1281\n",
|
||||||
|
"3 449\n",
|
||||||
|
"4 136\n",
|
||||||
|
"5 57\n",
|
||||||
|
"Name: count, dtype: int64\n",
|
||||||
|
"Выборка после oversampling: (25040, 5)\n",
|
||||||
|
"new_price\n",
|
||||||
|
"1 5008\n",
|
||||||
|
"2 5008\n",
|
||||||
|
"3 5008\n",
|
||||||
|
"4 5008\n",
|
||||||
|
"5 5008\n",
|
||||||
|
"Name: count, dtype: int64\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from imblearn.over_sampling import SMOTE\n",
|
||||||
|
"\n",
|
||||||
|
"df_train3 = df_train3[['Price', 'new_price','Prod. year' ,'Cylinders' ,'Airbags']].copy()\n",
|
||||||
|
"\n",
|
||||||
|
"smote = SMOTE(random_state=42)\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Выборка до oversampling и undersampling:\", df_train3.shape)\n",
|
||||||
|
"print(df_train3.new_price.value_counts())\n",
|
||||||
|
"\n",
|
||||||
|
"X_resampled, y_resampled = smote.fit_resample(df_train3, df_train3['new_price'])\n",
|
||||||
|
"df_train3_smote = pd.DataFrame(X_resampled)\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Выборка после oversampling: \", df_train3_smote.shape)\n",
|
||||||
|
"print(df_train3_smote.new_price.value_counts())\n",
|
||||||
|
"\n",
|
||||||
|
"X_resampled_under3, y_resampled_under3 = undersampler.fit_resample(df_train3, df_train3['new_price'])\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Выборка после undersampling: \", pd.DataFrame(X_resampled_under3).shape)\n",
|
||||||
|
"print(pd.DataFrame(X_resampled_under3).new_price.value_counts())"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
Loading…
Reference in New Issue
Block a user