620 lines
28 KiB
Plaintext
620 lines
28 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### DataSet - \"Gaming Laptop Specs and Price\"\n",
|
|||
|
"Данный датасет содержит данные о игровых ноутбуках."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 1,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"RangeIndex: 532 entries, 0 to 531\n",
|
|||
|
"Data columns (total 18 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 brand_name 532 non-null object \n",
|
|||
|
" 1 price 532 non-null int64 \n",
|
|||
|
" 2 rating 532 non-null int64 \n",
|
|||
|
" 3 processor_gen 520 non-null object \n",
|
|||
|
" 4 processor_brand 532 non-null object \n",
|
|||
|
" 5 processor_segment 528 non-null object \n",
|
|||
|
" 6 CPU_mark 532 non-null object \n",
|
|||
|
" 7 CPU_performance 532 non-null object \n",
|
|||
|
" 8 Graphic_card_memory 530 non-null object \n",
|
|||
|
" 9 graphic_card_name 530 non-null object \n",
|
|||
|
" 10 graphic_card_num 532 non-null object \n",
|
|||
|
" 11 Core 530 non-null float64\n",
|
|||
|
" 12 threads 514 non-null float64\n",
|
|||
|
" 13 display_inches 532 non-null object \n",
|
|||
|
" 14 ram_storage 532 non-null int64 \n",
|
|||
|
" 15 ram_type 532 non-null object \n",
|
|||
|
" 16 operating_system 502 non-null float64\n",
|
|||
|
" 17 SSD_storage 532 non-null object \n",
|
|||
|
"dtypes: float64(3), int64(3), object(12)\n",
|
|||
|
"memory usage: 74.9+ KB\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>brand_name</th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>rating</th>\n",
|
|||
|
" <th>processor_gen</th>\n",
|
|||
|
" <th>processor_brand</th>\n",
|
|||
|
" <th>processor_segment</th>\n",
|
|||
|
" <th>CPU_mark</th>\n",
|
|||
|
" <th>CPU_performance</th>\n",
|
|||
|
" <th>Graphic_card_memory</th>\n",
|
|||
|
" <th>graphic_card_name</th>\n",
|
|||
|
" <th>graphic_card_num</th>\n",
|
|||
|
" <th>Core</th>\n",
|
|||
|
" <th>threads</th>\n",
|
|||
|
" <th>display_inches</th>\n",
|
|||
|
" <th>ram_storage</th>\n",
|
|||
|
" <th>ram_type</th>\n",
|
|||
|
" <th>operating_system</th>\n",
|
|||
|
" <th>SSD_storage</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>hp</td>\n",
|
|||
|
" <td>49490</td>\n",
|
|||
|
" <td>70</td>\n",
|
|||
|
" <td>5th</td>\n",
|
|||
|
" <td>amd</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>5600H</td>\n",
|
|||
|
" <td>maximum performance</td>\n",
|
|||
|
" <td>4 GB</td>\n",
|
|||
|
" <td>amd radeon</td>\n",
|
|||
|
" <td>other</td>\n",
|
|||
|
" <td>6.0</td>\n",
|
|||
|
" <td>12.0</td>\n",
|
|||
|
" <td>15.6</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>DDR4</td>\n",
|
|||
|
" <td>11.0</td>\n",
|
|||
|
" <td>512 GB SSD</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>xiaomi</td>\n",
|
|||
|
" <td>102990</td>\n",
|
|||
|
" <td>78</td>\n",
|
|||
|
" <td>14th</td>\n",
|
|||
|
" <td>intel</td>\n",
|
|||
|
" <td>i9</td>\n",
|
|||
|
" <td>14900HX</td>\n",
|
|||
|
" <td>maximum performance</td>\n",
|
|||
|
" <td>8 GB</td>\n",
|
|||
|
" <td>nvidia geforce</td>\n",
|
|||
|
" <td>4060</td>\n",
|
|||
|
" <td>24.0</td>\n",
|
|||
|
" <td>32.0</td>\n",
|
|||
|
" <td>other</td>\n",
|
|||
|
" <td>16</td>\n",
|
|||
|
" <td>DDR5</td>\n",
|
|||
|
" <td>11.0</td>\n",
|
|||
|
" <td>1 TB SSD</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>hp</td>\n",
|
|||
|
" <td>81490</td>\n",
|
|||
|
" <td>73</td>\n",
|
|||
|
" <td>7th</td>\n",
|
|||
|
" <td>amd</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>7840HS</td>\n",
|
|||
|
" <td>high efficiency</td>\n",
|
|||
|
" <td>6 GB</td>\n",
|
|||
|
" <td>nvidia geforce</td>\n",
|
|||
|
" <td>3050</td>\n",
|
|||
|
" <td>8.0</td>\n",
|
|||
|
" <td>16.0</td>\n",
|
|||
|
" <td>other</td>\n",
|
|||
|
" <td>16</td>\n",
|
|||
|
" <td>DDR5</td>\n",
|
|||
|
" <td>11.0</td>\n",
|
|||
|
" <td>1 TB SSD</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>asus</td>\n",
|
|||
|
" <td>49990</td>\n",
|
|||
|
" <td>64</td>\n",
|
|||
|
" <td>11th</td>\n",
|
|||
|
" <td>intel</td>\n",
|
|||
|
" <td>i5</td>\n",
|
|||
|
" <td>11400H</td>\n",
|
|||
|
" <td>maximum performance</td>\n",
|
|||
|
" <td>4 GB</td>\n",
|
|||
|
" <td>nvidia geforce</td>\n",
|
|||
|
" <td>2050</td>\n",
|
|||
|
" <td>6.0</td>\n",
|
|||
|
" <td>12.0</td>\n",
|
|||
|
" <td>15.6</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>DDR4</td>\n",
|
|||
|
" <td>11.0</td>\n",
|
|||
|
" <td>512 GB SSD</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>asus</td>\n",
|
|||
|
" <td>52990</td>\n",
|
|||
|
" <td>66</td>\n",
|
|||
|
" <td>11th</td>\n",
|
|||
|
" <td>intel</td>\n",
|
|||
|
" <td>i5</td>\n",
|
|||
|
" <td>11400H</td>\n",
|
|||
|
" <td>maximum performance</td>\n",
|
|||
|
" <td>4 GB</td>\n",
|
|||
|
" <td>nvidia geforce</td>\n",
|
|||
|
" <td>2050</td>\n",
|
|||
|
" <td>6.0</td>\n",
|
|||
|
" <td>12.0</td>\n",
|
|||
|
" <td>15.6</td>\n",
|
|||
|
" <td>16</td>\n",
|
|||
|
" <td>DDR4</td>\n",
|
|||
|
" <td>11.0</td>\n",
|
|||
|
" <td>512 GB SSD</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" brand_name price rating processor_gen processor_brand processor_segment \\\n",
|
|||
|
"0 hp 49490 70 5th amd 5 \n",
|
|||
|
"1 xiaomi 102990 78 14th intel i9 \n",
|
|||
|
"2 hp 81490 73 7th amd 7 \n",
|
|||
|
"3 asus 49990 64 11th intel i5 \n",
|
|||
|
"4 asus 52990 66 11th intel i5 \n",
|
|||
|
"\n",
|
|||
|
" CPU_mark CPU_performance Graphic_card_memory graphic_card_name \\\n",
|
|||
|
"0 5600H maximum performance 4 GB amd radeon \n",
|
|||
|
"1 14900HX maximum performance 8 GB nvidia geforce \n",
|
|||
|
"2 7840HS high efficiency 6 GB nvidia geforce \n",
|
|||
|
"3 11400H maximum performance 4 GB nvidia geforce \n",
|
|||
|
"4 11400H maximum performance 4 GB nvidia geforce \n",
|
|||
|
"\n",
|
|||
|
" graphic_card_num Core threads display_inches ram_storage ram_type \\\n",
|
|||
|
"0 other 6.0 12.0 15.6 8 DDR4 \n",
|
|||
|
"1 4060 24.0 32.0 other 16 DDR5 \n",
|
|||
|
"2 3050 8.0 16.0 other 16 DDR5 \n",
|
|||
|
"3 2050 6.0 12.0 15.6 8 DDR4 \n",
|
|||
|
"4 2050 6.0 12.0 15.6 16 DDR4 \n",
|
|||
|
"\n",
|
|||
|
" operating_system SSD_storage \n",
|
|||
|
"0 11.0 512 GB SSD \n",
|
|||
|
"1 11.0 1 TB SSD \n",
|
|||
|
"2 11.0 1 TB SSD \n",
|
|||
|
"3 11.0 512 GB SSD \n",
|
|||
|
"4 11.0 512 GB SSD "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 1,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"df = pd.read_csv(\"datasets/laptop.csv\")\n",
|
|||
|
"df.info()\n",
|
|||
|
"df.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Проблемная область\n",
|
|||
|
"Данный датасет позволяет проанализировать данные, и понять какие ноутбуки имеют превосходство на рынке техники, и какие чаще всего выбирают пользователи.\n",
|
|||
|
"#### Анализ набора данных\n",
|
|||
|
"Объекты наблюдения - игровые ноутбуки\n",
|
|||
|
"Атрибуты - Имя бренда, цена, рейтинг, поколение процессора, марка процессора, сегмент процессора, специальная метка, производительность процессора, видеокарта, видеопамять, номер видеокарты, кол-во потоков видеокарты, размер дисплея, оперативная память, тип оперативной памяти, ОС, SSD.\n",
|
|||
|
"Связи между объектами - нет\n",
|
|||
|
"#### Бизнес-цели\n",
|
|||
|
"Данный набор данных может помочь определить лидеров на рынке игровых ноутбуков.\n",
|
|||
|
"В свою очередь определение лидеров поможет определить:\n",
|
|||
|
"1. Какие модели игровых ноутбуков более выгодно продавать магазинам техники.\n",
|
|||
|
"2. Какие комплектующие наиболее популярны у производителей и покупателей, для дальнейшего увеличения производства данных комплектующих.\n",
|
|||
|
"3. Определение популярных комлпектующих, для дальнейшей сборки других игровых ноутбуков новых версий.\n",
|
|||
|
"#### Примеры целей технического проекта. Что поступает на вход, что является целевым признаком?????????\n",
|
|||
|
"#### Проблемы набора данных и их решения\n",
|
|||
|
"1. Возможны устаревшие данные, т.к. новые комплектующие выходят довольно часто. Для решения данной проблемы требуется удаление самых старых записей о ноутбуках, и добавление более новых моделей.\n",
|
|||
|
"2. Возможны выбросы, какие-то \"сверхестественные сборки\". Решить эту проблему помогает удаление таких выбросов. В маленькой выборке не рекомендуется удалять выбросы. В большой выборке выбросы усредняются.\n",
|
|||
|
"#### Качество набора данных\n",
|
|||
|
"Набор данных содержит достаточно примеров и признаков для обучения модели. Учтены различные ситуации проблемной области. Данные соответствуют данным, которые будут\n",
|
|||
|
"подаваться в производственной среде. Все метки согласованы.\n",
|
|||
|
"#### Проблема пропущенных данных"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 5,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"processor_gen процент пустых значений: %2.26\n",
|
|||
|
"processor_segment процент пустых значений: %0.75\n",
|
|||
|
"Graphic_card_memory процент пустых значений: %0.38\n",
|
|||
|
"graphic_card_name процент пустых значений: %0.38\n",
|
|||
|
"Core процент пустых значений: %0.38\n",
|
|||
|
"threads процент пустых значений: %3.38\n",
|
|||
|
"operating_system процент пустых значений: %5.64\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"for i in df.columns:\n",
|
|||
|
" null_rate = df[i].isnull().sum() / len(df)*100\n",
|
|||
|
" if null_rate > 0:\n",
|
|||
|
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 10,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"brand_name False\n",
|
|||
|
"price False\n",
|
|||
|
"rating False\n",
|
|||
|
"processor_gen False\n",
|
|||
|
"processor_brand False\n",
|
|||
|
"processor_segment False\n",
|
|||
|
"CPU_mark False\n",
|
|||
|
"CPU_performance False\n",
|
|||
|
"Graphic_card_memory False\n",
|
|||
|
"graphic_card_name False\n",
|
|||
|
"graphic_card_num False\n",
|
|||
|
"Core False\n",
|
|||
|
"threads False\n",
|
|||
|
"display_inches False\n",
|
|||
|
"ram_storage False\n",
|
|||
|
"ram_type False\n",
|
|||
|
"operating_system False\n",
|
|||
|
"SSD_storage False\n",
|
|||
|
"dtype: bool\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>brand_name</th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>rating</th>\n",
|
|||
|
" <th>processor_gen</th>\n",
|
|||
|
" <th>processor_brand</th>\n",
|
|||
|
" <th>processor_segment</th>\n",
|
|||
|
" <th>CPU_mark</th>\n",
|
|||
|
" <th>CPU_performance</th>\n",
|
|||
|
" <th>Graphic_card_memory</th>\n",
|
|||
|
" <th>graphic_card_name</th>\n",
|
|||
|
" <th>graphic_card_num</th>\n",
|
|||
|
" <th>Core</th>\n",
|
|||
|
" <th>threads</th>\n",
|
|||
|
" <th>display_inches</th>\n",
|
|||
|
" <th>ram_storage</th>\n",
|
|||
|
" <th>ram_type</th>\n",
|
|||
|
" <th>operating_system</th>\n",
|
|||
|
" <th>SSD_storage</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>527</th>\n",
|
|||
|
" <td>dell</td>\n",
|
|||
|
" <td>75500</td>\n",
|
|||
|
" <td>63</td>\n",
|
|||
|
" <td>4th</td>\n",
|
|||
|
" <td>amd</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>4600H</td>\n",
|
|||
|
" <td>maximum performance</td>\n",
|
|||
|
" <td>6 GB</td>\n",
|
|||
|
" <td>amd radeon</td>\n",
|
|||
|
" <td>other</td>\n",
|
|||
|
" <td>6.0</td>\n",
|
|||
|
" <td>12.0</td>\n",
|
|||
|
" <td>15.6</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>DDR4</td>\n",
|
|||
|
" <td>10.0</td>\n",
|
|||
|
" <td>512 GB SSD</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>528</th>\n",
|
|||
|
" <td>lenovo</td>\n",
|
|||
|
" <td>151990</td>\n",
|
|||
|
" <td>75</td>\n",
|
|||
|
" <td>10th</td>\n",
|
|||
|
" <td>intel</td>\n",
|
|||
|
" <td>i7</td>\n",
|
|||
|
" <td>10875H</td>\n",
|
|||
|
" <td>maximum performance</td>\n",
|
|||
|
" <td>8 GB</td>\n",
|
|||
|
" <td>nvidia geforce</td>\n",
|
|||
|
" <td>other</td>\n",
|
|||
|
" <td>8.0</td>\n",
|
|||
|
" <td>16.0</td>\n",
|
|||
|
" <td>15.6</td>\n",
|
|||
|
" <td>16</td>\n",
|
|||
|
" <td>DDR4</td>\n",
|
|||
|
" <td>10.0</td>\n",
|
|||
|
" <td>1 TB SSD</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>529</th>\n",
|
|||
|
" <td>lenovo</td>\n",
|
|||
|
" <td>46500</td>\n",
|
|||
|
" <td>48</td>\n",
|
|||
|
" <td>8th</td>\n",
|
|||
|
" <td>intel</td>\n",
|
|||
|
" <td>i5</td>\n",
|
|||
|
" <td>8250U</td>\n",
|
|||
|
" <td>ultra-low power</td>\n",
|
|||
|
" <td>Integrated</td>\n",
|
|||
|
" <td>Intel Integrated</td>\n",
|
|||
|
" <td>other</td>\n",
|
|||
|
" <td>4.0</td>\n",
|
|||
|
" <td>8.0</td>\n",
|
|||
|
" <td>other</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>DDR4</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>other</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>530</th>\n",
|
|||
|
" <td>msi</td>\n",
|
|||
|
" <td>109990</td>\n",
|
|||
|
" <td>61</td>\n",
|
|||
|
" <td>9th</td>\n",
|
|||
|
" <td>intel</td>\n",
|
|||
|
" <td>i7</td>\n",
|
|||
|
" <td>9750H</td>\n",
|
|||
|
" <td>maximum performance</td>\n",
|
|||
|
" <td>6 GB</td>\n",
|
|||
|
" <td>nvidia geforce</td>\n",
|
|||
|
" <td>other</td>\n",
|
|||
|
" <td>6.0</td>\n",
|
|||
|
" <td>12.0</td>\n",
|
|||
|
" <td>other</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>other</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>other</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>531</th>\n",
|
|||
|
" <td>hp</td>\n",
|
|||
|
" <td>95800</td>\n",
|
|||
|
" <td>70</td>\n",
|
|||
|
" <td>9th</td>\n",
|
|||
|
" <td>intel</td>\n",
|
|||
|
" <td>i7</td>\n",
|
|||
|
" <td>9750H</td>\n",
|
|||
|
" <td>maximum performance</td>\n",
|
|||
|
" <td>4 GB</td>\n",
|
|||
|
" <td>nvidia geforce</td>\n",
|
|||
|
" <td>1650</td>\n",
|
|||
|
" <td>6.0</td>\n",
|
|||
|
" <td>12.0</td>\n",
|
|||
|
" <td>15.6</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>DDR4</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>other</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" brand_name price rating processor_gen processor_brand \\\n",
|
|||
|
"527 dell 75500 63 4th amd \n",
|
|||
|
"528 lenovo 151990 75 10th intel \n",
|
|||
|
"529 lenovo 46500 48 8th intel \n",
|
|||
|
"530 msi 109990 61 9th intel \n",
|
|||
|
"531 hp 95800 70 9th intel \n",
|
|||
|
"\n",
|
|||
|
" processor_segment CPU_mark CPU_performance Graphic_card_memory \\\n",
|
|||
|
"527 5 4600H maximum performance 6 GB \n",
|
|||
|
"528 i7 10875H maximum performance 8 GB \n",
|
|||
|
"529 i5 8250U ultra-low power Integrated \n",
|
|||
|
"530 i7 9750H maximum performance 6 GB \n",
|
|||
|
"531 i7 9750H maximum performance 4 GB \n",
|
|||
|
"\n",
|
|||
|
" graphic_card_name graphic_card_num Core threads display_inches \\\n",
|
|||
|
"527 amd radeon other 6.0 12.0 15.6 \n",
|
|||
|
"528 nvidia geforce other 8.0 16.0 15.6 \n",
|
|||
|
"529 Intel Integrated other 4.0 8.0 other \n",
|
|||
|
"530 nvidia geforce other 6.0 12.0 other \n",
|
|||
|
"531 nvidia geforce 1650 6.0 12.0 15.6 \n",
|
|||
|
"\n",
|
|||
|
" ram_storage ram_type operating_system SSD_storage \n",
|
|||
|
"527 8 DDR4 10.0 512 GB SSD \n",
|
|||
|
"528 16 DDR4 10.0 1 TB SSD \n",
|
|||
|
"529 4 DDR4 0.0 other \n",
|
|||
|
"530 8 other 0.0 other \n",
|
|||
|
"531 8 DDR4 0.0 other "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 10,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df = df.fillna(0) #Замена пустых значений на 0\n",
|
|||
|
"print(df.isnull().any())\n",
|
|||
|
"df.tail()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Разбиение на выборки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 22,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"ename": "ValueError",
|
|||
|
"evalue": "brand_name is not a column in the dataframe",
|
|||
|
"output_type": "error",
|
|||
|
"traceback": [
|
|||
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
|||
|
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
|
|||
|
"Cell \u001b[1;32mIn[22], line 46\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df_train, df_val, df_test\n\u001b[0;32m 44\u001b[0m data \u001b[38;5;241m=\u001b[39m df[[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrating\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprice\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mram_storage\u001b[39m\u001b[38;5;124m\"\u001b[39m]]\u001b[38;5;241m.\u001b[39mcopy()\n\u001b[1;32m---> 46\u001b[0m df_train, df_val, df_test \u001b[38;5;241m=\u001b[39m \u001b[43msplit_stratified_into_train_val_test\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 47\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstratify_colname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mbrand_name\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_train\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.60\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_val\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.20\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_test\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.20\u001b[39;49m\n\u001b[0;32m 48\u001b[0m \u001b[43m)\u001b[49m\n\u001b[0;32m 50\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mОбучающая выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_train\u001b[38;5;241m.\u001b[39mshape)\n\u001b[0;32m 52\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mКонтрольная выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_val\u001b[38;5;241m.\u001b[39mshape)\n",
|
|||
|
"Cell \u001b[1;32mIn[22], line 18\u001b[0m, in \u001b[0;36msplit_stratified_into_train_val_test\u001b[1;34m(df_input, stratify_colname, frac_train, frac_val, frac_test, random_state)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 13\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfractions \u001b[39m\u001b[38;5;132;01m%f\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m%f\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m%f\u001b[39;00m\u001b[38;5;124m do not add up to 1.0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 14\u001b[0m \u001b[38;5;241m%\u001b[39m (frac_train, frac_val, frac_test)\n\u001b[0;32m 15\u001b[0m )\n\u001b[0;32m 17\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m stratify_colname \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m df_input\u001b[38;5;241m.\u001b[39mcolumns:\n\u001b[1;32m---> 18\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m is not a column in the dataframe\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m (stratify_colname))\n\u001b[0;32m 20\u001b[0m X \u001b[38;5;241m=\u001b[39m df_input \u001b[38;5;66;03m# Contains all columns.\u001b[39;00m\n\u001b[0;32m 21\u001b[0m y \u001b[38;5;241m=\u001b[39m df_input[\n\u001b[0;32m 22\u001b[0m [stratify_colname]\n\u001b[0;32m 23\u001b[0m ] \u001b[38;5;66;03m# Dataframe of just the column on which to stratify.\u001b[39;00m\n",
|
|||
|
"\u001b[1;31mValueError\u001b[0m: brand_name is not a column in the dataframe"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"def split_stratified_into_train_val_test(\n",
|
|||
|
" df_input,\n",
|
|||
|
" stratify_colname=\"y\",\n",
|
|||
|
" frac_train=0.6,\n",
|
|||
|
" frac_val=0.15,\n",
|
|||
|
" frac_test=0.25,\n",
|
|||
|
" random_state=None,\n",
|
|||
|
"):\n",
|
|||
|
" if frac_train + frac_val + frac_test != 1.0:\n",
|
|||
|
" raise ValueError(\n",
|
|||
|
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
|
|||
|
" % (frac_train, frac_val, frac_test)\n",
|
|||
|
" )\n",
|
|||
|
"\n",
|
|||
|
" if stratify_colname not in df_input.columns:\n",
|
|||
|
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
|
|||
|
"\n",
|
|||
|
" X = df_input # Contains all columns.\n",
|
|||
|
" y = df_input[\n",
|
|||
|
" [stratify_colname]\n",
|
|||
|
" ] # Dataframe of just the column on which to stratify.\n",
|
|||
|
"\n",
|
|||
|
" # Split original dataframe into train and temp dataframes.\n",
|
|||
|
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
|
|||
|
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
|
|||
|
" )\n",
|
|||
|
"\n",
|
|||
|
" # Split the temp dataframe into val and test dataframes.\n",
|
|||
|
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
|
|||
|
" df_val, df_test, y_val, y_test = train_test_split(\n",
|
|||
|
" df_temp,\n",
|
|||
|
" y_temp,\n",
|
|||
|
" stratify=y_temp,\n",
|
|||
|
" test_size=relative_frac_test,\n",
|
|||
|
" random_state=random_state,\n",
|
|||
|
" )\n",
|
|||
|
"\n",
|
|||
|
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
|
|||
|
"\n",
|
|||
|
" return df_train, df_val, df_test\n",
|
|||
|
"\n",
|
|||
|
"data = df[[\"rating\", \"price\", \"ram_storage\"]].copy()\n",
|
|||
|
"\n",
|
|||
|
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
|
|||
|
" data, stratify_colname=\"rating\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Контрольная выборка: \", df_val.shape)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Тестовая выборка: \", df_test.shape)"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "kernel",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.5"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|