import pandas as pd

# Load the "Gaming Laptop Specs and Price" dataset from the CSV file that
# ships next to the notebook.
df = pd.read_csv(filepath_or_buffer="datasets/laptop.csv")

# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

# Show the first five rows as the cell's rich output.
df.head(n=5)
Бизнес-цели\n", + "Данный набор данных может помочь определить лидеров на рынке игровых ноутбуков.\n", + "В свою очередь определение лидеров поможет определить:\n", + "1. Какие модели игровых ноутбуков более выгодно продавать магазинам техники.\n", + "2. Какие комплектующие наиболее популярны у производителей и покупателей для дальнейшего увеличения производства данных комплектующих.\n", + "3. Определение популярных комплектующих для дальнейшей сборки других игровых ноутбуков новых версий.\n", + "#### Примеры целей технического проекта. Что поступает на вход, что является целевым признаком?\n", + "#### Проблемы набора данных и их решения\n", + "1. Возможны устаревшие данные, т.к. новые комплектующие выходят довольно часто. Для решения данной проблемы требуется удаление самых старых записей о ноутбуках и добавление более новых моделей.\n", + "2. Возможны выбросы, какие-то \"сверхъестественные сборки\". Решить эту проблему помогает удаление таких выбросов. В маленькой выборке не рекомендуется удалять выбросы. В большой выборке выбросы усредняются.\n", + "#### Качество набора данных\n", + "Набор данных содержит достаточно примеров и признаков для обучения модели. Учтены различные ситуации проблемной области. Данные соответствуют данным, которые будут\n", + "подаваться в производственной среде. 
# Share of missing values per column, in percent, computed for all columns at
# once instead of one column per loop iteration.
null_rates = df.isnull().sum() / len(df) * 100

# Report only the columns that actually contain gaps. The percent sign goes
# after the number ("2.26%"), not before it.
for column_name, null_rate in null_rates[null_rates > 0].items():
    print(f"{column_name} процент пустых значений: {null_rate:.2f}%")

# Replace every missing value with 0.
# NOTE(review): this also writes the integer 0 into text (object) columns such
# as processor_gen, leaving them with mixed str/int values. Confirm that this is
# acceptable before those columns are used as features.
df = df.fillna(0)

# Per-column check that no missing values remain (expected: all False).
print(df.isnull().any())

# Show the last five rows, where the filled-in zeros are visible.
df.tail()
\u001b[38;5;241m=\u001b[39m df[[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrating\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprice\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mram_storage\u001b[39m\u001b[38;5;124m\"\u001b[39m]]\u001b[38;5;241m.\u001b[39mcopy()\n\u001b[1;32m---> 46\u001b[0m df_train, df_val, df_test \u001b[38;5;241m=\u001b[39m \u001b[43msplit_stratified_into_train_val_test\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 47\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstratify_colname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mbrand_name\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_train\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.60\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_val\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.20\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_test\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.20\u001b[39;49m\n\u001b[0;32m 48\u001b[0m \u001b[43m)\u001b[49m\n\u001b[0;32m 50\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mОбучающая выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_train\u001b[38;5;241m.\u001b[39mshape)\n\u001b[0;32m 52\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mКонтрольная выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_val\u001b[38;5;241m.\u001b[39mshape)\n", + "Cell \u001b[1;32mIn[22], line 18\u001b[0m, in \u001b[0;36msplit_stratified_into_train_val_test\u001b[1;34m(df_input, stratify_colname, frac_train, frac_val, frac_test, random_state)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 13\u001b[0m 
def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
):
    """Split a dataframe into stratified train, validation and test subsets.

    The split is done in two passes of ``train_test_split``: first the input is
    divided into a training part and a temporary remainder, then the remainder
    is divided into validation and test parts. Both passes stratify on the
    ``stratify_colname`` column.

    Args:
        df_input: Dataframe to split. Every subset keeps all of its columns.
        stratify_colname: Name of the column to stratify on.
        frac_train: Fraction of rows for the training subset.
        frac_val: Fraction of rows for the validation subset.
        frac_test: Fraction of rows for the test subset.
        random_state: Forwarded unchanged to both ``train_test_split`` calls;
            pass an int to get the same split on every run.

    Returns:
        A tuple ``(df_train, df_val, df_test)``.

    Raises:
        ValueError: If the three fractions do not add up to 1.0 (within
            floating-point tolerance) or ``stratify_colname`` is not a column
            of ``df_input``.
    """
    import math

    # Compare with a tolerance instead of `!= 1.0`: sums such as
    # 0.7 + 0.2 + 0.1 evaluate to 0.9999999999999999 in binary floating point
    # and must not be rejected. NaN fractions still fail this check.
    frac_total = frac_train + frac_val + frac_test
    if not math.isclose(frac_total, 1.0, rel_tol=1e-9):
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    # Imported here, after argument validation, so that the function carries
    # its own dependency and does not rely on a cell-level import.
    from sklearn.model_selection import train_test_split

    X = df_input  # Contains all columns.
    y = df_input[[stratify_colname]]  # Dataframe of just the stratification column.

    # First pass: train vs. everything else.
    df_train, df_temp, _, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # Second pass: split the remainder into validation and test. The test share
    # is re-expressed relative to the size of the remainder.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, _, _ = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    # Sanity check: no row was lost or duplicated by the two passes.
    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test