From 3402751e112f42909e39e3ad601d3932163136a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=AF=D0=BA=D0=BE?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=B2?= Date: Fri, 4 Oct 2024 18:57:09 +0400 Subject: [PATCH 1/6] =?UTF-8?q?=D0=9E=D0=B1=D0=BD=D0=BE=D0=B2=D0=B8=D0=BB?= =?UTF-8?q?=20gitignore?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 880fad8..8c4caaa 100644 --- a/.gitignore +++ b/.gitignore @@ -208,3 +208,6 @@ kernel/share/jupyter/kernels/python3/logo-64x64.png kernel/share/jupyter/kernels/python3/logo-svg.svg kernel/share/man/man1/ipython.1 kernel/share/man/man1/ttx.1 +lab_2/datasets/game_reviews.csv +lab_2/datasets/laptop.csv +lab_2/datasets/Popular_PL.csv -- 2.25.1 From 617ecc5c665df6fb703ce365f5c8921a941317db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=AF=D0=BA=D0=BE?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=B2?= Date: Fri, 4 Oct 2024 20:21:29 +0400 Subject: [PATCH 2/6] =?UTF-8?q?=D0=BF=D1=80=D0=B0=D0=B2=D0=BE=D1=87=D0=BA?= =?UTF-8?q?=D0=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 8c4caaa..f53ce5a 100644 --- a/.gitignore +++ b/.gitignore @@ -211,3 +211,5 @@ kernel/share/man/man1/ttx.1 lab_2/datasets/game_reviews.csv lab_2/datasets/laptop.csv lab_2/datasets/Popular_PL.csv +lab_2/datasets/coffee.csv +lab_2/datasets/car_price_prediction.csv -- 2.25.1 From 003b2f8c7a6667862455256fc0b9229020880205 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=AF=D0=BA=D0=BE?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=B2?= Date: Fri, 4 Oct 2024 21:44:32 +0400 Subject: [PATCH 3/6] =?UTF-8?q?=D0=BD=D0=B8=D1=87=D0=B5=20=D0=BD=D0=B5=20?= =?UTF-8?q?=D1=80=D0=B0=D0=B1=D0=BE=D1=82=D0=B0=D0=B5=D1=82=20(=E2=95=A5?= 
=?UTF-8?q?=EF=B9=8F=E2=95=A5)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lab_2/lab_2.ipynb | 619 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 619 insertions(+) create mode 100644 lab_2/lab_2.ipynb diff --git a/lab_2/lab_2.ipynb b/lab_2/lab_2.ipynb new file mode 100644 index 0000000..dddaafe --- /dev/null +++ b/lab_2/lab_2.ipynb @@ -0,0 +1,619 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DataSet - \"Gaming Laptop Specs and Price\"\n", + "Данный датасет содержит данные о игровых ноутбуках." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 532 entries, 0 to 531\n", + "Data columns (total 18 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 brand_name 532 non-null object \n", + " 1 price 532 non-null int64 \n", + " 2 rating 532 non-null int64 \n", + " 3 processor_gen 520 non-null object \n", + " 4 processor_brand 532 non-null object \n", + " 5 processor_segment 528 non-null object \n", + " 6 CPU_mark 532 non-null object \n", + " 7 CPU_performance 532 non-null object \n", + " 8 Graphic_card_memory 530 non-null object \n", + " 9 graphic_card_name 530 non-null object \n", + " 10 graphic_card_num 532 non-null object \n", + " 11 Core 530 non-null float64\n", + " 12 threads 514 non-null float64\n", + " 13 display_inches 532 non-null object \n", + " 14 ram_storage 532 non-null int64 \n", + " 15 ram_type 532 non-null object \n", + " 16 operating_system 502 non-null float64\n", + " 17 SSD_storage 532 non-null object \n", + "dtypes: float64(3), int64(3), object(12)\n", + "memory usage: 74.9+ KB\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
brand_namepriceratingprocessor_genprocessor_brandprocessor_segmentCPU_markCPU_performanceGraphic_card_memorygraphic_card_namegraphic_card_numCorethreadsdisplay_inchesram_storageram_typeoperating_systemSSD_storage
0hp49490705thamd55600Hmaximum performance4 GBamd radeonother6.012.015.68DDR411.0512 GB SSD
1xiaomi1029907814thinteli914900HXmaximum performance8 GBnvidia geforce406024.032.0other16DDR511.01 TB SSD
2hp81490737thamd77840HShigh efficiency6 GBnvidia geforce30508.016.0other16DDR511.01 TB SSD
3asus499906411thinteli511400Hmaximum performance4 GBnvidia geforce20506.012.015.68DDR411.0512 GB SSD
4asus529906611thinteli511400Hmaximum performance4 GBnvidia geforce20506.012.015.616DDR411.0512 GB SSD
\n", + "
" + ], + "text/plain": [ + " brand_name price rating processor_gen processor_brand processor_segment \\\n", + "0 hp 49490 70 5th amd 5 \n", + "1 xiaomi 102990 78 14th intel i9 \n", + "2 hp 81490 73 7th amd 7 \n", + "3 asus 49990 64 11th intel i5 \n", + "4 asus 52990 66 11th intel i5 \n", + "\n", + " CPU_mark CPU_performance Graphic_card_memory graphic_card_name \\\n", + "0 5600H maximum performance 4 GB amd radeon \n", + "1 14900HX maximum performance 8 GB nvidia geforce \n", + "2 7840HS high efficiency 6 GB nvidia geforce \n", + "3 11400H maximum performance 4 GB nvidia geforce \n", + "4 11400H maximum performance 4 GB nvidia geforce \n", + "\n", + " graphic_card_num Core threads display_inches ram_storage ram_type \\\n", + "0 other 6.0 12.0 15.6 8 DDR4 \n", + "1 4060 24.0 32.0 other 16 DDR5 \n", + "2 3050 8.0 16.0 other 16 DDR5 \n", + "3 2050 6.0 12.0 15.6 8 DDR4 \n", + "4 2050 6.0 12.0 15.6 16 DDR4 \n", + "\n", + " operating_system SSD_storage \n", + "0 11.0 512 GB SSD \n", + "1 11.0 1 TB SSD \n", + "2 11.0 1 TB SSD \n", + "3 11.0 512 GB SSD \n", + "4 11.0 512 GB SSD " + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\"datasets/laptop.csv\")\n", + "df.info()\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Проблемная область\n", + "Данный датасет позволяет проанализировать данные, и понять какие ноутбуки имеют превосходство на рынке техники, и какие чаще всего выбирают пользователи.\n", + "#### Анализ набора данных\n", + "Объекты наблюдения - игровые ноутбуки\n", + "Атрибуты - Имя бренда, цена, рейтинг, поколение процессора, марка процессора, сегмент процессора, специальная метка, производительность процессора, видеокарта, видеопамять, номер видеокарты, кол-во потоков видеокарты, размер дисплея, оперативная память, тип оперативной памяти, ОС, SSD.\n", + "Связи между объектами - нет\n", + "#### 
Бизнес-цели\n", + "Данный набор данных может помочь определить лидеров на рынке игровых ноутбуков.\n", + "В свою очередь определение лидеров поможет определить:\n", + "1. Какие модели игровых ноутбуков более выгодно продавать магазинам техники.\n", + "2. Какие комплектующие наиболее популярны у производителей и покупателей — для дальнейшего увеличения производства этих комплектующих.\n", + "3. Какие популярные комплектующие стоит использовать при дальнейшей сборке новых версий игровых ноутбуков.\n", + "#### Примеры целей технического проекта. Что поступает на вход, что является целевым признаком?\n", + "#### Проблемы набора данных и их решения\n", + "1. Возможны устаревшие данные, т.к. новые комплектующие выходят довольно часто. Для решения данной проблемы требуется удаление самых старых записей о ноутбуках и добавление более новых моделей.\n", + "2. Возможны выбросы, какие-то \"сверхъестественные сборки\". Решить эту проблему помогает удаление таких выбросов. В маленькой выборке не рекомендуется удалять выбросы. В большой выборке выбросы усредняются.\n", + "#### Качество набора данных\n", + "Набор данных содержит достаточно примеров и признаков для обучения модели. Учтены различные ситуации проблемной области. Данные соответствуют данным, которые будут\n", + "подаваться в производственной среде. 
Все метки согласованы.\n", + "#### Проблема пропущенных данных" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "processor_gen процент пустых значений: %2.26\n", + "processor_segment процент пустых значений: %0.75\n", + "Graphic_card_memory процент пустых значений: %0.38\n", + "graphic_card_name процент пустых значений: %0.38\n", + "Core процент пустых значений: %0.38\n", + "threads процент пустых значений: %3.38\n", + "operating_system процент пустых значений: %5.64\n" + ] + } + ], + "source": [ + "for i in df.columns:\n", + " null_rate = df[i].isnull().sum() / len(df)*100\n", + " if null_rate > 0:\n", + " print(f\"{i} процент пустых значений: %{null_rate:.2f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "brand_name False\n", + "price False\n", + "rating False\n", + "processor_gen False\n", + "processor_brand False\n", + "processor_segment False\n", + "CPU_mark False\n", + "CPU_performance False\n", + "Graphic_card_memory False\n", + "graphic_card_name False\n", + "graphic_card_num False\n", + "Core False\n", + "threads False\n", + "display_inches False\n", + "ram_storage False\n", + "ram_type False\n", + "operating_system False\n", + "SSD_storage False\n", + "dtype: bool\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
brand_namepriceratingprocessor_genprocessor_brandprocessor_segmentCPU_markCPU_performanceGraphic_card_memorygraphic_card_namegraphic_card_numCorethreadsdisplay_inchesram_storageram_typeoperating_systemSSD_storage
527dell75500634thamd54600Hmaximum performance6 GBamd radeonother6.012.015.68DDR410.0512 GB SSD
528lenovo1519907510thinteli710875Hmaximum performance8 GBnvidia geforceother8.016.015.616DDR410.01 TB SSD
529lenovo46500488thinteli58250Uultra-low powerIntegratedIntel Integratedother4.08.0other4DDR40.0other
530msi109990619thinteli79750Hmaximum performance6 GBnvidia geforceother6.012.0other8other0.0other
531hp95800709thinteli79750Hmaximum performance4 GBnvidia geforce16506.012.015.68DDR40.0other
\n", + "
" + ], + "text/plain": [ + " brand_name price rating processor_gen processor_brand \\\n", + "527 dell 75500 63 4th amd \n", + "528 lenovo 151990 75 10th intel \n", + "529 lenovo 46500 48 8th intel \n", + "530 msi 109990 61 9th intel \n", + "531 hp 95800 70 9th intel \n", + "\n", + " processor_segment CPU_mark CPU_performance Graphic_card_memory \\\n", + "527 5 4600H maximum performance 6 GB \n", + "528 i7 10875H maximum performance 8 GB \n", + "529 i5 8250U ultra-low power Integrated \n", + "530 i7 9750H maximum performance 6 GB \n", + "531 i7 9750H maximum performance 4 GB \n", + "\n", + " graphic_card_name graphic_card_num Core threads display_inches \\\n", + "527 amd radeon other 6.0 12.0 15.6 \n", + "528 nvidia geforce other 8.0 16.0 15.6 \n", + "529 Intel Integrated other 4.0 8.0 other \n", + "530 nvidia geforce other 6.0 12.0 other \n", + "531 nvidia geforce 1650 6.0 12.0 15.6 \n", + "\n", + " ram_storage ram_type operating_system SSD_storage \n", + "527 8 DDR4 10.0 512 GB SSD \n", + "528 16 DDR4 10.0 1 TB SSD \n", + "529 4 DDR4 0.0 other \n", + "530 8 other 0.0 other \n", + "531 8 DDR4 0.0 other " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.fillna(0) #Замена пустых значений на 0\n", + "print(df.isnull().any())\n", + "df.tail()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Разбиение на выборки" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "brand_name is not a column in the dataframe", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[22], line 46\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df_train, df_val, df_test\n\u001b[0;32m 44\u001b[0m data 
\u001b[38;5;241m=\u001b[39m df[[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrating\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprice\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mram_storage\u001b[39m\u001b[38;5;124m\"\u001b[39m]]\u001b[38;5;241m.\u001b[39mcopy()\n\u001b[1;32m---> 46\u001b[0m df_train, df_val, df_test \u001b[38;5;241m=\u001b[39m \u001b[43msplit_stratified_into_train_val_test\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 47\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstratify_colname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mbrand_name\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_train\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.60\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_val\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.20\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_test\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.20\u001b[39;49m\n\u001b[0;32m 48\u001b[0m \u001b[43m)\u001b[49m\n\u001b[0;32m 50\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mОбучающая выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_train\u001b[38;5;241m.\u001b[39mshape)\n\u001b[0;32m 52\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mКонтрольная выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_val\u001b[38;5;241m.\u001b[39mshape)\n", + "Cell \u001b[1;32mIn[22], line 18\u001b[0m, in \u001b[0;36msplit_stratified_into_train_val_test\u001b[1;34m(df_input, stratify_colname, frac_train, frac_val, frac_test, random_state)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 13\u001b[0m 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfractions \u001b[39m\u001b[38;5;132;01m%f\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m%f\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m%f\u001b[39;00m\u001b[38;5;124m do not add up to 1.0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 14\u001b[0m \u001b[38;5;241m%\u001b[39m (frac_train, frac_val, frac_test)\n\u001b[0;32m 15\u001b[0m )\n\u001b[0;32m 17\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m stratify_colname \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m df_input\u001b[38;5;241m.\u001b[39mcolumns:\n\u001b[1;32m---> 18\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m is not a column in the dataframe\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m (stratify_colname))\n\u001b[0;32m 20\u001b[0m X \u001b[38;5;241m=\u001b[39m df_input \u001b[38;5;66;03m# Contains all columns.\u001b[39;00m\n\u001b[0;32m 21\u001b[0m y \u001b[38;5;241m=\u001b[39m df_input[\n\u001b[0;32m 22\u001b[0m [stratify_colname]\n\u001b[0;32m 23\u001b[0m ] \u001b[38;5;66;03m# Dataframe of just the column on which to stratify.\u001b[39;00m\n", + "\u001b[1;31mValueError\u001b[0m: brand_name is not a column in the dataframe" + ] + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "def split_stratified_into_train_val_test(\n", + " df_input,\n", + " stratify_colname=\"y\",\n", + " frac_train=0.6,\n", + " frac_val=0.15,\n", + " frac_test=0.25,\n", + " random_state=None,\n", + "):\n", + " if frac_train + frac_val + frac_test != 1.0:\n", + " raise ValueError(\n", + " \"fractions %f, %f, %f do not add up to 1.0\"\n", + " % (frac_train, frac_val, frac_test)\n", + " )\n", + "\n", + " if stratify_colname not in df_input.columns:\n", + " raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n", + "\n", + " X = 
df_input # Contains all columns.\n", + " y = df_input[\n", + " [stratify_colname]\n", + " ] # Dataframe of just the column on which to stratify.\n", + "\n", + " # Split original dataframe into train and temp dataframes.\n", + " df_train, df_temp, y_train, y_temp = train_test_split(\n", + " X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n", + " )\n", + "\n", + " # Split the temp dataframe into val and test dataframes.\n", + " relative_frac_test = frac_test / (frac_val + frac_test)\n", + " df_val, df_test, y_val, y_test = train_test_split(\n", + " df_temp,\n", + " y_temp,\n", + " stratify=y_temp,\n", + " test_size=relative_frac_test,\n", + " random_state=random_state,\n", + " )\n", + "\n", + " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n", + "\n", + " return df_train, df_val, df_test\n", + "\n", + "data = df[[\"rating\", \"price\", \"ram_storage\"]].copy()\n", + "\n", + "df_train, df_val, df_test = split_stratified_into_train_val_test(\n", + " data, stratify_colname=\"rating\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n", + ")\n", + "\n", + "print(\"Обучающая выборка: \", df_train.shape)\n", + "\n", + "print(\"Контрольная выборка: \", df_val.shape)\n", + "\n", + "print(\"Тестовая выборка: \", df_test.shape)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "kernel", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} -- 2.25.1 From fdbdb674e81803f471920c4c67fc2d8780b458ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=AF=D0=BA=D0=BE?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=B2?= Date: Fri, 4 Oct 2024 21:50:20 +0400 Subject: [PATCH 4/6] 
=?UTF-8?q?=D0=B2=D1=81=D0=B5=20=D0=B5=D1=89=D0=B5=20?= =?UTF-8?q?=D0=BD=D0=B0=20=D1=80=D0=B0=D0=B1=D0=BE=D1=82=D0=B0=D0=B5=D1=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lab_2/lab_2.ipynb | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/lab_2/lab_2.ipynb b/lab_2/lab_2.ipynb index dddaafe..443a71e 100644 --- a/lab_2/lab_2.ipynb +++ b/lab_2/lab_2.ipynb @@ -521,19 +521,23 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 24, "metadata": {}, "outputs": [ { "ename": "ValueError", - "evalue": "brand_name is not a column in the dataframe", + "evalue": "The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[22], line 46\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df_train, df_val, df_test\n\u001b[0;32m 44\u001b[0m data \u001b[38;5;241m=\u001b[39m df[[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrating\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprice\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mram_storage\u001b[39m\u001b[38;5;124m\"\u001b[39m]]\u001b[38;5;241m.\u001b[39mcopy()\n\u001b[1;32m---> 46\u001b[0m df_train, df_val, df_test \u001b[38;5;241m=\u001b[39m \u001b[43msplit_stratified_into_train_val_test\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 47\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstratify_colname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mbrand_name\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mfrac_train\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.60\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_val\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.20\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_test\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.20\u001b[39;49m\n\u001b[0;32m 48\u001b[0m \u001b[43m)\u001b[49m\n\u001b[0;32m 50\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mОбучающая выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_train\u001b[38;5;241m.\u001b[39mshape)\n\u001b[0;32m 52\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mКонтрольная выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_val\u001b[38;5;241m.\u001b[39mshape)\n", - "Cell \u001b[1;32mIn[22], line 18\u001b[0m, in \u001b[0;36msplit_stratified_into_train_val_test\u001b[1;34m(df_input, stratify_colname, frac_train, frac_val, frac_test, random_state)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 13\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfractions \u001b[39m\u001b[38;5;132;01m%f\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m%f\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m%f\u001b[39;00m\u001b[38;5;124m do not add up to 1.0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 14\u001b[0m \u001b[38;5;241m%\u001b[39m (frac_train, frac_val, frac_test)\n\u001b[0;32m 15\u001b[0m )\n\u001b[0;32m 17\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m stratify_colname \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m df_input\u001b[38;5;241m.\u001b[39mcolumns:\n\u001b[1;32m---> 18\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m is not a column in the 
dataframe\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m (stratify_colname))\n\u001b[0;32m 20\u001b[0m X \u001b[38;5;241m=\u001b[39m df_input \u001b[38;5;66;03m# Contains all columns.\u001b[39;00m\n\u001b[0;32m 21\u001b[0m y \u001b[38;5;241m=\u001b[39m df_input[\n\u001b[0;32m 22\u001b[0m [stratify_colname]\n\u001b[0;32m 23\u001b[0m ] \u001b[38;5;66;03m# Dataframe of just the column on which to stratify.\u001b[39;00m\n", - "\u001b[1;31mValueError\u001b[0m: brand_name is not a column in the dataframe" + "Cell \u001b[1;32mIn[24], line 46\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df_train, df_val, df_test\n\u001b[0;32m 44\u001b[0m data \u001b[38;5;241m=\u001b[39m df[[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrating\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprice\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mram_storage\u001b[39m\u001b[38;5;124m\"\u001b[39m]]\u001b[38;5;241m.\u001b[39mcopy()\n\u001b[1;32m---> 46\u001b[0m df_train, df_val, df_test \u001b[38;5;241m=\u001b[39m \u001b[43msplit_stratified_into_train_val_test\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 47\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstratify_colname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrating\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_train\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.60\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_val\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.20\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_test\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.20\u001b[39;49m\n\u001b[0;32m 48\u001b[0m \u001b[43m)\u001b[49m\n\u001b[0;32m 50\u001b[0m 
\u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mОбучающая выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_train\u001b[38;5;241m.\u001b[39mshape)\n\u001b[0;32m 52\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mКонтрольная выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_val\u001b[38;5;241m.\u001b[39mshape)\n", + "Cell \u001b[1;32mIn[24], line 26\u001b[0m, in \u001b[0;36msplit_stratified_into_train_val_test\u001b[1;34m(df_input, stratify_colname, frac_train, frac_val, frac_test, random_state)\u001b[0m\n\u001b[0;32m 21\u001b[0m y \u001b[38;5;241m=\u001b[39m df_input[\n\u001b[0;32m 22\u001b[0m [stratify_colname]\n\u001b[0;32m 23\u001b[0m ] \u001b[38;5;66;03m# Dataframe of just the column on which to stratify.\u001b[39;00m\n\u001b[0;32m 25\u001b[0m \u001b[38;5;66;03m# Split original dataframe into train and temp dataframes.\u001b[39;00m\n\u001b[1;32m---> 26\u001b[0m df_train, df_temp, y_train, y_temp \u001b[38;5;241m=\u001b[39m \u001b[43mtrain_test_split\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 27\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstratify\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m1.0\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mfrac_train\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrandom_state\u001b[49m\n\u001b[0;32m 28\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 30\u001b[0m \u001b[38;5;66;03m# Split the temp dataframe into val and test dataframes.\u001b[39;00m\n\u001b[0;32m 31\u001b[0m relative_frac_test 
\u001b[38;5;241m=\u001b[39m frac_test \u001b[38;5;241m/\u001b[39m (frac_val \u001b[38;5;241m+\u001b[39m frac_test)\n", + "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:213\u001b[0m, in \u001b[0;36mvalidate_params..decorator..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 208\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 209\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 210\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 211\u001b[0m )\n\u001b[0;32m 212\u001b[0m ):\n\u001b[1;32m--> 213\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 215\u001b[0m \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m 216\u001b[0m \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m 217\u001b[0m \u001b[38;5;66;03m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[0;32m 218\u001b[0m \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m 219\u001b[0m msg \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msub(\n\u001b[0;32m 220\u001b[0m \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+ must 
be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 221\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 222\u001b[0m \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[0;32m 223\u001b[0m )\n", + "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:2806\u001b[0m, in \u001b[0;36mtrain_test_split\u001b[1;34m(test_size, train_size, random_state, shuffle, stratify, *arrays)\u001b[0m\n\u001b[0;32m 2802\u001b[0m CVClass \u001b[38;5;241m=\u001b[39m ShuffleSplit\n\u001b[0;32m 2804\u001b[0m cv \u001b[38;5;241m=\u001b[39m CVClass(test_size\u001b[38;5;241m=\u001b[39mn_test, train_size\u001b[38;5;241m=\u001b[39mn_train, random_state\u001b[38;5;241m=\u001b[39mrandom_state)\n\u001b[1;32m-> 2806\u001b[0m train, test \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcv\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43marrays\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstratify\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2808\u001b[0m train, test \u001b[38;5;241m=\u001b[39m ensure_common_namespace_device(arrays[\u001b[38;5;241m0\u001b[39m], train, test)\n\u001b[0;32m 2810\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(\n\u001b[0;32m 2811\u001b[0m chain\u001b[38;5;241m.\u001b[39mfrom_iterable(\n\u001b[0;32m 2812\u001b[0m (_safe_indexing(a, train), _safe_indexing(a, test)) 
\u001b[38;5;28;01mfor\u001b[39;00m a \u001b[38;5;129;01min\u001b[39;00m arrays\n\u001b[0;32m 2813\u001b[0m )\n\u001b[0;32m 2814\u001b[0m )\n", + "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:1843\u001b[0m, in \u001b[0;36mBaseShuffleSplit.split\u001b[1;34m(self, X, y, groups)\u001b[0m\n\u001b[0;32m 1813\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Generate indices to split data into training and test set.\u001b[39;00m\n\u001b[0;32m 1814\u001b[0m \n\u001b[0;32m 1815\u001b[0m \u001b[38;5;124;03mParameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1840\u001b[0m \u001b[38;5;124;03mto an integer.\u001b[39;00m\n\u001b[0;32m 1841\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 1842\u001b[0m X, y, groups \u001b[38;5;241m=\u001b[39m indexable(X, y, groups)\n\u001b[1;32m-> 1843\u001b[0m \u001b[43m\u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_iter_indices\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgroups\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[0;32m 1844\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01myield\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\n", + "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:2252\u001b[0m, in \u001b[0;36mStratifiedShuffleSplit._iter_indices\u001b[1;34m(self, X, y, groups)\u001b[0m\n\u001b[0;32m 2250\u001b[0m class_counts 
\u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mbincount(y_indices)\n\u001b[0;32m 2251\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m np\u001b[38;5;241m.\u001b[39mmin(class_counts) \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m2\u001b[39m:\n\u001b[1;32m-> 2252\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 2253\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe least populated class in y has only 1\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2254\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m member, which is too few. The minimum\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2255\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m number of groups for any class cannot\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2256\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m be less than 2.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2257\u001b[0m )\n\u001b[0;32m 2259\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m n_train \u001b[38;5;241m<\u001b[39m n_classes:\n\u001b[0;32m 2260\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 2261\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe train_size = \u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m should be greater or \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2262\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mequal to the number of classes = \u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m (n_train, n_classes)\n\u001b[0;32m 2263\u001b[0m )\n", + "\u001b[1;31mValueError\u001b[0m: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2." 
] } ], -- 2.25.1 From d19b7e0793932ff2a5a0579a5a854ff94921f43e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=AF=D0=BA=D0=BE?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=B2?= Date: Fri, 18 Oct 2024 18:40:10 +0400 Subject: [PATCH 5/6] =?UTF-8?q?=D0=9D=D0=B0=D0=BA=D0=BE=D0=BD=D0=B5=D1=86?= =?UTF-8?q?=20=D1=82=D0=BE=20=D0=B2=D1=81=D0=B5=20(=E2=95=A5=EF=B9=8F?= =?UTF-8?q?=E2=95=A5)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lab_2/lab_2.ipynb | 659 +++++++++++++++++++++++++++++----------------- 1 file changed, 422 insertions(+), 237 deletions(-) diff --git a/lab_2/lab_2.ipynb b/lab_2/lab_2.ipynb index 443a71e..11cab85 100644 --- a/lab_2/lab_2.ipynb +++ b/lab_2/lab_2.ipynb @@ -4,13 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### DataSet - \"Gaming Laptop Specs and Price\"\n", - "Данный датасет содержит данные о игровых ноутбуках." + "### Lab2 PIbd-31 Yakovlev\n", + "Загрузим три датасета" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 110, "metadata": {}, "outputs": [ { @@ -41,201 +41,58 @@ " 16 operating_system 502 non-null float64\n", " 17 SSD_storage 532 non-null object \n", "dtypes: float64(3), int64(3), object(12)\n", - "memory usage: 74.9+ KB\n" + "memory usage: 74.9+ KB\n", + "\n", + "RangeIndex: 8036 entries, 0 to 8035\n", + "Data columns (total 7 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Date 8036 non-null object \n", + " 1 Open 8036 non-null float64\n", + " 2 High 8036 non-null float64\n", + " 3 Low 8036 non-null float64\n", + " 4 Close 8036 non-null float64\n", + " 5 Adj Close 8036 non-null float64\n", + " 6 Volume 8036 non-null int64 \n", + "dtypes: float64(5), int64(1), object(1)\n", + "memory usage: 439.6+ KB\n", + "\n", + "RangeIndex: 19237 entries, 0 to 19236\n", + "Data columns (total 18 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- 
\n", + " 0 ID 19237 non-null int64 \n", + " 1 Price 19237 non-null int64 \n", + " 2 Levy 19237 non-null object \n", + " 3 Manufacturer 19237 non-null object \n", + " 4 Model 19237 non-null object \n", + " 5 Prod. year 19237 non-null int64 \n", + " 6 Category 19237 non-null object \n", + " 7 Leather interior 19237 non-null object \n", + " 8 Fuel type 19237 non-null object \n", + " 9 Engine volume 19237 non-null object \n", + " 10 Mileage 19237 non-null object \n", + " 11 Cylinders 19237 non-null float64\n", + " 12 Gear box type 19237 non-null object \n", + " 13 Drive wheels 19237 non-null object \n", + " 14 Doors 19237 non-null object \n", + " 15 Wheel 19237 non-null object \n", + " 16 Color 19237 non-null object \n", + " 17 Airbags 19237 non-null int64 \n", + "dtypes: float64(1), int64(4), object(13)\n", + "memory usage: 2.6+ MB\n" ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
brand_namepriceratingprocessor_genprocessor_brandprocessor_segmentCPU_markCPU_performanceGraphic_card_memorygraphic_card_namegraphic_card_numCorethreadsdisplay_inchesram_storageram_typeoperating_systemSSD_storage
0hp49490705thamd55600Hmaximum performance4 GBamd radeonother6.012.015.68DDR411.0512 GB SSD
1xiaomi1029907814thinteli914900HXmaximum performance8 GBnvidia geforce406024.032.0other16DDR511.01 TB SSD
2hp81490737thamd77840HShigh efficiency6 GBnvidia geforce30508.016.0other16DDR511.01 TB SSD
3asus499906411thinteli511400Hmaximum performance4 GBnvidia geforce20506.012.015.68DDR411.0512 GB SSD
4asus529906611thinteli511400Hmaximum performance4 GBnvidia geforce20506.012.015.616DDR411.0512 GB SSD
\n", - "
" - ], - "text/plain": [ - " brand_name price rating processor_gen processor_brand processor_segment \\\n", - "0 hp 49490 70 5th amd 5 \n", - "1 xiaomi 102990 78 14th intel i9 \n", - "2 hp 81490 73 7th amd 7 \n", - "3 asus 49990 64 11th intel i5 \n", - "4 asus 52990 66 11th intel i5 \n", - "\n", - " CPU_mark CPU_performance Graphic_card_memory graphic_card_name \\\n", - "0 5600H maximum performance 4 GB amd radeon \n", - "1 14900HX maximum performance 8 GB nvidia geforce \n", - "2 7840HS high efficiency 6 GB nvidia geforce \n", - "3 11400H maximum performance 4 GB nvidia geforce \n", - "4 11400H maximum performance 4 GB nvidia geforce \n", - "\n", - " graphic_card_num Core threads display_inches ram_storage ram_type \\\n", - "0 other 6.0 12.0 15.6 8 DDR4 \n", - "1 4060 24.0 32.0 other 16 DDR5 \n", - "2 3050 8.0 16.0 other 16 DDR5 \n", - "3 2050 6.0 12.0 15.6 8 DDR4 \n", - "4 2050 6.0 12.0 15.6 16 DDR4 \n", - "\n", - " operating_system SSD_storage \n", - "0 11.0 512 GB SSD \n", - "1 11.0 1 TB SSD \n", - "2 11.0 1 TB SSD \n", - "3 11.0 512 GB SSD \n", - "4 11.0 512 GB SSD " - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "df = pd.read_csv(\"datasets/laptop.csv\")\n", + "df2 = pd.read_csv(\"datasets/coffee.csv\")\n", + "df3 = pd.read_csv(\"datasets/car_price_prediction.csv\")\n", "df.info()\n", - "df.head()" + "df2.info()\n", + "df3.info()" ] }, { @@ -243,56 +100,169 @@ "metadata": {}, "source": [ "#### Проблемная область\n", - "Данный датасет позволяет проанализировать данные, и понять какие ноутбуки имеют превосходство на рынке техники, и какие чаще всего выбирают пользователи.\n", + "Первый датасет позволяет проанализировать данные, и понять какие ноутбуки имеют превосходство на рынке техники, и какие чаще всего выбирают пользователи.\n", + "Второй датасет позволяет при помощи данных акций за последние 25 лет спрогнозировать будущие показатели акций кофейни Starbucks\n", + "Третий 
датасет позволяет проанализировать данные, и спрогнозировать категорию цены для машины, по ее комплектующим.\n", "#### Анализ набора данных\n", - "Объекты наблюдения - игровые ноутбуки\n", - "Атрибуты - Имя бренда, цена, рейтинг, поколение процессора, марка процессора, сегмент процессора, специальная метка, производительность процессора, видеокарта, видеопамять, номер видеокарты, кол-во потоков видеокарты, размер дисплея, оперативная память, тип оперативной памяти, ОС, SSD.\n", + "Объекты наблюдения - игровые ноутбуки, акции, машины\n", + "Атрибуты - \n", + "1. Имя бренда, цена, рейтинг, поколение процессора, марка процессора, сегмент процессора, специальная метка, производительность процессора, видеокарта, видеопамять, номер видеокарты, кол-во потоков видеокарты, размер дисплея, оперативная память, тип оперативной памяти, ОС, SSD.\n", + "2. Дата, начальная цена за день, максимальная цена, минимальная цена, цена на момент закрытия продаж, скорректированая цена на момент закрытия, объем торговли акций за день.\n", + "3. Цена обслуживания, производитель, модель, год выпуска, категория, кожанный салон, тип топлива, объем двигателя.\n", "Связи между объектами - нет\n", "#### Бизнес-цели\n", - "Данный набор данных может помочь определить лидеров на рынке игровых ноутбуков.\n", - "В свою очередь определение лидеров поможет определить:\n", - "1. Какие модели игровых ноутбуков более выгодно продавать магазинам техники.\n", - "2. Какие комплектующие наиболее популярны у производителей и покупателей, для дальнейшего увеличения производства данных комплектующих.\n", - "3. Определение популярных комлпектующих, для дальнейшей сборки других игровых ноутбуков новых версий.\n", - "#### Примеры целей технического проекта. Что поступает на вход, что является целевым признаком?????????\n", + "1. Какие модели игровых ноутбуков более выгодно продавать магазинам техники. 
Какие комплектующие наиболее популярны у производителей и покупателей, для дальнейшего увеличения производства данных комплектующих.\n", + "2. Прогноз цен, для дальнешей покупки, продажи акций. Прогнозирование, для предотвращения упадка.\n", + "3. Для составления списка лучших моделей автомобилей. Определения наилучшего буджетного автомобиля, который не будет часто ломаться и приносить убытки.\n", + "#### Примеры целей технического проекта. Что поступает на вход, что является целевым признаком.\n", + "На входе всегда датасет, целевые признаки:\n", + "1. Рейтинг ноутбука\n", + "2. Максимальная цена за день\n", + "3. Цена обслуживания\n", "#### Проблемы набора данных и их решения\n", - "1. Возможны устаревшие данные, т.к. новые комплектующие выходят довольно часто. Для решения данной проблемы требуется удаление самых старых записей о ноутбуках, и добавление более новых моделей.\n", - "2. Возможны выбросы, какие-то \"сверхестественные сборки\". Решить эту проблему помогает удаление таких выбросов. В маленькой выборке не рекомендуется удалять выбросы. В большой выборке выбросы усредняются.\n", + "1. Возможны устаревшие данные. Для решения данной проблемы требуется удаление самых старых записей, и добавление более новых.\n", + "2. Возможны выбросы. Решить эту проблему помогает удаление таких выбросов. В маленькой выборке не рекомендуется удалять выбросы. В большой выборке выбросы усредняются.\n", "#### Качество набора данных\n", - "Набор данных содержит достаточно примеров и признаков для обучения модели. Учтены различные ситуации проблемной области. Данные соответствуют данным, которые будут\n", - "подаваться в производственной среде. Все метки согласованы.\n", - "#### Проблема пропущенных данных" + "Наборы данных содержат достаточно примеров и признаков для обучения модели. Учтены различные ситуации проблемной области. Данные соответствуют данным, которые будут\n", + "подаваться в производственной среде. 
Все метки согласованы.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Поиск аномалий" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 111, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + " price rating Core threads ram_storage \\\n", + "count 532.000000 532.000000 530.000000 514.000000 532.000000 \n", + "mean 107684.492481 67.781955 9.035849 15.089494 15.676692 \n", + "std 80187.648965 8.161356 4.413487 5.216162 8.901257 \n", + "min 30999.000000 43.000000 2.000000 4.000000 4.000000 \n", + "25% 62371.750000 63.000000 6.000000 12.000000 8.000000 \n", + "50% 83745.000000 66.000000 8.000000 16.000000 16.000000 \n", + "75% 114040.000000 72.000000 10.000000 16.000000 16.000000 \n", + "max 599990.000000 98.000000 24.000000 32.000000 64.000000 \n", + "\n", + " operating_system \n", + "count 502.000000 \n", + "mean 10.842629 \n", + "std 0.364513 \n", + "min 10.000000 \n", + "25% 11.000000 \n", + "50% 11.000000 \n", + "75% 11.000000 \n", + "max 11.000000 \n", + " Open High Low Close Adj Close \\\n", + "count 8036.000000 8036.000000 8036.000000 8036.000000 8036.000000 \n", + "mean 30.054280 30.351487 29.751322 30.058857 26.674025 \n", + "std 33.615577 33.906613 33.314569 33.615911 31.728090 \n", + "min 0.328125 0.347656 0.320313 0.335938 0.260703 \n", + "25% 4.392031 4.531250 4.304922 4.399610 3.414300 \n", + "50% 13.325000 13.493750 13.150000 13.330000 10.352452 \n", + "75% 55.250000 55.722501 54.852499 55.267499 47.464829 \n", + "max 126.080002 126.320000 124.809998 126.059998 118.010414 \n", + "\n", + " Volume \n", + "count 8.036000e+03 \n", + "mean 1.470459e+07 \n", + "std 1.340021e+07 \n", + "min 1.504000e+06 \n", + "25% 7.817750e+06 \n", + "50% 1.169815e+07 \n", + "75% 1.778795e+07 \n", + "max 5.855088e+08 \n", + " ID Price Prod. 
year Cylinders Airbags\n", + "count 1.923700e+04 1.923700e+04 19237.000000 19237.000000 19237.000000\n", + "mean 4.557654e+07 1.855593e+04 2010.912824 4.582991 6.582627\n", + "std 9.365914e+05 1.905813e+05 5.668673 1.199933 4.320168\n", + "min 2.074688e+07 1.000000e+00 1939.000000 1.000000 0.000000\n", + "25% 4.569837e+07 5.331000e+03 2009.000000 4.000000 4.000000\n", + "50% 4.577231e+07 1.317200e+04 2012.000000 4.000000 6.000000\n", + "75% 4.580204e+07 2.207500e+04 2015.000000 4.000000 12.000000\n", + "max 4.581665e+07 2.630750e+07 2020.000000 16.000000 16.000000\n" + ] + } + ], + "source": [ + "print(df.describe())\n", + "print(df2.describe())\n", + "print(df3.describe())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "При просмотре вывода не было замечено аномалий в столбцах датасетов." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Проблема пропущенных данных" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DATASET 1\n", "processor_gen процент пустых значений: %2.26\n", "processor_segment процент пустых значений: %0.75\n", "Graphic_card_memory процент пустых значений: %0.38\n", "graphic_card_name процент пустых значений: %0.38\n", "Core процент пустых значений: %0.38\n", "threads процент пустых значений: %3.38\n", - "operating_system процент пустых значений: %5.64\n" + "operating_system процент пустых значений: %5.64\n", + "DATASET 2\n", + "DATASET 3\n" ] } ], "source": [ + "print(\"DATASET 1\")\n", "for i in df.columns:\n", " null_rate = df[i].isnull().sum() / len(df)*100\n", " if null_rate > 0:\n", + " print(f\"{i} процент пустых значений: %{null_rate:.2f}\")\n", + "print(\"DATASET 2\")\n", + "for i in df2.columns:\n", + " null_rate = df2[i].isnull().sum() / len(df2)*100\n", + " if null_rate > 0:\n", + " print(f\"{i} процент пустых значений: %{null_rate:.2f}\")\n", + "print(\"DATASET 
3\")\n", + "for i in df3.columns:\n", + " null_rate = df3[i].isnull().sum() / len(df3)*100\n", + " if null_rate > 0:\n", " print(f\"{i} процент пустых значений: %{null_rate:.2f}\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В первом датасете были поля с пустыми значениями, в остальных пустых значений не найдено." + ] + }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 113, "metadata": {}, "outputs": [ { @@ -501,7 +471,7 @@ "531 8 DDR4 0.0 other " ] }, - "execution_count": 10, + "execution_count": 113, "metadata": {}, "output_type": "execute_result" } @@ -516,31 +486,31 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Разбиение на выборки" + "#### Разбиение на выборки\n", + "Для разбиения на выборке для начала уменьшим количество уникальных значений в столбцах с целевыми признаками, путем добавления новых столбцов с малым количеством уникальных значений." ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 114, "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "The least populated class in y has only 1 member, which is too few. 
The minimum number of groups for any class cannot be less than 2.", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[24], line 46\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df_train, df_val, df_test\n\u001b[0;32m 44\u001b[0m data \u001b[38;5;241m=\u001b[39m df[[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrating\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprice\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mram_storage\u001b[39m\u001b[38;5;124m\"\u001b[39m]]\u001b[38;5;241m.\u001b[39mcopy()\n\u001b[1;32m---> 46\u001b[0m df_train, df_val, df_test \u001b[38;5;241m=\u001b[39m \u001b[43msplit_stratified_into_train_val_test\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 47\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstratify_colname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrating\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_train\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.60\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_val\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.20\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrac_test\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.20\u001b[39;49m\n\u001b[0;32m 48\u001b[0m \u001b[43m)\u001b[49m\n\u001b[0;32m 50\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mОбучающая выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_train\u001b[38;5;241m.\u001b[39mshape)\n\u001b[0;32m 52\u001b[0m 
\u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mКонтрольная выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_val\u001b[38;5;241m.\u001b[39mshape)\n", - "Cell \u001b[1;32mIn[24], line 26\u001b[0m, in \u001b[0;36msplit_stratified_into_train_val_test\u001b[1;34m(df_input, stratify_colname, frac_train, frac_val, frac_test, random_state)\u001b[0m\n\u001b[0;32m 21\u001b[0m y \u001b[38;5;241m=\u001b[39m df_input[\n\u001b[0;32m 22\u001b[0m [stratify_colname]\n\u001b[0;32m 23\u001b[0m ] \u001b[38;5;66;03m# Dataframe of just the column on which to stratify.\u001b[39;00m\n\u001b[0;32m 25\u001b[0m \u001b[38;5;66;03m# Split original dataframe into train and temp dataframes.\u001b[39;00m\n\u001b[1;32m---> 26\u001b[0m df_train, df_temp, y_train, y_temp \u001b[38;5;241m=\u001b[39m \u001b[43mtrain_test_split\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 27\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstratify\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m1.0\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mfrac_train\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrandom_state\u001b[49m\n\u001b[0;32m 28\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 30\u001b[0m \u001b[38;5;66;03m# Split the temp dataframe into val and test dataframes.\u001b[39;00m\n\u001b[0;32m 31\u001b[0m relative_frac_test \u001b[38;5;241m=\u001b[39m frac_test \u001b[38;5;241m/\u001b[39m (frac_val \u001b[38;5;241m+\u001b[39m frac_test)\n", - "File \u001b[1;32md:\\Study\\3 курс 5 
семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:213\u001b[0m, in \u001b[0;36mvalidate_params..decorator..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 208\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 209\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 210\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 211\u001b[0m )\n\u001b[0;32m 212\u001b[0m ):\n\u001b[1;32m--> 213\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 215\u001b[0m \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m 216\u001b[0m \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m 217\u001b[0m \u001b[38;5;66;03m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[0;32m 218\u001b[0m \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m 219\u001b[0m msg \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msub(\n\u001b[0;32m 220\u001b[0m \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+ must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 221\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 222\u001b[0m \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[0;32m 223\u001b[0m )\n", - "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:2806\u001b[0m, in \u001b[0;36mtrain_test_split\u001b[1;34m(test_size, train_size, random_state, shuffle, stratify, *arrays)\u001b[0m\n\u001b[0;32m 2802\u001b[0m CVClass \u001b[38;5;241m=\u001b[39m ShuffleSplit\n\u001b[0;32m 2804\u001b[0m cv \u001b[38;5;241m=\u001b[39m CVClass(test_size\u001b[38;5;241m=\u001b[39mn_test, train_size\u001b[38;5;241m=\u001b[39mn_train, random_state\u001b[38;5;241m=\u001b[39mrandom_state)\n\u001b[1;32m-> 2806\u001b[0m train, test \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcv\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43marrays\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstratify\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2808\u001b[0m train, test \u001b[38;5;241m=\u001b[39m ensure_common_namespace_device(arrays[\u001b[38;5;241m0\u001b[39m], train, test)\n\u001b[0;32m 2810\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(\n\u001b[0;32m 2811\u001b[0m chain\u001b[38;5;241m.\u001b[39mfrom_iterable(\n\u001b[0;32m 2812\u001b[0m (_safe_indexing(a, train), _safe_indexing(a, test)) \u001b[38;5;28;01mfor\u001b[39;00m a \u001b[38;5;129;01min\u001b[39;00m arrays\n\u001b[0;32m 2813\u001b[0m )\n\u001b[0;32m 2814\u001b[0m )\n", - "File \u001b[1;32md:\\Study\\3 курс 5 
семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:1843\u001b[0m, in \u001b[0;36mBaseShuffleSplit.split\u001b[1;34m(self, X, y, groups)\u001b[0m\n\u001b[0;32m 1813\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Generate indices to split data into training and test set.\u001b[39;00m\n\u001b[0;32m 1814\u001b[0m \n\u001b[0;32m 1815\u001b[0m \u001b[38;5;124;03mParameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1840\u001b[0m \u001b[38;5;124;03mto an integer.\u001b[39;00m\n\u001b[0;32m 1841\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 1842\u001b[0m X, y, groups \u001b[38;5;241m=\u001b[39m indexable(X, y, groups)\n\u001b[1;32m-> 1843\u001b[0m \u001b[43m\u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_iter_indices\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgroups\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[0;32m 1844\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01myield\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\n", - "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:2252\u001b[0m, in \u001b[0;36mStratifiedShuffleSplit._iter_indices\u001b[1;34m(self, X, y, groups)\u001b[0m\n\u001b[0;32m 2250\u001b[0m class_counts \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mbincount(y_indices)\n\u001b[0;32m 2251\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m 
np\u001b[38;5;241m.\u001b[39mmin(class_counts) \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m2\u001b[39m:\n\u001b[1;32m-> 2252\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 2253\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe least populated class in y has only 1\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2254\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m member, which is too few. The minimum\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2255\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m number of groups for any class cannot\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2256\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m be less than 2.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2257\u001b[0m )\n\u001b[0;32m 2259\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m n_train \u001b[38;5;241m<\u001b[39m n_classes:\n\u001b[0;32m 2260\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 2261\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe train_size = \u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m should be greater or \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2262\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mequal to the number of classes = \u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m (n_train, n_classes)\n\u001b[0;32m 2263\u001b[0m )\n", - "\u001b[1;31mValueError\u001b[0m: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2." 
- ] - } - ], + "outputs": [], + "source": [ + "#У первого дата сета добавим новый столбец с рейтингом от 1 до 5 на основе столбца от 1 до 100.\n", + "df['new_rating'] = pd.cut(df['rating'], bins=[0,20,40,60,80,100], labels=[1,2,3,4,5], include_lowest=True)\n", + "#У второго добавим столбец с наибольшей ценой от 1 до 10, на основе столбца от 1 до 127.\n", + "df2['new_high'] = pd.cut(df2['High'], bins=[0,13,26,39,52,65,78,91,104,117,130], labels=[1,2,3,4,5,6,7,8,9,10], include_lowest=True)\n", + "#У третьего удалим слишком большие значения обслуживания и слишком маленькие и добавим новый столбец с категориями цен от 1 до 5.\n", + "df3_filtered = df3[df3['Price'] >= 10000]\n", + "df3_filtered = df3_filtered[df3_filtered['Price'] <= 100000]\n", + "df3_filtered['new_price'] = pd.cut(df3_filtered['Price'], bins=[10000,28000,46000,64000,82000,100000], labels=[1,2,3,4,5], include_lowest=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": {}, + "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", @@ -583,19 +553,234 @@ "\n", " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n", "\n", - " return df_train, df_val, df_test\n", - "\n", - "data = df[[\"rating\", \"price\", \"ram_storage\"]].copy()\n", - "\n", - "df_train, df_val, df_test = split_stratified_into_train_val_test(\n", - " data, stratify_colname=\"rating\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n", + " return df_train, df_val, df_test" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Выборки датасетов" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "train df: (319, 19), Val df: (106, 19), Test df:(107, 19)\n", + "train df2: (4821, 8), Val df2: (1607, 8), Test df2:(1608, 8)\n", + "train df3_filtered: (6931, 19), Val df3_filtered: (2310, 19), Test df3_filtered:(2311, 19)\n" + ] + } + 
], + "source": [ + "df_train1, df_val1, df_test1 = split_stratified_into_train_val_test(\n", + " df, stratify_colname=\"new_rating\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n", ")\n", "\n", - "print(\"Обучающая выборка: \", df_train.shape)\n", + "df_train2, df_val2, df_test2 = split_stratified_into_train_val_test(\n", + " df2, stratify_colname=\"new_high\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n", + ")\n", "\n", - "print(\"Контрольная выборка: \", df_val.shape)\n", + "df_train3, df_val3, df_test3 = split_stratified_into_train_val_test(\n", + " df3_filtered, stratify_colname=\"new_price\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n", + ")\n", "\n", - "print(\"Тестовая выборка: \", df_test.shape)" + "print(f\"train df: {df_train1.shape}, Val df: {df_val1.shape}, Test df:{df_test1.shape}\")\n", + "print(f\"train df2: {df_train2.shape}, Val df2: {df_val2.shape}, Test df2:{df_test2.shape}\")\n", + "print(f\"train df3_filtered: {df_train3.shape}, Val df3_filtered: {df_val3.shape}, Test df3_filtered:{df_test3.shape}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Было сделано разбиение на три выборки: 60%, 20% и 20% при помощи библиотеки scikit-learn и функции train_test_split. 
На взгляд сбалансированные\n", + "### Приращение методами выборки с избытком (oversampling) и выборки с недостатком (undersampling)" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Выборка до oversampling и undersampling: (319, 6)\n", + "new_rating\n", + "4 249\n", + "3 44\n", + "5 26\n", + "1 0\n", + "2 0\n", + "Name: count, dtype: int64\n", + "Выборка после oversampling: (748, 6)\n", + "new_rating\n", + "5 252\n", + "4 249\n", + "3 247\n", + "1 0\n", + "2 0\n", + "Name: count, dtype: int64\n", + "Выборка после undersampling: (78, 6)\n", + "new_rating\n", + "3 26\n", + "5 26\n", + "4 26\n", + "1 0\n", + "2 0\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "from imblearn.over_sampling import ADASYN\n", + "from imblearn.under_sampling import RandomUnderSampler\n", + "\n", + "df_train1 = df_train1[['price', 'rating', 'threads', 'ram_storage', 'operating_system', 'new_rating']].copy()\n", + "\n", + "ada = ADASYN()\n", + "undersampler = RandomUnderSampler(random_state=42)\n", + "\n", + "print(\"Выборка до oversampling и undersampling:\", df_train1.shape)\n", + "print(df_train1.new_rating.value_counts())\n", + "\n", + "X_resampled, y_resampled = ada.fit_resample(df_train1, df_train1['new_rating'])\n", + "df_train1_adasyn = pd.DataFrame(X_resampled)\n", + "\n", + "print(\"Выборка после oversampling: \", df_train1_adasyn.shape)\n", + "print(df_train1_adasyn.new_rating.value_counts())\n", + "\n", + "X_resampled_under, y_resampled_under = undersampler.fit_resample(df_train1, df_train1['new_rating'])\n", + "\n", + "print(\"Выборка после undersampling: \", pd.DataFrame(X_resampled_under).shape)\n", + "print(pd.DataFrame(X_resampled_under).new_rating.value_counts())" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Выборка до oversampling: 
(4821, 7)\n", + "new_high\n", + "1 2326\n", + "2 704\n", + "5 519\n", + "3 299\n", + "8 242\n", + "7 222\n", + "9 181\n", + "4 151\n", + "6 146\n", + "10 31\n", + "Name: count, dtype: int64\n", + "Выборка после oversampling: (22990, 7)\n", + "new_high\n", + "6 2375\n", + "2 2353\n", + "10 2327\n", + "1 2326\n", + "9 2306\n", + "8 2296\n", + "4 2293\n", + "3 2265\n", + "7 2254\n", + "5 2195\n", + "Name: count, dtype: int64\n", + "Выборка после undersampling: (310, 7)\n", + "new_high\n", + "1 31\n", + "2 31\n", + "3 31\n", + "4 31\n", + "5 31\n", + "6 31\n", + "7 31\n", + "8 31\n", + "9 31\n", + "10 31\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "df_train2 = df_train2[['Open', 'High', 'new_high', 'Low', 'Close', 'Adj Close', 'Volume']].copy()\n", + "\n", + "print(\"Выборка до oversampling и undersampling:\", df_train2.shape)\n", + "print(df_train2.new_high.value_counts())\n", + "\n", + "X_resampled, y_resampled = ada.fit_resample(df_train2, df_train2['new_high'])\n", + "df_train2_adasyn = pd.DataFrame(X_resampled)\n", + "\n", + "print(\"Выборка после oversampling: \", df_train2_adasyn.shape)\n", + "print(df_train2_adasyn.new_high.value_counts())\n", + "\n", + "X_resampled_under2, y_resampled_under2 = undersampler.fit_resample(df_train2, df_train2['new_high'])\n", + "\n", + "print(\"Выборка после undersampling: \", pd.DataFrame(X_resampled_under2).shape)\n", + "print(pd.DataFrame(X_resampled_under2).new_high.value_counts())" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Выборка до oversampling: (6931, 5)\n", + "new_price\n", + "1 5008\n", + "2 1281\n", + "3 449\n", + "4 136\n", + "5 57\n", + "Name: count, dtype: int64\n", + "Выборка после oversampling: (25040, 5)\n", + "new_price\n", + "1 5008\n", + "2 5008\n", + "3 5008\n", + "4 5008\n", + "5 5008\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "from 
imblearn.over_sampling import SMOTE\n", + "\n", + "df_train3 = df_train3[['Price', 'new_price','Prod. year' ,'Cylinders' ,'Airbags']].copy()\n", + "\n", + "smote = SMOTE(random_state=42)\n", + "\n", + "print(\"Выборка до oversampling и undersampling:\", df_train3.shape)\n", + "print(df_train3.new_price.value_counts())\n", + "\n", + "X_resampled, y_resampled = smote.fit_resample(df_train3, df_train3['new_price'])\n", + "df_train3_smote = pd.DataFrame(X_resampled)\n", + "\n", + "print(\"Выборка после oversampling: \", df_train3_smote.shape)\n", + "print(df_train3_smote.new_price.value_counts())\n", + "\n", + "X_resampled_under3, y_resampled_under3 = undersampler.fit_resample(df_train3, df_train3['new_price'])\n", + "\n", + "print(\"Выборка после undersampling: \", pd.DataFrame(X_resampled_under3).shape)\n", + "print(pd.DataFrame(X_resampled_under3).new_price.value_counts())" ] } ], -- 2.25.1 From ed7c7f729887f2d8c65f2ee67a43f3fea07eca82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=AF=D0=BA=D0=BE?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=B2?= Date: Fri, 18 Oct 2024 18:46:34 +0400 Subject: [PATCH 6/6] =?UTF-8?q?=D1=82=D0=B5=D0=BF=D0=B5=D1=80=D1=8C=20?= =?UTF-8?q?=D1=82=D0=BE=D1=87=D0=BD=D0=BE=20=D0=B2=D1=81=D0=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lab_2/lab_2.ipynb | 65 ++++++++++++++++++++++++----------------- lab_2/requirements.txt | Bin 0 -> 1620 bytes 2 files changed, 39 insertions(+), 26 deletions(-) create mode 100644 lab_2/requirements.txt diff --git a/lab_2/lab_2.ipynb b/lab_2/lab_2.ipynb index 11cab85..dc64ebb 100644 --- a/lab_2/lab_2.ipynb +++ b/lab_2/lab_2.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 124, "metadata": {}, "outputs": [ { @@ -136,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 125, "metadata": {}, "outputs": [ { @@ -215,7 +215,7 @@ }, { "cell_type": "code", - 
"execution_count": 112, + "execution_count": 126, "metadata": {}, "outputs": [ { @@ -262,7 +262,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 127, "metadata": {}, "outputs": [ { @@ -471,7 +471,7 @@ "531 8 DDR4 0.0 other " ] }, - "execution_count": 113, + "execution_count": 127, "metadata": {}, "output_type": "execute_result" } @@ -492,15 +492,20 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 128, "metadata": {}, "outputs": [], "source": [ "#У первого дата сета добавим новый столбец с рейтингом от 1 до 5 на основе столбца от 1 до 100.\n", + "\n", "df['new_rating'] = pd.cut(df['rating'], bins=[0,20,40,60,80,100], labels=[1,2,3,4,5], include_lowest=True)\n", + "\n", "#У второго добавим столбец с наибольшей ценой от 1 до 10, на основе столбца от 1 до 127.\n", + "\n", "df2['new_high'] = pd.cut(df2['High'], bins=[0,13,26,39,52,65,78,91,104,117,130], labels=[1,2,3,4,5,6,7,8,9,10], include_lowest=True)\n", + "\n", "#У третьего удалим слишком большие значения обслуживания и слишком маленькие и добавим новый столбец с категориями цен от 1 до 5.\n", + "\n", "df3_filtered = df3[df3['Price'] >= 10000]\n", "df3_filtered = df3_filtered[df3_filtered['Price'] <= 100000]\n", "df3_filtered['new_price'] = pd.cut(df3_filtered['Price'], bins=[10000,28000,46000,64000,82000,100000], labels=[1,2,3,4,5], include_lowest=True)" @@ -508,7 +513,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 129, "metadata": {}, "outputs": [], "source": [ @@ -565,7 +570,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 130, "metadata": {}, "outputs": [ { @@ -606,7 +611,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 131, "metadata": {}, "outputs": [ { @@ -621,11 +626,11 @@ "1 0\n", "2 0\n", "Name: count, dtype: int64\n", - "Выборка после oversampling: (748, 6)\n", + "Выборка после oversampling: (750, 6)\n", "new_rating\n", - "5 252\n", + "5 251\n", + "3 250\n", "4 
249\n", - "3 247\n", "1 0\n", "2 0\n", "Name: count, dtype: int64\n", @@ -666,14 +671,14 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 132, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Выборка до oversampling: (4821, 7)\n", + "Выборка до oversampling и undersampling: (4821, 7)\n", "new_high\n", "1 2326\n", "2 704\n", @@ -686,18 +691,18 @@ "6 146\n", "10 31\n", "Name: count, dtype: int64\n", - "Выборка после oversampling: (22990, 7)\n", + "Выборка после oversampling: (23144, 7)\n", "new_high\n", - "6 2375\n", - "2 2353\n", - "10 2327\n", + "8 2374\n", + "6 2368\n", + "2 2351\n", + "4 2335\n", "1 2326\n", - "9 2306\n", - "8 2296\n", - "4 2293\n", - "3 2265\n", - "7 2254\n", - "5 2195\n", + "9 2317\n", + "10 2312\n", + "5 2256\n", + "7 2256\n", + "3 2249\n", "Name: count, dtype: int64\n", "Выборка после undersampling: (310, 7)\n", "new_high\n", @@ -735,14 +740,14 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 133, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Выборка до oversampling: (6931, 5)\n", + "Выборка до oversampling и undersampling: (6931, 5)\n", "new_price\n", "1 5008\n", "2 1281\n", @@ -757,6 +762,14 @@ "3 5008\n", "4 5008\n", "5 5008\n", + "Name: count, dtype: int64\n", + "Выборка после undersampling: (285, 5)\n", + "new_price\n", + "1 57\n", + "2 57\n", + "3 57\n", + "4 57\n", + "5 57\n", "Name: count, dtype: int64\n" ] } diff --git a/lab_2/requirements.txt b/lab_2/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..f17ad4b524faac00942277a654e70a3dbf2196e6 GIT binary patch literal 1620 zcmZvcQF7Wq5JdMoRXGYNOBg%($Q^P6mjDF|V_5-Vj1Qlr-?SD9Cq;>}*_rO1o?iX= zU8O_n^?6B8sZOo_`>s!!zNA|{Mf#jRrIk*1diJSG&w8l187l5B70)`$C-j$>lQ!^= z>hIDyY6_jt_1p|KXL0PrzrP?9p~3uXoh;KaZLa#is(%svoTCwaOk$LU&fzgeo!+AA zsES5&Zh{G&)L6$)^`aFFCbk#&un4tAYBc9kOglZTcuhSGpgSQZvoY zctfaYl>=D7cK)++PWRzKR>d8bo%`z 
zX6Vy{I-FrP;2rwD!E0|iq4+k8d4oIia$jp9)#+6VT*EoD&0}vr&dE4=5y{Lr zXSwS6NAJ+$1d=Bw&$*t3i{x;s4Nj5(f2YaV>4@K__wb|CZ)LcW_jZo-wbENg#atDB zW5M|@?Z1Xjr#nx7h@9BFk9Qm0q};B%611GDVxr(6_unQbW8Rp+smc?N?^UYoFt?p8 zRC0y6c`L2d>22U5O6-K+d~MV^;oyuJ?S)iD8h8)&Lx%j8bduNwUXo+fImy?ap2W|M OUaR6UDsm=@LGxdg`s~*L literal 0 HcmV?d00001 -- 2.25.1