{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 1354 entries, 0 to 1369\n", "Data columns (total 18 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Name_phone 1354 non-null object \n", " 1 Rating 1354 non-null float64\n", " 2 Spec_score 1354 non-null int64 \n", " 3 No_of_sim 1354 non-null object \n", " 4 Ram 1354 non-null object \n", " 5 Battery 1354 non-null object \n", " 6 Display 1354 non-null object \n", " 7 Camera 1354 non-null object \n", " 8 External_Memory 1354 non-null object \n", " 9 Android_version 914 non-null object \n", " 10 Price 1354 non-null object \n", " 11 company 1354 non-null object \n", " 12 Inbuilt_memory 1335 non-null object \n", " 13 fast_charging 1267 non-null object \n", " 14 Screen_resolution 1353 non-null object \n", " 15 Processor 1327 non-null object \n", " 16 Processor_name 1354 non-null object \n", " 17 Class 1354 non-null int64 \n", "dtypes: float64(1), int64(2), object(15)\n", "memory usage: 201.0+ KB\n" ] }, { "data": { "text/plain": [ "(1354, 18)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Name_phoneRatingSpec_scoreNo_of_simRamBatteryDisplayCameraExternal_MemoryAndroid_versionPricecompanyInbuilt_memoryfast_chargingScreen_resolutionProcessorProcessor_nameClass
Id
0Samsung Galaxy F14 5G4.6568Dual Sim, 3G, 4G, 5G, VoLTE,4 GB RAM6000 mAh Battery6.6 inches50 MP + 2 MP Dual Rear & 13 MP Front CameraMemory Card Supported, upto 1 TB139,999Samsung128 GB inbuilt25W Fast Charging2408 x 1080 px Display with Water Drop NotchOcta Core ProcessorExynos 13302
1Samsung Galaxy A114.2063Dual Sim, 3G, 4G, VoLTE,2 GB RAM4000 mAh Battery6.4 inches13 MP + 5 MP + 2 MP Triple Rear & 8 MP Fro...Memory Card Supported, upto 512 GB109,990Samsung32 GB inbuilt15W Fast Charging720 x 1560 px Display with Punch Hole1.8 GHz ProcessorOcta Core2
3Samsung Galaxy F234.1073Dual Sim, 3G, 4G, VoLTE,4 GB RAM6000 mAh Battery6.4 inches48 MP Quad Rear & 13 MP Front CameraMemory Card Supported, upto 1 TB1211,999Samsung64 GB inbuiltNaN720 x 1600 pxOcta CoreHelio G881
4Samsung Galaxy A03s (4GB RAM + 64GB)4.1069Dual Sim, 3G, 4G, VoLTE,4 GB RAM5000 mAh Battery6.5 inches13 MP + 2 MP + 2 MP Triple Rear & 5 MP Fro...Memory Card Supported, upto 1 TB1111,999Samsung64 GB inbuilt15W Fast Charging720 x 1600 px Display with Water Drop NotchOcta CoreHelio P352
5Samsung Galaxy M13 5G4.4075Dual Sim, 3G, 4G, 5G, VoLTE,6 GB RAM5000 mAh Battery6.5 inches50 MP + 2 MP Dual Rear & 5 MP Front CameraMemory Card Supported, upto 1 TB1211,990Samsung128 GB inbuilt15W Fast Charging720 x 1600 pxOcta CoreDimensity 7001
\n", "
" ], "text/plain": [ " Name_phone Rating Spec_score \\\n", "Id \n", "0 Samsung Galaxy F14 5G 4.65 68 \n", "1 Samsung Galaxy A11 4.20 63 \n", "3 Samsung Galaxy F23 4.10 73 \n", "4 Samsung Galaxy A03s (4GB RAM + 64GB) 4.10 69 \n", "5 Samsung Galaxy M13 5G 4.40 75 \n", "\n", " No_of_sim Ram Battery Display \\\n", "Id \n", "0 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 6000 mAh Battery 6.6 inches \n", "1 Dual Sim, 3G, 4G, VoLTE, 2 GB RAM 4000 mAh Battery 6.4 inches \n", "3 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 6000 mAh Battery 6.4 inches \n", "4 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 5000 mAh Battery 6.5 inches \n", "5 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery 6.5 inches \n", "\n", " Camera \\\n", "Id \n", "0 50 MP + 2 MP Dual Rear & 13 MP Front Camera \n", "1 13 MP + 5 MP + 2 MP Triple Rear & 8 MP Fro... \n", "3 48 MP Quad Rear & 13 MP Front Camera \n", "4 13 MP + 2 MP + 2 MP Triple Rear & 5 MP Fro... \n", "5 50 MP + 2 MP Dual Rear & 5 MP Front Camera \n", "\n", " External_Memory Android_version Price company \\\n", "Id \n", "0 Memory Card Supported, upto 1 TB 13 9,999 Samsung \n", "1 Memory Card Supported, upto 512 GB 10 9,990 Samsung \n", "3 Memory Card Supported, upto 1 TB 12 11,999 Samsung \n", "4 Memory Card Supported, upto 1 TB 11 11,999 Samsung \n", "5 Memory Card Supported, upto 1 TB 12 11,990 Samsung \n", "\n", " Inbuilt_memory fast_charging \\\n", "Id \n", "0 128 GB inbuilt 25W Fast Charging \n", "1 32 GB inbuilt 15W Fast Charging \n", "3 64 GB inbuilt NaN \n", "4 64 GB inbuilt 15W Fast Charging \n", "5 128 GB inbuilt 15W Fast Charging \n", "\n", " Screen_resolution Processor \\\n", "Id \n", "0 2408 x 1080 px Display with Water Drop Notch Octa Core Processor \n", "1 720 x 1560 px Display with Punch Hole 1.8 GHz Processor \n", "3 720 x 1600 px Octa Core \n", "4 720 x 1600 px Display with Water Drop Notch Octa Core \n", "5 720 x 1600 px Octa Core \n", "\n", " Processor_name Class \n", "Id \n", "0 Exynos 1330 2 \n", "1 Octa Core 2 \n", "3 Helio G88 1 \n", "4 Helio P35 2 \n", "5 Dimensity 700 1 " ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "phone = pd.read_csv(\"data/phone_price.csv\", index_col=\"Id\")\n", "\n", "phone.info()\n", "\n", "display(phone.shape)\n", "\n", "phone.head()\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 19227 entries, 0 to 19226\n", "Data columns (total 20 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Unnamed: 0 19227 non-null int64 \n", " 1 ID 19227 non-null int64 \n", " 2 Price 19227 non-null int64 \n", " 3 Levy 19227 non-null object \n", " 4 Manufacturer 19227 non-null object \n", " 5 Model 19227 non-null object \n", " 6 ProdYear 19227 non-null int64 \n", " 7 Category 19227 non-null object \n", " 8 Leather interior 19227 non-null object \n", " 9 Fuel type 19227 non-null object \n", " 10 Engine volume 19227 non-null object \n", " 11 Mileage 19227 non-null object \n", " 12 Cylinders 19227 non-null float64\n", " 13 Gear box type 19227 non-null object \n", " 14 Drive wheels 19227 non-null object \n", " 15 Doors 19227 non-null object \n", " 16 Wheel 19227 non-null object \n", " 17 Color 19227 non-null object \n", " 18 Airbags 19227 non-null int64 \n", " 19 LeatherInterior1 19227 non-null int64 \n", "dtypes: float64(1), int64(6), object(13)\n", "memory usage: 2.9+ MB\n" ] }, { "data": { "text/plain": [ "(19227, 20)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0IDPriceLevyManufacturerModelProdYearCategoryLeather interiorFuel typeEngine volumeMileageCylindersGear box typeDrive wheelsDoorsWheelColorAirbagsLeatherInterior1
0045654403133281399LEXUSRX 4502010JeepYesHybrid3.5186005 km6.0Automatic4x404-MayLeft wheelSilver121
1144731507166211018CHEVROLETEquinox2011JeepNoPetrol3192000 km6.0Tiptronic4x404-MayLeft wheelBlack80
22457744198467-HONDAFIT2006HatchbackNoPetrol1.3200000 km4.0VariatorFront04-MayRight-hand driveBlack20
33457691853607862FORDEscape2011JeepYesHybrid2.5168966 km4.0Automatic4x404-MayLeft wheelWhite01
444580926311726446HONDAFIT2014HatchbackYesPetrol1.391901 km4.0AutomaticFront04-MayLeft wheelSilver41
\n", "
" ], "text/plain": [ " Unnamed: 0 ID Price Levy Manufacturer Model ProdYear \\\n", "0 0 45654403 13328 1399 LEXUS RX 450 2010 \n", "1 1 44731507 16621 1018 CHEVROLET Equinox 2011 \n", "2 2 45774419 8467 - HONDA FIT 2006 \n", "3 3 45769185 3607 862 FORD Escape 2011 \n", "4 4 45809263 11726 446 HONDA FIT 2014 \n", "\n", " Category Leather interior Fuel type Engine volume Mileage Cylinders \\\n", "0 Jeep Yes Hybrid 3.5 186005 km 6.0 \n", "1 Jeep No Petrol 3 192000 km 6.0 \n", "2 Hatchback No Petrol 1.3 200000 km 4.0 \n", "3 Jeep Yes Hybrid 2.5 168966 km 4.0 \n", "4 Hatchback Yes Petrol 1.3 91901 km 4.0 \n", "\n", " Gear box type Drive wheels Doors Wheel Color Airbags \\\n", "0 Automatic 4x4 04-May Left wheel Silver 12 \n", "1 Tiptronic 4x4 04-May Left wheel Black 8 \n", "2 Variator Front 04-May Right-hand drive Black 2 \n", "3 Automatic 4x4 04-May Left wheel White 0 \n", "4 Automatic Front 04-May Left wheel Silver 4 \n", "\n", " LeatherInterior1 \n", "0 1 \n", "1 0 \n", "2 0 \n", "3 1 \n", "4 1 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "car = pd.read_csv(\"data/car_price.csv\")\n", "\n", "car.info()\n", "\n", "display(car.shape)\n", "\n", "car.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 3299 entries, 0 to 3298\n", "Data columns (total 21 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 3299 non-null int64 \n", " 1 date 3299 non-null object \n", " 2 price 3299 non-null float64\n", " 3 bedrooms 3299 non-null int64 \n", " 4 bathrooms 3299 non-null float64\n", " 5 sqft_living 3299 non-null int64 \n", " 6 sqft_lot 3299 non-null int64 \n", " 7 floors 3299 non-null float64\n", " 8 waterfront 3299 non-null int64 \n", " 9 view 3299 non-null int64 \n", " 10 condition 3299 non-null int64 \n", " 11 grade 3299 non-null int64 \n", " 12 sqft_above 3299 non-null int64 \n", " 13 sqft_basement 3299 non-null int64 \n", " 14 yr_built 3299 non-null int64 \n", " 15 yr_renovated 3299 non-null int64 \n", " 16 zipcode 3299 non-null int64 \n", " 17 lat 3299 non-null float64\n", " 18 long 3299 non-null float64\n", " 19 sqft_living15 3299 non-null int64 \n", " 20 sqft_lot15 3299 non-null int64 \n", "dtypes: float64(5), int64(15), object(1)\n", "memory usage: 541.4+ KB\n" ] }, { "data": { "text/plain": [ "(3299, 21)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...gradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15
0712930052020141013T000000221900.031.00118056501.000...711800195509817847.5112-122.25713405650
1641410019220141209T000000538000.032.25257072422.000...72170400195119919812547.7210-122.31916907639
2563150040020150225T000000180000.021.00770100001.000...67700193309802847.7379-122.23327208062
3248720087520141209T000000604000.043.00196050001.000...71050910196509813647.5208-122.39313605000
4195440051020150218T000000510000.032.00168080801.000...816800198709807447.6168-122.04518007503
\n", "

5 rows × 21 columns

\n", "
" ], "text/plain": [ " id date price bedrooms bathrooms sqft_living \\\n", "0 7129300520 20141013T000000 221900.0 3 1.00 1180 \n", "1 6414100192 20141209T000000 538000.0 3 2.25 2570 \n", "2 5631500400 20150225T000000 180000.0 2 1.00 770 \n", "3 2487200875 20141209T000000 604000.0 4 3.00 1960 \n", "4 1954400510 20150218T000000 510000.0 3 2.00 1680 \n", "\n", " sqft_lot floors waterfront view ... grade sqft_above sqft_basement \\\n", "0 5650 1.0 0 0 ... 7 1180 0 \n", "1 7242 2.0 0 0 ... 7 2170 400 \n", "2 10000 1.0 0 0 ... 6 770 0 \n", "3 5000 1.0 0 0 ... 7 1050 910 \n", "4 8080 1.0 0 0 ... 8 1680 0 \n", "\n", " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", "0 1955 0 98178 47.5112 -122.257 1340 \n", "1 1951 1991 98125 47.7210 -122.319 1690 \n", "2 1933 0 98028 47.7379 -122.233 2720 \n", "3 1965 0 98136 47.5208 -122.393 1360 \n", "4 1987 0 98074 47.6168 -122.045 1800 \n", "\n", " sqft_lot15 \n", "0 5650 \n", "1 7639 \n", "2 8062 \n", "3 5000 \n", "4 7503 \n", "\n", "[5 rows x 21 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "house = pd.read_csv(\"data/house_data.csv\")\n", "\n", "house.info()\n", "\n", "display(house.shape)\n", "\n", "house.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Name_phone 0\n", "Rating 0\n", "Spec_score 0\n", "No_of_sim 0\n", "Ram 0\n", "Battery 0\n", "Display 0\n", "Camera 0\n", "External_Memory 0\n", "Android_version 440\n", "Price 0\n", "company 0\n", "Inbuilt_memory 19\n", "fast_charging 87\n", "Screen_resolution 1\n", "Processor 27\n", "Processor_name 0\n", "Class 0\n", "dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Name_phone False\n", "Rating False\n", "Spec_score False\n", "No_of_sim False\n", "Ram False\n", "Battery False\n", "Display False\n", "Camera False\n", "External_Memory False\n", "Android_version True\n", "Price False\n", "company False\n", "Inbuilt_memory True\n", "fast_charging True\n", "Screen_resolution True\n", "Processor True\n", "Processor_name False\n", "Class False\n", "dtype: bool" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Android_version процент пустых значений: %32.50'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Inbuilt_memory процент пустых значений: %1.40'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'fast_charging процент пустых значений: %6.43'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Screen_resolution процент пустых значений: %0.07'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Processor процент пустых значений: %1.99'" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Количество пустых значений признаков\n", "display(phone.isnull().sum())\n", "display()\n", "\n", "# Есть ли пустые значения признаков\n", "display(phone.isnull().any())\n", "display()\n", "\n", "# Процент пустых значений признаков\n", "for i in phone.columns:\n", " null_rate = phone[i].isnull().sum() / len(phone) * 100\n", " if null_rate > 0:\n", " display(f\"{i} процент пустых значений: %{null_rate:.2f}\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Unnamed: 0 0\n", "ID 0\n", "Price 0\n", "Levy 0\n", "Manufacturer 0\n", "Model 0\n", "ProdYear 0\n", "Category 0\n", "Leather interior 0\n", "Fuel type 0\n", "Engine volume 0\n", "Mileage 0\n", "Cylinders 0\n", "Gear box type 0\n", "Drive wheels 0\n", "Doors 0\n", "Wheel 0\n", "Color 0\n", "Airbags 0\n", "LeatherInterior1 0\n", "dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Unnamed: 0 False\n", "ID False\n", "Price False\n", "Levy False\n", "Manufacturer False\n", "Model False\n", "ProdYear False\n", "Category False\n", "Leather interior False\n", "Fuel type False\n", "Engine volume False\n", "Mileage False\n", "Cylinders False\n", "Gear box type False\n", "Drive wheels False\n", "Doors False\n", "Wheel False\n", "Color False\n", "Airbags False\n", "LeatherInterior1 False\n", "dtype: bool" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Количество пустых значений признаков\n", "display(car.isnull().sum())\n", "display()\n", "\n", "# Есть ли пустые значения признаков\n", "display(car.isnull().any())\n", "display()\n", "\n", "# Процент пустых значений признаков\n", "for i in car.columns:\n", " null_rate = car[i].isnull().sum() / len(car) * 100\n", " if null_rate > 0:\n", " display(f\"{i} процент пустых значений: %{null_rate:.2f}\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id 0\n", "date 0\n", "price 0\n", "bedrooms 0\n", "bathrooms 0\n", "sqft_living 0\n", "sqft_lot 0\n", "floors 0\n", "waterfront 0\n", "view 0\n", "condition 0\n", "grade 0\n", "sqft_above 0\n", "sqft_basement 0\n", "yr_built 0\n", "yr_renovated 0\n", "zipcode 0\n", "lat 0\n", "long 0\n", "sqft_living15 0\n", "sqft_lot15 0\n", "dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "id False\n", "date False\n", "price False\n", "bedrooms False\n", "bathrooms False\n", "sqft_living False\n", "sqft_lot False\n", "floors False\n", "waterfront False\n", "view False\n", "condition False\n", "grade False\n", "sqft_above False\n", "sqft_basement False\n", "yr_built False\n", "yr_renovated False\n", "zipcode False\n", "lat False\n", "long False\n", "sqft_living15 False\n", "sqft_lot15 False\n", "dtype: bool" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Количество пустых значений признаков\n", "display(house.isnull().sum())\n", "display()\n", "\n", "# Есть ли пустые значения признаков\n", "display(house.isnull().any())\n", "display()\n", "\n", "# Процент пустых значений признаков\n", "for i in house.columns:\n", " null_rate = house[i].isnull().sum() / len(house) * 100\n", " if null_rate > 0:\n", " display(f\"{i} процент пустых значений: %{null_rate:.2f}\")" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "В 1 наборе данных phone имеются пустые значения, в остальных они отсутствуют.\n", "Заполним пустые значения из 1 датасета." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(805, 18)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Name_phone False\n", "Rating False\n", "Spec_score False\n", "No_of_sim False\n", "Ram False\n", "Battery False\n", "Display False\n", "Camera False\n", "External_Memory False\n", "Android_version False\n", "Price False\n", "company False\n", "Inbuilt_memory False\n", "fast_charging False\n", "Screen_resolution False\n", "Processor False\n", "Processor_name False\n", "Class False\n", "dtype: bool" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Name_phoneRatingSpec_scoreNo_of_simRamBatteryDisplayCameraExternal_MemoryAndroid_versionPricecompanyInbuilt_memoryfast_chargingScreen_resolutionProcessorProcessor_nameClass
Id
1365TCL 40R4.0575Dual Sim, 3G, 4G, 5G, VoLTE,4 GB RAM5000 mAh Battery6.6 inches50 MP + 2 MP + 2 MP Triple Rear & 8 MP Fro...Memory Card (Hybrid)1218,999TCL64 GB inbuilt15W Fast Charging720 x 1612 pxOcta CoreDimensity 700 5G2
1366TCL 50 XL NxtPaper 5G4.1080Dual Sim, 3G, 4G, VoLTE,8 GB RAM5000 mAh Battery6.8 inches50 MP + 2 MP Dual Rear & 16 MP Front CameraMemory Card (Hybrid)1424,990TCL128 GB inbuilt33W Fast Charging1200 x 2400 pxOcta CoreDimensity 70501
1367TCL 50 XE NxtPaper 5G4.0080Dual Sim, 3G, 4G, 5G, VoLTE,6 GB RAM5000 mAh Battery6.6 inches50 MP + 2 MP Dual Rear & 16 MP Front CameraMemory Card Supported, upto 1 TB1323,990TCL256 GB inbuilt18W Fast Charging720 x 1612 pxOcta CoreDimensity 60802
1368TCL 40 NxtPaper 5G4.5079Dual Sim, 3G, 4G, 5G, VoLTE,6 GB RAM5000 mAh Battery6.6 inches50 MP + 2 MP + 2 MP Triple Rear & 8 MP Fro...Memory Card Supported, upto 1 TB1322,499TCL256 GB inbuilt15W Fast Charging720 x 1612 pxOcta CoreDimensity 60202
1369TCL Trifold4.6593Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G,12 GB RAM4600 mAh Battery10 inchesFoldable Display, Dual Display50 MP + 48 MP + 8 MP Triple Rear & 32 MP F...131,19,990TCL256 GB inbuilt67W Fast Charging1916 x 2160 pxOcta CoreSnapdragon 8 Gen22
\n", "
" ], "text/plain": [ " Name_phone Rating Spec_score \\\n", "Id \n", "1365 TCL 40R 4.05 75 \n", "1366 TCL 50 XL NxtPaper 5G 4.10 80 \n", "1367 TCL 50 XE NxtPaper 5G 4.00 80 \n", "1368 TCL 40 NxtPaper 5G 4.50 79 \n", "1369 TCL Trifold 4.65 93 \n", "\n", " No_of_sim Ram Battery \\\n", "Id \n", "1365 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 5000 mAh Battery \n", "1366 Dual Sim, 3G, 4G, VoLTE, 8 GB RAM 5000 mAh Battery \n", "1367 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery \n", "1368 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery \n", "1369 Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G, 12 GB RAM 4600 mAh Battery \n", "\n", " Display Camera \\\n", "Id \n", "1365 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear & 8 MP Fro... \n", "1366 6.8 inches 50 MP + 2 MP Dual Rear & 16 MP Front Camera \n", "1367 6.6 inches 50 MP + 2 MP Dual Rear & 16 MP Front Camera \n", "1368 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear & 8 MP Fro... \n", "1369 10 inches Foldable Display, Dual Display \n", "\n", " External_Memory Android_version \\\n", "Id \n", "1365 Memory Card (Hybrid) 12 \n", "1366 Memory Card (Hybrid) 14 \n", "1367 Memory Card Supported, upto 1 TB 13 \n", "1368 Memory Card Supported, upto 1 TB 13 \n", "1369 50 MP + 48 MP + 8 MP Triple Rear & 32 MP F... 13 \n", "\n", " Price company Inbuilt_memory fast_charging Screen_resolution \\\n", "Id \n", "1365 18,999 TCL 64 GB inbuilt 15W Fast Charging 720 x 1612 px \n", "1366 24,990 TCL 128 GB inbuilt 33W Fast Charging 1200 x 2400 px \n", "1367 23,990 TCL 256 GB inbuilt 18W Fast Charging 720 x 1612 px \n", "1368 22,499 TCL 256 GB inbuilt 15W Fast Charging 720 x 1612 px \n", "1369 1,19,990 TCL 256 GB inbuilt 67W Fast Charging 1916 x 2160 px \n", "\n", " Processor Processor_name Class \n", "Id \n", "1365 Octa Core Dimensity 700 5G 2 \n", "1366 Octa Core Dimensity 7050 1 \n", "1367 Octa Core Dimensity 6080 2 \n", "1368 Octa Core Dimensity 6020 2 \n", "1369 Octa Core Snapdragon 8 Gen2 2 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Замена пустых данных на моду (часто встречающееся значение)\n", "mode_Android = phone['Android_version'].mode() \n", "phone.fillna({'Android_version':mode_Android}, inplace=True)\n", "\n", "Inbuilt_memory = phone['Inbuilt_memory'].mode() \n", "phone.fillna({'Inbuilt_memory':Inbuilt_memory}, inplace=True)\n", "\n", "fillna_df = phone.fillna(0)\n", "\n", "dropna_df = phone.dropna()\n", "\n", "display(dropna_df.shape)\n", "\n", "display(fillna_df.isnull().any())\n", "\n", "phone.tail()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Class\n", "1 718\n", "2 636\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Обучающая выборка: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(812, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Class\n", "1 431\n", "2 381\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Контрольная выборка: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(271, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Class\n", "1 143\n", "2 128\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Тестовая выборка: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(271, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Class\n", "1 144\n", "2 127\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Вывод распределения количества наблюдений по меткам (классам)\n", "from src.utils import split_stratified_into_train_val_test\n", "\n", "\n", "display(phone.Class.value_counts())\n", "display()\n", "\n", "data = phone[[\"Class\", \"Spec_score\", \"Rating\"]].copy()\n", "\n", "df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n", " data, stratify_colname=\"Class\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n", ")\n", "\n", "display(\"Обучающая выборка: \", df_train.shape)\n", "display(df_train.Class.value_counts())\n", "\n", "display(\"Контрольная выборка: \", df_val.shape)\n", "display(df_val.Class.value_counts())\n", "\n", "display(\"Тестовая выборка: \", df_test.shape)\n", "display(df_test.Class.value_counts())" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Обучающая выборка: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(812, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Class\n", "1 431\n", "2 381\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Обучающая выборка после oversampling: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(856, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Class\n", "1 431\n", "2 425\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ClassSpec_scoreRating
01904.550000
11804.400000
21804.650000
31754.500000
42924.650000
............
8512964.061287
8522954.191740
8532734.612413
8542944.193295
8552924.650000
\n", "

856 rows × 3 columns

\n", "
" ], "text/plain": [ " Class Spec_score Rating\n", "0 1 90 4.550000\n", "1 1 80 4.400000\n", "2 1 80 4.650000\n", "3 1 75 4.500000\n", "4 2 92 4.650000\n", ".. ... ... ...\n", "851 2 96 4.061287\n", "852 2 95 4.191740\n", "853 2 73 4.612413\n", "854 2 94 4.193295\n", "855 2 92 4.650000\n", "\n", "[856 rows x 3 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from imblearn.over_sampling import ADASYN\n", "\n", "ada = ADASYN()\n", "\n", "display(\"Обучающая выборка: \", df_train.shape)\n", "display(df_train.Class.value_counts())\n", "\n", "X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"Class\"]) # type: ignore\n", "df_train_adasyn = pd.DataFrame(X_resampled)\n", "\n", "display(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n", "display(df_train_adasyn.Class.value_counts())\n", "\n", "df_train_adasyn" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Обучающая выборка: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(812, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Class\n", "1 431\n", "2 381\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Обучающая выборка после undersampling: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(762, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Class\n", "1 381\n", "2 381\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Spec_scoreRatingClass
Id
468694.451
1093864.451
1102894.001
422704.151
1192814.001
............
298754.702
249754.252
1341664.252
923884.152
301804.502
\n", "

762 rows × 3 columns

\n", "
" ], "text/plain": [ " Spec_score Rating Class\n", "Id \n", "468 69 4.45 1\n", "1093 86 4.45 1\n", "1102 89 4.00 1\n", "422 70 4.15 1\n", "1192 81 4.00 1\n", "... ... ... ...\n", "298 75 4.70 2\n", "249 75 4.25 2\n", "1341 66 4.25 2\n", "923 88 4.15 2\n", "301 80 4.50 2\n", "\n", "[762 rows x 3 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from imblearn.under_sampling import RandomUnderSampler\n", "import pandas as pd\n", "\n", "# Создание экземпляра RandomUnderSampler\n", "rus = RandomUnderSampler()\n", "\n", "# Проверка исходной обучающей выборки\n", "display(\"Обучающая выборка: \", df_train.shape)\n", "display(df_train.Class.value_counts())\n", "\n", "# Выполнение undersampling\n", "X_resampled, y_resampled = rus.fit_resample(df_train.drop(columns=[\"Class\"]), df_train[\"Class\"]) # type: ignore\n", "df_train_undersampled = pd.DataFrame(X_resampled)\n", "\n", "# Добавление целевой переменной обратно в выборку\n", "df_train_undersampled[\"Class\"] = y_resampled\n", "\n", "# Проверка обучающей выборки после undersampling\n", "display(\"Обучающая выборка после undersampling: \", df_train_undersampled.shape)\n", "display(df_train_undersampled.Class.value_counts())\n", "\n", "df_train_undersampled\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LeatherInterior1\n", "1 13949\n", "0 5278\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Обучающая выборка: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(11536, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "LeatherInterior1\n", "1 8369\n", "0 3167\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Контрольная выборка: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(3845, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "LeatherInterior1\n", "1 2790\n", "0 1055\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Тестовая выборка: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(3846, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "LeatherInterior1\n", "1 2790\n", "0 1056\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Вывод распределения количества наблюдений по меткам (классам)\n", "from src.utils import split_stratified_into_train_val_test\n", "\n", "\n", "display(car.LeatherInterior1.value_counts())\n", "display()\n", "\n", "data = car[[\"LeatherInterior1\", \"Airbags\", \"ProdYear\"]].copy()\n", "\n", "df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n", " data, stratify_colname=\"LeatherInterior1\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n", ")\n", "\n", "display(\"Обучающая выборка: \", df_train.shape)\n", "display(df_train.LeatherInterior1.value_counts())\n", "\n", "display(\"Контрольная выборка: \", df_val.shape)\n", "display(df_val.LeatherInterior1.value_counts())\n", "\n", "display(\"Тестовая выборка: \", df_test.shape)\n", "display(df_test.LeatherInterior1.value_counts())" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Обучающая выборка: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(11536, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "LeatherInterior1\n", "1 8369\n", "0 3167\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Обучающая выборка после oversampling: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(16728, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "LeatherInterior1\n", "1 8369\n", "0 8359\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LeatherInterior1AirbagsProdYear
00162011
11102017
2142018
3102011
4162010
............
167230122004
167240122004
167250122004
167260122004
167270122003
\n", "

16728 rows × 3 columns

\n", "
" ], "text/plain": [ " LeatherInterior1 Airbags ProdYear\n", "0 0 16 2011\n", "1 1 10 2017\n", "2 1 4 2018\n", "3 1 0 2011\n", "4 1 6 2010\n", "... ... ... ...\n", "16723 0 12 2004\n", "16724 0 12 2004\n", "16725 0 12 2004\n", "16726 0 12 2004\n", "16727 0 12 2003\n", "\n", "[16728 rows x 3 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from imblearn.over_sampling import ADASYN\n", "\n", "ada = ADASYN()\n", "\n", "display(\"Обучающая выборка: \", df_train.shape)\n", "display(df_train.LeatherInterior1.value_counts())\n", "\n", "X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"LeatherInterior1\"]) # type: ignore\n", "df_train_adasyn = pd.DataFrame(X_resampled)\n", "\n", "display(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n", "display(df_train_adasyn.LeatherInterior1.value_counts())\n", "\n", "df_train_adasyn" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Обучающая выборка: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(11536, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "LeatherInterior1\n", "1 8369\n", "0 3167\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Обучающая выборка после undersampling: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(6334, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "LeatherInterior1\n", "0 3167\n", "1 3167\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AirbagsProdYearLeatherInterior1
13151620110
1569020140
6317620060
626420000
124391220120
............
33611220121
3286420171
17666420151
4902020121
4613420141
\n", "

6334 rows × 3 columns

\n", "
" ], "text/plain": [ " Airbags ProdYear LeatherInterior1\n", "1315 16 2011 0\n", "1569 0 2014 0\n", "6317 6 2006 0\n", "626 4 2000 0\n", "12439 12 2012 0\n", "... ... ... ...\n", "3361 12 2012 1\n", "3286 4 2017 1\n", "17666 4 2015 1\n", "4902 0 2012 1\n", "4613 4 2014 1\n", "\n", "[6334 rows x 3 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from imblearn.under_sampling import RandomUnderSampler\n", "import pandas as pd\n", "\n", "# Создание экземпляра RandomUnderSampler\n", "rus = RandomUnderSampler()\n", "\n", "# Проверка исходной обучающей выборки\n", "display(\"Обучающая выборка: \", df_train.shape)\n", "display(df_train.LeatherInterior1.value_counts())\n", "\n", "# Выполнение undersampling\n", "X_resampled, y_resampled = rus.fit_resample(df_train.drop(columns=[\"LeatherInterior1\"]), df_train[\"LeatherInterior1\"]) # type: ignore\n", "df_train_undersampled = pd.DataFrame(X_resampled)\n", "\n", "# Добавление целевой переменной обратно в выборку\n", "df_train_undersampled[\"LeatherInterior1\"] = y_resampled\n", "\n", "# Проверка обучающей выборки после undersampling\n", "display(\"Обучающая выборка после undersampling: \", df_train_undersampled.shape)\n", "display(df_train_undersampled.LeatherInterior1.value_counts())\n", "\n", "df_train_undersampled" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "bedrooms\n", "3 1518\n", "4 1058\n", "2 404\n", "5 234\n", "6 55\n", "1 30\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Обучающая выборка: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(1979, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "bedrooms\n", "3 911\n", "4 635\n", "2 242\n", "5 140\n", "6 33\n", "1 18\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Контрольная выборка: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(660, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "bedrooms\n", "3 304\n", "4 211\n", "2 81\n", "5 47\n", "6 11\n", "1 6\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Тестовая выборка: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(660, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "bedrooms\n", "3 303\n", "4 212\n", "2 81\n", "5 47\n", "6 11\n", "1 6\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Вывод распределения количества наблюдений по меткам (классам)\n", "from src.utils import split_stratified_into_train_val_test\n", "\n", "\n", "display(house.bedrooms.value_counts())\n", "display()\n", "\n", "data = house[[\"bedrooms\", \"sqft_living\", \"sqft_lot\"]].copy()\n", "\n", "df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n", " data, stratify_colname=\"bedrooms\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n", ")\n", "\n", "display(\"Обучающая выборка: \", df_train.shape)\n", "display(df_train.bedrooms.value_counts())\n", "\n", "display(\"Контрольная выборка: \", df_val.shape)\n", "display(df_val.bedrooms.value_counts())\n", "\n", "display(\"Тестовая выборка: \", df_test.shape)\n", "display(df_test.bedrooms.value_counts())" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Обучающая выборка: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(1979, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "bedrooms\n", "3 911\n", "4 635\n", "2 242\n", "5 140\n", "6 33\n", "1 18\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Обучающая выборка после oversampling: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(5380, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "bedrooms\n", "2 932\n", "1 914\n", "3 911\n", "6 907\n", "5 888\n", "4 828\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bedroomssqft_livingsqft_lot
03194010035
1419204862
2423403784
34345033460
44223026989
............
5375625756858
5376626358286
5377628157930
5378628577735
5379629237315
\n", "

5380 rows × 3 columns

\n", "
" ], "text/plain": [ " bedrooms sqft_living sqft_lot\n", "0 3 1940 10035\n", "1 4 1920 4862\n", "2 4 2340 3784\n", "3 4 3450 33460\n", "4 4 2230 26989\n", "... ... ... ...\n", "5375 6 2575 6858\n", "5376 6 2635 8286\n", "5377 6 2815 7930\n", "5378 6 2857 7735\n", "5379 6 2923 7315\n", "\n", "[5380 rows x 3 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from imblearn.over_sampling import ADASYN\n", "\n", "ada = ADASYN()\n", "\n", "display(\"Обучающая выборка: \", df_train.shape)\n", "display(df_train.bedrooms.value_counts())\n", "\n", "X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"bedrooms\"]) # type: ignore\n", "df_train_adasyn = pd.DataFrame(X_resampled)\n", "\n", "display(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n", "display(df_train_adasyn.bedrooms.value_counts())\n", "\n", "df_train_adasyn" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Обучающая выборка: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(1979, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "bedrooms\n", "3 911\n", "4 635\n", "2 242\n", "5 140\n", "6 33\n", "1 18\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Обучающая выборка после undersampling: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(108, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "bedrooms\n", "1 18\n", "2 18\n", "3 18\n", "4 18\n", "5 18\n", "6 18\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sqft_livingsqft_lotbedrooms
2003109087501
2922780102351
350560121201
268271060001
30378902115761
............
8322450256006
7833610100036
4864860117936
14993840140406
13023010178646
\n", "

108 rows × 3 columns

\n", "
" ], "text/plain": [ " sqft_living sqft_lot bedrooms\n", "2003 1090 8750 1\n", "2922 780 10235 1\n", "350 560 12120 1\n", "2682 710 6000 1\n", "3037 890 211576 1\n", "... ... ... ...\n", "832 2450 25600 6\n", "783 3610 10003 6\n", "486 4860 11793 6\n", "1499 3840 14040 6\n", "1302 3010 17864 6\n", "\n", "[108 rows x 3 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from imblearn.under_sampling import RandomUnderSampler\n", "import pandas as pd\n", "\n", "# Создание экземпляра RandomUnderSampler\n", "rus = RandomUnderSampler()\n", "\n", "# Проверка исходной обучающей выборки\n", "display(\"Обучающая выборка: \", df_train.shape)\n", "display(df_train.bedrooms.value_counts())\n", "\n", "# Выполнение undersampling\n", "X_resampled, y_resampled = rus.fit_resample(df_train.drop(columns=[\"bedrooms\"]), df_train[\"bedrooms\"]) # type: ignore\n", "df_train_undersampled = pd.DataFrame(X_resampled)\n", "\n", "# Добавление целевой переменной обратно в выборку\n", "df_train_undersampled[\"bedrooms\"] = y_resampled\n", "\n", "# Проверка обучающей выборки после undersampling\n", "display(\"Обучающая выборка после undersampling: \", df_train_undersampled.shape)\n", "display(df_train_undersampled.bedrooms.value_counts())\n", "\n", "df_train_undersampled" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.5" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }