{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Загрузка данных в DataFrame" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 1370 entries, 0 to 1369\n", "Data columns (total 18 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Name 1370 non-null object \n", " 1 Rating 1370 non-null float64\n", " 2 Spec_score 1367 non-null float64\n", " 3 No_of_sim 1370 non-null object \n", " 4 Ram 1370 non-null object \n", " 5 Battery 1370 non-null object \n", " 6 Display 1370 non-null object \n", " 7 Camera 1370 non-null object \n", " 8 External_Memory 1370 non-null object \n", " 9 Android_version 927 non-null object \n", " 10 Price 1370 non-null object \n", " 11 company 1370 non-null object \n", " 12 Inbuilt_memory 1351 non-null object \n", " 13 fast_charging 1281 non-null object \n", " 14 Screen_resolution 1368 non-null object \n", " 15 Processor 1342 non-null object \n", " 16 Processor_name 1370 non-null object \n", " 17 Rating_index 1370 non-null int64 \n", "dtypes: float64(2), int64(1), object(15)\n", "memory usage: 203.4+ KB\n", "(1370, 18)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameRatingSpec_scoreNo_of_simRamBatteryDisplayCameraExternal_MemoryAndroid_versionPricecompanyInbuilt_memoryfast_chargingScreen_resolutionProcessorProcessor_nameRating_index
ID
0Samsung Galaxy F14 5G4.6568.0Dual Sim, 3G, 4G, 5G, VoLTE,4 GB RAM6000 mAh Battery6.6 inches50 MP + 2 MP Dual Rear & 13 MP Front CameraMemory Card Supported, upto 1 TB139.999Samsung128 GB inbuilt25W Fast Charging2408 x 1080 px Display with Water Drop NotchOcta Core ProcessorExynos 13301
1Samsung Galaxy A114.2063.0Dual Sim, 3G, 4G, VoLTE,2 GB RAM4000 mAh Battery6.4 inches13 MP + 5 MP + 2 MP Triple Rear & 8 MP Fro...Memory Card Supported, upto 512 GB109,990Samsung32 GB inbuilt15W Fast Charging720 x 1560 px Display with Punch Hole1.8 GHz ProcessorOcta Core0
2Samsung Galaxy A134.30NaNDual Sim, 3G, 4G, VoLTE,4 GB RAM5000 mAh Battery6.6 inches50 MP Quad Rear & 8 MP Front CameraMemory Card Supported, upto 1 TB1211,999Samsung64 GB inbuilt25W Fast Charging1080 x 2408 px Display with Water Drop Notch2 GHz ProcessorOcta Core0
3Samsung Galaxy F234.1073.0Dual Sim, 3G, 4G, VoLTE,4 GB RAM6000 mAh Battery6.4 inches48 MP Quad Rear & 13 MP Front CameraMemory Card Supported, upto 1 TB1211,999Samsung64 GB inbuiltNaN720 x 1600 pxOcta CoreHelio G880
4Samsung Galaxy A03s (4GB RAM + 64GB)4.1069.0Dual Sim, 3G, 4G, VoLTE,4 GB RAM5000 mAh Battery6.5 inches13 MP + 2 MP + 2 MP Triple Rear & 5 MP Fro...Memory Card Supported, upto 1 TB1111,999Samsung64 GB inbuilt15W Fast Charging720 x 1600 px Display with Water Drop NotchOcta CoreHelio P350
\n", "
" ], "text/plain": [ " Name Rating Spec_score \\\n", "ID \n", "0 Samsung Galaxy F14 5G 4.65 68.0 \n", "1 Samsung Galaxy A11 4.20 63.0 \n", "2 Samsung Galaxy A13 4.30 NaN \n", "3 Samsung Galaxy F23 4.10 73.0 \n", "4 Samsung Galaxy A03s (4GB RAM + 64GB) 4.10 69.0 \n", "\n", " No_of_sim Ram Battery Display \\\n", "ID \n", "0 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 6000 mAh Battery 6.6 inches \n", "1 Dual Sim, 3G, 4G, VoLTE, 2 GB RAM 4000 mAh Battery 6.4 inches \n", "2 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 5000 mAh Battery 6.6 inches \n", "3 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 6000 mAh Battery 6.4 inches \n", "4 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 5000 mAh Battery 6.5 inches \n", "\n", " Camera \\\n", "ID \n", "0 50 MP + 2 MP Dual Rear & 13 MP Front Camera \n", "1 13 MP + 5 MP + 2 MP Triple Rear & 8 MP Fro... \n", "2 50 MP Quad Rear & 8 MP Front Camera \n", "3 48 MP Quad Rear & 13 MP Front Camera \n", "4 13 MP + 2 MP + 2 MP Triple Rear & 5 MP Fro... \n", "\n", " External_Memory Android_version Price company \\\n", "ID \n", "0 Memory Card Supported, upto 1 TB 13 9.999 Samsung \n", "1 Memory Card Supported, upto 512 GB 10 9,990 Samsung \n", "2 Memory Card Supported, upto 1 TB 12 11,999 Samsung \n", "3 Memory Card Supported, upto 1 TB 12 11,999 Samsung \n", "4 Memory Card Supported, upto 1 TB 11 11,999 Samsung \n", "\n", " Inbuilt_memory fast_charging \\\n", "ID \n", "0 128 GB inbuilt 25W Fast Charging \n", "1 32 GB inbuilt 15W Fast Charging \n", "2 64 GB inbuilt 25W Fast Charging \n", "3 64 GB inbuilt NaN \n", "4 64 GB inbuilt 15W Fast Charging \n", "\n", " Screen_resolution Processor \\\n", "ID \n", "0 2408 x 1080 px Display with Water Drop Notch Octa Core Processor \n", "1 720 x 1560 px Display with Punch Hole 1.8 GHz Processor \n", "2 1080 x 2408 px Display with Water Drop Notch 2 GHz Processor \n", "3 720 x 1600 px Octa Core \n", "4 720 x 1600 px Display with Water Drop Notch Octa Core \n", "\n", " Processor_name Rating_index \n", "ID \n", "0 Exynos 1330 1 \n", "1 Octa Core 0 \n", "2 Octa Core 0 \n", "3 Helio G88 0 \n", "4 Helio P35 0 " ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "df1 = pd.read_csv(\"../data/mobile_phone_price_prediction.csv\", index_col=\"ID\")\n", "df1[\"Spec_score\"] = df1[\"Spec_score\"].replace({\"\": None})\n", "df1[\"Rating_index\"] = df1[\"Rating\"].apply(lambda x: 1 if float(x) > 4.5 else 0)\n", "\n", "df1.info()\n", "\n", "print(df1.shape)\n", "\n", "df1.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Получение сведений о пропущенных данных" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Типы пропущенных данных:\n", "- None - представление пустых данных в Python\n", "- NaN - представление пустых данных в Pandas\n", "- '' - пустая строка" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Name 0\n", "Rating 0\n", "Spec_score 3\n", "No_of_sim 0\n", "Ram 0\n", "Battery 0\n", "Display 0\n", "Camera 0\n", "External_Memory 0\n", "Android_version 443\n", "Price 0\n", "company 0\n", "Inbuilt_memory 19\n", "fast_charging 89\n", "Screen_resolution 2\n", "Processor 28\n", "Processor_name 0\n", "company_index 0\n", "dtype: int64\n", "\n", "Name False\n", "Rating False\n", "Spec_score True\n", "No_of_sim False\n", "Ram False\n", "Battery False\n", "Display False\n", "Camera False\n", "External_Memory False\n", "Android_version True\n", "Price False\n", "company False\n", "Inbuilt_memory True\n", "fast_charging True\n", "Screen_resolution True\n", "Processor True\n", "Processor_name False\n", "company_index False\n", "dtype: bool\n", "\n", "Spec_score процент пустых значений: %0.22\n", "Android_version процент пустых значений: %32.34\n", "Inbuilt_memory процент пустых значений: %1.39\n", "fast_charging процент пустых значений: %6.50\n", "Screen_resolution процент пустых значений: %0.15\n", "Processor процент пустых значений: %2.04\n" ] } ], "source": [ "# Количество пустых значений признаков\n", "print(df1.isnull().sum())\n", "\n", "print()\n", "\n", "# Есть ли пустые значения признаков\n", "print(df1.isnull().any())\n", "\n", "print()\n", "\n", "# Процент пустых значений признаков\n", "for i in df1.columns:\n", " null_rate = df1[i].isnull().sum() / len(df1) * 100\n", " if null_rate > 0:\n", " print(f\"{i} процент пустых значений: %{null_rate:.2f}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Заполнение пропущенных данных\n", "\n", "https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values\n", "\n", "https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1370, 18)\n", "Name False\n", "Rating False\n", "Spec_score False\n", "No_of_sim False\n", "Ram False\n", "Battery False\n", "Display False\n", "Camera False\n", "External_Memory False\n", "Android_version False\n", "Price False\n", "company False\n", "Inbuilt_memory False\n", "fast_charging False\n", "Screen_resolution False\n", "Processor False\n", "Processor_name False\n", "Rating_index False\n", "dtype: bool\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameRatingSpec_scoreNo_of_simRamBatteryDisplayCameraExternal_MemoryAndroid_versionPricecompanyInbuilt_memoryfast_chargingScreen_resolutionProcessorProcessor_nameRating_indexSpec_scoreFillNASpec_scoreFillMedian
ID
1365TCL 40R4.0575.0Dual Sim, 3G, 4G, 5G, VoLTE,4 GB RAM5000 mAh Battery6.6 inches50 MP + 2 MP + 2 MP Triple Rear & 8 MP Fro...Memory Card (Hybrid)1218,999TCL64 GB inbuilt15W Fast Charging720 x 1612 pxOcta CoreDimensity 700 5G075.075.0
1366TCL 50 XL NxtPaper 5G4.1080.0Dual Sim, 3G, 4G, VoLTE,8 GB RAM5000 mAh Battery6.8 inches50 MP + 2 MP Dual Rear & 16 MP Front CameraMemory Card (Hybrid)1424,990TCL128 GB inbuilt33W Fast Charging1200 x 2400 pxOcta CoreDimensity 7050080.080.0
1367TCL 50 XE NxtPaper 5G4.0080.0Dual Sim, 3G, 4G, 5G, VoLTE,6 GB RAM5000 mAh Battery6.6 inches50 MP + 2 MP Dual Rear & 16 MP Front CameraMemory Card Supported, upto 1 TB1323,990TCL256 GB inbuilt18W Fast Charging720 x 1612 pxOcta CoreDimensity 6080080.080.0
1368TCL 40 NxtPaper 5G4.5079.0Dual Sim, 3G, 4G, 5G, VoLTE,6 GB RAM5000 mAh Battery6.6 inches50 MP + 2 MP + 2 MP Triple Rear & 8 MP Fro...Memory Card Supported, upto 1 TB1322,499TCL256 GB inbuilt15W Fast Charging720 x 1612 pxOcta CoreDimensity 6020079.079.0
1369TCL Trifold4.6593.0Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G,12 GB RAM4600 mAh Battery10 inchesFoldable Display, Dual Display50 MP + 48 MP + 8 MP Triple Rear & 32 MP F...131,19,990TCL256 GB inbuilt67W Fast Charging1916 x 2160 pxOcta CoreSnapdragon 8 Gen2193.093.0
\n", "
" ], "text/plain": [ " Name Rating Spec_score \\\n", "ID \n", "1365 TCL 40R 4.05 75.0 \n", "1366 TCL 50 XL NxtPaper 5G 4.10 80.0 \n", "1367 TCL 50 XE NxtPaper 5G 4.00 80.0 \n", "1368 TCL 40 NxtPaper 5G 4.50 79.0 \n", "1369 TCL Trifold 4.65 93.0 \n", "\n", " No_of_sim Ram Battery \\\n", "ID \n", "1365 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 5000 mAh Battery \n", "1366 Dual Sim, 3G, 4G, VoLTE, 8 GB RAM 5000 mAh Battery \n", "1367 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery \n", "1368 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery \n", "1369 Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G, 12 GB RAM 4600 mAh Battery \n", "\n", " Display Camera \\\n", "ID \n", "1365 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear & 8 MP Fro... \n", "1366 6.8 inches 50 MP + 2 MP Dual Rear & 16 MP Front Camera \n", "1367 6.6 inches 50 MP + 2 MP Dual Rear & 16 MP Front Camera \n", "1368 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear & 8 MP Fro... \n", "1369 10 inches Foldable Display, Dual Display \n", "\n", " External_Memory Android_version \\\n", "ID \n", "1365 Memory Card (Hybrid) 12 \n", "1366 Memory Card (Hybrid) 14 \n", "1367 Memory Card Supported, upto 1 TB 13 \n", "1368 Memory Card Supported, upto 1 TB 13 \n", "1369 50 MP + 48 MP + 8 MP Triple Rear & 32 MP F... 13 \n", "\n", " Price company Inbuilt_memory fast_charging Screen_resolution \\\n", "ID \n", "1365 18,999 TCL 64 GB inbuilt 15W Fast Charging 720 x 1612 px \n", "1366 24,990 TCL 128 GB inbuilt 33W Fast Charging 1200 x 2400 px \n", "1367 23,990 TCL 256 GB inbuilt 18W Fast Charging 720 x 1612 px \n", "1368 22,499 TCL 256 GB inbuilt 15W Fast Charging 720 x 1612 px \n", "1369 1,19,990 TCL 256 GB inbuilt 67W Fast Charging 1916 x 2160 px \n", "\n", " Processor Processor_name Rating_index Spec_scoreFillNA \\\n", "ID \n", "1365 Octa Core Dimensity 700 5G 0 75.0 \n", "1366 Octa Core Dimensity 7050 0 80.0 \n", "1367 Octa Core Dimensity 6080 0 80.0 \n", "1368 Octa Core Dimensity 6020 0 79.0 \n", "1369 Octa Core Snapdragon 8 Gen2 1 93.0 \n", "\n", " Spec_scoreFillMedian \n", "ID \n", "1365 75.0 \n", "1366 80.0 \n", "1367 80.0 \n", "1368 79.0 \n", "1369 93.0 " ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fillna_df = df1.fillna(0)\n", "\n", "print(fillna_df.shape)\n", "\n", "print(fillna_df.isnull().any())\n", "\n", "# Замена пустых данных на 0\n", "df1[\"Spec_scoreFillNA\"] = df1[\"Spec_score\"].fillna(0)\n", "\n", "# Замена пустых данных на медиану\n", "df1[\"Spec_scoreFillMedian\"] = df1[\"Spec_score\"].fillna(df1[\"Spec_scoreFillNA\"].median())\n", "\n", "df1.tail()" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameRatingSpec_scoreNo_of_simRamBatteryDisplayCameraExternal_MemoryAndroid_version...companyInbuilt_memoryfast_chargingScreen_resolutionProcessorProcessor_namecompany_indexSpec_scoreFillNASpec_scoreFillMedianAndroid_versionCopy
ID
1365TCL 40R4.0575.0Dual Sim, 3G, 4G, 5G, VoLTE,4 GB RAM5000 mAh Battery6.6 inches50 MP + 2 MP + 2 MP Triple Rear & 8 MP Fro...Memory Card (Hybrid)12...TCL64 GB inbuilt15W Fast Charging720 x 1612 pxOcta CoreDimensity 700 5GTCL75.075.012
1366TCL 50 XL NxtPaper 5G4.1080.0Dual Sim, 3G, 4G, VoLTE,8 GB RAM5000 mAh Battery6.8 inches50 MP + 2 MP Dual Rear & 16 MP Front CameraMemory Card (Hybrid)14...TCL128 GB inbuilt33W Fast Charging1200 x 2400 pxOcta CoreDimensity 7050TCL80.080.014
1367TCL 50 XE NxtPaper 5G4.0080.0Dual Sim, 3G, 4G, 5G, VoLTE,6 GB RAM5000 mAh Battery6.6 inches50 MP + 2 MP Dual Rear & 16 MP Front CameraMemory Card Supported, upto 1 TB13...TCL256 GB inbuilt18W Fast Charging720 x 1612 pxOcta CoreDimensity 6080TCL80.080.013
1368TCL 40 NxtPaper 5G4.5079.0Dual Sim, 3G, 4G, 5G, VoLTE,6 GB RAM5000 mAh Battery6.6 inches50 MP + 2 MP + 2 MP Triple Rear & 8 MP Fro...Memory Card Supported, upto 1 TB13...TCL256 GB inbuilt15W Fast Charging720 x 1612 pxOcta CoreDimensity 6020TCL79.079.013
1369TCL Trifold4.6593.0Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G,12 GB RAM4600 mAh Battery10 inchesFoldable Display, Dual Display50 MP + 48 MP + 8 MP Triple Rear & 32 MP F...13...TCL256 GB inbuilt67W Fast Charging1916 x 2160 pxOcta CoreSnapdragon 8 Gen2TCL93.093.013
\n", "

5 rows × 21 columns

\n", "
" ], "text/plain": [ " Name Rating Spec_score \\\n", "ID \n", "1365 TCL 40R 4.05 75.0 \n", "1366 TCL 50 XL NxtPaper 5G 4.10 80.0 \n", "1367 TCL 50 XE NxtPaper 5G 4.00 80.0 \n", "1368 TCL 40 NxtPaper 5G 4.50 79.0 \n", "1369 TCL Trifold 4.65 93.0 \n", "\n", " No_of_sim Ram Battery \\\n", "ID \n", "1365 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 5000 mAh Battery \n", "1366 Dual Sim, 3G, 4G, VoLTE, 8 GB RAM 5000 mAh Battery \n", "1367 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery \n", "1368 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery \n", "1369 Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G, 12 GB RAM 4600 mAh Battery \n", "\n", " Display Camera \\\n", "ID \n", "1365 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear & 8 MP Fro... \n", "1366 6.8 inches 50 MP + 2 MP Dual Rear & 16 MP Front Camera \n", "1367 6.6 inches 50 MP + 2 MP Dual Rear & 16 MP Front Camera \n", "1368 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear & 8 MP Fro... \n", "1369 10 inches Foldable Display, Dual Display \n", "\n", " External_Memory Android_version ... \\\n", "ID ... \n", "1365 Memory Card (Hybrid) 12 ... \n", "1366 Memory Card (Hybrid) 14 ... \n", "1367 Memory Card Supported, upto 1 TB 13 ... \n", "1368 Memory Card Supported, upto 1 TB 13 ... \n", "1369 50 MP + 48 MP + 8 MP Triple Rear & 32 MP F... 13 ... \n", "\n", " company Inbuilt_memory fast_charging Screen_resolution \\\n", "ID \n", "1365 TCL 64 GB inbuilt 15W Fast Charging 720 x 1612 px \n", "1366 TCL 128 GB inbuilt 33W Fast Charging 1200 x 2400 px \n", "1367 TCL 256 GB inbuilt 18W Fast Charging 720 x 1612 px \n", "1368 TCL 256 GB inbuilt 15W Fast Charging 720 x 1612 px \n", "1369 TCL 256 GB inbuilt 67W Fast Charging 1916 x 2160 px \n", "\n", " Processor Processor_name company_index Spec_scoreFillNA \\\n", "ID \n", "1365 Octa Core Dimensity 700 5G TCL 75.0 \n", "1366 Octa Core Dimensity 7050 TCL 80.0 \n", "1367 Octa Core Dimensity 6080 TCL 80.0 \n", "1368 Octa Core Dimensity 6020 TCL 79.0 \n", "1369 Octa Core Snapdragon 8 Gen2 TCL 93.0 \n", "\n", " Spec_scoreFillMedian Android_versionCopy \n", "ID \n", "1365 75.0 12 \n", "1366 80.0 14 \n", "1367 80.0 13 \n", "1368 79.0 13 \n", "1369 93.0 13 \n", "\n", "[5 rows x 21 columns]" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1[\"Android_versionCopy\"] = df1[\"Android_version\"]\n", "\n", "# Замена данных сразу в DataFrame без копирования\n", "df1.fillna({\"Android_versionCopy\": 0}, inplace=True)\n", "\n", "df1.tail()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Удаление наблюдений с пропусками" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(814, 21)\n", "Name False\n", "Rating False\n", "Spec_score False\n", "No_of_sim False\n", "Ram False\n", "Battery False\n", "Display False\n", "Camera False\n", "External_Memory False\n", "Android_version False\n", "Price False\n", "company False\n", "Inbuilt_memory False\n", "fast_charging False\n", "Screen_resolution False\n", "Processor False\n", "Processor_name False\n", "company_index False\n", "dtype: bool\n" ] } ], "source": [ "dropna_df = df1.dropna()\n", "\n", "print(dropna_df.shape)\n", "\n", "print(fillna_df.isnull().any())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Создание выборок данных\n", "\n", "Библиотека scikit-learn\n", "\n", "https://scikit-learn.org/stable/index.html" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "# Функция для создания выборок\n", "from sklearn.model_selection import train_test_split\n", "\n", "\n", "def split_stratified_into_train_val_test(\n", " df_input,\n", " stratify_colname=\"y\",\n", " frac_train=0.6,\n", " frac_val=0.15,\n", " frac_test=0.25,\n", " random_state=None,\n", "):\n", " \"\"\"\n", " Splits a Pandas dataframe into three subsets (train, val, and test)\n", " following fractional ratios provided by the user, where each subset is\n", " stratified by the values in a specific column (that is, each subset has\n", " the same relative frequency of the values in the column). It performs this\n", " splitting by running train_test_split() twice.\n", "\n", " Parameters\n", " ----------\n", " df_input : Pandas dataframe\n", " Input dataframe to be split.\n", " stratify_colname : str\n", " The name of the column that will be used for stratification. Usually\n", " this column would be for the label.\n", " frac_train : float\n", " frac_val : float\n", " frac_test : float\n", " The ratios with which the dataframe will be split into train, val, and\n", " test data. The values should be expressed as float fractions and should\n", " sum to 1.0.\n", " random_state : int, None, or RandomStateInstance\n", " Value to be passed to train_test_split().\n", "\n", " Returns\n", " -------\n", " df_train, df_val, df_test :\n", " Dataframes containing the three splits.\n", " \"\"\"\n", "\n", " if frac_train + frac_val + frac_test != 1.0:\n", " raise ValueError(\n", " \"fractions %f, %f, %f do not add up to 1.0\"\n", " % (frac_train, frac_val, frac_test)\n", " )\n", "\n", " if stratify_colname not in df_input.columns:\n", " raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n", "\n", " X = df_input # Contains all columns.\n", " y = df_input[\n", " [stratify_colname]\n", " ] # Dataframe of just the column on which to stratify.\n", "\n", " # Split original dataframe into train and temp dataframes.\n", " df_train, df_temp, y_train, y_temp = train_test_split(\n", " X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n", " )\n", "\n", " # Split the temp dataframe into val and test dataframes.\n", " relative_frac_test = frac_test / (frac_val + frac_test)\n", " df_val, df_test, y_val, y_test = train_test_split(\n", " df_temp,\n", " y_temp,\n", " stratify=y_temp,\n", " test_size=relative_frac_test,\n", " random_state=random_state,\n", " )\n", "\n", " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n", "\n", " return df_train, df_val, df_test" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rating_index\n", "0 942\n", "1 428\n", "Name: count, dtype: int64\n", "Обучающая выборка: (822, 2)\n", "Rating_index\n", "0 565\n", "1 257\n", "Name: count, dtype: int64\n", "Контрольная выборка: (274, 2)\n", "Rating_index\n", "0 189\n", "1 85\n", "Name: count, dtype: int64\n", "Тестовая выборка: (274, 2)\n", "Rating_index\n", "0 188\n", "1 86\n", "Name: count, dtype: int64\n" ] } ], "source": [ "# Вывод распределения количества наблюдений по меткам (классам)\n", "print(df1.Rating_index.value_counts())\n", "\n", "data = df1[[\"Rating_index\", \"Spec_scoreFillMedian\"]].copy()\n", "\n", "df_train, df_val, df_test = split_stratified_into_train_val_test(\n", " data,\n", " stratify_colname=\"Rating_index\",\n", " frac_train=0.60,\n", " frac_val=0.20,\n", " frac_test=0.20,\n", ")\n", "\n", "print(\"Обучающая выборка: \", df_train.shape)\n", "print(df_train.Rating_index.value_counts())\n", "\n", "print(\"Контрольная выборка: \", df_val.shape)\n", "print(df_val.Rating_index.value_counts())\n", "\n", "print(\"Тестовая выборка: \", df_test.shape)\n", "print(df_test.Rating_index.value_counts())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Выборка с избытком (oversampling)\n", "\n", "https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/\n", "\n", "https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/\n", "\n", "Выборка с недостатком (undersampling)\n", "\n", "https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/\n", "\n", "Библиотека imbalanced-learn\n", "\n", "https://imbalanced-learn.org/stable/" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Обучающая выборка: (822, 2)\n", "Rating_index\n", "0 565\n", "1 257\n", "Name: count, dtype: int64\n", "Обучающая выборка после oversampling: (1127, 2)\n", "Rating_index\n", "0 565\n", "1 562\n", "Name: count, dtype: int64\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Rating_indexSpec_scoreFillMedian
0075.000000
1083.000000
2076.000000
3080.000000
4065.000000
.........
1122164.392639
1123163.805529
1124164.607184
1125164.688453
1126164.376356
\n", "

1127 rows × 2 columns

\n", "
" ], "text/plain": [ " Rating_index Spec_scoreFillMedian\n", "0 0 75.000000\n", "1 0 83.000000\n", "2 0 76.000000\n", "3 0 80.000000\n", "4 0 65.000000\n", "... ... ...\n", "1122 1 64.392639\n", "1123 1 63.805529\n", "1124 1 64.607184\n", "1125 1 64.688453\n", "1126 1 64.376356\n", "\n", "[1127 rows x 2 columns]" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from imblearn.over_sampling import ADASYN\n", "\n", "ada = ADASYN()\n", "\n", "print(\"Обучающая выборка: \", df_train.shape)\n", "print(df_train.Rating_index.value_counts())\n", "\n", "X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"Rating_index\"])\n", "df_train_adasyn = pd.DataFrame(X_resampled)\n", "\n", "print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n", "print(df_train_adasyn.Rating_index.value_counts())\n", "\n", "df_train_adasyn" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.6" } }, "nbformat": 4, "nbformat_minor": 2 }