724 lines
23 KiB
Plaintext
724 lines
23 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Загрузка данных в DataFrame"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 76,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"Index: 100 entries, Bytedance to iCapital Network\n",
|
|||
|
"Data columns (total 9 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 Valuation 100 non-null object\n",
|
|||
|
" 1 Country 100 non-null object\n",
|
|||
|
" 2 State 79 non-null object\n",
|
|||
|
" 3 City 99 non-null object\n",
|
|||
|
" 4 Industries 99 non-null object\n",
|
|||
|
" 5 FoundedYear 100 non-null int64 \n",
|
|||
|
" 6 Name of Founders 100 non-null object\n",
|
|||
|
" 7 TotalFunding 100 non-null object\n",
|
|||
|
" 8 Number of Employees 100 non-null object\n",
|
|||
|
"dtypes: int64(1), object(8)\n",
|
|||
|
"memory usage: 7.8+ KB\n",
|
|||
|
"(100, 10)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Valuation</th>\n",
|
|||
|
" <th>Country</th>\n",
|
|||
|
" <th>State</th>\n",
|
|||
|
" <th>City</th>\n",
|
|||
|
" <th>Industries</th>\n",
|
|||
|
" <th>FoundedYear</th>\n",
|
|||
|
" <th>Name of Founders</th>\n",
|
|||
|
" <th>TotalFunding</th>\n",
|
|||
|
" <th>Number of Employees</th>\n",
|
|||
|
" <th>IsChina</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>Company</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>Bytedance</th>\n",
|
|||
|
" <td>140.0</td>\n",
|
|||
|
" <td>China</td>\n",
|
|||
|
" <td>Beijing</td>\n",
|
|||
|
" <td>Beijing</td>\n",
|
|||
|
" <td>Content, Data Mining, Internet</td>\n",
|
|||
|
" <td>2012</td>\n",
|
|||
|
" <td>Yiming Zhang</td>\n",
|
|||
|
" <td>7440.00</td>\n",
|
|||
|
" <td>10.000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>SpaceX</th>\n",
|
|||
|
" <td>100.3</td>\n",
|
|||
|
" <td>United States</td>\n",
|
|||
|
" <td>California</td>\n",
|
|||
|
" <td>Hawthorne</td>\n",
|
|||
|
" <td>Aerospace, Manufacturing, Space Travel, Transp...</td>\n",
|
|||
|
" <td>2002</td>\n",
|
|||
|
" <td>Elon Musk</td>\n",
|
|||
|
" <td>383.02</td>\n",
|
|||
|
" <td>5,000-10,000</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>Stripe</th>\n",
|
|||
|
" <td>95.0</td>\n",
|
|||
|
" <td>United States</td>\n",
|
|||
|
" <td>California</td>\n",
|
|||
|
" <td>San Francisco</td>\n",
|
|||
|
" <td>Finance, FinTech, Mobile Payments, SaaS</td>\n",
|
|||
|
" <td>2010</td>\n",
|
|||
|
" <td>John Collison, Patrick Collison</td>\n",
|
|||
|
" <td>300.00</td>\n",
|
|||
|
" <td>1,000-5,000</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>Klarna</th>\n",
|
|||
|
" <td>45.6</td>\n",
|
|||
|
" <td>Sweden</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>Stockholm</td>\n",
|
|||
|
" <td>E-Commerce, FinTech, Payments, Shopping</td>\n",
|
|||
|
" <td>2005</td>\n",
|
|||
|
" <td>Niklas Adalberth, Sebastian Siemiatkowski, Vic...</td>\n",
|
|||
|
" <td>3471.72</td>\n",
|
|||
|
" <td>5,000-10,000</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>Epic Games</th>\n",
|
|||
|
" <td>42.0</td>\n",
|
|||
|
" <td>United States</td>\n",
|
|||
|
" <td>North Carolina</td>\n",
|
|||
|
" <td>Cary</td>\n",
|
|||
|
" <td>Developer Platform, Gaming, Software, Video Games</td>\n",
|
|||
|
" <td>1991</td>\n",
|
|||
|
" <td>Mark Rein, Tim Sweeney</td>\n",
|
|||
|
" <td>544.93</td>\n",
|
|||
|
" <td>1,000-5,000</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Valuation Country State City \\\n",
|
|||
|
"Company \n",
|
|||
|
"Bytedance 140.0 China Beijing Beijing \n",
|
|||
|
"SpaceX 100.3 United States California Hawthorne \n",
|
|||
|
"Stripe 95.0 United States California San Francisco \n",
|
|||
|
"Klarna 45.6 Sweden NaN Stockholm \n",
|
|||
|
"Epic Games 42.0 United States North Carolina Cary \n",
|
|||
|
"\n",
|
|||
|
" Industries FoundedYear \\\n",
|
|||
|
"Company \n",
|
|||
|
"Bytedance Content, Data Mining, Internet 2012 \n",
|
|||
|
"SpaceX Aerospace, Manufacturing, Space Travel, Transp... 2002 \n",
|
|||
|
"Stripe Finance, FinTech, Mobile Payments, SaaS 2010 \n",
|
|||
|
"Klarna E-Commerce, FinTech, Payments, Shopping 2005 \n",
|
|||
|
"Epic Games Developer Platform, Gaming, Software, Video Games 1991 \n",
|
|||
|
"\n",
|
|||
|
" Name of Founders TotalFunding \\\n",
|
|||
|
"Company \n",
|
|||
|
"Bytedance Yiming Zhang 7440.00 \n",
|
|||
|
"SpaceX Elon Musk 383.02 \n",
|
|||
|
"Stripe John Collison, Patrick Collison 300.00 \n",
|
|||
|
"Klarna Niklas Adalberth, Sebastian Siemiatkowski, Vic... 3471.72 \n",
|
|||
|
"Epic Games Mark Rein, Tim Sweeney 544.93 \n",
|
|||
|
"\n",
|
|||
|
" Number of Employees IsChina \n",
|
|||
|
"Company \n",
|
|||
|
"Bytedance 10.000 1 \n",
|
|||
|
"SpaceX 5,000-10,000 0 \n",
|
|||
|
"Stripe 1,000-5,000 0 \n",
|
|||
|
"Klarna 5,000-10,000 0 \n",
|
|||
|
"Epic Games 1,000-5,000 0 "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 76,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"df = pd.read_csv(\"data/unicorns.csv\", index_col=\"Company\", sep=';')\n",
|
|||
|
"\n",
|
|||
|
"df.info()\n",
|
|||
|
"\n",
|
|||
|
"df[\"Valuation\"] = df[\"Valuation\"].apply(\n",
|
|||
|
" lambda x: float(x[:-4].replace(',', '.')),\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"df[\"TotalFunding\"] = df[\"TotalFunding\"].apply(\n",
|
|||
|
" lambda x: float(x.strip(\"$M\").replace(\",\", \"\")),\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"df[\"IsChina\"] = [int(country == 'China') for country in df[\"Country\"]]\n",
|
|||
|
"print(df.shape)\n",
|
|||
|
"\n",
|
|||
|
"df.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Получение сведений о пропущенных данных"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Типы пропущенных данных:\n",
|
|||
|
"- None - представление пустых данных в Python\n",
|
|||
|
"- NaN - представление пустых данных в Pandas\n",
|
|||
|
"- '' - пустая строка"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 77,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Valuation 0\n",
|
|||
|
"Country 0\n",
|
|||
|
"State 21\n",
|
|||
|
"City 1\n",
|
|||
|
"Industries 1\n",
|
|||
|
"FoundedYear 0\n",
|
|||
|
"Name of Founders 0\n",
|
|||
|
"TotalFunding 0\n",
|
|||
|
"Number of Employees 0\n",
|
|||
|
"IsChina 0\n",
|
|||
|
"dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Valuation False\n",
|
|||
|
"Country False\n",
|
|||
|
"State True\n",
|
|||
|
"City True\n",
|
|||
|
"Industries True\n",
|
|||
|
"FoundedYear False\n",
|
|||
|
"Name of Founders False\n",
|
|||
|
"TotalFunding False\n",
|
|||
|
"Number of Employees False\n",
|
|||
|
"IsChina False\n",
|
|||
|
"dtype: bool\n",
|
|||
|
"\n",
|
|||
|
"State процент пустых значений: %21.00\n",
|
|||
|
"City процент пустых значений: %1.00\n",
|
|||
|
"Industries процент пустых значений: %1.00\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Количество пустых значений признаков\n",
|
|||
|
"print(df.isnull().sum())\n",
|
|||
|
"\n",
|
|||
|
"print()\n",
|
|||
|
"\n",
|
|||
|
"# Есть ли пустые значения признаков\n",
|
|||
|
"print(df.isnull().any())\n",
|
|||
|
"\n",
|
|||
|
"print()\n",
|
|||
|
"\n",
|
|||
|
"# Процент пустых значений признаков\n",
|
|||
|
"for i in df.columns:\n",
|
|||
|
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
|
|||
|
" if null_rate > 0:\n",
|
|||
|
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Заполнение пропущенных данных\n",
|
|||
|
"\n",
|
|||
|
"https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values\n",
|
|||
|
"\n",
|
|||
|
"https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 78,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# fillna_df = df.fillna(0)\n",
|
|||
|
"\n",
|
|||
|
"# print(fillna_df.shape)\n",
|
|||
|
"\n",
|
|||
|
"# print(fillna_df.isnull().any())\n",
|
|||
|
"\n",
|
|||
|
"# # Замена пустых данных на 0\n",
|
|||
|
"# df[\"AgeFillNA\"] = df[\"Age\"].fillna(0)\n",
|
|||
|
"\n",
|
|||
|
"# # Замена пустых данных на медиану\n",
|
|||
|
"# df[\"AgeFillMedian\"] = df[\"Age\"].fillna(df[\"Age\"].median())\n",
|
|||
|
"\n",
|
|||
|
"# df.tail()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 79,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# df[\"AgeCopy\"] = df[\"Age\"]\n",
|
|||
|
"\n",
|
|||
|
"# # Замена данных сразу в DataFrame без копирования\n",
|
|||
|
"# df.fillna({\"AgeCopy\": 0}, inplace=True)\n",
|
|||
|
"\n",
|
|||
|
"# df.tail()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Удаление наблюдений с пропусками"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 80,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"(100, 7)\n",
|
|||
|
"Valuation False\n",
|
|||
|
"Country False\n",
|
|||
|
"FoundedYear False\n",
|
|||
|
"Name of Founders False\n",
|
|||
|
"TotalFunding False\n",
|
|||
|
"Number of Employees False\n",
|
|||
|
"IsChina False\n",
|
|||
|
"dtype: bool\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df = df.dropna(axis=1)\n",
|
|||
|
"\n",
|
|||
|
"print(df.shape)\n",
|
|||
|
"\n",
|
|||
|
"print(df.isnull().any())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Создание выборок данных\n",
|
|||
|
"\n",
|
|||
|
"Библиотека scikit-learn\n",
|
|||
|
"\n",
|
|||
|
"https://scikit-learn.org/stable/index.html"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"<img src=\"assets/lec2-split.png\" width=\"600\" style=\"background-color: white\">"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 81,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Функция для создания выборок\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"def split_stratified_into_train_val_test(\n",
|
|||
|
" df_input,\n",
|
|||
|
" stratify_colname=\"y\",\n",
|
|||
|
" frac_train=0.6,\n",
|
|||
|
" frac_val=0.15,\n",
|
|||
|
" frac_test=0.25,\n",
|
|||
|
" random_state=None,\n",
|
|||
|
"):\n",
|
|||
|
" \"\"\"\n",
|
|||
|
" Splits a Pandas dataframe into three subsets (train, val, and test)\n",
|
|||
|
" following fractional ratios provided by the user, where each subset is\n",
|
|||
|
" stratified by the values in a specific column (that is, each subset has\n",
|
|||
|
" the same relative frequency of the values in the column). It performs this\n",
|
|||
|
" splitting by running train_test_split() twice.\n",
|
|||
|
"\n",
|
|||
|
" Parameters\n",
|
|||
|
" ----------\n",
|
|||
|
" df_input : Pandas dataframe\n",
|
|||
|
" Input dataframe to be split.\n",
|
|||
|
" stratify_colname : str\n",
|
|||
|
" The name of the column that will be used for stratification. Usually\n",
|
|||
|
" this column would be for the label.\n",
|
|||
|
" frac_train : float\n",
|
|||
|
" frac_val : float\n",
|
|||
|
" frac_test : float\n",
|
|||
|
" The ratios with which the dataframe will be split into train, val, and\n",
|
|||
|
" test data. The values should be expressed as float fractions and should\n",
|
|||
|
" sum to 1.0.\n",
|
|||
|
" random_state : int, None, or RandomStateInstance\n",
|
|||
|
" Value to be passed to train_test_split().\n",
|
|||
|
"\n",
|
|||
|
" Returns\n",
|
|||
|
" -------\n",
|
|||
|
" df_train, df_val, df_test :\n",
|
|||
|
" Dataframes containing the three splits.\n",
|
|||
|
" \"\"\"\n",
|
|||
|
"\n",
|
|||
|
" if frac_train + frac_val + frac_test != 1.0:\n",
|
|||
|
" raise ValueError(\n",
|
|||
|
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
|
|||
|
" % (frac_train, frac_val, frac_test)\n",
|
|||
|
" )\n",
|
|||
|
"\n",
|
|||
|
" if stratify_colname not in df_input.columns:\n",
|
|||
|
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
|
|||
|
"\n",
|
|||
|
" X = df_input # Contains all columns.\n",
|
|||
|
" y = df_input[\n",
|
|||
|
" [stratify_colname]\n",
|
|||
|
" ] # Dataframe of just the column on which to stratify.\n",
|
|||
|
"\n",
|
|||
|
" # Split original dataframe into train and temp dataframes.\n",
|
|||
|
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
|
|||
|
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
|
|||
|
" )\n",
|
|||
|
"\n",
|
|||
|
" # Split the temp dataframe into val and test dataframes.\n",
|
|||
|
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
|
|||
|
" df_val, df_test, y_val, y_test = train_test_split(\n",
|
|||
|
" df_temp,\n",
|
|||
|
" y_temp,\n",
|
|||
|
" stratify=y_temp,\n",
|
|||
|
" test_size=relative_frac_test,\n",
|
|||
|
" random_state=random_state,\n",
|
|||
|
" )\n",
|
|||
|
"\n",
|
|||
|
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
|
|||
|
"\n",
|
|||
|
" return df_train, df_val, df_test"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 82,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"IsChina\n",
|
|||
|
"0 86\n",
|
|||
|
"1 14\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Обучающая выборка: (60, 3)\n",
|
|||
|
"IsChina\n",
|
|||
|
"0 52\n",
|
|||
|
"1 8\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Контрольная выборка: (20, 3)\n",
|
|||
|
"IsChina\n",
|
|||
|
"0 17\n",
|
|||
|
"1 3\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Тестовая выборка: (20, 3)\n",
|
|||
|
"IsChina\n",
|
|||
|
"0 17\n",
|
|||
|
"1 3\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Вывод распределения количества наблюдений по меткам (классам)\n",
|
|||
|
"print(df.IsChina.value_counts())\n",
|
|||
|
"\n",
|
|||
|
"data = df[[\"TotalFunding\", \"Valuation\", \"IsChina\"]].copy()\n",
|
|||
|
"\n",
|
|||
|
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
|
|||
|
" data,\n",
|
|||
|
" stratify_colname=\"IsChina\",\n",
|
|||
|
" frac_train=0.60,\n",
|
|||
|
" frac_val=0.20,\n",
|
|||
|
" frac_test=0.20,\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
|||
|
"print(df_train.IsChina.value_counts())\n",
|
|||
|
"\n",
|
|||
|
"print(\"Контрольная выборка: \", df_val.shape)\n",
|
|||
|
"print(df_val.IsChina.value_counts())\n",
|
|||
|
"\n",
|
|||
|
"print(\"Тестовая выборка: \", df_test.shape)\n",
|
|||
|
"print(df_test.IsChina.value_counts())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Выборка с избытком (oversampling)\n",
|
|||
|
"\n",
|
|||
|
"https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/\n",
|
|||
|
"\n",
|
|||
|
"https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/\n",
|
|||
|
"\n",
|
|||
|
"Выборка с недостатком (undersampling)\n",
|
|||
|
"\n",
|
|||
|
"https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/\n",
|
|||
|
"\n",
|
|||
|
"Библиотека imbalanced-learn\n",
|
|||
|
"\n",
|
|||
|
"https://imbalanced-learn.org/stable/"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 83,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Обучающая выборка: (60, 3)\n",
|
|||
|
"IsChina\n",
|
|||
|
"0 52\n",
|
|||
|
"1 8\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Обучающая выборка после oversampling: (105, 3)\n",
|
|||
|
"IsChina\n",
|
|||
|
"1 53\n",
|
|||
|
"0 52\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>TotalFunding</th>\n",
|
|||
|
" <th>Valuation</th>\n",
|
|||
|
" <th>IsChina</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>208.000000</td>\n",
|
|||
|
" <td>9.500000</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>4044.200000</td>\n",
|
|||
|
" <td>15.500000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>447.120000</td>\n",
|
|||
|
" <td>6.500000</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>2121.000000</td>\n",
|
|||
|
" <td>6.600000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>2686.010000</td>\n",
|
|||
|
" <td>39.000000</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>100</th>\n",
|
|||
|
" <td>1306.334794</td>\n",
|
|||
|
" <td>14.179790</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>101</th>\n",
|
|||
|
" <td>1492.220325</td>\n",
|
|||
|
" <td>10.610196</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>102</th>\n",
|
|||
|
" <td>1125.438822</td>\n",
|
|||
|
" <td>16.887502</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>103</th>\n",
|
|||
|
" <td>1728.312129</td>\n",
|
|||
|
" <td>7.708914</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>104</th>\n",
|
|||
|
" <td>1785.708076</td>\n",
|
|||
|
" <td>7.004370</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>105 rows × 3 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" TotalFunding Valuation IsChina\n",
|
|||
|
"0 208.000000 9.500000 0\n",
|
|||
|
"1 4044.200000 15.500000 1\n",
|
|||
|
"2 447.120000 6.500000 0\n",
|
|||
|
"3 2121.000000 6.600000 1\n",
|
|||
|
"4 2686.010000 39.000000 0\n",
|
|||
|
".. ... ... ...\n",
|
|||
|
"100 1306.334794 14.179790 1\n",
|
|||
|
"101 1492.220325 10.610196 1\n",
|
|||
|
"102 1125.438822 16.887502 1\n",
|
|||
|
"103 1728.312129 7.708914 1\n",
|
|||
|
"104 1785.708076 7.004370 1\n",
|
|||
|
"\n",
|
|||
|
"[105 rows x 3 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 83,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from imblearn.over_sampling import ADASYN\n",
|
|||
|
"\n",
|
|||
|
"ada = ADASYN()\n",
|
|||
|
"\n",
|
|||
|
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
|||
|
"print(df_train.IsChina.value_counts())\n",
|
|||
|
"\n",
|
|||
|
"X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"IsChina\"]) # type: ignore\n",
|
|||
|
"df_train_adasyn = pd.DataFrame(X_resampled)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
|
|||
|
"print(df_train_adasyn.IsChina.value_counts())\n",
|
|||
|
"\n",
|
|||
|
"df_train_adasyn"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": ".venv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.4"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|