Compare commits
No commits in common. "lab5-6" and "main" have entirely different histories.
4
.gitignore
vendored
4
.gitignore
vendored
@ -275,6 +275,4 @@ cython_debug/
|
||||
# JS
|
||||
node_modules/
|
||||
|
||||
test.csv
|
||||
описания_датасетов/.~lock.cars.odt#
|
||||
описания_датасетов/.~lock.houses.odt#
|
||||
test.csv
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
244
data/dollar.csv
244
data/dollar.csv
@ -1,244 +0,0 @@
|
||||
"my_date","my_value","bullet","bulletClass","label"
|
||||
"28.03.2023","76.5662","","",""
|
||||
"31.03.2023","77.0863","","",""
|
||||
"01.04.2023","77.3233","","",""
|
||||
"04.04.2023","77.9510","","",""
|
||||
"05.04.2023","79.3563","","",""
|
||||
"06.04.2023","79.4961","","",""
|
||||
"07.04.2023","80.6713","","",""
|
||||
"08.04.2023","82.3988","","",""
|
||||
"11.04.2023","81.7441","","",""
|
||||
"12.04.2023","82.1799","","",""
|
||||
"13.04.2023","82.0934","","",""
|
||||
"14.04.2023","81.6758","","",""
|
||||
"15.04.2023","81.5045","","",""
|
||||
"18.04.2023","81.6279","","",""
|
||||
"19.04.2023","81.6028","","",""
|
||||
"20.04.2023","81.6549","","",""
|
||||
"21.04.2023","81.6188","","",""
|
||||
"22.04.2023","81.4863","","",""
|
||||
"25.04.2023","81.2745","","",""
|
||||
"26.04.2023","81.5499","","",""
|
||||
"27.04.2023","81.6274","","",""
|
||||
"28.04.2023","81.5601","","",""
|
||||
"29.04.2023","80.5093","","",""
|
||||
"03.05.2023","79.9609","","",""
|
||||
"04.05.2023","79.3071","","",""
|
||||
"05.05.2023","78.6139","","",""
|
||||
"06.05.2023","76.8207","","",""
|
||||
"11.05.2023","76.6929","","",""
|
||||
"12.05.2023","75.8846","round","min-pulsating-bullet","мин"
|
||||
"13.05.2023","77.2041","","",""
|
||||
"16.05.2023","79.1004","","",""
|
||||
"17.05.2023","79.9798","","",""
|
||||
"18.05.2023","80.7642","","",""
|
||||
"19.05.2023","80.0366","","",""
|
||||
"20.05.2023","79.9093","","",""
|
||||
"23.05.2023","79.9379","","",""
|
||||
"24.05.2023","80.1665","","",""
|
||||
"25.05.2023","79.9669","","",""
|
||||
"26.05.2023","79.9841","","",""
|
||||
"27.05.2023","79.9667","","",""
|
||||
"30.05.2023","80.0555","","",""
|
||||
"31.05.2023","80.6872","","",""
|
||||
"01.06.2023","80.9942","","",""
|
||||
"02.06.2023","80.9657","","",""
|
||||
"03.06.2023","80.8756","","",""
|
||||
"06.06.2023","81.3294","","",""
|
||||
"07.06.2023","81.2502","","",""
|
||||
"08.06.2023","81.4581","","",""
|
||||
"09.06.2023","82.0930","","",""
|
||||
"10.06.2023","82.6417","","",""
|
||||
"14.06.2023","83.6405","","",""
|
||||
"15.06.2023","84.3249","","",""
|
||||
"16.06.2023","83.9611","","",""
|
||||
"17.06.2023","83.6498","","",""
|
||||
"20.06.2023","83.9866","","",""
|
||||
"21.06.2023","84.2336","","",""
|
||||
"22.06.2023","84.2467","","",""
|
||||
"23.06.2023","83.6077","","",""
|
||||
"24.06.2023","84.0793","","",""
|
||||
"27.06.2023","84.6642","","",""
|
||||
"28.06.2023","85.0504","","",""
|
||||
"29.06.2023","85.6192","","",""
|
||||
"30.06.2023","87.0341","","",""
|
||||
"01.07.2023","88.3844","","",""
|
||||
"04.07.2023","89.3255","","",""
|
||||
"05.07.2023","89.5450","","",""
|
||||
"06.07.2023","90.3380","","",""
|
||||
"07.07.2023","92.5695","","",""
|
||||
"08.07.2023","91.6879","","",""
|
||||
"11.07.2023","91.4931","","",""
|
||||
"12.07.2023","90.5045","","",""
|
||||
"13.07.2023","90.6253","","",""
|
||||
"14.07.2023","90.1757","","",""
|
||||
"15.07.2023","90.1190","","",""
|
||||
"18.07.2023","90.4217","","",""
|
||||
"19.07.2023","90.6906","","",""
|
||||
"20.07.2023","91.2046","","",""
|
||||
"21.07.2023","90.8545","","",""
|
||||
"22.07.2023","90.3846","","",""
|
||||
"25.07.2023","90.4890","","",""
|
||||
"26.07.2023","90.0945","","",""
|
||||
"27.07.2023","90.0468","","",""
|
||||
"28.07.2023","90.0225","","",""
|
||||
"29.07.2023","90.9783","","",""
|
||||
"01.08.2023","91.5923","","",""
|
||||
"02.08.2023","91.7755","","",""
|
||||
"03.08.2023","92.8410","","",""
|
||||
"04.08.2023","93.7792","","",""
|
||||
"05.08.2023","94.8076","","",""
|
||||
"08.08.2023","96.5668","","",""
|
||||
"09.08.2023","96.0755","","",""
|
||||
"10.08.2023","97.3999","","",""
|
||||
"11.08.2023","97.2794","","",""
|
||||
"12.08.2023","98.2066","","",""
|
||||
"15.08.2023","101.0399","","",""
|
||||
"16.08.2023","97.4217","","",""
|
||||
"17.08.2023","96.7045","","",""
|
||||
"18.08.2023","93.7460","","",""
|
||||
"19.08.2023","93.4047","","",""
|
||||
"22.08.2023","94.1424","","",""
|
||||
"23.08.2023","94.1185","","",""
|
||||
"24.08.2023","94.4421","","",""
|
||||
"25.08.2023","94.4007","","",""
|
||||
"26.08.2023","94.7117","","",""
|
||||
"29.08.2023","95.4717","","",""
|
||||
"30.08.2023","95.7070","","",""
|
||||
"31.08.2023","95.9283","","",""
|
||||
"01.09.2023","96.3344","","",""
|
||||
"02.09.2023","96.3411","","",""
|
||||
"05.09.2023","96.6199","","",""
|
||||
"06.09.2023","97.5383","","",""
|
||||
"07.09.2023","97.8439","","",""
|
||||
"08.09.2023","98.1961","","",""
|
||||
"09.09.2023","97.9241","","",""
|
||||
"12.09.2023","96.5083","","",""
|
||||
"13.09.2023","94.7035","","",""
|
||||
"14.09.2023","95.9794","","",""
|
||||
"15.09.2023","96.1609","","",""
|
||||
"16.09.2023","96.6338","","",""
|
||||
"19.09.2023","96.6472","","",""
|
||||
"20.09.2023","96.2236","","",""
|
||||
"21.09.2023","96.6172","","",""
|
||||
"22.09.2023","96.0762","","",""
|
||||
"23.09.2023","96.0419","","",""
|
||||
"26.09.2023","96.1456","","",""
|
||||
"27.09.2023","96.2378","","",""
|
||||
"28.09.2023","96.5000","","",""
|
||||
"29.09.2023","97.0018","","",""
|
||||
"30.09.2023","97.4147","","",""
|
||||
"03.10.2023","98.4785","","",""
|
||||
"04.10.2023","99.2677","","",""
|
||||
"05.10.2023","99.4555","","",""
|
||||
"06.10.2023","99.6762","","",""
|
||||
"07.10.2023","100.4911","","",""
|
||||
"10.10.2023","101.3598","round","max-pulsating-bullet","макс"
|
||||
"11.10.2023","99.9349","","",""
|
||||
"12.10.2023","99.9808","","",""
|
||||
"13.10.2023","96.9948","","",""
|
||||
"14.10.2023","97.3075","","",""
|
||||
"17.10.2023","97.2865","","",""
|
||||
"18.10.2023","97.3458","","",""
|
||||
"19.10.2023","97.3724","","",""
|
||||
"20.10.2023","97.3074","","",""
|
||||
"21.10.2023","95.9053","","",""
|
||||
"24.10.2023","94.7081","","",""
|
||||
"25.10.2023","93.5224","","",""
|
||||
"26.10.2023","93.1507","","",""
|
||||
"27.10.2023","93.5616","","",""
|
||||
"28.10.2023","93.2174","","",""
|
||||
"31.10.2023","93.2435","","",""
|
||||
"01.11.2023","92.0226","","",""
|
||||
"02.11.2023","93.2801","","",""
|
||||
"03.11.2023","93.1730","","",""
|
||||
"04.11.2023","93.0351","","",""
|
||||
"08.11.2023","92.4151","","",""
|
||||
"09.11.2023","92.1973","","",""
|
||||
"10.11.2023","91.9266","","",""
|
||||
"11.11.2023","92.0535","","",""
|
||||
"14.11.2023","92.1185","","",""
|
||||
"15.11.2023","91.2570","","",""
|
||||
"16.11.2023","89.4565","","",""
|
||||
"17.11.2023","88.9466","","",""
|
||||
"18.11.2023","89.1237","","",""
|
||||
"21.11.2023","88.4954","","",""
|
||||
"22.11.2023","87.8701","","",""
|
||||
"23.11.2023","88.1648","","",""
|
||||
"24.11.2023","88.1206","","",""
|
||||
"25.11.2023","88.8133","","",""
|
||||
"28.11.2023","88.7045","","",""
|
||||
"29.11.2023","88.6102","","",""
|
||||
"30.11.2023","88.8841","","",""
|
||||
"01.12.2023","88.5819","","",""
|
||||
"02.12.2023","89.7619","","",""
|
||||
"05.12.2023","90.6728","","",""
|
||||
"06.12.2023","91.5823","","",""
|
||||
"07.12.2023","92.7826","","",""
|
||||
"08.12.2023","92.5654","","",""
|
||||
"09.12.2023","91.6402","","",""
|
||||
"12.12.2023","90.9846","","",""
|
||||
"13.12.2023","90.2158","","",""
|
||||
"14.12.2023","89.8926","","",""
|
||||
"15.12.2023","89.6741","","",""
|
||||
"16.12.2023","89.6966","","",""
|
||||
"19.12.2023","90.4162","","",""
|
||||
"20.12.2023","90.0870","","",""
|
||||
"21.12.2023","90.4056","","",""
|
||||
"22.12.2023","91.7062","","",""
|
||||
"23.12.2023","91.9389","","",""
|
||||
"26.12.2023","91.9690","","",""
|
||||
"27.12.2023","91.7069","","",""
|
||||
"28.12.2023","91.7051","","",""
|
||||
"29.12.2023","90.3041","","",""
|
||||
"30.12.2023","89.6883","","",""
|
||||
"10.01.2024","90.4040","","",""
|
||||
"11.01.2024","89.3939","","",""
|
||||
"12.01.2024","88.7818","","",""
|
||||
"13.01.2024","88.1324","","",""
|
||||
"16.01.2024","87.6772","","",""
|
||||
"17.01.2024","87.6457","","",""
|
||||
"18.01.2024","88.3540","","",""
|
||||
"19.01.2024","88.6610","","",""
|
||||
"20.01.2024","88.5896","","",""
|
||||
"23.01.2024","87.9724","","",""
|
||||
"24.01.2024","87.9199","","",""
|
||||
"25.01.2024","88.2829","","",""
|
||||
"26.01.2024","88.6562","","",""
|
||||
"27.01.2024","89.5159","","",""
|
||||
"30.01.2024","89.6090","","",""
|
||||
"31.01.2024","89.2887","","",""
|
||||
"01.02.2024","89.6678","","",""
|
||||
"02.02.2024","90.2299","","",""
|
||||
"03.02.2024","90.6626","","",""
|
||||
"06.02.2024","91.2434","","",""
|
||||
"07.02.2024","90.6842","","",""
|
||||
"08.02.2024","91.1514","","",""
|
||||
"09.02.2024","91.2561","","",""
|
||||
"10.02.2024","90.8901","","",""
|
||||
"13.02.2024","91.0758","","",""
|
||||
"14.02.2024","91.2057","","",""
|
||||
"15.02.2024","91.4316","","",""
|
||||
"16.02.2024","91.8237","","",""
|
||||
"17.02.2024","92.5492","","",""
|
||||
"20.02.2024","92.4102","","",""
|
||||
"21.02.2024","92.3490","","",""
|
||||
"22.02.2024","92.4387","","",""
|
||||
"23.02.2024","92.7519","","",""
|
||||
"27.02.2024","92.6321","","",""
|
||||
"28.02.2024","92.0425","","",""
|
||||
"29.02.2024","91.8692","","",""
|
||||
"01.03.2024","90.8423","","",""
|
||||
"02.03.2024","91.3336","","",""
|
||||
"05.03.2024","91.3534","","",""
|
||||
"06.03.2024","91.1604","","",""
|
||||
"07.03.2024","90.3412","","",""
|
||||
"08.03.2024","90.7493","","",""
|
||||
"12.03.2024","90.6252","","",""
|
||||
"13.03.2024","90.8818","","",""
|
||||
"19.03.2024","91.9829","","",""
|
||||
"20.03.2024","92.2243","","",""
|
||||
"21.03.2024","92.6861","","",""
|
||||
"22.03.2024","91.9499","","",""
|
||||
"23.03.2024","92.6118","","",""
|
||||
"26.03.2024","92.7761","","",""
|
|
3756
data/ds_salaries.csv
3756
data/ds_salaries.csv
File diff suppressed because it is too large
Load Diff
21614
data/kc_house_data.csv
21614
data/kc_house_data.csv
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
90837
data/neo.csv
90837
data/neo.csv
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
@ -1,312 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Загрузка данных в DataFrame"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"df = pd.read_csv(\"../data/kc_house_data.csv\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Получение сведений о пропущенных данных"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(df.isnull().sum())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(df.isnull().any())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for i in df.columns:\n",
|
||||
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
|
||||
" if null_rate > 0:\n",
|
||||
" print(f\"{i} процент пустых значений: {null_rate:.2f}%\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Создание выборок данных"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def split_stratified_into_train_val_test(\n",
|
||||
" df_input,\n",
|
||||
" stratify_colname=\"y\",\n",
|
||||
" frac_train=0.6,\n",
|
||||
" frac_val=0.15,\n",
|
||||
" frac_test=0.25,\n",
|
||||
" random_state=None,\n",
|
||||
"):\n",
|
||||
" \"\"\"\n",
|
||||
" Splits a Pandas dataframe into three subsets (train, val, and test)\n",
|
||||
" following fractional ratios provided by the user, where each subset is\n",
|
||||
" stratified by the values in a specific column (that is, each subset has\n",
|
||||
" the same relative frequency of the values in the column). It performs this\n",
|
||||
" splitting by running train_test_split() twice.\n",
|
||||
"\n",
|
||||
" Parameters\n",
|
||||
" ----------\n",
|
||||
" df_input : Pandas dataframe\n",
|
||||
" Input dataframe to be split.\n",
|
||||
" stratify_colname : str\n",
|
||||
" The name of the column that will be used for stratification. Usually\n",
|
||||
" this column would be for the label.\n",
|
||||
" frac_train : float\n",
|
||||
" frac_val : float\n",
|
||||
" frac_test : float\n",
|
||||
" The ratios with which the dataframe will be split into train, val, and\n",
|
||||
" test data. The values should be expressed as float fractions and should\n",
|
||||
" sum to 1.0.\n",
|
||||
" random_state : int, None, or RandomStateInstance\n",
|
||||
" Value to be passed to train_test_split().\n",
|
||||
"\n",
|
||||
" Returns\n",
|
||||
" -------\n",
|
||||
" df_train, df_val, df_test :\n",
|
||||
" Dataframes containing the three splits.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" if frac_train + frac_val + frac_test != 1.0:\n",
|
||||
" raise ValueError(\n",
|
||||
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
|
||||
" % (frac_train, frac_val, frac_test)\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" if stratify_colname not in df_input.columns:\n",
|
||||
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
|
||||
"\n",
|
||||
" X = df_input # Contains all columns.\n",
|
||||
" y = df_input[\n",
|
||||
" [stratify_colname]\n",
|
||||
" ] # Dataframe of just the column on which to stratify.\n",
|
||||
"\n",
|
||||
" # Split original dataframe into train and temp dataframes.\n",
|
||||
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
|
||||
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Split the temp dataframe into val and test dataframes.\n",
|
||||
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
|
||||
" df_val, df_test, y_val, y_test = train_test_split(\n",
|
||||
" df_temp,\n",
|
||||
" y_temp,\n",
|
||||
" stratify=y_temp,\n",
|
||||
" test_size=relative_frac_test,\n",
|
||||
" random_state=random_state,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
|
||||
"\n",
|
||||
" return df_train, df_val, df_test"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[3 5 4 1 2]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(df.condition.unique())\n",
|
||||
"\n",
|
||||
"data = df[\n",
|
||||
" [\n",
|
||||
" \"price\",\n",
|
||||
" \"bedrooms\",\n",
|
||||
" \"bathrooms\",\n",
|
||||
" \"sqft_living\",\n",
|
||||
" \"sqft_lot\",\n",
|
||||
" \"floors\",\n",
|
||||
" \"view\",\n",
|
||||
" \"condition\",\n",
|
||||
" \"grade\",\n",
|
||||
" \"sqft_above\",\n",
|
||||
" \"sqft_basement\",\n",
|
||||
" \"yr_built\",\n",
|
||||
" \"yr_renovated\",\n",
|
||||
" \"zipcode\",\n",
|
||||
" \"lat\",\n",
|
||||
" \"long\",\n",
|
||||
" ]\n",
|
||||
"].copy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Обучающая выборка: (12967, 16)\n",
|
||||
"condition\n",
|
||||
"3 8418\n",
|
||||
"4 3407\n",
|
||||
"5 1021\n",
|
||||
"2 103\n",
|
||||
"1 18\n",
|
||||
"Name: count, dtype: int64\n",
|
||||
"Контрольная выборка: (4323, 16)\n",
|
||||
"condition\n",
|
||||
"3 2806\n",
|
||||
"4 1136\n",
|
||||
"5 340\n",
|
||||
"2 35\n",
|
||||
"1 6\n",
|
||||
"Name: count, dtype: int64\n",
|
||||
"Тестовая выборка: (4323, 16)\n",
|
||||
"condition\n",
|
||||
"3 2807\n",
|
||||
"4 1136\n",
|
||||
"5 340\n",
|
||||
"2 34\n",
|
||||
"1 6\n",
|
||||
"Name: count, dtype: int64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
|
||||
" data,\n",
|
||||
" stratify_colname=\"condition\",\n",
|
||||
" frac_train=0.60,\n",
|
||||
" frac_val=0.20,\n",
|
||||
" frac_test=0.20,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
||||
"print(df_train.condition.value_counts())\n",
|
||||
"\n",
|
||||
"print(\"Контрольная выборка: \", df_val.shape)\n",
|
||||
"print(df_val.condition.value_counts())\n",
|
||||
"\n",
|
||||
"print(\"Тестовая выборка: \", df_test.shape)\n",
|
||||
"print(df_test.condition.value_counts())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Обучающая выборка: (12967, 16)\n",
|
||||
"condition\n",
|
||||
"3 8418\n",
|
||||
"4 3407\n",
|
||||
"5 1021\n",
|
||||
"2 103\n",
|
||||
"1 18\n",
|
||||
"Name: count, dtype: int64\n",
|
||||
"Обучающая выборка после oversampling: (42073, 16)\n",
|
||||
"condition\n",
|
||||
"5 8464\n",
|
||||
"2 8421\n",
|
||||
"1 8420\n",
|
||||
"3 8418\n",
|
||||
"4 8350\n",
|
||||
"Name: count, dtype: int64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from imblearn.over_sampling import ADASYN\n",
|
||||
"\n",
|
||||
"ada = ADASYN()\n",
|
||||
"\n",
|
||||
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
||||
"print(df_train.condition.value_counts())\n",
|
||||
"\n",
|
||||
"X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"condition\"])\n",
|
||||
"df_train_adasyn = pd.DataFrame(X_resampled)\n",
|
||||
"\n",
|
||||
"print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
|
||||
"print(df_train_adasyn.condition.value_counts())"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -1,648 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Загрузка данных в DataFrame"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"df = pd.read_csv(\"../data/car_price_prediction.csv\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>ID</th>\n",
|
||||
" <th>Price</th>\n",
|
||||
" <th>Levy</th>\n",
|
||||
" <th>Manufacturer</th>\n",
|
||||
" <th>Model</th>\n",
|
||||
" <th>Prod_year</th>\n",
|
||||
" <th>Category</th>\n",
|
||||
" <th>Leather interior</th>\n",
|
||||
" <th>Fuel type</th>\n",
|
||||
" <th>Engine volume</th>\n",
|
||||
" <th>Mileage</th>\n",
|
||||
" <th>Cylinders</th>\n",
|
||||
" <th>Gear_box_type</th>\n",
|
||||
" <th>Drive_wheels</th>\n",
|
||||
" <th>Doors</th>\n",
|
||||
" <th>Wheel</th>\n",
|
||||
" <th>Color</th>\n",
|
||||
" <th>Airbags</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>45654403</td>\n",
|
||||
" <td>13328</td>\n",
|
||||
" <td>1399</td>\n",
|
||||
" <td>LEXUS</td>\n",
|
||||
" <td>RX 450</td>\n",
|
||||
" <td>2010</td>\n",
|
||||
" <td>Jeep</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>Hybrid</td>\n",
|
||||
" <td>3.5</td>\n",
|
||||
" <td>186005 km</td>\n",
|
||||
" <td>6.0</td>\n",
|
||||
" <td>Automatic</td>\n",
|
||||
" <td>4x4</td>\n",
|
||||
" <td>04-May</td>\n",
|
||||
" <td>Left wheel</td>\n",
|
||||
" <td>Silver</td>\n",
|
||||
" <td>12</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>44731507</td>\n",
|
||||
" <td>16621</td>\n",
|
||||
" <td>1018</td>\n",
|
||||
" <td>CHEVROLET</td>\n",
|
||||
" <td>Equinox</td>\n",
|
||||
" <td>2011</td>\n",
|
||||
" <td>Jeep</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>Petrol</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>192000 km</td>\n",
|
||||
" <td>6.0</td>\n",
|
||||
" <td>Tiptronic</td>\n",
|
||||
" <td>4x4</td>\n",
|
||||
" <td>04-May</td>\n",
|
||||
" <td>Left wheel</td>\n",
|
||||
" <td>Black</td>\n",
|
||||
" <td>8</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>45774419</td>\n",
|
||||
" <td>8467</td>\n",
|
||||
" <td>-</td>\n",
|
||||
" <td>HONDA</td>\n",
|
||||
" <td>FIT</td>\n",
|
||||
" <td>2006</td>\n",
|
||||
" <td>Hatchback</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>Petrol</td>\n",
|
||||
" <td>1.3</td>\n",
|
||||
" <td>200000 km</td>\n",
|
||||
" <td>4.0</td>\n",
|
||||
" <td>Variator</td>\n",
|
||||
" <td>Front</td>\n",
|
||||
" <td>04-May</td>\n",
|
||||
" <td>Right-hand drive</td>\n",
|
||||
" <td>Black</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>45769185</td>\n",
|
||||
" <td>3607</td>\n",
|
||||
" <td>862</td>\n",
|
||||
" <td>FORD</td>\n",
|
||||
" <td>Escape</td>\n",
|
||||
" <td>2011</td>\n",
|
||||
" <td>Jeep</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>Hybrid</td>\n",
|
||||
" <td>2.5</td>\n",
|
||||
" <td>168966 km</td>\n",
|
||||
" <td>4.0</td>\n",
|
||||
" <td>Automatic</td>\n",
|
||||
" <td>4x4</td>\n",
|
||||
" <td>04-May</td>\n",
|
||||
" <td>Left wheel</td>\n",
|
||||
" <td>White</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>45809263</td>\n",
|
||||
" <td>11726</td>\n",
|
||||
" <td>446</td>\n",
|
||||
" <td>HONDA</td>\n",
|
||||
" <td>FIT</td>\n",
|
||||
" <td>2014</td>\n",
|
||||
" <td>Hatchback</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>Petrol</td>\n",
|
||||
" <td>1.3</td>\n",
|
||||
" <td>91901 km</td>\n",
|
||||
" <td>4.0</td>\n",
|
||||
" <td>Automatic</td>\n",
|
||||
" <td>Front</td>\n",
|
||||
" <td>04-May</td>\n",
|
||||
" <td>Left wheel</td>\n",
|
||||
" <td>Silver</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" ID Price Levy Manufacturer Model Prod_year Category \\\n",
|
||||
"0 45654403 13328 1399 LEXUS RX 450 2010 Jeep \n",
|
||||
"1 44731507 16621 1018 CHEVROLET Equinox 2011 Jeep \n",
|
||||
"2 45774419 8467 - HONDA FIT 2006 Hatchback \n",
|
||||
"3 45769185 3607 862 FORD Escape 2011 Jeep \n",
|
||||
"4 45809263 11726 446 HONDA FIT 2014 Hatchback \n",
|
||||
"\n",
|
||||
" Leather interior Fuel type Engine volume Mileage Cylinders \\\n",
|
||||
"0 Yes Hybrid 3.5 186005 km 6.0 \n",
|
||||
"1 No Petrol 3 192000 km 6.0 \n",
|
||||
"2 No Petrol 1.3 200000 km 4.0 \n",
|
||||
"3 Yes Hybrid 2.5 168966 km 4.0 \n",
|
||||
"4 Yes Petrol 1.3 91901 km 4.0 \n",
|
||||
"\n",
|
||||
" Gear_box_type Drive_wheels Doors Wheel Color Airbags \n",
|
||||
"0 Automatic 4x4 04-May Left wheel Silver 12 \n",
|
||||
"1 Tiptronic 4x4 04-May Left wheel Black 8 \n",
|
||||
"2 Variator Front 04-May Right-hand drive Black 2 \n",
|
||||
"3 Automatic 4x4 04-May Left wheel White 0 \n",
|
||||
"4 Automatic Front 04-May Left wheel Silver 4 "
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Получение сведений о пропущенных данных"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"ID 0\n",
|
||||
"Price 0\n",
|
||||
"Levy 0\n",
|
||||
"Manufacturer 0\n",
|
||||
"Model 0\n",
|
||||
"Prod_year 0\n",
|
||||
"Category 0\n",
|
||||
"Leather interior 0\n",
|
||||
"Fuel type 0\n",
|
||||
"Engine volume 0\n",
|
||||
"Mileage 0\n",
|
||||
"Cylinders 0\n",
|
||||
"Gear_box_type 0\n",
|
||||
"Drive_wheels 0\n",
|
||||
"Doors 0\n",
|
||||
"Wheel 0\n",
|
||||
"Color 0\n",
|
||||
"Airbags 0\n",
|
||||
"dtype: int64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(df.isnull().sum())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"ID False\n",
|
||||
"Price False\n",
|
||||
"Levy False\n",
|
||||
"Manufacturer False\n",
|
||||
"Model False\n",
|
||||
"Prod_year False\n",
|
||||
"Category False\n",
|
||||
"Leather interior False\n",
|
||||
"Fuel type False\n",
|
||||
"Engine volume False\n",
|
||||
"Mileage False\n",
|
||||
"Cylinders False\n",
|
||||
"Gear_box_type False\n",
|
||||
"Drive_wheels False\n",
|
||||
"Doors False\n",
|
||||
"Wheel False\n",
|
||||
"Color False\n",
|
||||
"Airbags False\n",
|
||||
"dtype: bool\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(df.isnull().any())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['1399' '1018' '-' '862' '446' '891' '761' '751' '394' '1053' '1055'\n",
|
||||
" '1079' '810' '2386' '1850' '531' '586' '1249' '2455' '583' '1537' '1288'\n",
|
||||
" '915' '1750' '707' '1077' '1486' '1091' '650' '382' '1436' '1194' '503'\n",
|
||||
" '1017' '1104' '639' '629' '919' '781' '530' '640' '765' '777' '779' '934'\n",
|
||||
" '769' '645' '1185' '1324' '830' '1187' '1111' '760' '642' '1604' '1095'\n",
|
||||
" '966' '473' '1138' '1811' '988' '917' '1156' '687' '11714' '836' '1347'\n",
|
||||
" '2866' '1646' '259' '609' '697' '585' '475' '690' '308' '1823' '1361'\n",
|
||||
" '1273' '924' '584' '2078' '831' '1172' '893' '1872' '1885' '1266' '447'\n",
|
||||
" '2148' '1730' '730' '289' '502' '333' '1325' '247' '879' '1342' '1327'\n",
|
||||
" '1598' '1514' '1058' '738' '1935' '481' '1522' '1282' '456' '880' '900'\n",
|
||||
" '798' '1277' '442' '1051' '790' '1292' '1047' '528' '1211' '1493' '1793'\n",
|
||||
" '574' '930' '1998' '271' '706' '1481' '1677' '1661' '1286' '1408' '1090'\n",
|
||||
" '595' '1451' '1267' '993' '1714' '878' '641' '749' '1511' '603' '353'\n",
|
||||
" '877' '1236' '1141' '397' '784' '1024' '1357' '1301' '770' '922' '1438'\n",
|
||||
" '753' '607' '1363' '638' '490' '431' '565' '517' '833' '489' '1760' '986'\n",
|
||||
" '1841' '1620' '1360' '474' '1099' '978' '1624' '1946' '1268' '1307' '696'\n",
|
||||
" '649' '666' '2151' '551' '800' '971' '1323' '2377' '1845' '1083' '694'\n",
|
||||
" '463' '419' '345' '1515' '1505' '2056' '1203' '729' '460' '1356' '876'\n",
|
||||
" '911' '1190' '780' '448' '2410' '1848' '1148' '834' '1275' '1028' '1197'\n",
|
||||
" '724' '890' '1705' '505' '789' '2959' '518' '461' '1719' '2858' '3156'\n",
|
||||
" '2225' '2177' '1968' '1888' '1308' '2736' '1103' '557' '2195' '843'\n",
|
||||
" '1664' '723' '4508' '562' '501' '2018' '1076' '1202' '3301' '691' '1440'\n",
|
||||
" '1869' '1178' '418' '1820' '1413' '488' '1304' '363' '2108' '521' '1659'\n",
|
||||
" '87' '1411' '1528' '3292' '7058' '1578' '627' '874' '1996' '1488' '5679'\n",
|
||||
" '1234' '5603' '400' '889' '3268' '875' '949' '2265' '441' '742' '425'\n",
|
||||
" '2476' '2971' '614' '1816' '1375' '1405' '2297' '1062' '1113' '420'\n",
|
||||
" '2469' '658' '1951' '2670' '2578' '1995' '1032' '994' '1011' '2421'\n",
|
||||
" '1296' '155' '494' '426' '1086' '961' '2236' '1829' '764' '1834' '1054'\n",
|
||||
" '617' '1529' '2266' '637' '626' '1832' '1016' '2002' '1756' '746' '1285'\n",
|
||||
" '2690' '1118' '5332' '980' '1807' '970' '1228' '1195' '1132' '1768'\n",
|
||||
" '1384' '1080' '7063' '1817' '1452' '1975' '1368' '702' '1974' '1781'\n",
|
||||
" '1036' '944' '663' '364' '1539' '1345' '1680' '2209' '741' '1575' '695'\n",
|
||||
" '1317' '294' '1525' '424' '997' '1473' '1552' '2819' '2188' '1668' '3057'\n",
|
||||
" '799' '1502' '2606' '552' '1694' '1759' '1110' '399' '1470' '1174' '5877'\n",
|
||||
" '1474' '1688' '526' '686' '5908' '1107' '2070' '1468' '1246' '1685' '556'\n",
|
||||
" '1533' '1917' '1346' '732' '692' '579' '421' '362' '3505' '1855' '2711'\n",
|
||||
" '1586' '3739' '681' '1708' '2278' '1701' '722' '1482' '928' '827' '832'\n",
|
||||
" '527' '604' '173' '1341' '3329' '1553' '859' '167' '916' '828' '2082'\n",
|
||||
" '1176' '1108' '975' '3008' '1516' '2269' '1699' '2073' '1031' '1503'\n",
|
||||
" '2364' '1030' '1442' '5666' '2715' '1437' '2067' '1426' '2908' '1279'\n",
|
||||
" '866' '4283' '279' '2658' '3015' '2004' '1391' '4736' '748' '1466' '644'\n",
|
||||
" '683' '2705' '1297' '731' '1252' '2216' '3141' '3273' '1518' '1723'\n",
|
||||
" '1588' '972' '682' '1094' '668' '175' '967' '402' '3894' '1960' '1599'\n",
|
||||
" '2000' '2084' '1621' '714' '1109' '3989' '873' '1572' '1163' '1991'\n",
|
||||
" '1716' '1673' '2562' '2874' '965' '462' '605' '1948' '1736' '3518' '2054'\n",
|
||||
" '2467' '1681' '1272' '1205' '750' '2156' '2566' '115' '524' '3184' '676'\n",
|
||||
" '1678' '612' '328' '955' '1441' '1675' '3965' '2909' '623' '822' '867'\n",
|
||||
" '3025' '1993' '792' '636' '4057' '3743' '2337' '2570' '2418' '2472'\n",
|
||||
" '3910' '1662' '2123' '2628' '3208' '2080' '3699' '2913' '864' '2505'\n",
|
||||
" '870' '7536' '1924' '1671' '1064' '1836' '1866' '4741' '841' '1369'\n",
|
||||
" '5681' '3112' '1366' '2223' '1198' '1039' '3811' '3571' '1387' '1171'\n",
|
||||
" '1365' '1531' '1590' '11706' '2308' '4860' '1641' '1045' '1901']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(df[\"Levy\"].unique())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df[\"Levy\"] = df[\"Levy\"].replace({'-' : None})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Levy процент пустых значений: 30.25%\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for i in df.columns:\n",
|
||||
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
|
||||
" if null_rate > 0:\n",
|
||||
" print(f\"{i} процент пустых значений: {null_rate:.2f}%\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Заполнение пропущенных данных"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df.fillna({\"Levy\": 0}, inplace=True)\n",
|
||||
"for i in df.columns:\n",
|
||||
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
|
||||
" if null_rate > 0:\n",
|
||||
" print(f\"{i} процент пустых значений: {null_rate:.2f}%\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Создание выборок данных"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def split_stratified_into_train_val_test(\n",
|
||||
" df_input,\n",
|
||||
" stratify_colname=\"y\",\n",
|
||||
" frac_train=0.6,\n",
|
||||
" frac_val=0.15,\n",
|
||||
" frac_test=0.25,\n",
|
||||
" random_state=None,\n",
|
||||
"):\n",
|
||||
" \"\"\"\n",
|
||||
" Splits a Pandas dataframe into three subsets (train, val, and test)\n",
|
||||
" following fractional ratios provided by the user, where each subset is\n",
|
||||
" stratified by the values in a specific column (that is, each subset has\n",
|
||||
" the same relative frequency of the values in the column). It performs this\n",
|
||||
" splitting by running train_test_split() twice.\n",
|
||||
"\n",
|
||||
" Parameters\n",
|
||||
" ----------\n",
|
||||
" df_input : Pandas dataframe\n",
|
||||
" Input dataframe to be split.\n",
|
||||
" stratify_colname : str\n",
|
||||
" The name of the column that will be used for stratification. Usually\n",
|
||||
" this column would be for the label.\n",
|
||||
" frac_train : float\n",
|
||||
" frac_val : float\n",
|
||||
" frac_test : float\n",
|
||||
" The ratios with which the dataframe will be split into train, val, and\n",
|
||||
" test data. The values should be expressed as float fractions and should\n",
|
||||
" sum to 1.0.\n",
|
||||
" random_state : int, None, or RandomStateInstance\n",
|
||||
" Value to be passed to train_test_split().\n",
|
||||
"\n",
|
||||
" Returns\n",
|
||||
" -------\n",
|
||||
" df_train, df_val, df_test :\n",
|
||||
" Dataframes containing the three splits.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" if frac_train + frac_val + frac_test != 1.0:\n",
|
||||
" raise ValueError(\n",
|
||||
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
|
||||
" % (frac_train, frac_val, frac_test)\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" if stratify_colname not in df_input.columns:\n",
|
||||
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
|
||||
"\n",
|
||||
" X = df_input # Contains all columns.\n",
|
||||
" y = df_input[\n",
|
||||
" [stratify_colname]\n",
|
||||
" ] # Dataframe of just the column on which to stratify.\n",
|
||||
"\n",
|
||||
" # Split original dataframe into train and temp dataframes.\n",
|
||||
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
|
||||
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Split the temp dataframe into val and test dataframes.\n",
|
||||
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
|
||||
" df_val, df_test, y_val, y_test = train_test_split(\n",
|
||||
" df_temp,\n",
|
||||
" y_temp,\n",
|
||||
" stratify=y_temp,\n",
|
||||
" test_size=relative_frac_test,\n",
|
||||
" random_state=random_state,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
|
||||
"\n",
|
||||
" return df_train, df_val, df_test"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['Automatic' 'Tiptronic' 'Variator' 'Manual']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(df.Gear_box_type.unique())\n",
|
||||
"\n",
|
||||
"data = df[\n",
|
||||
" [\n",
|
||||
" \"Price\",\n",
|
||||
" \"Gear_box_type\",\n",
|
||||
" ]\n",
|
||||
"].copy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Обучающая выборка: (11542, 2)\n",
|
||||
"Gear_box_type\n",
|
||||
"Automatic 8108\n",
|
||||
"Tiptronic 1861\n",
|
||||
"Manual 1125\n",
|
||||
"Variator 448\n",
|
||||
"Name: count, dtype: int64\n",
|
||||
"Контрольная выборка: (3847, 2)\n",
|
||||
"Gear_box_type\n",
|
||||
"Automatic 2703\n",
|
||||
"Tiptronic 620\n",
|
||||
"Manual 375\n",
|
||||
"Variator 149\n",
|
||||
"Name: count, dtype: int64\n",
|
||||
"Тестовая выборка: (3848, 2)\n",
|
||||
"Gear_box_type\n",
|
||||
"Automatic 2703\n",
|
||||
"Tiptronic 621\n",
|
||||
"Manual 375\n",
|
||||
"Variator 149\n",
|
||||
"Name: count, dtype: int64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
|
||||
" data,\n",
|
||||
" stratify_colname=\"Gear_box_type\",\n",
|
||||
" frac_train=0.60,\n",
|
||||
" frac_val=0.20,\n",
|
||||
" frac_test=0.20,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
||||
"print(df_train.Gear_box_type.value_counts())\n",
|
||||
"\n",
|
||||
"print(\"Контрольная выборка: \", df_val.shape)\n",
|
||||
"print(df_val.Gear_box_type.value_counts())\n",
|
||||
"\n",
|
||||
"print(\"Тестовая выборка: \", df_test.shape)\n",
|
||||
"print(df_test.Gear_box_type.value_counts())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Выборка с избытком (oversampling)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Обучающая выборка: (11542, 2)\n",
|
||||
"Gear_box_type\n",
|
||||
"Automatic 8108\n",
|
||||
"Tiptronic 1861\n",
|
||||
"Manual 1125\n",
|
||||
"Variator 448\n",
|
||||
"Name: count, dtype: int64\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "ValueError",
|
||||
"evalue": "could not convert string to float: 'Automatic'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_9996\\2277749880.py\u001b[0m in \u001b[0;36m?\u001b[1;34m()\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Обучающая выборка: \"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mGear_box_type\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalue_counts\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 8\u001b[1;33m \u001b[0mX_resampled\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_resampled\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mada\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit_resample\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"Gear_box_type\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 9\u001b[0m \u001b[0mdf_train_adasyn\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_resampled\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 10\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Обучающая выборка после oversampling: \"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdf_train_adasyn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\imblearn\\base.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 204\u001b[0m \u001b[0my_resampled\u001b[0m \u001b[1;33m:\u001b[0m \u001b[0marray\u001b[0m\u001b[1;33m-\u001b[0m\u001b[0mlike\u001b[0m \u001b[0mof\u001b[0m \u001b[0mshape\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mn_samples_new\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 205\u001b[0m \u001b[0mThe\u001b[0m \u001b[0mcorresponding\u001b[0m \u001b[0mlabel\u001b[0m \u001b[0mof\u001b[0m \u001b[1;33m`\u001b[0m\u001b[0mX_resampled\u001b[0m\u001b[1;33m`\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 206\u001b[0m \"\"\"\n\u001b[0;32m 207\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_params\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 208\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit_resample\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||
"\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\imblearn\\base.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 102\u001b[0m \u001b[0mThe\u001b[0m \u001b[0mcorresponding\u001b[0m \u001b[0mlabel\u001b[0m \u001b[0mof\u001b[0m \u001b[1;33m`\u001b[0m\u001b[0mX_resampled\u001b[0m\u001b[1;33m`\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 103\u001b[0m \"\"\"\n\u001b[0;32m 104\u001b[0m \u001b[0mcheck_classification_targets\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 105\u001b[0m \u001b[0marrays_transformer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mArraysTransformer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 106\u001b[1;33m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbinarize_y\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_check_X_y\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 107\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 108\u001b[0m self.sampling_strategy_ = check_sampling_strategy(\n\u001b[0;32m 109\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msampling_strategy\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sampling_type\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\imblearn\\base.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, X, y, accept_sparse)\u001b[0m\n\u001b[0;32m 157\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_check_X_y\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 158\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0maccept_sparse\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 159\u001b[0m \u001b[0maccept_sparse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;34m\"csr\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"csc\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 160\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbinarize_y\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcheck_target_type\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindicate_one_vs_all\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 161\u001b[1;33m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_data\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreset\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0maccept_sparse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maccept_sparse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 162\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbinarize_y\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\sklearn\\base.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)\u001b[0m\n\u001b[0;32m 646\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;34m\"estimator\"\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mcheck_y_params\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 647\u001b[0m \u001b[0mcheck_y_params\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mdefault_check_params\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 648\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minput_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"y\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 649\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 650\u001b[1;33m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mcheck_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 651\u001b[0m \u001b[0mout\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 652\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 653\u001b[0m \u001b[1;32mif\u001b[0m 
\u001b[1;32mnot\u001b[0m \u001b[0mno_val_X\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mcheck_params\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"ensure_2d\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\sklearn\\utils\\validation.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[0;32m 1297\u001b[0m raise ValueError(\n\u001b[0;32m 1298\u001b[0m \u001b[1;33mf\"\u001b[0m\u001b[1;33m{\u001b[0m\u001b[0mestimator_name\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m requires y to be passed, but the target y is None\u001b[0m\u001b[1;33m\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1299\u001b[0m \u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1300\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1301\u001b[1;33m X = check_array(\n\u001b[0m\u001b[0;32m 1302\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1303\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maccept_sparse\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1304\u001b[0m \u001b[0maccept_large_sparse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maccept_large_sparse\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\sklearn\\utils\\validation.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[0;32m 1009\u001b[0m \u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1010\u001b[0m \u001b[0marray\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mxp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1011\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1012\u001b[0m \u001b[0marray\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_asarray_with_order\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mxp\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mxp\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1013\u001b[1;33m \u001b[1;32mexcept\u001b[0m \u001b[0mComplexWarning\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mcomplex_warning\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1014\u001b[0m raise ValueError(\n\u001b[0;32m 1015\u001b[0m \u001b[1;34m\"Complex data not 
supported\\n{}\\n\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1016\u001b[0m \u001b[1;33m)\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mcomplex_warning\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\sklearn\\utils\\_array_api.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(array, dtype, order, copy, xp, device)\u001b[0m\n\u001b[0;32m 741\u001b[0m \u001b[1;31m# Use NumPy API to support order\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 742\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mcopy\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 743\u001b[0m \u001b[0marray\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 744\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 745\u001b[1;33m \u001b[0marray\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 746\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 747\u001b[0m \u001b[1;31m# At this point array is a NumPy ndarray. We convert it to an array\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 748\u001b[0m \u001b[1;31m# container that is consistent with the input's namespace.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, dtype, copy)\u001b[0m\n\u001b[0;32m 2149\u001b[0m def __array__(\n\u001b[0;32m 2150\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mnpt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mDTypeLike\u001b[0m \u001b[1;33m|\u001b[0m \u001b[1;32mNone\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mbool_t\u001b[0m \u001b[1;33m|\u001b[0m \u001b[1;32mNone\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2151\u001b[0m \u001b[1;33m)\u001b[0m \u001b[1;33m->\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mndarray\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2152\u001b[0m \u001b[0mvalues\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_values\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2153\u001b[1;33m \u001b[0marr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2154\u001b[0m if (\n\u001b[0;32m 2155\u001b[0m \u001b[0mastype_is_view\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0marr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2156\u001b[0m \u001b[1;32mand\u001b[0m 
\u001b[0musing_copy_on_write\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[1;31mValueError\u001b[0m: could not convert string to float: 'Automatic'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from imblearn.over_sampling import ADASYN\n",
|
||||
"\n",
|
||||
"ada = ADASYN()\n",
|
||||
"\n",
|
||||
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
||||
"print(df_train.Gear_box_type.value_counts())\n",
|
||||
"\n",
|
||||
"X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"Gear_box_type\"])\n",
|
||||
"df_train_adasyn = pd.DataFrame(X_resampled)\n",
|
||||
"\n",
|
||||
"print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
|
||||
"print(df_train_adasyn.Gear_box_type.value_counts())"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
3418
notebooks/lab4.ipynb
3418
notebooks/lab4.ipynb
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
13326
notebooks/lab6_1.ipynb
13326
notebooks/lab6_1.ipynb
File diff suppressed because it is too large
Load Diff
@ -1,17 +0,0 @@
|
||||
import numpy as np
|
||||
from sklearn.base import BaseEstimator, TransformerMixin
|
||||
|
||||
|
||||
class CarsFeatures(BaseEstimator, TransformerMixin):
    """Transformer that adds an ``Age`` column computed from ``Prod. year``.

    Parameters
    ----------
    reference_year : int, default=2020
        Year from which ``Prod. year`` is subtracted to obtain ``Age``.
        The default reproduces the previously hard-coded value.
    """

    def __init__(self, reference_year: int = 2020):
        # Stored under the same name as the constructor argument so that
        # BaseEstimator.get_params()/clone() can round-trip it.
        self.reference_year = reference_year

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with an added ``Age`` column.

        ``X`` must support ``X["Prod. year"]`` and ``.copy()``
        (presumably a pandas DataFrame -- confirm against callers).
        """
        # Work on a copy: the previous version assigned the new column
        # directly on the caller's object, silently modifying the input.
        X = X.copy()
        X["Age"] = self.reference_year - X["Prod. year"]
        return X

    def get_feature_names_out(self, features_in):
        """Return the incoming feature names with ``"Age"`` appended."""
        return np.append(features_in, ["Age"], axis=0)
|
@ -1,100 +0,0 @@
|
||||
import math
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import numpy as np
|
||||
from pandas import DataFrame
|
||||
from sklearn import cluster
|
||||
from sklearn.metrics import silhouette_samples, silhouette_score
|
||||
|
||||
|
||||
def run_agglomerative(
    df: DataFrame, num_clusters: int | None = 2
) -> cluster.AgglomerativeClustering:
    """Fit an agglomerative clustering model on ``df`` and return the fitted model.

    ``compute_distances=True`` is always passed to the estimator.
    """
    return cluster.AgglomerativeClustering(
        compute_distances=True,
        n_clusters=num_clusters,
    ).fit(df)
|
||||
|
||||
|
||||
def get_linkage_matrix(model: cluster.AgglomerativeClustering) -> np.ndarray:
    """Assemble a float matrix from a fitted agglomerative model.

    The columns are, in order: the entries of ``model.children_``,
    ``model.distances_``, and a per-merge count of the samples contained in
    the merged node.
    """
    merges = model.children_  # type: ignore
    total_samples = len(model.labels_)
    sizes = np.zeros(merges.shape[0])

    for row, children in enumerate(merges):
        # An index below total_samples counts as a single sample; a larger
        # index refers to the merge recorded at (index - total_samples).
        sizes[row] = sum(
            1 if child < total_samples else sizes[child - total_samples]
            for child in children
        )

    return np.column_stack([merges, model.distances_, sizes]).astype(float)
|
||||
|
||||
|
||||
def print_cluster_result(
    df: DataFrame, clusters_num: int, labels: np.ndarray, separator: str = ", "
):
    """Print the members of each cluster id in ``0 .. clusters_num - 1``.

    For every cluster: a header line with its 1-based number and size, the
    ``df.index`` labels of its members joined by ``separator``, an empty
    line, and a dashed rule.
    """
    for number in range(1, clusters_num + 1):
        # Positions whose label equals the current (0-based) cluster id.
        members = np.where(labels == number - 1)[0]
        print(f"Cluster {number} ({len(members)}):")
        print(separator.join(str(df.index[pos]) for pos in members))
        print("")
        print("--------")
|
||||
|
||||
|
||||
def run_kmeans(
    df: DataFrame, num_clusters: int, random_state: int
) -> Tuple[np.ndarray, np.ndarray]:
    """Cluster ``df`` with KMeans and return ``(labels, cluster_centers)``."""
    model = cluster.KMeans(random_state=random_state, n_clusters=num_clusters)
    assignments = model.fit_predict(df)
    centers = model.cluster_centers_
    return assignments, centers
|
||||
|
||||
|
||||
def fit_kmeans(
    reduced_data: np.ndarray, num_clusters: int, random_state: int
) -> cluster.KMeans:
    """Fit a KMeans model on ``reduced_data`` and return the fitted estimator."""
    model = cluster.KMeans(random_state=random_state, n_clusters=num_clusters)
    model.fit(reduced_data)
    return model
|
||||
|
||||
|
||||
def _get_kmeans_range(
    df: DataFrame | np.ndarray, random_state: int
) -> Tuple[List, range]:
    """Fit one KMeans model for every k from 2 to ``int(sqrt(len(df)))``.

    Returns the list of fitted models together with the range of k values
    they were fitted for (same order).
    """
    upper_k = int(math.sqrt(len(df)))
    candidate_ks = range(2, upper_k + 1)

    fitted_models = []
    for k in candidate_ks:
        estimator = cluster.KMeans(n_clusters=k, random_state=random_state)
        fitted_models.append(estimator.fit(df))

    return fitted_models, candidate_ks
|
||||
|
||||
|
||||
def get_clusters_inertia(df: DataFrame, random_state: int) -> Tuple[List, range]:
    """Return the ``inertia_`` of each model from ``_get_kmeans_range`` and the k range."""
    models, ks = _get_kmeans_range(df, random_state)
    inertias = []
    for fitted in models:
        inertias.append(fitted.inertia_)
    return inertias, ks
|
||||
|
||||
|
||||
def get_clusters_silhouette_scores(
    df: DataFrame, random_state: int
) -> Tuple[List, range]:
    """Return the silhouette score (as ``float``) per fitted k, plus the k range."""
    models, ks = _get_kmeans_range(df, random_state)
    scores = []
    for fitted in models:
        scores.append(float(silhouette_score(df, fitted.labels_)))
    return scores, ks
|
||||
|
||||
|
||||
def get_clusters_silhouettes(df: np.ndarray, random_state: int) -> Dict:
    """Collect silhouette data for every model from ``_get_kmeans_range``.

    The result maps ``model.n_clusters`` to a tuple of
    ``(silhouette_score, silhouette_samples, model)``.
    """
    models, _ = _get_kmeans_range(df, random_state)
    return {
        fitted.n_clusters: (
            silhouette_score(df, fitted.labels_),
            silhouette_samples(df, fitted.labels_),
            fitted,
        )
        for fitted in models
    }
|
@ -1,242 +0,0 @@
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import matplotlib.cm as cm
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from pandas import DataFrame
|
||||
from scipy.cluster import hierarchy
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
|
||||
def draw_data_2d(
|
||||
df: DataFrame,
|
||||
col1: int,
|
||||
col2: int,
|
||||
y: List | None = None,
|
||||
classes: List | None = None,
|
||||
subplot: Any | None = None,
|
||||
):
|
||||
ax = None
|
||||
if subplot is None:
|
||||
_, ax = plt.subplots()
|
||||
else:
|
||||
ax = subplot
|
||||
scatter = ax.scatter(df[df.columns[col1]], df[df.columns[col2]], c=y)
|
||||
ax.set(xlabel=df.columns[col1], ylabel=df.columns[col2])
|
||||
if classes is not None:
|
||||
ax.legend(
|
||||
scatter.legend_elements()[0], classes, loc="lower right", title="Classes"
|
||||
)
|
||||
|
||||
|
||||
def draw_dendrogram(linkage_matrix: np.ndarray):
    """Draw a dendrogram of ``linkage_matrix`` using ``truncate_mode="level"`` and ``p=3``."""
    hierarchy.dendrogram(linkage_matrix, p=3, truncate_mode="level")
|
||||
|
||||
|
||||
def draw_cluster_results(
|
||||
df: DataFrame,
|
||||
col1: int,
|
||||
col2: int,
|
||||
labels: np.ndarray,
|
||||
cluster_centers: np.ndarray,
|
||||
subplot: Any | None = None,
|
||||
):
|
||||
ax = None
|
||||
if subplot is None:
|
||||
ax = plt
|
||||
else:
|
||||
ax = subplot
|
||||
|
||||
centroids = cluster_centers
|
||||
u_labels = np.unique(labels)
|
||||
|
||||
for i in u_labels:
|
||||
ax.scatter(
|
||||
df[labels == i][df.columns[col1]],
|
||||
df[labels == i][df.columns[col2]],
|
||||
label=i,
|
||||
)
|
||||
|
||||
ax.scatter(centroids[:, col1], centroids[:, col2], s=80, color="k")
|
||||
|
||||
|
||||
def draw_clusters(reduced_data: np.ndarray, kmeans: KMeans):
    """Plot k-means decision regions with the samples and centroids on top.

    A mesh covering the data (padded by 1 on every side) is classified
    with ``kmeans.predict`` and shown as a coloured image; the samples are
    drawn as small black dots and the centroids as white crosses.

    Args:
        reduced_data: 2-D array; only columns 0 and 1 are read.
        kmeans: Model providing ``predict`` and ``cluster_centers_``. It is
            called on two-feature points, so it is presumably fitted on
            ``reduced_data`` (verify against the caller).
    """
    mesh_step = 0.02

    xs = reduced_data[:, 0]
    ys = reduced_data[:, 1]
    x_min, x_max = xs.min() - 1, xs.max() + 1
    y_min, y_max = ys.min() - 1, ys.max() + 1
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, mesh_step),
        np.arange(y_min, y_max, mesh_step),
    )

    # Classify every mesh node, then fold the flat result back into the grid.
    mesh_points = np.c_[xx.ravel(), yy.ravel()]
    regions = kmeans.predict(mesh_points).reshape(xx.shape)

    plt.figure(1)
    plt.clf()
    image_extent = (xx.min(), xx.max(), yy.min(), yy.max())
    plt.imshow(
        regions,
        interpolation="nearest",
        extent=image_extent,
        cmap=plt.cm.Paired,  # type: ignore
        aspect="auto",
        origin="lower",
    )

    plt.plot(xs, ys, "k.", markersize=2)

    centers = kmeans.cluster_centers_
    cross_style = dict(marker="x", s=169, linewidths=3, color="w", zorder=10)
    plt.scatter(centers[:, 0], centers[:, 1], **cross_style)

    plt.title(
        "K-means clustering (PCA-reduced data)\n"
        "Centroids are marked with white cross"
    )
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    # Empty tick tuples hide the axis ticks.
    plt.xticks(())
    plt.yticks(())
|
||||
|
||||
|
||||
def _draw_cluster_scores(
    data: List,
    clusters_range: range,
    score_name: str,
    title: str,
):
    """Plot one score per cluster count as a blue dotted line chart.

    Args:
        data: Score values, one per entry of ``clusters_range``.
        clusters_range: Cluster counts used as x values.
        score_name: Label for the y axis.
        title: Title of the figure.
    """
    axis_label_style = {"fontsize": 8}

    plt.figure(figsize=(8, 5))
    plt.plot(clusters_range, data, "bo-")
    plt.xlabel("$k$", **axis_label_style)
    plt.ylabel(score_name, **axis_label_style)
    plt.title(title)
|
||||
|
||||
|
||||
def draw_elbow_diagram(inertias: List, clusters_range: range):
    """Plot inertia against the number of clusters (the elbow diagram).

    Args:
        inertias: Inertia values, one per entry of ``clusters_range``.
        clusters_range: Cluster counts used as x values.
    """
    _draw_cluster_scores(
        data=inertias,
        clusters_range=clusters_range,
        score_name="Inertia",
        title="The Elbow Diagram",
    )
|
||||
|
||||
|
||||
def draw_silhouettes_diagram(silhouette: List, clusters_range: range):
    """Plot the silhouette score against the number of clusters.

    Args:
        silhouette: Silhouette scores, one per entry of ``clusters_range``.
        clusters_range: Cluster counts used as x values.
    """
    _draw_cluster_scores(
        data=silhouette,
        clusters_range=clusters_range,
        score_name="Silhouette score",
        title="The Silhouette score",
    )
|
||||
|
||||
|
||||
def _draw_silhouette(
    ax: Any,
    reduced_data: np.ndarray,
    n_clusters: int,
    silhouette_avg: float,
    sample_silhouette_values: List,
    cluster_labels: List,
):
    """Draw the per-cluster silhouette bands on ``ax``.

    Each cluster gets a horizontal band of its sorted per-sample silhouette
    values, labelled with the cluster index; a dashed red vertical line
    marks ``silhouette_avg``.

    Args:
        ax: Axes-like object to draw on.
        reduced_data: Clustered samples; only its length is used, to size
            the y axis.
        n_clusters: Number of clusters; labels ``0 .. n_clusters - 1`` are
            drawn.
        silhouette_avg: Average silhouette score, drawn as a vertical line.
        sample_silhouette_values: Silhouette value of every sample. Lists
            and arrays are both accepted.
        cluster_labels: Cluster label of every sample, aligned with
            ``sample_silhouette_values``. Lists and arrays are both accepted.
    """
    # A plain list compared with `== i` collapses to a single bool instead of
    # an element-wise mask, so normalise both inputs to arrays before masking.
    sample_values = np.asarray(sample_silhouette_values)
    labels = np.asarray(cluster_labels)

    ax.set_xlim([-0.1, 1])
    # Room for every sample plus a 10-unit gap around each cluster band.
    ax.set_ylim([0, len(reduced_data) + (n_clusters + 1) * 10])

    y_lower = 10
    for i in range(n_clusters):
        # Sorted copy of the silhouette values of the samples in cluster i.
        ith_cluster_silhouette_values = np.sort(sample_values[labels == i])

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)  # type: ignore
        ax.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Put the cluster index at the vertical middle of its band.
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Leave a 10-unit gap before the next cluster's band.
        y_lower = y_upper + 10

    ax.set_title("The silhouette plot for the various clusters.")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")

    ax.axvline(x=silhouette_avg, color="red", linestyle="--")

    # Y ticks are cleared; the bands are labelled by the text above instead.
    ax.set_yticks([])
    ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
|
||||
|
||||
|
||||
def _draw_cluster_data(
    ax: Any,
    reduced_data: np.ndarray,
    n_clusters: int,
    cluster_labels: np.ndarray,
    cluster_centers: np.ndarray,
):
    """Scatter the clustered samples and their numbered centres on ``ax``.

    Args:
        ax: Axes-like object to draw on.
        reduced_data: 2-D array of samples; only columns 0 and 1 are read.
        n_clusters: Number of clusters, used to scale labels into the
            colour map.
        cluster_labels: Cluster label of every sample.
        cluster_centers: 2-D array of centres; only columns 0 and 1 are read.
    """
    # Map each label to a colour by its fraction of the cluster count.
    point_colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)  # type: ignore

    sample_style = dict(
        marker=".", s=30, lw=0, alpha=0.7, c=point_colors, edgecolor="k"
    )
    ax.scatter(reduced_data[:, 0], reduced_data[:, 1], **sample_style)

    # White discs act as a backdrop for the centre numbers drawn next.
    disc_style = dict(marker="o", c="white", alpha=1, s=200, edgecolor="k")
    ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1], **disc_style)

    for index, center in enumerate(cluster_centers):
        number_marker = "$%d$" % index
        ax.scatter(
            center[0],
            center[1],
            marker=number_marker,
            alpha=1,
            s=50,
            edgecolor="k",
        )

    ax.set_title("The visualization of the clustered data.")
    ax.set_xlabel("Feature space for the 1st feature")
    ax.set_ylabel("Feature space for the 2nd feature")
|
||||
|
||||
|
||||
def draw_silhouettes(reduced_data: np.ndarray, silhouettes: Dict):
    """Draw one silhouette-analysis figure per entry of ``silhouettes``.

    Every figure has two panels: the silhouette bands on the left and the
    clustered data with its centres on the right.

    Args:
        reduced_data: Clustered samples, passed on to both panel helpers.
        silhouettes: Mapping from cluster count to an indexable whose items
            0, 1 and 2 are the average silhouette score, the per-sample
            silhouette values and the fitted model (read for ``labels_``
            and ``cluster_centers_``).
    """
    for n_clusters, entry in silhouettes.items():
        fig, (left_ax, right_ax) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        silhouette_avg = entry[0]
        sample_silhouette_values = entry[1]
        model = entry[2]
        cluster_labels = model.labels_
        cluster_centers = model.cluster_centers_

        _draw_silhouette(
            left_ax,
            reduced_data,
            n_clusters,
            silhouette_avg,
            sample_silhouette_values,
            cluster_labels,
        )
        _draw_cluster_data(
            right_ax,
            reduced_data,
            n_clusters,
            cluster_labels,
            cluster_centers,
        )

        figure_title = (
            "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
            % n_clusters
        )
        plt.suptitle(figure_title, fontsize=14, fontweight="bold")
|
206
poetry.lock
generated
206
poetry.lock
generated
@ -467,17 +467,6 @@ files = [
|
||||
[package.dependencies]
|
||||
colorama = {version = "*", markers = "platform_system == \"Windows\""}
|
||||
|
||||
[[package]]
|
||||
name = "cloudpickle"
|
||||
version = "3.1.0"
|
||||
description = "Pickler class to extend the standard pickle.Pickler functionality"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "cloudpickle-3.1.0-py3-none-any.whl", hash = "sha256:fe11acda67f61aaaec473e3afe030feb131d78a43461b718185363384f1ba12e"},
|
||||
{file = "cloudpickle-3.1.0.tar.gz", hash = "sha256:81a929b6e3c7335c863c771d673d105f02efdb89dfaba0c90495d1c64796601b"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "colorama"
|
||||
version = "0.4.6"
|
||||
@ -672,17 +661,6 @@ files = [
|
||||
[package.extras]
|
||||
tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"]
|
||||
|
||||
[[package]]
|
||||
name = "farama-notifications"
|
||||
version = "0.0.4"
|
||||
description = "Notifications for all Farama Foundation maintained libraries."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "Farama-Notifications-0.0.4.tar.gz", hash = "sha256:13fceff2d14314cf80703c8266462ebf3733c7d165336eee998fc58e545efd18"},
|
||||
{file = "Farama_Notifications-0.0.4-py3-none-any.whl", hash = "sha256:14de931035a41961f7c056361dc7f980762a143d05791ef5794a751a2caf05ae"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fastjsonschema"
|
||||
version = "2.20.0"
|
||||
@ -697,41 +675,6 @@ files = [
|
||||
[package.extras]
|
||||
devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"]
|
||||
|
||||
[[package]]
|
||||
name = "featuretools"
|
||||
version = "1.31.0"
|
||||
description = "a framework for automated feature engineering"
|
||||
optional = false
|
||||
python-versions = "<4,>=3.9"
|
||||
files = [
|
||||
{file = "featuretools-1.31.0-py3-none-any.whl", hash = "sha256:87c94e9ae959c89acd83da96bd2583f3ef0f6daaa9639cbb6e46dbde2c742a18"},
|
||||
{file = "featuretools-1.31.0.tar.gz", hash = "sha256:01bfb17fcc1715b4c3623c7bc94a8982122c4a0fa03350ed478601bb81f90155"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
cloudpickle = ">=1.5.0"
|
||||
holidays = ">=0.17"
|
||||
numpy = ">=1.25.0"
|
||||
packaging = ">=20.0"
|
||||
pandas = ">=2.0.0"
|
||||
psutil = ">=5.7.0"
|
||||
scipy = ">=1.10.0"
|
||||
tqdm = ">=4.66.3"
|
||||
woodwork = ">=0.28.0"
|
||||
|
||||
[package.extras]
|
||||
autonormalize = ["autonormalize (>=2.0.1)"]
|
||||
complete = ["featuretools[dask,nlp,premium]"]
|
||||
dask = ["dask[dataframe] (>=2023.2.0)", "distributed (>=2023.2.0)"]
|
||||
dev = ["black[jupyter] (>=23.1.0)", "featuretools[dask,docs,test]", "pre-commit (>=2.20.0)", "ruff (>=0.1.6)"]
|
||||
docs = ["Sphinx (==5.1.1)", "autonormalize (>=2.0.1)", "click (>=7.0.0)", "featuretools[dask,test]", "ipython (==8.4.0)", "jupyter (==1.0.0)", "jupyter-client (>=8.0.2)", "matplotlib (==3.7.2)", "myst-parser (==0.18.0)", "nbconvert (==6.5.0)", "nbsphinx (==0.8.9)", "pydata-sphinx-theme (==0.9.0)", "sphinx-copybutton (==0.5.0)", "sphinx-inline-tabs (==2022.1.2b11)"]
|
||||
nlp = ["nlp-primitives (>=2.12.0)"]
|
||||
premium = ["premium-primitives (>=0.0.3)"]
|
||||
sklearn = ["featuretools-sklearn-transformer (>=1.0.0)"]
|
||||
sql = ["featuretools-sql (>=0.0.1)", "psycopg2-binary (>=2.9.3)"]
|
||||
test = ["boto3 (>=1.34.32)", "composeml (>=0.8.0)", "graphviz (>=0.8.4)", "moto[all] (>=5.0.0)", "pip (>=23.3.0)", "pyarrow (>=14.0.1)", "pympler (>=0.8)", "pytest (>=7.1.2)", "pytest-cov (>=3.0.0)", "pytest-timeout (>=2.1.0)", "pytest-xdist (>=2.5.0)", "smart-open (>=5.0.0)", "urllib3 (>=1.26.18)"]
|
||||
tsfresh = ["featuretools-tsfresh-primitives (>=1.0.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "flask"
|
||||
version = "3.0.3"
|
||||
@ -879,36 +822,6 @@ files = [
|
||||
{file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "gymnasium"
|
||||
version = "1.0.0"
|
||||
description = "A standard API for reinforcement learning and a diverse set of reference environments (formerly Gym)."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "gymnasium-1.0.0-py3-none-any.whl", hash = "sha256:b6f40e1e24c5bd419361e1a5b86a9117d2499baecc3a660d44dfff4c465393ad"},
|
||||
{file = "gymnasium-1.0.0.tar.gz", hash = "sha256:9d2b66f30c1b34fe3c2ce7fae65ecf365d0e9982d2b3d860235e773328a3b403"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
cloudpickle = ">=1.2.0"
|
||||
farama-notifications = ">=0.0.1"
|
||||
numpy = ">=1.21.0"
|
||||
typing-extensions = ">=4.3.0"
|
||||
|
||||
[package.extras]
|
||||
all = ["ale-py (>=0.9)", "box2d-py (==2.3.5)", "cython (<3)", "flax (>=0.5.0)", "imageio (>=2.14.1)", "jax (>=0.4.0)", "jaxlib (>=0.4.0)", "matplotlib (>=3.0)", "moviepy (>=1.0.0)", "mujoco (>=2.1.5)", "mujoco-py (>=2.1,<2.2)", "opencv-python (>=3.0)", "pygame (>=2.1.3)", "swig (==4.*)", "torch (>=1.0.0)"]
|
||||
atari = ["ale-py (>=0.9)"]
|
||||
box2d = ["box2d-py (==2.3.5)", "pygame (>=2.1.3)", "swig (==4.*)"]
|
||||
classic-control = ["pygame (>=2.1.3)", "pygame (>=2.1.3)"]
|
||||
jax = ["flax (>=0.5.0)", "jax (>=0.4.0)", "jaxlib (>=0.4.0)"]
|
||||
mujoco = ["imageio (>=2.14.1)", "mujoco (>=2.1.5)"]
|
||||
mujoco-py = ["cython (<3)", "cython (<3)", "mujoco-py (>=2.1,<2.2)", "mujoco-py (>=2.1,<2.2)"]
|
||||
other = ["matplotlib (>=3.0)", "moviepy (>=1.0.0)", "opencv-python (>=3.0)"]
|
||||
testing = ["dill (>=0.3.7)", "pytest (==7.1.3)", "scipy (>=1.7.3)"]
|
||||
torch = ["torch (>=1.0.0)"]
|
||||
toy-text = ["pygame (>=2.1.3)", "pygame (>=2.1.3)"]
|
||||
|
||||
[[package]]
|
||||
name = "h11"
|
||||
version = "0.14.0"
|
||||
@ -920,20 +833,6 @@ files = [
|
||||
{file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "holidays"
|
||||
version = "0.60"
|
||||
description = "World Holidays Framework"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
files = [
|
||||
{file = "holidays-0.60-py3-none-any.whl", hash = "sha256:d857949c5ee35655215a10c5a26e6a856bdc3beccc4fbbc8debef98dfba17b82"},
|
||||
{file = "holidays-0.60.tar.gz", hash = "sha256:495fc5123f5d92b92673237375eb8c15a03d21c647b089db509b7d9612267556"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
python-dateutil = "*"
|
||||
|
||||
[[package]]
|
||||
name = "httpcore"
|
||||
version = "1.0.5"
|
||||
@ -1015,25 +914,6 @@ examples = ["keras (>=2.4.3)", "matplotlib (>=3.1.2)", "pandas (>=1.0.5)", "seab
|
||||
optional = ["keras (>=2.4.3)", "pandas (>=1.0.5)", "tensorflow (>=2.4.3)"]
|
||||
tests = ["black (>=23.3.0)", "flake8 (>=3.8.2)", "keras (>=2.4.3)", "mypy (>=1.3.0)", "pandas (>=1.0.5)", "pytest (>=5.0.1)", "pytest-cov (>=2.9.0)", "tensorflow (>=2.4.3)"]
|
||||
|
||||
[[package]]
|
||||
name = "importlib-resources"
|
||||
version = "6.4.5"
|
||||
description = "Read resources from Python packages"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "importlib_resources-6.4.5-py3-none-any.whl", hash = "sha256:ac29d5f956f01d5e4bb63102a5a19957f1b9175e45649977264a1416783bb717"},
|
||||
{file = "importlib_resources-6.4.5.tar.gz", hash = "sha256:980862a1d16c9e147a59603677fa2aa5fd82b87f223b6cb870695bcfce830065"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"]
|
||||
cover = ["pytest-cov"]
|
||||
doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
|
||||
enabler = ["pytest-enabler (>=2.2)"]
|
||||
test = ["jaraco.test (>=5.4)", "pytest (>=6,!=8.1.*)", "zipp (>=3.17)"]
|
||||
type = ["pytest-mypy"]
|
||||
|
||||
[[package]]
|
||||
name = "ipykernel"
|
||||
version = "6.29.5"
|
||||
@ -2828,11 +2708,6 @@ files = [
|
||||
{file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"},
|
||||
{file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"},
|
||||
{file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"},
|
||||
{file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"},
|
||||
{file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"},
|
||||
{file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"},
|
||||
{file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"},
|
||||
{file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"},
|
||||
{file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"},
|
||||
{file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"},
|
||||
{file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"},
|
||||
@ -2906,27 +2781,6 @@ dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy (==1.10.0)", "pycodest
|
||||
doc = ["jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.13.1)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0,<=7.3.7)", "sphinx-design (>=0.4.0)"]
|
||||
test = ["Cython", "array-api-strict (>=2.0)", "asv", "gmpy2", "hypothesis (>=6.30)", "meson", "mpmath", "ninja", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"]
|
||||
|
||||
[[package]]
|
||||
name = "seaborn"
|
||||
version = "0.13.2"
|
||||
description = "Statistical data visualization"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "seaborn-0.13.2-py3-none-any.whl", hash = "sha256:636f8336facf092165e27924f223d3c62ca560b1f2bb5dff7ab7fad265361987"},
|
||||
{file = "seaborn-0.13.2.tar.gz", hash = "sha256:93e60a40988f4d65e9f4885df477e2fdaff6b73a9ded434c1ab356dd57eefff7"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
matplotlib = ">=3.4,<3.6.1 || >3.6.1"
|
||||
numpy = ">=1.20,<1.24.0 || >1.24.0"
|
||||
pandas = ">=1.2"
|
||||
|
||||
[package.extras]
|
||||
dev = ["flake8", "flit", "mypy", "pandas-stubs", "pre-commit", "pytest", "pytest-cov", "pytest-xdist"]
|
||||
docs = ["ipykernel", "nbconvert", "numpydoc", "pydata_sphinx_theme (==0.10.0rc2)", "pyyaml", "sphinx (<6.0.0)", "sphinx-copybutton", "sphinx-design", "sphinx-issues"]
|
||||
stats = ["scipy (>=1.7)", "statsmodels (>=0.12)"]
|
||||
|
||||
[[package]]
|
||||
name = "send2trash"
|
||||
version = "1.8.3"
|
||||
@ -3085,27 +2939,6 @@ files = [
|
||||
{file = "tornado-6.4.1.tar.gz", hash = "sha256:92d3ab53183d8c50f8204a51e6f91d18a15d5ef261e84d452800d4ff6fc504e9"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tqdm"
|
||||
version = "4.67.0"
|
||||
description = "Fast, Extensible Progress Meter"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "tqdm-4.67.0-py3-none-any.whl", hash = "sha256:0cd8af9d56911acab92182e88d763100d4788bdf421d251616040cc4d44863be"},
|
||||
{file = "tqdm-4.67.0.tar.gz", hash = "sha256:fe5a6f95e6fe0b9755e9469b77b9c3cf850048224ecaa8293d7d2d31f97d869a"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
colorama = {version = "*", markers = "platform_system == \"Windows\""}
|
||||
|
||||
[package.extras]
|
||||
dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"]
|
||||
discord = ["requests"]
|
||||
notebook = ["ipywidgets (>=6)"]
|
||||
slack = ["slack-sdk"]
|
||||
telegram = ["requests"]
|
||||
|
||||
[[package]]
|
||||
name = "traitlets"
|
||||
version = "5.14.3"
|
||||
@ -3132,17 +2965,6 @@ files = [
|
||||
{file = "types_python_dateutil-2.9.0.20240821-py3-none-any.whl", hash = "sha256:f5889fcb4e63ed4aaa379b44f93c32593d50b9a94c9a60a0c854d8cc3511cd57"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typing-extensions"
|
||||
version = "4.12.2"
|
||||
description = "Backported and Experimental Type Hints for Python 3.8+"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"},
|
||||
{file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tzdata"
|
||||
version = "2024.1"
|
||||
@ -3288,33 +3110,7 @@ files = [
|
||||
{file = "widgetsnbextension-4.0.13.tar.gz", hash = "sha256:ffcb67bc9febd10234a362795f643927f4e0c05d9342c727b65d2384f8feacb6"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "woodwork"
|
||||
version = "0.31.0"
|
||||
description = "a data typing library for machine learning"
|
||||
optional = false
|
||||
python-versions = "<4,>=3.9"
|
||||
files = [
|
||||
{file = "woodwork-0.31.0-py3-none-any.whl", hash = "sha256:5cb3370553b5f466f8c8599b1bf559584dc0b798cc1f2da26bbd7029d256c6f9"},
|
||||
{file = "woodwork-0.31.0.tar.gz", hash = "sha256:6ef82af1d5b6525b02efe6417c574c810cfdcc606cb266bd0d7fb17a1d066b67"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
importlib-resources = ">=5.10.0"
|
||||
numpy = ">=1.25.0"
|
||||
pandas = ">=2.0.0"
|
||||
python-dateutil = ">=2.8.2"
|
||||
scikit-learn = ">=1.1.0"
|
||||
scipy = ">=1.10.0"
|
||||
|
||||
[package.extras]
|
||||
complete = ["woodwork[updater]"]
|
||||
dev = ["click (>=8.1.7)", "pre-commit (>=2.20.0)", "ruff (>=0.1.6)", "woodwork[docs,test]"]
|
||||
docs = ["Sphinx (==5.1.1)", "ipython (==8.4.0)", "jupyter (==1.0.0)", "myst-parser (==0.18.0)", "nbconvert (==6.5.0)", "nbsphinx (==0.8.9)", "pyarrow (>=14.0.1)", "pydata-sphinx-theme (==0.9.0)", "sphinx-copybutton (==0.5.0)", "sphinx-inline-tabs (==2022.1.2b11)"]
|
||||
test = ["boto3 (>=1.34.32)", "moto[all] (>=5.0.0)", "pyarrow (>=14.0.1)", "pytest (>=7.0.1)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=2.1.0)", "smart-open (>=5.0.0)"]
|
||||
updater = ["alteryx-open-src-update-checker (>=3.1.0)"]
|
||||
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.12"
|
||||
content-hash = "76a7ecc0524f2a9a187e4242566cf9813bf2265aa4176553ea4f33c9a4c78f17"
|
||||
content-hash = "a7e3d516bde2d6e4173d8a9770fb5337a0c806dadaeda355084b262c1995f7ea"
|
||||
|
@ -17,15 +17,8 @@ apiflask = "^2.2.0"
|
||||
flask-cors = "^5.0.0"
|
||||
scikit-learn = "^1.5.2"
|
||||
imbalanced-learn = "^0.12.3"
|
||||
featuretools = "^1.31.0"
|
||||
seaborn = "^0.13.2"
|
||||
gymnasium = "^1.0.0"
|
||||
scipy = "^1.14.1"
|
||||
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
ipykernel = "^6.29.5"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
|
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user