lab_5_done
This commit is contained in:
parent
049a4ba858
commit
2a65236e3b
135
lab_5/lab5.ipynb
Normal file
@@ -0,0 +1,135 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "vscode": {
          "languageId": "plaintext"
        }
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "import matplotlib.pyplot as plt\n",
        "from sklearn.preprocessing import LabelEncoder\n",
        "from sklearn.cluster import AgglomerativeClustering\n",
        "from sklearn.cluster import KMeans\n",
        "from sklearn.metrics import silhouette_score\n",
        "from sklearn.decomposition import PCA\n",
        "from sklearn import metrics\n",
        "from imblearn.over_sampling import RandomOverSampler\n",
        "from imblearn.under_sampling import RandomUnderSampler\n",
        "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
        "from sklearn.metrics import ConfusionMatrixDisplay\n",
        "from sklearn.compose import ColumnTransformer\n",
        "from sklearn.pipeline import Pipeline\n",
        "from sklearn.impute import SimpleImputer\n",
        "from sklearn.linear_model import LinearRegression, LogisticRegression\n",
        "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier\n",
        "from sklearn.model_selection import train_test_split, GridSearchCV\n",
        "from sklearn.metrics import (\n",
        "    precision_score, recall_score, accuracy_score, roc_auc_score, f1_score,\n",
        "    matthews_corrcoef, cohen_kappa_score, confusion_matrix\n",
        ")\n",
        "from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
        "import numpy as np\n",
        "import featuretools as ft\n",
        "from sklearn.metrics import accuracy_score, classification_report\n",
        "\n",
        "# Function to apply oversampling\n",
        "def apply_oversampling(X, y):\n",
        "    oversampler = RandomOverSampler(random_state=42)\n",
        "    X_resampled, y_resampled = oversampler.fit_resample(X, y)\n",
        "    return X_resampled, y_resampled\n",
        "\n",
        "# Function to apply undersampling\n",
        "def apply_undersampling(X, y):\n",
        "    undersampler = RandomUnderSampler(random_state=42)\n",
        "    X_resampled, y_resampled = undersampler.fit_resample(X, y)\n",
        "    return X_resampled, y_resampled\n",
        "\n",
        "def split_stratified_into_train_val_test(\n",
        "    df_input,\n",
        "    stratify_colname=\"y\",\n",
        "    frac_train=0.6,\n",
        "    frac_val=0.15,\n",
        "    frac_test=0.25,\n",
        "    random_state=None,\n",
        "):\n",
        "    \"\"\"\n",
        "    Splits a Pandas dataframe into three subsets (train, val, and test)\n",
        "    following fractional ratios provided by the user, where each subset is\n",
        "    stratified by the values in a specific column (that is, each subset has\n",
        "    the same relative frequency of the values in the column). It performs this\n",
        "    splitting by running train_test_split() twice.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    df_input : Pandas dataframe\n",
        "        Input dataframe to be split.\n",
        "    stratify_colname : str\n",
        "        The name of the column that will be used for stratification. Usually\n",
        "        this column would be for the label.\n",
        "    frac_train : float\n",
        "    frac_val : float\n",
        "    frac_test : float\n",
        "        The ratios with which the dataframe will be split into train, val, and\n",
        "        test data. The values should be expressed as float fractions and should\n",
        "        sum to 1.0.\n",
        "    random_state : int, None, or RandomStateInstance\n",
        "        Value to be passed to train_test_split().\n",
        "\n",
        "    Returns\n",
        "    -------\n",
        "    df_train, df_val, df_test :\n",
        "        Dataframes containing the three splits.\n",
        "    \"\"\"\n",
        "\n",
        "    if frac_train + frac_val + frac_test != 1.0:\n",
        "        raise ValueError(\n",
        "            \"fractions %f, %f, %f do not add up to 1.0\"\n",
        "            % (frac_train, frac_val, frac_test)\n",
        "        )\n",
        "\n",
        "    if stratify_colname not in df_input.columns:\n",
        "        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
        "\n",
        "    X = df_input  # Contains all columns.\n",
        "    y = df_input[\n",
        "        [stratify_colname]\n",
        "    ]  # Dataframe of just the column on which to stratify.\n",
        "\n",
        "    # Split original dataframe into train and temp dataframes.\n",
        "    df_train, df_temp, y_train, y_temp = train_test_split(\n",
        "        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
        "    )\n",
        "\n",
        "    # Split the temp dataframe into val and test dataframes.\n",
        "    relative_frac_test = frac_test / (frac_val + frac_test)\n",
        "    df_val, df_test, y_val, y_test = train_test_split(\n",
        "        df_temp,\n",
        "        y_temp,\n",
        "        stratify=y_temp,\n",
        "        test_size=relative_frac_test,\n",
        "        random_state=random_state,\n",
        "    )\n",
        "\n",
        "    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
        "\n",
        "    return df_train, df_val, df_test\n",
        "\n",
        "\n",
        "df = pd.read_csv(\"../static/csv/AgeDataset-V1.csv\", nrows=10000)\n",
        "df.info()"
      ]
    }
  ],
  "metadata": {
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}
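A minimal usage sketch of how the helpers above could be chained in a follow-up cell; the stratification column "Gender" is an assumption about the AgeDataset-V1.csv schema and may need adjusting:

# Stratified 60/15/25 split on an assumed label column.
df_train, df_val, df_test = split_stratified_into_train_val_test(
    df,
    stratify_colname="Gender",  # assumed column name; replace with the real label
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=42,
)

# Rebalance only the training portion; validation and test stay untouched.
X_train = df_train.drop(columns=["Gender"])
y_train = df_train["Gender"]
X_train_bal, y_train_bal = apply_oversampling(X_train, y_train)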