328 lines
68 KiB
Plaintext
328 lines
68 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 27,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')\n",
|
||
|
"0 8212\n",
|
||
|
"1 8215\n",
|
||
|
"2 8216\n",
|
||
|
"3 8217\n",
|
||
|
"4 8218\n",
|
||
|
" ... \n",
|
||
|
"8031 19860\n",
|
||
|
"8032 19863\n",
|
||
|
"8033 19864\n",
|
||
|
"8034 19865\n",
|
||
|
"8035 19866\n",
|
||
|
"Name: Date_numeric, Length: 8036, dtype: int64\n",
|
||
|
"Зашумленные столбцы: []\n",
|
||
|
"Смещение: Open 1.086680\n",
|
||
|
"High 1.086383\n",
|
||
|
"Low 1.087102\n",
|
||
|
"Close 1.086685\n",
|
||
|
"Adj Close 1.213587\n",
|
||
|
"Volume 13.602510\n",
|
||
|
"Date_numeric 0.000505\n",
|
||
|
"dtype: float64\n",
|
||
|
"Сильно смещенные столбцы: ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']\n",
|
||
|
"Данные за последние 25 лет\n",
|
||
|
"Выбросы в столбце 'Open':\n",
|
||
|
"Series([], Name: Open, dtype: float64)\n",
|
||
|
"\n",
|
||
|
"Выбросы в столбце 'High':\n",
|
||
|
"Series([], Name: High, dtype: float64)\n",
|
||
|
"\n",
|
||
|
"Выбросы в столбце 'Low':\n",
|
||
|
"Series([], Name: Low, dtype: float64)\n",
|
||
|
"\n",
|
||
|
"Выбросы в столбце 'Close':\n",
|
||
|
"Series([], Name: Close, dtype: float64)\n",
|
||
|
"\n",
|
||
|
"Выбросы в столбце 'Adj Close':\n",
|
||
|
"7321 114.799438\n",
|
||
|
"7322 117.926170\n",
|
||
|
"7323 118.010414\n",
|
||
|
"7324 117.982330\n",
|
||
|
"7325 114.593483\n",
|
||
|
"7326 114.565414\n",
|
||
|
"7327 113.676071\n",
|
||
|
"Name: Adj Close, dtype: float64\n",
|
||
|
"\n",
|
||
|
"Выбросы в столбце 'Volume':\n",
|
||
|
"0 224358400\n",
|
||
|
"1 58732800\n",
|
||
|
"2 34777600\n",
|
||
|
"33 48320000\n",
|
||
|
"103 46131200\n",
|
||
|
" ... \n",
|
||
|
"6444 51851700\n",
|
||
|
"6544 62091100\n",
|
||
|
"6550 33210100\n",
|
||
|
"6639 45573000\n",
|
||
|
"8019 66610700\n",
|
||
|
"Name: Volume, Length: 451, dtype: int64\n",
|
||
|
"\n",
|
||
|
"Выбросы в столбце 'Date_numeric':\n",
|
||
|
"Series([], Name: Date_numeric, dtype: int64)\n",
|
||
|
"\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABKYAAAMWCAYAAADLc44dAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACiyElEQVR4nOzdd3gU5drH8d+mbUJIoaZQQkAkiPRm6E2pYqUJUkTxSBdFQVSKIHYFQco5CEeaimJDQRAp0jsoTUooinQSQofkef/gzZ4sSSCBJJPdfD/XtZfszLMz9z1Z5969d4rNGGMEAAAAAAAAZDMPqwMAAAAAAABA7kRjCgAAAAAAAJagMQUAAAAAAABL0JgCAAAAAACAJWhMAQAAAAAAwBI0pgAAAAAAAGAJGlMAAAAAAACwBI0pAAAAAAAAWILGFAAAAAAAACxBYwouy2azadiwYdmyrgULFqhSpUry9fWVzWZTbGxstqwXAJD5qB//U6JECXXt2vW2X9uqVavMDQgAcjDqB5A1aEwhhWnTpslmszk9ChcurIYNG2r+/PlWh3fHduzYoWHDhunAgQPpGn/q1Cm1bdtWfn5+Gj9+vKZPny5/f/+sDVLS9u3b1alTJxUpUkR2u13h4eHq2LGjtm/fnuXrBoDbQf1wlt31I2n7b9iwIdX5DRo00L333ptl6weA20X9cJbT6geQ1bysDgA514gRIxQZGSljjI4dO6Zp06apRYsW+uGHH1z6F9IdO3Zo+PDhatCggUqUKHHL8evXr1d8fLzeeOMNNWnSJOsDlDR37lx16NBB+fPnV/fu3RUZGakDBw5oypQp+uqrr/T555/rkUceyZZYACCjqB/XWVE/Mmr37t3y8OB3SgA5A/XjOleoH0BmojGFNDVv3lzVqlVzPO/evbtCQkI0e/Zsly4MGXX8+HFJUnBwcKYt8/z582n+6rFv3z49+eSTKlmypJYvX65ChQo55vXr109169bVk08+qW3btqlkyZKZFhMAZBbqx3XZXT9uh91uz7RlAcCdon5c5wr1A8hM/ESGdAsODpafn5+8vJz7mefPn9cLL7ygYsWKyW63q0yZMnrvvfdkjJEkXbx4UVFRUYqKitLFixcdrzt9+rTCwsJUq1YtJSQkSJK6du2qvHnzav/+/WratKn8/f0VHh6uESNGOJZ3M5s3b1bz5s0VGBiovHnzqnHjxlqzZo1j/rRp09SmTRtJUsOGDR2HCi9dujTV5TVo0EBdunSRJFWvXl02m83pWhxz5sxR1apV5efnp4IFC6pTp076+++/nZaRlNO+ffvUokULBQQEqGPHjmnm8O677+rChQuaPHmyU1NKkgoWLKhJkybp/PnzeueddxzThw0bJpvNpl27dqlt27YKDAxUgQIF1K9fP126dCnFOmbMmOGIO3/+/Grfvr0OHz6cIvd7771XO3bsUMOGDZUnTx4VKVLEab0AkB7Uj+ypH7cjtWtMbdu2TfXr15efn5+KFi2qkSNHaurUqbLZbKmehrJixQrVqFFDvr6+KlmypD777LNMjRFA7kX9yFn141a5xsbGytPTU2PHjnVMO3nypDw8PFSgQAGn7fncc88pNDT0jmOCe6AxhTTFxcXp5MmTOnHihLZv367nnntO586dU6dOnRxjjDFq3bq1PvzwQzVr1kwffPCBypQpo4EDB2rAgAGSJD8/P/33v//V3r17NWTIEMdre/Xqpbi4OE2bNk2enp6O6QkJCWrWrJlCQkL0zjvvqGrVqho6dKiGDh1603i3b9+uunXrauvWrXrppZf02muvKSYmRg0aNNDatWslSfXq1VPfvn0lSa+88oqmT5+u6dOnq2zZsqkuc8iQIerRo4ek64cWT58+Xc8++6yk60Wmbdu28vT01OjRo/XMM89o7ty5qlOnToqLE167dk1NmzZV4cKF9d577+mxxx5LM48ffvhBJUqUUN26dVOdX69ePZUoUUI//vhjinlt27bVpUuXNHr0aLVo0UJjx451xJ9k1KhR6ty5s0qXLq0PPvhA/fv31+LFi1WvXr0UcZ85c0bNmjVTxYoV9f777ysqKkovv/yyW5zrDyDrUD+sqR83bv8bH1evXr3la//++281bNhQ27dv1+DBg/X8889r5syZGjNmTKrj9+7dq8cff1z333+/3n//feXLl09du3bleogAbgv1w9r6cae5BgcH695779Xy5csdr1uxYoVsNptOnz6tHTt2OKb/9ttvaX7fQS5kgBtMnTrVSErxsNvtZtq0aU5jv/32WyPJjBw50mn6448/bmw2m9m7d69j2uDBg42Hh4dZvny5mTNnjpFkPvroI6fXdenSxUgyffr0cUxLTEw0LVu2ND4+PubEiROO6ZLM0KFDHc8ffvhh4+PjY/bt2+eYduTIERMQEGDq1avnmJa07iVLlmRoe6xfv94x7cqVK6Zw4cLm3nvvNRcvXnRMnzdvnpFkXn/99RQ5DRo06Jbrio2NNZLMQw89dNNxrVu3NpLM2bNnjTHGDB061EgyrVu3dhrXs2dPI8ls3brVGGPMgQMHjKenpxk1apTTuN9//914eXk5Ta9fv76RZD777DPHtMuXL5vQ0FDz2GOP3TIXALkP9SP17ZEd9SP5+m72KFeunNNrIiIiTJcuXRzP+/TpY2w2m9m8ebNj2qlTp0z+/PmNJBMTE+P0Wklm+fLljmnHjx83drvdvPDCC+mKGQCMoX6ktT2yu34kX9+N0ptrr169TEhIiOP5gAEDTL169UzhwoXNhAkTjDHX64rNZjNjxoxJV3xwfxwxhTSNHz9eixYt0qJFizRjxgw1bNhQTz/9tObOnesY89NPP8nT09PxK0CSF154QcYYpyNrhg0bpnLlyqlLly7q2bOn6tevn+J1SXr37u34t81mU+/evXXlyhX98ssvqY5PSEjQwoUL9fDDDztddyksLExPPPGEVqxYobNnz97WdkjNhg0bdPz4cfXs2VO+vr6O6S1btlRUVFSqRzM999xzt1xufHy8JCkgIOCm45Lm35hTr169nJ736dNH0vW/k3T9ouqJiYlq27at06/ooaGhKl26tJYsWeL0+rx58zr9QuXj46MaNWpo//79t8wFQO5F/UhbVtWP5JJv/+SPChUq3PK1CxYsUHR0tCpVquSYlj9//jRPAbnnnnucfvEuVKiQypQpQ50AcFuoH2nLjvqRlozkWrduXR07dky7d++WdP3IqHr16qlu3br67bffJF0/isoYwxFTcODi50hTjRo1nC4+2KFDB1WuXFm9e/dWq1at5OPjo4MHDyo8PDxFIyXp0NSDBw86pvn4+OjTTz9V9erV5evr67hexY08PDxSXNT77rvvlqQ0b7F64sQJXbhwQWXKlEkxr2zZskpMTNThw4dVrly59CV/C0l5pba+qKgorVixwmmal5eXihYtesvlJm3HpAZVWtJqYJUuXdrpealSpeTh4eHYbnv27JExJsW4JN7e3k7PixYtmuJvlC9fPm3btu3miQDI1agfacuq+pHcjds/Sb58+XTy5MlbxhcdHZ1i+l133ZXq+OLFi6e6njNnzqQzWgD4H+pH2rKjfqQlI7kmNZt+++03FS1aVJs3b9bIkSNVqFAhvffee455gYGBqlixYqbEB9fHEVNINw8PDzVs2FD//POP9uzZc1vL+PnnnyVJly5duu1luCK73Z6u23EHBQUpLCzslo2fbdu2qUiRIgoMDLzpuBsLb2Jiomw2mxYsWJDqr+mTJk1yGp/83PvkTDouBAkASagfty+99cMq1AkAWYn6cfusqh/h4eGKjIzU8uXLtXr1ahljFB0drbp16+rw4cM6ePCgfvvtN9WqVStH1zdkL94JyJBr165Jks6dOydJioiI0JEjR1Ic4bNr1y7H/CTbtm3TiBEj1K1bN1WuXFlPP/204uLiUqwjMTExxSkAf/75p6Trdw9KTaFChZQnTx7HIaM3xuLh4aFixYpJStmsuR1JeaW2vt27dzvlnVGtWrVSTExMil89kvz22286cOBAqrfMvbHY7t27V4mJiY7tVqpUKRljFBkZqSZNmqR43Hfffbc
|
||
|
"text/plain": [
|
||
|
"<Figure size 1200x800 with 7 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
},
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Просачивание данных: Высокая корреляция (1.00) между столбцами 'Date' и 'Date_numeric'\n",
|
||
|
"Просачивание данных: Высокая корреляция (1.00) между столбцами 'Open' и 'High'\n",
|
||
|
"Просачивание данных: Высокая корреляция (1.00) между столбцами 'Open' и 'Low'\n",
|
||
|
"Просачивание данных: Высокая корреляция (1.00) между столбцами 'Open' и 'Close'\n",
|
||
|
"Просачивание данных: Высокая корреляция (1.00) между столбцами 'Open' и 'Adj Close'\n",
|
||
|
"Просачивание данных: Высокая корреляция (1.00) между столбцами 'High' и 'Open'\n",
|
||
|
"Просачивание данных: Высокая корреляция (1.00) между столбцами 'High' и 'Low'\n",
|
||
|
"Просачивание данных: Высокая корреляция (1.00) между столбцами 'High' и 'Close'\n",
|
||
|
"Просачивание данных: Высокая корреляция (1.00) между столбцами 'High' и 'Adj Close'\n",
|
||
|
"Просачивание данных: Высокая корреляция (1.00) между столбцами 'Low' и 'Open'\n",
|
||
|
"Просачивание данных: Высокая корреляция (1.00) между столбцами 'Low' и 'High'\n",
|
||
|
"Просачивание данных: Высокая корреляция (1.00) между столбцами 'Low' и 'Close'\n",
|
||
|
"Просачивание данных: Высокая корреляция (1.00) между столбцами 'Low' и 'Adj Close'\n",
|
||
|
"Просачивание данных: Высокая корреляция (1.00) между столбцами 'Close' и 'Open'\n",
|
||
|
"Просачивание данных: Высокая корреляция (1.00) между столбцами 'Close' и 'High'\n",
|
||
|
"Просачивание данных: Высокая корреляция (1.00) между столбцами 'Close' и 'Low'\n",
|
||
|
"Просачивание данных: Высокая корреляция (1.00) между столбцами 'Close' и 'Adj Close'\n",
|
||
|
"Просачивание данных: Высокая корреляция (1.00) между столбцами 'Adj Close' и 'Open'\n",
|
||
|
"Просачивание данных: Высокая корреляция (1.00) между столбцами 'Adj Close' и 'High'\n",
|
||
|
"Просачивание данных: Высокая корреляция (1.00) между столбцами 'Adj Close' и 'Low'\n",
|
||
|
"Просачивание данных: Высокая корреляция (1.00) между столбцами 'Adj Close' и 'Close'\n",
|
||
|
"Просачивание данных: Высокая корреляция (1.00) между столбцами 'Date_numeric' и 'Date'\n",
|
||
|
"Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',\n",
|
||
|
" 'Date_numeric', 'log_close', 'log_volume', 'Close_binned'],\n",
|
||
|
" dtype='object')\n",
|
||
|
"Обучающая выборка: (4821, 11)\n",
|
||
|
"Close\n",
|
||
|
"0.765625 14\n",
|
||
|
"0.750000 12\n",
|
||
|
"0.789063 11\n",
|
||
|
"0.882813 10\n",
|
||
|
"0.703125 10\n",
|
||
|
" ..\n",
|
||
|
"91.949997 1\n",
|
||
|
"48.685001 1\n",
|
||
|
"103.870003 1\n",
|
||
|
"2.414063 1\n",
|
||
|
"4.280000 1\n",
|
||
|
"Name: count, Length: 3678, dtype: int64\n",
|
||
|
"Контрольная выборка: (1607, 11)\n",
|
||
|
"Close\n",
|
||
|
"0.750000 6\n",
|
||
|
"0.796875 5\n",
|
||
|
"3.320313 4\n",
|
||
|
"0.835938 4\n",
|
||
|
"0.601563 4\n",
|
||
|
" ..\n",
|
||
|
"0.414063 1\n",
|
||
|
"111.070000 1\n",
|
||
|
"11.790000 1\n",
|
||
|
"59.610001 1\n",
|
||
|
"2.472656 1\n",
|
||
|
"Name: count, Length: 1438, dtype: int64\n",
|
||
|
"Тестовая выборка: (1607, 11)\n",
|
||
|
"Close\n",
|
||
|
"0.750000 6\n",
|
||
|
"0.765625 6\n",
|
||
|
"3.000000 4\n",
|
||
|
"0.601563 4\n",
|
||
|
"0.707031 4\n",
|
||
|
" ..\n",
|
||
|
"98.599998 1\n",
|
||
|
"56.110001 1\n",
|
||
|
"0.621094 1\n",
|
||
|
"21.740000 1\n",
|
||
|
"98.000000 1\n",
|
||
|
"Name: count, Length: 1444, dtype: int64\n",
|
||
|
"Обучающая выборка после undersampling: (4773, 11)\n",
|
||
|
"Close\n",
|
||
|
"0.765625 14\n",
|
||
|
"0.750000 12\n",
|
||
|
"0.789063 11\n",
|
||
|
"0.882813 10\n",
|
||
|
"0.703125 10\n",
|
||
|
" ..\n",
|
||
|
"98.230003 1\n",
|
||
|
"27.315001 1\n",
|
||
|
"8.995000 1\n",
|
||
|
"81.930000 1\n",
|
||
|
"12.357500 1\n",
|
||
|
"Name: count, Length: 3641, dtype: int64\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"import numpy as np\n",
|
||
|
"import pandas as pd\n",
|
||
|
"from sklearn.feature_selection import mutual_info_regression\n",
|
||
|
"from sklearn.model_selection import train_test_split\n",
|
||
|
"from imblearn.over_sampling import ADASYN\n",
|
||
|
"from imblearn.under_sampling import RandomUnderSampler\n",
|
||
|
"import matplotlib.pyplot as plt\n",
|
||
|
"import seaborn as sns\n",
|
||
|
"\n",
|
||
|
"df = pd.read_csv(\"data/Coffe.csv\")\n",
|
||
|
"print(df.columns)\n",
|
||
|
"\n",
|
||
|
"df['Date'] = pd.to_datetime(df['Date'])\n",
|
||
|
"df['Date_numeric'] = (df['Date'] - pd.Timestamp('1970-01-01')).dt.days\n",
|
||
|
"print(df['Date_numeric'])\n",
|
||
|
"\n",
|
||
|
"noisy_features = []\n",
|
||
|
"for col in df.columns:\n",
|
||
|
" if df[col].isnull().sum() / len(df) > 0.1: \n",
|
||
|
" noisy_features.append(col)\n",
|
||
|
"print(f\"Зашумленные столбцы: {noisy_features}\")\n",
|
||
|
" \n",
|
||
|
"skewness = df.select_dtypes(include=[np.number]).skew()\n",
|
||
|
"print(f\"Смещение: {skewness}\")\n",
|
||
|
"\n",
|
||
|
"skewed_features = skewness[abs(skewness) > 1].index.tolist()\n",
|
||
|
"print(f\"Сильно смещенные столбцы: {skewed_features}\")\n",
|
||
|
"\n",
|
||
|
"for col in df.select_dtypes(include=['number']).columns:\n",
|
||
|
" if col == 'id':\n",
|
||
|
" continue\n",
|
||
|
" Q1 = df[col].quantile(0.25)\n",
|
||
|
" Q3 = df[col].quantile(0.75)\n",
|
||
|
" IQR = Q3 - Q1\n",
|
||
|
" lower_bound = Q1 - 1.5 * IQR\n",
|
||
|
" upper_bound = Q3 + 1.5 * IQR\n",
|
||
|
" outliers = df[col][(df[col] < lower_bound) | (df[col] > upper_bound)]\n",
|
||
|
" print(f\"Выбросы в столбце '{col}':\\n{outliers}\\n\")\n",
|
||
|
"\n",
|
||
|
"numeric_cols = df.select_dtypes(include=['number']).columns\n",
|
||
|
"numeric_cols = [col for col in numeric_cols if col != 'id']\n",
|
||
|
"\n",
|
||
|
"plt.figure(figsize=(12, 8))\n",
|
||
|
"\n",
|
||
|
"for i, col in enumerate(numeric_cols, 1):\n",
|
||
|
" plt.subplot(len(numeric_cols) // 3 + 1, 3, i) \n",
|
||
|
" sns.boxplot(data=df, x=col)\n",
|
||
|
" plt.title(f'Boxplot for {col}')\n",
|
||
|
"\n",
|
||
|
"plt.tight_layout()\n",
|
||
|
"plt.show()\n",
|
||
|
"\n",
|
||
|
"if len(df.columns) >= 2:\n",
|
||
|
" for col1 in df.columns:\n",
|
||
|
" for col2 in df.columns:\n",
|
||
|
" if col1 != col2:\n",
|
||
|
" correlation = df[col1].corr(df[col2])\n",
|
||
|
" if abs(correlation) > 0.9:\n",
|
||
|
" print(f\"Просачивание данных: Высокая корреляция ({correlation:.2f}) между столбцами '{col1}' и '{col2}'\")\n",
|
||
|
"\n",
|
||
|
"df['log_close'] = np.log(df['Close'] + 1)\n",
|
||
|
"df['log_volume'] = np.log(df['Volume'] + 1)\n",
|
||
|
"\n",
|
||
|
"def split_stratified_into_train_val_test(\n",
|
||
|
" df_input,\n",
|
||
|
" stratify_colname=\"y\",\n",
|
||
|
" frac_train=0.6,\n",
|
||
|
" frac_val=0.15,\n",
|
||
|
" frac_test=0.25,\n",
|
||
|
" random_state=None,\n",
|
||
|
"):\n",
|
||
|
" if frac_train + frac_val + frac_test != 1.0:\n",
|
||
|
" raise ValueError(\n",
|
||
|
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
|
||
|
" % (frac_train, frac_val, frac_test)\n",
|
||
|
" )\n",
|
||
|
"\n",
|
||
|
" if stratify_colname not in df_input.columns:\n",
|
||
|
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
|
||
|
"\n",
|
||
|
" X = df_input \n",
|
||
|
" y = df_input[\n",
|
||
|
" [stratify_colname]\n",
|
||
|
" ] \n",
|
||
|
"\n",
|
||
|
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
|
||
|
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
|
||
|
" )\n",
|
||
|
"\n",
|
||
|
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
|
||
|
" df_val, df_test, y_val, y_test = train_test_split(\n",
|
||
|
" df_temp,\n",
|
||
|
" y_temp,\n",
|
||
|
" stratify=y_temp,\n",
|
||
|
" test_size=relative_frac_test,\n",
|
||
|
" random_state=random_state,\n",
|
||
|
" )\n",
|
||
|
"\n",
|
||
|
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
|
||
|
"\n",
|
||
|
" return df_train, df_val, df_test\n",
|
||
|
"\n",
|
||
|
"bins = [df['Close'].min(), df['Close'].quantile(0.33), df['Close'].quantile(0.66), df['Close'].max()]\n",
|
||
|
"labels = ['Low', 'Medium', 'High']\n",
|
||
|
"df['Close_binned'] = pd.cut(df['Close'], bins=bins, labels=labels)\n",
|
||
|
"df = df.dropna()\n",
|
||
|
"# Now stratify using the binned values\n",
|
||
|
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
|
||
|
" df, stratify_colname=\"Close_binned\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
|
||
|
")\n",
|
||
|
"\n",
|
||
|
"print(df_train.columns) \n",
|
||
|
" \n",
|
||
|
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
||
|
"print(df_train.Close.value_counts()) \n",
|
||
|
"\n",
|
||
|
"print(\"Контрольная выборка: \", df_val.shape)\n",
|
||
|
"print(df_val.Close.value_counts())\n",
|
||
|
"\n",
|
||
|
"print(\"Тестовая выборка: \", df_test.shape)\n",
|
||
|
"print(df_test.Close.value_counts())\n",
|
||
|
"\n",
|
||
|
"rus = RandomUnderSampler(random_state=42)\n",
|
||
|
"X_resampled, y_resampled = rus.fit_resample(df_train, df_train[\"Close_binned\"])\n",
|
||
|
"df_train_rus = pd.DataFrame(X_resampled)\n",
|
||
|
"print(\"Обучающая выборка после undersampling: \", df_train_rus.shape)\n",
|
||
|
"print(df_train_rus.Close.value_counts())\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.12.5"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 2
|
||
|
}
|