{ "cells": [ { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')\n", "0 8212\n", "1 8215\n", "2 8216\n", "3 8217\n", "4 8218\n", " ... \n", "8031 19860\n", "8032 19863\n", "8033 19864\n", "8034 19865\n", "8035 19866\n", "Name: Date_numeric, Length: 8036, dtype: int64\n", "Зашумленные столбцы: []\n", "Смещение: Open 1.086680\n", "High 1.086383\n", "Low 1.087102\n", "Close 1.086685\n", "Adj Close 1.213587\n", "Volume 13.602510\n", "Date_numeric 0.000505\n", "dtype: float64\n", "Сильно смещенные столбцы: ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']\n", "Данные за последние 25 лет\n", "Выбросы в столбце 'Open':\n", "Series([], Name: Open, dtype: float64)\n", "\n", "Выбросы в столбце 'High':\n", "Series([], Name: High, dtype: float64)\n", "\n", "Выбросы в столбце 'Low':\n", "Series([], Name: Low, dtype: float64)\n", "\n", "Выбросы в столбце 'Close':\n", "Series([], Name: Close, dtype: float64)\n", "\n", "Выбросы в столбце 'Adj Close':\n", "7321 114.799438\n", "7322 117.926170\n", "7323 118.010414\n", "7324 117.982330\n", "7325 114.593483\n", "7326 114.565414\n", "7327 113.676071\n", "Name: Adj Close, dtype: float64\n", "\n", "Выбросы в столбце 'Volume':\n", "0 224358400\n", "1 58732800\n", "2 34777600\n", "33 48320000\n", "103 46131200\n", " ... \n", "6444 51851700\n", "6544 62091100\n", "6550 33210100\n", "6639 45573000\n", "8019 66610700\n", "Name: Volume, Length: 451, dtype: int64\n", "\n", "Выбросы в столбце 'Date_numeric':\n", "Series([], Name: Date_numeric, dtype: int64)\n", "\n" ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Просачивание данных: Высокая корреляция (1.00) между столбцами 'Date' и 'Date_numeric'\n", "Просачивание данных: Высокая корреляция (1.00) между столбцами 'Open' и 'High'\n", "Просачивание данных: Высокая корреляция (1.00) между столбцами 'Open' и 'Low'\n", "Просачивание данных: Высокая корреляция (1.00) между столбцами 'Open' и 'Close'\n", "Просачивание данных: Высокая корреляция (1.00) между столбцами 'Open' и 'Adj Close'\n", "Просачивание данных: Высокая корреляция (1.00) между столбцами 'High' и 'Open'\n", "Просачивание данных: Высокая корреляция (1.00) между столбцами 'High' и 'Low'\n", "Просачивание данных: Высокая корреляция (1.00) между столбцами 'High' и 'Close'\n", "Просачивание данных: Высокая корреляция (1.00) между столбцами 'High' и 'Adj Close'\n", "Просачивание данных: Высокая корреляция (1.00) между столбцами 'Low' и 'Open'\n", "Просачивание данных: Высокая корреляция (1.00) между столбцами 'Low' и 'High'\n", "Просачивание данных: Высокая корреляция (1.00) между столбцами 'Low' и 'Close'\n", "Просачивание данных: Высокая корреляция (1.00) между столбцами 'Low' и 'Adj Close'\n", "Просачивание данных: Высокая корреляция (1.00) между столбцами 'Close' и 'Open'\n", "Просачивание данных: Высокая корреляция (1.00) между столбцами 'Close' и 'High'\n", "Просачивание данных: Высокая корреляция (1.00) между столбцами 'Close' и 'Low'\n", "Просачивание данных: Высокая корреляция (1.00) между столбцами 'Close' и 'Adj Close'\n", "Просачивание данных: Высокая корреляция (1.00) между столбцами 'Adj Close' и 'Open'\n", "Просачивание данных: Высокая корреляция (1.00) между столбцами 'Adj Close' и 'High'\n", "Просачивание данных: Высокая корреляция (1.00) между столбцами 'Adj Close' и 'Low'\n", "Просачивание данных: Высокая корреляция (1.00) между столбцами 'Adj Close' и 'Close'\n", "Просачивание данных: Высокая корреляция (1.00) между столбцами 'Date_numeric' и 'Date'\n", "Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',\n", " 'Date_numeric', 'log_close', 'log_volume', 'Close_binned'],\n", " dtype='object')\n", "Обучающая выборка: (4821, 11)\n", "Close\n", "0.765625 14\n", "0.750000 12\n", "0.789063 11\n", "0.882813 10\n", "0.703125 10\n", " ..\n", "91.949997 1\n", "48.685001 1\n", "103.870003 1\n", "2.414063 1\n", "4.280000 1\n", "Name: count, Length: 3678, dtype: int64\n", "Контрольная выборка: (1607, 11)\n", "Close\n", "0.750000 6\n", "0.796875 5\n", "3.320313 4\n", "0.835938 4\n", "0.601563 4\n", " ..\n", "0.414063 1\n", "111.070000 1\n", "11.790000 1\n", "59.610001 1\n", "2.472656 1\n", "Name: count, Length: 1438, dtype: int64\n", "Тестовая выборка: (1607, 11)\n", "Close\n", "0.750000 6\n", "0.765625 6\n", "3.000000 4\n", "0.601563 4\n", "0.707031 4\n", " ..\n", "98.599998 1\n", "56.110001 1\n", "0.621094 1\n", "21.740000 1\n", "98.000000 1\n", "Name: count, Length: 1444, dtype: int64\n", "Обучающая выборка после undersampling: (4773, 11)\n", "Close\n", "0.765625 14\n", "0.750000 12\n", "0.789063 11\n", "0.882813 10\n", "0.703125 10\n", " ..\n", "98.230003 1\n", "27.315001 1\n", "8.995000 1\n", "81.930000 1\n", "12.357500 1\n", "Name: count, Length: 3641, dtype: int64\n" ] } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "from sklearn.feature_selection import mutual_info_regression\n", "from sklearn.model_selection import train_test_split\n", "from imblearn.over_sampling import ADASYN\n", "from imblearn.under_sampling import RandomUnderSampler\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "df = pd.read_csv(\"data/Coffe.csv\")\n", "print(df.columns)\n", "\n", "df['Date'] = pd.to_datetime(df['Date'])\n", "df['Date_numeric'] = (df['Date'] - pd.Timestamp('1970-01-01')).dt.days\n", "print(df['Date_numeric'])\n", "\n", "noisy_features = []\n", "for col in df.columns:\n", " if df[col].isnull().sum() / len(df) > 0.1: \n", " noisy_features.append(col)\n", "print(f\"Зашумленные столбцы: {noisy_features}\")\n", " \n", "skewness = df.select_dtypes(include=[np.number]).skew()\n", "print(f\"Смещение: {skewness}\")\n", "\n", "skewed_features = skewness[abs(skewness) > 1].index.tolist()\n", "print(f\"Сильно смещенные столбцы: {skewed_features}\")\n", "\n", "for col in df.select_dtypes(include=['number']).columns:\n", " if col == 'id':\n", " continue\n", " Q1 = df[col].quantile(0.25)\n", " Q3 = df[col].quantile(0.75)\n", " IQR = Q3 - Q1\n", " lower_bound = Q1 - 1.5 * IQR\n", " upper_bound = Q3 + 1.5 * IQR\n", " outliers = df[col][(df[col] < lower_bound) | (df[col] > upper_bound)]\n", " print(f\"Выбросы в столбце '{col}':\\n{outliers}\\n\")\n", "\n", "numeric_cols = df.select_dtypes(include=['number']).columns\n", "numeric_cols = [col for col in numeric_cols if col != 'id']\n", "\n", "plt.figure(figsize=(12, 8))\n", "\n", "for i, col in enumerate(numeric_cols, 1):\n", " plt.subplot(len(numeric_cols) // 3 + 1, 3, i) \n", " sns.boxplot(data=df, x=col)\n", " plt.title(f'Boxplot for {col}')\n", "\n", "plt.tight_layout()\n", "plt.show()\n", "\n", "if len(df.columns) >= 2:\n", " for col1 in df.columns:\n", " for col2 in df.columns:\n", " if col1 != col2:\n", " correlation = df[col1].corr(df[col2])\n", " if abs(correlation) > 0.9:\n", " print(f\"Просачивание данных: Высокая корреляция ({correlation:.2f}) между столбцами '{col1}' и '{col2}'\")\n", "\n", "df['log_close'] = np.log(df['Close'] + 1)\n", "df['log_volume'] = np.log(df['Volume'] + 1)\n", "\n", "def split_stratified_into_train_val_test(\n", " df_input,\n", " stratify_colname=\"y\",\n", " frac_train=0.6,\n", " frac_val=0.15,\n", " frac_test=0.25,\n", " random_state=None,\n", "):\n", " if frac_train + frac_val + frac_test != 1.0:\n", " raise ValueError(\n", " \"fractions %f, %f, %f do not add up to 1.0\"\n", " % (frac_train, frac_val, frac_test)\n", " )\n", "\n", " if stratify_colname not in df_input.columns:\n", " raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n", "\n", " X = df_input \n", " y = df_input[\n", " [stratify_colname]\n", " ] \n", "\n", " df_train, df_temp, y_train, y_temp = train_test_split(\n", " X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n", " )\n", "\n", " relative_frac_test = frac_test / (frac_val + frac_test)\n", " df_val, df_test, y_val, y_test = train_test_split(\n", " df_temp,\n", " y_temp,\n", " stratify=y_temp,\n", " test_size=relative_frac_test,\n", " random_state=random_state,\n", " )\n", "\n", " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n", "\n", " return df_train, df_val, df_test\n", "\n", "bins = [df['Close'].min(), df['Close'].quantile(0.33), df['Close'].quantile(0.66), df['Close'].max()]\n", "labels = ['Low', 'Medium', 'High']\n", "df['Close_binned'] = pd.cut(df['Close'], bins=bins, labels=labels)\n", "df = df.dropna()\n", "# Now stratify using the binned values\n", "df_train, df_val, df_test = split_stratified_into_train_val_test(\n", " df, stratify_colname=\"Close_binned\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n", ")\n", "\n", "print(df_train.columns) \n", " \n", "print(\"Обучающая выборка: \", df_train.shape)\n", "print(df_train.Close.value_counts()) \n", "\n", "print(\"Контрольная выборка: \", df_val.shape)\n", "print(df_val.Close.value_counts())\n", "\n", "print(\"Тестовая выборка: \", df_test.shape)\n", "print(df_test.Close.value_counts())\n", "\n", "rus = RandomUnderSampler(random_state=42)\n", "X_resampled, y_resampled = rus.fit_resample(df_train, df_train[\"Close_binned\"])\n", "df_train_rus = pd.DataFrame(X_resampled)\n", "print(\"Обучающая выборка после undersampling: \", df_train_rus.shape)\n", "print(df_train_rus.Close.value_counts())\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.5" } }, "nbformat": 4, "nbformat_minor": 2 }