1257 lines
276 KiB
Plaintext
Raw Normal View History

2025-02-13 23:39:55 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Загрузка данных"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',\n",
" 'SP_open', 'SP_high', 'SP_low', 'SP_close', 'SP_Ajclose', 'SP_volume',\n",
" 'DJ_open', 'DJ_high', 'DJ_low', 'DJ_close', 'DJ_Ajclose', 'DJ_volume',\n",
" 'EG_open', 'EG_high', 'EG_low', 'EG_close', 'EG_Ajclose', 'EG_volume',\n",
" 'EU_Price', 'EU_open', 'EU_high', 'EU_low', 'EU_Trend', 'OF_Price',\n",
" 'OF_Open', 'OF_High', 'OF_Low', 'OF_Volume', 'OF_Trend', 'OS_Price',\n",
" 'OS_Open', 'OS_High', 'OS_Low', 'OS_Trend', 'SF_Price', 'SF_Open',\n",
" 'SF_High', 'SF_Low', 'SF_Volume', 'SF_Trend', 'USB_Price', 'USB_Open',\n",
" 'USB_High', 'USB_Low', 'USB_Trend', 'PLT_Price', 'PLT_Open', 'PLT_High',\n",
" 'PLT_Low', 'PLT_Trend', 'PLD_Price', 'PLD_Open', 'PLD_High', 'PLD_Low',\n",
" 'PLD_Trend', 'RHO_PRICE', 'USDI_Price', 'USDI_Open', 'USDI_High',\n",
" 'USDI_Low', 'USDI_Volume', 'USDI_Trend', 'GDX_Open', 'GDX_High',\n",
" 'GDX_Low', 'GDX_Close', 'GDX_Adj Close', 'GDX_Volume', 'USO_Open',\n",
" 'USO_High', 'USO_Low', 'USO_Close', 'USO_Adj Close', 'USO_Volume'],\n",
" dtype='object')\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Date</th>\n",
" <th>Open</th>\n",
" <th>High</th>\n",
" <th>Low</th>\n",
" <th>Close</th>\n",
" <th>Adj Close</th>\n",
" <th>Volume</th>\n",
" <th>SP_open</th>\n",
" <th>SP_high</th>\n",
" <th>SP_low</th>\n",
" <th>...</th>\n",
" <th>GDX_Low</th>\n",
" <th>GDX_Close</th>\n",
" <th>GDX_Adj Close</th>\n",
" <th>GDX_Volume</th>\n",
" <th>USO_Open</th>\n",
" <th>USO_High</th>\n",
" <th>USO_Low</th>\n",
" <th>USO_Close</th>\n",
" <th>USO_Adj Close</th>\n",
" <th>USO_Volume</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2011-12-15</td>\n",
" <td>154.740005</td>\n",
" <td>154.949997</td>\n",
" <td>151.710007</td>\n",
" <td>152.330002</td>\n",
" <td>152.330002</td>\n",
" <td>21521900</td>\n",
" <td>123.029999</td>\n",
" <td>123.199997</td>\n",
" <td>121.989998</td>\n",
" <td>...</td>\n",
" <td>51.570000</td>\n",
" <td>51.680000</td>\n",
" <td>48.973877</td>\n",
" <td>20605600</td>\n",
" <td>36.900002</td>\n",
" <td>36.939999</td>\n",
" <td>36.049999</td>\n",
" <td>36.130001</td>\n",
" <td>36.130001</td>\n",
" <td>12616700</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2011-12-16</td>\n",
" <td>154.309998</td>\n",
" <td>155.369995</td>\n",
" <td>153.899994</td>\n",
" <td>155.229996</td>\n",
" <td>155.229996</td>\n",
" <td>18124300</td>\n",
" <td>122.230003</td>\n",
" <td>122.949997</td>\n",
" <td>121.300003</td>\n",
" <td>...</td>\n",
" <td>52.040001</td>\n",
" <td>52.680000</td>\n",
" <td>49.921513</td>\n",
" <td>16285400</td>\n",
" <td>36.180000</td>\n",
" <td>36.500000</td>\n",
" <td>35.730000</td>\n",
" <td>36.270000</td>\n",
" <td>36.270000</td>\n",
" <td>12578800</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2011-12-19</td>\n",
" <td>155.479996</td>\n",
" <td>155.860001</td>\n",
" <td>154.360001</td>\n",
" <td>154.869995</td>\n",
" <td>154.869995</td>\n",
" <td>12547200</td>\n",
" <td>122.059998</td>\n",
" <td>122.320000</td>\n",
" <td>120.029999</td>\n",
" <td>...</td>\n",
" <td>51.029999</td>\n",
" <td>51.169998</td>\n",
" <td>48.490578</td>\n",
" <td>15120200</td>\n",
" <td>36.389999</td>\n",
" <td>36.450001</td>\n",
" <td>35.930000</td>\n",
" <td>36.200001</td>\n",
" <td>36.200001</td>\n",
" <td>7418200</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2011-12-20</td>\n",
" <td>156.820007</td>\n",
" <td>157.429993</td>\n",
" <td>156.580002</td>\n",
" <td>156.979996</td>\n",
" <td>156.979996</td>\n",
" <td>9136300</td>\n",
" <td>122.180000</td>\n",
" <td>124.139999</td>\n",
" <td>120.370003</td>\n",
" <td>...</td>\n",
" <td>52.369999</td>\n",
" <td>52.990002</td>\n",
" <td>50.215282</td>\n",
" <td>11644900</td>\n",
" <td>37.299999</td>\n",
" <td>37.610001</td>\n",
" <td>37.220001</td>\n",
" <td>37.560001</td>\n",
" <td>37.560001</td>\n",
" <td>10041600</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2011-12-21</td>\n",
" <td>156.979996</td>\n",
" <td>157.529999</td>\n",
" <td>156.130005</td>\n",
" <td>157.160004</td>\n",
" <td>157.160004</td>\n",
" <td>11996100</td>\n",
" <td>123.930000</td>\n",
" <td>124.360001</td>\n",
" <td>122.750000</td>\n",
" <td>...</td>\n",
" <td>52.419998</td>\n",
" <td>52.959999</td>\n",
" <td>50.186852</td>\n",
" <td>8724300</td>\n",
" <td>37.669998</td>\n",
" <td>38.240002</td>\n",
" <td>37.520000</td>\n",
" <td>38.110001</td>\n",
" <td>38.110001</td>\n",
" <td>10728000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1713</th>\n",
" <td>2018-12-24</td>\n",
" <td>119.570000</td>\n",
" <td>120.139999</td>\n",
" <td>119.570000</td>\n",
" <td>120.019997</td>\n",
" <td>120.019997</td>\n",
" <td>9736400</td>\n",
" <td>239.039993</td>\n",
" <td>240.839996</td>\n",
" <td>234.270004</td>\n",
" <td>...</td>\n",
" <td>20.650000</td>\n",
" <td>21.090000</td>\n",
" <td>21.090000</td>\n",
" <td>60507000</td>\n",
" <td>9.490000</td>\n",
" <td>9.520000</td>\n",
" <td>9.280000</td>\n",
" <td>9.290000</td>\n",
" <td>9.290000</td>\n",
" <td>21598200</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1714</th>\n",
" <td>2018-12-26</td>\n",
" <td>120.620003</td>\n",
" <td>121.000000</td>\n",
" <td>119.570000</td>\n",
" <td>119.660004</td>\n",
" <td>119.660004</td>\n",
" <td>14293500</td>\n",
" <td>235.970001</td>\n",
" <td>246.179993</td>\n",
" <td>233.759995</td>\n",
" <td>...</td>\n",
" <td>20.530001</td>\n",
" <td>20.620001</td>\n",
" <td>20.620001</td>\n",
" <td>76365200</td>\n",
" <td>9.250000</td>\n",
" <td>9.920000</td>\n",
" <td>9.230000</td>\n",
" <td>9.900000</td>\n",
" <td>9.900000</td>\n",
" <td>40978800</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1715</th>\n",
" <td>2018-12-27</td>\n",
" <td>120.570000</td>\n",
" <td>120.900002</td>\n",
" <td>120.139999</td>\n",
" <td>120.570000</td>\n",
" <td>120.570000</td>\n",
" <td>11874400</td>\n",
" <td>242.570007</td>\n",
" <td>248.289993</td>\n",
" <td>238.960007</td>\n",
" <td>...</td>\n",
" <td>20.700001</td>\n",
" <td>20.969999</td>\n",
" <td>20.969999</td>\n",
" <td>52393000</td>\n",
" <td>9.590000</td>\n",
" <td>9.650000</td>\n",
" <td>9.370000</td>\n",
" <td>9.620000</td>\n",
" <td>9.620000</td>\n",
" <td>36578700</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1716</th>\n",
" <td>2018-12-28</td>\n",
" <td>120.800003</td>\n",
" <td>121.080002</td>\n",
" <td>120.720001</td>\n",
" <td>121.059998</td>\n",
" <td>121.059998</td>\n",
" <td>6864700</td>\n",
" <td>249.580002</td>\n",
" <td>251.399994</td>\n",
" <td>246.449997</td>\n",
" <td>...</td>\n",
" <td>20.570000</td>\n",
" <td>20.600000</td>\n",
" <td>20.600000</td>\n",
" <td>49835000</td>\n",
" <td>9.540000</td>\n",
" <td>9.650000</td>\n",
" <td>9.380000</td>\n",
" <td>9.530000</td>\n",
" <td>9.530000</td>\n",
" <td>22803400</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1717</th>\n",
" <td>2018-12-31</td>\n",
" <td>120.980003</td>\n",
" <td>121.260002</td>\n",
" <td>120.830002</td>\n",
" <td>121.250000</td>\n",
" <td>121.250000</td>\n",
" <td>8449400</td>\n",
" <td>249.559998</td>\n",
" <td>250.190002</td>\n",
" <td>247.470001</td>\n",
" <td>...</td>\n",
" <td>20.559999</td>\n",
" <td>21.090000</td>\n",
" <td>21.090000</td>\n",
" <td>53866600</td>\n",
" <td>9.630000</td>\n",
" <td>9.710000</td>\n",
" <td>9.440000</td>\n",
" <td>9.660000</td>\n",
" <td>9.660000</td>\n",
" <td>28417400</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1718 rows × 81 columns</p>\n",
"</div>"
],
"text/plain": [
" Date Open High Low Close Adj Close \\\n",
"0 2011-12-15 154.740005 154.949997 151.710007 152.330002 152.330002 \n",
"1 2011-12-16 154.309998 155.369995 153.899994 155.229996 155.229996 \n",
"2 2011-12-19 155.479996 155.860001 154.360001 154.869995 154.869995 \n",
"3 2011-12-20 156.820007 157.429993 156.580002 156.979996 156.979996 \n",
"4 2011-12-21 156.979996 157.529999 156.130005 157.160004 157.160004 \n",
"... ... ... ... ... ... ... \n",
"1713 2018-12-24 119.570000 120.139999 119.570000 120.019997 120.019997 \n",
"1714 2018-12-26 120.620003 121.000000 119.570000 119.660004 119.660004 \n",
"1715 2018-12-27 120.570000 120.900002 120.139999 120.570000 120.570000 \n",
"1716 2018-12-28 120.800003 121.080002 120.720001 121.059998 121.059998 \n",
"1717 2018-12-31 120.980003 121.260002 120.830002 121.250000 121.250000 \n",
"\n",
" Volume SP_open SP_high SP_low ... GDX_Low GDX_Close \\\n",
"0 21521900 123.029999 123.199997 121.989998 ... 51.570000 51.680000 \n",
"1 18124300 122.230003 122.949997 121.300003 ... 52.040001 52.680000 \n",
"2 12547200 122.059998 122.320000 120.029999 ... 51.029999 51.169998 \n",
"3 9136300 122.180000 124.139999 120.370003 ... 52.369999 52.990002 \n",
"4 11996100 123.930000 124.360001 122.750000 ... 52.419998 52.959999 \n",
"... ... ... ... ... ... ... ... \n",
"1713 9736400 239.039993 240.839996 234.270004 ... 20.650000 21.090000 \n",
"1714 14293500 235.970001 246.179993 233.759995 ... 20.530001 20.620001 \n",
"1715 11874400 242.570007 248.289993 238.960007 ... 20.700001 20.969999 \n",
"1716 6864700 249.580002 251.399994 246.449997 ... 20.570000 20.600000 \n",
"1717 8449400 249.559998 250.190002 247.470001 ... 20.559999 21.090000 \n",
"\n",
" GDX_Adj Close GDX_Volume USO_Open USO_High USO_Low USO_Close \\\n",
"0 48.973877 20605600 36.900002 36.939999 36.049999 36.130001 \n",
"1 49.921513 16285400 36.180000 36.500000 35.730000 36.270000 \n",
"2 48.490578 15120200 36.389999 36.450001 35.930000 36.200001 \n",
"3 50.215282 11644900 37.299999 37.610001 37.220001 37.560001 \n",
"4 50.186852 8724300 37.669998 38.240002 37.520000 38.110001 \n",
"... ... ... ... ... ... ... \n",
"1713 21.090000 60507000 9.490000 9.520000 9.280000 9.290000 \n",
"1714 20.620001 76365200 9.250000 9.920000 9.230000 9.900000 \n",
"1715 20.969999 52393000 9.590000 9.650000 9.370000 9.620000 \n",
"1716 20.600000 49835000 9.540000 9.650000 9.380000 9.530000 \n",
"1717 21.090000 53866600 9.630000 9.710000 9.440000 9.660000 \n",
"\n",
" USO_Adj Close USO_Volume \n",
"0 36.130001 12616700 \n",
"1 36.270000 12578800 \n",
"2 36.200001 7418200 \n",
"3 37.560001 10041600 \n",
"4 38.110001 10728000 \n",
"... ... ... \n",
"1713 9.290000 21598200 \n",
"1714 9.900000 40978800 \n",
"1715 9.620000 36578700 \n",
"1716 9.530000 22803400 \n",
"1717 9.660000 28417400 \n",
"\n",
"[1718 rows x 81 columns]"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"# Load the gold-price dataset from the CSV file (relative path)\n",
"df = pd.read_csv(filepath_or_buffer=\"../static/csv/FINAL_USO.csv\")\n",
"# Print the column index, then show the frame itself as the cell's result\n",
"print(df.columns)\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## **1-я бизнес-цель (регрессия)**: \n",
"\n",
"Предсказание цены на золото с целью принятия инвесторами решения о покупке товаров."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Целевой признак: цена закрытия Adj Close.\n",
"\n",
"Вход: Volume, High, Low, Close, Open.\\\n",
"Достижимый уровень качества: предсказания должны иметь погрешность в среднем не более 5$. Для проверки будет использоваться метрика MAE (средняя абсолютная ошибка)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.discriminant_analysis import StandardScaler\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
"from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score\n",
"from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
"import seaborn as sns\n",
"from sklearn.model_selection import cross_val_predict\n",
"from sklearn.metrics import mean_squared_error\n",
"import numpy as np\n",
"from sklearn import metrics\n",
"import sklearn.preprocessing as preproc\n",
"from sklearn.linear_model import LinearRegression, Ridge\n",
"from sklearn.metrics import mean_absolute_error\n",
"from mlxtend.evaluate import bias_variance_decomp\n",
"from sklearn.neural_network import MLPRegressor\n",
"\n",
"# Load the data and keep the feature columns plus the regression target\n",
"df = pd.read_csv(\"..//static//csv//FINAL_USO.csv\")\n",
"# Selecting several columns needs a list (double brackets); a bare comma-separated\n",
"# key is passed to pandas as one tuple label and raises KeyError on this frame\n",
"data = df[['Volume', 'High', 'Open', 'Close', 'Low', 'Adj Close']]\n",
"\n",
"# NOTE(review): in the rows displayed by the first cell, Close equals Adj Close,\n",
"# so keeping Close as a feature probably leaks the target - confirm before trusting the metrics\n",
"X = data.drop('Adj Close', axis=1)\n",
"y = data['Adj Close']\n",
"\n",
"# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"# Numeric preprocessing: fill missing values with the column median\n",
"num_imputer = SimpleImputer(strategy=\"median\")\n",
"\n",
"preprocessing_num = Pipeline(\n",
"    [\n",
"        (\"imputer\", num_imputer)\n",
"    ]\n",
")\n",
"\n",
"# There are no categorical columns among the selected features, so no encoding step is needed\n",
"\n",
"\n",
"# Overall preprocessing: apply the numeric pipeline to every feature column\n",
"preprocessing = ColumnTransformer(\n",
"    [\n",
"        (\"nums\", preprocessing_num, X.columns)\n",
"    ]\n",
")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Линейная регрессия:"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Лучшие гиперпараметры: {'preprocessing': MinMaxScaler()}\n",
"Cредняя абсолютная ошибка (MAE) = 1.8424538380756087e-14\n",
"Смещение: -5.1553225998619436e-11\n",
"Дисперсия: 3.270386026049708e-11\n",
"R^2 = 1.0\n"
]
}
],
"source": [
"from sklearn.metrics import r2_score\n",
"\n",
"# Scaling is tuned in its own 'scaler' step so that the grid search never replaces\n",
"# the imputing ColumnTransformer held in the 'preprocessing' step\n",
"pipeline_lin_reg = Pipeline([\n",
"    ('preprocessing', preprocessing),\n",
"    ('scaler', 'passthrough'),\n",
"    ('model', LinearRegression())]\n",
")\n",
"\n",
"# Hyperparameter grid to search over\n",
"param_grid = {\n",
"    # how the features are scaled ('passthrough' = no scaling)\n",
"    'scaler': [StandardScaler(), preproc.MinMaxScaler(), preproc.MaxAbsScaler(), 'passthrough']\n",
"}\n",
"\n",
"# Grid search maximises the negative RMSE (negated so that a higher score is better)\n",
"grid_search = GridSearchCV(pipeline_lin_reg, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)\n",
"\n",
"# Fit every candidate with 5-fold cross-validation on the training split\n",
"grid_search.fit(X_train, y_train)\n",
"\n",
"print(\"Лучшие гиперпараметры: \", grid_search.best_params_)\n",
"\n",
"# Best linear-regression pipeline found by the search\n",
"best_model = grid_search.best_estimator_\n",
"\n",
"# Evaluate on the held-out test split\n",
"y_pred = best_model.predict(X_test)\n",
"\n",
"print(f'Cредняя абсолютная ошибка (MAE) = {mean_absolute_error(y_test, y_pred)}')\n",
"\n",
"\n",
"# NOTE(review): the two values printed below are the mean and the fold-to-fold std of the\n",
"# cross-validated negative RMSE, averaged over all grid candidates - they are a rough\n",
"# proxy and not a true bias/variance decomposition\n",
"cv_results = grid_search.cv_results_\n",
"mean_test_score = cv_results['mean_test_score']\n",
"std_test_score = cv_results['std_test_score']\n",
"\n",
"print(f\"Смещение: {mean_test_score.mean()}\")\n",
"print(f\"Дисперсия: {std_test_score.mean()}\")\n",
"\n",
"print(f'R^2 = {r2_score(y_test, y_pred)}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Гребневая регрессия"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Лучшие гиперпараметры: {'model__alpha': 0, 'preprocessing': StandardScaler()}\n",
"Cредняя абсолютная ошибка (MAE) = 5.494726121130867e-13\n",
"Смещение: -0.4263701358095246\n",
"Дисперсия: 0.02072744817291101\n",
"R^2 = 1.0\n"
]
}
],
"source": [
"# Scaling is tuned in its own 'scaler' step so that the grid search never replaces\n",
"# the imputing ColumnTransformer held in the 'preprocessing' step\n",
"pipeline_ridge = Pipeline([\n",
"    ('preprocessing', preprocessing),\n",
"    ('scaler', 'passthrough'),\n",
"    ('model', Ridge())]\n",
")\n",
"\n",
"# Hyperparameter grid to search over\n",
"param_grid = {\n",
"    # how the features are scaled ('passthrough' = no scaling)\n",
"    'scaler': [StandardScaler(), preproc.MinMaxScaler(), preproc.MaxAbsScaler(), 'passthrough'],\n",
"    # regularisation strength; 0 is left out because scikit-learn advises against Ridge(alpha=0)\n",
"    # for numerical reasons - that case is ordinary least squares (LinearRegression)\n",
"    'model__alpha': [0.01, 0.1, 0.5, 1.0, 1.5, 2.0, 5.0, 10.0]\n",
"}\n",
"\n",
"# Grid search maximises the negative RMSE (negated so that a higher score is better)\n",
"grid_search = GridSearchCV(pipeline_ridge, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1, verbose=0)\n",
"\n",
"# Fit every candidate with 5-fold cross-validation on the training split\n",
"grid_search.fit(X_train, y_train)\n",
"\n",
"print(\"Лучшие гиперпараметры: \", grid_search.best_params_)\n",
"\n",
"# Best ridge-regression pipeline found by the search\n",
"best_model = grid_search.best_estimator_\n",
"\n",
"# Evaluate on the held-out test split\n",
"y_pred = best_model.predict(X_test)\n",
"\n",
"print(f'Cредняя абсолютная ошибка (MAE) = {mean_absolute_error(y_test, y_pred)}')\n",
"\n",
"\n",
"# NOTE(review): the two values printed below are the mean and the fold-to-fold std of the\n",
"# cross-validated negative RMSE, averaged over all grid candidates - they are a rough\n",
"# proxy and not a true bias/variance decomposition\n",
"cv_results = grid_search.cv_results_\n",
"mean_test_score = cv_results['mean_test_score']\n",
"std_test_score = cv_results['std_test_score']\n",
"\n",
"print(f\"Смещение: {mean_test_score.mean()}\")\n",
"print(f\"Дисперсия: {std_test_score.mean()}\")\n",
"\n",
"# r2_score is imported in the linear-regression cell, which must run before this one\n",
"print(f'R^2 = {r2_score(y_test, y_pred)}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Гребневая регрессия не дала более точных результатов, чем линейная: MAE обеих моделей практически нулевая (≈ 5.5e-13 против ≈ 1.8e-14 у линейной регрессии)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Метод градиентного бустинга (набор деревьев решений)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Лучшие гиперпараметры: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 300, 'preprocessing': None}\n",
"Cредняя абсолютная ошибка (MAE) = 0.040833243038698064\n",
"Смещение: -0.2177327926836486\n",
"Дисперсия: 0.021373424060567556\n",
"R^2 = 0.9999842165416633\n"
]
}
],
"source": [
"from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score\n",
"from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier\n",
"# Pipeline. Scaling is tuned in its own 'scaler' step so that the grid search never\n",
"# replaces the imputing ColumnTransformer held in the 'preprocessing' step.\n",
"# random_state is fixed so that repeated runs give the same model and metrics.\n",
"pipeline_grad = Pipeline([\n",
"    ('preprocessing', preprocessing),\n",
"    ('scaler', 'passthrough'),\n",
"    ('model', GradientBoostingRegressor(random_state=42))\n",
"])\n",
"\n",
"# Hyperparameter grid to search over\n",
"param_grid = {\n",
"    # how the features are scaled ('passthrough' = no scaling)\n",
"    'scaler': [StandardScaler(), preproc.MinMaxScaler(), preproc.MaxAbsScaler(), 'passthrough'],\n",
"    # number of boosting stages\n",
"    'model__n_estimators': [100, 200, 300],\n",
"    # learning rate\n",
"    'model__learning_rate': [0.1, 0.2],\n",
"    # maximum depth of each tree\n",
"    'model__max_depth': [3, 5, 7]\n",
"}\n",
"\n",
"# Grid search maximises the negative RMSE; only 2 folds are used for this larger grid\n",
"grid_search = GridSearchCV(pipeline_grad, param_grid, cv=2, scoring='neg_root_mean_squared_error', n_jobs=-1)\n",
"\n",
"# Fit every candidate with cross-validation on the training split\n",
"grid_search.fit(X_train, y_train)\n",
"\n",
"print(\"Лучшие гиперпараметры: \", grid_search.best_params_)\n",
"\n",
"# Best gradient-boosting pipeline found by the search\n",
"best_model = grid_search.best_estimator_\n",
"\n",
"\n",
"# Evaluate on the held-out test split\n",
"y_pred = best_model.predict(X_test)\n",
"\n",
"\n",
"print(f'Cредняя абсолютная ошибка (MAE) = {mean_absolute_error(y_test, y_pred)}')\n",
"\n",
"\n",
"# Out-of-fold predictions of the best pipeline on the training split\n",
"# NOTE(review): y_cv_pred is not used anywhere in this cell - drop the call if no later cell needs it\n",
"y_cv_pred = cross_val_predict(best_model, X_train, y_train, cv=3)\n",
"\n",
"# NOTE(review): the two values printed below are the mean and the fold-to-fold std of the\n",
"# cross-validated negative RMSE, averaged over all grid candidates - they are a rough\n",
"# proxy and not a true bias/variance decomposition\n",
"cv_results = grid_search.cv_results_\n",
"mean_test_score = cv_results['mean_test_score']\n",
"std_test_score = cv_results['std_test_score']\n",
"\n",
"print(f\"Смещение: {mean_test_score.mean()}\")\n",
"print(f\"Дисперсия: {std_test_score.mean()}\")\n",
"\n",
"# r2_score is imported in the linear-regression cell, which must run before this one\n",
"print(f'R^2 = {r2_score(y_test, y_pred)}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Вывод**: \n",
"\n",
"Все 3 модели регрессии показали допустимый уровень \"погрешности\". \n",
"\n",
"R² (коэффициент детерминации): 0.99 — это очень высокий уровень, указывающий на то, что модель объясняет 99% вариации целевой переменной. Это свидетельствует о высокой предсказательной способности модели.\n",
"\n",
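"Из всех моделей самую низкую \"погрешность\" показала линейная регрессия (MAE ≈ 1.8e-14); у градиентного бустинга она самая высокая из трёх (MAE ≈ 0.04), хотя и остаётся в допустимых пределах."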
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## **2-я бизнес-цель (классификация):** \n",
"\n",
"Определить оптимальные коэффициенты для различных факторов, влияющих на цену золота. \n",
"\n",
"Целевой признак: Adj Close.\n",
"\n",
"Вход: Volume, High, Low, Close, Open. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Результаты для задачи классификации:\n",
"Model: Logistic Regression\n",
"Best Parameters: {'model__C': 10, 'model__solver': 'liblinear'}\n",
"Accuracy: 0.9825581395348837\n",
"Precision: 1.0\n",
"Recall: 0.9469026548672567\n",
"F1-score: 0.9727272727272728\n",
"\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAhQAAAHHCAYAAADnOMH5AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABOwklEQVR4nO3deVxU1fsH8M8AMiAwICogCojgAokbpiEumAjinppftwLFNZfUXCt3kzJL00xNU9Q0LTUrS41UXMmve2lIgrikgAoBArLO+f3hj/k2Asp4LwzTfN697iu598y5zx1HeHjOOfcqhBACRERERBKY6DsAIiIiMnxMKIiIiEgyJhREREQkGRMKIiIikowJBREREUnGhIKIiIgkY0JBREREkjGhICIiIsmYUBAREZFkTCiM1LVr1xAUFARbW1soFArs3btX1v5v3LgBhUKByMhIWfs1ZAEBAQgICJCtv6ysLIwcORJOTk5QKBSYPHmybH1XFdHR0VAoFIiOjpalv8jISCgUCty4cUOW/giYP38+FAqFvsOgKoAJhR4lJCRgzJgxaNCgASwsLKBSqeDv749PPvkEjx49qtBzh4aG4vfff8d7772HrVu3onXr1hV6vsoUFhYGhUIBlUpV6vt47do1KBQKKBQKLFu2TOf+7969i/nz5+PixYsyRPv8lixZgsjISIwbNw5bt27Fa6+9VqHnq1+/Pnr27Fmh55DLkiVLZE+Sn1ScnBRvZmZmqFu3LsLCwnDnzp0KPTdRlSRIL/bt2ycsLS2FnZ2dmDRpkvj888/Fp59+KgYNGiSqVasmRo0aVWHnzsnJEQDEO++8U2HnUKvV4tGjR6KwsLDCzlGW0NBQYWZmJkxNTcXOnTtLHJ83b56wsLAQAMSHH36oc/9nzpwRAMSmTZt0el1eXp7Iy8vT+Xxladu2rfD395etv2dxc3MTPXr0qLTzCSFEUVGRePTokSgqKtLpdVZWViI0NLTE/sLCQvHo0SOhVqslx7Zp0yYBQCxcuFBs3bpVrF+/XoSHhwtTU1Ph4eEhHj16JPkchqCgoMBorpWezky/6YxxSkxMxKBBg+Dm5obDhw+jTp06mmPjx49HfHw8fvzxxwo7//379wEAdnZ2FXYOhUIBCwuLCuv/WZRKJfz9/fHVV19h4MCBWse2b9+OHj16YPfu3ZUSS05ODqpXrw5zc3NZ+7137x68vb1l66+wsBBqtVr2OKUwMTGR9XNkamoKU1NT2foDgJCQEE2Fb+TIkahVqxY++OADfP/99yU+exVJCIHc3FxYWlpW2jkBwMzMDGZm/FFCHPLQi6VLlyIrKwtffPGFVjJRzNPTE2+++abm68LCQixatAgeHh5QKpWoX78+3n77beTl5Wm9rrgkfeLECbRp0wYWFhZo0KABtmzZomkzf/58uLm5AQCmT58OhUKB+vXrA3g8VFD8538qbYw0KioK7du3h52dHaytrdG4cWO8/fbbmuNlzaE4fPgwOnToACsrK9jZ2aFPnz6IjY0t9Xzx8fEICwuDnZ0dbG1tMXz4cOTk5JT9xj5hyJAh2L9/P9LT0zX7zpw5g2vXrmHIkCEl2qelpWHatGnw8fGBtbU1VCoVQkJCcOnSJU2b6OhovPjiiwCA4cOHa8rdxdcZEBCApk2b4ty5c+jYsSOqV6+ueV+enEMRGhoKCwuLEtcfHByMGjVq4O7du6VeV/G8gsTERPz444+aGIrnBdy7dw/h4eFwdHSEhYUFmjdvjs2bN2v1Ufz3s2zZMqxYsULz2frjjz/K9d6WpbyfVbVajfnz58PZ2RnVq1dH586d8ccff6B+/foICwsrca3/nENx7do19O/fH05OTrCwsEC9evUwaNAgZGRkAHiczGZnZ2Pz5s2a96a4z7LmUOzfvx+dOnWCjY0NVCoVXnzxRWzfvv253oMOHToAeDyk+U9Xr17FgAEDYG9vDwsLC7Ru3Rrff/99idf/9ttv6NSpEy
wtLVGvXj0sXrwYmzZtKhF38b/3gwcPonXr1rC0tMS6desAAOnp6Zg8eTJcXFygVCrh6emJDz74AGq1WutcO3bsgK+vr+a6fXx88Mknn2iOFxQUYMGCBWjYsCEsLCxQs2ZNtG/fHlFRUZo2pX1/kPN7FhkOppV68MMPP6BBgwZo165dudqPHDkSmzdvxoABA/DWW2/h9OnTiIiIQGxsLL799luttvHx8RgwYADCw8MRGhqKjRs3IiwsDL6+vnjhhRfQr18/2NnZYcqUKRg8eDC6d+8Oa2trneK/cuUKevbsiWbNmmHhwoVQKpWIj4/HyZMnn/q6X375BSEhIWjQoAHmz5+PR48eYdWqVfD398f58+dLJDMDBw6Eu7s7IiIicP78eWzYsAEODg744IMPyhVnv379MHbsWOzZswcjRowA8Lg60aRJE7Rq1apE++vXr2Pv3r149dVX4e7ujpSUFKxbtw6dOnXCH3/8AWdnZ3h5eWHhwoWYO3cuRo8erfnh8c+/y9TUVISEhGDQoEEYNmwYHB0dS43vk08+weHDhxEaGoqYmBiYmppi3bp1+Pnnn7F161Y4OzuX+jovLy9s3boVU6ZMQb169fDWW28BAGrXro1Hjx4hICAA8fHxmDBhAtzd3fHNN98gLCwM6enpWokqAGzatAm5ubkYPXo0lEol7O3ty/XelqW8n9XZs2dj6dKl6NWrF4KDg3Hp0iUEBwcjNzf3qf3n5+cjODgYeXl5mDhxIpycnHDnzh3s27cP6enpsLW1xdatWzFy5Ei0adMGo0ePBgB4eHiU2WdkZCRGjBiBF154AbNnz4adnR0uXLiAAwcOlJp4PkvxD/0aNWpo9l25cgX+/v6oW7cuZs2aBSsrK3z99dfo27cvdu/ejVdeeQUAcOfOHXTu3BkKhQKzZ8+GlZUVNmzYAKVSWeq54uLiMHjwYIwZMwajRo1C48aNkZOTg06dOuHOnTsYM2YMXF1dcerUKcyePRtJSUlYsWIFgMe/FAwePBhdunTR/JuKjY3FyZMnNZ+T+fPnIyIiQvN+ZmZm4uzZszh//jy6du1a5nsg5/csMiD6HnMxNhkZGQKA6NOnT7naX7x4UQAQI0eO1No/bdo0AUAcPnxYs8/NzU0AEMeOHdPsu3fvnlAqleKtt97S7EtMTCx1/kBoaKhwc3MrEcO8efPEPz8qy5cvFwDE/fv3y4y7+Bz/nGfQokUL4eDgIFJTUzX7Ll26JExMTMTrr79e4nwjRozQ6vOVV14RNWvWLPOc/7wOKysrIYQQAwYMEF26dBFCPB6Pd3JyEgsWLCj1PcjNzS0xVp+YmCiUSqVYuHChZt/T5lB06tRJABBr164t9VinTp209h08eFAAEIsXLxbXr18X1tbWom/fvs+8RiFKn9OwYsUKAUB8+eWXmn35+fnCz89PWFtbi8zMTM11ARAqlUrcu3fvuc/3T+X9rCYnJwszM7MS1zl//nwBQGvuw5EjRwQAceTIESGEEBcuXBAAxDfffPPUWMuaQ1E87yExMVEIIUR6erqwsbERbdu2LTEP4FnzLIr7+uWXX8T9+/fF7du3xa5du0Tt2rWFUqkUt2/f1rTt0qWL8PHxEbm5uVr9t2vXTjRs2FCzb+LEiUKhUIgLFy5o9qWmpgp7e3utuIX437/3AwcOaMW1aNEiYWVlJf7880+t/bNmzRKmpqbi1q1bQggh3nzzTaFSqZ46z6l58+bPnDfz5PeHivieRYaBQx6VLDMzEwBgY2NTrvY//fQTAGDq1Kla+4t/K31yroW3t7fmt2bg8W+tjRs3xvXr15875icVz7347rvvSpRQy5KUlISLFy8iLCxM67fgZs2aoWvXrprr/KexY8dqfd2hQwekpqZq3sPyGDJkCKKjo5GcnIzDhw8jOTm5zN86lUolTEwe/5MoKipCamqqZjjn/Pnz5T6nUqnE8OHDy9U2KCgIY8aMwcKFC9GvXz9YWFhoytbP46effoKTkxMGDx
6s2VetWjVMmjQJWVlZOHr0qFb7/v37o3bt2s99vifPDTz7s3ro0CEUFhbijTfe0Go3ceLEZ57D1tYWAHDw4EGdhr/
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: Random Forest Classification\n",
"Best Parameters: {'model__max_depth': None, 'model__n_estimators': 100}\n",
"Accuracy: 1.0\n",
"Precision: 1.0\n",
"Recall: 1.0\n",
"F1-score: 1.0\n",
"\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAhQAAAHHCAYAAADnOMH5AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABSbElEQVR4nO3deVhUZfsH8O+AMqwDIrviiDvkrrmEayK45BKauYOipoH7/lauFeWSplmalpppVq6vS6apuJK5V4YkuG+oICAo6zy/P3w5P0dAZzyHZZzvx+tcl5zlOfcZDjP33M9zzlEJIQSIiIiIZLAo6QCIiIjI9DGhICIiItmYUBAREZFsTCiIiIhINiYUREREJBsTCiIiIpKNCQURERHJxoSCiIiIZGNCQURERLIxoSgiFy5cQGBgIBwdHaFSqbBlyxZF2798+TJUKhVWrVqlaLumrE2bNmjTpo1i7aWlpWHIkCHw8PCASqXCmDFjFGvbVPA8K91Kw++ncuXKCA0N1ZtX0PvfqlWroFKpcPny5WKPUaVSYcaMGcW+X3PzUicU8fHxeOedd1ClShVYW1tDo9HA398fn3/+OR49elSk+w4JCcFff/2Fjz76CGvWrEHjxo2LdH/FKTQ0FCqVChqNpsDX8cKFC1CpVFCpVJg3b57R7d+8eRMzZszAmTNnFIj2xX388cdYtWoVRowYgTVr1mDAgAFFur/KlStLr5tKpYKdnR2aNGmC7777rkj3a2qefp2enDIyMko6vHyOHj2KGTNmIDk52ajtoqKiEBwcDA8PD1hZWcHNzQ1dunTBpk2biiZQBZXE+9/OnTuZNJSwMiUdQFHZsWMH3nrrLajVagwcOBC1a9dGVlYWDh8+jIkTJ+LcuXP4+uuvi2Tfjx49QnR0NN577z1EREQUyT60Wi0ePXqEsmXLFkn7z1OmTBk8fPgQ27ZtQ69evfSWrV27FtbW1i/85n7z5k3MnDkTlStXRv369Q3ebvfu3S+0v8Ls27cPzZo1w/Tp0xVt91nq16+P8ePHAwBu3bqFFStWICQkBJmZmRg6dGixxVHaPfk6PcnKyqoEonm2o0ePYubMmQgNDYWTk5NB20yfPh2zZs1C9erV8c4770Cr1SIxMRE7d+5Ejx49sHbtWvTt27doAzdQbGwsLCz+/7tpYe9/AwYMQO/evaFWq4skjp07d2LJkiUFJhWPHj1CmTIv7cddqfFSvsKXLl1C7969odVqsW/fPnh6ekrLwsPDERcXhx07dhTZ/u/evQsABr95vAiVSgVra+sia/951Go1/P398cMPP+RLKNatW4fOnTtj48aNxRLLw4cPYWtrq/iHyZ07d+Dn56dYezk5OdDpdM+Ms0KFCujfv7/0c2hoKKpUqYIFCxYwoXjC06+TUnQ6HbKyskr0b2vDhg2YNWsWevbsiXXr1ul9aZg4cSJ+/fVXZGdnl1h8T3s6QSjs/c/S0hKWlpbFFZaekvx9mhXxEho+fLgAII4cOWLQ+tnZ2WLWrFmiSpUqwsrKSmi1WjF16lSRkZGht55WqxWdO3cWhw4dEq+++qpQq9XCx8dHrF69Wlpn+vTpAoDepNVqhRBChISESP9/Ut42T9q9e7fw9/cXjo6Ows7OTtSoUUNMnTpVWn7p0iUBQKxcuVJvu71794oWLVoIW1tb4ejoKLp27Sr++eefAvd34cIFERISIhwdHYVGoxGhoaEiPT39ua9XSEiIsLOzE6tWrRJqtVrcv39fWvbHH38IAGLjxo0CgJg7d660LDExUYwfP17Url1b2NnZCQcHB9GhQwdx5swZaZ39+/fne/2ePM7WrVuLV155RZw4cUK0bNlS2NjYiNGjR0vLWrduLbU1cOBAoVar8x1/YGCgcHJyEjdu3Cjw+AqL4dKlS0IIIRISEsTgwYOFm5ubUKvVom7dumLVqlV6beT9fubOnSsWLFggqlSpIiwsLMTp06cLfV3zzq+nNW7cWFhZWe
nNO3jwoOjZs6fw9vYWVlZWomLFimLMmDHi4cOHeuvl/a6uX78uunXrJuzs7ISLi4sYP368yMnJ0Vv3/v37IiQkRGg0GuHo6CgGDhwoTp8+Lfs8i42NFf369RMajUa4uLiI999/X+h0OnH16lXRtWtX4eDgINzd3cW8efMKfW0MeZ2elJaWJsaNGycqVqworKysRI0aNcTcuXOFTqfTWw+ACA8PF99//73w8/MTZcqUEZs3bxZCCHH9+nUxaNAg4ebmJqysrISfn5/45ptv8u1r0aJFws/PT9jY2AgnJyfRqFEjsXbtWr3XoLBzqSC1atUSzs7OIjU19bmvRUHvA2fPnhUhISHCx8dHqNVq4e7uLgYNGiTu3bunt21qaqoYPXq00Gq1wsrKSri6uoqAgABx8uRJaZ1///1XBAcHC3d3d6FWq0WFChXE22+/LZKTk6V1tFqtCAkJKfR4897zVq5cWeCx79y5U7Rq1UrY29sLBwcH0bhxY+n1E8Kwcz0kJKTA1zkPADF9+nS9/Z46dUp06NBBODg4CDs7O/H666+L6OhovXXyYj58+LAYO3ascHFxEba2tqJ79+7izp07z/39mJuXskKxbds2VKlSBa+99ppB6w8ZMgSrV69Gz549MX78eBw7dgyRkZGIiYnB5s2b9daNi4tDz549ERYWhpCQEHz77bcIDQ1Fo0aN8MorryA4OBhOTk4YO3Ys+vTpg06dOsHe3t6o+M+dO4c33ngDdevWxaxZs6BWqxEXF4cjR448c7vffvsNHTt2RJUqVTBjxgw8evQIixcvhr+/P06dOoXKlSvrrd+rVy/4+PggMjISp06dwooVK+Dm5oZPP/3UoDiDg4MxfPhwbNq0CYMHDwbwuDpRq1YtNGzYMN/6Fy9exJYtW/DWW2/Bx8cHCQkJWLZsGVq3bo1//vkHXl5e8PX1xaxZszBt2jQMGzYMLVu2BAC932ViYiI6duyI3r17o3///nB3dy8wvs8//xz79u1DSEgIoqOjYWlpiWXLlmH37t1Ys2YNvLy8CtzO19cXa9aswdixY1GxYkWptO7q6opHjx6hTZs2iIuLQ0REBHx8fPDzzz8jNDQUycnJGD16tF5bK1euREZGBoYNGwa1Wg1nZ2eDXts8OTk5uH79OsqVK6c3/+eff8bDhw8xYsQIlC9fHn/88QcWL16M69ev4+eff9ZbNzc3F0FBQWjatCnmzZuH3377DfPnz0fVqlUxYsQIAIAQAt26dcPhw4cxfPhw+Pr6YvPmzQgJCckXk7Hn2dtvvw1fX1988skn2LFjBz788EM4Oztj2bJleP311/Hpp59i7dq1mDBhAl599VW0atXqua9LdnY27t27pzfP1tYWtra2EEKga9eu2L9/P8LCwlC/fn38+uuvmDhxIm7cuIEFCxbobbdv3z789NNPiIiIgIuLCypXroyEhAQ0a9YMKpUKERERcHV1xS+//IKwsDCkpqZKA3SXL1+OUaNGoWfPnhg9ejQyMjLw559/4tixY+jbty+Cg4Px77//4ocffsCCBQvg4uIC4PG5VJALFy7g/PnzGDx4MBwcHJ77OhRkz549uHjxIgYNGgQPDw+pe/fcuXP4/fffoVKpAADDhw/Hhg0bEBERAT8/PyQmJuLw4cOIiYlBw4YNkZWVhaCgIGRmZmLkyJHw8PDAjRs3sH37diQnJ8PR0THfvo19/1u1ahUGDx6MV155BVOnToWTkxNOnz6NXbt2SV06hpzr77zzDm7evIk9e/ZgzZo1z32Nzp07h5YtW0Kj0WDSpEkoW7Ysli1bhjZt2uDAgQNo2rSp3vojR45EuXLlMH36dFy+fBkLFy5EREQEfvzxR4N/L2ahpDMapaWkpAgAolu3bgatf+bMGQFADBkyRG/+hAkTBACxb98+aZ5WqxUAxMGDB6V5d+7cEWq1WowfP16a9+S30ycZWqFYsGCBACDu3r1baNwFfTOpX7++cHNzE4mJidK8s2fPCgsLCz
Fw4MB8+xs8eLBem2+++aYoX758oft88jjs7OyEEEL07NlTtGvXTgghRG5urvDw8BAzZ84s8DXIyMgQubm5+Y5DrVa
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: Gradient Boosting Classification\n",
"Best Parameters: {'model__learning_rate': 0.01, 'model__max_depth': 3, 'model__n_estimators': 100}\n",
"Accuracy: 1.0\n",
"Precision: 1.0\n",
"Recall: 1.0\n",
"F1-score: 1.0\n",
"\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAhQAAAHHCAYAAADnOMH5AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABVB0lEQVR4nO3deXxMZ/s/8M9kIpN1EiErEUlsiX0pJdaKRCgiVCmaEJQGbeye1tpq+tCWUkWptZTW+tilCEWq1i4amkQstQSJJJLIOvfvD9/Mz0jCjHOyTPN5e53XS845c5/rzEwm11z3fZ+jEEIIEBEREUlgUt4BEBERkfFjQkFERESSMaEgIiIiyZhQEBERkWRMKIiIiEgyJhREREQkGRMKIiIikowJBREREUnGhIKIiIgkY0Khh7i4OPj7+8PW1hYKhQI7d+6Utf1r165BoVBg7dq1srZrzDp37ozOnTvL1l5GRgZGjBgBZ2dnKBQKvP/++7K1XdEV9/6aPXs2FApF+QX1L2Msz2dF+KypXbs2QkNDddYV9xm7du1aKBQKXLt2rcxjVCgUmD17dpkf19gZTUKRkJCAd955B56enjA3N4darYavry++/PJLPH78uFSPHRISgj/++APz5s3Dhg0b0KpVq1I9XlkKDQ2FQqGAWq0u9nmMi4uDQqGAQqHAZ599ZnD7t2/fxuzZs3Hx4kUZon15n3zyCdauXYsxY8Zgw4YNGDp0aKkfU6PRYP369ejWrRuqV6+OKlWqwNHREf7+/vjmm2+Qk5NT6jGUJ0Nf+8I/IE8vjo6O6NKlC/bv31+6weohKysLs2fPRnR0dHmHUqzo6GgEBwfD2dkZZmZmcHR0RK9evbB9+/byDu2FyuMzdt++fUwa5CaMwJ49e4SFhYWws7MT48ePF99884346quvxMCBA0WVKlXEyJEjS+3YWVlZAoD44IMPSu0YGo1GPH78WOTn55faMUoSEhIiTE1NhVKpFFu2bCmyfdasWcLc3FwAEAsWLDC4/TNnzggAYs2aNQY9LicnR+Tk5Bh8vJK0adNG+Pr6ytbei2RlZYmAgAABQLRr105ERkaK1atXi88++0z06tVLKJVKMXz48DKJJTExschrkJeXJx4/flyqxzX0tV+zZo0AIObOnSs2bNgg1q9fLxYsWCAaNmwoAIjdu3eXarwvcv/+fQFAzJo1q8i2sng+n2fmzJkCgKhbt66YOXOm+Pbbb8X8+fNF586dBQCxceNGIUTx74Wylp2dLXJzc7U/l/QZm5+fLx4/fiw0Gk2pxBEeHi5K+hP4+PFjkZeXVyrH/TczLY8kxhCJiYkYOHAg3N3dceTIEbi4uGi3hYeHIz4+Hnv37i2149+/fx8AYGdnV2rHUCgUMDc3L7X2X0SlUsHX1xfff/89BgwYoLNt06ZN6NmzJ7Zt21YmsWRlZcHS0hJmZmaytnvv3j34+PjI1l5+fj40Gk2JcUZERODgwYNYtGgR3nvvPZ1tEydORFxcHKKioiQdQwpTU1OYmlbMX//AwECdb6hhYWFwcnLC999/j9dff70cIytZeT6fW7duxdy5c9G/f39s2rQJVapU0W6bPHkyDh48iLy8vHKJrTgqlUrn55I+Y5VKJZRKZVmFpaM8P4+NWnlnNC8yevRoAUCcPHlSr/3z8vLE3LlzhaenpzAzMxPu7u5i+vTpIjs7W2c/d3d30bNnT/Hzzz+LV155RahUKuHh4SHWrVun3WfWrFkCgM7i7u4uhHjyzb7w/08rfMzTDh06JHx9fYWtra2wsrIS9erVE9OnT9duL+lbw+HDh0X79u2FpaWlsLW1Fb179xZ//fVXsceLi4sTISEhwtbWVqjVahEaGioyMzNf+HyFhIQIKysrsXbtWqFSqcTDhw+123799VcBQGzbtq1IhSI5OVlMnDhRNGrUSFhZWQkbGxvRvXt3cfHiRe0+R48eLfL8PX2enTp1Eg0bNhRnz54VHTp0EB
YWFuK9997TbuvUqZO2rbfffluoVKoi5+/v7y/s7OzErVu3ij2/kmJITEwUQgiRlJQkhg8fLhwdHYVKpRJNmjQRa9eu1Wmj8PVZsGCBWLhwofD09BQmJibiwoULxR7zxo0bQqlUiu7duz/nmdf1vGPk5OSIGTNmiBYtWgi1Wi0sLS1F+/btxZEjR4q08/DhQxESEiLUarWwtbUVb7/9trhw4UKR91dx71MhhNiwYYNo0aKFMDc3F1WrVhVvvvmmuHHjhs4+ha/bpUuXROfOnYWFhYVwdXUV//3vf7X7vOi1L05hheLMmTM66zUajVCr1eLtt9/WWZ+RkSEmTJggatasKczMzES9evXEggULinyj1fcz4cyZM8Lf319Uq1ZNmJubi9q1a4thw4bpvD7PLoXViuKeTwAiPDxc7NixQzRs2FCYmZkJHx8fsX///iLnfvToUdGyZUuhUqmEp6enWL58eYmv0bMaNGgg7O3tRXp6+gv3Le6z5rfffhMhISHCw8NDqFQq4eTkJIYNGyYePHig89j09HTx3nvvCXd3d2FmZiYcHByEn5+fOHfunHafv//+WwQHBwsnJyehUqlEjRo1xJtvvilSU1O1+7i7u4uQkBCd5624z9jC90Ph72qhffv2iY4dOwpra2thY2MjWrVqpa3ACCHE8ePHRf/+/YWbm5swMzMTNWvWFO+//77IysrS7hMSElLs61no6de20Pnz50X37t2FjY2NsLKyEq+99pqIiYnR2acw5hMnToiIiAhRvXp1YWlpKYKCgsS9e/de+PoYu4r5FeUpu3fvhqenJ9q1a6fX/iNGjMC6devQv39/TJw4EadPn0ZkZCRiY2OxY8cOnX3j4+PRv39/hIWFISQkBKtXr0ZoaChatmyJhg0bIjg4GHZ2doiIiMCgQYPQo0cPWFtbGxT/pUuX8Prrr6NJkyaYO3cuVCoV4uPjcfLkyec+7qeffkJgYCA8PT0xe/ZsPH78GEuWLIGvry/Onz+P2rVr6+w/YMAAeHh4IDIyEufPn8eqVavg6OiI//73v3rFGRwcjNGjR2P79u0YPnw4gCfViQYNGqBFixZF9r969Sp27tyJN954Ax4eHkhKSsKKFSvQqVMn/PXXX3B1dYW3tzfmzp2LmTNnYtSoUejQoQMA6LyWycnJCAwMxMCBAzFkyBA4OTkVG9+XX36JI0eOICQkBDExMVAqlVixYgUOHTqEDRs2wNXVtdjHeXt7Y8OGDYiIiEDNmjUxceJEAICDgwMeP36Mzp07Iz4+HmPHjoWHhwd+/PFHhIaGIjU1tUhlYc2aNcjOzsaoUaOgUqlgb29f7DH379+PgoICDBky5AXPelHFHSM9PR2rVq3CoEGDMHLkSDx69AjffvstAgIC8Ouvv6JZs2YAACEE+vTpgxMnTmD06NHw9vbGjh07EBISotex582bhxkzZmDAgAEYMWIE7t+/jyVLlqBjx464cOGCzjfIhw8fonv37ggODsaAAQOwdetWTJ06FY0bN0ZgYKBer31J0tLS8ODBAwghcO/ePSxZsgQZGRk6z6cQAr1798bRo0cRFhaGZs2a4eDBg5g8eTJu3bqFhQsXavfV5zPh3r178Pf3h4ODA6ZNmwY7Oztcu3ZNO/7AwcEBy5Ytw5gxY9C3b18EBwcDAJo0afLcczlx4gS2b9+Od999FzY2Nli8eDH69euHGzduoFq1agCACxcuoHv37nBxccGcOXNQUFCAuXPnwsHB4YXPVVxcHC5fvozhw4fDxsbmhfsXJyoqClevXsWwYcPg7OyMS5cu4ZtvvsGlS5fwyy+/aAebjh49Glu3bsXYsWPh4+OD5ORknDhxArGxsWjRogVyc3MREBCAnJwcjBs3Ds7Ozrh16xb27NmD1NRU2NraFjm2oZ+xa9euxfDhw9GwYUNMnz4ddnZ2uHDhAg4cOIC33noLAPDjjz8iKysLY8aMQbVq1fDrr79iyZIl+Oeff/Djjz8CAN555x3cvn
0bUVFR2LBhwwufo0uXLqFDhw5Qq9WYMmUKqlSpghUrVqBz5844duwY2rRpo7P/uHHjULVqVcyaNQvXrl3DokWLMHb
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
"from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay\n",
"\n",
"# One seed shared by the data splits and every stochastic estimator, so that\n",
"# re-running this cell reproduces the same best parameters and metrics.\n",
"RANDOM_STATE = 42\n",
"\n",
"# Load the dataset\n",
"df = pd.read_csv(\"..//static//csv//FINAL_USO.csv\")\n",
"\n",
"# Feature columns (all numerical; no categorical features are used here)\n",
"numerical_cols = ['Volume', 'High', 'Open', 'Close', 'Low']\n",
"\n",
"# Preprocessing: standardize the numerical columns\n",
"preprocessor = ColumnTransformer(\n",
"    transformers=[\n",
"        ('num', StandardScaler(), numerical_cols)\n",
"    ])\n",
"\n",
"# Regression models and their hyper-parameter grids.\n",
"# NOTE(review): models_reg and the regression train/test split below are defined\n",
"# but not used anywhere in this cell; confirm whether another cell relies on them\n",
"# before removing.\n",
"models_reg = {\n",
"    \"Linear Regression\": (LinearRegression(), {}),\n",
"    \"Random Forest Regression\": (RandomForestRegressor(random_state=RANDOM_STATE), {\n",
"        'model__n_estimators': [100, 200],\n",
"        'model__max_depth': [None, 10, 20]\n",
"    }),\n",
"    \"Gradient Boosting Regression\": (GradientBoostingRegressor(random_state=RANDOM_STATE), {\n",
"        'model__n_estimators': [100, 200],\n",
"        'model__learning_rate': [0.01, 0.1],\n",
"        'model__max_depth': [3, 5]\n",
"    })\n",
"}\n",
"\n",
"# Features (X) and target (y) for the regression task\n",
"X_reg = df[numerical_cols]\n",
"y_reg = df['Adj Close']\n",
"\n",
"# Train/test split for the regression task\n",
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=RANDOM_STATE)\n",
"\n",
"# Classification models and their hyper-parameter grids\n",
"models_class = {\n",
"    \"Logistic Regression\": (LogisticRegression(random_state=RANDOM_STATE), {\n",
"        'model__C': [0.1, 1, 10],\n",
"        'model__solver': ['liblinear', 'lbfgs']\n",
"    }),\n",
"    \"Random Forest Classification\": (RandomForestClassifier(random_state=RANDOM_STATE), {\n",
"        'model__n_estimators': [100, 200],\n",
"        'model__max_depth': [None, 10, 20]\n",
"    }),\n",
"    \"Gradient Boosting Classification\": (GradientBoostingClassifier(random_state=RANDOM_STATE), {\n",
"        'model__n_estimators': [100, 200],\n",
"        'model__learning_rate': [0.01, 0.1],\n",
"        'model__max_depth': [3, 5]\n",
"    })\n",
"}\n",
"\n",
"# Features (X) and binary target (y) for the classification task:\n",
"# 1 when 'Adj Close' is above its mean over the whole dataset, 0 otherwise.\n",
"X_class = df[numerical_cols]\n",
"y_class = (df['Adj Close'] > df['Adj Close'].mean()).astype(int)\n",
"\n",
"# Train/test split for the classification task\n",
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=RANDOM_STATE)\n",
"\n",
"# Tune each classifier with a 5-fold grid search on the training split, then\n",
"# report the metrics of the best estimator on the held-out test split.\n",
"print(\"Результаты для задачи классификации:\")\n",
"for name, (model, params) in models_class.items():\n",
"    pipeline = Pipeline(steps=[\n",
"        ('preprocessor', preprocessor),\n",
"        ('model', model)\n",
"    ])\n",
"    grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')\n",
"    grid_search.fit(X_train_class, y_train_class)\n",
"    best_model = grid_search.best_estimator_\n",
"    y_pred_class = best_model.predict(X_test_class)\n",
"    accuracy = accuracy_score(y_test_class, y_pred_class)\n",
"    precision = precision_score(y_test_class, y_pred_class)\n",
"    recall = recall_score(y_test_class, y_pred_class)\n",
"    f1 = f1_score(y_test_class, y_pred_class)\n",
"    print(f\"Model: {name}\")\n",
"    print(f\"Best Parameters: {grid_search.best_params_}\")\n",
"    print(f\"Accuracy: {accuracy}\")\n",
"    print(f\"Precision: {precision}\")\n",
"    print(f\"Recall: {recall}\")\n",
"    print(f\"F1-score: {f1}\")\n",
"    print()\n",
"\n",
"    # Confusion matrix for the test split ('Less' = class 0, 'More' = class 1)\n",
"    cm = confusion_matrix(y_test_class, y_pred_class)\n",
"    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Less', 'More'])\n",
"    disp.plot(cmap=plt.cm.Blues)\n",
"    plt.title(f'Confusion Matrix for {name}')\n",
"    plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Вывод**:\n",
"\n",
"По метрикам Accuracy, Precision, Recall и F1-score на тестовой выборке наилучшие результаты показали градиентный бустинг и случайный лес."
]
},
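{
"cell_type": "markdown",
"metadata": {},
"source": [
"Давайте проанализируем полученные значения метрик и определим, являются ли они приемлемыми или их можно улучшить.\n",
"\n",
"### Оценка смещения и дисперсии для задачи регрессии:\n",
"\n",
"Количественная оценка выполняется ниже с помощью перекрёстной проверки.\n",
"\n",
"### Вывод для задачи регрессии:\n",
"\n",
"- **Random Forest Regression** демонстрирует наилучшие результаты по метрикам MAE и R², что указывает на высокую точность и стабильность модели.\n",
"- **Linear Regression** и **Gradient Boosting Regression** также показывают хорошие результаты, но уступают случайному лесу.\n",
"\n",
"**Примечание.** Результаты 5-кратной перекрёстной проверки (см. вывод ячейки ниже) с этим выводом не согласуются: линейная регрессия получает R² = 1.0 при MAE ≈ 3.5·10⁻¹⁴, а у случайного леса и градиентного бустинга средний R² отрицателен (≈ −0.87 и −0.85). Вывод следует перепроверить.\n",
"\n",
"### Вывод для задачи классификации:\n",
"\n",
"- **Random Forest Classification** демонстрирует наилучшие результаты по всем метрикам (Accuracy, Precision, Recall, F1-score), что указывает на высокую точность и стабильность модели.\n",
"- **Logistic Regression** и **Gradient Boosting Classification** также показывают хорошие результаты, но уступают случайному лесу.\n",
"\n",
"**Примечание.** При перекрёстной проверке (см. вывод ячейки ниже) Gradient Boosting не уступает случайному лесу (средняя Accuracy ≈ 0.9988 против ≈ 0.9953), а у Logistic Regression заметен большой разброс Recall между фолдами (Std ≈ 0.34).\n"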
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Для оценки смещения (bias) и дисперсии (variance) моделей можно использовать метод перекрестной проверки (cross-validation). Этот метод позволяет оценить, насколько хорошо модель обобщается на новых данных: среднее значение метрики по фолдам служит приближённой оценкой смещения, а стандартное отклонение между фолдами — оценкой дисперсии (устойчивости) модели.\n",
"\n",
"Оценка смещения и дисперсии для задачи регрессии:\n",
"Для задачи регрессии мы будем вычислять среднее значение и стандартное отклонение метрик MAE (Mean Absolute Error) и R² (R-squared) по фолдам.\n",
"\n",
"Оценка смещения и дисперсии для задачи классификации:\n",
"Для задачи классификации мы будем вычислять среднее значение и стандартное отклонение метрик Accuracy, Precision, Recall и F1-score по фолдам.\n",
"\n",
"Пример кода для оценки смещения и дисперсии:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Оценка смещения и дисперсии для задачи регрессии:\n",
"Model: Linear Regression\n",
"MAE (Cross-Validation): Mean = 3.475420657900542e-14, Std = 2.3108544967235046e-14\n",
"R² (Cross-Validation): Mean = 1.0, Std = 0.0\n",
"\n",
"Model: Random Forest Regression\n",
"MAE (Cross-Validation): Mean = 4.770713368258853, Std = 9.027907502951276\n",
"R² (Cross-Validation): Mean = -0.8676362010013315, Std = 3.6735082182967664\n",
"\n",
"Model: Gradient Boosting Regression\n",
"MAE (Cross-Validation): Mean = 4.790726208613611, Std = 8.978223486805094\n",
"R² (Cross-Validation): Mean = -0.8531326799804774, Std = 3.6480201756306525\n",
"\n",
"Оценка смещения и дисперсии для задачи классификации:\n",
"Model: Logistic Regression\n",
"Accuracy (Cross-Validation): Mean = 0.9469472506610617, Std = 0.09607008028935687\n",
"Precision (Cross-Validation): Mean = 0.9903846153846153, Std = 0.019230769230769253\n",
"Recall (Cross-Validation): Mean = 0.8244897959183675, Std = 0.34090796763789555\n",
"F1-score (Cross-Validation): Mean = 0.8430120359555126, Std = 0.29664350339720796\n",
"\n",
"Model: Random Forest Classification\n",
"Accuracy (Cross-Validation): Mean = 0.99533527696793, Std = 0.009329446064139945\n",
"Precision (Cross-Validation): Mean = 1.0, Std = 0.0\n",
"Recall (Cross-Validation): Mean = 0.9795918367346939, Std = 0.04081632653061225\n",
"F1-score (Cross-Validation): Mean = 0.9904843365764995, Std = 0.016633019819396317\n",
"\n",
"Model: Gradient Boosting Classification\n",
"Accuracy (Cross-Validation): Mean = 0.9988338192419824, Std = 0.0023323615160349754\n",
"Precision (Cross-Validation): Mean = 1.0, Std = 0.0\n",
"Recall (Cross-Validation): Mean = 0.9959183673469388, Std = 0.008163265306122458\n",
"F1-score (Cross-Validation): Mean = 0.9979381443298969, Std = 0.004123711340206171\n",
"\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import cross_val_score, cross_validate\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
"from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"RANDOM_STATE = 42  # seed for the stochastic estimators so reruns give the same scores\n",
"CV_FOLDS = 5       # number of cross-validation folds\n",
"\n",
"# Load the dataset\n",
"df = pd.read_csv(\"..//static//csv//FINAL_USO.csv\")\n",
"\n",
"# Feature columns (all numerical; no categorical features are used here)\n",
"numerical_cols = ['Volume', 'High', 'Open', 'Close', 'Low']\n",
"\n",
"# Preprocessing: standardize the numerical columns\n",
"preprocessor = ColumnTransformer(\n",
"    transformers=[\n",
"        ('num', StandardScaler(), numerical_cols)\n",
"    ])\n",
"\n",
"# Features (X) and target (y) for the regression task\n",
"X_reg = df[numerical_cols]\n",
"y_reg = df['Adj Close']\n",
"\n",
"# Regression models\n",
"models_reg = {\n",
"    \"Linear Regression\": LinearRegression(),\n",
"    \"Random Forest Regression\": RandomForestRegressor(random_state=RANDOM_STATE),\n",
"    \"Gradient Boosting Regression\": GradientBoostingRegressor(random_state=RANDOM_STATE)\n",
"}\n",
"\n",
"# Report the mean and standard deviation of each metric across the folds.\n",
"# cross_validate fits every fold once and scores it with all requested metrics,\n",
"# so MAE and R² describe the same fitted models; separate cross_val_score calls\n",
"# would refit the pipeline once per metric.\n",
"# NOTE(review): the stored output shows Linear Regression with R² = 1.0 and\n",
"# MAE ≈ 3.5e-14, so 'Adj Close' is presumably an exact linear function of the\n",
"# features (e.g. equal to 'Close'); check for target leakage.\n",
"# NOTE(review): an integer cv uses unshuffled folds; if the rows are ordered by\n",
"# Date this may explain the negative R² of the tree models; confirm.\n",
"print(\"Оценка смещения и дисперсии для задачи регрессии:\")\n",
"for name, model in models_reg.items():\n",
"    pipeline = Pipeline(steps=[\n",
"        ('preprocessor', preprocessor),\n",
"        ('model', model)\n",
"    ])\n",
"    cv_results = cross_validate(pipeline, X_reg, y_reg, cv=CV_FOLDS,\n",
"                                scoring=['neg_mean_absolute_error', 'r2'])\n",
"    # The scorer returns negated MAE; flip the sign to get the error itself.\n",
"    mae_scores = -cv_results['test_neg_mean_absolute_error']\n",
"    r2_scores = cv_results['test_r2']\n",
"    print(f\"Model: {name}\")\n",
"    print(f\"MAE (Cross-Validation): Mean = {mae_scores.mean()}, Std = {mae_scores.std()}\")\n",
"    print(f\"R² (Cross-Validation): Mean = {r2_scores.mean()}, Std = {r2_scores.std()}\")\n",
"    print()\n",
"\n",
"# Features (X) and binary target (y) for the classification task:\n",
"# 1 when 'Adj Close' is above its mean over the whole dataset, 0 otherwise.\n",
"X_class = df[numerical_cols]\n",
"y_class = (df['Adj Close'] > df['Adj Close'].mean()).astype(int)\n",
"\n",
"# Classification models\n",
"models_class = {\n",
"    \"Logistic Regression\": LogisticRegression(random_state=RANDOM_STATE),\n",
"    \"Random Forest Classification\": RandomForestClassifier(random_state=RANDOM_STATE),\n",
"    \"Gradient Boosting Classification\": GradientBoostingClassifier(random_state=RANDOM_STATE)\n",
"}\n",
"\n",
"# Same report for classification: one fit per fold, four metrics per fit.\n",
"print(\"Оценка смещения и дисперсии для задачи классификации:\")\n",
"for name, model in models_class.items():\n",
"    pipeline = Pipeline(steps=[\n",
"        ('preprocessor', preprocessor),\n",
"        ('model', model)\n",
"    ])\n",
"    cv_results = cross_validate(pipeline, X_class, y_class, cv=CV_FOLDS,\n",
"                                scoring=['accuracy', 'precision', 'recall', 'f1'])\n",
"    accuracy_scores = cv_results['test_accuracy']\n",
"    precision_scores = cv_results['test_precision']\n",
"    recall_scores = cv_results['test_recall']\n",
"    f1_scores = cv_results['test_f1']\n",
"    print(f\"Model: {name}\")\n",
"    print(f\"Accuracy (Cross-Validation): Mean = {accuracy_scores.mean()}, Std = {accuracy_scores.std()}\")\n",
"    print(f\"Precision (Cross-Validation): Mean = {precision_scores.mean()}, Std = {precision_scores.std()}\")\n",
"    print(f\"Recall (Cross-Validation): Mean = {recall_scores.mean()}, Std = {recall_scores.std()}\")\n",
"    print(f\"F1-score (Cross-Validation): Mean = {f1_scores.mean()}, Std = {f1_scores.std()}\")\n",
"    print()\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABccAAAJOCAYAAABycr+9AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACOp0lEQVR4nOzdeXgT5f7+8TuFNKUrIIWCFAoFZQcF2RUQbFkE8YgoHGUREQVUwA08IlTRuoMKsriAC7gjHj0CZfcguADiEQQEZFGkbEIDLU1LO78//DXfhqRtCmnTZt6v6+pFMnkm85kkz+TJzZOJxTAMQwAAAAAAAAAAmEiQvwsAAAAAAAAAAKC0EY4DAAAAAAAAAEyHcBwAAAAAAAAAYDqE4wAAAAAAAAAA0yEcBwAAAAAAAACYDuE4AAAAAAAAAMB0CMcBAAAAAAAAAKZDOA4AAAAAAAAAMB3CcQAAAAAAAACA6RCOA3AaNmyYwsPDS3Wb+/fvl8Vi0YIFC0p1u4Hso48+UtWqVXXmzBl/l1Kg9u3b6+GHH/a6/ZkzZ3TnnXcqJiZGFotF48aNK7ni4GSxWDR16lR/l1FqLuZ4tHbtWlksFq1du9bndQEAYDbFGYN8//33Cg4O1oEDB3xex5w5c1SnTh05HA6v13n33XfVqFEjWa1WVa5c2ec1wd2wYcMUFxfn7zJKVdeuXdW1a9cLWjcuLk7Dhg3zaT1AeUY4DniwYMECWSwWWSwWrV+/3u12wzAUGxsri8Wi66+/3g8VFl9OTo5q1aoli8WipUuX+rscn8jIyNDUqVNLJIzKe/49/d19990+356v5OTkaMqUKbr33ntd/qMjLi5OFotFPXr08Lje66+/7ty/TZs2eWzz8MMPy2Kx6JZbbvF4e16wWNDfM88842z7yCOPaNasWUpNTfVqv55++mktWLBA99xzj959913dfvvtXq13ofIer7y/sLAwtW3bVu+8806Jbhd/mzp1qiwWi4KCgvT777+73W6321WpUiVZLBaNHTvWDxUCAFC4/J8nLBaLKlasqEsvvVTDhg3ToUOH/F1eQPnXv/6lQYMGqW7dus5lXbt2dXn8K1WqpBYtWmjGjBnKzc11Wf/2229X586d1a5dO3Xp0kU7duxw3jZs2DBlZWVp7ty5XtWyc+dODRs2TPHx8Xr99dc1b9483+xkAfLGTHl/VqtVcXFxuu+++3Tq1KkS3TZcP/9MmzbNY5t//vOfslgspT4JDYD3Kvq7AKAsCwkJ0aJFi9S5c2eX5evWrdMff/whm83mp8qKb/Xq1Tp8+LDi4uK0cOFC9erVy98lXbSMjAwlJSVJ0gX/r3lhrrvuOg0ZMsRt+WWXXebzbfnKF198oV27dumuu+5yuy0kJERr1qxRamqqYmJiXG5buHChQkJClJmZ6fF+DcPQ+++/r7i4OH3xxRc6ffq0IiIiPLYdNGiQevfu7bb8iiuucF6+4YYbFBkZqddee01PPPFEkfu1evVqtW/fXlOmTCmyra+0atVKDzzwgCTp8OHDeuONNzR06FA5HA6NHDmy1Orwp7Nnz6piRf8NFWw2m95//323bxksXrzYTxUBAFA8TzzxhOrVq6fMzEx9++23WrBggdavX69t27YpJCTE3+WVe1u3btXKlSu1YcMGt9tq166t5ORkSdLx48e1aNEijR8/XseOHdNTTz3lbDd58mTn+H7cuHEaPXq01qxZI+nv8fPQoUP10ksv6d5775XFYim0nrVr1yo3N1cvv/yyGjRo4KvdLNLs2bMVHh6u9PR0rVq1Sq+++qq2bNnicaJXIHr99dfd/tOjNIWEhOj999/XY4895rI8PT1dn3/+OX0dKOOYOQ4Uonfv3vr444917tw5l+WLFi1S69at3QLGsuy9997TlVdeqfHjx2vJkiVKT0/3d0ll3mWXXabbbrvN7a9t27aFrpeRkeFx+blz55SVlXVRNRX1vM2fP1+dOnXSpZde6n
Zbp06dFB4erg8//NBl+R9//KH//ve/6tOnT4H3u3btWv3xxx966623dO7cuULDySuvvNLj49a0aVNnm6CgIA0YMEDvvPOODMModJ8k6ejRoz79Wqo3z8Wll17qrP2hhx7S+vXrFR4erunTp/usDm/5q7+GhIT4NRzv3bu33n//fbflixYtKvT1CgBAWdGrVy/ddtttuvPOO/XGG2/owQcf1N69e/Xvf//b36WViILGwSVl/vz5qlOnjtq3b+92W1RUlHMsN27cOH399deqW7euXn31VeXk5Djb5Z/4YhiGgoJcY5KBAwfqwIEDzsC8MEePHpUkn45bvXlMBwwYoNtuu02jRo3SRx99pFtuuUXffPONvv/+e5/V4Y3c3NwCJ9uUJKvV6teJa71799Yvv/yin376yWX5559/rqysLF133XV+qgyANwjHgUIMGjRIJ06c0IoVK5zLsrKy9Mknn2jw4MEe18nNzdWMGTPUtGlThYSEqEaNGho1apROnjzp0u7zzz9Xnz59VKtWLdlsNsXHx+vJJ590GahJf8+IbtasmX755Rd169ZNoaGhuvTSS/Xcc895vR9nz57VZ599pltvvVUDBw7U2bNn9fnnnxfY/rffflNiYqLCwsJUq1YtPfHEE24B5gcffKDWrVsrIiJCkZGRat68uV5++WW3+7n55ptVtWpVhYaGqn379vrPf/5TZL0FnT8t/7nk9u/fr+joaElSUlKS8+ts+c9NuHPnTg0YMEBVq1ZVSEiI2rRp4/MPInnPz+bNm3XNNdcoNDRUjz76qPMrdi+88IJmzJih+Ph42Ww2/fLLL5L+ngl99dVXKywsTJUrV9YNN9zg8hVO6f++JvnLL79o8ODBqlKlitu3GPLLzMzUsmXLCjx1SkhIiP7xj39o0aJFLsvff/99ValSRYmJiQXe98KFC9WkSRN169ZNPXr00MKFC719iAp03XXX6cCBA9q6dWuBbfLO47xv3z795z//cT7P+/fvl/T3B5ARI0aoRo0aCgkJUcuWLfX222+73EdRz4W3oqOj1ahRI+3du9dlubd9Pjc3V1OnTlWtWrUUGhqqbt266ZdffnE751/e17DXrVun0aNHq3r16qpdu7bz9qVLlzpfOxEREerTp4+2b9/usq3U1FQNHz5ctWvXls1mU82aNXXDDTc4HzdJ2rRpkxITE1WtWjVVqlRJ9erV0x133OFyP57O9/njjz+qV69eioyMVHh4uLp3765vv/3WpU3ePnzzzTeaMGGCoqOjFRYWphtvvFHHjh3z9iHX4MGDtXXrVu3cudNl31avXl3gMdib14QknTp1SsOGDVNUVJQqV66soUOHFvj14ws9luzevVs33XSTYmJiFBISotq1a+vWW29VWlqadw8AACDgXH311ZLkNp7wxJv3c8MwNG3aNNWuXds5vti+fbvb+CJvXHm+vPfs/PdZ3M8p54+DJcnhcGjKlClq0KCBbDabYmNj9fDDD7udu9vhcGj8+PGKjo5WRESE+vXrpz/++KPIxybPkiVLdO211xY5o1v6eyx81VVX6fTp084QO79Vq1bpjTfecDkVoCS1bt1aVatWLfTzk/T3afnyvuUYHR3tNo567bXX1LRpU9lsNtWqVUtjxoxxG3sU9pgWR0Gvs++++049e/ZUVFSUQkND1aVLF33zzTdu669du1Zt2rRRSEiI4uPjNXfuXI+vobxT3C1cuNC5b8uWLZMkHTp0SHfccYdq1Kghm82mpk2b6q233nLb1quvvqqmTZsqNDRUVapUUZs2bVw+r5w+fVrjxo1TXFycbDabqlevruuuu05btmxxtvF0zvH09HQ98MADio2Nlc1m0+WXX64XXnjB7XNt3j4sWbJEzZo1c9aatx/e6NChg+rVq+f2OWvhwoXq2bOnqlat6nE9b14TkjRv3jzFx8erUqVKatu2rf773/96vD9v+935srOzlZSUpIYNGyokJESXXHKJOnfu7JKDAIGM06oAhY
iLi1OHDh30/vvvO09DsnTpUqWlpenWW2/VK6+84rbOqFGjtGDBAg0fPlz33Xef9u3bp5kzZ+rHH3/UN998I6vVKun
"text/plain": [
"<Figure size 1500x600 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABpMAAASlCAYAAABEPCH1AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXhV1dk/7ieBkICAiEBApICzKAXFSnHWIjhhcUSxMql1oqKpVnEAsVW+alV8FUWtqK2o1NlXKYIItVYqrYpWKyoq4sCopUGGMGT//uiPvMZkYwIhJ4n3fV1cctZZ++znHPaKe+Vz9tpZSZIkAQAAAAAAAOXIznQBAAAAAAAA1FzCJAAAAAAAAFIJkwAAAAAAAEglTAIAAAAAACCVMAkAAAAAAIBUwiQAAAAAAABSCZMAAAAAAABIJUwCAAAAAAAglTAJAAAAAACAVMIkACIi4oMPPohevXrF1ltvHVlZWfHUU09luqQSgwYNig4dOmRs//fff39kZWXFvHnzSrXfeOONscMOO0S9evWia9euERHRoUOHGDRoULXXePXVV0dWVla17zeTNue4OOSQQ+KQQw6p0noAANi4TTl/mzFjRmRlZcWMGTO2SE3fpbxz/ppg3rx5kZWVFffff3/Gaihv7lPevDJtPlUdsrKy4uqrr672/WbK5hwXmR5rQM0nTALqlDvuuCOysrKie/fumS6l1hk4cGD885//jGuvvTb+8Ic/xD777LPF91lYWBijRo2KLl26ROPGjaNhw4ax5557xqWXXhpffPHFFt//5pgyZUr86le/iv333z/uu+++uO6667b4PleuXBlXX311jTu5z8rKiqysrDjzzDPLff6KK64o6bN06dJqrg4A4Ptrwy/xN/zJy8uLXXbZJYYOHRqLFi3KdHk1XibO+SP++0v9448/Plq3bh0NGjSIVq1aRZ8+feKJJ56olv1vjkzMKydNmlTjAqMNX/bLzs6OTz/9tMzzhYWF0bBhw8jKyoqhQ4dmoEKAyquf6QIAqtKECROiQ4cOMWvWrJg7d27stNNOmS6pVli1alXMnDkzrrjiimo7kf3oo4+iZ8+eMX/+/DjppJPi5z//eTRo0CDeeuutuPfee+PJJ5+M999/v1pq+S6nn356nHLKKZGbm1vS9uKLL0Z2dnbce++90aBBg5L29957L7Kzt8x3NVauXBmjRo2KiChzVc2VV14Zl1122RbZb0Xk5eXF448/HnfccUepzyMi4uGHH468vLxYvXp1hqoDAPh+u+aaa6Jjx46xevXqePnll+POO++MSZMmxdtvvx2NGjWqtjruueeeKC4urtQ2Bx10UKxatarMOWZ1SDvn35JGjhwZ11xzTey8885x9tlnR/v27ePLL7+MSZMmxQknnBATJkyI/v37V0st3+Xbc5+0eWV586mqNGnSpBg7dmy5gdKqVauifv3M/fozNzc3Hn744fjVr35Vqr02BIMA3+bKJKDO+Pjjj+OVV16Jm2++OVq2bBkTJkzIdEmpVqxYkekSSlmyZElERDRr1qzKXnNj73HdunVx/PHHx6JFi2LGjBnx8MMPx/nnnx9nnXVW3HbbbfHRRx/FSSedVGW1bK569epFXl5eqWXkFi9eHA0bNiwzqczNzY2cnJzqLjHq168feXl51b7fDY444ogoLCyMP/3pT6XaX3nllfj444/j6KOPzlBlAAAceeSR8bOf/SzOPPPMuP/+++PCCy+Mjz/+OJ5++unUbbbEnCUnJ6fSgUJ2dnbk5eVtsS9sbUzaOf+mSpIkVq1alfr8Y489Ftdcc02ceOKJ8c4778SoUaNiyJAhcckll8T06dNj8uTJ0bRp0yqppSp8e+6TNq8sbz5VXfLy8jIaJh111FHx8MMPl2l/6KGHzJGAWkeYBNQZEyZMiG222SaOPvroOPHEE1PDpGXLlsVFF10UHTp0iNzc3Nh+++1jwIABpZbfWr16dVx99dWxyy67RF
5eXrRp0yaOP/74+PDDDyMifS3h8tYnHjRoUDRu3Dg+/PDDOOqoo6JJkyZx2mmnRUTEX/7ylzjppJPiBz/4QeTm5ka7du3ioosuKneCMWfOnDj55JOjZcuW0bBhw9h1113jiiuuiIiI6dOnR1ZWVjz55JNltnvooYciKysrZs6cWe7ncfXVV0f79u0jIuKSSy6JrKysUuuYv/HGG3HkkUdG06ZNo3HjxvGTn/wk/va3v5V6jQ3LZ/z5z3+O8847L1q1ahXbb799ufuLiHj88cfjzTffjCuuuCIOOOCAMs83bdo0rr322tTtIyJ++9vfxn777RfbbrttNGzYMLp16xaPPfZYmX5Tp06NAw44IJo1axaNGzeOXXfdNS6//PJSfW677bbYY489olGjRrHNNtvEPvvsEw899FCZ97dhje+srKy47777YsWKFSVLhmz4Ny9v3fDvOubWrFkTI0aMiG7dusXWW28dW221VRx44IExffr0kteYN29etGzZMiIiRo0aVbLfDd++K++eSevWrYtf//rXseOOO0Zubm506NAhLr/88igqKirVr0OHDnHMMcfEyy+/HPvuu2/k5eXFDjvsEL///e83+m/wTW3bto2DDjqo1OcW8d9x2blz59hzzz3L3e7RRx+Nbt26RcOGDaNFixbxs5/9LD7//PMy/Z566qnYc889Iy8vL/bcc89yj/WIiOLi4hgzZkzssccekZeXF/n5+XH22WfHv//97+98D991HAAA1BWHHXZYRPz3C3kRG5+zVOb86k9/+lMcfPDB0aRJk2jatGn86Ec/KnU+Vd49kx555JHo1q1byTadO3eOW2+9teT5tLlXRc4jN7yvzz//PPr27RuNGzeOli1bxsUXXxzr16/f6Ge0sXP+yp5nP//887HPPvtEw4YN46677krd51VXXRXNmzeP8ePHl/sFtd69e8cxxxyTuv1bb70VgwYNih122CHy8vKidevWMWTIkPjyyy9L9Vu+fHlceOGFJfOTVq1axeGHHx6vv/56SZ8PPvggTjjhhGjdunXk5eXF9ttvH6ecckr85z//KfX+Nsx9NjavTLtn0ncdLxWZLw8aNCjGjh0bEVFqSccNyrtnUmXmuH/961+joKAgWrZsGVtttVUcd9xxJaFZRfTv3z9mz54dc+bMKWlbuHBhvPjii6lXmC1evDjOOOOMyM/Pj7y8vOjSpUs88MADZfotW7YsBg0aFFtvvXU0a9YsBg4cGMuWLSv3NefMmRMnnnhiNG/ePPLy8mKfffaJZ5555jvrr8hxAHx/WOYOqDMmTJgQxx9/fDRo0CBOPfXUuPPOO+Pvf/97/OhHPyrp8/XXX8eBBx4Y7777bgwZMiT23nvvWLp0aTzzzDPx2WefRYsWLWL9+vVxzDHHxLRp0+KUU06JYcOGxfLly2Pq1Knx9ttvx4477ljp2tatWxe9e/eOAw44IH7729+WLCXx6KOPxsqVK+Pcc8+NbbfdNmbNmhW33XZbfPbZZ/Hoo4+WbP/WW2/FgQceGDk5OfHzn/88OnToEB9++GH87//+b1x77bVxyCGHRLt27WLChAlx3HHHlflcdtxxx+jRo0e5tR1//PHRrFmzuOiii+LUU0+No446Kho3bhwREe+8804ceOCB0bRp0/jVr34VOTk5cdddd8UhhxwSf/7zn8vcm+q8886Lli1bxogRIzb6TcYNJ62nn356pT/LDW699dY49thj47TTTos1a9bEI488EieddFI8++yzJd/weuedd+KYY46JH/7wh3HNNddEbm5uzJ07N/7617+WvM4999wTF1xwQZx44okxbNiwWL16dbz11lvx6quvpp7c/+EPf4i77747Zs2aFb/73e8iImK//fYrt29FjrnCwsL43e9+F6eeemqcddZZsXz58rj33nujd+/eMWvWrOjatWu0bNky7rzzzjj33HPjuOOOi+OPPz4iIn74wx+mfkZnnnlmPPDAA3HiiSfGL3/5y3j11V
dj9OjR8e6775YJY+bOnRsnnnhinHHGGTFw4MAYP358DBo0KLp16xZ77LFHhf5N+vfvH8OGDYuvv/46GjduHOvWrYt
"text/plain": [
"<Figure size 1700x1200 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import cross_val_score, cross_validate\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
"from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"RANDOM_STATE = 42  # seed for the stochastic estimators so reruns give the same scores\n",
"CV_FOLDS = 5       # number of cross-validation folds\n",
"\n",
"# Load the dataset\n",
"df = pd.read_csv(\"..//static//csv//FINAL_USO.csv\")\n",
"\n",
"# Feature columns (all numerical; no categorical features are used here)\n",
"numerical_cols = ['Volume', 'High', 'Open', 'Close', 'Low']\n",
"\n",
"# Preprocessing: standardize the numerical columns\n",
"preprocessor = ColumnTransformer(\n",
"    transformers=[\n",
"        ('num', StandardScaler(), numerical_cols)\n",
"    ])\n",
"\n",
"\n",
"def cross_validate_models(models, X, y, scoring, cv=CV_FOLDS):\n",
"    \"\"\"Cross-validate every model once and summarise each metric across folds.\n",
"\n",
"    Each pipeline is fitted once per fold and scored with all metrics in\n",
"    `scoring`, so the metrics of a model describe the same fitted estimators.\n",
"\n",
"    Returns a pair of dicts keyed by scorer name: per-model lists of fold means\n",
"    and of fold standard deviations, in the iteration order of `models`.\n",
"    \"\"\"\n",
"    means = {metric: [] for metric in scoring}\n",
"    stds = {metric: [] for metric in scoring}\n",
"    for model in models.values():\n",
"        pipeline = Pipeline(steps=[\n",
"            ('preprocessor', preprocessor),\n",
"            ('model', model)\n",
"        ])\n",
"        cv_results = cross_validate(pipeline, X, y, cv=cv, scoring=scoring)\n",
"        for metric in scoring:\n",
"            fold_scores = cv_results[f'test_{metric}']\n",
"            means[metric].append(fold_scores.mean())\n",
"            stds[metric].append(fold_scores.std())\n",
"    return means, stds\n",
"\n",
"\n",
"def plot_metric_bars(ax, model_names, means, stds, ylabel, title):\n",
"    \"\"\"Draw one bar per model (mean score) with the fold std as error bars.\"\"\"\n",
"    ax.bar(list(model_names), means, yerr=stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n",
"    ax.set_ylabel(ylabel)\n",
"    ax.set_title(title)\n",
"    ax.yaxis.grid(True)\n",
"\n",
"\n",
"# Features (X) and target (y) for the regression task\n",
"X_reg = df[numerical_cols]\n",
"y_reg = df['Adj Close']\n",
"\n",
"# Regression models\n",
"models_reg = {\n",
"    \"Linear Regression\": LinearRegression(),\n",
"    \"Random Forest Regression\": RandomForestRegressor(random_state=RANDOM_STATE),\n",
"    \"Gradient Boosting Regression\": GradientBoostingRegressor(random_state=RANDOM_STATE)\n",
"}\n",
"\n",
"# Mean and standard deviation of each regression metric across the folds\n",
"reg_means, reg_stds = cross_validate_models(\n",
"    models_reg, X_reg, y_reg, scoring=['neg_mean_absolute_error', 'r2'])\n",
"# The scorer returns negated MAE; flip the sign of the means (the std is unaffected).\n",
"mae_means = [-score for score in reg_means['neg_mean_absolute_error']]\n",
"mae_stds = reg_stds['neg_mean_absolute_error']\n",
"r2_means = reg_means['r2']\n",
"r2_stds = reg_stds['r2']\n",
"\n",
"# Plot the regression results\n",
"fig, ax = plt.subplots(1, 2, figsize=(15, 6))\n",
"plot_metric_bars(ax[0], models_reg.keys(), mae_means, mae_stds,\n",
"                 'MAE', 'Mean Absolute Error (MAE) for Regression Models')\n",
"plot_metric_bars(ax[1], models_reg.keys(), r2_means, r2_stds,\n",
"                 'R²', 'R-squared (R²) for Regression Models')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"# Features (X) and binary target (y) for the classification task:\n",
"# 1 when 'Adj Close' is above its mean over the whole dataset, 0 otherwise.\n",
"X_class = df[numerical_cols]\n",
"y_class = (df['Adj Close'] > df['Adj Close'].mean()).astype(int)\n",
"\n",
"# Classification models\n",
"models_class = {\n",
"    \"Logistic Regression\": LogisticRegression(random_state=RANDOM_STATE),\n",
"    \"Random Forest Classification\": RandomForestClassifier(random_state=RANDOM_STATE),\n",
"    \"Gradient Boosting Classification\": GradientBoostingClassifier(random_state=RANDOM_STATE)\n",
"}\n",
"\n",
"# Mean and standard deviation of each classification metric across the folds\n",
"class_means, class_stds = cross_validate_models(\n",
"    models_class, X_class, y_class, scoring=['accuracy', 'precision', 'recall', 'f1'])\n",
"# Keep the per-metric names bound, in case later code refers to them.\n",
"accuracy_means, accuracy_stds = class_means['accuracy'], class_stds['accuracy']\n",
"precision_means, precision_stds = class_means['precision'], class_stds['precision']\n",
"recall_means, recall_stds = class_means['recall'], class_stds['recall']\n",
"f1_means, f1_stds = class_means['f1'], class_stds['f1']\n",
"\n",
"# Plot the classification results: one panel per metric\n",
"fig, ax = plt.subplots(2, 2, figsize=(17, 12))\n",
"panels = [\n",
"    (ax[0, 0], 'accuracy', 'Accuracy', 'Accuracy for Classification Models'),\n",
"    (ax[0, 1], 'precision', 'Precision', 'Precision for Classification Models'),\n",
"    (ax[1, 0], 'recall', 'Recall', 'Recall for Classification Models'),\n",
"    (ax[1, 1], 'f1', 'F1-score', 'F1-score for Classification Models'),\n",
"]\n",
"for panel_ax, metric, ylabel, title in panels:\n",
"    plot_metric_bars(panel_ax, models_class.keys(), class_means[metric], class_stds[metric], ylabel, title)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}