731 lines
27 KiB
Plaintext
731 lines
27 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
" ## Вариант 13 \n",
|
||
" https://www.kaggle.com/datasets/nancyalaswad90/yamana-gold-inc-stock-price?resource=download\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 105,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',\n",
|
||
" 'Day_of_week', 'Month', 'Year'],\n",
|
||
" dtype='object')"
|
||
]
|
||
},
|
||
"execution_count": 105,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"data = pd.read_csv(\"..//static//csv//Yamana_Gold_Inc._AUY.csv\", sep=\",\")\n",
|
||
"\n",
|
||
"# Преобразование даты\n",
|
||
"data['Date'] = pd.to_datetime(data['Date'])\n",
|
||
"\n",
|
||
"# Преобразование данных: создание новых признаков\n",
|
||
"data['Day_of_week'] = data['Date'].dt.dayofweek\n",
|
||
"data['Month'] = data['Date'].dt.month\n",
|
||
"data['Year'] = data['Date'].dt.year\n",
|
||
"data .columns"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#1. Бизнес-цели для набора данных по акции компании Yamana Gold Inc.\n",
|
||
"Цель 1: Прогнозирование изменения цены акции компании.\n",
|
||
"Прогнозирование цен на акции является одной из ключевых задач в области финансов и инвестирования. Задача состоит в предсказании будущих изменений стоимости акции на основе исторических данных, таких как открытие и закрытие торгов, объемы торгов и другие показатели.\n",
|
||
"\n",
|
||
"Цель 2: Оценка волатильности акций компании.\n",
|
||
"Измерение волатильности позволяет инвесторам оценить риск и принять решения по управлению капиталом. Задача заключается в прогнозировании уровня волатильности на основе исторической динамики цен, объемов торгов и других рыночных факторов.\n",
|
||
"\n",
|
||
"#2. Цели технического проекта для каждой бизнес-цели\n",
|
||
"Цель 1: Прогнозирование изменения цены акции компании\n",
|
||
"\n",
|
||
"Разработать модель машинного обучения для прогнозирования будущих цен акций на основе исторических данных.\n",
|
||
"Использовать регрессионные модели, такие как линейная регрессия или более сложные модели, например, LSTM (долгосрочная краткосрочная память) для временных рядов.\n",
|
||
"Цель 2: Оценка волатильности акций компании\n",
|
||
"\n",
|
||
"Создать модель, которая будет прогнозировать волатильность на основе исторических данных о ценах.\n",
|
||
"Использовать методы статистического анализа, такие как вычисление стандартного отклонения, или методы машинного обучения для более точной оценки волатильности.\n",
|
||
"\n",
|
||
"#3 Проверим датасет на пропуски и удалим при необходимости строки с недостающими данными"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 106,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Date 0\n",
|
||
"Open 0\n",
|
||
"High 0\n",
|
||
"Low 0\n",
|
||
"Close 0\n",
|
||
"Adj Close 0\n",
|
||
"Volume 0\n",
|
||
"Day_of_week 0\n",
|
||
"Month 0\n",
|
||
"Year 0\n",
|
||
"dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 106,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"\n",
|
||
"# Проверим на пропущенные значения\n",
|
||
"data.isnull().sum()\n",
|
||
"\n",
|
||
"# Заполним пропуски или удалим строки с пропусками\n",
|
||
"data = data.dropna()\n",
|
||
"\n",
|
||
"# Проверим, что данные очищены\n",
|
||
"data.isnull().sum()\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Конструирование признаков"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 107,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Date</th>\n",
|
||
" <th>Open</th>\n",
|
||
" <th>High</th>\n",
|
||
" <th>Low</th>\n",
|
||
" <th>Close</th>\n",
|
||
" <th>Adj Close</th>\n",
|
||
" <th>Volume</th>\n",
|
||
" <th>Day_of_week</th>\n",
|
||
" <th>Month</th>\n",
|
||
" <th>Year</th>\n",
|
||
" <th>Price_Change</th>\n",
|
||
" <th>SMA_5</th>\n",
|
||
" <th>SMA_20</th>\n",
|
||
" <th>STD_5</th>\n",
|
||
" <th>STD_20</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>2001-06-22</td>\n",
|
||
" <td>3.428571</td>\n",
|
||
" <td>3.428571</td>\n",
|
||
" <td>3.428571</td>\n",
|
||
" <td>3.428571</td>\n",
|
||
" <td>2.806002</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2001</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2001-06-25</td>\n",
|
||
" <td>3.428571</td>\n",
|
||
" <td>3.428571</td>\n",
|
||
" <td>3.428571</td>\n",
|
||
" <td>3.428571</td>\n",
|
||
" <td>2.806002</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2001</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2001-06-26</td>\n",
|
||
" <td>3.714286</td>\n",
|
||
" <td>3.714286</td>\n",
|
||
" <td>3.714286</td>\n",
|
||
" <td>3.714286</td>\n",
|
||
" <td>3.039837</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2001</td>\n",
|
||
" <td>0.285715</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>2001-06-27</td>\n",
|
||
" <td>3.714286</td>\n",
|
||
" <td>3.714286</td>\n",
|
||
" <td>3.714286</td>\n",
|
||
" <td>3.714286</td>\n",
|
||
" <td>3.039837</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2001</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>2001-06-28</td>\n",
|
||
" <td>3.714286</td>\n",
|
||
" <td>3.714286</td>\n",
|
||
" <td>3.714286</td>\n",
|
||
" <td>3.714286</td>\n",
|
||
" <td>3.039837</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2001</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>3.6</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.156493</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Date Open High Low Close Adj Close Volume \\\n",
|
||
"0 2001-06-22 3.428571 3.428571 3.428571 3.428571 2.806002 0 \n",
|
||
"1 2001-06-25 3.428571 3.428571 3.428571 3.428571 2.806002 0 \n",
|
||
"2 2001-06-26 3.714286 3.714286 3.714286 3.714286 3.039837 0 \n",
|
||
"3 2001-06-27 3.714286 3.714286 3.714286 3.714286 3.039837 0 \n",
|
||
"4 2001-06-28 3.714286 3.714286 3.714286 3.714286 3.039837 0 \n",
|
||
"\n",
|
||
" Day_of_week Month Year Price_Change SMA_5 SMA_20 STD_5 STD_20 \n",
|
||
"0 4 6 2001 NaN NaN NaN NaN NaN \n",
|
||
"1 0 6 2001 0.000000 NaN NaN NaN NaN \n",
|
||
"2 1 6 2001 0.285715 NaN NaN NaN NaN \n",
|
||
"3 2 6 2001 0.000000 NaN NaN NaN NaN \n",
|
||
"4 3 6 2001 0.000000 3.6 NaN 0.156493 NaN "
|
||
]
|
||
},
|
||
"execution_count": 107,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Создаем новый признак - разницу между текущей и предыдущей ценой (Price_Change)\n",
|
||
"data['Price_Change'] = data['Close'].diff()\n",
|
||
"\n",
|
||
"# Создадим скользящие средние для 5 и 20 дней\n",
|
||
"data['SMA_5'] = data['Close'].rolling(window=5).mean()\n",
|
||
"data['SMA_20'] = data['Close'].rolling(window=20).mean()\n",
|
||
"\n",
|
||
"# Стандартное отклонение для 5 и 20 дней\n",
|
||
"data['STD_5'] = data['Close'].rolling(window=5).std()\n",
|
||
"data['STD_20'] = data['Close'].rolling(window=20).std()\n",
|
||
"\n",
|
||
"data.head()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
" Разделение данных на обучающую, контрольную и тестовую выборки"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 108,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"((3150, 10), (1050, 10), (1051, 10))"
|
||
]
|
||
},
|
||
"execution_count": 108,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"\n",
|
||
"# Преобразуем колонку 'Date' в тип datetime для правильного сортирования\n",
|
||
"data['Date'] = pd.to_datetime(data['Date'])\n",
|
||
"\n",
|
||
"# Сортируем данные по дате, чтобы не нарушить временную зависимость\n",
|
||
"data = data.sort_values(by='Date')\n",
|
||
"\n",
|
||
"# Отделяем целевую переменную (например, Price_Change) и признаки\n",
|
||
"X = data[['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'SMA_5', 'SMA_20', 'STD_5', 'STD_20']]\n",
|
||
"y = data['Price_Change']\n",
|
||
"\n",
|
||
"# Разделение на обучающую, контрольную и тестовую выборки (60%, 20%, 20%)\n",
|
||
"X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, shuffle=False)\n",
|
||
"X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=False)\n",
|
||
"\n",
|
||
"# Проверка размеров выборок\n",
|
||
"(X_train.shape, X_val.shape, X_test.shape)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Конструирование признаков для решения задач"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 109,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Признаки для задачи прогнозирования изменений цен\n",
|
||
"data['Price_Change'] = data['Close'].diff()\n",
|
||
"\n",
|
||
"# Скользящие средние и стандартное отклонение\n",
|
||
"data['SMA_5'] = data['Close'].rolling(window=5).mean()\n",
|
||
"data['SMA_20'] = data['Close'].rolling(window=20).mean()\n",
|
||
"data['STD_5'] = data['Close'].rolling(window=5).std()\n",
|
||
"data['STD_20'] = data['Close'].rolling(window=20).std()\n",
|
||
"\n",
|
||
"# Признаки для оценки волатильности\n",
|
||
"data['Volatility'] = data['Close'].rolling(window=5).std()\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Подготовка признаков: one-hot encoding, дискретизация, синтез признаков, масштабирование\n",
|
||
"One-hot encoding: Применим для категориальных признаков (например, день недели).\n",
|
||
"Масштабирование: Стандартизируем числовые признаки."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 110,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" Day_of_week Month\n",
|
||
"0 4 6\n",
|
||
"1 0 6\n",
|
||
"2 1 6\n",
|
||
"3 2 6\n",
|
||
"4 3 6\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Close</th>\n",
|
||
" <th>SMA_5</th>\n",
|
||
" <th>SMA_20</th>\n",
|
||
" <th>STD_5</th>\n",
|
||
" <th>STD_20</th>\n",
|
||
" <th>Day_of_week_1</th>\n",
|
||
" <th>Day_of_week_2</th>\n",
|
||
" <th>Day_of_week_3</th>\n",
|
||
" <th>Day_of_week_4</th>\n",
|
||
" <th>Month_2</th>\n",
|
||
" <th>Month_3</th>\n",
|
||
" <th>Month_4</th>\n",
|
||
" <th>Month_5</th>\n",
|
||
" <th>Month_6</th>\n",
|
||
" <th>Month_7</th>\n",
|
||
" <th>Month_8</th>\n",
|
||
" <th>Month_9</th>\n",
|
||
" <th>Month_10</th>\n",
|
||
" <th>Month_11</th>\n",
|
||
" <th>Month_12</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>-0.721096</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>-0.507666</td>\n",
|
||
" <td>-0.507962</td>\n",
|
||
" <td>-0.502320</td>\n",
|
||
" <td>1.999048</td>\n",
|
||
" <td>-0.28793</td>\n",
|
||
" <td>-0.309491</td>\n",
|
||
" <td>-0.300916</td>\n",
|
||
" <td>-0.297137</td>\n",
|
||
" <td>3.335719</td>\n",
|
||
" <td>-0.30429</td>\n",
|
||
" <td>-0.311702</td>\n",
|
||
" <td>-0.296377</td>\n",
|
||
" <td>-0.311335</td>\n",
|
||
" <td>-0.298274</td>\n",
|
||
" <td>-0.303543</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>-0.721096</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>-0.507666</td>\n",
|
||
" <td>-0.507962</td>\n",
|
||
" <td>-0.502320</td>\n",
|
||
" <td>-0.500238</td>\n",
|
||
" <td>-0.28793</td>\n",
|
||
" <td>-0.309491</td>\n",
|
||
" <td>-0.300916</td>\n",
|
||
" <td>-0.297137</td>\n",
|
||
" <td>3.335719</td>\n",
|
||
" <td>-0.30429</td>\n",
|
||
" <td>-0.311702</td>\n",
|
||
" <td>-0.296377</td>\n",
|
||
" <td>-0.311335</td>\n",
|
||
" <td>-0.298274</td>\n",
|
||
" <td>-0.303543</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>-0.660890</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1.969800</td>\n",
|
||
" <td>-0.507962</td>\n",
|
||
" <td>-0.502320</td>\n",
|
||
" <td>-0.500238</td>\n",
|
||
" <td>-0.28793</td>\n",
|
||
" <td>-0.309491</td>\n",
|
||
" <td>-0.300916</td>\n",
|
||
" <td>-0.297137</td>\n",
|
||
" <td>3.335719</td>\n",
|
||
" <td>-0.30429</td>\n",
|
||
" <td>-0.311702</td>\n",
|
||
" <td>-0.296377</td>\n",
|
||
" <td>-0.311335</td>\n",
|
||
" <td>-0.298274</td>\n",
|
||
" <td>-0.303543</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>-0.660890</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>-0.507666</td>\n",
|
||
" <td>1.968649</td>\n",
|
||
" <td>-0.502320</td>\n",
|
||
" <td>-0.500238</td>\n",
|
||
" <td>-0.28793</td>\n",
|
||
" <td>-0.309491</td>\n",
|
||
" <td>-0.300916</td>\n",
|
||
" <td>-0.297137</td>\n",
|
||
" <td>3.335719</td>\n",
|
||
" <td>-0.30429</td>\n",
|
||
" <td>-0.311702</td>\n",
|
||
" <td>-0.296377</td>\n",
|
||
" <td>-0.311335</td>\n",
|
||
" <td>-0.298274</td>\n",
|
||
" <td>-0.303543</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>-0.660890</td>\n",
|
||
" <td>-0.686033</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>-0.269917</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>-0.507666</td>\n",
|
||
" <td>-0.507962</td>\n",
|
||
" <td>1.990763</td>\n",
|
||
" <td>-0.500238</td>\n",
|
||
" <td>-0.28793</td>\n",
|
||
" <td>-0.309491</td>\n",
|
||
" <td>-0.300916</td>\n",
|
||
" <td>-0.297137</td>\n",
|
||
" <td>3.335719</td>\n",
|
||
" <td>-0.30429</td>\n",
|
||
" <td>-0.311702</td>\n",
|
||
" <td>-0.296377</td>\n",
|
||
" <td>-0.311335</td>\n",
|
||
" <td>-0.298274</td>\n",
|
||
" <td>-0.303543</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Close SMA_5 SMA_20 STD_5 STD_20 Day_of_week_1 Day_of_week_2 \\\n",
|
||
"0 -0.721096 NaN NaN NaN NaN -0.507666 -0.507962 \n",
|
||
"1 -0.721096 NaN NaN NaN NaN -0.507666 -0.507962 \n",
|
||
"2 -0.660890 NaN NaN NaN NaN 1.969800 -0.507962 \n",
|
||
"3 -0.660890 NaN NaN NaN NaN -0.507666 1.968649 \n",
|
||
"4 -0.660890 -0.686033 NaN -0.269917 NaN -0.507666 -0.507962 \n",
|
||
"\n",
|
||
" Day_of_week_3 Day_of_week_4 Month_2 Month_3 Month_4 Month_5 \\\n",
|
||
"0 -0.502320 1.999048 -0.28793 -0.309491 -0.300916 -0.297137 \n",
|
||
"1 -0.502320 -0.500238 -0.28793 -0.309491 -0.300916 -0.297137 \n",
|
||
"2 -0.502320 -0.500238 -0.28793 -0.309491 -0.300916 -0.297137 \n",
|
||
"3 -0.502320 -0.500238 -0.28793 -0.309491 -0.300916 -0.297137 \n",
|
||
"4 1.990763 -0.500238 -0.28793 -0.309491 -0.300916 -0.297137 \n",
|
||
"\n",
|
||
" Month_6 Month_7 Month_8 Month_9 Month_10 Month_11 Month_12 \n",
|
||
"0 3.335719 -0.30429 -0.311702 -0.296377 -0.311335 -0.298274 -0.303543 \n",
|
||
"1 3.335719 -0.30429 -0.311702 -0.296377 -0.311335 -0.298274 -0.303543 \n",
|
||
"2 3.335719 -0.30429 -0.311702 -0.296377 -0.311335 -0.298274 -0.303543 \n",
|
||
"3 3.335719 -0.30429 -0.311702 -0.296377 -0.311335 -0.298274 -0.303543 \n",
|
||
"4 3.335719 -0.30429 -0.311702 -0.296377 -0.311335 -0.298274 -0.303543 "
|
||
]
|
||
},
|
||
"execution_count": 110,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"# Преобразуем дату, если это еще не сделано\n",
|
||
"data['Date'] = pd.to_datetime(data['Date'])\n",
|
||
"\n",
|
||
"# Добавим дополнительные признаки (день недели и месяц)\n",
|
||
"data['Day_of_week'] = data['Date'].dt.dayofweek\n",
|
||
"data['Month'] = data['Date'].dt.month\n",
|
||
"\n",
|
||
"# Проверим, что эти столбцы добавлены\n",
|
||
"print(data[['Day_of_week', 'Month']].head())\n",
|
||
"\n",
|
||
"# Выбираем признаки и целевую переменную\n",
|
||
"X = data[['Close', 'SMA_5', 'SMA_20', 'STD_5', 'STD_20', 'Day_of_week', 'Month']]\n",
|
||
"y = data['Price_Change']\n",
|
||
"\n",
|
||
"# Применяем one-hot encoding для категориальных признаков (Day_of_week и Month)\n",
|
||
"X = pd.get_dummies(X, columns=['Day_of_week', 'Month'], drop_first=True)\n",
|
||
"\n",
|
||
"# Масштабирование числовых признаков (Close, SMA, STD)\n",
|
||
"scaler = StandardScaler()\n",
|
||
"X_scaled = scaler.fit_transform(X)\n",
|
||
"\n",
|
||
"# Преобразуем обратно в DataFrame для удобства\n",
|
||
"X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)\n",
|
||
"\n",
|
||
"# Проверим результат\n",
|
||
"X_scaled_df.head()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 111,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import featuretools as ft\n",
|
||
"\n",
|
||
"# Создаем сущности для Featuretools\n",
|
||
"es = ft.EntitySet(id=\"stock_prices\")\n",
|
||
"es = es.add_dataframe(dataframe_name=\"stock_data\", dataframe=data, index=\"Date\")\n",
|
||
"\n",
|
||
"# Автоматическое создание признаков\n",
|
||
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name=\"stock_data\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#Оценка качества признаков\n",
|
||
"Оценка признаков на основе предсказательной способности модели и других критериев."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 112,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"5251\n",
|
||
"3150\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(np.float64(0.05230198011754029), 0.5415652186272203)"
|
||
]
|
||
},
|
||
"execution_count": 112,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Проверим размерности данных после удаления NaN\n",
|
||
"print(X_scaled_df.shape[0]) # Количество строк в X_scaled_df\n",
|
||
"print(y_train.shape[0]) # Количество строк в y_train\n",
|
||
"\n",
|
||
"# Если данные имеют разные размеры, синхронизируем их\n",
|
||
"df = pd.concat([X_scaled_df, y_train], axis=1).dropna()\n",
|
||
"X_scaled_df = df.drop(columns=y_train.name)\n",
|
||
"y_train = df[y_train.name]\n",
|
||
"\n",
|
||
"# Приводим индексы к одному виду\n",
|
||
"y_train = y_train.reset_index(drop=True)\n",
|
||
"X_scaled_df = X_scaled_df.reset_index(drop=True)\n",
|
||
"\n",
|
||
"# После этого продолжаем обучение модели\n",
|
||
"model = LinearRegression()\n",
|
||
"model.fit(X_scaled_df, y_train)\n",
|
||
"\n",
|
||
"# Прогнозирование и оценка качества\n",
|
||
"y_pred = model.predict(X_scaled_df)\n",
|
||
"\n",
|
||
"# Оценка качества модели\n",
|
||
"mse = mean_squared_error(y_train, y_pred) # Используем y_train, потому что данные для теста не созданы\n",
|
||
"r2 = r2_score(y_train, y_pred)\n",
|
||
"\n",
|
||
"mse, r2\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"MSE = 0.0523: Модель в среднем делает ошибку около 0.0523 при прогнозировании значений.\n",
|
||
"R² = 0.5416: Модель объясняет примерно 54.16% изменчивости целевой переменной\n",
|
||
"\n",
|
||
"Визуализируем"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.5"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|