AIM-PIbd-31-Kryukov-A-I/Lab_3/lab3.ipynb

252 lines
47 KiB
Plaintext
Raw Normal View History

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
" ## Вариант 13 \n",
" https://www.kaggle.com/datasets/nancyalaswad90/yamana-gold-inc-stock-price?resource=download\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"ename": "UnicodeDecodeError",
"evalue": "'utf-8' codec can't decode bytes in position 15-16: invalid continuation byte",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[29], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m..//static//csv//Yamana_Gold_Inc._AUY.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msep\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m,\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnrows\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10000\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4\u001b[0m data \u001b[38;5;241m.\u001b[39mcolumns\n",
"File \u001b[1;32mc:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1026\u001b[0m, in \u001b[0;36mread_csv\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[0;32m 1013\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[0;32m 1014\u001b[0m dialect,\n\u001b[0;32m 1015\u001b[0m delimiter,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1022\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[0;32m 1023\u001b[0m )\n\u001b[0;32m 1024\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[1;32m-> 1026\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:620\u001b[0m, in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 617\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[0;32m 619\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[1;32m--> 620\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 622\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[0;32m 623\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n",
"File \u001b[1;32mc:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1620\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m 1617\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m 1619\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m-> 1620\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1898\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[1;34m(self, f, engine)\u001b[0m\n\u001b[0;32m 1895\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(msg)\n\u001b[0;32m 1897\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m-> 1898\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmapping\u001b[49m\u001b[43m[\u001b[49m\u001b[43mengine\u001b[49m\u001b[43m]\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1899\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[0;32m 1900\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
"File \u001b[1;32mc:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\io\\parsers\\c_parser_wrapper.py:93\u001b[0m, in \u001b[0;36mCParserWrapper.__init__\u001b[1;34m(self, src, **kwds)\u001b[0m\n\u001b[0;32m 90\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype_backend\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyarrow\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m 91\u001b[0m \u001b[38;5;66;03m# Fail here loudly instead of in cython after reading\u001b[39;00m\n\u001b[0;32m 92\u001b[0m import_optional_dependency(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyarrow\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m---> 93\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reader \u001b[38;5;241m=\u001b[39m \u001b[43mparsers\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mTextReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43msrc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 95\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39munnamed_cols \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reader\u001b[38;5;241m.\u001b[39munnamed_cols\n\u001b[0;32m 97\u001b[0m \u001b[38;5;66;03m# error: Cannot determine type of 'names'\u001b[39;00m\n",
"File \u001b[1;32mparsers.pyx:574\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader.__cinit__\u001b[1;34m()\u001b[0m\n",
"File \u001b[1;32mparsers.pyx:663\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._get_header\u001b[1;34m()\u001b[0m\n",
"File \u001b[1;32mparsers.pyx:874\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._tokenize_rows\u001b[1;34m()\u001b[0m\n",
"File \u001b[1;32mparsers.pyx:891\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._check_tokenize_status\u001b[1;34m()\u001b[0m\n",
"File \u001b[1;32mparsers.pyx:2053\u001b[0m, in \u001b[0;36mpandas._libs.parsers.raise_parser_error\u001b[1;34m()\u001b[0m\n",
"File \u001b[1;32m<frozen codecs>:322\u001b[0m, in \u001b[0;36mdecode\u001b[1;34m(self, input, final)\u001b[0m\n",
"\u001b[1;31mUnicodeDecodeError\u001b[0m: 'utf-8' codec can't decode bytes in position 15-16: invalid continuation byte"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"data = pd.read_csv(\"..//static//csv//Yamana_Gold_Inc._AUY.csv\", sep=\",\", nrows=10000)\n",
"data .columns"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#1. Бизнес-цели для набора данных по акции компании Yamana Gold Inc.\n",
"Цель 1: Прогнозирование изменения цены акции компании.\n",
"Прогнозирование цен на акции является одной из ключевых задач в области финансов и инвестирования. Задача состоит в предсказании будущих изменений стоимости акции на основе исторических данных, таких как открытие и закрытие торгов, объемы торгов и другие показатели.\n",
"\n",
"Цель 2: Оценка волатильности акций компании.\n",
"Измерение волатильности позволяет инвесторам оценить риск и принять решения по управлению капиталом. Задача заключается в прогнозировании уровня волатильности на основе исторической динамики цен, объемов торгов и других рыночных факторов.\n",
"\n",
"#2. Цели технического проекта для каждой бизнес-цели\n",
"Цель 1: Прогнозирование изменения цены акции компании\n",
"\n",
"Разработать модель машинного обучения для прогнозирования будущих цен акций на основе исторических данных.\n",
"Использовать регрессионные модели, такие как линейная регрессия или более сложные модели, например, LSTM (долгосрочная краткосрочная память) для временных рядов.\n",
"Цель 2: Оценка волатильности акций компании\n",
"\n",
"Создать модель, которая будет прогнозировать волатильность на основе исторических данных о ценах.\n",
"Использовать методы статистического анализа, такие как вычисление стандартного отклонения, или методы машинного обучения для более точной оценки волатильности.\n",
"\n",
"#3 Проверим датасет на пропуски и удалим при необходимости строки с недостающими данными"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Id 0\n",
"Name 0\n",
"Short description 0\n",
"Gender 0\n",
"Country 0\n",
"Occupation 0\n",
"Birth year 0\n",
"Death year 0\n",
"Manner of death 0\n",
"Age of death 0\n",
"dtype: int64"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"# Проверим на пропущенные значения\n",
"data.isnull().sum()\n",
"\n",
"# Заполним пропуски или удалим строки с пропусками\n",
"data = data.dropna()\n",
"\n",
"# Проверим, что данные очищены\n",
"data.isnull().sum()\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Конструирование признаков"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "\"None of [Index(['Close', 'SMA_5', 'SMA_20', 'STD_5', 'STD_20'], dtype='object')] are in the [columns]\"",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[28], line 4\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m train_test_split\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Отделяем целевую переменную (например, Price_Change) и признаки\u001b[39;00m\n\u001b[1;32m----> 4\u001b[0m X \u001b[38;5;241m=\u001b[39m \u001b[43mdata\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mClose\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mSMA_5\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mSMA_20\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mSTD_5\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mSTD_20\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\n\u001b[0;32m 5\u001b[0m y \u001b[38;5;241m=\u001b[39m data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mPrice_Change\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[0;32m 7\u001b[0m \u001b[38;5;66;03m# Разделение на обучающую, контрольную и тестовую выборки (60%, 20%, 20%)\u001b[39;00m\n",
"File \u001b[1;32mc:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\core\\frame.py:4108\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 4106\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[0;32m 4107\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[1;32m-> 4108\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_indexer_strict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcolumns\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[38;5;241m1\u001b[39m]\n\u001b[0;32m 4110\u001b[0m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[0;32m 4111\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mbool\u001b[39m:\n",
"File \u001b[1;32mc:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:6200\u001b[0m, in \u001b[0;36mIndex._get_indexer_strict\u001b[1;34m(self, key, axis_name)\u001b[0m\n\u001b[0;32m 6197\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 6198\u001b[0m keyarr, indexer, new_indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reindex_non_unique(keyarr)\n\u001b[1;32m-> 6200\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_raise_if_missing\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 6202\u001b[0m keyarr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtake(indexer)\n\u001b[0;32m 6203\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, Index):\n\u001b[0;32m 6204\u001b[0m \u001b[38;5;66;03m# GH 42790 - Preserve name from an Index\u001b[39;00m\n",
"File \u001b[1;32mc:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:6249\u001b[0m, in \u001b[0;36mIndex._raise_if_missing\u001b[1;34m(self, key, indexer, axis_name)\u001b[0m\n\u001b[0;32m 6247\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m nmissing:\n\u001b[0;32m 6248\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m nmissing \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(indexer):\n\u001b[1;32m-> 6249\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNone of [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m] are in the [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00maxis_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 6251\u001b[0m not_found \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(ensure_index(key)[missing_mask\u001b[38;5;241m.\u001b[39mnonzero()[\u001b[38;5;241m0\u001b[39m]]\u001b[38;5;241m.\u001b[39munique())\n\u001b[0;32m 6252\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not in index\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"\u001b[1;31mKeyError\u001b[0m: \"None of [Index(['Close', 'SMA_5', 'SMA_20', 'STD_5', 'STD_20'], dtype='object')] are in the [columns]\""
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Отделяем целевую переменную (например, Price_Change) и признаки\n",
"X = data[['Close', 'SMA_5', 'SMA_20', 'STD_5', 'STD_20']]\n",
"y = data['Price_Change']\n",
"\n",
"# Разделение на обучающую, контрольную и тестовую выборки (60%, 20%, 20%)\n",
"X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, shuffle=False)\n",
"X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=False)\n",
"\n",
"# Проверка размеров выборок\n",
"(X_train.shape, X_val.shape, X_test.shape)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Разобьем данные на выборки и сбалансируем их"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train: (6000, 5), Validation: (2000, 5), Test: (2000, 5)\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Разделение данных на признаки и целевую переменную\n",
"X = df.drop(columns=['Age of death', 'Name', 'Short description', 'Id'])\n",
"y = df['Age of death']\n",
"\n",
"# Разбиение на обучающую, контрольную и тестовую выборки\n",
"X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n",
"X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n",
"\n",
"# Проверка размеров выборок\n",
"print(f\"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Оценка сбалансированности и аугментация данных"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAqQAAAIQCAYAAABXHXzKAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABJmUlEQVR4nO3dfXzN9eP/8efZNZtthu1QLpbINTXSoijLiEr4iI+Y7EMfn1EoaUWuitIVclV9ZF3wUeqjC0WNpGK0hFwnn2nCtqRtLtrG9v790W/n69iwY4fXNo/77XZuN+f9fr1f79f7vM7h6fV+v19vm2VZlgAAAABDPEw3AAAAAFc2AikAAACMIpACAADAKAIpAAAAjCKQAgAAwCgCKQAAAIwikAIAAMAoAikAAACMIpACAADAKAIpAMBlU6dOVUFBgSSpoKBA06ZNM9wiuGL79u368MMPHe+3bNmiTz/91FyDcMUjkAKSEhISZLPZHC8/Pz81bNhQw4cPV3p6uunmAWXOm2++qRdeeEG//vqrXnzxRb355pummwQXHDt2TA8++KA2bNigvXv36uGHH9a2bdtMNwtXMC/TDQDKksmTJys8PFw5OTn69ttvNW/ePH322Wfavn27KleubLp5QJkxefJkDRw4UGPHjpWvr6/eeecd002CCyIjIx0vSWrYsKGGDBliuFW4ktksy7JMNwIwLSEhQQ888ICSk5PVunVrx/JHHnlEL730khYvXqx+/foZbCFQ9mRkZOjnn39WgwYNVKNGDdPNwUXYuXOn/vzzTzVv3lw+Pj6mm4MrGKfsgfO4/fbbJUkpKSmSpKNHj+rRRx9V8+bNFRAQoMDAQHXt2lVbt24tsm1OTo4mTpyohg0bys/PTzVr1lTPnj21b98+SdL+/fudLhM4+9WxY0dHXV999ZVsNpveffddPfHEE7Lb7fL399fdd9+tAwcOFNn3xo0b1aVLFwUFBaly5crq0KGD1q1bV+wxduzYsdj9T5w4sUjZd955RxEREapUqZJCQkLUt2/fYvd/vmM7U0FBgWbMmKGmTZvKz89PYWFhevDBB/XHH384latXr566d+9eZD/Dhw8vUmdxbX/++eeLfKaSlJubqwkTJujaa6+Vr6+vateurccee0y5ubnFflZn6tixo5o1a1Zk+QsvvCCbzab9+/c7Lc/MzNTIkSNVu3Zt+fr66tprr9Vzzz3nuA7zTBMnTiz2sxs0aJBTuYMHD2rw4MEKCwuTr6+vmjZtqjfeeMOpTOF3p/Dl6+urhg0batq0aTp7PGLz5s3q2rWrAgMDFRAQoE6dOmnDhg1OZQovb9m/f79CQ0N18803q1q1amrRooVsNpsSEhLO+7mdfXnMhb53rhyjO38fhX0QGhqqU6dOOa37z3/+42jvkSNHnNatWLFCt9xyi/z9/VWlShV169ZNO3bscCozaNAgBQQEFGnX+++/L5vNpq+++sqxzNXv2dy5c9W0aVP5+vqqVq1aiouLU2ZmplOZjh07On4LTZo0UUREhLZu3VrsbxS4XDhlD5xHYXisVq2aJOl///ufPvzwQ/3tb39TeHi40tPT9eqrr6pDhw7auXOnatWqJUnKz89X9+7dtXr1avXt21cPP/ywjh07psTERG3fvl3169d37KNfv3668847nfYbHx9fbHueeeYZ2Ww2jR07VhkZGZoxY4aioqK0ZcsWVapUSZL05ZdfqmvXroqIiNCECRPk4eGhhQsX6vbbb9c333yjG2+8sUi9V199teOmlOPHj2vYsGHF7nv8+PHq06eP/vGPf+i3337TK6+8oltvvVWbN29WcHBwkW2GDh2qW265RZL03//+V8uWLXNa/+CDDzpGpx966CGlpKRo9uzZ2rx5s9atWydvb+9iPwdXZGZmFnvDTUFBge6++259++23Gjp0qBo3bqxt27bp5Zdf1k8//eR0w0dpnTx5Uh06dNDBgwf14IMPqk6dOlq/fr3i4+N1+PBhzZgxo9jt3n77bcefR40a5bQuPT1dN910k2w2m4YPH64aNWpoxYoVio2NVXZ2tkaOHOlU/oknnlDjxo31559/OoJbaGioYmNjJUk7duzQLbfcosDAQD322GPy9vbWq6++qo4dO2rt2rVq27btOY/v7bffdvn6w8LLYwoV971z9Rgvxe/j2LFjWr58ue69917HsoULF8rPz085OTlFPoeYmBhFR0frueee08mTJzVv3jy1b99emzdvVr169Vz6jFw1ceJETZo0SVFRURo2bJj27NmjefPmKTk5+YK/p7Fjx17StgEXZAGwFi5caEmyVq1aZf3222/WgQMHrCVLlljVqlWzKlWqZP3666+WZVlWTk6OlZ+f77RtSkqK5evra02ePNmx7I033rAkWS+99FKRfRUUFDi2k2Q9//zzRco0bdrU6tChg+P9mjVrLEnWVVddZWVnZzuWv/fee5Yka+bMmY66GzRoYEVHRzv2Y1mWdfLkSSs8PNy64447iuzr5ptvtpo1a+Z4/9tvv1mSrAkTJjiW7d+/3/L09LSeeeYZp223bdtmeXl5FVm+d+9eS5L15ptvOpZNmDDBOvOvnG+++caSZC1atMhp25UrVxZZXrduXatbt25F2h4XF2ed/dfY2W1/7LHHrNDQUCsiIsLpM3377bctDw8P65tvvnHafv78+ZYka926dUX2d6YOHTpYTZs2LbL8+eeftyRZKSkpjmVTpkyx/P39rZ9++smp7OOPP255enpaqampTsuffPJJy2azOS2rW7euFRMT43gfGxtr1axZ0zpy5IhTub59+1pBQUHWyZMnLcv6v+/OmjVrHGVycnIsDw8P61//+pdjWY8ePSwfHx9r3759jmWHDh2yqlSpYt16662OZYW/lcLjy8nJserUqWN17drVkmQtXLiw6Id1hsLtk5OTnZYX971z9Rjd+fso/L7269fP6t69u2P5L7/8Ynl4eFj9+vWzJFm//fabZVmWdezYMSs4ONgaMmSIU1vT0tKsoKAgp+UxMTGWv79/kc9m6dKlRfqqpN+zjIwMy8fHx+rcubPT31GzZ8+2JFlvvPGGU51n/hY+++wzS5LVpUuXIr8n4HLhlD1whqioKNWoUUO1a9dW3759FRAQoGXLlumqq66SJPn6+srD46+fTX5+vn7//XcFBATouuuu0w8//OCo54MPPlD16tU1YsSIIvsozSmxgQMHqkqVKo73vXv3Vs2aNfXZZ59J+mvqlr179+rvf/+7fv/9dx05ckRHjhzRiRMn1KlTJ3399ddFThHn5OTIz8/vvPv973//q4KCAvXp08dR55EjR2S329WgQQOtWbPGqXxeXp6kvz6vc1m6dKmCgoJ0xx13ONUZERGhgICAInWeOnXKqdyRI0eKjFCd7eDBg3rllVc0fvz4IqdIly5dqsaNG6tRo0ZOdRZepnH2/ktj6dKluuWWW1S1alWnfUVFRSk/P19ff/21U/m8vLzzfnaWZemDDz7QXXfdJcuynOqMjo5WVlaW0/dRkrKysnTkyBGlpqZq+vTpKigocBxrfn6+vvjiC/Xo0UPXXHONY5uaNWvq73//u7799ltlZ2cX25Y5c+bo999/14QJEy7243HbMV6K38fgwYO1cuVKpaWlSfprdoHIyEg1bNjQqVxiYqIyMzPVr18/p7Z6enqqbdu2xX6fzv4+Hzt2rNjPIj8/v0jZkydPOpVZtWqV8vLyNHLkSMffUZI0ZMgQBQYGnnNKJ8uyFB8fr169ep13FBy41DhlD5xhzpw5atiwoby8vBQWFqbrrrvO6S/3goICzZw5U3PnzlVKSory8/Md6wpP60t/neq/7rrr5OXl3p9YgwYNnN7bbDZde+21juvI9u7dK0mKiYk5Zx1ZWVmqWrWq4/2RI0eK1Hu2vXv3yrKsc5Y7+1Rg4TVrxV0nd2adWVlZCg0NLXZ9RkaG0/svvvjC5RtnJkyYoFq1aunBBx/U+++/X2T/u3btOmedZ++/NPbu3asff/yxxPvKzMw872f322+/KTMzU6+99ppee+21EtXZo0cPx589PDw0btw49erVy1HfyZMndd111xW
"text/plain": [
"<Figure size 800x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# Проверка распределения целевой переменной\n",
"plt.figure(figsize=(8, 6))\n",
"plt.hist(y_train, bins=30, alpha=0.7, label='Train')\n",
"plt.hist(y_val, bins=30, alpha=0.7, label='Validation')\n",
"plt.hist(y_test, bins=30, alpha=0.7, label='Test')\n",
"plt.legend()\n",
"plt.title('Распределение целевой переменной')\n",
"plt.show()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}