252 lines
47 KiB
Plaintext
252 lines
47 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Вариант 13\n",
|
|||
|
"Набор данных: https://www.kaggle.com/datasets/nancyalaswad90/yamana-gold-inc-stock-price?resource=download\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 29,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"ename": "UnicodeDecodeError",
|
|||
|
"evalue": "'utf-8' codec can't decode bytes in position 15-16: invalid continuation byte",
|
|||
|
"output_type": "error",
|
|||
|
"traceback": [
|
|||
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
|||
|
"\u001b[1;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)",
|
|||
|
"Cell \u001b[1;32mIn[29], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m..//static//csv//Yamana_Gold_Inc._AUY.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msep\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m,\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnrows\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10000\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4\u001b[0m data \u001b[38;5;241m.\u001b[39mcolumns\n",
|
|||
|
"File \u001b[1;32mc:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1026\u001b[0m, in \u001b[0;36mread_csv\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[0;32m 1013\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[0;32m 1014\u001b[0m dialect,\n\u001b[0;32m 1015\u001b[0m delimiter,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1022\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[0;32m 1023\u001b[0m )\n\u001b[0;32m 1024\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[1;32m-> 1026\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
|
|||
|
"File \u001b[1;32mc:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:620\u001b[0m, in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 617\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[0;32m 619\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[1;32m--> 620\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 622\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[0;32m 623\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n",
|
|||
|
"File \u001b[1;32mc:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1620\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m 1617\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m 1619\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m-> 1620\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n",
|
|||
|
"File \u001b[1;32mc:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1898\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[1;34m(self, f, engine)\u001b[0m\n\u001b[0;32m 1895\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(msg)\n\u001b[0;32m 1897\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m-> 1898\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmapping\u001b[49m\u001b[43m[\u001b[49m\u001b[43mengine\u001b[49m\u001b[43m]\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1899\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[0;32m 1900\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
|
|||
|
"File \u001b[1;32mc:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\io\\parsers\\c_parser_wrapper.py:93\u001b[0m, in \u001b[0;36mCParserWrapper.__init__\u001b[1;34m(self, src, **kwds)\u001b[0m\n\u001b[0;32m 90\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype_backend\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyarrow\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m 91\u001b[0m \u001b[38;5;66;03m# Fail here loudly instead of in cython after reading\u001b[39;00m\n\u001b[0;32m 92\u001b[0m import_optional_dependency(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyarrow\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m---> 93\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reader \u001b[38;5;241m=\u001b[39m \u001b[43mparsers\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mTextReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43msrc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 95\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39munnamed_cols \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reader\u001b[38;5;241m.\u001b[39munnamed_cols\n\u001b[0;32m 97\u001b[0m \u001b[38;5;66;03m# error: Cannot determine type of 'names'\u001b[39;00m\n",
|
|||
|
"File \u001b[1;32mparsers.pyx:574\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader.__cinit__\u001b[1;34m()\u001b[0m\n",
|
|||
|
"File \u001b[1;32mparsers.pyx:663\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._get_header\u001b[1;34m()\u001b[0m\n",
|
|||
|
"File \u001b[1;32mparsers.pyx:874\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._tokenize_rows\u001b[1;34m()\u001b[0m\n",
|
|||
|
"File \u001b[1;32mparsers.pyx:891\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._check_tokenize_status\u001b[1;34m()\u001b[0m\n",
|
|||
|
"File \u001b[1;32mparsers.pyx:2053\u001b[0m, in \u001b[0;36mpandas._libs.parsers.raise_parser_error\u001b[1;34m()\u001b[0m\n",
|
|||
|
"File \u001b[1;32m<frozen codecs>:322\u001b[0m, in \u001b[0;36mdecode\u001b[1;34m(self, input, final)\u001b[0m\n",
|
|||
|
"\u001b[1;31mUnicodeDecodeError\u001b[0m: 'utf-8' codec can't decode bytes in position 15-16: invalid continuation byte"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from pathlib import Path\n",
|
|||
|
"\n",
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"# Dataset location relative to the notebook, built with single, OS-independent separators.\n",
|
|||
|
"CSV_PATH = Path(\"..\") / \"static\" / \"csv\" / \"Yamana_Gold_Inc._AUY.csv\"\n",
|
|||
|
"READ_KWARGS = {\"sep\": \",\", \"nrows\": 10000}\n",
|
|||
|
"\n",
|
|||
|
"# The saved run of this cell failed with a UnicodeDecodeError at bytes 15-16 of the file,\n",
|
|||
|
"# so the file is not plain UTF-8 text.\n",
|
|||
|
"# NOTE(review): bytes 14-17 of a ZIP header hold a CRC, so the file may be the Kaggle\n",
|
|||
|
"# download archive saved under a .csv name -- confirm against the real file.\n",
|
|||
|
"with CSV_PATH.open(\"rb\") as raw_file:\n",
|
|||
|
"    looks_like_zip = raw_file.read(4) == b\"PK\\x03\\x04\"\n",
|
|||
|
"\n",
|
|||
|
"if looks_like_zip:\n",
|
|||
|
"    # pandas infers compression from the file extension only, so request it explicitly.\n",
|
|||
|
"    data = pd.read_csv(CSV_PATH, compression=\"zip\", **READ_KWARGS)\n",
|
|||
|
"else:\n",
|
|||
|
"    try:\n",
|
|||
|
"        data = pd.read_csv(CSV_PATH, encoding=\"utf-8\", **READ_KWARGS)\n",
|
|||
|
"    except UnicodeDecodeError:\n",
|
|||
|
"        # NOTE(review): cp1251 is a guess (Windows Cyrillic); any other encoding still fails loudly.\n",
|
|||
|
"        data = pd.read_csv(CSV_PATH, encoding=\"cp1251\", **READ_KWARGS)\n",
|
|||
|
"\n",
|
|||
|
"data.columns"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### 1. Бизнес-цели для набора данных по акциям компании Yamana Gold Inc.\n",
|
|||
|
"**Цель 1: Прогнозирование изменения цены акции компании.**\n",
|
|||
|
"Прогнозирование цен на акции является одной из ключевых задач в области финансов и инвестирования. Задача состоит в предсказании будущих изменений стоимости акции на основе исторических данных, таких как цены открытия и закрытия, объёмы торгов и другие показатели.\n",
|
|||
|
"\n",
|
|||
|
"**Цель 2: Оценка волатильности акций компании.**\n",
|
|||
|
"Измерение волатильности позволяет инвесторам оценить риск и принять решения по управлению капиталом. Задача заключается в прогнозировании уровня волатильности на основе исторической динамики цен, объемов торгов и других рыночных факторов.\n",
|
|||
|
"\n",
|
|||
|
"### 2. Цели технического проекта для каждой бизнес-цели\n",
|
|||
|
"**Цель 1: Прогнозирование изменения цены акции компании**\n",
|
|||
|
"\n",
|
|||
|
"Разработать модель машинного обучения для прогнозирования будущих цен акций на основе исторических данных.\n",
|
|||
|
"Использовать регрессионные модели, такие как линейная регрессия, или более сложные модели для временных рядов, например LSTM (долгая краткосрочная память).\n",
|
|||
|
"**Цель 2: Оценка волатильности акций компании**\n",
|
|||
|
"\n",
|
|||
|
"Создать модель, которая будет прогнозировать волатильность на основе исторических данных о ценах.\n",
|
|||
|
"Использовать методы статистического анализа, такие как вычисление стандартного отклонения, или методы машинного обучения для более точной оценки волатильности.\n",
|
|||
|
"\n",
|
|||
|
"### 3. Проверим датасет на пропуски и при необходимости удалим строки с недостающими данными"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 27,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"Id 0\n",
|
|||
|
"Name 0\n",
|
|||
|
"Short description 0\n",
|
|||
|
"Gender 0\n",
|
|||
|
"Country 0\n",
|
|||
|
"Occupation 0\n",
|
|||
|
"Birth year 0\n",
|
|||
|
"Death year 0\n",
|
|||
|
"Manner of death 0\n",
|
|||
|
"Age of death 0\n",
|
|||
|
"dtype: int64"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 27,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Missing values per column before cleaning. Shown with display() because only the\n",
|
|||
|
"# last expression of a cell is rendered automatically.\n",
|
|||
|
"missing_before = data.isnull().sum()\n",
|
|||
|
"display(missing_before)\n",
|
|||
|
"\n",
|
|||
|
"# Drop every row that has at least one missing value.\n",
|
|||
|
"data = data.dropna()\n",
|
|||
|
"\n",
|
|||
|
"# Confirm that nothing is missing any more.\n",
|
|||
|
"assert not data.isnull().values.any(), \"dropna() left missing values behind\"\n",
|
|||
|
"data.isnull().sum()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Конструирование признаков"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 28,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"ename": "KeyError",
|
|||
|
"evalue": "\"None of [Index(['Close', 'SMA_5', 'SMA_20', 'STD_5', 'STD_20'], dtype='object')] are in the [columns]\"",
|
|||
|
"output_type": "error",
|
|||
|
"traceback": [
|
|||
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
|||
|
"\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)",
|
|||
|
"Cell \u001b[1;32mIn[28], line 4\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m train_test_split\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Отделяем целевую переменную (например, Price_Change) и признаки\u001b[39;00m\n\u001b[1;32m----> 4\u001b[0m X \u001b[38;5;241m=\u001b[39m \u001b[43mdata\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mClose\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mSMA_5\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mSMA_20\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mSTD_5\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mSTD_20\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\n\u001b[0;32m 5\u001b[0m y \u001b[38;5;241m=\u001b[39m data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mPrice_Change\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[0;32m 7\u001b[0m \u001b[38;5;66;03m# Разделение на обучающую, контрольную и тестовую выборки (60%, 20%, 20%)\u001b[39;00m\n",
|
|||
|
"File \u001b[1;32mc:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\core\\frame.py:4108\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 4106\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[0;32m 4107\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[1;32m-> 4108\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_indexer_strict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcolumns\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[38;5;241m1\u001b[39m]\n\u001b[0;32m 4110\u001b[0m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[0;32m 4111\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mbool\u001b[39m:\n",
|
|||
|
"File \u001b[1;32mc:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:6200\u001b[0m, in \u001b[0;36mIndex._get_indexer_strict\u001b[1;34m(self, key, axis_name)\u001b[0m\n\u001b[0;32m 6197\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 6198\u001b[0m keyarr, indexer, new_indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reindex_non_unique(keyarr)\n\u001b[1;32m-> 6200\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_raise_if_missing\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 6202\u001b[0m keyarr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtake(indexer)\n\u001b[0;32m 6203\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, Index):\n\u001b[0;32m 6204\u001b[0m \u001b[38;5;66;03m# GH 42790 - Preserve name from an Index\u001b[39;00m\n",
|
|||
|
"File \u001b[1;32mc:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:6249\u001b[0m, in \u001b[0;36mIndex._raise_if_missing\u001b[1;34m(self, key, indexer, axis_name)\u001b[0m\n\u001b[0;32m 6247\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m nmissing:\n\u001b[0;32m 6248\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m nmissing \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(indexer):\n\u001b[1;32m-> 6249\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNone of [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m] are in the [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00maxis_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 6251\u001b[0m not_found \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(ensure_index(key)[missing_mask\u001b[38;5;241m.\u001b[39mnonzero()[\u001b[38;5;241m0\u001b[39m]]\u001b[38;5;241m.\u001b[39munique())\n\u001b[0;32m 6252\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not in index\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
|
|||
|
"\u001b[1;31mKeyError\u001b[0m: \"None of [Index(['Close', 'SMA_5', 'SMA_20', 'STD_5', 'STD_20'], dtype='object')] are in the [columns]\""
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"# Build the indicator columns selected below; nothing earlier in the notebook creates them.\n",
|
|||
|
"# NOTE(review): assumes `data` has a numeric 'Close' column and rows in chronological\n",
|
|||
|
"# order -- confirm against data.columns.\n",
|
|||
|
"features = data.copy()\n",
|
|||
|
"close = features['Close']\n",
|
|||
|
"for window in (5, 20):\n",
|
|||
|
"    rolling_close = close.rolling(window=window)\n",
|
|||
|
"    features[f'SMA_{window}'] = rolling_close.mean()  # simple moving average of Close\n",
|
|||
|
"    features[f'STD_{window}'] = rolling_close.std()  # rolling standard deviation of Close\n",
|
|||
|
"\n",
|
|||
|
"# NOTE(review): Price_Change is defined here as the next row's change in Close, so the\n",
|
|||
|
"# target lies after the features it is predicted from -- confirm this is the intended target.\n",
|
|||
|
"features['Price_Change'] = close.shift(-1) - close\n",
|
|||
|
"\n",
|
|||
|
"# The 20-row window leaves NaN in the first 19 rows and shift(-1) leaves NaN in the last row.\n",
|
|||
|
"feature_columns = ['Close', 'SMA_5', 'SMA_20', 'STD_5', 'STD_20']\n",
|
|||
|
"features = features.dropna(subset=feature_columns + ['Price_Change'])\n",
|
|||
|
"\n",
|
|||
|
"# Separate the target (Price_Change) from the features.\n",
|
|||
|
"X = features[feature_columns]\n",
|
|||
|
"y = features['Price_Change']\n",
|
|||
|
"\n",
|
|||
|
"# Split into train, validation and test sets (60%, 20%, 20%) without shuffling the rows.\n",
|
|||
|
"X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, shuffle=False)\n",
|
|||
|
"X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=False)\n",
|
|||
|
"\n",
|
|||
|
"# Sizes of the three sets.\n",
|
|||
|
"(X_train.shape, X_val.shape, X_test.shape)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Разобьём данные на выборки и сбалансируем их"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 4,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Train: (6000, 5), Validation: (2000, 5), Test: (2000, 5)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"# X and y come from the feature cell above (built from `data`); this notebook defines no\n",
|
|||
|
"# `df` and the stock dataset has no 'Age of death' column.\n",
|
|||
|
"# shuffle=False keeps rows in file order (assumed chronological -- TODO confirm), matching\n",
|
|||
|
"# the split in the feature cell.\n",
|
|||
|
"X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, shuffle=False)\n",
|
|||
|
"X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=False)\n",
|
|||
|
"\n",
|
|||
|
"# The three parts must cover every row exactly once.\n",
|
|||
|
"assert len(X_train) + len(X_val) + len(X_test) == len(X)\n",
|
|||
|
"\n",
|
|||
|
"# Report the size of each set.\n",
|
|||
|
"print(f\"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Оценка сбалансированности и аугментация данных"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 5,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAqQAAAIQCAYAAABXHXzKAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABJmUlEQVR4nO3dfXzN9eP/8efZNZtthu1QLpbINTXSoijLiEr4iI+Y7EMfn1EoaUWuitIVclV9ZF3wUeqjC0WNpGK0hFwnn2nCtqRtLtrG9v790W/n69iwY4fXNo/77XZuN+f9fr1f79f7vM7h6fV+v19vm2VZlgAAAABDPEw3AAAAAFc2AikAAACMIpACAADAKAIpAAAAjCKQAgAAwCgCKQAAAIwikAIAAMAoAikAAACMIpACAADAKAIpAMBlU6dOVUFBgSSpoKBA06ZNM9wiuGL79u368MMPHe+3bNmiTz/91FyDcMUjkAKSEhISZLPZHC8/Pz81bNhQw4cPV3p6uunmAWXOm2++qRdeeEG//vqrXnzxRb355pummwQXHDt2TA8++KA2bNigvXv36uGHH9a2bdtMNwtXMC/TDQDKksmTJys8PFw5OTn69ttvNW/ePH322Wfavn27KleubLp5QJkxefJkDRw4UGPHjpWvr6/eeecd002CCyIjIx0vSWrYsKGGDBliuFW4ktksy7JMNwIwLSEhQQ888ICSk5PVunVrx/JHHnlEL730khYvXqx+/foZbCFQ9mRkZOjnn39WgwYNVKNGDdPNwUXYuXOn/vzzTzVv3lw+Pj6mm4MrGKfsgfO4/fbbJUkpKSmSpKNHj+rRRx9V8+bNFRAQoMDAQHXt2lVbt24tsm1OTo4mTpyohg0bys/PTzVr1lTPnj21b98+SdL+/fudLhM4+9WxY0dHXV999ZVsNpveffddPfHEE7Lb7fL399fdd9+tAwcOFNn3xo0b1aVLFwUFBaly5crq0KGD1q1bV+wxduzYsdj9T5w4sUjZd955RxEREapUqZJCQkLUt2/fYvd/vmM7U0FBgWbMmKGmTZvKz89PYWFhevDBB/XHH384latXr566d+9eZD/Dhw8vUmdxbX/++eeLfKaSlJubqwkTJujaa6+Vr6+vateurccee0y5ubnFflZn6tixo5o1a1Zk+QsvvCCbzab9+/c7Lc/MzNTIkSNVu3Zt+fr66tprr9Vzzz3nuA7zTBMnTiz2sxs0aJBTuYMHD2rw4MEKCwuTr6+vmjZtqjfeeMOpTOF3p/Dl6+urhg0batq0aTp7PGLz5s3q2rWrAgMDFRAQoE6dOmnDhg1OZQovb9m/f79CQ0N18803q1q1amrRooVsNpsSEhLO+7mdfXnMhb53rhyjO38fhX0QGhqqU6dOOa37z3/+42jvkSNHnNatWLFCt9xyi/z9/VWlShV169ZNO3bscCozaNAgBQQEFGnX+++/L5vNpq+++sqxzNXv2dy5c9W0aVP5+vqqVq1aiouLU2ZmplOZjh07On4LTZo0UUREhLZu3VrsbxS4XDhlD5xHYXisVq2aJOl///ufPvzwQ/3tb39TeHi40tPT9eqrr6pDhw7auXOnatWqJUnKz89X9+7dtXr1avXt21cPP/ywjh07psTERG3fvl3169d37KNfv3668847nfYbHx9fbHueeeYZ2Ww2jR07VhkZGZoxY4aioqK0ZcsWVapUSZL05ZdfqmvXroqIiNCECRPk4eGhhQsX6vbbb9c333yjG2+8sUi9V199teOmlOPHj2vYsGHF7nv8+PHq06eP/vGPf+i3337TK6+8oltvvVWbN29WcHBwkW2GDh2qW265RZL03//+V8uWLXNa/+CDDzpGpx966CGlpKRo9uzZ2rx5s9atWydvb+9iPwdXZGZmFnvDTUFBge6++259++23Gjp0qBo3bqxt27bp5Zdf1k8//eR0w0dpnTx5Uh06dNDBgwf14IMPqk6dOlq/fr3i4+
N1+PBhzZgxo9jt3n77bcefR40a5bQuPT1dN910k2w2m4YPH64aNWpoxYoVio2NVXZ2tkaOHOlU/oknnlDjxo31559/OoJbaGioYmNjJUk7duzQLbfcosDAQD322GPy9vbWq6++qo4dO2rt2rVq27btOY/v7bffdvn6w8LLYwoV971z9Rgvxe/j2LFjWr58ue69917HsoULF8rPz085OTlFPoeYmBhFR0frueee08mTJzVv3jy1b99emzdvVr169Vz6jFw1ceJETZo0SVFRURo2bJj27NmjefPmKTk5+YK/p7Fjx17StgEXZAGwFi5caEmyVq1aZf3222/WgQMHrCVLlljVqlWzKlWqZP3666+WZVlWTk6OlZ+f77RtSkqK5evra02ePNmx7I033rAkWS+99FKRfRUUFDi2k2Q9//zzRco0bdrU6tChg+P9mjVrLEnWVVddZWVnZzuWv/fee5Yka+bMmY66GzRoYEVHRzv2Y1mWdfLkSSs8PNy64447iuzr5ptvtpo1a+Z4/9tvv1mSrAkTJjiW7d+/3/L09LSeeeYZp223bdtmeXl5FVm+d+9eS5L15ptvOpZNmDDBOvOvnG+++caSZC1atMhp25UrVxZZXrduXatbt25F2h4XF2ed/dfY2W1/7LHHrNDQUCsiIsLpM3377bctDw8P65tvvnHafv78+ZYka926dUX2d6YOHTpYTZs2LbL8+eeftyRZKSkpjmVTpkyx/P39rZ9++smp7OOPP255enpaqampTsuffPJJy2azOS2rW7euFRMT43gfGxtr1axZ0zpy5IhTub59+1pBQUHWyZMnLcv6v+/OmjVrHGVycnIsDw8P61//+pdjWY8ePSwfHx9r3759jmWHDh2yqlSpYt16662OZYW/lcLjy8nJserUqWN17drVkmQtXLiw6Id1hsLtk5OTnZYX971z9Rjd+fso/L7269fP6t69u2P5L7/8Ynl4eFj9+vWzJFm//fabZVmWdezYMSs4ONgaMmSIU1vT0tKsoKAgp+UxMTGWv79/kc9m6dKlRfqqpN+zjIwMy8fHx+rcubPT31GzZ8+2JFlvvPGGU51n/hY+++wzS5LVpUuXIr8n4HLhlD1whqioKNWoUUO1a9dW3759FRAQoGXLlumqq66SJPn6+srD46+fTX5+vn7//XcFBATouuuu0w8//OCo54MPPlD16tU1YsSIIvsozSmxgQMHqkqVKo73vXv3Vs2aNfXZZ59J+mvqlr179+rvf/+7fv/9dx05ckRHjhzRiRMn1KlTJ3399ddFThHn5OTIz8/vvPv973//q4KCAvXp08dR55EjR2S329WgQQOtWbPGqXxeXp6kvz6vc1m6dKmCgoJ0xx13ONUZERGhgICAInWeOnXKqdyRI0eKjFCd7eDBg3rllVc0fvz4IqdIly5dqsaNG6tRo0ZOdRZepnH2/ktj6dKluuWWW1S1alWnfUVFRSk/P19ff/21U/m8vLzzfnaWZemDDz7QXXfdJcuynOqMjo5WVlaW0/dRkrKysnTkyBGlpqZq+vTpKigocBxrfn6+vvjiC/Xo0UPXXHONY5uaNWvq73//u7799ltlZ2cX25Y5c+bo999/14QJEy7243HbMV6K38fgwYO1cuVKpaWlSfprdoHIyEg1bNjQqVxiYqIyMzPVr18/p7Z6enqqbdu2xX6fzv4+Hzt2rNjPIj8/v0jZkydPOpVZtWqV8vLyNHLkSMffUZI0ZMgQBQYGnnNKJ8uyFB8fr169ep13FBy41DhlD5xhzpw5atiwoby8vBQWFqbrrrvO6S/3goICzZw5U3PnzlVKSory8/Md6wpP60t/neq/7rrr5OXl3p9YgwYNnN7bbDZde+21juvI9u7dK0mKiYk5Zx1ZWVmqWrWq4/2RI0eK1Hu2vXv3yrKsc5Y7+1Rg4TVrxV0nd2adWVlZCg0NLXZ9RkaG0/svvvjC5RtnJkyYoFq1aunBBx/U+++/X2T/u3btOm
edZ++/NPbu3asff/yxxPvKzMw872f322+/KTMzU6+99ppee+21EtXZo0cPx589PDw0btw49erVy1HfyZMndd111xW
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 800x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"# Overlay the target distribution of the three splits on one set of axes.\n",
|
|||
|
"fig, ax = plt.subplots(figsize=(8, 6))\n",
|
|||
|
"for split_label, split_target in (('Train', y_train), ('Validation', y_val), ('Test', y_test)):\n",
|
|||
|
"    ax.hist(split_target, bins=30, alpha=0.7, label=split_label)\n",
|
|||
|
"ax.legend()\n",
|
|||
|
"ax.set_title('Распределение целевой переменной')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Python 3",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.5"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|