Lab_3 #4

Closed
SooNooClose wants to merge 3 commits from Lab_3 into main
3 changed files with 747 additions and 8 deletions

2
.gitignore vendored
View File

@ -16,3 +16,5 @@ static/csv/diabetes.csv
static/csv/healthcare-dataset-stroke-data.csv static/csv/healthcare-dataset-stroke-data.csv
static/csv/heart_2020_cleaned.csv static/csv/heart_2020_cleaned.csv
static/csv/neo_v2.csv static/csv/neo_v2.csv
static/csv/Yamana_Gold_Inc._AUY.csv
static/csv/AgeDataset-V1.csv

730
Lab_3/lab3.ipynb Normal file
View File

@ -0,0 +1,730 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
" ## Вариант 13 \n",
" https://www.kaggle.com/datasets/nancyalaswad90/yamana-gold-inc-stock-price?resource=download\n"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',\n",
" 'Day_of_week', 'Month', 'Year'],\n",
" dtype='object')"
]
},
"execution_count": 105,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Load the Yamana Gold (AUY) price history from the project's CSV folder.\n",
"data = pd.read_csv(\"..//static//csv//Yamana_Gold_Inc._AUY.csv\", sep=\",\")\n",
"\n",
"# Parse the 'Date' column so the .dt accessor can be used below.\n",
"data['Date'] = pd.to_datetime(data['Date'])\n",
"\n",
"# Derive calendar features from the parsed date.\n",
"date_parts = data['Date'].dt\n",
"data['Day_of_week'] = date_parts.dayofweek\n",
"data['Month'] = date_parts.month\n",
"data['Year'] = date_parts.year\n",
"\n",
"# Show the resulting column names.\n",
"data.columns"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. Бизнес-цели для набора данных по акциям компании Yamana Gold Inc.
"Цель 1: Прогнозирование изменения цены акции компании.\n",
"Прогнозирование цен на акции является одной из ключевых задач в области финансов и инвестирования. Задача состоит в предсказании будущих изменений стоимости акции на основе исторических данных, таких как открытие и закрытие торгов, объемы торгов и другие показатели.\n",
"\n",
"Цель 2: Оценка волатильности акций компании.\n",
"Измерение волатильности позволяет инвесторам оценить риск и принять решения по управлению капиталом. Задача заключается в прогнозировании уровня волатильности на основе исторической динамики цен, объемов торгов и других рыночных факторов.\n",
"\n",
# 2. Цели технического проекта для каждой бизнес-цели
"Цель 1: Прогнозирование изменения цены акции компании\n",
"\n",
"Разработать модель машинного обучения для прогнозирования будущих цен акций на основе исторических данных.\n",
Использовать регрессионные модели, такие как линейная регрессия, или более сложные модели для временных рядов, например LSTM (долгая краткосрочная память).
"Цель 2: Оценка волатильности акций компании\n",
"\n",
"Создать модель, которая будет прогнозировать волатильность на основе исторических данных о ценах.\n",
"Использовать методы статистического анализа, такие как вычисление стандартного отклонения, или методы машинного обучения для более точной оценки волатильности.\n",
"\n",
# 3. Проверим датасет на пропуски и при необходимости удалим строки с недостающими данными"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Date 0\n",
"Open 0\n",
"High 0\n",
"Low 0\n",
"Close 0\n",
"Adj Close 0\n",
"Volume 0\n",
"Day_of_week 0\n",
"Month 0\n",
"Year 0\n",
"dtype: int64"
]
},
"execution_count": 106,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count missing values per column before cleaning (this result is not displayed).\n",
"data.isna().sum()\n",
"\n",
"# Drop every row that contains at least one missing value.\n",
"data = data.dropna(axis=0, how='any')\n",
"\n",
"# Count again to confirm nothing is missing; this is the cell's output.\n",
"data.isna().sum()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Конструирование признаков"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Date</th>\n",
" <th>Open</th>\n",
" <th>High</th>\n",
" <th>Low</th>\n",
" <th>Close</th>\n",
" <th>Adj Close</th>\n",
" <th>Volume</th>\n",
" <th>Day_of_week</th>\n",
" <th>Month</th>\n",
" <th>Year</th>\n",
" <th>Price_Change</th>\n",
" <th>SMA_5</th>\n",
" <th>SMA_20</th>\n",
" <th>STD_5</th>\n",
" <th>STD_20</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2001-06-22</td>\n",
" <td>3.428571</td>\n",
" <td>3.428571</td>\n",
" <td>3.428571</td>\n",
" <td>3.428571</td>\n",
" <td>2.806002</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>2001</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2001-06-25</td>\n",
" <td>3.428571</td>\n",
" <td>3.428571</td>\n",
" <td>3.428571</td>\n",
" <td>3.428571</td>\n",
" <td>2.806002</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>2001</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2001-06-26</td>\n",
" <td>3.714286</td>\n",
" <td>3.714286</td>\n",
" <td>3.714286</td>\n",
" <td>3.714286</td>\n",
" <td>3.039837</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>2001</td>\n",
" <td>0.285715</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2001-06-27</td>\n",
" <td>3.714286</td>\n",
" <td>3.714286</td>\n",
" <td>3.714286</td>\n",
" <td>3.714286</td>\n",
" <td>3.039837</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>2001</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2001-06-28</td>\n",
" <td>3.714286</td>\n",
" <td>3.714286</td>\n",
" <td>3.714286</td>\n",
" <td>3.714286</td>\n",
" <td>3.039837</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>2001</td>\n",
" <td>0.000000</td>\n",
" <td>3.6</td>\n",
" <td>NaN</td>\n",
" <td>0.156493</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Date Open High Low Close Adj Close Volume \\\n",
"0 2001-06-22 3.428571 3.428571 3.428571 3.428571 2.806002 0 \n",
"1 2001-06-25 3.428571 3.428571 3.428571 3.428571 2.806002 0 \n",
"2 2001-06-26 3.714286 3.714286 3.714286 3.714286 3.039837 0 \n",
"3 2001-06-27 3.714286 3.714286 3.714286 3.714286 3.039837 0 \n",
"4 2001-06-28 3.714286 3.714286 3.714286 3.714286 3.039837 0 \n",
"\n",
" Day_of_week Month Year Price_Change SMA_5 SMA_20 STD_5 STD_20 \n",
"0 4 6 2001 NaN NaN NaN NaN NaN \n",
"1 0 6 2001 0.000000 NaN NaN NaN NaN \n",
"2 1 6 2001 0.285715 NaN NaN NaN NaN \n",
"3 2 6 2001 0.000000 NaN NaN NaN NaN \n",
"4 3 6 2001 0.000000 3.6 NaN 0.156493 NaN "
]
},
"execution_count": 107,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Work from the closing price and its 5- and 20-row rolling windows.\n",
"close = data['Close']\n",
"roll_5 = close.rolling(window=5)\n",
"roll_20 = close.rolling(window=20)\n",
"\n",
"# Difference between the current and the previous closing price.\n",
"data['Price_Change'] = close.diff()\n",
"\n",
"# Simple moving averages over 5 and 20 rows.\n",
"data['SMA_5'] = roll_5.mean()\n",
"data['SMA_20'] = roll_20.mean()\n",
"\n",
"# Rolling standard deviations over the same windows.\n",
"data['STD_5'] = roll_5.std()\n",
"data['STD_20'] = roll_20.std()\n",
"\n",
"data.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
" Разделение данных на обучающую, контрольную и тестовую выборки"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((3150, 10), (1050, 10), (1051, 10))"
]
},
"execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Make sure 'Date' is a datetime column, then order the rows chronologically so\n",
"# the unshuffled splits below keep the time order intact.\n",
"data['Date'] = pd.to_datetime(data['Date'])\n",
"data = data.sort_values(by='Date')\n",
"\n",
"# Feature matrix and target (the change of the closing price).\n",
"feature_columns = [\n",
"    'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',\n",
"    'SMA_5', 'SMA_20', 'STD_5', 'STD_20',\n",
"]\n",
"X = data[feature_columns]\n",
"y = data['Price_Change']\n",
"\n",
"# First 60% of the rows go to training; the remaining 40% is halved into\n",
"# validation and test (20% / 20%).\n",
"X_train, X_temp, y_train, y_temp = train_test_split(\n",
"    X, y, test_size=0.4, shuffle=False\n",
")\n",
"X_val, X_test, y_val, y_test = train_test_split(\n",
"    X_temp, y_temp, test_size=0.5, shuffle=False\n",
")\n",
"\n",
"# Shapes of the three feature subsets.\n",
"(X_train.shape, X_val.shape, X_test.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Конструирование признаков для решения задач"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [],
"source": [
"# Recompute the features from the closing price of the current `data`.\n",
"close = data['Close']\n",
"roll_5 = close.rolling(window=5)\n",
"roll_20 = close.rolling(window=20)\n",
"\n",
"# Feature for the price-change forecasting task.\n",
"data['Price_Change'] = close.diff()\n",
"\n",
"# Moving averages and rolling standard deviations.\n",
"data['SMA_5'] = roll_5.mean()\n",
"data['SMA_20'] = roll_20.mean()\n",
"data['STD_5'] = roll_5.std()\n",
"data['STD_20'] = roll_20.std()\n",
"\n",
"# Feature for the volatility task: 5-row rolling standard deviation of Close.\n",
"data['Volatility'] = roll_5.std()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Подготовка признаков: one-hot encoding, дискретизация, синтез признаков, масштабирование\n",
"One-hot encoding: Применим для категориальных признаков (например, день недели).\n",
"Масштабирование: Стандартизируем числовые признаки."
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Day_of_week Month\n",
"0 4 6\n",
"1 0 6\n",
"2 1 6\n",
"3 2 6\n",
"4 3 6\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Close</th>\n",
" <th>SMA_5</th>\n",
" <th>SMA_20</th>\n",
" <th>STD_5</th>\n",
" <th>STD_20</th>\n",
" <th>Day_of_week_1</th>\n",
" <th>Day_of_week_2</th>\n",
" <th>Day_of_week_3</th>\n",
" <th>Day_of_week_4</th>\n",
" <th>Month_2</th>\n",
" <th>Month_3</th>\n",
" <th>Month_4</th>\n",
" <th>Month_5</th>\n",
" <th>Month_6</th>\n",
" <th>Month_7</th>\n",
" <th>Month_8</th>\n",
" <th>Month_9</th>\n",
" <th>Month_10</th>\n",
" <th>Month_11</th>\n",
" <th>Month_12</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>-0.721096</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>-0.507666</td>\n",
" <td>-0.507962</td>\n",
" <td>-0.502320</td>\n",
" <td>1.999048</td>\n",
" <td>-0.28793</td>\n",
" <td>-0.309491</td>\n",
" <td>-0.300916</td>\n",
" <td>-0.297137</td>\n",
" <td>3.335719</td>\n",
" <td>-0.30429</td>\n",
" <td>-0.311702</td>\n",
" <td>-0.296377</td>\n",
" <td>-0.311335</td>\n",
" <td>-0.298274</td>\n",
" <td>-0.303543</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-0.721096</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>-0.507666</td>\n",
" <td>-0.507962</td>\n",
" <td>-0.502320</td>\n",
" <td>-0.500238</td>\n",
" <td>-0.28793</td>\n",
" <td>-0.309491</td>\n",
" <td>-0.300916</td>\n",
" <td>-0.297137</td>\n",
" <td>3.335719</td>\n",
" <td>-0.30429</td>\n",
" <td>-0.311702</td>\n",
" <td>-0.296377</td>\n",
" <td>-0.311335</td>\n",
" <td>-0.298274</td>\n",
" <td>-0.303543</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-0.660890</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.969800</td>\n",
" <td>-0.507962</td>\n",
" <td>-0.502320</td>\n",
" <td>-0.500238</td>\n",
" <td>-0.28793</td>\n",
" <td>-0.309491</td>\n",
" <td>-0.300916</td>\n",
" <td>-0.297137</td>\n",
" <td>3.335719</td>\n",
" <td>-0.30429</td>\n",
" <td>-0.311702</td>\n",
" <td>-0.296377</td>\n",
" <td>-0.311335</td>\n",
" <td>-0.298274</td>\n",
" <td>-0.303543</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>-0.660890</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>-0.507666</td>\n",
" <td>1.968649</td>\n",
" <td>-0.502320</td>\n",
" <td>-0.500238</td>\n",
" <td>-0.28793</td>\n",
" <td>-0.309491</td>\n",
" <td>-0.300916</td>\n",
" <td>-0.297137</td>\n",
" <td>3.335719</td>\n",
" <td>-0.30429</td>\n",
" <td>-0.311702</td>\n",
" <td>-0.296377</td>\n",
" <td>-0.311335</td>\n",
" <td>-0.298274</td>\n",
" <td>-0.303543</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-0.660890</td>\n",
" <td>-0.686033</td>\n",
" <td>NaN</td>\n",
" <td>-0.269917</td>\n",
" <td>NaN</td>\n",
" <td>-0.507666</td>\n",
" <td>-0.507962</td>\n",
" <td>1.990763</td>\n",
" <td>-0.500238</td>\n",
" <td>-0.28793</td>\n",
" <td>-0.309491</td>\n",
" <td>-0.300916</td>\n",
" <td>-0.297137</td>\n",
" <td>3.335719</td>\n",
" <td>-0.30429</td>\n",
" <td>-0.311702</td>\n",
" <td>-0.296377</td>\n",
" <td>-0.311335</td>\n",
" <td>-0.298274</td>\n",
" <td>-0.303543</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Close SMA_5 SMA_20 STD_5 STD_20 Day_of_week_1 Day_of_week_2 \\\n",
"0 -0.721096 NaN NaN NaN NaN -0.507666 -0.507962 \n",
"1 -0.721096 NaN NaN NaN NaN -0.507666 -0.507962 \n",
"2 -0.660890 NaN NaN NaN NaN 1.969800 -0.507962 \n",
"3 -0.660890 NaN NaN NaN NaN -0.507666 1.968649 \n",
"4 -0.660890 -0.686033 NaN -0.269917 NaN -0.507666 -0.507962 \n",
"\n",
" Day_of_week_3 Day_of_week_4 Month_2 Month_3 Month_4 Month_5 \\\n",
"0 -0.502320 1.999048 -0.28793 -0.309491 -0.300916 -0.297137 \n",
"1 -0.502320 -0.500238 -0.28793 -0.309491 -0.300916 -0.297137 \n",
"2 -0.502320 -0.500238 -0.28793 -0.309491 -0.300916 -0.297137 \n",
"3 -0.502320 -0.500238 -0.28793 -0.309491 -0.300916 -0.297137 \n",
"4 1.990763 -0.500238 -0.28793 -0.309491 -0.300916 -0.297137 \n",
"\n",
" Month_6 Month_7 Month_8 Month_9 Month_10 Month_11 Month_12 \n",
"0 3.335719 -0.30429 -0.311702 -0.296377 -0.311335 -0.298274 -0.303543 \n",
"1 3.335719 -0.30429 -0.311702 -0.296377 -0.311335 -0.298274 -0.303543 \n",
"2 3.335719 -0.30429 -0.311702 -0.296377 -0.311335 -0.298274 -0.303543 \n",
"3 3.335719 -0.30429 -0.311702 -0.296377 -0.311335 -0.298274 -0.303543 \n",
"4 3.335719 -0.30429 -0.311702 -0.296377 -0.311335 -0.298274 -0.303543 "
]
},
"execution_count": 110,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.preprocessing import StandardScaler\n",
"import pandas as pd\n",
"\n",
"# Make sure 'Date' is a datetime column (harmless if it was already converted).\n",
"data['Date'] = pd.to_datetime(data['Date'])\n",
"\n",
"# Calendar features that are treated as categorical below.\n",
"data['Day_of_week'] = data['Date'].dt.dayofweek\n",
"data['Month'] = data['Date'].dt.month\n",
"\n",
"# Quick check that both columns are present.\n",
"print(data[['Day_of_week', 'Month']].head())\n",
"\n",
"# Select the features and the target.\n",
"X = data[['Close', 'SMA_5', 'SMA_20', 'STD_5', 'STD_20', 'Day_of_week', 'Month']]\n",
"y = data['Price_Change']\n",
"\n",
"# One-hot encode the categorical features; drop_first=True drops the first\n",
"# level of each to avoid a redundant column.\n",
"X = pd.get_dummies(X, columns=['Day_of_week', 'Month'], drop_first=True)\n",
"\n",
"# Standardize every column of X (the one-hot columns are scaled as well).\n",
"# NOTE(review): the scaler is fitted on all rows, including those that belong to\n",
"# the validation/test splits - confirm this is intended.\n",
"scaler = StandardScaler()\n",
"X_scaled = scaler.fit_transform(X)\n",
"\n",
"# Back to a DataFrame. Keep X's index so the rows stay aligned with `y` and\n",
"# `data` in later index-based operations such as pd.concat.\n",
"X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)\n",
"\n",
"# Inspect the result.\n",
"X_scaled_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
" warnings.warn(\n"
]
}
],
"source": [
"import featuretools as ft\n",
"\n",
"# Build an EntitySet holding a single dataframe, with 'Date' as its index column.\n",
"es = ft.EntitySet(id=\"stock_prices\").add_dataframe(\n",
"    dataframe_name=\"stock_data\",\n",
"    dataframe=data,\n",
"    index=\"Date\",\n",
")\n",
"\n",
"# Run Deep Feature Synthesis on that dataframe to generate features automatically.\n",
"feature_matrix, feature_defs = ft.dfs(\n",
"    entityset=es,\n",
"    target_dataframe_name=\"stock_data\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Оценка качества признаков
"Оценка признаков на основе предсказательной способности модели и других критериев."
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5251\n",
"3150\n"
]
},
{
"data": {
"text/plain": [
"(np.float64(0.05230198011754029), 0.5415652186272203)"
]
},
"execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Imported here so the cell runs on a fresh kernel: these names are not\n",
"# imported anywhere else in the notebook.\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.metrics import mean_squared_error, r2_score\n",
"\n",
"# Row counts of the scaled feature matrix and of the training target.\n",
"print(X_scaled_df.shape[0])  # number of rows in X_scaled_df\n",
"print(y_train.shape[0])  # number of rows in y_train\n",
"\n",
"# The two differ in length, so join them on the index and keep only rows where\n",
"# every feature and the target are present.\n",
"df = pd.concat([X_scaled_df, y_train], axis=1).dropna()\n",
"X_scaled_df = df.drop(columns=y_train.name)\n",
"y_train = df[y_train.name]\n",
"\n",
"# Give both objects the same fresh 0..n-1 index.\n",
"y_train = y_train.reset_index(drop=True)\n",
"X_scaled_df = X_scaled_df.reset_index(drop=True)\n",
"\n",
"# Fit the model on the aligned rows.\n",
"model = LinearRegression()\n",
"model.fit(X_scaled_df, y_train)\n",
"\n",
"# Predict on the same rows the model was fitted on.\n",
"y_pred = model.predict(X_scaled_df)\n",
"\n",
"# In-sample quality metrics: they are computed against y_train, not against a\n",
"# held-out set.\n",
"mse = mean_squared_error(y_train, y_pred)\n",
"r2 = r2_score(y_train, y_pred)\n",
"\n",
"mse, r2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"MSE = 0.0523: средний квадрат ошибки прогноза на обучающих данных составляет около 0.0523 (RMSE ≈ 0.229).
R² = 0.5416: модель объясняет примерно 54.16% изменчивости целевой переменной.
"\n",
"Визуализируем"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -10,16 +10,23 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 13, "execution_count": 1,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "ename": "FileNotFoundError",
"output_type": "stream", "evalue": "[Errno 2] No such file or directory: './/static//csv//csvLab1.csv'",
"text": [ "output_type": "error",
"Index(['Id', 'Name', 'Short description', 'Gender', 'Country', 'Occupation',\n", "traceback": [
" 'Birth year', 'Death year', 'Manner of death', 'Age of death'],\n", "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
" dtype='object')\n" "\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[1], line 4\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mmatplotlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpyplot\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mplt\u001b[39;00m\n\u001b[1;32m----> 4\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m.//static//csv//csvLab1.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msep\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m,\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28mprint\u001b[39m(df\u001b[38;5;241m.\u001b[39mcolumns)\n",
"File \u001b[1;32mc:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1026\u001b[0m, in \u001b[0;36mread_csv\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[0;32m 1013\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[0;32m 1014\u001b[0m dialect,\n\u001b[0;32m 1015\u001b[0m delimiter,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1022\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[0;32m 1023\u001b[0m )\n\u001b[0;32m 1024\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[1;32m-> 1026\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:620\u001b[0m, in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 617\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[0;32m 619\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[1;32m--> 620\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 622\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[0;32m 623\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n",
"File \u001b[1;32mc:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1620\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m 1617\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m 1619\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m-> 1620\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1880\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[1;34m(self, f, engine)\u001b[0m\n\u001b[0;32m 1878\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[0;32m 1879\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m-> 1880\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1881\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1882\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1883\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1884\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1885\u001b[0m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1886\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1887\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1888\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1889\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1890\u001b[0m 
\u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 1891\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n",
"File \u001b[1;32mc:\\Users\\alexk\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\io\\common.py:873\u001b[0m, in \u001b[0;36mget_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[0;32m 868\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m 869\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[0;32m 870\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[0;32m 871\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[0;32m 872\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[1;32m--> 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[0;32m 874\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 875\u001b[0m \u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 876\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 877\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 878\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 879\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[0;32m 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n",
"\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: './/static//csv//csvLab1.csv'"
] ]
} }
], ],
@ -186,7 +193,7 @@
], ],
"metadata": { "metadata": {
"kernelspec": { "kernelspec": {
"display_name": "MIiLabs", "display_name": "Python 3",
"language": "python", "language": "python",
"name": "python3" "name": "python3"
}, },