{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Вот как можно переделать текст под ваш датасет, учитывая его особенности:\n", "\n", "**Регрессия**\n", "\n", "- Прогнозирование цены бриллианта:\n", " Цель: Используя такие параметры, как караты, огранка, цвет, чистота, глубина, таблица, размеры (x, y, z), можно предсказать цену бриллиантов.\n", "\n", "**Классификация**\n", "\n", "- Распределение бриллиантов по категориям чистоты:\n", " Цель: Распределить бриллианты по различным категориям чистоты (например, IF, VVS1, VVS2 и т.д.) с использованием данных о каратах, огранке, цвете, глубине, таблице и размерах." ] }, { "cell_type": "code", "metadata": { "ExecuteTime": { "end_time": "2025-01-19T15:30:36.844706Z", "start_time": "2025-01-19T15:30:36.697706Z" } }, "source": [ "import pandas as pd\n", "from sklearn import set_config\n", "\n", "set_config(transform_output=\"pandas\")\n", "\n", "random_state = 9\n", "\n", "file_path = 'data/Diamonds Prices2022.csv'\n", "df = pd.read_csv(file_path)\n", "\n", "# Функция для преобразования типа огранки (cut)\n", "def Cut_Type(value):\n", " if value == \"Fair\":\n", " return 0\n", " elif value == \"Good\":\n", " return 1\n", " elif value == \"Very Good\":\n", " return 2\n", " elif value == \"Premium\":\n", " return 3\n", " elif value == \"Ideal\":\n", " return 4\n", "\n", "df['Cut_Type'] = df['cut'].map(Cut_Type)\n", "\n", "df\n" ], "outputs": [ { "data": { "text/plain": [ " Unnamed: 0 carat cut color clarity depth table price x \\\n", "0 1 0.23 Ideal E SI2 61.5 55.0 326 3.95 \n", "1 2 0.21 Premium E SI1 59.8 61.0 326 3.89 \n", "2 3 0.23 Good E VS1 56.9 65.0 327 4.05 \n", "3 4 0.29 Premium I VS2 62.4 58.0 334 4.20 \n", "4 5 0.31 Good J SI2 63.3 58.0 335 4.34 \n", "... ... ... ... ... ... ... ... ... ... \n", "53938 53939 0.86 Premium H SI2 61.0 58.0 2757 6.15 \n", "53939 53940 0.75 Ideal D SI2 62.2 55.0 2757 5.83 \n", "53940 53941 0.71 Premium E SI1 60.5 55.0 2756 5.79 \n", "53941 53942 0.71 Premium F SI1 59.8 62.0 2756 5.74 \n", "53942 53943 0.70 Very Good E VS2 60.5 59.0 2757 5.71 \n", "\n", " y z Cut_Type \n", "0 3.98 2.43 4 \n", "1 3.84 2.31 3 \n", "2 4.07 2.31 1 \n", "3 4.23 2.63 3 \n", "4 4.35 2.75 1 \n", "... ... ... ... \n", "53938 6.12 3.74 3 \n", "53939 5.87 3.64 4 \n", "53940 5.74 3.49 3 \n", "53941 5.73 3.43 3 \n", "53942 5.76 3.47 2 \n", "\n", "[53943 rows x 12 columns]" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0caratcutcolorclaritydepthtablepricexyzCut_Type
010.23IdealESI261.555.03263.953.982.434
120.21PremiumESI159.861.03263.893.842.313
230.23GoodEVS156.965.03274.054.072.311
340.29PremiumIVS262.458.03344.204.232.633
450.31GoodJSI263.358.03354.344.352.751
.......................................
53938539390.86PremiumHSI261.058.027576.156.123.743
53939539400.75IdealDSI262.255.027575.835.873.644
53940539410.71PremiumESI160.555.027565.795.743.493
53941539420.71PremiumFSI159.862.027565.745.733.433
53942539430.70Very GoodEVS260.559.027575.715.763.472
\n", "

53943 rows × 12 columns

\n", "
" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 1 }, { "cell_type": "code", "metadata": { "ExecuteTime": { "end_time": "2025-01-19T15:32:05.619593Z", "start_time": "2025-01-19T15:31:59.880477Z" } }, "source": [ "from sklearn.utils import resample\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn import metrics\n", "from imblearn.over_sampling import RandomOverSampler\n", "from imblearn.under_sampling import RandomUnderSampler\n", "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", "from sklearn.metrics import ConfusionMatrixDisplay\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.linear_model import LinearRegression, LogisticRegression\n", "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier\n", "from sklearn.model_selection import train_test_split, GridSearchCV\n", "from sklearn.linear_model import SGDClassifier, SGDRegressor\n", "from sklearn.metrics import (\n", " precision_score, recall_score, accuracy_score, roc_auc_score, f1_score,\n", " matthews_corrcoef, cohen_kappa_score, confusion_matrix\n", ")\n", "from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n", "import numpy as np\n", "import featuretools as ft\n", "from sklearn.metrics import accuracy_score, classification_report\n", "\n", "# Загрузка данных\n", "df = pd.read_csv(\"data/Diamonds Prices2022.csv\")\n", "\n", "# Определение целевых переменных\n", "# Для задачи классификации я буду использовать 'cut' как целевую переменную\n", "X = df.drop('cut', axis=1) # Убираем target переменную\n", "y_class = df['cut'] # Задача классификации (например, классификация по типу огранки)\n", "y_reg = df['price'] # Задача регрессии (например, предсказание цены бриллианта)\n", "\n", "# Преобразование категориальных переменных\n", "categorical_features = ['color', 'clarity']\n", "numerical_features = ['carat', 'depth', 'table', 'x', 'y', 'z']\n", "\n", "# Создание ColumnTransformer с обработкой неизвестных категорий\n", "preprocessor = ColumnTransformer(\n", " transformers=[\n", " ('num', StandardScaler(), numerical_features),\n", " ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)]) # Используем handle_unknown='ignore'\n", "\n", "# Разделение данных на обучающую и тестовую выборки\n", "X_train, X_test, y_class_train, y_class_test, y_reg_train, y_reg_test = train_test_split(X, y_class, y_reg, test_size=0.2, random_state=42)\n", "\n", "def estimate_bias_variance(model, X, y):\n", " predictions = np.array([model.fit(X, y).predict(X) for _ in range(1000)])\n", " bias = np.mean((y - np.mean(predictions, axis=0)) ** 2)\n", " variance = np.mean(np.var(predictions, axis=0))\n", " return bias, variance\n", "\n", "# Просмотр обучающих и тестовых данных\n", "print(\"X_train\", X_train.head())\n", "print(\"y_class_train\", y_class_train.head())\n", "\n", "print(\"X_test\", X_test.head())\n", "print(\"y_class_test\", y_class_test.head())\n", "\n" ], "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "X_train Unnamed: 0 carat color clarity depth table price x y z\n", "9159 9160 1.01 E SI2 60.0 60.0 4540 6.57 6.49 3.92\n", "14131 14132 1.10 H VS2 62.5 58.0 5729 6.59 6.54 4.10\n", "15757 15758 1.50 E SI2 61.5 65.0 6300 7.21 7.17 4.42\n", "24633 24634 1.53 E SI1 61.3 59.0 12968 7.40 7.35 4.52\n", "49831 49832 0.84 D SI2 64.5 60.0 2167 5.92 5.84 3.79\n", "y_class_train 9159 Very Good\n", "14131 Premium\n", "15757 Good\n", "24633 Premium\n", "49831 Fair\n", "Name: cut, dtype: object\n", "X_test Unnamed: 0 carat color clarity depth table price x y z\n", "1388 1389 0.24 G VVS1 62.1 56.0 559 3.97 4.00 2.47\n", "19841 19842 1.21 F VS2 62.9 54.0 8403 6.78 6.82 4.28\n", "41647 41648 0.50 E SI1 61.7 68.0 1238 5.09 5.03 3.12\n", "41741 41742 0.50 D SI2 62.8 56.0 1243 5.06 5.03 3.17\n", "17244 17245 1.55 E SI2 62.3 55.0 6901 7.44 7.37 4.61\n", "y_class_test 1388 Ideal\n", "19841 Very Good\n", "41647 Fair\n", "41741 Ideal\n", "17244 Ideal\n", "Name: cut, dtype: object\n" ] } ], "execution_count": 2 }, { "cell_type": "code", "metadata": { "ExecuteTime": { "end_time": "2025-01-19T15:32:44.541170Z", "start_time": "2025-01-19T15:32:38.117434Z" } }, "source": [ "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import classification_report, confusion_matrix\n", "\n", "# Загрузка данных\n", "file_path = 'data/Diamonds Prices2022.csv'\n", "df = pd.read_csv(file_path)\n", "\n", "# Очистка столбцов от пробелов\n", "df.columns = df.columns.str.strip()\n", "\n", "# Проверка столбцов\n", "print(df.columns)\n", "\n", "# Определение признаков и целевой переменной\n", "# Задача классификации: будем предсказывать 'cut' (тип огранки)\n", "X = df.drop('cut', axis=1) # Убираем целевую переменную\n", "y_class = df['cut'] # Целевая переменная для классификации\n", "\n", "# Преобразование категориальных признаков в числовые\n", "X = pd.get_dummies(X, drop_first=True) # Преобразуем категориальные признаки в числовые, исключая первую категорию\n", "\n", "# Разделение на обучающую и тестовую выборки\n", "X_train, X_test, y_train, y_test = train_test_split(X, y_class, test_size=0.2, random_state=42)\n", "\n", "# Масштабирование данных\n", "scaler = StandardScaler()\n", "X_train_scaled = scaler.fit_transform(X_train)\n", "X_test_scaled = scaler.transform(X_test)\n", "\n", "# Обучение модели\n", "model = RandomForestClassifier(n_estimators=100, random_state=42)\n", "model.fit(X_train_scaled, y_train)\n", "\n", "# Прогнозирование\n", "y_pred = model.predict(X_test_scaled)\n", "\n", "# Оценка модели\n", "print(classification_report(y_test, y_pred))\n", "print(confusion_matrix(y_test, y_pred))\n", "\n" ], "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',\n", " 'price', 'x', 'y', 'z'],\n", " dtype='object')\n", " precision recall f1-score support\n", "\n", " Fair 0.91 0.87 0.89 328\n", " Good 0.77 0.69 0.73 1000\n", " Ideal 0.82 0.92 0.87 4316\n", " Premium 0.73 0.81 0.77 2734\n", " Very Good 0.66 0.47 0.55 2411\n", "\n", " accuracy 0.77 10789\n", " macro avg 0.78 0.75 0.76 10789\n", "weighted avg 0.76 0.77 0.76 10789\n", "\n", "[[ 286 31 1 7 3]\n", " [ 19 686 18 58 219]\n", " [ 6 10 3982 157 161]\n", " [ 0 11 312 2219 192]\n", " [ 2 154 527 589 1139]]\n" ] } ], "execution_count": 3 } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }