512 lines
19 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Вот как можно переделать текст под ваш датасет, учитывая его особенности:\n",
"\n",
"**Регрессия**\n",
"\n",
"- Прогнозирование цены бриллианта:\n",
" Цель: Используя такие параметры, как караты, огранка, цвет, чистота, глубина, таблица, размеры (x, y, z), можно предсказать цену бриллиантов.\n",
"\n",
"**Классификация**\n",
"\n",
"- Распределение бриллиантов по категориям чистоты:\n",
" Цель: Распределить бриллианты по различным категориям чистоты (например, IF, VVS1, VVS2 и т.д.) с использованием данных о каратах, огранке, цвете, глубине, таблице и размерах."
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2025-01-19T15:30:36.844706Z",
"start_time": "2025-01-19T15:30:36.697706Z"
}
},
"source": [
"import pandas as pd\n",
"from sklearn import set_config\n",
"\n",
"set_config(transform_output=\"pandas\")\n",
"\n",
"random_state = 9\n",
"\n",
"file_path = 'data/Diamonds Prices2022.csv'\n",
"df = pd.read_csv(file_path)\n",
"\n",
"# Функция для преобразования типа огранки (cut)\n",
"def Cut_Type(value):\n",
" if value == \"Fair\":\n",
" return 0\n",
" elif value == \"Good\":\n",
" return 1\n",
" elif value == \"Very Good\":\n",
" return 2\n",
" elif value == \"Premium\":\n",
" return 3\n",
" elif value == \"Ideal\":\n",
" return 4\n",
"\n",
"df['Cut_Type'] = df['cut'].map(Cut_Type)\n",
"\n",
"df\n"
],
"outputs": [
{
"data": {
"text/plain": [
" Unnamed: 0 carat cut color clarity depth table price x \\\n",
"0 1 0.23 Ideal E SI2 61.5 55.0 326 3.95 \n",
"1 2 0.21 Premium E SI1 59.8 61.0 326 3.89 \n",
"2 3 0.23 Good E VS1 56.9 65.0 327 4.05 \n",
"3 4 0.29 Premium I VS2 62.4 58.0 334 4.20 \n",
"4 5 0.31 Good J SI2 63.3 58.0 335 4.34 \n",
"... ... ... ... ... ... ... ... ... ... \n",
"53938 53939 0.86 Premium H SI2 61.0 58.0 2757 6.15 \n",
"53939 53940 0.75 Ideal D SI2 62.2 55.0 2757 5.83 \n",
"53940 53941 0.71 Premium E SI1 60.5 55.0 2756 5.79 \n",
"53941 53942 0.71 Premium F SI1 59.8 62.0 2756 5.74 \n",
"53942 53943 0.70 Very Good E VS2 60.5 59.0 2757 5.71 \n",
"\n",
" y z Cut_Type \n",
"0 3.98 2.43 4 \n",
"1 3.84 2.31 3 \n",
"2 4.07 2.31 1 \n",
"3 4.23 2.63 3 \n",
"4 4.35 2.75 1 \n",
"... ... ... ... \n",
"53938 6.12 3.74 3 \n",
"53939 5.87 3.64 4 \n",
"53940 5.74 3.49 3 \n",
"53941 5.73 3.43 3 \n",
"53942 5.76 3.47 2 \n",
"\n",
"[53943 rows x 12 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>carat</th>\n",
" <th>cut</th>\n",
" <th>color</th>\n",
" <th>clarity</th>\n",
" <th>depth</th>\n",
" <th>table</th>\n",
" <th>price</th>\n",
" <th>x</th>\n",
" <th>y</th>\n",
" <th>z</th>\n",
" <th>Cut_Type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>0.23</td>\n",
" <td>Ideal</td>\n",
" <td>E</td>\n",
" <td>SI2</td>\n",
" <td>61.5</td>\n",
" <td>55.0</td>\n",
" <td>326</td>\n",
" <td>3.95</td>\n",
" <td>3.98</td>\n",
" <td>2.43</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>0.21</td>\n",
" <td>Premium</td>\n",
" <td>E</td>\n",
" <td>SI1</td>\n",
" <td>59.8</td>\n",
" <td>61.0</td>\n",
" <td>326</td>\n",
" <td>3.89</td>\n",
" <td>3.84</td>\n",
" <td>2.31</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>0.23</td>\n",
" <td>Good</td>\n",
" <td>E</td>\n",
" <td>VS1</td>\n",
" <td>56.9</td>\n",
" <td>65.0</td>\n",
" <td>327</td>\n",
" <td>4.05</td>\n",
" <td>4.07</td>\n",
" <td>2.31</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>0.29</td>\n",
" <td>Premium</td>\n",
" <td>I</td>\n",
" <td>VS2</td>\n",
" <td>62.4</td>\n",
" <td>58.0</td>\n",
" <td>334</td>\n",
" <td>4.20</td>\n",
" <td>4.23</td>\n",
" <td>2.63</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>0.31</td>\n",
" <td>Good</td>\n",
" <td>J</td>\n",
" <td>SI2</td>\n",
" <td>63.3</td>\n",
" <td>58.0</td>\n",
" <td>335</td>\n",
" <td>4.34</td>\n",
" <td>4.35</td>\n",
" <td>2.75</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53938</th>\n",
" <td>53939</td>\n",
" <td>0.86</td>\n",
" <td>Premium</td>\n",
" <td>H</td>\n",
" <td>SI2</td>\n",
" <td>61.0</td>\n",
" <td>58.0</td>\n",
" <td>2757</td>\n",
" <td>6.15</td>\n",
" <td>6.12</td>\n",
" <td>3.74</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53939</th>\n",
" <td>53940</td>\n",
" <td>0.75</td>\n",
" <td>Ideal</td>\n",
" <td>D</td>\n",
" <td>SI2</td>\n",
" <td>62.2</td>\n",
" <td>55.0</td>\n",
" <td>2757</td>\n",
" <td>5.83</td>\n",
" <td>5.87</td>\n",
" <td>3.64</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53940</th>\n",
" <td>53941</td>\n",
" <td>0.71</td>\n",
" <td>Premium</td>\n",
" <td>E</td>\n",
" <td>SI1</td>\n",
" <td>60.5</td>\n",
" <td>55.0</td>\n",
" <td>2756</td>\n",
" <td>5.79</td>\n",
" <td>5.74</td>\n",
" <td>3.49</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53941</th>\n",
" <td>53942</td>\n",
" <td>0.71</td>\n",
" <td>Premium</td>\n",
" <td>F</td>\n",
" <td>SI1</td>\n",
" <td>59.8</td>\n",
" <td>62.0</td>\n",
" <td>2756</td>\n",
" <td>5.74</td>\n",
" <td>5.73</td>\n",
" <td>3.43</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53942</th>\n",
" <td>53943</td>\n",
" <td>0.70</td>\n",
" <td>Very Good</td>\n",
" <td>E</td>\n",
" <td>VS2</td>\n",
" <td>60.5</td>\n",
" <td>59.0</td>\n",
" <td>2757</td>\n",
" <td>5.71</td>\n",
" <td>5.76</td>\n",
" <td>3.47</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>53943 rows × 12 columns</p>\n",
"</div>"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 1
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2025-01-19T15:32:05.619593Z",
"start_time": "2025-01-19T15:31:59.880477Z"
}
},
"source": [
"from sklearn.utils import resample\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn import metrics\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"from sklearn.metrics import ConfusionMatrixDisplay\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier\n",
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
"from sklearn.linear_model import SGDClassifier, SGDRegressor\n",
"from sklearn.metrics import (\n",
" precision_score, recall_score, accuracy_score, roc_auc_score, f1_score,\n",
" matthews_corrcoef, cohen_kappa_score, confusion_matrix\n",
")\n",
"from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
"import numpy as np\n",
"import featuretools as ft\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"\n",
"# Загрузка данных\n",
"df = pd.read_csv(\"data/Diamonds Prices2022.csv\")\n",
"\n",
"# Определение целевых переменных\n",
"# Для задачи классификации я буду использовать 'cut' как целевую переменную\n",
"X = df.drop('cut', axis=1) # Убираем target переменную\n",
"y_class = df['cut'] # Задача классификации (например, классификация по типу огранки)\n",
"y_reg = df['price'] # Задача регрессии (например, предсказание цены бриллианта)\n",
"\n",
"# Преобразование категориальных переменных\n",
"categorical_features = ['color', 'clarity']\n",
"numerical_features = ['carat', 'depth', 'table', 'x', 'y', 'z']\n",
"\n",
"# Создание ColumnTransformer с обработкой неизвестных категорий\n",
"preprocessor = ColumnTransformer(\n",
" transformers=[\n",
" ('num', StandardScaler(), numerical_features),\n",
" ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)]) # Используем handle_unknown='ignore'\n",
"\n",
"# Разделение данных на обучающую и тестовую выборки\n",
"X_train, X_test, y_class_train, y_class_test, y_reg_train, y_reg_test = train_test_split(X, y_class, y_reg, test_size=0.2, random_state=42)\n",
"\n",
"def estimate_bias_variance(model, X, y):\n",
" predictions = np.array([model.fit(X, y).predict(X) for _ in range(1000)])\n",
" bias = np.mean((y - np.mean(predictions, axis=0)) ** 2)\n",
" variance = np.mean(np.var(predictions, axis=0))\n",
" return bias, variance\n",
"\n",
"# Просмотр обучающих и тестовых данных\n",
"print(\"X_train\", X_train.head())\n",
"print(\"y_class_train\", y_class_train.head())\n",
"\n",
"print(\"X_test\", X_test.head())\n",
"print(\"y_class_test\", y_class_test.head())\n",
"\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"X_train Unnamed: 0 carat color clarity depth table price x y z\n",
"9159 9160 1.01 E SI2 60.0 60.0 4540 6.57 6.49 3.92\n",
"14131 14132 1.10 H VS2 62.5 58.0 5729 6.59 6.54 4.10\n",
"15757 15758 1.50 E SI2 61.5 65.0 6300 7.21 7.17 4.42\n",
"24633 24634 1.53 E SI1 61.3 59.0 12968 7.40 7.35 4.52\n",
"49831 49832 0.84 D SI2 64.5 60.0 2167 5.92 5.84 3.79\n",
"y_class_train 9159 Very Good\n",
"14131 Premium\n",
"15757 Good\n",
"24633 Premium\n",
"49831 Fair\n",
"Name: cut, dtype: object\n",
"X_test Unnamed: 0 carat color clarity depth table price x y z\n",
"1388 1389 0.24 G VVS1 62.1 56.0 559 3.97 4.00 2.47\n",
"19841 19842 1.21 F VS2 62.9 54.0 8403 6.78 6.82 4.28\n",
"41647 41648 0.50 E SI1 61.7 68.0 1238 5.09 5.03 3.12\n",
"41741 41742 0.50 D SI2 62.8 56.0 1243 5.06 5.03 3.17\n",
"17244 17245 1.55 E SI2 62.3 55.0 6901 7.44 7.37 4.61\n",
"y_class_test 1388 Ideal\n",
"19841 Very Good\n",
"41647 Fair\n",
"41741 Ideal\n",
"17244 Ideal\n",
"Name: cut, dtype: object\n"
]
}
],
"execution_count": 2
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2025-01-19T15:32:44.541170Z",
"start_time": "2025-01-19T15:32:38.117434Z"
}
},
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import classification_report, confusion_matrix\n",
"\n",
"# Загрузка данных\n",
"file_path = 'data/Diamonds Prices2022.csv'\n",
"df = pd.read_csv(file_path)\n",
"\n",
"# Очистка столбцов от пробелов\n",
"df.columns = df.columns.str.strip()\n",
"\n",
"# Проверка столбцов\n",
"print(df.columns)\n",
"\n",
"# Определение признаков и целевой переменной\n",
"# Задача классификации: будем предсказывать 'cut' (тип огранки)\n",
"X = df.drop('cut', axis=1) # Убираем целевую переменную\n",
"y_class = df['cut'] # Целевая переменная для классификации\n",
"\n",
"# Преобразование категориальных признаков в числовые\n",
"X = pd.get_dummies(X, drop_first=True) # Преобразуем категориальные признаки в числовые, исключая первую категорию\n",
"\n",
"# Разделение на обучающую и тестовую выборки\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y_class, test_size=0.2, random_state=42)\n",
"\n",
"# Масштабирование данных\n",
"scaler = StandardScaler()\n",
"X_train_scaled = scaler.fit_transform(X_train)\n",
"X_test_scaled = scaler.transform(X_test)\n",
"\n",
"# Обучение модели\n",
"model = RandomForestClassifier(n_estimators=100, random_state=42)\n",
"model.fit(X_train_scaled, y_train)\n",
"\n",
"# Прогнозирование\n",
"y_pred = model.predict(X_test_scaled)\n",
"\n",
"# Оценка модели\n",
"print(classification_report(y_test, y_pred))\n",
"print(confusion_matrix(y_test, y_pred))\n",
"\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',\n",
" 'price', 'x', 'y', 'z'],\n",
" dtype='object')\n",
" precision recall f1-score support\n",
"\n",
" Fair 0.91 0.87 0.89 328\n",
" Good 0.77 0.69 0.73 1000\n",
" Ideal 0.82 0.92 0.87 4316\n",
" Premium 0.73 0.81 0.77 2734\n",
" Very Good 0.66 0.47 0.55 2411\n",
"\n",
" accuracy 0.77 10789\n",
" macro avg 0.78 0.75 0.76 10789\n",
"weighted avg 0.76 0.77 0.76 10789\n",
"\n",
"[[ 286 31 1 7 3]\n",
" [ 19 686 18 58 219]\n",
" [ 6 10 3982 157 161]\n",
" [ 0 11 312 2219 192]\n",
" [ 2 154 527 589 1139]]\n"
]
}
],
"execution_count": 3
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}