5 Commits
lab_12 ... main

Author SHA1 Message Date
Илья
110f79e4f5 cleaned comments 2025-06-17 12:27:34 +04:00
639e381daa Merge pull request 'lab_12' (#11) from lab_12 into main
Reviewed-on: #11
2025-05-15 16:08:15 +04:00
a610d16a7f Merge pull request 'lab_11' (#10) from lab_11 into main
Reviewed-on: #10
2025-05-15 16:08:07 +04:00
061214e244 small fixes and requirements 2025-05-03 17:56:58 +04:00
80e6ee0e8f lab 11 done 2025-05-03 17:38:40 +04:00
9 changed files with 1399 additions and 237 deletions

View File

@@ -328,7 +328,7 @@
},
{
"cell_type": "code",
"execution_count": 71,
"execution_count": null,
"id": "9a7b0970",
"metadata": {},
"outputs": [
@@ -356,7 +356,6 @@
"import random\n",
"\n",
"def create_individual(elements_num): \n",
" # Генерирует случайную двоичную строку той же длины, что и список элементов\n",
" return [random.randint(0, 1) for _ in range(elements_num)]\n",
"\n",
"def create_population(elements_num, population_size): \n",
@@ -375,7 +374,7 @@
},
{
"cell_type": "code",
"execution_count": 72,
"execution_count": null,
"id": "0809a0b1",
"metadata": {},
"outputs": [
@@ -397,7 +396,6 @@
" if individual[i] == 1:\n",
" total_value += prices[i]\n",
" total_weight += weights[i]\n",
" # Если общий вес превышает вместимость ранца, устанавливается значение 0 (неверное решение)\n",
" return total_value if total_weight <= capacity else 0\n",
"\n",
"evaluate_fitness([0, 1, 1, 1, 0], [7, 12, 19, 13, 20], [10, 11, 18, 15, 5], 50)"
@@ -413,7 +411,7 @@
},
{
"cell_type": "code",
"execution_count": 73,
"execution_count": null,
"id": "4d5a13d7",
"metadata": {},
"outputs": [
@@ -429,7 +427,6 @@
}
],
"source": [
"# одноточечный кроссинговер\n",
"def crossover(parent1, parent2):\n",
" point = random.randint(1, len(parent1) - 1)\n",
" return (parent1[:point] + parent2[point:], parent2[:point] + parent1[point:])\n",
@@ -447,7 +444,7 @@
},
{
"cell_type": "code",
"execution_count": 74,
"execution_count": null,
"id": "66021b53",
"metadata": {},
"outputs": [
@@ -467,7 +464,6 @@
"# Мутация 1: побитовая замена\n",
"def mutate_flip_bits(individual, mutation_rate):\n",
" for i in range(len(individual)):\n",
" # Сработает с некоторой вероятностью\n",
" if random.random() < mutation_rate:\n",
" individual[i] = 1 - individual[i]\n",
"\n",
@@ -497,24 +493,21 @@
},
{
"cell_type": "code",
"execution_count": 79,
"execution_count": null,
"id": "17093d62",
"metadata": {},
"outputs": [],
"source": [
"# Параметры алгоритма\n",
"population_size = 100\n",
"num_generations = 10\n",
"mutation_rate = 0.1\n",
"mutation_strategy = 'flip'\n",
"\n",
"# Выбор участников кроссинговера с помощью селекции на основе рулетки\n",
"def select_parents(population, weights, prices, capacity):\n",
" fitness_values = [evaluate_fitness(ind, weights, prices, capacity) for ind in population]\n",
" total_fitness = sum(fitness_values)\n",
" if total_fitness == 0:\n",
" return random.choice(population), random.choice(population)\n",
" # чем выше значение фитнес-функции, тем больше шанс на выбор\n",
" probabilities = [f / total_fitness for f in fitness_values]\n",
" return random.choices(population, weights=probabilities, k=2)\n",
"\n",

1345
lab_11/lab11.ipynb Normal file

File diff suppressed because one or more lines are too long

BIN
lab_11/requirements.txt Normal file

Binary file not shown.

View File

@@ -15,7 +15,7 @@
},
{
"cell_type": "code",
"execution_count": 532,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -35,7 +35,6 @@
"import seaborn as sns\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# вывод всех столбцов\n",
"df = pd.read_csv(\"..//..//static//csv//flavors_of_cacao.csv\")\n",
"df.columns = df.columns.str.replace('\\n', '')\n",
"print(df.columns)"
@@ -85,7 +84,7 @@
},
{
"cell_type": "code",
"execution_count": 533,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -108,13 +107,10 @@
}
],
"source": [
"# Удаляем символ '%' и преобразуем столбец CocoaPercent в числовой формат\n",
"df['CocoaPercent'] = df['CocoaPercent'].str.replace('%', '').astype(float)\n",
"\n",
"# Выбираем столбцы для анализа\n",
"columns_to_check = ['CocoaPercent', 'Rating']\n",
"\n",
"# Функция для подсчета выбросов\n",
"def count_outliers(df, columns):\n",
" outliers_count = {}\n",
" for col in columns:\n",
@@ -123,21 +119,17 @@
" IQR = Q3 - Q1\n",
" lower_bound = Q1 - 1.5 * IQR\n",
" upper_bound = Q3 + 1.5 * IQR\n",
" \n",
" # Считаем количество выбросов\n",
" \n",
" outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]\n",
" outliers_count[col] = len(outliers)\n",
" \n",
" return outliers_count\n",
"\n",
"# Подсчитываем выбросы\n",
"outliers_count = count_outliers(df, columns_to_check)\n",
"\n",
"# Выводим количество выбросов для каждого столбца\n",
"for col, count in outliers_count.items():\n",
" print(f\"Количество выбросов в столбце '{col}': {count}\")\n",
"\n",
"# Создаем диаграммы размахов\n",
"plt.figure(figsize=(15, 10))\n",
"for i, col in enumerate(columns_to_check, 1):\n",
" plt.subplot(2, 2, i)\n",
@@ -158,7 +150,7 @@
},
{
"cell_type": "code",
"execution_count": 534,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -180,10 +172,8 @@
}
],
"source": [
"# Выбираем столбцы для очистки\n",
"columns_to_clean = ['CocoaPercent']\n",
"\n",
"# Функция для удаления выбросов\n",
"def remove_outliers(df, columns):\n",
" for col in columns:\n",
" Q1 = df[col].quantile(0.25)\n",
@@ -192,21 +182,15 @@
" lower_bound = Q1 - 1.5 * IQR\n",
" upper_bound = Q3 + 1.5 * IQR\n",
" \n",
" # Удаляем строки, содержащие выбросы\n",
" df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]\n",
" \n",
" return df\n",
"\n",
"# Удаляем выбросы\n",
"df_cleaned = remove_outliers(df, columns_to_clean)\n",
"\n",
"# Выводим количество удаленных строк\n",
"print(f\"Количество удаленных строк: {len(df) - len(df_cleaned)}\")\n",
"\n",
"# Создаем диаграммы размаха для очищенных данных\n",
"plt.figure(figsize=(15, 6))\n",
"\n",
"# Диаграмма размаха для CocoaPercent\n",
"plt.subplot(1, 2, 1)\n",
"sns.boxplot(x=df_cleaned['CocoaPercent'])\n",
"plt.title('Box Plot of CocoaPercent (Cleaned)')\n",
@@ -215,7 +199,6 @@
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"# Сохраняем очищенный датасет\n",
"df_cleaned.to_csv(\"..//..//static//csv//flavors_of_cacao_cleaned.csv\", index=False)\n",
"df = df_cleaned"
]
@@ -231,7 +214,7 @@
},
{
"cell_type": "code",
"execution_count": 535,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -266,17 +249,14 @@
}
],
"source": [
"# Количество пустых значений признаков\n",
"print(df.isnull().sum())\n",
"\n",
"print()\n",
"\n",
"# Есть ли пустые значения признаков\n",
"print(df.isnull().any())\n",
"\n",
"print()\n",
"\n",
"# Процент пустых значений признаков\n",
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
" if null_rate > 0:\n",
@@ -292,7 +272,7 @@
},
{
"cell_type": "code",
"execution_count": 536,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -315,13 +295,10 @@
}
],
"source": [
"# Удаление пропущенных значений в столбцах BeanType и Broad BeanOrigin\n",
"df = df.dropna(subset=['BeanType', 'Broad BeanOrigin'])\n",
"\n",
"# Проверка на пропущенные значения после удаления\n",
"missing_values_after_drop = df.isnull().sum()\n",
"\n",
"# Вывод результатов после удаления\n",
"print(\"\\nКоличество пустых значений в каждом столбце после удаления:\")\n",
"print(missing_values_after_drop)"
]
@@ -337,7 +314,7 @@
},
{
"cell_type": "code",
"execution_count": 537,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -351,18 +328,13 @@
}
],
"source": [
"# Разделение на признаки (X) и целевую переменную (y)\n",
"# Предположим, что Rating - это целевая переменная\n",
"X = df.drop('Rating', axis=1)\n",
"y = df['Rating']\n",
"\n",
"# Разбиение на обучающую и остальную выборку (контрольную + тестовую)\n",
"X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=42)\n",
"\n",
"# Разбиение остатка на контрольную и тестовую выборки\n",
"X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=42)\n",
"\n",
"# Вывод размеров выборок\n",
"print(\"Размер обучающей выборки:\", X_train.shape)\n",
"print(\"Размер контрольной выборки:\", X_val.shape)\n",
"print(\"Размер тестовой выборки:\", X_test.shape)"
@@ -370,7 +342,7 @@
},
{
"cell_type": "code",
"execution_count": 538,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -426,7 +398,6 @@
}
],
"source": [
"# Функция для анализа сбалансированности\n",
"def analyze_balance(y_train, y_val, y_test):\n",
" print(\"Распределение классов в обучающей выборке:\")\n",
" print(y_train.value_counts(normalize=True))\n",
@@ -437,7 +408,6 @@
" print(\"\\nРаспределение классов в тестовой выборке:\")\n",
" print(y_test.value_counts(normalize=True))\n",
"\n",
"# Анализ сбалансированности\n",
"analyze_balance(y_train, y_val, y_test)"
]
},
@@ -465,7 +435,7 @@
},
{
"cell_type": "code",
"execution_count": 539,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -485,7 +455,6 @@
"from sklearn.model_selection import train_test_split\n",
"from imblearn.over_sampling import SMOTE\n",
"\n",
"# вывод всех столбцов\n",
"df = pd.read_csv(\"..//..//static//csv//water_potability.csv\")\n",
"print(df.columns)"
]
@@ -538,7 +507,7 @@
},
{
"cell_type": "code",
"execution_count": 540,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -562,10 +531,8 @@
}
],
"source": [
"# Выбираем столбцы для анализа\n",
"columns_to_check = ['Hardness', 'Solids', 'Organic_carbon']\n",
"\n",
"# Функция для подсчета выбросов\n",
"def count_outliers(df, columns):\n",
" outliers_count = {}\n",
" for col in columns:\n",
@@ -575,20 +542,16 @@
" lower_bound = Q1 - 1.5 * IQR\n",
" upper_bound = Q3 + 1.5 * IQR\n",
" \n",
" # Считаем количество выбросов\n",
" outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]\n",
" outliers_count[col] = len(outliers)\n",
" \n",
" return outliers_count\n",
"\n",
"# Подсчитываем выбросы\n",
"outliers_count = count_outliers(df, columns_to_check)\n",
"\n",
"# Выводим количество выбросов для каждого столбца\n",
"for col, count in outliers_count.items():\n",
" print(f\"Количество выбросов в столбце '{col}': {count}\")\n",
"\n",
"# Создаем диаграммы размахов\n",
"plt.figure(figsize=(15, 10))\n",
"for i, col in enumerate(columns_to_check, 1):\n",
" plt.subplot(2, 2, i)\n",
@@ -607,7 +570,7 @@
},
{
"cell_type": "code",
"execution_count": 541,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -629,10 +592,8 @@
}
],
"source": [
"# Выбираем столбцы для очистки\n",
"columns_to_clean = ['Hardness', 'Solids', 'Organic_carbon']\n",
"\n",
"# Функция для удаления выбросов\n",
"def remove_outliers(df, columns):\n",
" for col in columns:\n",
" Q1 = df[col].quantile(0.25)\n",
@@ -641,33 +602,25 @@
" lower_bound = Q1 - 1.5 * IQR\n",
" upper_bound = Q3 + 1.5 * IQR\n",
" \n",
" # Удаляем строки, содержащие выбросы\n",
" df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]\n",
" \n",
" return df\n",
"\n",
"# Удаляем выбросы\n",
"df_cleaned = remove_outliers(df, columns_to_clean)\n",
"\n",
"# Выводим количество удаленных строк\n",
"print(f\"Количество удаленных строк: {len(df) - len(df_cleaned)}\")\n",
"\n",
"# Создаем диаграммы размаха для очищенных данных\n",
"plt.figure(figsize=(15, 6))\n",
"\n",
"# Диаграмма размаха для Hardness\n",
"plt.subplot(1, 3, 1)\n",
"sns.boxplot(x=df_cleaned['Hardness'])\n",
"plt.title('Box Plot of Hardness (Cleaned)')\n",
"plt.xlabel('Hardness')\n",
"\n",
"# Диаграмма размаха для Solids\n",
"plt.subplot(1, 3, 2)\n",
"sns.boxplot(x=df_cleaned['Solids'])\n",
"plt.title('Box Plot of Solids (Cleaned)')\n",
"plt.xlabel('Solids')\n",
"\n",
"# Диаграмма размаха для Organic_carbon\n",
"plt.subplot(1, 3, 3)\n",
"sns.boxplot(x=df_cleaned['Organic_carbon'])\n",
"plt.title('Box Plot of Organic_carbon (Cleaned)')\n",
@@ -676,7 +629,6 @@
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"# Сохраняем очищенный датасет\n",
"df_cleaned.to_csv(\"..//..//static//csv//water_potability_cleaned.csv\", index=False)\n",
"df = df_cleaned"
]
@@ -692,7 +644,7 @@
},
{
"cell_type": "code",
"execution_count": 542,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -730,17 +682,14 @@
}
],
"source": [
"# Количество пустых значений признаков\n",
"print(df.isnull().sum())\n",
"\n",
"print()\n",
"\n",
"# Есть ли пустые значения признаков\n",
"print(df.isnull().any())\n",
"\n",
"print()\n",
"\n",
"# Процент пустых значений признаков\n",
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
" if null_rate > 0:\n",
@@ -756,7 +705,7 @@
},
{
"cell_type": "code",
"execution_count": 543,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -780,15 +729,12 @@
}
],
"source": [
"# Замена значений\n",
"df[\"ph\"] = df[\"ph\"].fillna(df[\"ph\"].median())\n",
"df[\"Sulfate\"] = df[\"Sulfate\"].fillna(df[\"Sulfate\"].median())\n",
"df[\"Trihalomethanes\"] = df[\"Trihalomethanes\"].fillna(df[\"Trihalomethanes\"].median())\n",
"\n",
"# Проверка на пропущенные значения после замены\n",
"missing_values_after_drop = df.isnull().sum()\n",
"\n",
"# Вывод результатов после замены\n",
"print(\"\\nКоличество пустых значений в каждом столбце после замены:\")\n",
"print(missing_values_after_drop)"
]
@@ -804,7 +750,7 @@
},
{
"cell_type": "code",
"execution_count": 544,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -818,18 +764,13 @@
}
],
"source": [
"# Разделение на признаки (X) и целевую переменную (y)\n",
"# Предположим, что Potability - это целевая переменная\n",
"X = df.drop('Potability', axis=1)\n",
"y = df['Potability']\n",
"\n",
"# Разбиение на обучающую и остальную выборку (контрольную + тестовую)\n",
"X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=42)\n",
"\n",
"# Разбиение остатка на контрольную и тестовую выборки\n",
"X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=42)\n",
"\n",
"# Вывод размеров выборок\n",
"print(\"Размер обучающей выборки:\", X_train.shape)\n",
"print(\"Размер контрольной выборки:\", X_val.shape)\n",
"print(\"Размер тестовой выборки:\", X_test.shape)"
@@ -837,7 +778,7 @@
},
{
"cell_type": "code",
"execution_count": 545,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -865,7 +806,6 @@
}
],
"source": [
"# Функция для анализа сбалансированности\n",
"def analyze_balance(y_train, y_val, y_test):\n",
" print(\"Распределение классов в обучающей выборке:\")\n",
" print(y_train.value_counts(normalize=True))\n",
@@ -876,7 +816,6 @@
" print(\"\\nРаспределение классов в тестовой выборке:\")\n",
" print(y_test.value_counts(normalize=True))\n",
"\n",
"# Анализ сбалансированности\n",
"analyze_balance(y_train, y_val, y_test)"
]
},
@@ -889,7 +828,7 @@
},
{
"cell_type": "code",
"execution_count": 546,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -907,10 +846,8 @@
"source": [
"smote = SMOTE(random_state=42)\n",
"\n",
"# Применение SMOTE для балансировки обучающей выборки\n",
"X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)\n",
"\n",
"# Проверка сбалансированности после SMOTE\n",
"print(\"Сбалансированность обучающей выборки после SMOTE:\")\n",
"print(y_train_resampled.value_counts(normalize=True))"
]
@@ -938,7 +875,7 @@
},
{
"cell_type": "code",
"execution_count": 547,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -958,7 +895,6 @@
"from sklearn.model_selection import train_test_split\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"\n",
"# вывод всех столбцов\n",
"df = pd.read_csv(\"..//..//static//csv//diabetes.csv\")\n",
"print(df.columns)"
]
@@ -1009,7 +945,7 @@
},
{
"cell_type": "code",
"execution_count": 548,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -1033,10 +969,8 @@
}
],
"source": [
"# Выбираем столбцы для анализа\n",
"columns_to_check = ['Age', 'BloodPressure', 'BMI']\n",
"\n",
"# Функция для подсчета выбросов\n",
"def count_outliers(df, columns):\n",
" outliers_count = {}\n",
" for col in columns:\n",
@@ -1046,20 +980,16 @@
" lower_bound = Q1 - 1.5 * IQR\n",
" upper_bound = Q3 + 1.5 * IQR\n",
" \n",
" # Считаем количество выбросов\n",
" outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]\n",
" outliers_count[col] = len(outliers)\n",
" \n",
" return outliers_count\n",
"\n",
"# Подсчитываем выбросы\n",
"outliers_count = count_outliers(df, columns_to_check)\n",
"\n",
"# Выводим количество выбросов для каждого столбца\n",
"for col, count in outliers_count.items():\n",
" print(f\"Количество выбросов в столбце '{col}': {count}\")\n",
"\n",
"# Создаем диаграммы размахов\n",
"plt.figure(figsize=(15, 10))\n",
"for i, col in enumerate(columns_to_check, 1):\n",
" plt.subplot(2, 2, i)\n",
@@ -1078,7 +1008,7 @@
},
{
"cell_type": "code",
"execution_count": 549,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -1100,10 +1030,8 @@
}
],
"source": [
"# Выбираем столбцы для очистки\n",
"columns_to_clean = ['Age', 'BloodPressure', 'BMI']\n",
"\n",
"# Функция для удаления выбросов\n",
"def remove_outliers(df, columns):\n",
" for col in columns:\n",
" Q1 = df[col].quantile(0.25)\n",
@@ -1112,33 +1040,26 @@
" lower_bound = Q1 - 1.5 * IQR\n",
" upper_bound = Q3 + 1.5 * IQR\n",
" \n",
" # Удаляем строки, содержащие выбросы\n",
" df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]\n",
" \n",
" return df\n",
"\n",
"# Удаляем выбросы\n",
"df_cleaned = remove_outliers(df, columns_to_clean)\n",
"\n",
"# Выводим количество удаленных строк\n",
"print(f\"Количество удаленных строк: {len(df) - len(df_cleaned)}\")\n",
"\n",
"# Создаем диаграммы размаха для очищенных данных\n",
"plt.figure(figsize=(15, 6))\n",
"\n",
"# Диаграмма размаха для Age\n",
"plt.subplot(1, 3, 1)\n",
"sns.boxplot(x=df_cleaned['Age'])\n",
"plt.title('Box Plot of Age (Cleaned)')\n",
"plt.xlabel('Age')\n",
"\n",
"# Диаграмма размаха для BloodPressure\n",
"plt.subplot(1, 3, 2)\n",
"sns.boxplot(x=df_cleaned['BloodPressure'])\n",
"plt.title('Box Plot of BloodPressure (Cleaned)')\n",
"plt.xlabel('BloodPressure')\n",
"\n",
"# Диаграмма размаха для BMI\n",
"plt.subplot(1, 3, 3)\n",
"sns.boxplot(x=df_cleaned['BMI'])\n",
"plt.title('Box Plot of BMI (Cleaned)')\n",
@@ -1147,7 +1068,6 @@
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"# Сохраняем очищенный датасет\n",
"df_cleaned.to_csv(\"..//..//static//csv//diabetes_cleaned.csv\", index=False)\n",
"df = df_cleaned"
]
@@ -1163,7 +1083,7 @@
},
{
"cell_type": "code",
"execution_count": 550,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -1196,17 +1116,14 @@
}
],
"source": [
"# Количество пустых значений признаков\n",
"print(df.isnull().sum())\n",
"\n",
"print()\n",
"\n",
"# Есть ли пустые значения признаков\n",
"print(df.isnull().any())\n",
"\n",
"print()\n",
"\n",
"# Процент пустых значений признаков\n",
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
" if null_rate > 0:\n",
@@ -1224,7 +1141,7 @@
},
{
"cell_type": "code",
"execution_count": 551,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -1238,18 +1155,13 @@
}
],
"source": [
"# Разделение на признаки (X) и целевую переменную (y)\n",
"# Предположим, что Outcome - это целевая переменная\n",
"X = df.drop('Outcome', axis=1)\n",
"y = df['Outcome']\n",
"\n",
"# Разбиение на обучающую и остальную выборку (контрольную + тестовую)\n",
"X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=42)\n",
"\n",
"# Разбиение остатка на контрольную и тестовую выборки\n",
"X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=42)\n",
"\n",
"# Вывод размеров выборок\n",
"print(\"Размер обучающей выборки:\", X_train.shape)\n",
"print(\"Размер контрольной выборки:\", X_val.shape)\n",
"print(\"Размер тестовой выборки:\", X_test.shape)"
@@ -1257,7 +1169,7 @@
},
{
"cell_type": "code",
"execution_count": 552,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -1285,7 +1197,6 @@
}
],
"source": [
"# Функция для анализа сбалансированности\n",
"def analyze_balance(y_train, y_val, y_test):\n",
" print(\"Распределение классов в обучающей выборке:\")\n",
" print(y_train.value_counts(normalize=True))\n",
@@ -1296,7 +1207,6 @@
" print(\"\\nРаспределение классов в тестовой выборке:\")\n",
" print(y_test.value_counts(normalize=True))\n",
"\n",
"# Анализ сбалансированности\n",
"analyze_balance(y_train, y_val, y_test)"
]
},

View File

@@ -204,7 +204,7 @@
},
{
"cell_type": "code",
"execution_count": 137,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -244,17 +244,14 @@
}
],
"source": [
"# Количество пустых значений признаков\n",
"print(df.isnull().sum())\n",
"\n",
"print()\n",
"\n",
"# Есть ли пустые значения признаков\n",
"print(df.isnull().any())\n",
"\n",
"print()\n",
"\n",
"# Процент пустых значений признаков\n",
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
" if null_rate > 0:\n",
@@ -270,7 +267,7 @@
},
{
"cell_type": "code",
"execution_count": 138,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -296,13 +293,10 @@
}
],
"source": [
"# Замена значений\n",
"df[\"bmi\"] = df[\"bmi\"].fillna(df[\"bmi\"].median())\n",
"\n",
"# Проверка на пропущенные значения после замены\n",
"missing_values_after_drop = df.isnull().sum()\n",
"\n",
"# Вывод результатов после замены\n",
"print(\"\\nКоличество пустых значений в каждом столбце после замены:\")\n",
"print(missing_values_after_drop)"
]
@@ -344,7 +338,7 @@
},
{
"cell_type": "code",
"execution_count": 140,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -358,19 +352,13 @@
}
],
"source": [
"# Разделение данных на признаки (X) и целевую переменную (y)\n",
"# В данном случае мы хотим предсказать 'stroke'\n",
"X = df.drop(columns=['stroke'])\n",
"y = df['stroke']\n",
"\n",
"# Разбиение данных на обучающую и тестовую выборки\n",
"# Сначала разделим на обучающую и тестовую\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)\n",
"\n",
"# Затем разделим обучающую выборку на обучающую и контрольную\n",
"X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3)\n",
"\n",
"# Проверка размеров выборок\n",
"print(\"Размер обучающей выборки:\", X_train.shape)\n",
"print(\"Размер контрольной выборки:\", X_val.shape)\n",
"print(\"Размер тестовой выборки:\", X_test.shape)"
@@ -385,7 +373,7 @@
},
{
"cell_type": "code",
"execution_count": 141,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -423,9 +411,7 @@
}
],
"source": [
"# Функция для анализа сбалансированности\n",
"def analyze_balance(y_train, y_val, y_test, y_name):\n",
" # Распределение классов\n",
" print(\"Распределение классов в обучающей выборке:\")\n",
" print(y_train.value_counts(normalize=True))\n",
" \n",
@@ -435,22 +421,18 @@
" print(\"\\nРаспределение классов в тестовой выборке:\")\n",
" print(y_test.value_counts(normalize=True))\n",
"\n",
" # Создание фигуры и осей для трех столбчатых диаграмм\n",
" fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)\n",
" fig.suptitle('Распределение в различных выборках')\n",
"\n",
" # Обучающая выборка\n",
" sns.barplot(x=y_train.value_counts().index, y=y_train.value_counts(normalize=True), ax=axes[0])\n",
" axes[0].set_title('Обучающая выборка')\n",
" axes[0].set_xlabel(y_name)\n",
" axes[0].set_ylabel('Доля')\n",
"\n",
" # Контрольная выборка\n",
" sns.barplot(x=y_val.value_counts().index, y=y_val.value_counts(normalize=True), ax=axes[1])\n",
" axes[1].set_title('Контрольная выборка')\n",
" axes[1].set_xlabel(y_name)\n",
"\n",
" # Тестовая выборка\n",
" sns.barplot(x=y_test.value_counts().index, y=y_test.value_counts(normalize=True), ax=axes[2])\n",
" axes[2].set_title('Тестовая выборка')\n",
" axes[2].set_xlabel(y_name)\n",
@@ -469,7 +451,7 @@
},
{
"cell_type": "code",
"execution_count": 142,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -509,11 +491,9 @@
"source": [
"ros = RandomOverSampler(random_state=42)\n",
"\n",
"# Применение RandomOverSampler для балансировки выборок\n",
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
"X_val_resampled, y_val_resampled = ros.fit_resample(X_val, y_val)\n",
"\n",
"# Проверка сбалансированности после RandomOverSampler\n",
"analyze_balance(y_train_resampled, y_val_resampled, y_test, 'stroke')"
]
},
@@ -530,7 +510,7 @@
},
{
"cell_type": "code",
"execution_count": 143,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -575,16 +555,12 @@
}
],
"source": [
"# Определение категориальных признаков\n",
"categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']\n",
"\n",
"# Применение one-hot encoding к обучающей выборке\n",
"X_train_encoded = pd.get_dummies(X_train_resampled, columns=categorical_features, drop_first=True)\n",
"\n",
"# Применение one-hot encoding к контрольной выборке\n",
"X_val_encoded = pd.get_dummies(X_val_resampled, columns=categorical_features, drop_first=True)\n",
"\n",
"# Применение one-hot encoding к тестовой выборке\n",
"X_test_encoded = pd.get_dummies(X_test, columns=categorical_features, drop_first=True)\n",
"\n",
"print(X_train_encoded.head())"
@@ -599,7 +575,7 @@
},
{
"cell_type": "code",
"execution_count": 144,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -644,21 +620,17 @@
}
],
"source": [
"# Определение числовых признаков для дискретизации\n",
"numerical_features = ['age']\n",
"\n",
"# Функция для дискретизации числовых признаков\n",
"def discretize_features(df, features, bins, labels):\n",
" for feature in features:\n",
" df[f'{feature}_bin'] = pd.cut(df[feature], bins=bins, labels=labels)\n",
" df.drop(columns=[feature], inplace=True)\n",
" return df\n",
"\n",
"# Заданные интервалы и метки\n",
"age_bins = [0, 25, 55, 100]\n",
"age_labels = [\"young\", \"middle-aged\", \"old\"]\n",
"\n",
"# Применение дискретизации к обучающей, контрольной и тестовой выборкам\n",
"X_train_encoded = discretize_features(X_train_encoded, numerical_features, bins=age_bins, labels=age_labels)\n",
"X_val_encoded = discretize_features(X_val_encoded, numerical_features, bins=age_bins, labels=age_labels)\n",
"X_test_encoded = discretize_features(X_test_encoded, numerical_features, bins=age_bins, labels=age_labels)\n",
@@ -741,7 +713,7 @@
},
{
"cell_type": "code",
"execution_count": 146,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -786,7 +758,6 @@
}
],
"source": [
"# Пример масштабирования числовых признаков\n",
"numerical_features = ['avg_glucose_level', 'bmi', 'glucose_age_deviation']\n",
"\n",
"scaler = StandardScaler()\n",
@@ -806,7 +777,7 @@
},
{
"cell_type": "code",
"execution_count": 147,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -872,7 +843,7 @@
}
],
"source": [
"data = X_train_encoded.copy() # Используем предобработанные данные\n",
"data = X_train_encoded.copy()\n",
"\n",
"es = ft.EntitySet(id=\"patients\")\n",
"\n",
@@ -918,7 +889,7 @@
},
{
"cell_type": "code",
"execution_count": 148,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -930,23 +901,15 @@
}
],
"source": [
"X_train_encoded = pd.get_dummies(X_train_encoded, drop_first=True)\n",
"X_val_encoded = pd.get_dummies(X_val_encoded, drop_first=True)\n",
"X_test_encoded = pd.get_dummies(X_test_encoded, drop_first=True)\n",
"\n",
"all_columns = X_train_encoded.columns\n",
"X_train_encoded = X_train_encoded.reindex(columns=all_columns, fill_value=0)\n",
"X_val_encoded = X_val_encoded.reindex(columns=all_columns, fill_value=0)\n",
"X_test_encoded = X_test_encoded.reindex(columns=all_columns, fill_value=0)\n",
"\n",
"# Выбор модели\n",
"model = RandomForestClassifier(n_estimators=100, random_state=42)\n",
"\n",
"# Начинаем отсчет времени\n",
"start_time = time.time()\n",
"model.fit(X_train_encoded, y_train_resampled)\n",
"\n",
"# Время обучения модели\n",
"train_time = time.time() - start_time\n",
"\n",
"print(f'Время обучения модели: {train_time:.2f} секунд')"
@@ -954,7 +917,7 @@
},
{
"cell_type": "code",
"execution_count": 149,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -985,11 +948,9 @@
}
],
"source": [
"# Получение важности признаков\n",
"importances = model.feature_importances_\n",
"feature_names = X_train_encoded.columns\n",
"\n",
"# Сортировка признаков по важности\n",
"feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importances})\n",
"feature_importance = feature_importance.sort_values(by='importance', ascending=False)\n",
"\n",
@@ -999,7 +960,7 @@
},
{
"cell_type": "code",
"execution_count": 150,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -1037,7 +998,6 @@
}
],
"source": [
"# Предсказание и оценка\n",
"y_pred = model.predict(X_test_encoded)\n",
"\n",
"accuracy = accuracy_score(y_test, y_pred)\n",
@@ -1052,12 +1012,10 @@
"print(f\"F1 Score: {f1}\")\n",
"print(f\"ROC AUC: {roc_auc}\")\n",
"\n",
"# Кросс-валидация\n",
"scores = cross_val_score(model, X_train_encoded, y_train_resampled, cv=5, scoring='accuracy')\n",
"accuracy_cv = scores.mean()\n",
"print(f\"Cross-validated Accuracy: {accuracy_cv}\")\n",
"\n",
"# Анализ важности признаков\n",
"feature_importances = model.feature_importances_\n",
"feature_names = X_train_encoded.columns\n",
"\n",
@@ -1069,7 +1027,6 @@
"plt.title('Feature Importance')\n",
"plt.show()\n",
"\n",
"# Проверка на переобучение\n",
"y_train_pred = model.predict(X_train_encoded)\n",
"\n",
"accuracy_train = accuracy_score(y_train_resampled, y_train_pred)\n",

View File

@@ -1153,7 +1153,7 @@
},
{
"cell_type": "code",
"execution_count": 86,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -1171,13 +1171,10 @@
"import numpy as np\n",
"from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score\n",
"\n",
"# Получаем уникальные классы для целевого признака из тренировочного набора данных\n",
"unique_classes = np.unique(y_train)\n",
"\n",
"# Генерируем случайные предсказания, выбирая случайное значение из области значений целевого признака\n",
"random_predictions = np.random.choice(unique_classes, size=len(y_test))\n",
"\n",
"# Вычисление метрик для ориентира\n",
"baseline_accuracy = accuracy_score(y_test, random_predictions)\n",
"baseline_precision = precision_score(y_test, random_predictions)\n",
"baseline_recall = recall_score(y_test, random_predictions)\n",
@@ -1624,7 +1621,6 @@
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn import neighbors, ensemble, neural_network\n",
"\n",
"# Словарь с вариантами гиперпараметров для каждой модели\n",
"param_grids = {\n",
" \"knn\": {\n",
" \"n_neighbors\": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], \n",
@@ -1648,22 +1644,17 @@
" }\n",
"}\n",
"\n",
"# Создаем экземпляры моделей\n",
"models = {\n",
" \"knn\": neighbors.KNeighborsClassifier(),\n",
" \"random_forest\": ensemble.RandomForestClassifier(),\n",
" \"mlp\": neural_network.MLPClassifier()\n",
"}\n",
"\n",
"# Словарь для хранения моделей с их лучшими параметрами\n",
"class_models = {}\n",
"\n",
"# Выполнение поиска по сетке для каждой модели\n",
"for model_name, model in models.items():\n",
" # Создаем GridSearchCV для текущей модели\n",
" gs_optimizer = GridSearchCV(estimator=model, param_grid=param_grids[model_name], scoring=\"f1\", n_jobs=-1)\n",
" \n",
" # Обучаем GridSearchCV\n",
" gs_optimizer.fit(preprocessed_df, y_train.values.ravel())\n",
" \n",
" # Получаем лучшие параметры\n",
@@ -1671,7 +1662,7 @@
" print(f\"Лучшие параметры для {model_name}: {best_params}\")\n",
" \n",
" class_models[model_name] = {\n",
" \"model\": model.set_params(**best_params) # Настраиваем модель с лучшими параметрами\n",
" \"model\": model.set_params(**best_params) \n",
" }"
]
},
@@ -2586,7 +2577,7 @@
},
{
"cell_type": "code",
"execution_count": 94,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -2603,10 +2594,8 @@
"import math\n",
"from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n",
"\n",
"# Базовое предсказание: среднее значение по y_train\n",
"baseline_predictions = [y_train.mean()] * len(y_test)\n",
"\n",
"# Вычисление метрик качества для ориентира\n",
"baseline_rmse = math.sqrt(\n",
" mean_squared_error(y_test, baseline_predictions)\n",
" )\n",
@@ -3111,7 +3100,6 @@
}
],
"source": [
"# Словарь с вариантами гиперпараметров для каждой модели\n",
"param_grids = {\n",
" \"knn\": {\n",
" \"n_neighbors\": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], \n",
@@ -3136,30 +3124,24 @@
" }\n",
"}\n",
"\n",
"# Создаем экземпляры моделей\n",
"models = {\n",
" \"knn\": neighbors.KNeighborsRegressor(),\n",
" \"random_forest\": ensemble.RandomForestRegressor(),\n",
" \"mlp\": neural_network.MLPRegressor()\n",
"}\n",
"\n",
"# Словарь для хранения моделей с их лучшими параметрами\n",
"class_models = {}\n",
"\n",
"# Выполнение поиска по сетке для каждой модели\n",
"for model_name, model in models.items():\n",
" # Создаем GridSearchCV для текущей модели\n",
" gs_optimizer = GridSearchCV(estimator=model, param_grid=param_grids[model_name], scoring='neg_mean_squared_error', n_jobs=-1)\n",
" \n",
" # Обучаем GridSearchCV\n",
" gs_optimizer.fit(preprocessed_df, y_train.values.ravel())\n",
" \n",
" # Получаем лучшие параметры\n",
" best_params = gs_optimizer.best_params_\n",
" print(f\"Лучшие параметры для {model_name}: {best_params}\")\n",
" \n",
" class_models[model_name] = {\n",
" \"model\": model.set_params(**best_params) # Настраиваем модель с лучшими параметрами\n",
" \"model\": model.set_params(**best_params)\n",
" }"
]
},
@@ -3323,7 +3305,7 @@
},
{
"cell_type": "code",
"execution_count": 100,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -3379,7 +3361,6 @@
}
],
"source": [
"# Создаем графики для всех моделей\n",
"for model_name, model_data in class_models.items():\n",
" print(f\"Model: {model_name}\")\n",
" y_pred = model_data[\"preds\"]\n",

View File

@@ -308,7 +308,7 @@
},
{
"cell_type": "code",
"execution_count": 353,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -348,17 +348,14 @@
}
],
"source": [
"# Количество пустых значений признаков\n",
"print(df.isnull().sum())\n",
"\n",
"print()\n",
"\n",
"# Есть ли пустые значения признаков\n",
"print(df.isnull().any())\n",
"\n",
"print()\n",
"\n",
"# Процент пустых значений признаков\n",
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
" if null_rate > 0:\n",
@@ -367,11 +364,10 @@
},
{
"cell_type": "code",
"execution_count": 354,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Замена значений\n",
"df[\"bmi\"] = df[\"bmi\"].fillna(df[\"bmi\"].median())"
]
},

View File

@@ -114,7 +114,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -122,15 +122,9 @@
"import emoji\n",
"from num2words import num2words\n",
"\n",
"# Функция для преобразования эмоджи в слова\n",
"def emojis_words(text):\n",
" \n",
" # Модуль emoji: преобразование эмоджи в их словесные описания\n",
" text = emoji.demojize(text, delimiters=(\" \", \" \"))\n",
" \n",
" # Редактирование текста путём замены \":\" и\" _\", а так же - путём добавления пробела между отдельными словами\n",
" text = text.replace(\":\", \"\").replace(\"_\", \" \")\n",
" \n",
" return text\n",
"\n",
"def transform_text(text):\n",
@@ -140,22 +134,17 @@
" # Удаление из текста всех URL и ссылок\n",
" text = re.sub(r'http\\S+', '', text)\n",
"\n",
" # Преобразование эмоджи в текст\n",
" text = emojis_words(text)\n",
"\n",
" # Приведение к нижнему регистру\n",
" text = text.lower()\n",
"\n",
" # Удаление лишних пробелов\n",
" text = re.sub(r'\\s+', ' ', text) \n",
" \n",
" # Преобразование \"ё\" в \"е\"\n",
" text = text.replace(\"ё\", \"е\")\n",
"\n",
" # Удаление всех специальных символов\n",
" text = re.sub(r'[^a-zA-Zа-яА-Я0-9\\s]', '', text)\n",
"\n",
" # Преобразование чисел в слова\n",
" words: list[str] = text.split()\n",
" words = [num2words(word, lang=\"ru\") if word.isdigit() else word for word in words]\n",
" text = \" \".join(words)\n",
@@ -177,7 +166,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -197,16 +186,15 @@
" doc = sp(text)\n",
" \n",
" filtered_tokens = [\n",
" f\"{token.lemma_}_{token.pos_}_{token.morph}\" # Формирование строки с нужным форматом\n",
" f\"{token.lemma_}_{token.pos_}_{token.morph}\"\n",
" for token in doc\n",
" if token.text not in stop_words and len(token.text) <= 20 # Фильтрация \n",
" if token.text not in stop_words and len(token.text) <= 20 \n",
" ]\n",
" \n",
" return \" \".join(filtered_tokens)\n",
"\n",
"df[\"preprocessed_text\"] = df[\"preprocessed_text\"].apply(preprocess_text)\n",
"\n",
"# Выведем 10 токенов из первого текста\n",
"first_text_tokens = df[\"preprocessed_text\"].iloc[0].split()[:10]\n",
"print(\" \".join(first_text_tokens))"
]
@@ -220,7 +208,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -299,10 +287,8 @@
" n_grams: list[tuple] = list(ngrams(tokens, n))\n",
" return n_grams\n",
"\n",
"# Пример для биграмм (N=2)\n",
"df[\"bigrams\"] = df[\"preprocessed_text\"].apply(lambda x: generate_ngrams(x, n=2))\n",
"\n",
"# Пример для триграмм (N=3)\n",
"df[\"trigrams\"] = df[\"preprocessed_text\"].apply(lambda x: generate_ngrams(x, n=3))\n",
"\n",
"print(df.iloc[15:25])"

View File

@@ -98,7 +98,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"id": "5b915c12",
"metadata": {},
"outputs": [
@@ -117,11 +117,8 @@
"def preprocess_images(images):\n",
" processed_images = []\n",
" for img in images:\n",
" # Изменение размера\n",
" img_resized = cv2.resize(img, (128, 128))\n",
" # Преобразование в оттенки серого\n",
" img_gray = cv2.cvtColor(img_resized, cv2.COLOR_BGR2GRAY)\n",
" # Увеличение контраста с помощью выравнивания гистограммы\n",
" img_eq = cv2.equalizeHist(img_gray)\n",
" processed_images.append(img_eq)\n",
" return np.array(processed_images)\n",
@@ -156,7 +153,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"id": "7cc2f6b2",
"metadata": {},
"outputs": [
@@ -173,12 +170,9 @@
],
"source": [
"def apply_filters(img):\n",
" # Удаление шумов\n",
" img_blur = cv2.GaussianBlur(img, (5, 5), 0)\n",
" # Повышение резкости\n",
" kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])\n",
" img_sharp = cv2.filter2D(img_blur, -1, kernel)\n",
" # Определение границ\n",
" img_edges = cv2.Canny(img_sharp, 100, 200)\n",
" return img_edges\n",
"\n",